Version in base suite: 10.11.11-0+deb12u1 Base version: mariadb_10.11.11-0+deb12u1 Target version: mariadb_10.11.13-0+deb12u1 Base file: /srv/ftp-master.debian.org/ftp/pool/main/m/mariadb/mariadb_10.11.11-0+deb12u1.dsc Target file: /srv/ftp-master.debian.org/policy/pool/main/m/mariadb/mariadb_10.11.13-0+deb12u1.dsc /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png |binary /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png |binary 
/srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png |binary mariadb-10.11.13/CMakeLists.txt | 10 mariadb-10.11.13/Docs/INFO_SRC | 10 mariadb-10.11.13/VERSION | 2 mariadb-10.11.13/appveyor.yml | 38 mariadb-10.11.13/client/mysql_upgrade.c | 14 mariadb-10.11.13/client/mysqlbinlog.cc | 61 mariadb-10.11.13/client/mysqldump.c | 15 mariadb-10.11.13/client/mysqlslap.c | 12 mariadb-10.11.13/client/mysqltest.cc | 16 mariadb-10.11.13/cmake/cpack_rpm.cmake | 4 mariadb-10.11.13/cmake/libfmt.cmake | 5 mariadb-10.11.13/cmake/os/Windows.cmake | 520 mariadb-10.11.13/cmake/os/WindowsCache.cmake | 19 mariadb-10.11.13/cmake/pcre.cmake | 12 mariadb-10.11.13/cmake/plugin.cmake | 5 mariadb-10.11.13/config.h.cmake | 48 mariadb-10.11.13/debian/changelog | 39 mariadb-10.11.13/debian/mariadb-server-core.postinst | 49 mariadb-10.11.13/debian/patches/fix-reproducible-builds-rocksdb.patch | 26 mariadb-10.11.13/debian/patches/fix-spelling-rocksdb.patch | 38 mariadb-10.11.13/debian/patches/rocksdb-kfreebsd.patch | 150 mariadb-10.11.13/debian/patches/series | 3 mariadb-10.11.13/debian/salsa-ci-enable-sec-and-update-repos.sh | 12 mariadb-10.11.13/debian/salsa-ci.yml | 11 mariadb-10.11.13/debian/tests/traces/mariadb-verbose-help.expected | 5 mariadb-10.11.13/debian/tests/traces/mariadbd-verbose-help.expected | 21 mariadb-10.11.13/extra/mariabackup/backup_mysql.cc | 2 mariadb-10.11.13/extra/mariabackup/common_engine.cc | 6 mariadb-10.11.13/extra/mariabackup/innobackupex.cc | 7 mariadb-10.11.13/extra/mariabackup/write_filt.cc | 12 mariadb-10.11.13/extra/mariabackup/xtrabackup.cc | 80 mariadb-10.11.13/include/json_lib.h | 5 mariadb-10.11.13/include/my_base.h | 5 mariadb-10.11.13/include/my_cpu.h | 7 mariadb-10.11.13/include/my_stack_alloc.h | 2 mariadb-10.11.13/include/my_sys.h | 8 mariadb-10.11.13/include/my_virtual_mem.h | 37 mariadb-10.11.13/include/source_revision.h | 2 mariadb-10.11.13/include/sslopt-longopts.h 
| 3 mariadb-10.11.13/libmariadb/CMakeLists.txt | 2 mariadb-10.11.13/libmariadb/include/errmsg.h | 3 mariadb-10.11.13/libmariadb/include/ma_context.h | 25 mariadb-10.11.13/libmariadb/include/mariadb_com.h | 22 mariadb-10.11.13/libmariadb/libmariadb/CMakeLists.txt | 6 mariadb-10.11.13/libmariadb/libmariadb/ma_context.c | 38 mariadb-10.11.13/libmariadb/libmariadb/ma_errmsg.c | 2 mariadb-10.11.13/libmariadb/libmariadb/mariadb_lib.c | 9 mariadb-10.11.13/libmariadb/libmariadb/mariadb_stmt.c | 6 mariadb-10.11.13/libmariadb/plugins/pvio/pvio_socket.c | 10 mariadb-10.11.13/libmariadb/unittest/libmariadb/connection.c | 83 mariadb-10.11.13/libmariadb/unittest/libmariadb/errors.c | 74 mariadb-10.11.13/libmariadb/unittest/libmariadb/ps_bugs.c | 52 mariadb-10.11.13/mysql-test/CMakeLists.txt | 2 mariadb-10.11.13/mysql-test/include/long_test.inc | 2 mariadb-10.11.13/mysql-test/lib/My/SafeProcess.pm | 3 mariadb-10.11.13/mysql-test/lib/My/SafeProcess/safe_process.cc | 17 mariadb-10.11.13/mysql-test/main/backup_locks.test | 1 mariadb-10.11.13/mysql-test/main/comment_database.result | 13 mariadb-10.11.13/mysql-test/main/comment_database.test | 8 mariadb-10.11.13/mysql-test/main/ctype_utf8_def_upgrade.result | 2 mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.result | 194 mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.test | 22 mariadb-10.11.13/mysql-test/main/derived_view.result | 2 mariadb-10.11.13/mysql-test/main/func_json.result | 37 mariadb-10.11.13/mysql-test/main/func_json.test | 22 mariadb-10.11.13/mysql-test/main/func_like.result | 19 mariadb-10.11.13/mysql-test/main/func_like.test | 15 mariadb-10.11.13/mysql-test/main/func_regexp_pcre.result | 28 mariadb-10.11.13/mysql-test/main/func_regexp_pcre.test | 2 mariadb-10.11.13/mysql-test/main/gis-precise.result | 8 mariadb-10.11.13/mysql-test/main/gis-precise.test | 8 mariadb-10.11.13/mysql-test/main/gis.result | 32 mariadb-10.11.13/mysql-test/main/gis.test | 32 mariadb-10.11.13/mysql-test/main/group_by.result | 74 
mariadb-10.11.13/mysql-test/main/group_by.test | 22 mariadb-10.11.13/mysql-test/main/group_min_max.result | 24 mariadb-10.11.13/mysql-test/main/group_min_max.test | 36 mariadb-10.11.13/mysql-test/main/insert.result | 72 mariadb-10.11.13/mysql-test/main/insert.test | 56 mariadb-10.11.13/mysql-test/main/insert_returning.result | 2 mariadb-10.11.13/mysql-test/main/insert_returning.test | 2 mariadb-10.11.13/mysql-test/main/insert_select.result | 135 mariadb-10.11.13/mysql-test/main/insert_select.test | 56 mariadb-10.11.13/mysql-test/main/join.result | 29 mariadb-10.11.13/mysql-test/main/join.test | 25 mariadb-10.11.13/mysql-test/main/join_cache.result | 26 mariadb-10.11.13/mysql-test/main/join_cache.test | 27 mariadb-10.11.13/mysql-test/main/join_nested.result | 12 mariadb-10.11.13/mysql-test/main/join_nested.test | 13 mariadb-10.11.13/mysql-test/main/join_nested_jcl6.result | 12 mariadb-10.11.13/mysql-test/main/large_pages.opt | 2 mariadb-10.11.13/mysql-test/main/large_pages.result | 1 mariadb-10.11.13/mysql-test/main/large_pages.test | 4 mariadb-10.11.13/mysql-test/main/long_unique.result | 22 mariadb-10.11.13/mysql-test/main/long_unique.test | 22 mariadb-10.11.13/mysql-test/main/lowercase_table2.result | 2 mariadb-10.11.13/mysql-test/main/lowercase_view.result | 12 mariadb-10.11.13/mysql-test/main/lowercase_view.test | 12 mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.result | 35 mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.test | 113 mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.result | 21 mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.test | 22 mariadb-10.11.13/mysql-test/main/mdl_sync.result | 5 mariadb-10.11.13/mysql-test/main/mdl_sync.test | 8 mariadb-10.11.13/mysql-test/main/merge.result | 17 mariadb-10.11.13/mysql-test/main/merge.test | 17 mariadb-10.11.13/mysql-test/main/multi_update.result | 20 mariadb-10.11.13/mysql-test/main/multi_update.test | 28 mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.opt | 1 
mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.result | 8 mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.test | 8 mariadb-10.11.13/mysql-test/main/myisam-big.result | 8 mariadb-10.11.13/mysql-test/main/myisam-big.test | 13 mariadb-10.11.13/mysql-test/main/mysql-interactive.result | 4 mariadb-10.11.13/mysql-test/main/mysql-interactive.test | 11 mariadb-10.11.13/mysql-test/main/mysql_upgrade-34014.result | 2 mariadb-10.11.13/mysql-test/main/mysql_upgrade.result | 23 mariadb-10.11.13/mysql-test/main/mysql_upgrade.test | 27 mariadb-10.11.13/mysql-test/main/mysqld--help.result | 3 mariadb-10.11.13/mysql-test/main/mysqldump-system.result | 6 mariadb-10.11.13/mysql-test/main/mysqldump.result | 33 mariadb-10.11.13/mysql-test/main/mysqldump.test | 11 mariadb-10.11.13/mysql-test/main/mysqlslap.result | 3 mariadb-10.11.13/mysql-test/main/mysqlslap.test | 6 mariadb-10.11.13/mysql-test/main/mysqltest.result | 9 mariadb-10.11.13/mysql-test/main/mysqltest.test | 6 mariadb-10.11.13/mysql-test/main/partition_myisam.result | 21 mariadb-10.11.13/mysql-test/main/partition_myisam.test | 28 mariadb-10.11.13/mysql-test/main/query_cache.result | 23 mariadb-10.11.13/mysql-test/main/query_cache.test | 22 mariadb-10.11.13/mysql-test/main/range_notembedded.result | 67 mariadb-10.11.13/mysql-test/main/range_notembedded.test | 48 mariadb-10.11.13/mysql-test/main/secondary_key_costs.result | 76 mariadb-10.11.13/mysql-test/main/secondary_key_costs.test | 37 mariadb-10.11.13/mysql-test/main/skip_grants.result | 8 mariadb-10.11.13/mysql-test/main/skip_grants.test | 11 mariadb-10.11.13/mysql-test/main/sp-bugs.result | 9 mariadb-10.11.13/mysql-test/main/sp-bugs.test | 20 mariadb-10.11.13/mysql-test/main/sp-row.result | 41 mariadb-10.11.13/mysql-test/main/sp-row.test | 61 mariadb-10.11.13/mysql-test/main/subselect.result | 20 mariadb-10.11.13/mysql-test/main/subselect.test | 10 mariadb-10.11.13/mysql-test/main/subselect_elimination.result | 12 
mariadb-10.11.13/mysql-test/main/subselect_elimination.test | 7 mariadb-10.11.13/mysql-test/main/subselect_no_exists_to_in.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_mat.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_opts.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_scache.result | 20 mariadb-10.11.13/mysql-test/main/subselect_no_semijoin.result | 20 mariadb-10.11.13/mysql-test/main/temp_table_frm.result | 6 mariadb-10.11.13/mysql-test/main/temp_table_frm.test | 13 mariadb-10.11.13/mysql-test/main/timezone.test | 2 mariadb-10.11.13/mysql-test/main/trigger_null.result | 15 mariadb-10.11.13/mysql-test/main/trigger_null.test | 11 mariadb-10.11.13/mysql-test/main/type_binary.result | 58 mariadb-10.11.13/mysql-test/main/type_binary.test | 11 mariadb-10.11.13/mysql-test/main/type_blob.result | 190 mariadb-10.11.13/mysql-test/main/type_blob.test | 45 mariadb-10.11.13/mysql-test/main/type_num_innodb.result | 128 mariadb-10.11.13/mysql-test/main/type_varbinary.result | 42 mariadb-10.11.13/mysql-test/main/type_varbinary.test | 10 mariadb-10.11.13/mysql-test/main/update.result | 80 mariadb-10.11.13/mysql-test/main/update.test | 40 mariadb-10.11.13/mysql-test/main/userstat.result | 7 mariadb-10.11.13/mysql-test/main/userstat.test | 7 mariadb-10.11.13/mysql-test/main/view.result | 49 mariadb-10.11.13/mysql-test/main/view.test | 30 mariadb-10.11.13/mysql-test/main/view_grant.result | 46 mariadb-10.11.13/mysql-test/main/view_grant.test | 47 mariadb-10.11.13/mysql-test/mariadb-test-run.pl | 36 mariadb-10.11.13/mysql-test/std_data/galera_certs/galera.root.crt | 24 mariadb-10.11.13/mysql-test/suite/archive/archive-big.test | 3 mariadb-10.11.13/mysql-test/suite/atomic/README.txt | 2 mariadb-10.11.13/mysql-test/suite/atomic/alter_table.inc | 198 mariadb-10.11.13/mysql-test/suite/atomic/alter_table.opt | 1 mariadb-10.11.13/mysql-test/suite/atomic/alter_table.result | 3135 ----- mariadb-10.11.13/mysql-test/suite/atomic/alter_table.test | 198 
mariadb-10.11.13/mysql-test/suite/atomic/alter_table_aria.test | 2 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.opt | 1 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.result | 1396 ++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.test | 7 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.result | 1741 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.test | 6 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_rocksdb.test | 2 mariadb-10.11.13/mysql-test/suite/atomic/alter_table_trigger.test | 2 mariadb-10.11.13/mysql-test/suite/atomic/create_table.test | 1 mariadb-10.11.13/mysql-test/suite/atomic/drop_table.test | 1 mariadb-10.11.13/mysql-test/suite/atomic/rename_table.test | 1 mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_commit_fail.result | 116 mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result | 45 mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_commit_fail.test | 135 mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test | 13 mariadb-10.11.13/mysql-test/suite/binlog_encryption/encrypted_master.test | 1 mariadb-10.11.13/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result | 7 mariadb-10.11.13/mysql-test/suite/encryption/r/doublewrite_debug.result | 24 mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.opt | 2 mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.test | 30 mariadb-10.11.13/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt | 2 mariadb-10.11.13/mysql-test/suite/engines/iuds/r/insert_time.result | 4 mariadb-10.11.13/mysql-test/suite/federated/federatedx.result | 2 mariadb-10.11.13/mysql-test/suite/federated/federatedx.test | 2 mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.result | 4 mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.test | 11 
mariadb-10.11.13/mysql-test/suite/funcs_2/t/innodb_charset.test | 2 mariadb-10.11.13/mysql-test/suite/galera/disabled.def | 6 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_master.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_slave.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/galera_3nodes_as_slave.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/galera_4nodes.cnf | 8 mariadb-10.11.13/mysql-test/suite/galera/include/auto_increment_offset_save.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/include/galera_dump_sr_table.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/include/galera_start_replication.inc | 4 mariadb-10.11.13/mysql-test/suite/galera/include/galera_wsrep_recover.inc | 4 mariadb-10.11.13/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc | 35 mariadb-10.11.13/mysql-test/suite/galera/r/GAL-401.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20225.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20793.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-21479.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-25389.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-26266.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-33136.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-34647.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35748.result | 31 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35946.result | 16 mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-36116.result | 22 mariadb-10.11.13/mysql-test/suite/galera/r/MW-284.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/MW-329.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/MW-329F.result | 25 
mariadb-10.11.13/mysql-test/suite/galera/r/MW-416.result | 5 mariadb-10.11.13/mysql-test/suite/galera/r/galera_2primary_replica.result | 5 mariadb-10.11.13/mysql-test/suite/galera/r/galera_alter_engine_myisam.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_as_slave_nonprim.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result | 685 - mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill_debug.result | 5 mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_checksum.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_circular_replication.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_defaults.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_gcs_fragment.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump.result | 10 mariadb-10.11.13/mysql-test/suite/galera/r/galera_nonPK_and_PA.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result | 7 mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_simple.result | 1 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_partitioned_tables.result | 176 mariadb-10.11.13/mysql-test/suite/galera/r/galera_restart_replica.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequence_engine.result | 8 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff | 11 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences.result | 16 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_bf_kill.result | 152 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_transaction.result | 350 mariadb-10.11.13/mysql-test/suite/galera/r/galera_slave_replay.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_split_brain.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_cipher.result | 30 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_compression.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_upgrade.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff | 6 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff | 210 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result | 534 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff | 15 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff | 210 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result | 534 mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_innodb.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_primary_key.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result | 80 mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result | 27 mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_slave_threads.result | 1 mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_during_ist.result | 112 mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_apply.result | 94 mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_skip.result | 102 mariadb-10.11.13/mysql-test/suite/galera/r/galera_wan.result | 12 mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result | 9 mariadb-10.11.13/mysql-test/suite/galera/r/mdev-29775.result | 84 mariadb-10.11.13/mysql-test/suite/galera/r/mdev-30653.result | 2 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#198.result | 3 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff | 12 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff | 15 mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33.result | 4 mariadb-10.11.13/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result | 2 mariadb-10.11.13/mysql-test/suite/galera/suite.pm | 80 mariadb-10.11.13/mysql-test/suite/galera/t/GAL-401.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/GCF-939.test | 2 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-10715.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-15443.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-18832.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20225.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20793.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-21479.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22227.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22708.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24143.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24327.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-25389.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26266.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26597.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.opt | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27123.opt | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27862.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-28053.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29293.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29512.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-32549.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33136.test | 5 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.cnf | 13 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.opt | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.test | 22 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35946.test | 39 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-36116.test | 43 mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.cnf | 5 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MW-259.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MW-284.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/MW-313.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.test | 10 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.test | 105 mariadb-10.11.13/mysql-test/suite/galera/t/MW-360-master.opt | 1 mariadb-10.11.13/mysql-test/suite/galera/t/MW-369.inc | 2 mariadb-10.11.13/mysql-test/suite/galera/t/MW-416.test | 73 mariadb-10.11.13/mysql-test/suite/galera/t/MW-86-wait8.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/binlog_checksum.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/create.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera#414.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera#500.test | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_2primary_replica.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_MDEV-29512.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_alter_engine_myisam.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_ctas.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_nonprim.test | 11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_backup_stage.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test | 
11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill_debug.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_lock_wait.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_row_image.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test | 8 mariadb-10.11.13/mysql-test/suite/galera/t/galera_cache_index.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_can_run_toi.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_change_user.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_circular_replication.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_concurrent_ctas.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_create_trigger.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ctas.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_multiline.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.cnf | 4 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_disallow_local_gtid.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_fk_truncate.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_flush_local.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_forced_binlog_format.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_fragment.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_server_id.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test | 15 mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_ignore.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_multi.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf | 4 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_progress.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_applier.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_smallchanges.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_load_data.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_log_bin_opt.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_many_rows.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_15611.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdl_race.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_nonPK_and_PA.test | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_nopk_unicode.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test | 11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_simple.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_partitioned_tables.test | 133 mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf | 5 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_recovery.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_invalidate.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_read_only.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_nochanges.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_savepoint_replay.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequence_engine.test | 13 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.test | 27 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.test | 115 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.test | 255 mariadb-10.11.13/mysql-test/suite/galera/t/galera_server.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_slave_replay.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sp_bf_abort.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_split_brain.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test | 1 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.cnf | 11 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.test | 82 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_encrypted.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf | 28 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test | 29 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync2.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf | 4 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf | 23 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test | 29 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_cluster.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_index.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_state.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_innodb.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_primary_key.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_suspend_slave.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto.test | 3 mariadb-10.11.13/mysql-test/suite/galera/t/galera_table_with_hyphen.inc | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_temporary_sequences.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test | 60 mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ftwrl.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_transaction_read_only.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_udf.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_unicode_identifiers.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_v1_row_events.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_OSU_method2.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test 
| 2 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test | 39 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test | 86 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_slave_threads.test | 17 mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_wsrep_mode.test | 6 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.cnf | 20 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.test | 165 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf | 21 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.test | 73 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_begin.inc | 79 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_end.inc | 33 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf | 21 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.test | 100 mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.test | 12 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_mode.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test | 5 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test | 14 mariadb-10.11.13/mysql-test/suite/galera/t/mdev-29775.test | 81 mariadb-10.11.13/mysql-test/suite/galera/t/mdev-30653.test | 4 mariadb-10.11.13/mysql-test/suite/galera/t/mdev-31285.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.test | 11 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#201.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#247.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#31.test | 2 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#33.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#332.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/rename.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/view.test | 1 mariadb-10.11.13/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/disabled.def | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_3nodes.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/MDEV-36360.result | 61 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera-features#115.result | 41 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result | 35 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result | 2 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result | 10 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result | 4 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result | 26 mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/suite.pm | 82 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GAL-501.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GCF-354.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/MDEV-36360.test | 110 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.test | 89 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#119.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf | 9 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations | 5 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test | 75 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd.test | 11 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test | 28 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test | 80 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf | 6 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test | 10 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test | 5 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf | 6 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf | 4 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test | 64 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result | 10 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/suite.pm | 39 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test | 1 
mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test | 8 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test | 2 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf | 3 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test | 1 mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/disabled.def | 4 mariadb-10.11.13/mysql-test/suite/galera_sr/r/MENT-2042.result | 9 mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_myisam.result | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/suite.pm | 80 mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-27615.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-28971.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/MENT-2042.test | 23 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf | 3 
mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test | 3 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_myisam.test | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf | 5 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf | 7 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test | 2 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test | 1 mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test | 4 mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_basic.result | 2 mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_stats.result | 52 mariadb-10.11.13/mysql-test/suite/gcol/t/innodb_virtual_basic.test | 37 mariadb-10.11.13/mysql-test/suite/innodb/r/alter_copy_bulk.result | 21 mariadb-10.11.13/mysql-test/suite/innodb/r/alter_partitioned_debug.result | 26 mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff | 91 mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist.result | 37 mariadb-10.11.13/mysql-test/suite/innodb/r/buf_pool_resize_oom.result | 8 mariadb-10.11.13/mysql-test/suite/innodb/r/doublewrite.result | 18 mariadb-10.11.13/mysql-test/suite/innodb/r/foreign_key.result | 21 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb-index-online.result | 26 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result | 4 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result | 47 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result | 14 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result | 25 
mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result | 26 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_bug52663.result | 4 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result | 40 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result | 6 mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_fetch.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff | 7 mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug.result | 11 mariadb-10.11.13/mysql-test/suite/innodb/r/lock_isolation.result | 88 mariadb-10.11.13/mysql-test/suite/innodb/r/lock_memory_debug.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/log_upgrade_101_flags.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff | 11 mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure.result | 33 mariadb-10.11.13/mysql-test/suite/innodb/r/page_cleaner.result | 15 mariadb-10.11.13/mysql-test/suite/innodb/r/recovery_memory.result | 2 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,16k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,32k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,4k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,64k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart,8k.rdiff | 16 mariadb-10.11.13/mysql-test/suite/innodb/r/restart.result | 13 mariadb-10.11.13/mysql-test/suite/innodb/r/stat_tables.result | 10 mariadb-10.11.13/mysql-test/suite/innodb/r/stats_persistent.result | 10 mariadb-10.11.13/mysql-test/suite/innodb/t/alter_copy_bulk.test | 21 mariadb-10.11.13/mysql-test/suite/innodb/t/alter_partitioned_debug.test | 42 mariadb-10.11.13/mysql-test/suite/innodb/t/autoinc_persist.test | 21 mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.test | 27 
mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.combinations | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.test | 45 mariadb-10.11.13/mysql-test/suite/innodb/t/foreign_key.test | 30 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.test | 25 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-table-online-master.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test | 4 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test | 73 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test | 28 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test | 35 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt | 3 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test | 61 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_bug52663.test | 4 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test | 45 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test | 14 mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_fetch.test | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/instant_alter_debug.test | 12 mariadb-10.11.13/mysql-test/suite/innodb/t/lock_isolation.test | 134 mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.test | 2 
mariadb-10.11.13/mysql-test/suite/innodb/t/log_upgrade_101_flags.test | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/mdev-15707.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.opt | 3 mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.test | 36 mariadb-10.11.13/mysql-test/suite/innodb/t/page_cleaner.test | 25 mariadb-10.11.13/mysql-test/suite/innodb/t/purge_secondary.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/recovery_memory.test | 6 mariadb-10.11.13/mysql-test/suite/innodb/t/restart.opt | 2 mariadb-10.11.13/mysql-test/suite/innodb/t/restart.test | 25 mariadb-10.11.13/mysql-test/suite/innodb/t/stat_tables.test | 9 mariadb-10.11.13/mysql-test/suite/innodb/t/stats_persistent.test | 12 mariadb-10.11.13/mysql-test/suite/innodb/t/update_time-master.opt | 1 mariadb-10.11.13/mysql-test/suite/innodb_fts/r/index_table.result | 3 mariadb-10.11.13/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result | 9 mariadb-10.11.13/mysql-test/suite/innodb_fts/t/index_table.test | 6 mariadb-10.11.13/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test | 9 mariadb-10.11.13/mysql-test/suite/innodb_gis/r/rollback.result | 13 mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rollback.test | 13 mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rtree_purge.test | 1 mariadb-10.11.13/mysql-test/suite/json/r/json_no_table.result | 2 mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.result | 5 mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.test | 3 mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.result | 3 mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.test | 12 mariadb-10.11.13/mysql-test/suite/mariabackup/log_page_corruption.test | 2 mariadb-10.11.13/mysql-test/suite/mariabackup/partial.result | 4 mariadb-10.11.13/mysql-test/suite/mariabackup/partial_exclude.result | 2 mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.result | 11 
mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.test | 25 mariadb-10.11.13/mysql-test/suite/mariabackup/unsupported_redo.result | 4 mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.opt | 1 mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.result | 18 mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.test | 38 mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.cnf | 13 mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.result | 45 mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.test | 83 mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_innodb.test | 1 mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_memory.test | 1 mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_myisam.test | 1 mariadb-10.11.13/mysql-test/suite/perfschema/r/threads_innodb.result | 10 mariadb-10.11.13/mysql-test/suite/perfschema/t/threads_innodb.test | 2 mariadb-10.11.13/mysql-test/suite/plugins/r/server_audit.result | 3 mariadb-10.11.13/mysql-test/suite/plugins/t/server_audit.test | 4 mariadb-10.11.13/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result | 2 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_create_select_row.result | 158 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_gtid_crash.result | 2 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_master_pos_wait.result | 3 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result | 7 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result | 41 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result | 53 mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result | 26 mariadb-10.11.13/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test | 6 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_create_select_row.test | 161 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt | 2 
mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash.test | 2 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_master_pos_wait.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test | 12 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test | 68 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test | 100 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_typeconv.test | 1 mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test | 63 mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.opt | 1 mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.result | 26 mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.test | 19 mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.result | 47 mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.test | 50 mariadb-10.11.13/mysql-test/suite/sql_sequence/gtid.result | 2 mariadb-10.11.13/mysql-test/suite/sql_sequence/other.result | 1 mariadb-10.11.13/mysql-test/suite/sql_sequence/other.test | 1 mariadb-10.11.13/mysql-test/suite/sql_sequence/replication.result | 2 mariadb-10.11.13/mysql-test/suite/sql_sequence/view.test | 1 mariadb-10.11.13/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result | 30 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff | 125 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb.result | 46 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result | 4 mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result | 4 
mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result | 51 mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result | 15 mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt | 1 mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt | 1 mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test | 37 mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.opt | 4 mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.test | 4 mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test | 46 mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test | 19 mariadb-10.11.13/mysql-test/suite/versioning/r/partition.result | 35 mariadb-10.11.13/mysql-test/suite/versioning/t/partition.test | 43 mariadb-10.11.13/mysql-test/suite/wsrep/README | 1 mariadb-10.11.13/mysql-test/suite/wsrep/include/check_galera_version.inc | 1 mariadb-10.11.13/mysql-test/suite/wsrep/r/plugin.result | 2 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result | 18 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result | 65 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff | 2 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result | 51 mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result | 8 mariadb-10.11.13/mysql-test/suite/wsrep/suite.pm | 6 mariadb-10.11.13/mysql-test/suite/wsrep/t/binlog_format.cnf | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/foreign_key.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_10186.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_7798.cnf | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/plugin.test | 2 mariadb-10.11.13/mysql-test/suite/wsrep/t/pool_of_threads.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/variables.test | 1 
mariadb-10.11.13/mysql-test/suite/wsrep/t/variables_debug.test | 3 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf | 10 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test | 28 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf | 14 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test | 73 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover.cnf | 2 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf | 7 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test | 48 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf | 6 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test | 11 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_rpl.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test | 1 mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf | 1 mariadb-10.11.13/mysys/CMakeLists.txt | 5 mariadb-10.11.13/mysys/mf_keycache.c | 9 mariadb-10.11.13/mysys/my_default.c | 3 mariadb-10.11.13/mysys/my_getopt.c | 3 mariadb-10.11.13/mysys/my_largepage.c | 111 mariadb-10.11.13/mysys/my_pread.c | 9 mariadb-10.11.13/mysys/my_virtual_mem.c | 201 mariadb-10.11.13/plugin/auth_examples/auth_0x0100.c | 4 mariadb-10.11.13/plugin/server_audit/server_audit.c | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.result | 23 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.test | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc | 13 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result | 12 mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result | 12 mariadb-10.11.13/plugin/userstat/client_stats.cc | 4 
mariadb-10.11.13/plugin/versioning/versioning.cc | 1 mariadb-10.11.13/scripts/mysqlhotcopy.sh | 2 mariadb-10.11.13/scripts/wsrep_sst_common.sh | 13 mariadb-10.11.13/scripts/wsrep_sst_mariabackup.sh | 2 mariadb-10.11.13/scripts/wsrep_sst_mysqldump.sh | 4 mariadb-10.11.13/scripts/wsrep_sst_rsync.sh | 2 mariadb-10.11.13/sql/filesort.cc | 49 mariadb-10.11.13/sql/ha_partition.cc | 40 mariadb-10.11.13/sql/ha_sequence.cc | 6 mariadb-10.11.13/sql/ha_sequence.h | 3 mariadb-10.11.13/sql/handle_connections_win.cc | 3 mariadb-10.11.13/sql/handler.cc | 71 mariadb-10.11.13/sql/handler.h | 4 mariadb-10.11.13/sql/item.cc | 14 mariadb-10.11.13/sql/item.h | 47 mariadb-10.11.13/sql/item_cmpfunc.h | 30 mariadb-10.11.13/sql/item_func.cc | 10 mariadb-10.11.13/sql/item_func.h | 7 mariadb-10.11.13/sql/item_geofunc.cc | 26 mariadb-10.11.13/sql/item_jsonfunc.cc | 114 mariadb-10.11.13/sql/item_strfunc.cc | 10 mariadb-10.11.13/sql/item_subselect.cc | 24 mariadb-10.11.13/sql/item_subselect.h | 1 mariadb-10.11.13/sql/lex_string.h | 2 mariadb-10.11.13/sql/log.cc | 41 mariadb-10.11.13/sql/log.h | 1 mariadb-10.11.13/sql/mysql_install_db.cc | 23 mariadb-10.11.13/sql/mysql_upgrade_service.cc | 129 mariadb-10.11.13/sql/mysqld.cc | 41 mariadb-10.11.13/sql/mysqld.h | 1 mariadb-10.11.13/sql/net_serv.cc | 25 mariadb-10.11.13/sql/opt_range.cc | 91 mariadb-10.11.13/sql/opt_range.h | 29 mariadb-10.11.13/sql/rpl_injector.h | 1 mariadb-10.11.13/sql/rpl_mi.cc | 67 mariadb-10.11.13/sql/rpl_mi.h | 11 mariadb-10.11.13/sql/rpl_parallel.cc | 16 mariadb-10.11.13/sql/semisync_master.cc | 4 mariadb-10.11.13/sql/semisync_slave.cc | 9 mariadb-10.11.13/sql/semisync_slave.h | 2 mariadb-10.11.13/sql/signal_handler.cc | 2 mariadb-10.11.13/sql/slave.cc | 72 mariadb-10.11.13/sql/sp_head.cc | 12 mariadb-10.11.13/sql/sql_acl.cc | 17 mariadb-10.11.13/sql/sql_base.cc | 257 mariadb-10.11.13/sql/sql_base.h | 21 mariadb-10.11.13/sql/sql_cache.cc | 2 mariadb-10.11.13/sql/sql_class.cc | 18 mariadb-10.11.13/sql/sql_class.h | 5 
mariadb-10.11.13/sql/sql_cmd.h | 1 mariadb-10.11.13/sql/sql_db.cc | 46 mariadb-10.11.13/sql/sql_db.h | 4 mariadb-10.11.13/sql/sql_error.cc | 22 mariadb-10.11.13/sql/sql_insert.cc | 145 mariadb-10.11.13/sql/sql_insert.h | 2 mariadb-10.11.13/sql/sql_lex.cc | 45 mariadb-10.11.13/sql/sql_lex.h | 6 mariadb-10.11.13/sql/sql_parse.cc | 19 mariadb-10.11.13/sql/sql_prepare.cc | 4 mariadb-10.11.13/sql/sql_priv.h | 1 mariadb-10.11.13/sql/sql_reload.cc | 2 mariadb-10.11.13/sql/sql_select.cc | 164 mariadb-10.11.13/sql/sql_show.cc | 31 mariadb-10.11.13/sql/sql_statistics.cc | 11 mariadb-10.11.13/sql/sql_string.h | 2 mariadb-10.11.13/sql/sql_table.cc | 85 mariadb-10.11.13/sql/sql_trigger.cc | 7 mariadb-10.11.13/sql/sql_truncate.cc | 35 mariadb-10.11.13/sql/sql_update.cc | 5 mariadb-10.11.13/sql/sql_view.cc | 15 mariadb-10.11.13/sql/sql_yacc.yy | 2 mariadb-10.11.13/sql/structs.h | 2 mariadb-10.11.13/sql/sys_vars.cc | 11 mariadb-10.11.13/sql/table.cc | 48 mariadb-10.11.13/sql/table.h | 13 mariadb-10.11.13/sql/vers_string.h | 2 mariadb-10.11.13/sql/wsrep_applier.cc | 15 mariadb-10.11.13/sql/wsrep_client_service.cc | 6 mariadb-10.11.13/sql/wsrep_high_priority_service.cc | 4 mariadb-10.11.13/sql/wsrep_mysqld.cc | 114 mariadb-10.11.13/sql/wsrep_mysqld.h | 3 mariadb-10.11.13/sql/wsrep_server_service.cc | 1 mariadb-10.11.13/sql/wsrep_sst.cc | 11 mariadb-10.11.13/sql/wsrep_thd.h | 66 mariadb-10.11.13/sql/wsrep_trans_observer.h | 15 mariadb-10.11.13/sql/wsrep_var.cc | 55 mariadb-10.11.13/sql/wsrep_var.h | 3 mariadb-10.11.13/sql/wsrep_xid.cc | 43 mariadb-10.11.13/sql/wsrep_xid.h | 4 mariadb-10.11.13/sql/yy_mariadb.cc | 2 mariadb-10.11.13/sql/yy_oracle.cc | 2 mariadb-10.11.13/storage/connect/CMakeLists.txt | 6 mariadb-10.11.13/storage/connect/connect.cc | 8 mariadb-10.11.13/storage/connect/plgxml.h | 4 mariadb-10.11.13/storage/connect/tabxml.cpp | 3 mariadb-10.11.13/storage/connect/user_connect.cc | 19 mariadb-10.11.13/storage/federatedx/federatedx_io.cc | 1 
mariadb-10.11.13/storage/federatedx/ha_federatedx.cc | 23 mariadb-10.11.13/storage/innobase/CMakeLists.txt | 1 mariadb-10.11.13/storage/innobase/btr/btr0sea.cc | 104 mariadb-10.11.13/storage/innobase/buf/buf0buddy.cc | 327 mariadb-10.11.13/storage/innobase/buf/buf0buf.cc | 2299 +--- mariadb-10.11.13/storage/innobase/buf/buf0dblwr.cc | 73 mariadb-10.11.13/storage/innobase/buf/buf0dump.cc | 8 mariadb-10.11.13/storage/innobase/buf/buf0flu.cc | 239 mariadb-10.11.13/storage/innobase/buf/buf0lru.cc | 139 mariadb-10.11.13/storage/innobase/buf/buf0rea.cc | 7 mariadb-10.11.13/storage/innobase/dict/dict0defrag_bg.cc | 116 mariadb-10.11.13/storage/innobase/dict/dict0dict.cc | 244 mariadb-10.11.13/storage/innobase/dict/dict0load.cc | 2 mariadb-10.11.13/storage/innobase/dict/dict0stats.cc | 730 - mariadb-10.11.13/storage/innobase/dict/dict0stats_bg.cc | 22 mariadb-10.11.13/storage/innobase/fsp/fsp0fsp.cc | 33 mariadb-10.11.13/storage/innobase/fts/fts0config.cc | 2 mariadb-10.11.13/storage/innobase/fts/fts0fts.cc | 13 mariadb-10.11.13/storage/innobase/fts/fts0opt.cc | 2 mariadb-10.11.13/storage/innobase/gis/gis0sea.cc | 24 mariadb-10.11.13/storage/innobase/handler/ha_innodb.cc | 1556 +- mariadb-10.11.13/storage/innobase/handler/ha_innodb.h | 3 mariadb-10.11.13/storage/innobase/handler/handler0alter.cc | 138 mariadb-10.11.13/storage/innobase/handler/i_s.cc | 130 mariadb-10.11.13/storage/innobase/ibuf/ibuf0ibuf.cc | 30 mariadb-10.11.13/storage/innobase/include/btr0sea.h | 10 mariadb-10.11.13/storage/innobase/include/buf0buddy.h | 40 mariadb-10.11.13/storage/innobase/include/buf0buf.h | 446 mariadb-10.11.13/storage/innobase/include/buf0buf.inl | 2 mariadb-10.11.13/storage/innobase/include/buf0dblwr.h | 3 mariadb-10.11.13/storage/innobase/include/buf0lru.h | 4 mariadb-10.11.13/storage/innobase/include/dict0dict.h | 53 mariadb-10.11.13/storage/innobase/include/dict0dict.inl | 4 mariadb-10.11.13/storage/innobase/include/dict0mem.h | 105 
mariadb-10.11.13/storage/innobase/include/dict0stats.h | 141 mariadb-10.11.13/storage/innobase/include/dict0stats.inl | 219 mariadb-10.11.13/storage/innobase/include/fil0fil.h | 9 mariadb-10.11.13/storage/innobase/include/fsp0fsp.h | 6 mariadb-10.11.13/storage/innobase/include/ibuf0ibuf.h | 10 mariadb-10.11.13/storage/innobase/include/log0log.h | 191 mariadb-10.11.13/storage/innobase/include/log0recv.h | 12 mariadb-10.11.13/storage/innobase/include/mtr0mtr.h | 9 mariadb-10.11.13/storage/innobase/include/os0file.h | 2 mariadb-10.11.13/storage/innobase/include/row0row.h | 16 mariadb-10.11.13/storage/innobase/include/row0row.inl | 49 mariadb-10.11.13/storage/innobase/include/row0sel.h | 5 mariadb-10.11.13/storage/innobase/include/srv0srv.h | 21 mariadb-10.11.13/storage/innobase/include/trx0trx.h | 26 mariadb-10.11.13/storage/innobase/include/trx0types.h | 9 mariadb-10.11.13/storage/innobase/include/ut0new.h | 1 mariadb-10.11.13/storage/innobase/lock/lock0lock.cc | 72 mariadb-10.11.13/storage/innobase/log/log0crypt.cc | 2 mariadb-10.11.13/storage/innobase/log/log0log.cc | 283 mariadb-10.11.13/storage/innobase/log/log0recv.cc | 159 mariadb-10.11.13/storage/innobase/mtr/mtr0mtr.cc | 272 mariadb-10.11.13/storage/innobase/os/os0file.cc | 22 mariadb-10.11.13/storage/innobase/pars/pars0pars.cc | 5 mariadb-10.11.13/storage/innobase/row/row0ins.cc | 151 mariadb-10.11.13/storage/innobase/row/row0log.cc | 13 mariadb-10.11.13/storage/innobase/row/row0mysql.cc | 20 mariadb-10.11.13/storage/innobase/row/row0purge.cc | 2 mariadb-10.11.13/storage/innobase/row/row0sel.cc | 120 mariadb-10.11.13/storage/innobase/row/row0uins.cc | 10 mariadb-10.11.13/storage/innobase/row/row0umod.cc | 7 mariadb-10.11.13/storage/innobase/row/row0upd.cc | 4 mariadb-10.11.13/storage/innobase/srv/srv0mon.cc | 17 mariadb-10.11.13/storage/innobase/srv/srv0srv.cc | 35 mariadb-10.11.13/storage/innobase/srv/srv0start.cc | 46 mariadb-10.11.13/storage/innobase/trx/trx0purge.cc | 83 
mariadb-10.11.13/storage/innobase/trx/trx0rec.cc | 26 mariadb-10.11.13/storage/innobase/trx/trx0trx.cc | 5 mariadb-10.11.13/storage/innobase/ut/ut0rnd.cc | 2 mariadb-10.11.13/storage/maria/ma_control_file.c | 45 mariadb-10.11.13/storage/maria/ma_pagecache.c | 8 mariadb-10.11.13/storage/maria/ma_unique.c | 6 mariadb-10.11.13/storage/mroonga/CMakeLists.txt | 2 mariadb-10.11.13/storage/mroonga/ha_mroonga.cpp | 8 mariadb-10.11.13/storage/mroonga/vendor/groonga/CMakeLists.txt | 2 mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/db.c | 4 mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/hash.c | 14 mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/ii.c | 4 mariadb-10.11.13/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt | 2 mariadb-10.11.13/storage/myisam/mi_unique.c | 6 mariadb-10.11.13/storage/rocksdb/build_rocksdb.cmake | 112 mariadb-10.11.13/storage/rocksdb/ha_rocksdb.cc | 11 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result | 10 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result | 10 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result | 2 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result | 2 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result | 10 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc | 15 mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result | 20 mariadb-10.11.13/storage/rocksdb/rdb_i_s.cc | 5 mariadb-10.11.13/storage/rocksdb/rdb_source_revision.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/config.yml | 872 + mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 | 35 
mariadb-10.11.13/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/.gitignore | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/.travis.yml | 256 mariadb-10.11.13/storage/rocksdb/rocksdb/CMakeLists.txt | 523 mariadb-10.11.13/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/HISTORY.md | 836 + mariadb-10.11.13/storage/rocksdb/rocksdb/INSTALL.md | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/Makefile | 1873 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/PLUGINS.md | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/README.md | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/TARGETS | 1090 +- mariadb-10.11.13/storage/rocksdb/rocksdb/USERS.md | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/WINDOWS_PORT.md | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/appveyor.yml | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_builder.py | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_cfg.py | 125 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/build_detect_platform | 282 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/check-sources.sh | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/dependencies_platform009.sh | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config.sh | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config4.8.1.sh | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config_platform007.sh | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/fbcode_config_platform009.sh | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/format-diff.sh | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/gnu_parallel | 52 
mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/make_package.sh | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator | 1063 + mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench.cc | 275 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc | 794 + mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.h | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_stats.h | 183 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_helpers.h | 125 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.cc | 271 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.h | 132 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc | 188 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h | 191 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc | 506 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_test.cc | 129 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/clock_cache.cc | 180 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.cc | 496 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.h | 192 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache_test.cc | 1660 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.cc | 112 mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.h | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake | 29 
mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/coverage_test.sh | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc | 115 mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_constants.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc | 326 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h | 37 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc | 156 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc | 210 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc | 375 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc | 672 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc | 102 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h | 52 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h | 57 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc | 62 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h | 170 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc | 582 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h | 106 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc | 974 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc | 100 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h | 102 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc | 196 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_index.h | 187 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc | 145 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.h | 149 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc | 132 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc | 172 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc | 1026 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc | 718 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc | 82 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc | 572 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob_index.h | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.cc | 335 mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.h | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/db/c.cc | 1248 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/c_test.c | 1230 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.cc | 338 mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.h | 176 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family_test.cc | 264 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compact_files_test.cc | 126 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.cc | 160 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.h | 113 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h | 275 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc | 258 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.cc | 140 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc | 853 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h | 289 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc | 500 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc | 1893 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.h | 253 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc | 61 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc | 478 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc | 115 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc | 182 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc | 1111 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc | 437 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc | 825 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/file_pri.h | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/db/comparator_db_test.cc | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/db/convenience.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/db/corruption_test.cc | 541 mariadb-10.11.13/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_basic_test.cc | 2583 +++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_blob_index_test.cc | 436 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_block_cache_test.cc | 1228 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc | 952 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc | 289 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_test.cc | 3058 ++++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_encryption_test.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_filesnapshot.cc | 413 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_flush_test.cc | 1958 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h | 118 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc | 1790 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.h | 559 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc | 1399 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc | 465 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc | 858 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc | 245 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc | 601 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc | 869 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.cc | 63 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_io_failure_test.cc | 135 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.cc | 686 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.h | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_test.cc | 581 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iterator_test.cc | 402 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc | 197 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_log_iter_test.cc | 127 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc | 513 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_memtable_test.cc | 89 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc | 264 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_options_test.cc | 473 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_properties_test.cc | 520 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_range_del_test.cc | 407 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_secondary_test.cc | 1260 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_sst_test.cc | 640 + 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_statistics_test.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_table_properties_test.cc | 347 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test.cc | 1522 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test2.cc | 2788 ++++- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.cc | 344 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.h | 475 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc | 383 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_wal_test.cc | 1151 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc | 3217 +++++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc | 121 mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc | 793 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_test.cc | 199 mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.cc | 81 mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.h | 233 mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat_test.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/db/deletefile_test.cc | 108 mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.cc | 504 mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.h | 68 mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc | 2663 ++++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_test.cc | 871 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.cc | 167 mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.h | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc | 732 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc | 389 mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_test.cc | 531 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/fault_injection_test.cc | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/db/filename_test.cc | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.cc | 688 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.h | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job_test.cc | 358 mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_scheduler.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.cc | 99 mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.cc | 80 mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.h | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_test.cc | 151 mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.cc | 559 - mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.h | 167 mariadb-10.11.13/storage/rocksdb/rocksdb/db/job_context.h | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/kv_checksum.h | 394 mariadb-10.11.13/storage/rocksdb/rocksdb/db/listener_test.cc | 681 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.cc | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.h | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_test.cc | 171 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.cc | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/db/lookup_key.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/db/malloc_stats.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/db/manual_compaction_test.cc | 209 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.cc | 353 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.h | 163 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.cc | 439 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.h | 109 mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list_test.cc | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_context.h | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.cc | 97 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.h | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper_test.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_test.cc | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/db/obsolete_files_test.cc | 161 mariadb-10.11.13/storage/rocksdb/rocksdb/db/options_file_test.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/db/perf_context_test.cc | 108 mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc | 117 mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h | 78 mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc | 236 mariadb-10.11.13/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/db/plain_table_db_test.cc | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/db/pre_release_callback.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/prefix_test.cc | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/db/read_callback.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair.cc | 227 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair_test.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/db/snapshot_impl.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.h | 121 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.h | 88 mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.cc | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.cc | 1324 +- mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder_test.cc | 1452 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.cc | 366 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.h | 241 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.cc | 980 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.h | 309 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_test.cc | 373 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.cc | 2965 ++--- mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.h | 477 mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set_test.cc | 2140 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.cc | 204 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.h | 166 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit_test.cc | 214 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.cc | 78 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager_test.cc | 99 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch.cc | 963 + mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_internal.h | 181 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_test.cc | 414 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_callback_test.cc | 501 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.cc | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.h | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller_test.cc | 328 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.cc | 31 mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc | 138 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h | 125 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc | 289 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h | 207 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc | 1122 +- 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc | 616 + mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h | 287 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc | 1037 + mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h | 302 mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc | 282 mariadb-10.11.13/storage/rocksdb/rocksdb/defs.bzl | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile.lock | 331 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_config.yml | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/authors.yml | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/nav.yml | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_docs/getting-started.md | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_includes/doc.html | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown | 17 
mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown | 195 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown | 157 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown | 281 mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_top-level/support.md | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env.cc | 464 mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env_wrapper.h | 1101 -- mariadb-10.11.13/storage/rocksdb/rocksdb/env/emulated_clock.h | 114 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env.cc | 960 + mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_basic_test.cc | 204 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.cc | 349 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.h | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption.cc | 1385 +- mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption_ctr.h | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_hdfs.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_posix.cc | 340 mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_test.cc | 1320 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system.cc | 189 mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.cc | 519 mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.h | 447 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_posix.cc | 416 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_readonly.h | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.cc | 306 mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.h | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.cc | 518 mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.h | 141 mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix_test.cc | 140 mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.cc | 742 - mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.h | 191 
mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env_test.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.cc | 164 mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/CMakeLists.txt | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/Makefile | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/c_simple_example.c | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/column_families_example.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compact_files_example.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/multi_processes_example.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/options_file_example.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/simple_example.cc | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/examples/transaction_example.cc | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.cc | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.h | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h | 118 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.cc | 208 mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.cc | 114 mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.cc | 68 mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.h | 59 mariadb-10.11.13/storage/rocksdb/rocksdb/file/prefetch_test.cc | 1004 + mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.cc | 363 
mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.h | 120 mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc | 483 mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.cc | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_file_info.h | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.cc | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.h | 84 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc | 104 mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h | 57 mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.cc | 599 - mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.h | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/Makefile | 61 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/README.md | 160 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc | 164 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc | 185 mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/util.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/hdfs/env_hdfs.h | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h | 286 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/c.h | 618 + mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache.h | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h | 146 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/comparator.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/configurable.h | 397 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/convenience.h | 199 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/customizable.h | 233 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/db.h | 420 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env.h | 465 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h | 465 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_system.h | 423 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h | 150 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/io_status.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iterator.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/listener.h | 342 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/metadata.h | 278 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/options.h | 534 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/statistics.h | 196 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/status.h | 222 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table.h | 308 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h | 247 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h | 187 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/types.h | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h | 616 + 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h | 335 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h | 368 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h | 473 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h | 946 + mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h | 62 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h | 145 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h | 5 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/version.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/java/CMakeLists.txt | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/java/Makefile | 213 mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/README.md | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/pom.xml | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java | 100 mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/java/pom.xml.template | 178 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni.pom | 150 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/cache.cc | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc | 88 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc | 502 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h | 122 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc | 23 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options.cc | 1388 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/portal.h | 1215 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc | 735 + mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/slice.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/table.cc | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc | 216 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java | 31 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java | 334 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java | 148 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java | 254 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java | 167 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java | 448 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java | 44 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java | 47 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java | 227 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java | 295 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java | 335 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java | 112 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java | 186 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java | 60 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java | 157 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java | 35 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java | 109 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java | 256 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java | 387 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java | 271 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java | 805 + mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java | 107 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java | 112 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java | 63 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java | 313 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java | 126 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java | 513 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java | 130 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java | 763 + mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java | 654 - mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java | 525 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java | 397 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java | 188 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java | 191 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java | 62 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java | 21 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java | 135 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java | 193 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/java/understanding_options.md | 79 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc | 79 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc | 154 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger_test.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/event_logger.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/logging.h | 
4 mariadb-10.11.13/storage/rocksdb/rocksdb/logging/posix_logger.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/concurrent_arena.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc | 281 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator.cc | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc | 243 mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_usage.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist.h | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplist_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplistrep.cc | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/vectorrep.cc | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc | 225 mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc | 236 mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/CMakeLists.txt | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc | 156 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.cc | 26 
mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_test.cc | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context.cc | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.cc | 116 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics_test.cc | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc | 396 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.cc | 935 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.h | 145 mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable.cc | 785 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_helper.h | 187 mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.cc | 880 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.h | 126 
mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable.cc | 137 mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable_test.cc | 2132 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.cc | 785 + mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.h | 53 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options.cc | 91 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.cc | 2621 +--- mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.h | 204 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.cc | 559 - mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.h | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.cc | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.h | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_settable_test.cc | 189 mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_test.cc | 3434 +++++- mariadb-10.11.13/storage/rocksdb/rocksdb/plugin/README.md | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/port/jemalloc_helper.h | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/port/lang.h | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_example.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.cc | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.h | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.cc | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/port/sys_time.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_default.cc | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.cc | 1027 - mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.h | 413 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.cc | 603 - mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.h | 270 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.cc | 126 
mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.h | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.cc | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.cc | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/src.mk | 239 mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc | 73 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h | 48 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.cc | 499 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.h | 339 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc | 1682 ++- mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc | 754 - mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h | 152 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc | 382 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h | 273 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc | 3001 +---- mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h | 503 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h | 163 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc | 357 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.cc | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h | 225 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc | 100 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h | 66 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_test.cc | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_type.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc | 75 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block.h | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc | 1057 + mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h | 117 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc | 64 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc | 95 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc | 147 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h | 49 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.cc | 47 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc | 55 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h | 85 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc | 252 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc | 162 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h | 159 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc | 207 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.cc | 52 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.h | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.cc | 257 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.h | 69 mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher_test.cc | 521 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc | 79 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h | 27 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc | 134 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.cc | 496 mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.h | 261 mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.cc | 192 mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/table/internal_iterator.h | 67 mariadb-10.11.13/storage/rocksdb/rocksdb/table/iterator_wrapper.h | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/table/merger_test.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.cc | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.cc | 363 mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.cc | 255 mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.h | 178 mariadb-10.11.13/storage/rocksdb/rocksdb/table/multiget_context.h | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_options.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc | 340 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h | 61 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.cc | 502 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.h | 97 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader.cc | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc | 266 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer.cc | 185 mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_builder.h | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_factory.cc | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties.cc | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties_internal.h | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader.h | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_bench.cc | 65 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_caller.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_test.cc | 2268 ++-- mariadb-10.11.13/storage/rocksdb/rocksdb/table/two_level_iterator.cc | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id.cc | 166 mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id_impl.h | 59 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc | 437 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h | 225 
mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.cc | 38 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.h | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.h | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc | 44 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.h | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.cc | 58 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.h | 71 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.cc | 464 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.h | 677 - mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil_test.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h | 118 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/gcc/ppc-asm.h | 390 mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/gtest-1.8.1/fused-src/gtest/CMakeLists.txt | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/CMakeLists.txt | 35 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/advisor/README.md | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/backup_db.sh | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/benchmark.sh | 343 
mariadb-10.11.13/storage/rocksdb/rocksdb/tools/blob_dump.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc | 18 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/check_all_python.py | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/check_format_compatible.sh | 361 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_bench.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_bench_tool.cc | 2086 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_bench_tool_test.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_crashtest.py | 442 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/db_repl_stress.cc | 121 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser.cc | 25 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser_test.cc | 189 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser_tool.cc | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/io_tracer_parser_tool.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_cmd.cc | 932 + mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_cmd_impl.h | 78 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_cmd_test.cc | 539 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_test.py | 208 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/ldb_tool.cc | 37 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/rdb/db_wrapper.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/rdb/db_wrapper.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/rdb/rdb.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/reduce_levels_test.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/regression_test.sh | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/report_lite_binary_size.sh | 42 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/restore_db.sh | 
15 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/run_blob_bench.sh | 195 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/simulated_hybrid_file_system.cc | 246 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/simulated_hybrid_file_system.h | 126 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump.cc | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump_test.cc | 245 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump_tool.cc | 660 - mariadb-10.11.13/storage/rocksdb/rocksdb/tools/sst_dump_tool_imp.h | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/trace_analyzer_test.cc | 279 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/trace_analyzer_tool.cc | 585 - mariadb-10.11.13/storage/rocksdb/rocksdb/tools/trace_analyzer_tool.h | 142 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/write_external_sst.sh | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/write_stress.cc | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/tools/write_stress_runner.py | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/block_cache_tracer.h | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/block_cache_tracer_test.cc | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/io_tracer.cc | 303 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/io_tracer.h | 185 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/io_tracer_test.cc | 352 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record.cc | 206 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record_handler.cc | 190 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record_handler.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_record_result.cc | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_replay.cc | 817 - mariadb-10.11.13/storage/rocksdb/rocksdb/trace_replay/trace_replay.h | 172 mariadb-10.11.13/storage/rocksdb/rocksdb/util/aligned_buffer.h | 11 
mariadb-10.11.13/storage/rocksdb/rocksdb/util/autovector.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/util/autovector_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/bloom_impl.h | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/util/bloom_test.cc | 724 + mariadb-10.11.13/storage/rocksdb/rocksdb/util/build_version.cc.in | 74 mariadb-10.11.13/storage/rocksdb/rocksdb/util/build_version.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/util/cast_util.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/util/channel.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/coding.h | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/util/coding_lean.h | 101 mariadb-10.11.13/storage/rocksdb/rocksdb/util/coding_test.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/compaction_job_stats_impl.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/util/comparator.cc | 90 mariadb-10.11.13/storage/rocksdb/rocksdb/util/compression.h | 221 mariadb-10.11.13/storage/rocksdb/rocksdb/util/compression_context_cache.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c.cc | 221 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_arm64.cc | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_arm64.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_ppc.c | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_ppc.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_ppc_asm.S | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/crc32c_test.cc | 47 mariadb-10.11.13/storage/rocksdb/rocksdb/util/defer.h | 31 mariadb-10.11.13/storage/rocksdb/rocksdb/util/defer_test.cc | 11 mariadb-10.11.13/storage/rocksdb/rocksdb/util/duplicate_detector.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/dynamic_bloom.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/util/dynamic_bloom_test.cc | 10 mariadb-10.11.13/storage/rocksdb/rocksdb/util/fastrange.h | 114 mariadb-10.11.13/storage/rocksdb/rocksdb/util/file_checksum_helper.cc | 95 
mariadb-10.11.13/storage/rocksdb/rocksdb/util/file_checksum_helper.h | 95 mariadb-10.11.13/storage/rocksdb/rocksdb/util/file_reader_writer_test.cc | 613 - mariadb-10.11.13/storage/rocksdb/rocksdb/util/filelock_test.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/util/filter_bench.cc | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/util/gflags_compat.h | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash.cc | 128 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash.h | 103 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash128.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash_map.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/hash_test.cc | 545 - mariadb-10.11.13/storage/rocksdb/rocksdb/util/heap.h | 7 mariadb-10.11.13/storage/rocksdb/rocksdb/util/kv_map.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/log_write_bench.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/util/math.h | 242 mariadb-10.11.13/storage/rocksdb/rocksdb/util/math128.h | 310 mariadb-10.11.13/storage/rocksdb/rocksdb/util/murmurhash.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/murmurhash.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/util/mutexlock.h | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/util/random.cc | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/util/random.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/util/rate_limiter.cc | 439 mariadb-10.11.13/storage/rocksdb/rocksdb/util/rate_limiter.h | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/util/rate_limiter_test.cc | 386 mariadb-10.11.13/storage/rocksdb/rocksdb/util/regex.cc | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/util/repeatable_thread.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/util/repeatable_thread_test.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_alg.h | 1225 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_config.cc | 506 mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_config.h | 182 mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_impl.h | 1137 ++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/util/ribbon_test.cc | 1308 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/util/set_comparator.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/slice.cc | 223 mariadb-10.11.13/storage/rocksdb/rocksdb/util/slice_test.cc | 54 mariadb-10.11.13/storage/rocksdb/rocksdb/util/status.cc | 37 mariadb-10.11.13/storage/rocksdb/rocksdb/util/stop_watch.h | 36 mariadb-10.11.13/storage/rocksdb/rocksdb/util/string_util.cc | 106 mariadb-10.11.13/storage/rocksdb/rocksdb/util/string_util.h | 55 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_guard.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_list_test.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_local.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_local.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/util/thread_local_test.cc | 51 mariadb-10.11.13/storage/rocksdb/rocksdb/util/threadpool_imp.cc | 72 mariadb-10.11.13/storage/rocksdb/rocksdb/util/threadpool_imp.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/util/timer.h | 331 mariadb-10.11.13/storage/rocksdb/rocksdb/util/timer_test.cc | 402 mariadb-10.11.13/storage/rocksdb/rocksdb/util/user_comparator_wrapper.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/util/util.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/util/vector_iterator.h | 46 mariadb-10.11.13/storage/rocksdb/rocksdb/util/work_queue.h | 150 mariadb-10.11.13/storage/rocksdb/rocksdb/util/work_queue_test.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxh3p.h | 1648 --- mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxhash.cc | 1181 -- mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxhash.h | 5444 +++++++++- mariadb-10.11.13/storage/rocksdb/rocksdb/util/xxph3.h | 1762 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/backupable/backupable_db.cc | 2543 +++- mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/backupable/backupable_db_impl.h | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/backupable/backupable_db_test.cc | 
2557 ++++ mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.cc | 409 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_compaction_filter.h | 130 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db.cc | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_gc_stats.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.cc | 268 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_impl.h | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_impl_filesnapshot.cc | 12 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_iterator.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_listener.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc | 567 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.cc | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_dump_tool.h | 3 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_file.cc | 106 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_file.h | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_format.cc | 149 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_format.h | 133 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_reader.cc | 105 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_reader.h | 82 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_writer.cc | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/blob_db/blob_log_writer.h | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cache_dump_load.cc | 69 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc | 489 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cache_dump_load_impl.h | 365 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.cc | 73 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_compaction_filter.h | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_format_test.cc | 28 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_functional_test.cc | 206 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_options.h | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_row_merge_test.cc | 22 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/cassandra_serialize_test.cc | 1 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/format.h | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/merge_operator.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/merge_operator.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/serialize.h | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/test_utils.cc | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/cassandra/test_utils.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc | 340 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h | 34 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc | 173 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters.cc | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters/layered_compaction_filter_base.h | 41 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/convenience/info_log_finder.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/debug.cc | 12 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_librados.cc | 104 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_librados.md | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_librados_test.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_mirror.cc | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_timed.cc | 286 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_timed.h | 97 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/env_timed_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_env.cc | 548 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_env.h | 258 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_fs.cc | 994 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_fs.h | 582 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc | 110 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h | 94 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/memory/memory_test.cc | 50 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/memory_allocators.h | 104 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators.cc | 120 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators.h | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/bytesxor.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/bytesxor.h | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/max.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/put.cc | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/sortlist.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/sortlist.h | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.cc | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend.h | 9 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend2.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc | 252 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/merge_operators/uint64add.cc | 24 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/object_registry.cc | 227 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/object_registry_test.cc | 619 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/options/options_util.cc | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/options/options_util_test.cc | 519 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.cc | 5 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier.h | 19 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.cc | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_file.h | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/block_cache_tier_metadata.h | 8 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/hash_table_evictable.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_bench.cc | 17 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.cc | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_test.h | 9 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.cc | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/persistent_cache_tier.h | 12 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.cc | 6 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc | 32 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator_test.cc | 2 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc | 84 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc | 57 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc | 199 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h | 14 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc | 138 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.cc | 43 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/file_trace_reader_writer.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/replayer_impl.cc | 316 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/trace/replayer_impl.h | 86 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.cc | 29 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/lock_manager.h | 82 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/lock_tracker.h | 209 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.cc | 718 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager.h | 223 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.cc | 181 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_manager_test.h | 319 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.cc | 270 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/point/point_lock_tracker.h | 99 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_lock_manager.h | 30 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_locking_test.cc | 422 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.AGPLv3 | 661 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.APACHEv2 | 174 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/COPYING.GPLv2 | 339 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/README | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/db.h | 76 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h | 138 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h | 102 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc | 139 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.h | 174 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc | 222 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h | 141 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc | 525 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h | 253 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc | 1024 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h | 580 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc | 527 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc | 265 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h | 178 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc | 520 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h | 302 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc | 120 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h | 92 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc | 213 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h | 124 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/memory.h | 215 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h | 39 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h | 130 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h | 286 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h | 87 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h | 520 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h | 179 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h | 176 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h | 27 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc | 132 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc | 153 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/dbt.h | 98 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h | 144 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc | 201 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/memarena.h | 141 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt.h | 794 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h | 1295 ++ mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h | 165 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/util/status.h | 76 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc | 503 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h | 137 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc | 156 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h | 146 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/optimistic_transaction.cc | 33 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h | 16 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc | 672 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc | 200 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h | 15 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc | 107 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h | 20 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_base.cc | 267 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_base.h | 56 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_lock_mgr.cc | 745 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_lock_mgr.h | 158 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_test.cc | 568 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_test.h | 45 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_util.cc | 70 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/transaction_util.h | 52 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc | 772 + mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc | 83 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.cc | 78 
mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h | 13 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_transaction_test.cc | 105 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc | 143 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.h | 4 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.cc | 26 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn_db.h | 40 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc | 375 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.h | 231 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/ttl/ttl_test.cc | 276 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/wal_filter.cc | 23 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc | 669 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc | 655 - mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h | 196 mariadb-10.11.13/storage/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc | 1766 ++- mariadb-10.11.13/storage/spider/CMakeLists.txt | 11 mariadb-10.11.13/storage/spider/ha_spider.cc | 27 mariadb-10.11.13/storage/spider/mysql-test/spider/bg/t/basic_sql.test | 3 mariadb-10.11.13/storage/spider/mysql-test/spider/bg/t/ha.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bg/t/ha_part.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/include/direct_sql_with_comma_pwd_init.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/include/direct_sql_with_tmp_table_init.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/direct_sql_with_comma_pwd.result | 4 
mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/direct_sql_with_tmp_table.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_26345.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29002.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29163.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29502.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29605.result | 19 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_29962.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_30392.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_30408.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_31338.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_31645.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_34003.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_35807.result | 16 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_35874.result | 51 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/mdev_35959.result | 25 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/subquery.result | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/udf_mysql_func_early.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/r/udf_mysql_func_early_init_file.result | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/checksum_table_with_quick_mode_3.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/cp932_column.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/delete_with_float_column.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/group_by_order_by_limit.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/insert_select.test | 2 
mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_19866.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_20100.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_20502.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_21884.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_26345.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_27172.test | 8 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29002.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29008.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29163.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29502.test | 5 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29605.test | 25 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_29962.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30392.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30408.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30649.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_30727.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_31338.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_31645.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_34003.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_34659.test | 3 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_35807.test | 21 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_35874.test | 53 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/mdev_35959.test | 30 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_0.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_1.test | 12 
mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_2.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/quick_mode_3.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/return_found_rows_insert.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/return_found_rows_update.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/select_by_null.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/select_with_backquote.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/slave_trx_isolation.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/sql_mode.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/strict_group_by.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/subquery.test | 1 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/udf_mysql_func_early.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/wrapper_mariadb.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/bugfix/t/xa_cmd.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/feature/r/pushdown_case.result | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/feature/t/checksum_table_parallel.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/feature/t/pushdown_case.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e1121/t/direct_join_by_pkey_key.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e1121/t/direct_join_by_pkey_pkey.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e1121/t/load_data.inc | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e112122/t/group_by_order_by_limit_ok.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/regression/e112122/t/load_data_part.inc | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/auto_increment.test | 4 
mariadb-10.11.13/storage/spider/mysql-test/spider/t/checksum_table_with_quick_mode_3.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_join.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_join_using.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_join.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_right_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_left_right_left_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_join.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_left_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/direct_right_left_right_join_nullable.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/ha.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/ha_part.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/partition_cond_push.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/t/partition_fulltext.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/t/partition_join_pushdown_for_single_partition.test | 6 mariadb-10.11.13/storage/spider/mysql-test/spider/t/pushdown_not_like.test | 4 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_0.test | 10 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_1.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_2.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/t/quick_mode_3.test | 12 mariadb-10.11.13/storage/spider/mysql-test/spider/t/slave_trx_isolation.test | 2 mariadb-10.11.13/storage/spider/mysql-test/spider/t/timestamp.test | 28 mariadb-10.11.13/storage/spider/mysql-test/spider/t/udf_pushdown.inc | 4 
mariadb-10.11.13/storage/spider/spd_db_conn.cc | 39 mariadb-10.11.13/storage/spider/spd_db_include.h | 4 mariadb-10.11.13/storage/spider/spd_db_mysql.cc | 4 mariadb-10.11.13/storage/spider/spd_direct_sql.cc | 4 mariadb-10.11.13/storage/spider/spd_group_by_handler.cc | 10 mariadb-10.11.13/storage/spider/spd_table.cc | 4 mariadb-10.11.13/storage/spider/spd_trx.cc | 219 mariadb-10.11.13/storage/spider/spd_trx.h | 5 mariadb-10.11.13/strings/ctype-bin.c | 2 mariadb-10.11.13/strings/ctype-latin1.c | 3 mariadb-10.11.13/strings/ctype-mb.c | 2 mariadb-10.11.13/strings/ctype-simple.c | 2 mariadb-10.11.13/strings/ctype-uca.inl | 2 mariadb-10.11.13/strings/ctype-ucs2.c | 10 mariadb-10.11.13/strings/ctype-utf8.c | 4 mariadb-10.11.13/strings/json_lib.c | 10 mariadb-10.11.13/strings/strings_def.h | 2 mariadb-10.11.13/support-files/mariadb.service.in | 8 mariadb-10.11.13/support-files/mariadb@.service.in | 8 mariadb-10.11.13/support-files/rpm/server-prein.sh | 23 mariadb-10.11.13/tests/mysql_client_fw.c | 4 mariadb-10.11.13/tests/mysql_client_test.c | 210 mariadb-10.11.13/tpool/aio_liburing.cc | 10 mariadb-10.11.13/tpool/tpool_generic.cc | 1 mariadb-10.11.13/win/packaging/ca/CMakeLists.txt | 5 mariadb-10.11.13/win/upgrade_wizard/CMakeLists.txt | 20 mariadb-10.11.13/wsrep-lib/.github/workflows/cmake.yml | 71 mariadb-10.11.13/wsrep-lib/.gitignore | 3 mariadb-10.11.13/wsrep-lib/CMakeLists.txt | 2 mariadb-10.11.13/wsrep-lib/CONTRIBUTORS.txt | 1 mariadb-10.11.13/wsrep-lib/cmake/boost.cmake | 2 mariadb-10.11.13/wsrep-lib/include/wsrep/client_state.hpp | 6 mariadb-10.11.13/wsrep-lib/include/wsrep/connection_monitor_service.hpp | 71 mariadb-10.11.13/wsrep-lib/include/wsrep/id.hpp | 5 mariadb-10.11.13/wsrep-lib/include/wsrep/provider.hpp | 26 mariadb-10.11.13/wsrep-lib/include/wsrep/seqno.hpp | 5 mariadb-10.11.13/wsrep-lib/include/wsrep/server_state.hpp | 44 mariadb-10.11.13/wsrep-lib/include/wsrep/storage_service.hpp | 11 mariadb-10.11.13/wsrep-lib/include/wsrep/transaction.hpp | 6 
mariadb-10.11.13/wsrep-lib/include/wsrep/view.hpp | 4 mariadb-10.11.13/wsrep-lib/src/CMakeLists.txt | 1 mariadb-10.11.13/wsrep-lib/src/client_state.cpp | 16 mariadb-10.11.13/wsrep-lib/src/config_service_v1.cpp | 5 mariadb-10.11.13/wsrep-lib/src/connection_monitor_service_v1.cpp | 142 mariadb-10.11.13/wsrep-lib/src/connection_monitor_service_v1.hpp | 56 mariadb-10.11.13/wsrep-lib/src/id.cpp | 34 mariadb-10.11.13/wsrep-lib/src/provider.cpp | 7 mariadb-10.11.13/wsrep-lib/src/server_state.cpp | 47 mariadb-10.11.13/wsrep-lib/src/transaction.cpp | 21 mariadb-10.11.13/wsrep-lib/src/view.cpp | 2 mariadb-10.11.13/wsrep-lib/src/wsrep_provider_v26.cpp | 33 mariadb-10.11.13/wsrep-lib/src/wsrep_provider_v26.hpp | 2 mariadb-10.11.13/wsrep-lib/test/id_test.cpp | 54 mariadb-10.11.13/wsrep-lib/test/mock_provider.hpp | 22 mariadb-10.11.13/wsrep-lib/test/mock_server_state.hpp | 29 mariadb-10.11.13/wsrep-lib/test/test_utils.cpp | 16 mariadb-10.11.13/wsrep-lib/test/test_utils.hpp | 2 mariadb-10.11.13/wsrep-lib/test/transaction_test.cpp | 4 mariadb-10.11.13/wsrep-lib/test/transaction_test_2pc.cpp | 45 mariadb-10.11.13/wsrep-lib/test/transaction_test_xa.cpp | 29 mariadb-10.11.13/wsrep-lib/wsrep-API/v26/CONTRIBUTORS.txt | 1 mariadb-10.11.13/wsrep-lib/wsrep-API/v26/wsrep_connection_monitor_service.h | 134 2471 files changed, 239030 insertions(+), 65503 deletions(-) diff -Nru mariadb-10.11.11/CMakeLists.txt mariadb-10.11.13/CMakeLists.txt --- mariadb-10.11.11/CMakeLists.txt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/CMakeLists.txt 2025-05-19 16:14:23.000000000 +0000 @@ -14,7 +14,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA -CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12) +CMAKE_MINIMUM_REQUIRED(VERSION 2.8...3.12) IF(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) # Setting build type to RelWithDebInfo as none was specified. 
@@ -31,7 +31,7 @@ # in RPM's: #set(CPACK_RPM_SPEC_MORE_DEFINE "%define __spec_install_post /bin/true") -FOREACH(p CMP0022 CMP0046 CMP0040 CMP0048 CMP0054 CMP0067 CMP0074 CMP0075 CMP0069 CMP0135) +FOREACH(p CMP0022 CMP0046 CMP0040 CMP0048 CMP0054 CMP0056 CMP0067 CMP0074 CMP0075 CMP0069 CMP0135 CMP0091) IF(POLICY ${p}) CMAKE_POLICY(SET ${p} NEW) ENDIF() @@ -246,7 +246,7 @@ OPTION(WITH_MSAN "Enable memory sanitizer" OFF) IF (WITH_MSAN) - MY_CHECK_AND_SET_COMPILER_FLAG("-fsanitize=memory -fsanitize-memory-track-origins -U_FORTIFY_SOURCE" DEBUG RELWITHDEBINFO) + MY_CHECK_AND_SET_COMPILER_FLAG("-fsanitize=memory -fsanitize-memory-track-origins -U_FORTIFY_SOURCE") IF(NOT (have_C__fsanitize_memory__fsanitize_memory_track_origins__U_FORTIFY_SOURCE AND have_CXX__fsanitize_memory__fsanitize_memory_track_origins__U_FORTIFY_SOURCE)) MESSAGE(FATAL_ERROR "Compiler doesn't support -fsanitize=memory flags") @@ -256,7 +256,7 @@ MESSAGE(FATAL_ERROR "C++ Compiler requires support for -stdlib=libc++") ENDIF() SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") - MY_CHECK_AND_SET_LINKER_FLAG("-fsanitize=memory" DEBUG RELWITHDEBINFO) + MY_CHECK_AND_SET_LINKER_FLAG("-fsanitize=memory") IF(NOT HAVE_LINK_FLAG__fsanitize_memory) MESSAGE(FATAL_ERROR "Linker doesn't support -fsanitize=memory flags") ENDIF() @@ -633,7 +633,7 @@ perror replace) IF(WIN32) - ADD_DEPENDENCIES(minbuild echo mariadb-install-db my_safe_kill) + ADD_DEPENDENCIES(minbuild echo mariadb-install-db my_safe_kill mariadb-upgrade-service) ENDIF() ADD_CUSTOM_TARGET(smoketest COMMAND perl ./mysql-test-run.pl main.1st diff -Nru mariadb-10.11.11/Docs/INFO_SRC mariadb-10.11.13/Docs/INFO_SRC --- mariadb-10.11.11/Docs/INFO_SRC 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/Docs/INFO_SRC 2025-05-19 16:14:28.000000000 +0000 @@ -1,8 +1,8 @@ -commit: e69f8cae1a15e15b9e4f5e0f8497e1f17bdc81a4 -date: 2025-01-30 11:55:13 +0100 -build-date: 2025-01-30 11:01:27 +0000 -short: e69f8cae1a1 +commit: 
8fb09426b98583916ccfd4f8c49741adc115bac3 +date: 2025-05-13 12:27:50 +0300 +build-date: 2025-05-19 16:14:28 +0000 +short: 8fb09426b98 branch: HEAD -MariaDB source 10.11.11 +MariaDB source 10.11.13 diff -Nru mariadb-10.11.11/VERSION mariadb-10.11.13/VERSION --- mariadb-10.11.11/VERSION 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/VERSION 2025-05-19 16:14:23.000000000 +0000 @@ -1,4 +1,4 @@ MYSQL_VERSION_MAJOR=10 MYSQL_VERSION_MINOR=11 -MYSQL_VERSION_PATCH=11 +MYSQL_VERSION_PATCH=13 SERVER_MATURITY=stable diff -Nru mariadb-10.11.11/appveyor.yml mariadb-10.11.13/appveyor.yml --- mariadb-10.11.11/appveyor.yml 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/appveyor.yml 2025-05-19 16:14:23.000000000 +0000 @@ -1,6 +1,42 @@ version: build-{build}~branch-{branch} -clone_depth: 1 +clone_depth: 10 + +skip_branch_with_pr: true +before_build: + - ps: | + function Get-Remote-Ref($ref) { + try { + $result = git ls-remote origin $ref 2>$null + if (-not $result) { + "Warning: Could not fetch remote ref '$ref'" + return $null + } + return ($result -split "`t")[0] + } catch { + "Warning: Exception while running git ls-remote for '$ref': $_" + return $null + } + } + Get-ChildItem Env: | Where-Object { $_.Name -like 'APPVEYOR*COMMIT' } | ForEach-Object { "$($_.Name)=$($_.Value)" } + $commit = $env:APPVEYOR_REPO_COMMIT + $commit2 = $env:APPVEYOR_PULL_REQUEST_HEAD_COMMIT + $branch = $env:APPVEYOR_REPO_BRANCH + $latest = $null + $mainBranch = $branch -match '^(main|\d+\.\d+)$' + if ($env:APPVEYOR_PULL_REQUEST_NUMBER -eq $null) { + "Branch build detected" + $latest = Get-Remote-Ref "refs/heads/$branch" + } else { + $pr = $env:APPVEYOR_PULL_REQUEST_NUMBER + $latest = Get-Remote-Ref "refs/pull/$pr/head" + $mainBranch = $False + "Pull Request build detected" + } + if ($latest -and ($commit -ne $latest) -and ($commit2 -ne $latest) -and (-not $mainBranch)) { + "Skipping outdated commit (latest is $latest)" + Exit-AppVeyorBuild + } build_script: # dump some system info diff 
-Nru mariadb-10.11.11/client/mysql_upgrade.c mariadb-10.11.13/client/mysql_upgrade.c --- mariadb-10.11.11/client/mysql_upgrade.c 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysql_upgrade.c 2025-05-19 16:14:24.000000000 +0000 @@ -855,8 +855,7 @@ s= strchr(version, '.'); s= strchr(s + 1, '.'); - if (strncmp(upgrade_from_version, version, - (size_t)(s - version + 1))) + if (strncmp(upgrade_from_version, version, (size_t)(s - version + 1))) { if (calc_server_version(upgrade_from_version) <= MYSQL_VERSION_ID) { @@ -870,9 +869,14 @@ } if (!silent) { - verbose("This installation of MariaDB is already upgraded to %s.\n" - "There is no need to run mysql_upgrade again for %s.", - upgrade_from_version, version); + if (strcmp(upgrade_from_version, version)) + verbose("This installation of MariaDB is already upgraded to %s.\n" + "There is no need to run mysql_upgrade again for %s, because " + "they're both %.*s.", + upgrade_from_version, version, (int)(s - version), version); + else + verbose("This installation of MariaDB is already upgraded to %s.\n" + "There is no need to run mysql_upgrade again.", version); if (!opt_check_upgrade) verbose("You can use --force if you still want to run mysql_upgrade"); } diff -Nru mariadb-10.11.11/client/mysqlbinlog.cc mariadb-10.11.13/client/mysqlbinlog.cc --- mariadb-10.11.11/client/mysqlbinlog.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqlbinlog.cc 2025-05-19 16:14:24.000000000 +0000 @@ -160,7 +160,13 @@ static char *start_datetime_str, *stop_datetime_str; static my_time_t start_datetime= 0, stop_datetime= MY_TIME_T_MAX; -static my_time_t last_processed_datetime= MY_TIME_T_MAX; + +typedef struct _last_processed_ev_t +{ + ulonglong position; + my_time_t datetime; +} last_processed_ev_t; +static last_processed_ev_t last_processed_ev= {0, MY_TIME_T_MAX}; static ulonglong rec_count= 0; static MYSQL* mysql = NULL; @@ -1611,7 +1617,19 @@ end: rec_count++; end_skip_count: - last_processed_datetime= 
ev_when; + /* + Update the last_processed_ev, unless the event is a fake event (i.e. format + description (ev pointer is reset to 0) or rotate event (ev->when is 0)), or + the event is encrypted (i.e. type is Unknown). + */ + if (ev && + !(ev_type == UNKNOWN_EVENT && + ((Unknown_log_event *) ev)->what == Unknown_log_event::ENCRYPTED) && + !(ev_type == ROTATE_EVENT && !ev->when)) + { + last_processed_ev.position= pos + ev->data_written; + last_processed_ev.datetime= ev_when; + } DBUG_PRINT("info", ("end event processing")); /* @@ -2925,6 +2943,9 @@ if (old_off != BIN_LOG_HEADER_SIZE) *len= 1; // fake event, don't increment old_off } + DBUG_ASSERT(old_off + ev->data_written == old_off + (*len - 1) || + (*len == 1 && + (type == ROTATE_EVENT || type == FORMAT_DESCRIPTION_EVENT))); Exit_status retval= process_event(print_event_info, ev, old_off, logname); if (retval != OK_CONTINUE) DBUG_RETURN(retval); @@ -2943,6 +2964,9 @@ DBUG_RETURN(ERROR_STOP); } + DBUG_ASSERT(old_off + ev->data_written == old_off + (*len - 1) || + (*len == 1 && + (type == ROTATE_EVENT || type == FORMAT_DESCRIPTION_EVENT))); retval= process_event(print_event_info, ev, old_off, logname); if (retval != OK_CONTINUE) { @@ -3342,6 +3366,8 @@ the new one, so we should not do it ourselves in this case. */ + DBUG_ASSERT(tmp_pos + new_description_event->data_written == + my_b_tell(file)); Exit_status retval= process_event(print_event_info, new_description_event, tmp_pos, logname); @@ -3495,20 +3521,17 @@ } // else read_error == 0 means EOF, that's OK, we break in this case - /* - Emit a warning in the event that we finished processing input - before reaching the boundary indicated by --stop-position. 
- */ - if (((longlong)stop_position != stop_position_default) && - stop_position > my_b_tell(file)) - { - retval = OK_STOP; - warning("Did not reach stop position %llu before " - "end of input", stop_position); - } - goto end; } + + /* + The real location that we have read up to in the file should align with + the size of the event, unless the event is encrypted. + */ + DBUG_ASSERT( + ((ev->get_type_code() == UNKNOWN_EVENT && + ((Unknown_log_event *) ev)->what == Unknown_log_event::ENCRYPTED)) || + old_off + ev->data_written == my_b_tell(file)); if ((retval= process_event(print_event_info, ev, old_off, logname)) != OK_CONTINUE) goto end; @@ -3687,10 +3710,18 @@ start_position= BIN_LOG_HEADER_SIZE; } + /* + Emit a warning if we finished processing input before reaching the stop + boundaries indicated by --stop-datetime or --stop-position. + */ if (stop_datetime != MY_TIME_T_MAX && - stop_datetime > last_processed_datetime) + stop_datetime > last_processed_ev.datetime) warning("Did not reach stop datetime '%s' before end of input", stop_datetime_str); + if ((static_cast(stop_position) != stop_position_default) && + stop_position > last_processed_ev.position) + warning("Did not reach stop position %llu before end of input", + stop_position); /* If enable flashback, need to print the events from the end to the diff -Nru mariadb-10.11.11/client/mysqldump.c mariadb-10.11.13/client/mysqldump.c --- mariadb-10.11.11/client/mysqldump.c 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqldump.c 2025-05-19 16:14:24.000000000 +0000 @@ -2158,7 +2158,7 @@ *to++='\\'; } if (*name == '\'') - *to++= '\\'; + *to++= '\''; *to++= *name++; } to[0]= '\''; @@ -3713,7 +3713,7 @@ fprintf(sql_file, "DELIMITER ;;\n" - "/*!50003 SET SESSION SQL_MODE=\"%s\" */;;\n" + "/*!50003 SET SESSION SQL_MODE='%s' */;;\n" "/*!50003 CREATE */ ", (*show_trigger_row)[6]); @@ -4730,17 +4730,19 @@ return 1; while ((row= mysql_fetch_row(tableres))) { + char buf[200]; if (opt_replace_into) /* 
Protection against removing the current import user */ /* MySQL-8.0 export capability */ fprintf(md_result_file, "DELIMITER |\n" - "/*M!100101 IF current_user()=\"%s\" THEN\n" + "/*M!100101 IF current_user()=%s THEN\n" " SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001," " MESSAGE_TEXT=\"Don't remove current user %s'\";\n" "END IF */|\n" "DELIMITER ;\n" - "/*!50701 DROP USER IF EXISTS %s */;\n", row[0], row[0], row[0]); + "/*!50701 DROP USER IF EXISTS %s */;\n", + quote_for_equal(row[0],buf), row[0], row[0]); if (dump_create_user(row[0])) result= 1; /* if roles exist, defer dumping grants until after roles created */ @@ -6858,6 +6860,7 @@ char *result_table, *opt_quoted_table; char table_buff[NAME_LEN*2+3]; char table_buff2[NAME_LEN*2+3]; + char temp_buff[NAME_LEN*2 + 3], temp_buff2[NAME_LEN*2 + 3]; char query[QUERY_LENGTH]; FILE *sql_file= md_result_file; DBUG_ENTER("get_view_structure"); @@ -6918,7 +6921,9 @@ "SELECT CHECK_OPTION, DEFINER, SECURITY_TYPE, " " CHARACTER_SET_CLIENT, COLLATION_CONNECTION " "FROM information_schema.views " - "WHERE table_name=\"%s\" AND table_schema=\"%s\"", table, db); + "WHERE table_name=%s AND table_schema=%s", + quote_for_equal(table, temp_buff2), + quote_for_equal(db, temp_buff)); if (mysql_query(mysql, query)) { diff -Nru mariadb-10.11.11/client/mysqlslap.c mariadb-10.11.13/client/mysqlslap.c --- mariadb-10.11.11/client/mysqlslap.c 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqlslap.c 2025-05-19 16:14:24.000000000 +0000 @@ -2237,6 +2237,13 @@ stats *ptr; unsigned int x; + if (eng && eng->string) + con->engine= eng->string; + + /* Early return when iterations is 0 to avoid accessing uninitialized sptr */ + if (iterations == 0) + return; + con->min_timing= sptr->timing; con->max_timing= sptr->timing; con->min_rows= sptr->rows; @@ -2257,11 +2264,6 @@ con->min_timing= ptr->timing; } con->avg_timing= con->avg_timing/iterations; - - if (eng && eng->string) - con->engine= eng->string; - else - con->engine= NULL; } 
void diff -Nru mariadb-10.11.11/client/mysqltest.cc mariadb-10.11.13/client/mysqltest.cc --- mariadb-10.11.11/client/mysqltest.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/client/mysqltest.cc 2025-05-19 16:14:24.000000000 +0000 @@ -6744,7 +6744,7 @@ my_bool have_slash= FALSE; enum {R_NORMAL, R_Q, R_SLASH_IN_Q, - R_COMMENT, R_LINE_START} state= R_LINE_START; + R_COMMENT, R_LINE_START, R_CSTYLE_COMMENT} state= R_LINE_START; DBUG_ENTER("read_line"); *p= 0; @@ -6831,9 +6831,23 @@ state= R_Q; } } + else if (c == '*' && last_char == '/') + { + state= R_CSTYLE_COMMENT; + break; + } have_slash= is_escape_char(c, last_quote); break; + case R_CSTYLE_COMMENT: + if (c == '!') + // Got the hint introducer '/*!'. Switch to normal processing of + // next following characters + state= R_NORMAL; + else if (c == '/' && last_char == '*') + state= R_NORMAL; + break; + case R_COMMENT: if (c == '\n') { diff -Nru mariadb-10.11.11/cmake/cpack_rpm.cmake mariadb-10.11.13/cmake/cpack_rpm.cmake --- mariadb-10.11.11/cmake/cpack_rpm.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/cpack_rpm.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -245,7 +245,7 @@ "galera-4" "rsync" "grep" "gawk" "iproute" "coreutils" "findutils" "tar") SETA(CPACK_RPM_server_PACKAGE_RECOMMENDS "lsof" "socat" "pv") - SETA(CPACK_RPM_test_PACKAGE_REQUIRES "socat") + SETA(CPACK_RPM_test_PACKAGE_REQUIRES "${CPACK_RPM_PACKAGE_REQUIRES}" "socat") ENDIF() SET(CPACK_RPM_server_PRE_INSTALL_SCRIPT_FILE ${CMAKE_SOURCE_DIR}/support-files/rpm/server-prein.sh) @@ -292,7 +292,7 @@ ALTERNATIVE_NAME("server" "mariadb-server") ALTERNATIVE_NAME("server" "mysql-compat-server") ALTERNATIVE_NAME("test" "mariadb-test") -ELSEIF(RPM MATCHES "(rhel|centos|rocky)[89]") +ELSEIF(RPM MATCHES "(rhel|centos|rocky)") SET(epoch 3:) ALTERNATIVE_NAME("backup" "mariadb-backup") ALTERNATIVE_NAME("client" "mariadb") diff -Nru mariadb-10.11.11/cmake/libfmt.cmake mariadb-10.11.13/cmake/libfmt.cmake --- 
mariadb-10.11.11/cmake/libfmt.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/libfmt.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -28,15 +28,14 @@ IF(WITH_LIBFMT STREQUAL "system" OR WITH_LIBFMT STREQUAL "auto") SET(CMAKE_REQUIRED_INCLUDES ${LIBFMT_INCLUDE_DIR}) CHECK_CXX_SOURCE_RUNS( - "#define FMT_STATIC_THOUSANDS_SEPARATOR ',' - #define FMT_HEADER_ONLY 1 + "#define FMT_HEADER_ONLY 1 #include int main() { using ArgStore= fmt::dynamic_format_arg_store; ArgStore arg_store; int answer= 4321; arg_store.push_back(answer); - return fmt::vformat(\"{:L}\", arg_store).compare(\"4,321\"); + return fmt::vformat(\"{}\", arg_store).compare(\"4321\"); }" HAVE_SYSTEM_LIBFMT) SET(CMAKE_REQUIRED_INCLUDES) ENDIF() diff -Nru mariadb-10.11.11/cmake/os/Windows.cmake mariadb-10.11.13/cmake/os/Windows.cmake --- mariadb-10.11.11/cmake/os/Windows.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/os/Windows.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -15,352 +15,212 @@ # This file includes Windows specific hacks, mostly around compiler flags -INCLUDE (CheckCSourceCompiles) -INCLUDE (CheckCXXSourceCompiles) -INCLUDE (CheckStructHasMember) -INCLUDE (CheckLibraryExists) -INCLUDE (CheckFunctionExists) -INCLUDE (CheckCSourceRuns) -INCLUDE (CheckSymbolExists) -INCLUDE (CheckTypeSize) - -IF(MSVC) - IF(CMAKE_CXX_COMPILER_ARCHITECTURE_ID STREQUAL ARM64) - SET(MSVC_ARM64 1) - SET(MSVC_INTEL 0) - ELSE() - SET(MSVC_INTEL 1) - ENDIF() -ENDIF() + +if(MSVC) + if(CMAKE_CXX_COMPILER_ARCHITECTURE_ID STREQUAL ARM64) + set(MSVC_ARM64 1) + set(MSVC_INTEL 0) + else() + set(MSVC_INTEL 1) + endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL Clang) + set(CLANG_CL TRUE) + endif() +endif() # avoid running system checks by using pre-cached check results # system checks are expensive on VS since every tiny program is to be compiled in # a VC solution. 
-GET_FILENAME_COMPONENT(_SCRIPT_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) -INCLUDE(${_SCRIPT_DIR}/WindowsCache.cmake) - +get_filename_component(_SCRIPT_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) +include(${_SCRIPT_DIR}/WindowsCache.cmake) # OS display name (version_compile_os etc). -# Used by the test suite to ignore bugs on some platforms, -IF(CMAKE_SIZEOF_VOID_P MATCHES 8) - SET(SYSTEM_TYPE "Win64") -ELSE() - SET(SYSTEM_TYPE "Win32") -ENDIF() - -# Intel compiler is almost Visual C++ -# (same compile flags etc). Set MSVC flag -IF(CMAKE_C_COMPILER MATCHES "icl") - SET(MSVC TRUE) -ENDIF() - -IF(MSVC AND CMAKE_CXX_COMPILER_ID MATCHES Clang) - SET(CLANG_CL TRUE) -ENDIF() - -ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE) -ADD_DEFINITIONS(-D_WIN32_WINNT=0x0A00) -# We do not want the windows.h , or winsvc.h macros min/max -ADD_DEFINITIONS(-DNOMINMAX -DNOSERVICE) -# Speed up build process excluding unused header files -ADD_DEFINITIONS(-DWIN32_LEAN_AND_MEAN) - -# Adjust compiler and linker flags -IF(MINGW AND CMAKE_SIZEOF_VOID_P EQUAL 4) - # mininal architecture flags, i486 enables GCC atomics - ADD_DEFINITIONS(-march=i486) -ENDIF() - -MACRO(ENABLE_SANITIZERS) - IF(NOT MSVC) - MESSAGE(FATAL_ERROR "clang-cl or MSVC necessary to enable asan/ubsan") - ENDIF() - # currently, asan is broken with static CRT. 
- IF(CLANG_CL AND NOT(MSVC_CRT_TYPE STREQUAL "/MD")) - SET(MSVC_CRT_TYPE "/MD" CACHE INTERNAL "" FORCE) - ENDIF() - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - SET(ASAN_ARCH i386) - ELSE() - SET(ASAN_ARCH x86_64) - ENDIF() - - # After installation, clang lib directory should be added to PATH - # (e.g C:/Program Files/LLVM/lib/clang/5.0.1/lib/windows) - SET(SANITIZER_LIBS) - SET(SANITIZER_LINK_LIBRARIES) - SET(SANITIZER_COMPILE_FLAGS) - IF(WITH_ASAN) - IF(CLANG_CL) - LIST(APPEND SANITIZER_LIBS - clang_rt.asan_dynamic-${ASAN_ARCH}.lib clang_rt.asan_dynamic_runtime_thunk-${ASAN_ARCH}.lib) - ENDIF() - STRING(APPEND SANITIZER_COMPILE_FLAGS " -fsanitize=address") - ENDIF() - IF(WITH_UBSAN) - STRING(APPEND SANITIZER_COMPILE_FLAGS " -fsanitize=undefined -fno-sanitize=alignment") - ENDIF() - FOREACH(lib ${SANITIZER_LIBS}) - FIND_LIBRARY(${lib}_fullpath ${lib}) - IF(NOT ${lib}_fullpath) - MESSAGE(FATAL_ERROR "Can't enable sanitizer : missing ${lib}") +# Used by the test suite to ignore bugs on some platforms +if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(SYSTEM_TYPE "Win64") +else() + set(SYSTEM_TYPE "Win32") +endif() + +function(find_asan_runtime result_list) + set(${result_list} "" PARENT_SCOPE) + if(CMAKE_C_COMPILER_VERSION) + set(CLANG_VERSION "${CMAKE_C_COMPILER_VERSION}") + else() + return() + endif() + + get_filename_component(CLANG_BIN_DIR "${CMAKE_C_COMPILER}" DIRECTORY) + get_filename_component(LLVM_ROOT "${CLANG_BIN_DIR}" DIRECTORY) + + # Determine target architecture + execute_process( + COMMAND "${CMAKE_C_COMPILER}" --version + OUTPUT_VARIABLE CLANG_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + + if(CLANG_VERSION_OUTPUT MATCHES "x86_64") + set(ARCH_SUFFIX "x86_64") + elseif(CLANG_VERSION_OUTPUT MATCHES "i686|i386") + set(ARCH_SUFFIX "i386") + elseif(CLANG_VERSION_OUTPUT MATCHES "aarch64") + set(ARCH_SUFFIX "aarch64") + else() + message(FATAL_ERROR "unknown arch") + endif() + + string(REGEX MATCH "^[0-9]+" CLANG_MAJOR_VERSION "${CMAKE_C_COMPILER_VERSION}") 
+ set(CLANG_VERSION_DIR "${LLVM_ROOT}/lib/clang/${CLANG_MAJOR_VERSION}") + + set(out) + foreach(name clang_rt.asan_dynamic-${ARCH_SUFFIX}.lib + clang_rt.asan_dynamic_runtime_thunk-${ARCH_SUFFIX}.lib) + set(path "${CLANG_VERSION_DIR}/lib/windows/${name}") + if(EXISTS "${path}") + list(APPEND out ${path}) + else() + message(FATAL_ERROR "expected library ${path} not found") ENDIF() - LIST(APPEND CMAKE_REQUIRED_LIBRARIES ${${lib}_fullpath}) - STRING(APPEND CMAKE_C_STANDARD_LIBRARIES " \"${${lib}_fullpath}\" ") - STRING(APPEND CMAKE_CXX_STANDARD_LIBRARIES " \"${${lib}_fullpath}\" ") - ENDFOREACH() - STRING(APPEND CMAKE_C_FLAGS ${SANITIZER_COMPILE_FLAGS}) - STRING(APPEND CMAKE_CXX_FLAGS ${SANITIZER_COMPILE_FLAGS}) -ENDMACRO() + endforeach() + set(${result_list} ${out} PARENT_SCOPE) +endfunction() + +macro(enable_sanitizers) + # Remove the runtime checks from the compiler flags + # ASAN does the same thing, in many cases better + foreach(lang C CXX) + foreach(suffix "_DEBUG" "_DEBUG_INIT") + string(REGEX REPLACE "/RTC[1su]" "" CMAKE_${lang}_FLAGS${suffix} "${CMAKE_${lang}_FLAGS${suffix}}") + endforeach() + endforeach() + + if(WITH_ASAN) + add_compile_options($<$:/fsanitize=address>) + endif() + if(WITH_UBSAN) + include(CheckCCompilerFlag) + check_c_compiler_flag(/fsanitize=undefined HAVE_fsanitize_undefined) + if (HAVE_fsanitize_undefined) + add_compile_options($<$:/fsanitize=undefined>) + else() + message(FATAL_ERROR "UBSAN not supported by this compiler yet") + endif() + endif() + if(CLANG_CL) + find_asan_runtime(asan_libs) + foreach(lib ${asan_libs}) + link_libraries(${lib}) + string(APPEND CMAKE_C_STANDARD_LIBRARIES " \"${lib}\"") + string(APPEND CMAKE_CXX_STANDARD_LIBRARIES " \"${lib}\"") + endforeach() + else() + add_link_options(/INCREMENTAL:NO) + endif() +endmacro() -IF(MSVC) - IF(MSVC_VERSION LESS 1920) - MESSAGE(FATAL_ERROR "Visual Studio 2019 or later is required") - ENDIF() +if(MSVC) # Disable mingw based pkg-config found in Strawberry perl - 
SET(PKG_CONFIG_EXECUTABLE 0 CACHE INTERNAL "") + set(PKG_CONFIG_EXECUTABLE 0 CACHE INTERNAL "") - SET(MSVC_CRT_TYPE /MT CACHE STRING - "Runtime library - specify runtime library for linking (/MT,/MTd,/MD,/MDd)" - ) - SET(VALID_CRT_TYPES /MTd /MDd /MD /MT) - IF (NOT ";${VALID_CRT_TYPES};" MATCHES ";${MSVC_CRT_TYPE};") - MESSAGE(FATAL_ERROR "Invalid value ${MSVC_CRT_TYPE} for MSVC_CRT_TYPE, choose one of /MT,/MTd,/MD,/MDd ") - ENDIF() - - IF(MSVC_CRT_TYPE MATCHES "/MD") - # Dynamic runtime (DLLs), need to install CRT libraries. - SET(CMAKE_INSTALL_SYSTEM_RUNTIME_COMPONENT VCCRT) - SET(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS TRUE) - IF(MSVC_CRT_TYPE STREQUAL "/MDd") - SET (CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY TRUE) - ENDIF() - INCLUDE(InstallRequiredSystemLibraries) - ENDIF() - - IF(WITH_ASAN AND (NOT CLANG_CL)) - SET(DYNAMIC_UCRT_LINK_DEFAULT OFF) - ELSE() - SET(DYNAMIC_UCRT_LINK_DEFAULT ON) - ENDIF() - - OPTION(DYNAMIC_UCRT_LINK "Link Universal CRT dynamically, if MSVC_CRT_TYPE=/MT" ${DYNAMIC_UCRT_LINK_DEFAULT}) - SET(DYNAMIC_UCRT_LINKER_OPTION " /NODEFAULTLIB:libucrt.lib /DEFAULTLIB:ucrt.lib") - - # Enable debug info also in Release build, - # and create PDB to be able to analyze crashes. - FOREACH(type EXE SHARED MODULE) - SET(CMAKE_${type}_LINKER_FLAGS_RELEASE - "${CMAKE_${type}_LINKER_FLAGS_RELEASE} /debug") - SET(CMAKE_${type}_LINKER_FLAGS_MINSIZEREL - "${CMAKE_${type}_LINKER_FLAGS_MINSIZEREL} /debug") - ENDFOREACH() - - # Force runtime libraries - # Compile with /Zi to get debugging information + if(NOT DEFINED CMAKE_MSVC_RUNTIME_LIBRARY) + set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL) + endif() + + if(CMAKE_MSVC_RUNTIME_LIBRARY MATCHES "DLL") + # Dynamic runtime (DLLs), need to install CRT libraries. 
+ set(CMAKE_INSTALL_SYSTEM_RUNTIME_COMPONENT VCCRT) + set(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS TRUE) + if(CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebugDLL") + set(CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY TRUE) + endif() + include(InstallRequiredSystemLibraries) + endif() - FOREACH(lang C CXX) - SET(CMAKE_${lang}_FLAGS_RELEASE "${CMAKE_${lang}_FLAGS_RELEASE} /Zi") - ENDFOREACH() - FOREACH(flag - CMAKE_C_FLAGS CMAKE_CXX_FLAGS - CMAKE_C_FLAGS_INIT CMAKE_CXX_FLAGS_INIT - CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_DEBUG_INIT - CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_DEBUG_INIT - CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_MINSIZEREL - ) - STRING(REGEX REPLACE "/M[TD][d]?" "${MSVC_CRT_TYPE}" "${flag}" "${${flag}}" ) - STRING(REPLACE "/ZI " "/Zi " "${flag}" "${${flag}}") - IF((NOT "${${flag}}" MATCHES "/Zi") AND (NOT "${${flag}}" MATCHES "/Z7")) - STRING(APPEND ${flag} " /Zi") - ENDIF() - # Remove inlining flags, added by CMake, if any. - # Compiler default is fine. 
- STRING(REGEX REPLACE "/Ob[0-3]" "" "${flag}" "${${flag}}" ) - ENDFOREACH() - - # Allow to overwrite the inlining flag - SET(MSVC_INLINE "" CACHE STRING - "MSVC Inlining option, either empty, or one of /Ob0,/Ob1,/Ob2,/Ob3") - IF(MSVC_INLINE MATCHES "/Ob[0-3]") - ADD_COMPILE_OPTIONS(${MSVC_INLINE}) - ELSEIF(NOT(MSVC_INLINE STREQUAL "")) - MESSAGE(FATAL_ERROR "Invalid option for MSVC_INLINE") - ENDIF() + # Compile with /Zi to get debugging information + if (NOT DEFINED CMAKE_MSVC_DEBUG_INFORMATION_FORMAT) + set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "ProgramDatabase") + add_link_options(/DEBUG) # Ensure debugging info at link time + endif() - IF(WITH_ASAN OR WITH_UBSAN) + if(WITH_ASAN OR WITH_UBSAN) # Workaround something Linux specific - SET(SECURITY_HARDENED 0 CACHE INTERNAL "" FORCE) - ENABLE_SANITIZERS() - ENDIF() - - IF(CLANG_CL) - SET(CLANG_CL_FLAGS -"-Wno-unknown-warning-option -Wno-unused-private-field \ --Wno-unused-parameter -Wno-inconsistent-missing-override \ --Wno-unused-command-line-argument -Wno-pointer-sign \ --Wno-deprecated-register -Wno-missing-braces \ --Wno-unused-function -Wno-unused-local-typedef -msse4.2 " + set(SECURITY_HARDENED 0 CACHE INTERNAL "" FORCE) + enable_sanitizers() + endif() + + add_compile_definitions( + _CRT_SECURE_NO_DEPRECATE + _CRT_NONSTDC_NO_WARNINGS + _WIN32_WINNT=0x0A00 + # We do not want the windows.h , or winsvc.h macros min/max + NOMINMAX NOSERVICE + # Speed up build process excluding unused header files + WIN32_LEAN_AND_MEAN + ) + if(CLANG_CL) + add_compile_options( + -Wno-unknown-warning-option + -Wno-unused-private-field + -Wno-unused-parameter + -Wno-inconsistent-missing-override + -Wno-unused-command-line-argument + -Wno-pointer-sign + -Wno-deprecated-register + -Wno-missing-braces + -Wno-unused-function + -Wno-unused-local-typedef + -Wno-microsoft-static-assert + -Wno-c++17-extensions + -msse4.2 ) - IF(CMAKE_SIZEOF_VOID_P MATCHES 8) - STRING(APPEND CLANG_CL_FLAGS "-mpclmul ") - ENDIF() - STRING(APPEND CMAKE_C_FLAGS 
" ${CLANG_CL_FLAGS} ${MSVC_CRT_TYPE}") - STRING(APPEND CMAKE_CXX_FLAGS " ${CLANG_CL_FLAGS} ${MSVC_CRT_TYPE}") - ENDIF() - - FOREACH(type EXE SHARED MODULE) - STRING(REGEX REPLACE "/STACK:([^ ]+)" "" CMAKE_${type}_LINKER_FLAGS "${CMAKE_${type}_LINKER_FLAGS}") - IF(WITH_ASAN) - SET(build_types RELWITHDEBINFO DEBUG) - ELSE() - SET(build_types RELWITHDEBINFO) - ENDIF() - FOREACH(btype ${build_types}) - STRING(REGEX REPLACE "/INCREMENTAL:([^ ]+)" "/INCREMENTAL:NO" CMAKE_${type}_LINKER_FLAGS_${btype} "${CMAKE_${type}_LINKER_FLAGS_${btype}}") - STRING(REGEX REPLACE "/INCREMENTAL$" "/INCREMENTAL:NO" CMAKE_${type}_LINKER_FLAGS_${btype} "${CMAKE_${type}_LINKER_FLAGS_${btype}}") - ENDFOREACH() - IF(NOT CLANG_CL) - STRING(APPEND CMAKE_${type}_LINKER_FLAGS_RELWITHDEBINFO " /release /OPT:REF,ICF") - ENDIF() - IF(DYNAMIC_UCRT_LINK AND (MSVC_CRT_TYPE STREQUAL "/MT")) - FOREACH(config RELEASE RELWITHDEBINFO DEBUG MINSIZEREL) - STRING(APPEND CMAKE_${type}_LINKER_FLAGS_${config} ${DYNAMIC_UCRT_LINKER_OPTION}) - ENDFOREACH() - ENDIF() - ENDFOREACH() + if((CMAKE_SIZEOF_VOID_P MATCHES 8) AND MSVC_INTEL) + add_compile_options(-mpclmul) + endif() + endif() - # Mark 32 bit executables large address aware so they can # use > 2GB address space - IF(CMAKE_SIZEOF_VOID_P MATCHES 4) - STRING(APPEND CMAKE_EXE_LINKER_FLAGS " /LARGEADDRESSAWARE") - ENDIF() - - # Speed up multiprocessor build - IF (NOT CLANG_CL) - STRING(APPEND CMAKE_C_FLAGS " /MP") - STRING(APPEND CMAKE_CXX_FLAGS " /MP") - STRING(APPEND CMAKE_C_FLAGS_RELWITHDEBINFO " /Gw") - STRING(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /Gw") - ENDIF() - - #TODO: update the code and remove the disabled warnings - STRING(APPEND CMAKE_C_FLAGS " /we4700 /we4311 /we4477 /we4302 /we4090") - STRING(APPEND CMAKE_CXX_FLAGS " /we4099 /we4700 /we4311 /we4477 /we4302 /we4090") - IF(MSVC_VERSION GREATER 1910 AND NOT CLANG_CL) - STRING(APPEND CMAKE_CXX_FLAGS " /permissive-") - STRING(APPEND CMAKE_C_FLAGS " /diagnostics:caret") - STRING(APPEND CMAKE_CXX_FLAGS " 
/diagnostics:caret") - ENDIF() - ADD_DEFINITIONS(-D_CRT_NONSTDC_NO_WARNINGS) - IF(MYSQL_MAINTAINER_MODE MATCHES "ERR") - STRING(APPEND CMAKE_C_FLAGS " /WX") - STRING(APPEND CMAKE_CXX_FLAGS " /WX") - FOREACH(type EXE SHARED MODULE) - FOREACH(cfg RELEASE DEBUG RELWITHDEBINFO) - SET(CMAKE_${type}_LINKER_FLAGS_${cfg} "${CMAKE_${type}_LINKER_FLAGS_${cfg}} /WX") - ENDFOREACH() - ENDFOREACH() - ENDIF() - - IF(FAST_BUILD) - STRING (REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - ELSEIF (NOT CLANG_CL) - STRING(APPEND CMAKE_CXX_FLAGS_RELEASE " /d2OptimizeHugeFunctions") - STRING(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /d2OptimizeHugeFunctions") - ENDIF() - ADD_COMPILE_OPTIONS($<$:/utf-8>) -ENDIF() - -# Always link with socket/synchronization libraries -STRING(APPEND CMAKE_C_STANDARD_LIBRARIES " ws2_32.lib synchronization.lib") -STRING(APPEND CMAKE_CXX_STANDARD_LIBRARIES " ws2_32.lib synchronization.lib") - -# System checks -SET(SIGNAL_WITH_VIO_CLOSE 1) # Something that runtime team needs - -# IPv6 constants appeared in Vista SDK first. We need to define them in any case if they are -# not in headers, to handle dual mode sockets correctly. -CHECK_SYMBOL_EXISTS(IPPROTO_IPV6 "winsock2.h" HAVE_IPPROTO_IPV6) -IF(NOT HAVE_IPPROTO_IPV6) - SET(HAVE_IPPROTO_IPV6 41) -ENDIF() -CHECK_SYMBOL_EXISTS(IPV6_V6ONLY "winsock2.h;ws2ipdef.h" HAVE_IPV6_V6ONLY) -IF(NOT HAVE_IPV6_V6ONLY) - SET(IPV6_V6ONLY 27) -ENDIF() - -# Some standard functions exist there under different -# names (e.g popen is _popen or strok_r is _strtok_s) -# If a replacement function exists, HAVE_FUNCTION is -# defined to 1. CMake variable will also -# be defined to the replacement name. -# So for example, CHECK_FUNCTION_REPLACEMENT(popen _popen) -# will define HAVE_POPEN to 1 and set variable named popen -# to _popen. 
If the header template, one needs to have -# cmakedefine popen @popen@ which will expand to -# define popen _popen after CONFIGURE_FILE - -MACRO(CHECK_FUNCTION_REPLACEMENT function replacement) - STRING(TOUPPER ${function} function_upper) - CHECK_FUNCTION_EXISTS(${function} HAVE_${function_upper}) - IF(NOT HAVE_${function_upper}) - CHECK_FUNCTION_EXISTS(${replacement} HAVE_${replacement}) - IF(HAVE_${replacement}) - SET(HAVE_${function_upper} 1 ) - SET(${function} ${replacement}) - ENDIF() - ENDIF() -ENDMACRO() -MACRO(CHECK_SYMBOL_REPLACEMENT symbol replacement header) - STRING(TOUPPER ${symbol} symbol_upper) - CHECK_SYMBOL_EXISTS(${symbol} ${header} HAVE_${symbol_upper}) - IF(NOT HAVE_${symbol_upper}) - CHECK_SYMBOL_EXISTS(${replacement} ${header} HAVE_${replacement}) - IF(HAVE_${replacement}) - SET(HAVE_${symbol_upper} 1) - SET(${symbol} ${replacement}) - ENDIF() - ENDIF() -ENDMACRO() + if(CMAKE_SIZEOF_VOID_P MATCHES 4) + add_link_options(/LARGEADDRESSAWARE) + endif() + + # RelWithDebInfo is deoptimized wrt inlining. 
+ # Fix it to default + foreach(lang C CXX) + foreach(suffix "_RELWITHDEBINFO" "_RELWITHDEBINFO_INIT") + string(REGEX REPLACE "/Ob[0-1]" "" CMAKE_${lang}_FLAGS${suffix} "${CMAKE_${lang}_FLAGS${suffix}}") + endforeach() + endforeach() + + if(NOT CLANG_CL) + add_link_options("$<$:/INCREMENTAL:NO;/RELEASE;/OPT:REF,ICF>") + add_compile_options($<$:$<$:/Gw>>) + add_compile_options($<$:/MP>) + add_compile_options("$<$:/we4099;/we4700;/we4311;/we4477;/we4302;/we4090>") + add_compile_options($<$:/permissive->) + add_compile_options($<$:/diagnostics:caret>) + add_compile_options($<$:/utf-8>) + if(NOT FAST_BUILD) + add_compile_options($<$:$<$:/d2OptimizeHugeFunctions>>) + endif() + endif() + + if(MYSQL_MAINTAINER_MODE MATCHES "ERR") + set(CMAKE_COMPILE_WARNING_AS_ERROR ON) + add_link_options(/WX) + endif() +endif() + +# avoid running system checks by using pre-cached check results +# system checks are expensive on VS generator +get_filename_component(_SCRIPT_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) +include(${_SCRIPT_DIR}/WindowsCache.cmake) + +# this is out of place, not really a system check +set(FN_NO_CASE_SENSE 1) +set(USE_SYMDIR 1) +set(HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT 1) -CHECK_SYMBOL_REPLACEMENT(S_IROTH _S_IREAD sys/stat.h) -CHECK_SYMBOL_REPLACEMENT(S_IFIFO _S_IFIFO sys/stat.h) -CHECK_SYMBOL_REPLACEMENT(SIGQUIT SIGTERM signal.h) -CHECK_SYMBOL_REPLACEMENT(SIGPIPE SIGINT signal.h) -CHECK_FUNCTION_REPLACEMENT(popen _popen) -CHECK_FUNCTION_REPLACEMENT(pclose _pclose) -CHECK_FUNCTION_REPLACEMENT(access _access) -CHECK_FUNCTION_REPLACEMENT(strcasecmp _stricmp) -CHECK_FUNCTION_REPLACEMENT(strncasecmp _strnicmp) -CHECK_SYMBOL_REPLACEMENT(snprintf _snprintf stdio.h) -CHECK_FUNCTION_REPLACEMENT(strtok_r strtok_s) -CHECK_FUNCTION_REPLACEMENT(strtoll _strtoi64) -CHECK_FUNCTION_REPLACEMENT(strtoull _strtoui64) -CHECK_FUNCTION_REPLACEMENT(vsnprintf _vsnprintf) -CHECK_TYPE_SIZE(ssize_t SIZE_OF_SSIZE_T) -IF(NOT HAVE_SIZE_OF_SSIZE_T) - SET(ssize_t SSIZE_T) -ENDIF() - 
-SET(FN_NO_CASE_SENSE 1) -SET(USE_SYMDIR 1) - -# Force static C runtime for targets in current directory -# (useful to get rid of MFC dll's dependency, or in installer) -MACRO(FORCE_STATIC_CRT) - FOREACH(flag - CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_DEBUG_INIT - CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_DEBUG_INIT - CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_MINSIZEREL - ) - STRING(REGEX REPLACE "/MD[d]?" "/MT" "${flag}" "${${flag}}" ) - STRING(REPLACE "${DYNAMIC_UCRT_LINKER_OPTION}" "" "${flag}" "${${flag}}") - ENDFOREACH() -ENDMACRO() diff -Nru mariadb-10.11.11/cmake/os/WindowsCache.cmake mariadb-10.11.13/cmake/os/WindowsCache.cmake --- mariadb-10.11.11/cmake/os/WindowsCache.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/os/WindowsCache.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -203,10 +203,10 @@ SET(HAVE_STRNDUP CACHE INTERNAL "") SET(HAVE_STRNLEN 1 CACHE INTERNAL "") SET(HAVE_STRPBRK 1 CACHE INTERNAL "") -SET(HAVE_STRTOK_R CACHE INTERNAL "") -SET(HAVE_STRTOLL CACHE INTERNAL "") +SET(HAVE_STRTOK_R 1 CACHE INTERNAL "") +SET(HAVE_STRTOLL 1 CACHE INTERNAL "") SET(HAVE_STRTOUL 1 CACHE INTERNAL "") -SET(HAVE_STRTOULL CACHE INTERNAL "") +SET(HAVE_STRTOULL 1 CACHE INTERNAL "") SET(HAVE_SYNCH_H CACHE INTERNAL "") SET(HAVE_SYSENT_H CACHE INTERNAL "") SET(HAVE_SYS_DIR_H CACHE INTERNAL "") @@ -294,6 +294,7 @@ SET(HAVE_LINUX_UNISTD_H CACHE INTERNAL "") SET(HAVE_SYS_UTSNAME_H CACHE INTERNAL "") SET(HAVE_PTHREAD_ATTR_GETGUARDSIZE CACHE INTERNAL "") +SET(HAVE_PTHREAD_GETATTR_NP CACHE INTERNAL "") SET(HAVE_SOCKPEERCRED CACHE INTERNAL "") SET(HAVE_ABI_CXA_DEMANGLE CACHE INTERNAL "") SET(HAVE_GCC_C11_ATOMICS CACHE INTERNAL "") @@ -348,4 +349,16 @@ SET(HAVE_GETPAGESIZES CACHE INTERNAL "") SET(HAVE_LINUX_LIMITS_H CACHE INTERNAL "") SET(HAVE_FILE_UCONTEXT_H CACHE INTERNAL "") +SET(have_C__Werror CACHE INTERNAL "") +SET(HAVE_SIGNAL_H 1 CACHE INTERNAL "") 
+SET(HAVE_UINT CACHE INTERNAL "") +SET(HAVE_SOCKET_LEN_T CACHE INTERNAL "") +SET(HAVE_GETTHRID CACHE INTERNAL "") +SET(HAVE_THREAD_LOCAL 1 CACHE INTERNAL "") +SET(have_CXX__Wno_unused_but_set_variable CACHE INTERNAL "") +SET(HAVE_UNISTD_H CACHE INTERNAL "") +SET(HAVE_LINUX_UNISTD_H CACHE INTERNAL "") +SET(OFF64_T CACHE INTERNAL "") +SET(Z_HAVE_UNISTD_H CACHE INTERNAL "") +SET(HAVE_OFF64_T CACHE FALSE INTERNAL "") ENDIF(MSVC) diff -Nru mariadb-10.11.11/cmake/pcre.cmake mariadb-10.11.13/cmake/pcre.cmake --- mariadb-10.11.11/cmake/pcre.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/pcre.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -54,11 +54,18 @@ ENDIF() ENDFOREACH() + IF(CMAKE_MSVC_RUNTIME_LIBRARY) + SET(CMAKE_MSVC_RUNTIME_LIBRARY_ARG + "-DCMAKE_MSVC_RUNTIME_LIBRARY=${CMAKE_MSVC_RUNTIME_LIBRARY}") + ELSE() + SET(CMAKE_MSVC_RUNTIME_LIBRARY_ARG) + ENDIF() + ExternalProject_Add( pcre2 PREFIX "${dir}" - URL "https://github.com/PCRE2Project/pcre2/releases/download/pcre2-10.44/pcre2-10.44.zip" - URL_MD5 dfab8313154b3377a6959c3b6377841e + URL "https://github.com/PCRE2Project/pcre2/releases/download/pcre2-10.45/pcre2-10.45.zip" + URL_MD5 873da56c6469ec207ca5c5ae9688b83a INSTALL_COMMAND "" CMAKE_ARGS "-DCMAKE_WARN_DEPRECATED=FALSE" @@ -72,6 +79,7 @@ "-DCMAKE_C_FLAGS_RELEASE=${pcre2_flags_RELEASE}" "-DCMAKE_C_FLAGS_MINSIZEREL=${pcre2_flags_MINSIZEREL}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + ${CMAKE_MSVC_RUNTIME_LIBRARY_ARG} ${stdlibs} ${byproducts} ) diff -Nru mariadb-10.11.11/cmake/plugin.cmake mariadb-10.11.13/cmake/plugin.cmake --- mariadb-10.11.11/cmake/plugin.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/cmake/plugin.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -214,6 +214,11 @@ TARGET_LINK_LIBRARIES (${target} mysqlservices ${ARG_LINK_LIBRARIES}) + IF(WIN32) + # A popular library, turns out many plugins need it for gethostname() + TARGET_LINK_LIBRARIES (${target} ws2_32) + ENDIF() + IF(CMAKE_SYSTEM_NAME MATCHES AIX) 
TARGET_LINK_OPTIONS(${target} PRIVATE "-Wl,-bE:${CMAKE_SOURCE_DIR}/libservices/mysqlservices_aix.def") ENDIF() diff -Nru mariadb-10.11.11/config.h.cmake mariadb-10.11.13/config.h.cmake --- mariadb-10.11.11/config.h.cmake 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/config.h.cmake 2025-05-19 16:14:24.000000000 +0000 @@ -402,38 +402,27 @@ #cmakedefine SIGNAL_WITH_VIO_CLOSE 1 /* Windows stuff, mostly functions, that have Posix analogs but named differently */ -#cmakedefine S_IROTH @S_IROTH@ -#cmakedefine S_IFIFO @S_IFIFO@ -#cmakedefine IPPROTO_IPV6 @IPPROTO_IPV6@ -#cmakedefine IPV6_V6ONLY @IPV6_V6ONLY@ -#cmakedefine sigset_t @sigset_t@ -#cmakedefine mode_t @mode_t@ -#cmakedefine SIGQUIT @SIGQUIT@ -#cmakedefine SIGPIPE @SIGPIPE@ -#cmakedefine popen @popen@ -#cmakedefine pclose @pclose@ -#cmakedefine ssize_t @ssize_t@ -#cmakedefine strcasecmp @strcasecmp@ -#cmakedefine strncasecmp @strncasecmp@ -#cmakedefine snprintf @snprintf@ -#cmakedefine strtok_r @strtok_r@ -#cmakedefine strtoll @strtoll@ -#cmakedefine strtoull @strtoull@ -#cmakedefine vsnprintf @vsnprintf@ -#if defined(_MSC_VER) && (_MSC_VER > 1800) +#ifdef _WIN32 +#define S_IROTH _S_IREAD +#define S_IFIFO _S_IFIFO +#define SIGQUIT SIGTERM +#define SIGPIPE SIGINT +#define sigset_t int +#define mode_t int +#define popen _popen +#define pclose _pclose +#define ssize_t SSIZE_T +#define strcasecmp _stricmp +#define strncasecmp _strnicmp +#define strtok_r strtok_s #define tzname _tzname #define P_tmpdir "C:\\TEMP" -#endif -#if defined(_MSC_VER) && (_MSC_VER > 1310) -# define HAVE_SETENV #define setenv(a,b,c) _putenv_s(a,b) -#endif -#define PSAPI_VERSION 1 /* for GetProcessMemoryInfo() */ -/* We don't want the min/max macros */ -#ifdef _WIN32 +#define HAVE_SETENV #define NOMINMAX 1 -#endif +#define PSAPI_VERSION 2 /* for GetProcessMemoryInfo() */ +#endif /* _WIN32 */ /* MySQL features @@ -457,6 +446,11 @@ /* This should mean case insensitive file system */ #cmakedefine FN_NO_CASE_SENSE 1 +/* Whether an 
anonymous private mapping is unaccessible after +madvise(MADV_DONTNEED) or madvise(MADV_FREE) or similar has been invoked; +this is the case with Microsoft Windows VirtualFree(MEM_DECOMMIT) */ +#cmakedefine HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT 1 + #cmakedefine HAVE_CHARSET_armscii8 1 #cmakedefine HAVE_CHARSET_ascii 1 #cmakedefine HAVE_CHARSET_big5 1 diff -Nru mariadb-10.11.11/debian/changelog mariadb-10.11.13/debian/changelog --- mariadb-10.11.11/debian/changelog 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/changelog 2025-05-23 21:26:02.000000000 +0000 @@ -1,3 +1,39 @@ +mariadb (1:10.11.13-0+deb12u1) bookworm; urgency=medium + + * New upstream version 10.11.13. Includes fixes for several severe regressions + as noted at https://mariadb.com/kb/en/mariadb-10-11-13-release-notes/, which + were discovered soon after the 10.11.12 release, which was skipped in Debian + intentionally. + * This release includes upstream version 10.11.12, with fixes for regressions + as noted at https://mariadb.com/kb/en/mariadb-10-11-12-release-notes/ + well as security issues (Closes: #1100437, #1105976): + - CVE-2023-52969 + - CVE-2023-52970 + - CVE-2023-52971 + - CVE-2025-30693 + - CVE-2025-30722 + * Drop all RocksDB patches now upstream due to update to version 6.29fb + * New upstream version has now CEST as allowed in main.timezone test + (Closes: #1084293) + * New upstream includes systemd service fix for restarts on crashes + (Closes: #1073847) + * New upstream also fixes regression in INSERT SELECT on NOT NULL columns + while having BEFORE UPDATE trigger (Closes: #1099515) + * Revert "Set CAP_IPC_LOCK capability if possible" because of MDEV-36229 + (Closes: #1100575) + * Update configuration traces to have --ssl-verify-server-cert from MDEV-28908 + * Update configuration traces to include new upstream system variables: + - innodb-buffer-pool-size-auto-min (default: 0) + - innodb-buffer-pool-size-max (default: 0) + - innodb-log-checkpoint-now (default: FALSE) + * 
Also update configuration traces to match that in 10.11.12 the variables + innodb-buffer-pool-chunk-size and innodb-log-spin-wait-delay are advertised + as deprecated. + * Fix changelog entry formatting in 1:10.11.11-0+deb12u1 + * Salsa CI: Adapt piuparts helper script to new source format in Bookworm + + -- Otto Kekäläinen Fri, 23 May 2025 14:26:02 -0700 + mariadb (1:10.11.11-0+deb12u1) bookworm; urgency=medium [ Otto Kekäläinen ] @@ -27,7 +63,8 @@ unstable in MariaDB 11.4 for a long time, and which are likely needed to avoid occasional shutdown issues, in particular on upgrades (LP: #2034125) in both Debian and Ubuntu - - Make SysV init more verbose in case of MariaDB start failures (Related: #1033234) + - Make SysV init more verbose in case of MariaDB start failures + (Related: #1033234) - Limit check of running mysqld/mariadbd to system users (Closes: #1032047) - When shutting down 'mariadbd', fallback to 'mysqld' * Add Lintian overrides for new upstream documentation JavaScript files diff -Nru mariadb-10.11.11/debian/mariadb-server-core.postinst mariadb-10.11.13/debian/mariadb-server-core.postinst --- mariadb-10.11.11/debian/mariadb-server-core.postinst 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/mariadb-server-core.postinst 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -#!/bin/bash -set -e - -# shellcheck source=/dev/null -. /usr/share/debconf/confmodule - -if [ -n "$DEBIAN_SCRIPT_DEBUG" ] -then - set -v -x - DEBIAN_SCRIPT_TRACE=1 -fi - -${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2} - -export PATH=$PATH:/sbin:/usr/sbin:/bin:/usr/bin - -# inspired by iputils-ping -# -# cap_ipc_lock is required if a user wants to use --memlock -# and has insufficient RLIMIT_MEMLOCK (MDEV-33301) - -PROGRAM=$(dpkg-divert --truename /usr/sbin/mysqld) - -case "$1" in - configure) - # If we have setcap installed, try setting - # which allows us to install our binaries without the setuid - # bit. 
- if command -v setcap > /dev/null - then - if ! setcap cap_ipc_lock+ep "$PROGRAM" - then - echo "Setcap failed on $PROGRAM, required with --memlock if insufficent RLIMIT_MEMLOCK" >&2 - fi - fi - ;; - - abort-upgrade|abort-remove|abort-configure|triggered) - ;; - - *) - echo "postinst called with unknown argument '$1'" 1>&2 - exit 1 - ;; -esac - -db_stop # in case invoke fails - -#DEBHELPER# diff -Nru mariadb-10.11.11/debian/patches/fix-reproducible-builds-rocksdb.patch mariadb-10.11.13/debian/patches/fix-reproducible-builds-rocksdb.patch --- mariadb-10.11.11/debian/patches/fix-reproducible-builds-rocksdb.patch 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/fix-reproducible-builds-rocksdb.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -Origin: https://github.com/facebook/rocksdb/commit/0a9a05ae12943b1529ef1eabbca5ce5a71c986bf -# Merged in RocksDB 6.19.3, but not updated into MariaDB yet -Bug: https://github.com/facebook/rocksdb/issues/7035 -Author: Otto Kekäläinen -Subject: Make RocksDB build reproducible - -The RocksDB binary included a string with the build timestamp: -> rocksdb_build_git_date:@2021-05-23·16:04:38@ - -As this changes from build to build, it makes the builds unreproducible. -Simply removing it solves the issue. - -This temporary fix can be removed when a proper fix already done in upstream -lands in MariaDB when the RocksDB submodule is updated to a newer release. - ---- a/storage/rocksdb/rocksdb/util/build_version.cc.in -+++ b/storage/rocksdb/rocksdb/util/build_version.cc.in -@@ -1,5 +1,5 @@ - // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
- #include "build_version.h" --const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:@@GIT_SHA@@"; --const char* rocksdb_build_git_date = "rocksdb_build_git_date:@@GIT_DATE_TIME@@"; --const char* rocksdb_build_compile_date = __DATE__; -+const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:REDACTED"; -+const char* rocksdb_build_git_date = "rocksdb_build_git_date:REDACTED"; -+const char* rocksdb_build_compile_date = "REDACTED"; diff -Nru mariadb-10.11.11/debian/patches/fix-spelling-rocksdb.patch mariadb-10.11.13/debian/patches/fix-spelling-rocksdb.patch --- mariadb-10.11.11/debian/patches/fix-spelling-rocksdb.patch 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/fix-spelling-rocksdb.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -Forwarded: https://github.com/facebook/rocksdb/pull/9653 -Origin: https://patch-diff.githubusercontent.com/raw/facebook/rocksdb/pull/9653.patch -From: Otto Kekäläinen -Date: Wed, 2 Mar 2022 18:13:18 -0800 -Subject: Fix various spelling errors still found in code - Two upstream PRs remain that have been merged, but not imported on MariaDB yet. 
- ---- a/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc -+++ b/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc -@@ -46,7 +46,7 @@ - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && - f.cf_id != cfd_->GetID()) { - return Status::InvalidArgument( -- "External file column family id dont match"); -+ "External file column family id don't match"); - } - } - -@@ -646,7 +646,7 @@ - return Status::InvalidArgument("Global seqno is required, but disabled"); - } else if (file_to_ingest->global_seqno_offset == 0) { - return Status::InvalidArgument( -- "Trying to set global seqno for a file that dont have a global seqno " -+ "Trying to set global seqno for a file that don't have a global seqno " - "field"); - } - ---- a/storage/rocksdb/rocksdb/include/rocksdb/cache.h -+++ b/storage/rocksdb/rocksdb/include/rocksdb/cache.h -@@ -60,7 +60,7 @@ - // If greater than zero, the LRU list will be split into a high-pri - // list and a low-pri list. High-pri entries will be insert to the - // tail of high-pri list, while low-pri entries will be first inserted to -- // the low-pri list (the midpoint). This is refered to as -+ // the low-pri list (the midpoint). This is referred to as - // midpoint insertion strategy to make entries never get hit in cache - // age out faster. 
- // diff -Nru mariadb-10.11.11/debian/patches/rocksdb-kfreebsd.patch mariadb-10.11.13/debian/patches/rocksdb-kfreebsd.patch --- mariadb-10.11.11/debian/patches/rocksdb-kfreebsd.patch 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/rocksdb-kfreebsd.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,150 +0,0 @@ -Forwarded: https://github.com/facebook/rocksdb/pull/6992 -From: Andrew Kryczka -Date: Tue, 16 Jun 2020 19:34:21 -0700 -# Merged in RocksDB 6.13.fb, but not updated into MariaDB yet -Bug: https://jira.mariadb.org/browse/MDEV-19251 -Description: - Upstream has merged this but we still need to wait for it to be included - in a RocksDB release and imported into MariaDB and then into Debian. ---- a/storage/rocksdb/build_rocksdb.cmake -+++ b/storage/rocksdb/build_rocksdb.cmake -@@ -90,6 +90,8 @@ - add_definitions(-DOS_LINUX) - elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") - add_definitions(-DOS_SOLARIS) -+elseif(CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") -+ add_definitions(-DOS_GNU_KFREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_definitions(-DOS_FREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") ---- a/storage/rocksdb/rocksdb/CMakeLists.txt -+++ b/storage/rocksdb/rocksdb/CMakeLists.txt -@@ -91,7 +91,7 @@ - option(WITH_XPRESS "build with windows built in compression" OFF) - include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) - else() -- if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") -+ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") - # FreeBSD has jemalloc as default malloc - # but it does not have all the jemalloc files in include/... 
- set(WITH_JEMALLOC ON) -@@ -413,6 +413,8 @@ - add_definitions(-DOS_LINUX) - elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") - add_definitions(-DOS_SOLARIS) -+elseif(CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") -+ add_definitions(-DOS_GNU_KFREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - add_definitions(-DOS_FREEBSD) - elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") ---- a/storage/rocksdb/rocksdb/build_tools/build_detect_platform -+++ b/storage/rocksdb/rocksdb/build_tools/build_detect_platform -@@ -190,6 +190,17 @@ - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" - # PORT_FILES=port/freebsd/freebsd_specific.cc - ;; -+ GNU/kFreeBSD) -+ PLATFORM=OS_GNU_KFREEBSD -+ COMMON_FLAGS="$COMMON_FLAGS -DOS_GNU_KFREEBSD" -+ if [ -z "$USE_CLANG" ]; then -+ COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" -+ else -+ PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" -+ fi -+ PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" -+ # PORT_FILES=port/gnu_kfreebsd/gnu_kfreebsd_specific.cc -+ ;; - NetBSD) - PLATFORM=OS_NETBSD - COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" ---- a/storage/rocksdb/rocksdb/env/env_posix.cc -+++ b/storage/rocksdb/rocksdb/env/env_posix.cc -@@ -41,7 +41,7 @@ - #include - #include - // Get nano time includes --#if defined(OS_LINUX) || defined(OS_FREEBSD) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) - #elif defined(__MACH__) - #include - #include -@@ -287,7 +287,8 @@ - } - - uint64_t NowNanos() override { --#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ -+ defined(OS_AIX) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -@@ -307,8 +308,8 @@ - } - - uint64_t NowCPUNanos() override { --#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) || \ -- (defined(__MACH__) && defined(__MAC_10_12)) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || 
defined(OS_GNU_KFREEBSD) || \ -+ defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) - struct timespec ts; - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; ---- a/storage/rocksdb/rocksdb/port/stack_trace.cc -+++ b/storage/rocksdb/rocksdb/port/stack_trace.cc -@@ -32,7 +32,7 @@ - - namespace { - --#if defined(OS_LINUX) || defined(OS_FREEBSD) -+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) - const char* GetExecutableName() { - static char name[1024]; - ---- a/storage/rocksdb/rdb_io_watchdog.h -+++ b/storage/rocksdb/rdb_io_watchdog.h -@@ -56,19 +56,19 @@ - int stop_timers() { - int ret = 0; - -- if (m_io_check_watchdog_timer) { -+ if (m_io_check_watchdog_timer != reinterpret_cast(-1)) { - ret = timer_delete(m_io_check_watchdog_timer); - - if (!ret) { -- m_io_check_watchdog_timer = nullptr; -+ m_io_check_watchdog_timer = reinterpret_cast(-1); - } - } - -- if (m_io_check_timer && !ret) { -+ if (m_io_check_timer != reinterpret_cast(-1) && !ret) { - ret = timer_delete(m_io_check_timer); - - if (!ret) { -- m_io_check_timer = nullptr; -+ m_io_check_timer = reinterpret_cast(-1); - } - } - -@@ -93,8 +93,8 @@ - - public: - explicit Rdb_io_watchdog(std::vector &&directories) -- : m_io_check_timer(nullptr), -- m_io_check_watchdog_timer(nullptr), -+ : m_io_check_timer(reinterpret_cast(-1)), -+ m_io_check_watchdog_timer(reinterpret_cast(-1)), - m_io_in_progress(false), - m_dirs_to_check(std::move(directories)), - m_buf(nullptr) { ---- a/storage/rocksdb/rdb_io_watchdog.cc -+++ b/storage/rocksdb/rdb_io_watchdog.cc -@@ -111,7 +111,7 @@ - sql_print_warning("Deleting the watchdog I/O timer failed with %d.", errno); - } - -- m_io_check_watchdog_timer = nullptr; -+ m_io_check_watchdog_timer = reinterpret_cast(-1); - - RDB_MUTEX_UNLOCK_CHECK(m_reset_mutex); - } diff -Nru mariadb-10.11.11/debian/patches/series mariadb-10.11.13/debian/patches/series --- mariadb-10.11.11/debian/patches/series 
2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/patches/series 2025-05-23 21:26:02.000000000 +0000 @@ -1,5 +1,2 @@ -rocksdb-kfreebsd.patch env-perl-usr-bin-perl.patch -fix-spelling-rocksdb.patch -fix-reproducible-builds-rocksdb.patch mroonga-mrn-lib-dirs-path-reproducible-build.patch diff -Nru mariadb-10.11.11/debian/salsa-ci-enable-sec-and-update-repos.sh mariadb-10.11.13/debian/salsa-ci-enable-sec-and-update-repos.sh --- mariadb-10.11.11/debian/salsa-ci-enable-sec-and-update-repos.sh 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/salsa-ci-enable-sec-and-update-repos.sh 2025-05-23 21:26:02.000000000 +0000 @@ -1,10 +1,14 @@ #!/bin/sh -set -x -set -e +echo "Running salsa-ci-enable-sec-and-update-repos.sh to enable the same" +echo "repositories thate were available at build time in e.g." +echo "registry.salsa.debian.org/salsa-ci-team/pipeline/base:bullseye" + +# Debug what repositories are available to begin +head /etc/apt/sources.list /etc/apt/sources.list.d/* || true -# Debug what repositories are available to begin with -grep -r "^deb " /etc/apt/sources.* +# Fail on non-zero exit codes from this point onward +set -e # Enable the same repositories that were available at build time in # registry.salsa.debian.org/salsa-ci-team/pipeline/base:bullseye diff -Nru mariadb-10.11.11/debian/salsa-ci.yml mariadb-10.11.13/debian/salsa-ci.yml --- mariadb-10.11.11/debian/salsa-ci.yml 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/salsa-ci.yml 2025-05-23 21:26:02.000000000 +0000 @@ -24,10 +24,17 @@ # For unknown reason Lintian v2.116.3 in Bookworm errors on valid changelog entry SALSA_CI_LINTIAN_SUPPRESS_TAGS: 'bad-distribution-in-changes-file' -# Extend Salsa-CI build jobs to have longer timeout as the default GitLab -# timeout (1h) is often not enough .build-package: + # Extend Salsa CI build jobs to have longer timeout as the default GitLab + # timeout (1h) is often not enough timeout: 3h + # Default 5G sporadically fails 
builds on not having enough disk space + variables: + CCACHE_MAXSIZE: 3G + # Salsa instance runners typically have 30G volumes with 14G free disk space + before_script: + - echo "Total and free disk space:" + - df -h . stages: - provisioning diff -Nru mariadb-10.11.11/debian/tests/traces/mariadb-verbose-help.expected mariadb-10.11.13/debian/tests/traces/mariadb-verbose-help.expected --- mariadb-10.11.11/debian/tests/traces/mariadb-verbose-help.expected 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/tests/traces/mariadb-verbose-help.expected 2025-05-23 21:26:02.000000000 +0000 @@ -156,9 +156,8 @@ --ssl-crlpath=name Certificate revocation list path (implies --ssl). --tls-version=name TLS protocol version for secure connection. --ssl-verify-server-cert - Verify server's "Common Name" in its cert against - hostname used when connecting. This option is disabled by - default. + Verify server's certificate to prevent man-in-the-middle + attacks -t, --table Output in table format. --tee=name Append everything into outfile. See interactive help (\h) also. Does not work in batch mode. Disable with diff -Nru mariadb-10.11.11/debian/tests/traces/mariadbd-verbose-help.expected mariadb-10.11.13/debian/tests/traces/mariadbd-verbose-help.expected --- mariadb-10.11.11/debian/tests/traces/mariadbd-verbose-help.expected 2025-02-19 00:56:41.000000000 +0000 +++ mariadb-10.11.13/debian/tests/traces/mariadbd-verbose-help.expected 2025-05-23 21:26:02.000000000 +0000 @@ -575,9 +575,7 @@ FORCE_PLUS_PERMANENT (like FORCE, but the plugin can not be uninstalled). --innodb-buffer-pool-chunk-size=# - Size of a single memory chunk for resizing buffer pool. - Online buffer pool resizing happens at this granularity. - 0 means autosize this variable based on buffer pool size. 
+ Deprecated parameter with no effect --innodb-buffer-pool-dump-at-shutdown Dump the buffer pool into a file named @@innodb_buffer_pool_filename @@ -603,6 +601,11 @@ --innodb-buffer-pool-size=# The size of the memory buffer InnoDB uses to cache data and indexes of its tables. + --innodb-buffer-pool-size-auto-min=# + Minimum innodb_buffer_pool_size for dynamic shrinking on + memory pressure + --innodb-buffer-pool-size-max=# + Maximum innodb_buffer_pool_size --innodb-buffer-pool-stats[=name] Enable or disable INNODB_BUFFER_POOL_STATS plugin. One of: ON, OFF, FORCE (don't start if the plugin fails to @@ -883,6 +886,9 @@ be uninstalled). --innodb-log-buffer-size=# Redo log buffer size in bytes. + --innodb-log-checkpoint-now + Write back dirty pages from the buffer pool and update + the log checkpoint --innodb-log-file-buffering Whether the file system cache for ib_logfile0 is enabled --innodb-log-file-mmap @@ -894,8 +900,7 @@ --innodb-log-group-home-dir=name Path to ib_logfile0 --innodb-log-spin-wait-delay[=#] - Delay between log buffer spin lock polls (0 to use a - blocking latch) + Deprecated parameter with no effect --innodb-log-write-ahead-size=# Redo log write size to avoid read-on-write; must be a power of two @@ -1449,7 +1454,8 @@ keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in - selectivity_for_indexes. selectivity_multiplier. This + selectivity_for_indexes. fix_derived_table_read_cost = + Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. Use 'ALL' to set all combinations. 
@@ -2611,6 +2617,8 @@ innodb-buffer-pool-load-at-startup TRUE innodb-buffer-pool-load-now FALSE innodb-buffer-pool-size 134217728 +innodb-buffer-pool-size-auto-min 0 +innodb-buffer-pool-size-max 0 innodb-buffer-pool-stats ON innodb-change-buffer-max-size 25 innodb-change-buffering none @@ -2685,6 +2693,7 @@ innodb-lock-waits ON innodb-locks ON innodb-log-buffer-size 16777216 +innodb-log-checkpoint-now FALSE innodb-log-file-buffering FALSE innodb-log-file-mmap TRUE innodb-log-file-size 100663296 diff -Nru mariadb-10.11.11/extra/mariabackup/backup_mysql.cc mariadb-10.11.13/extra/mariabackup/backup_mysql.cc --- mariadb-10.11.11/extra/mariabackup/backup_mysql.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/backup_mysql.cc 2025-05-19 16:14:24.000000000 +0000 @@ -1893,7 +1893,7 @@ srv_log_file_size, srv_page_size, srv_undo_dir, - (uint) srv_undo_tablespaces, + srv_undo_tablespaces, page_zip_level, innobase_buffer_pool_filename ? "innodb_buffer_pool_filename=" : "", diff -Nru mariadb-10.11.11/extra/mariabackup/common_engine.cc mariadb-10.11.13/extra/mariabackup/common_engine.cc --- mariadb-10.11.11/extra/mariabackup/common_engine.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/common_engine.cc 2025-05-19 16:14:24.000000000 +0000 @@ -64,8 +64,10 @@ for (const auto &fname : m_fnames) { File file = mysql_file_open(0, fname.c_str(),O_RDONLY | O_SHARE, MYF(0)); if (file < 0) { - msg(thread_num, "Error on file %s open during %s table copy", - fname.c_str(), full_tname.c_str()); + char buf[MYSYS_STRERROR_SIZE]; + msg(thread_num, "Error %i on file %s open during %s table copy: %s", + errno, fname.c_str(), full_tname.c_str(), + my_strerror(buf, sizeof(buf), errno)); goto exit; } files.push_back(file); diff -Nru mariadb-10.11.11/extra/mariabackup/innobackupex.cc mariadb-10.11.13/extra/mariabackup/innobackupex.cc --- mariadb-10.11.11/extra/mariabackup/innobackupex.cc 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/extra/mariabackup/innobackupex.cc 2025-05-19 16:14:24.000000000 +0000 @@ -44,8 +44,8 @@ #include #include #include -#include #include +#include "buf0buf.h" #include #include #include @@ -594,8 +594,9 @@ "--apply-log.", (uchar*) &ibx_xtrabackup_use_memory, (uchar*) &ibx_xtrabackup_use_memory, - 0, GET_LL, REQUIRED_ARG, 100*1024*1024L, 1024*1024L, LONGLONG_MAX, 0, - 1024*1024L, 0}, + 0, GET_LL, REQUIRED_ARG, 96 << 20, + innodb_buffer_pool_extent_size, SIZE_T_MAX, 0, + innodb_buffer_pool_extent_size, 0}, {"innodb-force-recovery", OPT_INNODB_FORCE_RECOVERY, "This option starts up the embedded InnoDB instance in crash " diff -Nru mariadb-10.11.11/extra/mariabackup/write_filt.cc mariadb-10.11.13/extra/mariabackup/write_filt.cc --- mariadb-10.11.11/extra/mariabackup/write_filt.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/write_filt.cc 2025-05-19 16:14:24.000000000 +0000 @@ -144,18 +144,6 @@ return false; } - /* Check whether TRX_SYS page has been changed */ - if (mach_read_from_4(page + FIL_PAGE_SPACE_ID) - == TRX_SYS_SPACE - && mach_read_from_4(page + FIL_PAGE_OFFSET) - == TRX_SYS_PAGE_NO) { - msg(cursor->thread_n, - "--incremental backup is impossible if " - "the server had been restarted with " - "different innodb_undo_tablespaces."); - return false; - } - /* updated page */ if (cp->npages == page_size / 4) { /* flush buffer */ diff -Nru mariadb-10.11.11/extra/mariabackup/xtrabackup.cc mariadb-10.11.13/extra/mariabackup/xtrabackup.cc --- mariadb-10.11.11/extra/mariabackup/xtrabackup.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/extra/mariabackup/xtrabackup.cc 2025-05-19 16:14:24.000000000 +0000 @@ -201,8 +201,6 @@ xb_filter_entry_t *name_hash; }; -lsn_t checkpoint_lsn_start; -lsn_t checkpoint_no_start; /** whether log_copying_thread() is active; protected by recv_sys.mutex */ static bool log_copying_running; /** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */ @@ -1383,6 +1381,7 
@@ OPT_XTRA_MYSQLD_ARGS, OPT_XB_IGNORE_INNODB_PAGE_CORRUPTION, OPT_INNODB_FORCE_RECOVERY, + OPT_INNODB_CHECKPOINT, OPT_ARIA_LOG_DIR_PATH }; @@ -1414,8 +1413,9 @@ "The value is used in place of innodb_buffer_pool_size. " "This option is only relevant when the --prepare option is specified.", (G_PTR *) &xtrabackup_use_memory, (G_PTR *) &xtrabackup_use_memory, 0, - GET_LL, REQUIRED_ARG, 100 * 1024 * 1024L, 1024 * 1024L, LONGLONG_MAX, 0, - 1024 * 1024L, 0}, + GET_ULL, REQUIRED_ARG, 96 << 20, innodb_buffer_pool_extent_size, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), + 0, innodb_buffer_pool_extent_size, 0}, {"throttle", OPT_XTRA_THROTTLE, "limit count of IO operations (pairs of read&write) per second to IOS " "values (for '--backup')", @@ -1787,10 +1787,7 @@ static const char *dbug_option; #endif -#ifdef HAVE_URING -extern const char *io_uring_may_be_unsafe; -bool innodb_use_native_aio_default(); -#endif +static my_bool innodb_log_checkpoint_now; struct my_option xb_server_options[] = { @@ -1927,12 +1924,7 @@ "Use native AIO if supported on this platform.", (G_PTR*) &srv_use_native_aio, (G_PTR*) &srv_use_native_aio, 0, GET_BOOL, NO_ARG, -#ifdef HAVE_URING - innodb_use_native_aio_default(), -#else - TRUE, -#endif - 0, 0, 0, 0, 0}, + TRUE, 0, 0, 0, 0, 0}, {"innodb_page_size", OPT_INNODB_PAGE_SIZE, "The universal page size of the database.", (G_PTR*) &innobase_page_size, (G_PTR*) &innobase_page_size, 0, @@ -2019,6 +2011,12 @@ (G_PTR*)&srv_force_recovery, 0, GET_ULONG, OPT_ARG, 0, 0, SRV_FORCE_IGNORE_CORRUPT, 0, 0, 0}, + {"innodb_log_checkpoint_now", OPT_INNODB_CHECKPOINT, + "(for --backup): Force an InnoDB checkpoint", + (G_PTR*)&innodb_log_checkpoint_now, + (G_PTR*)&innodb_log_checkpoint_now, + 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0}, + {"mysqld-args", OPT_XTRA_MYSQLD_ARGS, "All arguments that follow this argument are considered as server " "options, and if some of them are not supported by mariabackup, they " @@ -2482,7 +2480,7 @@ } 
srv_sys_space.normalize_size(); - srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); + srv_lock_table_size = 5 * buf_pool.curr_size(); /* -------------- Log files ---------------------------*/ @@ -2504,11 +2502,8 @@ srv_adaptive_flushing = FALSE; - /* We set srv_pool_size here in units of 1 kB. InnoDB internally - changes the value so that it becomes the number of database pages. */ - - srv_buf_pool_size = (ulint) xtrabackup_use_memory; - srv_buf_pool_chunk_unit = srv_buf_pool_size; + buf_pool.size_in_bytes_max = size_t(xtrabackup_use_memory); + buf_pool.size_in_bytes_requested = buf_pool.size_in_bytes_max; srv_n_read_io_threads = (uint) innobase_read_io_threads; srv_n_write_io_threads = (uint) innobase_write_io_threads; @@ -2534,12 +2529,8 @@ msg("InnoDB: Using Linux native AIO"); } #elif defined(HAVE_URING) - if (!srv_use_native_aio) { - } else if (io_uring_may_be_unsafe) { - msg("InnoDB: Using liburing on this kernel %s may cause hangs;" - " see https://jira.mariadb.org/browse/MDEV-26674", - io_uring_may_be_unsafe); - } else { + + if (srv_use_native_aio) { msg("InnoDB: Using liburing"); } #else @@ -2679,7 +2670,7 @@ } recv_sys.lsn= log_sys.next_checkpoint_lsn= - log_sys.get_lsn() - SIZE_OF_FILE_CHECKPOINT; + log_get_lsn() - SIZE_OF_FILE_CHECKPOINT; log_sys.set_latest_format(false); // not encrypted log_hdr_init(); byte *b= &log_hdr_buf[log_t::START_OFFSET]; @@ -2946,6 +2937,15 @@ const regex_list_t& list, const char* name) { + if (list.empty()) return (FALSE); + + /* + regexec/pcre2_regexec is not threadsafe, also documented. + Serialize access from multiple threads to compiled regexes. 
+ */ + static std::mutex regex_match_mutex; + std::lock_guard lock(regex_match_mutex); + regmatch_t tables_regmatch[1]; for (regex_list_t::const_iterator i = list.begin(), end = list.end(); i != end; ++i) { @@ -5405,6 +5405,14 @@ } msg("cd to %s", mysql_real_data_home); encryption_plugin_backup_init(mysql_connection); + if (innodb_log_checkpoint_now != false && mysql_send_query( + mysql_connection, + C_STRING_WITH_LEN("SET GLOBAL " + "innodb_log_checkpoint_now=ON;"))) { + msg("initiating checkpoint failed"); + return(false); + } + msg("open files limit requested %lu, set to %lu", xb_open_files_limit, xb_set_max_open_files(xb_open_files_limit)); @@ -5517,6 +5525,11 @@ goto fail; } + /* try to wait for a log checkpoint, but do not fail if the + server does not support this */ + if (innodb_log_checkpoint_now != false) { + mysql_read_query_result(mysql_connection); + } /* label it */ recv_sys.file_checkpoint = log_sys.next_checkpoint_lsn; log_hdr_init(); @@ -6230,9 +6243,22 @@ buf + FSP_HEADER_OFFSET + FSP_SIZE); if (mach_read_from_4(buf + FIL_PAGE_SPACE_ID)) { +#ifdef _WIN32 + os_offset_t last_page = + os_file_get_size(dst_file) / + page_size; + + /* os_file_set_size() would + shrink the size of the file */ + if (last_page < n_pages && + !os_file_set_size( + dst_path, dst_file, + n_pages * page_size)) +#else if (!os_file_set_size( dst_path, dst_file, n_pages * page_size)) +#endif /* _WIN32 */ goto error; } else if (fil_space_t* space = fil_system.sys_space) { diff -Nru mariadb-10.11.11/include/json_lib.h mariadb-10.11.13/include/json_lib.h --- mariadb-10.11.11/include/json_lib.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/json_lib.h 2025-05-19 16:14:24.000000000 +0000 @@ -387,7 +387,7 @@ Returns negative integer in the case of an error, the length of the result otherwise. 
*/ -int json_unescape(CHARSET_INFO *json_cs, +int __attribute__((warn_unused_result)) json_unescape(CHARSET_INFO *json_cs, const uchar *json_str, const uchar *json_end, CHARSET_INFO *res_cs, uchar *res, uchar *res_end); @@ -401,7 +401,8 @@ JSON_ERROR_OUT_OF_SPACE Not enough space in the provided buffer JSON_ERROR_ILLEGAL_SYMBOL Source symbol cannot be represented in JSON */ -int json_escape(CHARSET_INFO *str_cs, const uchar *str, const uchar *str_end, +int __attribute__((warn_unused_result)) json_escape(CHARSET_INFO *str_cs, + const uchar *str, const uchar *str_end, CHARSET_INFO *json_cs, uchar *json, uchar *json_end); diff -Nru mariadb-10.11.11/include/my_base.h mariadb-10.11.13/include/my_base.h --- mariadb-10.11.11/include/my_base.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_base.h 2025-05-19 16:14:24.000000000 +0000 @@ -219,7 +219,10 @@ /** Start writing rows during ALTER TABLE...ALGORITHM=COPY. */ HA_EXTRA_BEGIN_ALTER_COPY, /** Finish writing rows during ALTER TABLE...ALGORITHM=COPY. */ - HA_EXTRA_END_ALTER_COPY + HA_EXTRA_END_ALTER_COPY, + /** Abort of writing rows during ALTER TABLE..ALGORITHM=COPY or + CREATE..SELCT */ + HA_EXTRA_ABORT_ALTER_COPY }; /* Compatible option, to be deleted in 6.0 */ diff -Nru mariadb-10.11.11/include/my_cpu.h mariadb-10.11.13/include/my_cpu.h --- mariadb-10.11.11/include/my_cpu.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_cpu.h 2025-05-19 16:14:24.000000000 +0000 @@ -97,7 +97,12 @@ /* Changed from __ppc_get_timebase for musl and clang compatibility */ __builtin_ppc_get_timebase(); #elif defined __GNUC__ && defined __riscv - __builtin_riscv_pause(); + /* The GCC-only __builtin_riscv_pause() or the pause instruction is + encoded like a fence instruction with special parameters. On RISC-V + implementations that do not support arch=+zihintpause this + instruction could be interpreted as a more expensive memory fence; + it should not be an illegal instruction. 
*/ + __asm__ volatile(".long 0x0100000f" ::: "memory"); #elif defined __GNUC__ /* Mainly, prevent the compiler from optimizing away delay loops */ __asm__ __volatile__ ("":::"memory"); diff -Nru mariadb-10.11.11/include/my_stack_alloc.h mariadb-10.11.13/include/my_stack_alloc.h --- mariadb-10.11.11/include/my_stack_alloc.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_stack_alloc.h 2025-05-19 16:14:24.000000000 +0000 @@ -38,6 +38,8 @@ #if defined(__GNUC__) || defined(__clang__) /* GCC and Clang compilers */ #if defined(__i386__) /* Intel x86 (32-bit) */ __asm__ volatile ("movl %%esp, %0" : "=r" (stack_ptr)); +#elif defined(__x86_64__) && defined (__ILP32__) /* Intel x86-64 (64-bit), X32 ABI */ + __asm__ volatile ("movl %%esp, %0" : "=r" (stack_ptr)); #elif defined(__x86_64__) /* Intel x86-64 (64-bit) */ __asm__ volatile ("movq %%rsp, %0" : "=r" (stack_ptr)); #elif defined(__powerpc__) /* PowerPC (32-bit) */ diff -Nru mariadb-10.11.11/include/my_sys.h mariadb-10.11.13/include/my_sys.h --- mariadb-10.11.11/include/my_sys.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/my_sys.h 2025-05-19 16:14:24.000000000 +0000 @@ -173,9 +173,15 @@ extern void *my_memdup(PSI_memory_key key, const void *from,size_t length,myf MyFlags); extern char *my_strdup(PSI_memory_key key, const char *from,myf MyFlags); extern char *my_strndup(PSI_memory_key key, const char *from, size_t length, myf MyFlags); +extern my_bool my_use_large_pages; -int my_init_large_pages(my_bool super_large_pages); +int my_init_large_pages(void); uchar *my_large_malloc(size_t *size, myf my_flags); +#ifdef _WIN32 +/* On Windows, use my_virtual_mem_reserve() and my_virtual_mem_commit(). 
*/ +#else +char *my_large_virtual_alloc(size_t *size); +#endif void my_large_free(void *ptr, size_t size); void my_large_page_truncate(size_t *size); diff -Nru mariadb-10.11.11/include/my_virtual_mem.h mariadb-10.11.13/include/my_virtual_mem.h --- mariadb-10.11.11/include/my_virtual_mem.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/include/my_virtual_mem.h 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,37 @@ +/* Copyright (c) 2025, MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#pragma once +/* + Functionality for handling virtual memory + (reserve, commit, decommit, release) +*/ +#include /*size_t*/ + +#ifdef __cplusplus +extern "C" { +#endif + +# ifdef _WIN32 +char *my_virtual_mem_reserve(size_t *size); +# endif +char *my_virtual_mem_commit(char *ptr, size_t size); +void my_virtual_mem_decommit(char *ptr, size_t size); +void my_virtual_mem_release(char *ptr, size_t size); + +#ifdef __cplusplus +} +#endif + diff -Nru mariadb-10.11.11/include/source_revision.h mariadb-10.11.13/include/source_revision.h --- mariadb-10.11.11/include/source_revision.h 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/include/source_revision.h 2025-05-19 16:14:28.000000000 +0000 @@ -1 +1 @@ -#define SOURCE_REVISION "e69f8cae1a15e15b9e4f5e0f8497e1f17bdc81a4" +#define SOURCE_REVISION "8fb09426b98583916ccfd4f8c49741adc115bac3" diff -Nru 
mariadb-10.11.11/include/sslopt-longopts.h mariadb-10.11.13/include/sslopt-longopts.h --- mariadb-10.11.11/include/sslopt-longopts.h 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/include/sslopt-longopts.h 2025-05-19 16:14:24.000000000 +0000 @@ -51,8 +51,7 @@ #ifdef MYSQL_CLIENT {"ssl-verify-server-cert", 0, - "Verify server's \"Common Name\" in its cert against hostname used " - "when connecting. This option is disabled by default.", + "Verify server's certificate to prevent man-in-the-middle attacks", &opt_ssl_verify_server_cert, &opt_ssl_verify_server_cert, 0, GET_BOOL, OPT_ARG, 0, 0, 0, 0, 0, 0}, #endif diff -Nru mariadb-10.11.11/libmariadb/CMakeLists.txt mariadb-10.11.13/libmariadb/CMakeLists.txt --- mariadb-10.11.11/libmariadb/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -52,7 +52,7 @@ SET(CPACK_PACKAGE_VERSION_MAJOR 3) SET(CPACK_PACKAGE_VERSION_MINOR 3) -SET(CPACK_PACKAGE_VERSION_PATCH 14) +SET(CPACK_PACKAGE_VERSION_PATCH 16) SET(CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") MATH(EXPR MARIADB_PACKAGE_VERSION_ID "${CPACK_PACKAGE_VERSION_MAJOR} * 10000 + ${CPACK_PACKAGE_VERSION_MINOR} * 100 + diff -Nru mariadb-10.11.11/libmariadb/include/errmsg.h mariadb-10.11.13/libmariadb/include/errmsg.h --- mariadb-10.11.11/libmariadb/include/errmsg.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/include/errmsg.h 2025-05-19 16:14:27.000000000 +0000 @@ -115,10 +115,11 @@ #define CR_BINLOG_INVALID_FILE 5022 #define CR_BINLOG_SEMI_SYNC_ERROR 5023 #define CR_INVALID_CLIENT_FLAG 5024 +#define CR_ERR_MISSING_ERROR_INFO 5026 /* Always last, if you add new error codes please update the value for CR_MARIADB_LAST_ERROR */ -#define CR_MARIADB_LAST_ERROR CR_INVALID_CLIENT_FLAG +#define CR_MARIADB_LAST_ERROR CR_ERR_MISSING_ERROR_INFO #endif diff -Nru 
mariadb-10.11.11/libmariadb/include/ma_context.h mariadb-10.11.13/libmariadb/include/ma_context.h --- mariadb-10.11.11/libmariadb/include/ma_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/include/ma_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -26,8 +26,33 @@ (This particular implementation uses Posix ucontext swapcontext().) */ + +/* + When running with address sanitizer, the stack switching can cause confusion + unless the __sanitizer_{start,finish}_switch_fiber() functions are used + (CONC-618). + + In this case prefer the use of boost::context or ucontext, which should have + this instrumentation, over our custom assembler variants. +*/ +#ifdef __has_feature + /* Clang */ +# if __has_feature(address_sanitizer) +# define ASAN_PREFER_NON_ASM 1 +# endif +#else + /* GCC */ +# ifdef __SANITIZE_ADDRESS__ +# define ASAN_PREFER_NON_ASM 1 +# endif +#endif + #ifdef _WIN32 #define MY_CONTEXT_USE_WIN32_FIBERS 1 +#elif defined(ASAN_PREFER_NON_ASM) && defined(HAVE_BOOST_CONTEXT_H) +#define MY_CONTEXT_USE_BOOST_CONTEXT +#elif defined(ASAN_PREFER_NON_ASM) && defined(HAVE_UCONTEXT_H) +#define MY_CONTEXT_USE_UCONTEXT #elif defined(__GNUC__) && __GNUC__ >= 3 && defined(__x86_64__) && !defined(__ILP32__) #define MY_CONTEXT_USE_X86_64_GCC_ASM #elif defined(__GNUC__) && __GNUC__ >= 3 && defined(__i386__) diff -Nru mariadb-10.11.11/libmariadb/include/mariadb_com.h mariadb-10.11.13/libmariadb/include/mariadb_com.h --- mariadb-10.11.11/libmariadb/include/mariadb_com.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/include/mariadb_com.h 2025-05-19 16:14:27.000000000 +0000 @@ -423,6 +423,28 @@ double max_value_dbl; }; + /* The following is for user defined functions */ + +typedef struct st_udf_args +{ + unsigned int arg_count; /* Number of arguments */ + enum Item_result *arg_type; /* Pointer to item_results */ + char **args; /* Pointer to argument */ + unsigned long *lengths; /* Length of string arguments */ + char *maybe_null; /* 
Set to 1 for all maybe_null args */ +} UDF_ARGS; + + /* This holds information about the result */ + +typedef struct st_udf_init +{ + my_bool maybe_null; /* 1 if function can return NULL */ + unsigned int decimals; /* for real functions */ + unsigned int max_length; /* For string functions */ + char *ptr; /* free pointer for function data */ + my_bool const_item; /* 0 if result is independent of arguments */ +} UDF_INIT; + /* Connection types */ #define MARIADB_CONNECTION_UNIXSOCKET 0 #define MARIADB_CONNECTION_TCP 1 diff -Nru mariadb-10.11.11/libmariadb/libmariadb/CMakeLists.txt mariadb-10.11.13/libmariadb/libmariadb/CMakeLists.txt --- mariadb-10.11.11/libmariadb/libmariadb/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -168,12 +168,6 @@ mysql_use_result mysql_warning_count) -# some gcc versions fail to compile asm parts of my_context.c, -# if build type is "Release" (see CONC-133), so we need to add -g flag -IF(CMAKE_COMPILER_IS_GNUCC AND CMAKE_BUILD_TYPE MATCHES "Release") - SET_SOURCE_FILES_PROPERTIES(my_context.c PROPERTIES COMPILE_FLAGS -g) -ENDIF() - IF(ZLIB_FOUND AND WITH_EXTERNAL_ZLIB) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) ELSE() diff -Nru mariadb-10.11.11/libmariadb/libmariadb/ma_context.c mariadb-10.11.13/libmariadb/libmariadb/ma_context.c --- mariadb-10.11.11/libmariadb/libmariadb/ma_context.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/ma_context.c 2025-05-19 16:14:27.000000000 +0000 @@ -105,9 +105,23 @@ c->user_func= f; c->user_data= d; c->active= 1; + u.a[1]= 0; /* Otherwise can give uninitialized warnings on 32-bit. */ u.p= c; + /* + makecontext function expects function pointer to receive multiple + ints as an arguments, however is declared in ucontext.h header with + a void (empty) argument list. Ignore clang cast-function-type-strict + warning for this function call. 
+ */ +# ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-function-type-strict" +# endif makecontext(&c->spawned_context, (uc_func_t)my_context_spawn_internal, 2, u.a[0], u.a[1]); +# ifdef __clang__ +# pragma clang diagnostic pop +# endif return my_context_continue(c); } @@ -204,7 +218,7 @@ ( "movq %%rsp, (%[save])\n\t" "movq %[stack], %%rsp\n\t" -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) || __clang__) && !defined(__INTEL_COMPILER) +#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || (defined(__clang__) && __clang_major__ < 13) /* This emits a DWARF DW_CFA_undefined directive to make the return address undefined. This indicates that this is the top of the stack frame, and @@ -440,7 +454,7 @@ ( "movl %%esp, (%[save])\n\t" "movl %[stack], %%esp\n\t" -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) || __clang__) && !defined(__INTEL_COMPILER) +#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || (defined(__clang__) && __clang_major__ < 13) /* This emits a DWARF DW_CFA_undefined directive to make the return address undefined. This indicates that this is the top of the stack frame, and @@ -675,7 +689,7 @@ ( "mov x10, sp\n\t" "mov sp, %[stack]\n\t" -#if !defined(__INTEL_COMPILER) +#if defined(__GCC_HAVE_DWARF2_CFI_ASM) || (defined(__clang__) && __clang_major__ < 13) /* This emits a DWARF DW_CFA_undefined directive to make the return address (UNW_AARCH64_X30) undefined. 
This indicates that this is the top of the @@ -724,7 +738,11 @@ [stack] "+r" (stack) : [save] "r" (save) : "x3", "x4", "x5", "x6", "x7", - "x9", "x10", "x11", "x14", "x15", "x18", "x30", + "x9", "x10", "x11", "x14", "x15", +#if defined(__linux__) && !defined(__ANDROID__) + "x18", +#endif + "x30", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", @@ -827,7 +845,11 @@ : [ret] "=r" (ret) : [save] "r" (save) : "x1", "x2", "x3", "x4", "x5", "x6", "x7", - "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x18", "x30", + "x9", "x10", "x11", "x12", "x13", "x14", "x15", +#if defined(__linux__) && !defined(__ANDROID__) + "x18", +#endif + "x30", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", @@ -904,7 +926,11 @@ : : [save] "r" (save) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", - "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x18", "x30", + "x9", "x10", "x11", "x12", "x13", "x14", "x15", +#if defined(__linux__) && !defined(__ANDROID__) + "x18", +#endif + "x30", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", diff -Nru mariadb-10.11.11/libmariadb/libmariadb/ma_errmsg.c mariadb-10.11.13/libmariadb/libmariadb/ma_errmsg.c --- mariadb-10.11.11/libmariadb/libmariadb/ma_errmsg.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/ma_errmsg.c 2025-05-19 16:14:27.000000000 +0000 @@ -119,6 +119,8 @@ /* 5022 */ "File '%s' is not a binary log file", /* 5023 */ "Semi sync request error: %s", /* 5024 */ "Invalid client flags (%lu) specified. 
Supported flags: %lu", + /* 5025 */ "", + /* 5026 */ "Server returned an error packet without further information", "" }; diff -Nru mariadb-10.11.11/libmariadb/libmariadb/mariadb_lib.c mariadb-10.11.13/libmariadb/libmariadb/mariadb_lib.c --- mariadb-10.11.11/libmariadb/libmariadb/mariadb_lib.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/mariadb_lib.c 2025-05-19 16:14:27.000000000 +0000 @@ -81,7 +81,7 @@ #define strncasecmp _strnicmp #endif -#define ASYNC_CONTEXT_DEFAULT_STACK_SIZE (4096*15) +#define ASYNC_CONTEXT_DEFAULT_STACK_SIZE (256*1024) #define MA_RPL_VERSION_HACK "5.5.5-" #define CHARSET_NAME_LEN 64 @@ -274,6 +274,11 @@ ma_strmake(net->last_error,(char*) pos, min(len,sizeof(net->last_error)-1)); } + /* MDEV-35935: if server sends error packet without error, we have to + set error manually */ + if (!net->last_errno) { + my_set_error(mysql, CR_ERR_MISSING_ERROR_INFO, SQLSTATE_UNKNOWN, 0); + } } else { @@ -402,7 +407,7 @@ /* CONC-589: If reconnect option was specified, we have to check if the connection (socket) is still available */ - if (command != COM_QUIT && mysql->options.reconnect && ma_pvio_is_alive(mysql->net.pvio)) + if (command != COM_QUIT && mysql->options.reconnect && !ma_pvio_is_alive(mysql->net.pvio)) { ma_pvio_close(mysql->net.pvio); mysql->net.pvio= NULL; diff -Nru mariadb-10.11.11/libmariadb/libmariadb/mariadb_stmt.c mariadb-10.11.13/libmariadb/libmariadb/mariadb_stmt.c --- mariadb-10.11.11/libmariadb/libmariadb/mariadb_stmt.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/libmariadb/mariadb_stmt.c 2025-05-19 16:14:27.000000000 +0000 @@ -425,6 +425,9 @@ stmt->bind[i].is_null= &stmt->bind[i].is_null_value; *stmt->bind[i].is_null= 1; stmt->bind[i].u.row_ptr= NULL; + if (!stmt->bind[i].length) + stmt->bind[i].length= &stmt->bind[i].length_value; + *stmt->bind[i].length= stmt->bind[i].length_value= 0; } } else { @@ -437,6 +440,9 @@ if (stmt->result_callback) 
stmt->result_callback(stmt->user_data, i, &row); else { + if (!stmt->bind[i].is_null) + stmt->bind[i].is_null= &stmt->bind[i].is_null_value; + *stmt->bind[i].is_null= 0; if (mysql_ps_fetch_functions[stmt->fields[i].type].pack_len >= 0) length= mysql_ps_fetch_functions[stmt->fields[i].type].pack_len; else diff -Nru mariadb-10.11.11/libmariadb/plugins/pvio/pvio_socket.c mariadb-10.11.13/libmariadb/plugins/pvio/pvio_socket.c --- mariadb-10.11.11/libmariadb/plugins/pvio/pvio_socket.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/plugins/pvio/pvio_socket.c 2025-05-19 16:14:27.000000000 +0000 @@ -1101,10 +1101,10 @@ res= poll(&poll_fd, 1, 0); if (res <= 0) /* timeout or error */ - return FALSE; + return TRUE; if (!(poll_fd.revents & (POLLIN | POLLPRI))) - return FALSE; - return TRUE; + return TRUE; + return FALSE; #else /* We can't use the WSAPoll function, it's broken :-( (see Windows 8 Bugs 309411 - WSAPoll does not report failed connections) @@ -1117,8 +1117,8 @@ res= select((int)csock->socket + 1, &sfds, NULL, NULL, &tv); if (res > 0 && FD_ISSET(csock->socket, &sfds)) - return TRUE; - return FALSE; + return FALSE; + return TRUE; #endif } /* }}} */ diff -Nru mariadb-10.11.11/libmariadb/unittest/libmariadb/connection.c mariadb-10.11.13/libmariadb/unittest/libmariadb/connection.c --- mariadb-10.11.11/libmariadb/unittest/libmariadb/connection.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/unittest/libmariadb/connection.c 2025-05-19 16:14:27.000000000 +0000 @@ -2339,6 +2339,7 @@ MYSQL *mysql; int i; const char *ciphers[3]= {"TLS_AES_128_GCM_SHA256", "TLS_AES_256_GCM_SHA384", "TLS_CHACHA20_POLY1305_SHA256"}; + my_bool verify= 0; SKIP_MAXSCALE; @@ -2348,6 +2349,7 @@ mysql= mysql_init(NULL); mysql_ssl_set(mysql, NULL, NULL, NULL, NULL, NULL); + mysql_optionsv(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, &verify); mysql_optionsv(mysql, MYSQL_OPT_SSL_CIPHER, ciphers[i]); if (!my_test_connect(mysql, hostname, username, @@ -2370,7 +2372,6 
@@ static int test_conc589(MYSQL *my) { MYSQL *mysql= mysql_init(NULL); - MYSQL_RES *result; int rc; my_bool reconnect= 1, verify= 0; unsigned long last_thread_id= 0; @@ -2391,15 +2392,85 @@ check_mysql_rc(rc, mysql); last_thread_id= mysql_thread_id(mysql); + rc= mysql_query(mysql, "SET @a:=1"); + check_mysql_rc(rc, mysql); + + sleep(10); + + rc= mysql_query(mysql, "SET @a:=2"); + check_mysql_rc(rc, mysql); + FAIL_IF(mysql_thread_id(mysql) == last_thread_id, "Expected new connection id"); + last_thread_id= mysql_thread_id(mysql); + + mysql_kill(my, last_thread_id); + + sleep(10); + + rc= mysql_query(mysql, "SET @a:=3"); + check_mysql_rc(rc, mysql); + FAIL_IF(mysql_thread_id(mysql) == last_thread_id, "Expected new connection id"); + mysql_close(mysql); + return OK; +} + +#ifdef WIN32 +static int test_conc760(MYSQL *my) +{ + MYSQL *mysql= mysql_init(NULL); + MYSQL_RES *result; + MYSQL_ROW row; + int rc; + char named_pipe_name[128]; + my_bool reconnect= 1, verify= 0; + unsigned long last_thread_id= 0; + unsigned int protocol= MYSQL_PROTOCOL_PIPE; + my_bool have_named_pipe= 0; + + SKIP_MAXSCALE; + + rc= mysql_query(my, "select @@named_pipe, @@socket"); + check_mysql_rc(rc, mysql); + + if ((result= mysql_store_result(my))) + { + if((row= mysql_fetch_row(result))) + have_named_pipe= atoi(row[0]); + strncpy(named_pipe_name, row[1], sizeof(named_pipe_name)-1); + named_pipe_name[sizeof(named_pipe_name)-1]= '\0'; + mysql_free_result(result); + } + + if (!have_named_pipe) + { + diag("Server doesn't support named pipes"); + return SKIP; + } + + mysql_options(mysql, MYSQL_OPT_RECONNECT, &reconnect); + mysql_options(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, &verify); + mysql_options(mysql, MYSQL_OPT_PROTOCOL, &protocol); + + if (!my_test_connect(mysql, hostname, username, + password, schema, port, named_pipe_name, CLIENT_REMEMBER_OPTIONS)) + { + diag("error: %s", mysql_error(mysql)); + return FAIL; + } + + rc= mysql_query(mysql, "SET SESSION wait_timeout=5"); + check_mysql_rc(rc, 
mysql); + + last_thread_id= mysql_thread_id(mysql); if ((rc= mysql_query(mysql, "SELECT 1")) || (result= mysql_store_result(mysql)) == NULL) check_mysql_rc(rc, mysql); mysql_free_result(result); sleep(10); - if ((rc= mysql_query(mysql, "SELECT 2")) || (result= mysql_store_result(mysql)) == NULL) - check_mysql_rc(rc, mysql); - mysql_free_result(result); + rc= mysql_query(mysql, "SELECT 2"); + check_mysql_rc(rc, mysql); + if (result= mysql_store_result(mysql)) + mysql_free_result(result); FAIL_IF(mysql_thread_id(mysql) == last_thread_id, "Expected new connection id"); last_thread_id= mysql_thread_id(mysql); @@ -2414,8 +2485,12 @@ mysql_close(mysql); return OK; } +#endif struct my_tests_st my_tests[] = { +#ifdef WIN32 + {"test_conc760", test_conc760, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, +#endif {"test_conc589", test_conc589, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, #ifdef HAVE_test_conc748 {"test_conc748", test_conc748, TEST_CONNECTION_NONE, 0, NULL, NULL}, diff -Nru mariadb-10.11.11/libmariadb/unittest/libmariadb/errors.c mariadb-10.11.13/libmariadb/unittest/libmariadb/errors.c --- mariadb-10.11.11/libmariadb/unittest/libmariadb/errors.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/unittest/libmariadb/errors.c 2025-05-19 16:14:27.000000000 +0000 @@ -272,8 +272,82 @@ return OK; } +#define TEST_ARRAY_SIZE 1024 + +static int test_mdev35935(MYSQL *mysql) +{ + MYSQL_STMT *stmt= mysql_stmt_init(mysql); + const char *stmt_str= "INSERT INTO bulk1 (a,b) VALUES (?,?)"; + unsigned int array_size= TEST_ARRAY_SIZE; + int rc; + unsigned int i; + char **buffer; + unsigned long *lengths; + unsigned int *vals; + MYSQL_BIND bind[2]; + const char *data= "test"; + + SKIP_MAXSCALE; + SKIP_MYSQL(mysql); + + rc= mysql_select_db(mysql, schema); + + rc= mysql_query(mysql, "DROP TABLE IF EXISTS bulk1"); + check_mysql_rc(rc, mysql); + + rc= mysql_query(mysql, "CREATE TABLE bulk1 (a int , b VARCHAR(255))"); + check_mysql_rc(rc, mysql); + + rc= mysql_stmt_prepare(stmt, 
SL(stmt_str)); + check_stmt_rc(rc, stmt); + + rc= mysql_query(mysql, "ALTER TABLE bulk1 ADD c int"); + check_mysql_rc(rc, mysql); + + /* allocate memory */ + buffer= calloc(TEST_ARRAY_SIZE, sizeof(char *)); + lengths= calloc(TEST_ARRAY_SIZE, sizeof *lengths); + vals= calloc(TEST_ARRAY_SIZE, sizeof *vals); + + for (i=0; i < TEST_ARRAY_SIZE; i++) + { + buffer[i]= (void *)data; + lengths[i]= -1; + vals[i]= i; + } + + memset(bind, 0, sizeof(MYSQL_BIND) * 2); + bind[0].buffer_type= MYSQL_TYPE_LONG; + bind[0].buffer= vals; + bind[1].buffer_type= MYSQL_TYPE_STRING; + bind[1].buffer= (void *)buffer; + bind[1].length= (unsigned long *)lengths; + + rc= mysql_stmt_attr_set(stmt, STMT_ATTR_ARRAY_SIZE, &array_size); + check_stmt_rc(rc, stmt); + + rc= mysql_stmt_bind_param(stmt, bind); + check_stmt_rc(rc, stmt); + + if ((rc= mysql_stmt_execute(stmt))) + { + FAIL_IF((!mysql_stmt_errno(stmt) || !mysql_errno(mysql)), "Error number > 0 expected"); + } + + mysql_stmt_close(stmt); + rc= mysql_query(mysql, "DROP TABLE IF EXISTS bulk1"); + check_mysql_rc(rc, mysql); + + free(buffer); + free(lengths); + free(vals); + return OK; +} + + struct my_tests_st my_tests[] = { + {"test_mdev35935", test_mdev35935, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, {"test_client_warnings", test_client_warnings, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, {"test_ps_client_warnings", test_ps_client_warnings, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, {"test_server_warnings", test_server_warnings, TEST_CONNECTION_DEFAULT, 0, NULL , NULL}, diff -Nru mariadb-10.11.11/libmariadb/unittest/libmariadb/ps_bugs.c mariadb-10.11.13/libmariadb/unittest/libmariadb/ps_bugs.c --- mariadb-10.11.11/libmariadb/unittest/libmariadb/ps_bugs.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/libmariadb/unittest/libmariadb/ps_bugs.c 2025-05-19 16:14:27.000000000 +0000 @@ -5001,7 +5001,7 @@ for (i=0; i < 10; i++, frac=frac*10+i) { - unsigned long expected= 0; + unsigned int expected= frac; sprintf(query, "SELECT '2018-11-05 
22:25:59.%ld'", frac); diag("%d: %s", i, query); @@ -5027,11 +5027,15 @@ diag("second_part: %ld", tm.second_part); - expected= i > 6 ? 123456 : frac * (unsigned int)powl(10, (6 - i)); + while (expected && expected < 100000) + expected *= 10; + while (expected >= 1000000) + expected /= 10; if (tm.second_part != expected) { - diag("Error: tm.second_part=%ld expected=%ld", tm.second_part, expected); + diag("Error: tm.second_part=%ld expected=%d", tm.second_part, expected); + mysql_stmt_close(stmt); return FAIL; } } @@ -5618,6 +5622,7 @@ rc= mysql_stmt_attr_set(stmt, STMT_ATTR_CB_PARAM, conc623_param_callback); check_stmt_rc(rc, stmt); + memset(&bind, 0, sizeof(MYSQL_BIND)); bind.buffer_type= MYSQL_TYPE_LONG; rc= mysql_stmt_bind_param(stmt, &bind); check_stmt_rc(rc, stmt); @@ -5910,9 +5915,50 @@ return OK; } +static int test_conc762(MYSQL *mysql) +{ + int rc; + MYSQL_STMT *stmt= mysql_stmt_init(mysql); + MYSQL_BIND bind[2]; + my_bool is_null[2]= {1,1}; + unsigned long length[2]= {1,1}; + + rc= mysql_stmt_prepare(stmt, SL("SELECT NULL, 'foo'")); + check_stmt_rc(rc, stmt); + + memset(&bind, 0, sizeof(MYSQL_BIND) * 2); + + bind[0].buffer_type = MYSQL_TYPE_STRING; + bind[1].buffer_type = MYSQL_TYPE_STRING; + bind[0].is_null= &is_null[0]; + bind[1].is_null= &is_null[1]; + bind[0].buffer_length= bind[1].buffer_length= 0; + bind[0].length= &length[0]; + bind[1].length= &length[1]; + + rc= mysql_stmt_execute(stmt); + check_stmt_rc(rc, stmt); + + rc= mysql_stmt_bind_result(stmt, bind); + + mysql_stmt_fetch(stmt); + FAIL_IF(is_null[0]==0, "Expected NULL value"); + FAIL_IF(is_null[1]==1, "Expected non NULL value"); + FAIL_IF(length[0]!=0, "Expected length=0"); + FAIL_IF(length[1]!=3, "Expected length=3"); + +// FAIL_IF(length[0] != 0, "Expected length=0"); + +//FAIL_IF(length[1] != 3, "Expected length=3)"; + + mysql_stmt_close(stmt); + return OK; +} + struct my_tests_st my_tests[] = { {"test_conc702", test_conc702, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, + {"test_conc762", 
test_conc762, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {"test_conc176", test_conc176, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {"test_conc739", test_conc739, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, {"test_conc633", test_conc633, TEST_CONNECTION_DEFAULT, 0, NULL, NULL}, diff -Nru mariadb-10.11.11/mysql-test/CMakeLists.txt mariadb-10.11.13/mysql-test/CMakeLists.txt --- mariadb-10.11.11/mysql-test/CMakeLists.txt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/CMakeLists.txt 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA -INSTALL_MYSQL_TEST("." ".") +INSTALL_MYSQL_TEST("." "") IF(NOT ${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) # Enable running mtr from build directory diff -Nru mariadb-10.11.11/mysql-test/include/long_test.inc mariadb-10.11.13/mysql-test/include/long_test.inc --- mariadb-10.11.11/mysql-test/include/long_test.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/include/long_test.inc 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ # We use this --source include to mark a test as taking long to run. # We can use this to schedule such test early (to not be left with -# only one or two long tests running, and rests of works idle), or to +# only one or two long tests running, and rests of workers idle), or to # run a quick test skipping long-running test cases. 
--source include/no_valgrind_without_big.inc diff -Nru mariadb-10.11.11/mysql-test/lib/My/SafeProcess/safe_process.cc mariadb-10.11.13/mysql-test/lib/My/SafeProcess/safe_process.cc --- mariadb-10.11.11/mysql-test/lib/My/SafeProcess/safe_process.cc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/lib/My/SafeProcess/safe_process.cc 2025-05-19 16:14:24.000000000 +0000 @@ -220,6 +220,7 @@ pid_t own_pid= getpid(); pid_t parent_pid= getppid(); bool nocore = false; + int open_files_limit = 1024; struct sigaction sa,sa_abort; sa.sa_handler= handle_signal; @@ -268,7 +269,14 @@ } else if ( strncmp (arg, "--env ", 6) == 0 ) { - putenv(strdup(arg+6)); + putenv(strdup(arg+6)); + } + else if ( strncmp(arg, "--open-files-limit=", 19) == 0 ) + { + const char* start = arg + 19; + open_files_limit = atoi(start); + if (open_files_limit <= 0) + die("Invalid value '%s' passed to --open-files-limit", start); } else die("Unknown option: %s", arg); @@ -318,11 +326,8 @@ if (nocore) setlimit(RLIMIT_CORE, 0, 0); - /* - mysqld defaults depend on that. 
make test results stable and independent - from the environment - */ - setlimit(RLIMIT_NOFILE, 1024, 1024); + // Set open files limit + setlimit(RLIMIT_NOFILE, open_files_limit, open_files_limit); // Signal that child is ready buf= 37; diff -Nru mariadb-10.11.11/mysql-test/lib/My/SafeProcess.pm mariadb-10.11.13/mysql-test/lib/My/SafeProcess.pm --- mariadb-10.11.11/mysql-test/lib/My/SafeProcess.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/lib/My/SafeProcess.pm 2025-05-19 16:14:24.000000000 +0000 @@ -138,6 +138,7 @@ my $error = delete($opts{'error'}); my $verbose = delete($opts{'verbose'}) || $::opt_verbose; my $nocore = delete($opts{'nocore'}); + my $open_files_limit = delete($opts{'open_files_limit'}); my $host = delete($opts{'host'}); my $shutdown = delete($opts{'shutdown'}); my $user_data= delete($opts{'user_data'}); @@ -161,6 +162,8 @@ push(@safe_args, "--verbose") if $verbose > 0; push(@safe_args, "--nocore") if $nocore; + push(@safe_args, "--open-files-limit=$open_files_limit") if $open_files_limit; + # Point the safe_process at the right parent if running on cygwin push(@safe_args, "--parent-pid=".Cygwin::pid_to_winpid($$)) if IS_CYGWIN; diff -Nru mariadb-10.11.11/mysql-test/main/backup_locks.test mariadb-10.11.13/mysql-test/main/backup_locks.test --- mariadb-10.11.11/mysql-test/main/backup_locks.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/backup_locks.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,7 @@ # Tests BACKUP STAGE locking ######################################################################## +--source include/long_test.inc --source include/have_innodb.inc --source include/have_metadata_lock_info.inc --source include/not_embedded.inc diff -Nru mariadb-10.11.11/mysql-test/main/comment_database.result mariadb-10.11.13/mysql-test/main/comment_database.result --- mariadb-10.11.11/mysql-test/main/comment_database.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/comment_database.result 2025-05-19 16:14:24.000000000 +0000 @@ -76,3 +76,16 @@ CATALOG_NAME SCHEMA_NAME DEFAULT_CHARACTER_SET_NAME DEFAULT_COLLATION_NAME SQL_PATH SCHEMA_COMMENT def comment latin2 latin2_general_ci NULL comment DROP DATABASE comment; +CREATE DATABASE db1; +# restart +SHOW CREATE DATABASE db1; +Database Create Database +db1 CREATE DATABASE `db1` /*!40100 DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci */ +Warnings: +Note 1105 Database 'db1' does not have a db.opt file. You can create one with ALTER DATABASE if needed +SHOW CREATE DATABASE db1; +Database Create Database +db1 CREATE DATABASE `db1` /*!40100 DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci */ +Warnings: +Note 1105 Database 'db1' does not have a db.opt file. You can create one with ALTER DATABASE if needed +DROP DATABASE db1; diff -Nru mariadb-10.11.11/mysql-test/main/comment_database.test mariadb-10.11.13/mysql-test/main/comment_database.test --- mariadb-10.11.11/mysql-test/main/comment_database.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/comment_database.test 2025-05-19 16:14:24.000000000 +0000 @@ -63,3 +63,11 @@ WHERE schema_name='comment'; DROP DATABASE comment; --enable_service_connection + +CREATE DATABASE db1; +--remove_file $MARIADB_DATADIR/db1/db.opt +--source include/restart_mysqld.inc +# We need to call this two times to ensure all code paths are used +SHOW CREATE DATABASE db1; +SHOW CREATE DATABASE db1; +DROP DATABASE db1; diff -Nru mariadb-10.11.11/mysql-test/main/ctype_utf8_def_upgrade.result mariadb-10.11.13/mysql-test/main/ctype_utf8_def_upgrade.result --- mariadb-10.11.11/mysql-test/main/ctype_utf8_def_upgrade.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/ctype_utf8_def_upgrade.result 2025-05-19 16:14:24.000000000 +0000 @@ -53,6 +53,8 @@ SHOW CREATE DATABASE db1; Database Create Database db1 CREATE DATABASE `db1` /*!40100 DEFAULT CHARACTER SET utf8mb3 COLLATE 
utf8mb3_general_ci */ +Warnings: +Note 1105 Database 'db1' does not have a db.opt file. You can create one with ALTER DATABASE if needed USE db1; SELECT @@character_set_database, 'taken from defaults' AS comment; @@character_set_database comment diff -Nru mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.result mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.result --- mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.result 2025-05-19 16:14:24.000000000 +0000 @@ -11761,9 +11761,8 @@ EXPLAIN INSERT INTO t1 SELECT * FROM ( SELECT t1.f FROM v1 JOIN t1 ) AS t WHERE f IS NOT NULL; id select_type table type possible_keys key key_len ref rows Extra -1 PRIMARY ALL NULL NULL NULL NULL 144 Using where -2 DERIVED ALL NULL NULL NULL NULL 12 -2 DERIVED t1 ALL NULL NULL NULL NULL 12 Using where; Using join buffer (flat, BNL join) +1 PRIMARY ALL NULL NULL NULL NULL 12 Using temporary +1 PRIMARY t1 ALL NULL NULL NULL NULL 12 Using where; Using join buffer (flat, BNL join) 4 DERIVED t1 ALL NULL NULL NULL NULL 12 EXPLAIN FORMAT=JSON INSERT INTO t1 SELECT * FROM ( SELECT t1.f FROM v1 JOIN t1 ) AS t WHERE f IS NOT NULL; @@ -11771,61 +11770,47 @@ { "query_block": { "select_id": 1, - "nested_loop": [ - { - "table": { - "table_name": "", - "access_type": "ALL", - "rows": 144, - "filtered": 100, - "attached_condition": "t.f is not null", - "materialized": { - "query_block": { - "select_id": 2, - "nested_loop": [ - { - "table": { - "table_name": "", - "access_type": "ALL", - "rows": 12, - "filtered": 100, - "materialized": { - "query_block": { - "select_id": 4, - "nested_loop": [ - { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 12, - "filtered": 100 - } - } - ] - } - } - } - }, - { - "block-nl-join": { + "temporary_table": { + "nested_loop": [ + { + "table": { + "table_name": "", + "access_type": "ALL", + "rows": 12, + "filtered": 100, + 
"materialized": { + "query_block": { + "select_id": 4, + "nested_loop": [ + { "table": { "table_name": "t1", "access_type": "ALL", "rows": 12, - "filtered": 100, - "attached_condition": "t1.f is not null" - }, - "buffer_type": "flat", - "buffer_size": "64", - "join_type": "BNL" + "filtered": 100 + } } - } - ] + ] + } } } + }, + { + "block-nl-join": { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 12, + "filtered": 100, + "attached_condition": "t1.f is not null" + }, + "buffer_type": "flat", + "buffer_size": "64", + "join_type": "BNL" + } } - } - ] + ] + } } } SELECT * FROM t1; @@ -11854,62 +11839,48 @@ { "query_block": { "select_id": 1, - "nested_loop": [ - { - "table": { - "table_name": "", - "access_type": "ALL", - "rows": 16, - "filtered": 100, - "attached_condition": "t.f is not null", - "materialized": { - "query_block": { - "select_id": 2, - "nested_loop": [ - { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 8, - "filtered": 100, - "attached_condition": "t1.f is not null" - } - }, - { - "table": { - "table_name": "", - "access_type": "ref", - "possible_keys": ["key0"], - "key": "key0", - "key_length": "4", - "used_key_parts": ["f"], - "ref": ["test.t1.f"], - "rows": 2, - "filtered": 100, - "materialized": { - "query_block": { - "select_id": 4, - "nested_loop": [ - { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 8, - "filtered": 100, - "attached_condition": "t1.f is not null" - } - } - ] - } + "temporary_table": { + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 8, + "filtered": 100, + "attached_condition": "t1.f is not null" + } + }, + { + "table": { + "table_name": "", + "access_type": "ref", + "possible_keys": ["key0"], + "key": "key0", + "key_length": "4", + "used_key_parts": ["f"], + "ref": ["test.t1.f"], + "rows": 2, + "filtered": 100, + "materialized": { + "query_block": { + "select_id": 4, + "nested_loop": [ + { + "table": { + "table_name": 
"t1", + "access_type": "ALL", + "rows": 8, + "filtered": 100, + "attached_condition": "t1.f is not null" } } - } - ] + ] + } } } } - } - ] + ] + } } } SELECT * FROM t1; @@ -21669,6 +21640,27 @@ GROUP BY 1 ; ( SELECT 1 FROM ( SELECT 1 FROM cte1) dt GROUP BY x HAVING x= 1 ) 1 +create table t1 (f int); +create view v1 as select f, count(*) c from t1 group by f; +# +# MDEV-25012 Server crash in find_field_in_tables, Assertion `name' failed in find_field_in_table_ref +# +select * from v1 where export_set(1, default(f), 'x', aes_decrypt('secret', f)); +f c +show warnings; +Level Code Message +drop view v1; +drop table t1; +create table t(c3 longtext) ; +with cte1 as +( +select default(c3) as a +from t group by 1 +) +select * from cte1 +where cte1.a >= 1; +a +drop table t; # End of 10.5 tests # # MDEV-28958: condition pushable into view after simplification diff -Nru mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.test mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.test --- mariadb-10.11.11/mysql-test/main/derived_cond_pushdown.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/derived_cond_pushdown.test 2025-05-19 16:14:24.000000000 +0000 @@ -4271,6 +4271,28 @@ FROM cte2 GROUP BY 1 ; +create table t1 (f int); +create view v1 as select f, count(*) c from t1 group by f; + +--echo # +--echo # MDEV-25012 Server crash in find_field_in_tables, Assertion `name' failed in find_field_in_table_ref +--echo # +select * from v1 where export_set(1, default(f), 'x', aes_decrypt('secret', f)); +show warnings; +# cleanup +drop view v1; +drop table t1; + +create table t(c3 longtext) ; +with cte1 as +( + select default(c3) as a + from t group by 1 +) +select * from cte1 +where cte1.a >= 1; +drop table t; + --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/derived_view.result mariadb-10.11.13/mysql-test/main/derived_view.result --- mariadb-10.11.11/mysql-test/main/derived_view.result 2025-01-30 11:01:23.000000000 +0000 
+++ mariadb-10.11.13/mysql-test/main/derived_view.result 2025-05-19 16:14:24.000000000 +0000 @@ -2461,6 +2461,8 @@ a 1 1 +1 +1 drop table t1,t2; set optimizer_switch=@save968720_optimizer_switch; # diff -Nru mariadb-10.11.11/mysql-test/main/func_json.result mariadb-10.11.13/mysql-test/main/func_json.result --- mariadb-10.11.11/mysql-test/main/func_json.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_json.result 2025-05-19 16:14:24.000000000 +0000 @@ -1766,6 +1766,43 @@ data # +# MDEV-35614 JSON_UNQUOTE doesn't work with emojis +# +SELECT HEX(JSON_UNQUOTE('"\\ud83d\\ude0a"')) as hex_smiley; +hex_smiley +F09F988A +set names utf8mb4; +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') as smiley; +smiley +😊 +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') = JSON_UNQUOTE('"\\ud83d\\ude0a"') as equal_smileys; +equal_smileys +1 +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') <= JSON_UNQUOTE('"\\ud83d\\ude0a"') as less_or_equal_smileys; +less_or_equal_smileys +1 +set @v='{ "color":"😊" }'; +select @v as v, collation(@v) as collation_v; +v collation_v +{ "color":"😊" } utf8mb4_general_ci +select json_valid(@v) as valid; +valid +1 +select json_extract(@v,'$.color') as color_extraction, collation(json_extract(@v,'$.color')) as color_extraction_collation; +color_extraction color_extraction_collation +"😊" utf8mb4_general_ci +select json_unquote(json_extract(@v,'$.color')) as unquoted, collation(json_unquote(json_extract(@v,'$.color'))) as unquoted_collation; +unquoted unquoted_collation +😊 utf8mb4_bin +SELECT JSON_UNQUOTE('"\\uc080\\ude0a"') as invalid_utf8mb4; +invalid_utf8mb4 +"\uc080\ude0a" +Warnings: +Warning 4035 Broken JSON string in argument 1 to function 'json_unquote' at position 13 +show warnings; +Level Code Message +Warning 4035 Broken JSON string in argument 1 to function 'json_unquote' at position 13 +# # End of 10.6 tests # # diff -Nru mariadb-10.11.11/mysql-test/main/func_json.test mariadb-10.11.13/mysql-test/main/func_json.test --- 
mariadb-10.11.11/mysql-test/main/func_json.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_json.test 2025-05-19 16:14:24.000000000 +0000 @@ -1194,6 +1194,7 @@ SET @@collation_connection= @save_collation_connection; + --echo # --echo # End of 10.5 tests --echo # @@ -1231,6 +1232,27 @@ data FROM JSON_TABLE (@data, '$[*]' COLUMNS (data text PATH '$.Data')) AS t; + +--echo # +--echo # MDEV-35614 JSON_UNQUOTE doesn't work with emojis +--echo # + +SELECT HEX(JSON_UNQUOTE('"\\ud83d\\ude0a"')) as hex_smiley; +set names utf8mb4; +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') as smiley; + +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') = JSON_UNQUOTE('"\\ud83d\\ude0a"') as equal_smileys; +SELECT JSON_UNQUOTE('"\\ud83d\\ude0a"') <= JSON_UNQUOTE('"\\ud83d\\ude0a"') as less_or_equal_smileys; + +set @v='{ "color":"😊" }'; +select @v as v, collation(@v) as collation_v; +select json_valid(@v) as valid; +select json_extract(@v,'$.color') as color_extraction, collation(json_extract(@v,'$.color')) as color_extraction_collation; +select json_unquote(json_extract(@v,'$.color')) as unquoted, collation(json_unquote(json_extract(@v,'$.color'))) as unquoted_collation; + +SELECT JSON_UNQUOTE('"\\uc080\\ude0a"') as invalid_utf8mb4; +show warnings; + --echo # --echo # End of 10.6 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/func_like.result mariadb-10.11.13/mysql-test/main/func_like.result --- mariadb-10.11.11/mysql-test/main/func_like.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_like.result 2025-05-19 16:14:24.000000000 +0000 @@ -424,3 +424,22 @@ Note 1003 select 1 like `test`.`t1`.`c1` | `test`.`t1`.`c2` AS `1 LIKE c1|c2`,1 like `test`.`t1`.`c1` & `test`.`t1`.`c2` AS `1 LIKE c1&c2`,1 like `test`.`t1`.`c2` >> `test`.`t1`.`c1` AS `1 LIKE c2>>c1`,2 like `test`.`t1`.`c2` << `test`.`t1`.`c1` AS `2 LIKE c2< 0 AS `1 LIKE c1||c2`,2 like `test`.`t1`.`c1` + `test`.`t1`.`c2` AS `2 LIKE c1+c2`,-1 like `test`.`t1`.`c1` - 
`test`.`t1`.`c2` AS `-1 LIKE c1-c2`,2 like `test`.`t1`.`c1` * `test`.`t1`.`c2` AS `2 LIKE c1*c2`,0.5000 like `test`.`t1`.`c1` / `test`.`t1`.`c2` AS `0.5000 LIKE c1/c2`,0 like `test`.`t1`.`c1` DIV `test`.`t1`.`c2` AS `0 LIKE c1 DIV c2`,0 like `test`.`t1`.`c1` MOD `test`.`t1`.`c2` AS `0 LIKE c1 MOD c2` from `test`.`t1` order by `test`.`t1`.`c2` DROP VIEW v1; DROP TABLE t1; +# +# MDEV-36211 Incorrect query result for binary_column NOT LIKE binary_column +# +CREATE TABLE t1 (c1 BLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +c1 +SELECT c1 FROM t1 WHERE c1 LIKE c1; +c1 +1 +DROP TABLE t1; +CREATE TABLE t1 (c1 BLOB); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +c1 +SELECT c1 FROM t1 WHERE c1 LIKE c1; +c1 +1 +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/func_like.test mariadb-10.11.13/mysql-test/main/func_like.test --- mariadb-10.11.11/mysql-test/main/func_like.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_like.test 2025-05-19 16:14:24.000000000 +0000 @@ -291,3 +291,18 @@ EXPLAIN EXTENDED SELECT * FROM v1; DROP VIEW v1; DROP TABLE t1; + +--echo # +--echo # MDEV-36211 Incorrect query result for binary_column NOT LIKE binary_column +--echo # +CREATE TABLE t1 (c1 BLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +SELECT c1 FROM t1 WHERE c1 LIKE c1; +DROP TABLE t1; + +CREATE TABLE t1 (c1 BLOB); +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE c1 NOT LIKE c1; +SELECT c1 FROM t1 WHERE c1 LIKE c1; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/func_regexp_pcre.result mariadb-10.11.13/mysql-test/main/func_regexp_pcre.result --- mariadb-10.11.11/mysql-test/main/func_regexp_pcre.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_regexp_pcre.result 2025-05-19 16:14:24.000000000 +0000 @@ -60,7 +60,7 @@ INSERT INTO t2 VALUES 
('\\p{Cyrillic}'),('\\p{Greek}'),('\\p{Latin}'); INSERT INTO t2 VALUES ('\\p{Han}'),('\\p{Hangul}'); INSERT INTO t2 VALUES ('\\p{Sinhala}'), ('\\p{Tamil}'); -INSERT INTO t2 VALUES ('\\p{L}'),('\\p{Ll}'),('\\p{Lu}'),('\\p{L&}'); +INSERT INTO t2 VALUES ('\\p{L}'), /* buggy before v10.45 ('\\p{Ll}'),('\\p{Lu}'),*/ ('\\p{L&}'); INSERT INTO t2 VALUES ('[[:alpha:]]'),('[[:digit:]]'); SELECT class, ch, ch RLIKE class FROM t1, t2 ORDER BY class, BINARY ch; class ch ch RLIKE class @@ -168,32 +168,6 @@ \p{Latin} à¶´ 0 \p{Latin} ã— 0 \p{Latin} ê°· 0 -\p{Ll} 1 0 -\p{Ll} A 0 -\p{Ll} a 1 -\p{Ll} À 0 -\p{Ll} à 1 -\p{Ll} Σ 0 -\p{Ll} σ 1 -\p{Ll} Я 0 -\p{Ll} Ñ 1 -\p{Ll} ௨ 0 -\p{Ll} à¶´ 0 -\p{Ll} ã— 0 -\p{Ll} ê°· 0 -\p{Lu} 1 0 -\p{Lu} A 1 -\p{Lu} a 0 -\p{Lu} À 1 -\p{Lu} à 0 -\p{Lu} Σ 1 -\p{Lu} σ 0 -\p{Lu} Я 1 -\p{Lu} Ñ 0 -\p{Lu} ௨ 0 -\p{Lu} à¶´ 0 -\p{Lu} ã— 0 -\p{Lu} ê°· 0 \p{L} 1 0 \p{L} A 1 \p{L} a 1 diff -Nru mariadb-10.11.11/mysql-test/main/func_regexp_pcre.test mariadb-10.11.13/mysql-test/main/func_regexp_pcre.test --- mariadb-10.11.11/mysql-test/main/func_regexp_pcre.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/func_regexp_pcre.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,7 +41,7 @@ INSERT INTO t2 VALUES ('\\p{Cyrillic}'),('\\p{Greek}'),('\\p{Latin}'); INSERT INTO t2 VALUES ('\\p{Han}'),('\\p{Hangul}'); INSERT INTO t2 VALUES ('\\p{Sinhala}'), ('\\p{Tamil}'); -INSERT INTO t2 VALUES ('\\p{L}'),('\\p{Ll}'),('\\p{Lu}'),('\\p{L&}'); +INSERT INTO t2 VALUES ('\\p{L}'), /* buggy before v10.45 ('\\p{Ll}'),('\\p{Lu}'),*/ ('\\p{L&}'); INSERT INTO t2 VALUES ('[[:alpha:]]'),('[[:digit:]]'); SELECT class, ch, ch RLIKE class FROM t1, t2 ORDER BY class, BINARY ch; DROP TABLE t1, t2; diff -Nru mariadb-10.11.11/mysql-test/main/gis-precise.result mariadb-10.11.13/mysql-test/main/gis-precise.result --- mariadb-10.11.11/mysql-test/main/gis-precise.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis-precise.result 2025-05-19 
16:14:24.000000000 +0000 @@ -776,7 +776,7 @@ ST_DISTANCE_SPHERE(1, 1, NULL) NULL SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(1 0)'), ST_GEOMFROMTEXT('LINESTRING(0 0, 1 1)')) as result; -ERROR HY000: Internal error: st_distance_sphere +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. # Test Points and radius SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)')) as result; result @@ -788,9 +788,9 @@ result 0.024682056391766436 SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), 0) as result; -ERROR HY000: Internal error: Radius must be greater than zero. +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), -1) as result; -ERROR HY000: Internal error: Radius must be greater than zero. +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. # Test longitude/lattitude SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 1)'), ST_GEOMFROMTEXT('POINT(1 2)')), 10) as result; result @@ -843,7 +843,7 @@ result 0.04933028646581131 SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )'),0) as result; -ERROR HY000: Internal error: Radius must be greater than zero. +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. 
set @pt1 = ST_GeomFromText('POINT(190 -30)'); set @pt2 = ST_GeomFromText('POINT(-30 50)'); SELECT ST_Distance_Sphere(@pt1, @pt2); diff -Nru mariadb-10.11.11/mysql-test/main/gis-precise.test mariadb-10.11.13/mysql-test/main/gis-precise.test --- mariadb-10.11.11/mysql-test/main/gis-precise.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis-precise.test 2025-05-19 16:14:24.000000000 +0000 @@ -422,7 +422,7 @@ # Return NULL if radius is NULL SELECT ST_DISTANCE_SPHERE(1, 1, NULL); # Wrong geometry ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(1 0)'), ST_GEOMFROMTEXT('LINESTRING(0 0, 1 1)')) as result; --echo # Test Points and radius @@ -430,9 +430,9 @@ # make bb x86 happy SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(-1 -1)'), ST_GEOMFROMTEXT('POINT(-2 -2)')), 10) as result; SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), 1) as result; ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), 0) as result; ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('POINT(0 0)'), ST_GEOMFROMTEXT('POINT(1 1)'), -1) as result; --echo # Test longitude/lattitude # make bb x86 happy @@ -456,7 +456,7 @@ SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )')), 10) as result; # make bb x86 happy SELECT TRUNCATE(ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )'),1), 17) as result; ---error ER_INTERNAL_ERROR +--error ER_GIS_UNSUPPORTED_ARGUMENT SELECT ST_DISTANCE_SPHERE(ST_GEOMFROMTEXT('MULTIPOINT(1 2,1 1 )'), ST_GEOMFROMTEXT('MULTIPOINT(8 9,3 4 )'),0) as result; # Longitude out of range [-180,180] diff -Nru mariadb-10.11.11/mysql-test/main/gis.result 
mariadb-10.11.13/mysql-test/main/gis.result --- mariadb-10.11.11/mysql-test/main/gis.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis.result 2025-05-19 16:14:24.000000000 +0000 @@ -5474,4 +5474,36 @@ SELECT NTH_VALUE(a,b) OVER () FROM t; ERROR HY000: Illegal parameter data types point and bigint for operation '-' DROP TABLE t; +# +# MDEV-32619 Settng SRID on geometry with ST_*FromWKKB(g, srid) +# +SELECT +ST_SRID(g1), +ST_SRID(ST_GeomFromWKB(g1, 4326)), +ST_SRID(ST_GeomFromWKB(g1)), +ST_AsText(g1), +ST_SRID(ST_PointFromWKB(g2, 4326)), +ST_SRID(g2), +ST_SRID(ST_LineStringFromWKB(g3, 3)), +ST_SRID(ST_PolygonFromWKB(g4, 4)), +ST_SRID(ST_MultiPointFromWKB(g5, 5)), +ST_SRID(ST_MultiLineStringFromWKB(g6, 6)), +ST_SRID(ST_MultiPolygonFromWKB(g7, 7)) +FROM ( +SELECT +POINT(1, 2) AS g1, +POINT(4, 3) AS g2, +LINESTRING(POINT(4, 3), POINT(4, 4)) AS g3, +POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3))) AS g4, +MULTIPOINT(POINT(4, 3)) AS g5, +MULTILINESTRING(LINESTRING(POINT(4, 3), POINT(4, 4))) AS g6, +MULTIPOLYGON(POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3)))) AS g7 +) AS t; +ST_SRID(g1) ST_SRID(ST_GeomFromWKB(g1, 4326)) ST_SRID(ST_GeomFromWKB(g1)) ST_AsText(g1) ST_SRID(ST_PointFromWKB(g2, 4326)) ST_SRID(g2) ST_SRID(ST_LineStringFromWKB(g3, 3)) ST_SRID(ST_PolygonFromWKB(g4, 4)) ST_SRID(ST_MultiPointFromWKB(g5, 5)) ST_SRID(ST_MultiLineStringFromWKB(g6, 6)) ST_SRID(ST_MultiPolygonFromWKB(g7, 7)) +0 4326 0 POINT(1 2) 4326 0 3 4 5 6 7 +# +# MDEV-35117 Error message "ERROR 1815 (HY000): Internal error: st_distance_sphere' could be improved +# +SELECT ST_DISTANCE_SPHERE(st_geomfromtext('linestring( 2 2, 2 8) '), ST_GeomFromText('POINT(18.413076 43.856258)')) ; +ERROR HY000: Calling geometry function st_distance_sphere with unsupported types of arguments. 
# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/gis.test mariadb-10.11.13/mysql-test/main/gis.test --- mariadb-10.11.11/mysql-test/main/gis.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/gis.test 2025-05-19 16:14:24.000000000 +0000 @@ -3482,4 +3482,36 @@ SELECT NTH_VALUE(a,b) OVER () FROM t; DROP TABLE t; +--echo # +--echo # MDEV-32619 Settng SRID on geometry with ST_*FromWKKB(g, srid) +--echo # +SELECT + ST_SRID(g1), + ST_SRID(ST_GeomFromWKB(g1, 4326)), + ST_SRID(ST_GeomFromWKB(g1)), + ST_AsText(g1), + ST_SRID(ST_PointFromWKB(g2, 4326)), + ST_SRID(g2), + ST_SRID(ST_LineStringFromWKB(g3, 3)), + ST_SRID(ST_PolygonFromWKB(g4, 4)), + ST_SRID(ST_MultiPointFromWKB(g5, 5)), + ST_SRID(ST_MultiLineStringFromWKB(g6, 6)), + ST_SRID(ST_MultiPolygonFromWKB(g7, 7)) +FROM ( + SELECT + POINT(1, 2) AS g1, + POINT(4, 3) AS g2, + LINESTRING(POINT(4, 3), POINT(4, 4)) AS g3, + POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3))) AS g4, + MULTIPOINT(POINT(4, 3)) AS g5, + MULTILINESTRING(LINESTRING(POINT(4, 3), POINT(4, 4))) AS g6, + MULTIPOLYGON(POLYGON(LINESTRING(POINT(4, 3), POINT(4, 4), POINT(3, 4), POINT(4, 3)))) AS g7 +) AS t; + +--echo # +--echo # MDEV-35117 Error message "ERROR 1815 (HY000): Internal error: st_distance_sphere' could be improved +--echo # +--error ER_GIS_UNSUPPORTED_ARGUMENT +SELECT ST_DISTANCE_SPHERE(st_geomfromtext('linestring( 2 2, 2 8) '), ST_GeomFromText('POINT(18.413076 43.856258)')) ; + --echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/group_by.result mariadb-10.11.13/mysql-test/main/group_by.result --- mariadb-10.11.11/mysql-test/main/group_by.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_by.result 2025-05-19 16:14:24.000000000 +0000 @@ -2997,5 +2997,79 @@ ERROR 42S22: Reference 'c' not supported (forward reference in item list) DROP TABLE t1; # +# MDEV-35238: Wrong results from a tables with a single record and an aggregate +# 
+CREATE OR REPLACE TABLE t1 (a int) ENGINE=myisam; +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +1+0 min(1) +1 NULL +explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "table": { + "message": "Impossible WHERE noticed after reading const tables" + } + } +} +INSERT INTO t1 VALUES (NULL); +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +1+0 min(1) +1 NULL +explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "pseudo_bits_condition": "if(uuid_short(),NULL,1)", + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "system", + "rows": 1, + "filtered": 100 + } + } + ] + } +} +DROP TABLE t1; +CREATE TABLE t1 (a int PRIMARY KEY) ENGINE=myisam; +INSERT INTO t1 VALUES (1); +CREATE TABLE t2 (a int NOT NULL) ENGINE=myisam; +INSERT INTO t2 VALUES (10); +SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); +1+0 MIN(t1.a) +1 1 +explain format=json SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "pseudo_bits_condition": "10 = rand()", + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "system", + "rows": 1, + "filtered": 100 + } + }, + { + "table": { + "table_name": "t2", + "access_type": "system", + "rows": 1, + "filtered": 100 + } + } + ] + } +} +DROP TABLE t1,t2; +# # End of 10.5 tests # diff -Nru mariadb-10.11.11/mysql-test/main/group_by.test mariadb-10.11.13/mysql-test/main/group_by.test --- mariadb-10.11.11/mysql-test/main/group_by.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_by.test 2025-05-19 16:14:24.000000000 +0000 @@ -2153,5 +2153,27 @@ DROP TABLE t1; --echo # +--echo # MDEV-35238: Wrong results from a tables with a single record and an aggregate +--echo # +CREATE OR REPLACE TABLE t1 (a int) ENGINE=myisam; +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); 
+explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +INSERT INTO t1 VALUES (NULL); +SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +explain format=json SELECT 1+0, min(1) FROM t1 WHERE if(uuid_short(), a,1); +DROP TABLE t1; + +CREATE TABLE t1 (a int PRIMARY KEY) ENGINE=myisam; +INSERT INTO t1 VALUES (1); + +CREATE TABLE t2 (a int NOT NULL) ENGINE=myisam; +INSERT INTO t2 VALUES (10); + +SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); +explain format=json SELECT 1+0, MIN(t1.a) FROM t1,t2 WHERE t2.a = rand(); + +DROP TABLE t1,t2; + +--echo # --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/group_min_max.result mariadb-10.11.13/mysql-test/main/group_min_max.result --- mariadb-10.11.11/mysql-test/main/group_min_max.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_min_max.result 2025-05-19 16:14:24.000000000 +0000 @@ -4349,3 +4349,27 @@ # # End of 10.6 tests # +# +# MDEV-36118 Wrong result in loose index scan +# +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (1, 3), (1, 1); +SELECT MAX(b) FROM t1 WHERE (b > 2 AND b < 4) OR (b = 5) GROUP BY a; +MAX(b) +3 +drop table t1; +# +# MDEV-36220 ASAN unknown-crash in loose index scan of MIN with IS NULL +# +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (4, NULL), (1, 14), (4, 3); +SELECT MIN(b) FROM t1 WHERE b = 3 OR b IS NULL GROUP BY a; +MIN(b) +3 +SELECT MIN(b) FROM t1 WHERE b IS NULL GROUP BY a; +MIN(b) +NULL +drop table t1; +# +# End of 10.11 tests +# diff -Nru mariadb-10.11.11/mysql-test/main/group_min_max.test mariadb-10.11.13/mysql-test/main/group_min_max.test --- mariadb-10.11.11/mysql-test/main/group_min_max.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/group_min_max.test 2025-05-19 16:14:24.000000000 +0000 @@ -2007,3 +2007,39 @@ --echo # --echo # End of 10.6 tests --echo # + +--echo # +--echo # MDEV-36118 Wrong result in loose index scan 
+--echo # + +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (1, 3), (1, 1); +--source include/maybe_debug.inc +if ($have_debug) { + --disable_query_log + set @old_debug=@@debug; + set debug="+d,force_group_by"; + --enable_query_log +} +SELECT MAX(b) FROM t1 WHERE (b > 2 AND b < 4) OR (b = 5) GROUP BY a; +if ($have_debug) { + --disable_query_log + set debug=@old_debug; + --enable_query_log +} + +drop table t1; + +--echo # +--echo # MDEV-36220 ASAN unknown-crash in loose index scan of MIN with IS NULL +--echo # + +CREATE TABLE t1 (a int, b int, KEY (a, b)); +insert into t1 values (4, NULL), (1, 14), (4, 3); +SELECT MIN(b) FROM t1 WHERE b = 3 OR b IS NULL GROUP BY a; +SELECT MIN(b) FROM t1 WHERE b IS NULL GROUP BY a; +drop table t1; + +--echo # +--echo # End of 10.11 tests +--echo # diff -Nru mariadb-10.11.11/mysql-test/main/insert.result mariadb-10.11.13/mysql-test/main/insert.result --- mariadb-10.11.11/mysql-test/main/insert.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert.result 2025-05-19 16:14:24.000000000 +0000 @@ -806,5 +806,75 @@ 8 drop table t1; # -# End of 10.5 tests +# MDEV-32086 Server crash when inserting from derived table containing insert target table +# (part 2) +# +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); +select * from t1; +pk id +2 2 +3 3 +4 4 +select 101+count(*) +from +( +select dt2.id +from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id<1000; +101+count(*) +104 +prepare s from ' +insert into t1 values( + (select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000 + ), 123 +) +'; +execute s; +select * from t1; +pk id +2 2 +3 3 +4 4 +104 123 +select 101+count(*) +from +( +select dt2.id +from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id<1000; +101+count(*) +105 +execute s; +select * from t1; +pk id +2 2 +3 3 +4 4 +104 123 +105 123 +drop table 
t1; # +# Try this: INSERT INTO t1 VALUES ... reference to t1 +# RETURNING (subquery not touching t1) +create table t1 (a int, b int); +create table t2 (a int, b int); +# This is accepted: +insert into t1 (a) values +(3), +((select max(a) from t1)) +returning +a, b, (select max(a) from t2); +a b (select max(a) from t2) +3 NULL NULL +NULL NULL NULL +drop table t1,t2; +# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/insert.test mariadb-10.11.13/mysql-test/main/insert.test --- mariadb-10.11.11/mysql-test/main/insert.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert.test 2025-05-19 16:14:24.000000000 +0000 @@ -675,5 +675,59 @@ drop table t1; --echo # ---echo # End of 10.5 tests +--echo # MDEV-32086 Server crash when inserting from derived table containing insert target table +--echo # (part 2) +--echo # + +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); +select * from t1; +select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000; +prepare s from ' +insert into t1 values( + (select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000 + ), 123 +) +'; +execute s; +select * from t1; +select 101+count(*) + from + ( + select dt2.id + from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id<1000; +execute s; +select * from t1; + +drop table t1; + --echo # +--echo # Try this: INSERT INTO t1 VALUES ... 
reference to t1 +--echo # RETURNING (subquery not touching t1) +create table t1 (a int, b int); +create table t2 (a int, b int); + +--echo # This is accepted: +insert into t1 (a) values + (3), + ((select max(a) from t1)) +returning + a, b, (select max(a) from t2); + +drop table t1,t2; + +--echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/insert_returning.result mariadb-10.11.13/mysql-test/main/insert_returning.result --- mariadb-10.11.11/mysql-test/main/insert_returning.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert_returning.result 2025-05-19 16:14:24.000000000 +0000 @@ -498,6 +498,8 @@ 5 6 INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT id2 FROM t2); ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT 1 UNION SELECT id2 FROM t2); +ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t2 (id2, val2) VALUES (6,'f') RETURNING t1.*; ERROR 42S02: Unknown table 'test.t1' # diff -Nru mariadb-10.11.11/mysql-test/main/insert_returning.test mariadb-10.11.13/mysql-test/main/insert_returning.test --- mariadb-10.11.11/mysql-test/main/insert_returning.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert_returning.test 2025-05-19 16:14:24.000000000 +0000 @@ -199,6 +199,8 @@ t1 WHERE id1=1); --error ER_UPDATE_TABLE_USED INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT id2 FROM t2); +--error ER_UPDATE_TABLE_USED +INSERT INTO t2(id2,val2) VALUES(5,'f') RETURNING (SELECT 1 UNION SELECT id2 FROM t2); --error ER_BAD_TABLE_ERROR INSERT INTO t2 (id2, val2) VALUES (6,'f') RETURNING t1.*; diff -Nru mariadb-10.11.11/mysql-test/main/insert_select.result mariadb-10.11.13/mysql-test/main/insert_select.result --- mariadb-10.11.11/mysql-test/main/insert_select.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/insert_select.result 2025-05-19 16:14:24.000000000 +0000 @@ -1030,6 +1030,139 @@ 3 DROP VIEW v1; DROP TABLE t1; +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); +insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +explain insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 4 Using where; Using temporary +1 SIMPLE t ALL NULL NULL NULL NULL 4 Using where; Using join buffer (flat, BNL join) +explain format=json insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +EXPLAIN +{ + "query_block": { + "select_id": 1, + "temporary_table": { + "nested_loop": [ + { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 4, + "filtered": 100, + "attached_condition": "t1.`id` = 3" + } + }, + { + "block-nl-join": { + "table": { + "table_name": "t", + "access_type": "ALL", + "rows": 4, + "filtered": 100, + "attached_condition": "t.`id` = 3" + }, + "buffer_type": "flat", + "buffer_size": "65", + "join_type": "BNL" + } + } + ] + } + } +} +prepare stmt from "insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3"; +execute stmt; +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +execute stmt; +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +1 10 +deallocate prepare stmt; +create procedure p() insert into t1 +select 1,10 +from +( +select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id +) dt +where dt.id=3; +call p(); +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +1 10 +1 10 +call p(); +select * from t1; +pk id +2 2 +3 3 +4 4 +1 10 +1 10 +1 10 
+1 10 +1 10 +drop procedure p; +drop table t1; # -# End of 10.5 test +# MDEV-33139: Crash of INSERT SELECT when preparing structures for +# split optimization # +CREATE TABLE v0 ( v1 INT UNIQUE ) ; +INSERT INTO v0 ( v1 ) VALUES +( ( SELECT 1 +FROM +( SELECT v1 +FROM v0 GROUP BY v1 ) AS v6 NATURAL JOIN +v0 AS v2 NATURAL JOIN +v0 AS v4 NATURAL JOIN +v0 AS v3 NATURAL JOIN +( SELECT v1 FROM v0 ) AS v7 ) ) ; +DROP TABLE v0; +# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/insert_select.test mariadb-10.11.13/mysql-test/main/insert_select.test --- mariadb-10.11.11/mysql-test/main/insert_select.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/insert_select.test 2025-05-19 16:14:24.000000000 +0000 @@ -591,6 +591,60 @@ DROP VIEW v1; DROP TABLE t1; +# +# MDEV-32086: condition pushdown into two mergeable derived tables, +# one containing the other, when they are forced to be +# materialized in INSERT +# +create table t1 (pk int, id int); +insert into t1 values (2,2), (3,3), (4,4); + +let $q= +insert into t1 + select 1,10 + from + ( + select dt2.id from (select id from t1) dt2, t1 t where t.id=dt2.id + ) dt + where dt.id=3; + +eval $q; +select * from t1; + +eval explain $q; +eval explain format=json $q; + +eval prepare stmt from "$q"; +execute stmt; +select * from t1; +execute stmt; +select * from t1; +deallocate prepare stmt; + +eval create procedure p() $q; +call p(); +select * from t1; +call p(); +select * from t1; +drop procedure p; + +drop table t1; + --echo # ---echo # End of 10.5 test +--echo # MDEV-33139: Crash of INSERT SELECT when preparing structures for +--echo # split optimization --echo # + +CREATE TABLE v0 ( v1 INT UNIQUE ) ; +INSERT INTO v0 ( v1 ) VALUES + ( ( SELECT 1 + FROM + ( SELECT v1 + FROM v0 GROUP BY v1 ) AS v6 NATURAL JOIN + v0 AS v2 NATURAL JOIN + v0 AS v4 NATURAL JOIN + v0 AS v3 NATURAL JOIN + ( SELECT v1 FROM v0 ) AS v7 ) ) ; +DROP TABLE v0; + +--echo # End of 10.5 tests diff -Nru 
mariadb-10.11.11/mysql-test/main/join.result mariadb-10.11.13/mysql-test/main/join.result --- mariadb-10.11.11/mysql-test/main/join.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join.result 2025-05-19 16:14:24.000000000 +0000 @@ -3611,3 +3611,32 @@ 1 SIMPLE t1 ALL NULL NULL NULL NULL 100 Using where 1 SIMPLE t2 ref kp1 kp1 5 test.t1.a 1 Using index condition drop table t1,t2; +# +# MDEV-36592: If the join_condition is specified via USING (column_list), the query plan depends ... +# +CREATE TABLE t1 ( +id int(11), +f1 char(255), +PRIMARY KEY (id) +); +INSERT INTO t1 (id) VALUES (1),(2),(3); +UPDATE t1 SET f1=REPEAT('a',250); +CREATE TABLE t2 (id int(11), f2 INT NOT NULL); +INSERT INTO t2 select seq, seq from seq_1_to_20; +ANALYZE TABLE t1, t2; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +test.t2 analyze status Engine-independent statistics collected +test.t2 analyze status OK +# In both queries, t1 should use type=index, not type=ALL: +EXPLAIN SELECT count(*) FROM t2 JOIN t1 USING (id); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index PRIMARY PRIMARY 4 NULL 3 Using index +1 SIMPLE t2 ALL NULL NULL NULL NULL 20 Using where; Using join buffer (flat, BNL join) +EXPLAIN SELECT count(*) FROM t1 JOIN t2 USING (id); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 index PRIMARY PRIMARY 4 NULL 3 Using index +1 SIMPLE t2 ALL NULL NULL NULL NULL 20 Using where; Using join buffer (flat, BNL join) +DROP TABLE t1,t2; +# End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/join.test mariadb-10.11.13/mysql-test/main/join.test --- mariadb-10.11.11/mysql-test/main/join.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join.test 2025-05-19 16:14:24.000000000 +0000 @@ -2015,3 +2015,28 @@ t2.kp1=t1.a and t2.kp1<=100 and t2.kp2<=20; drop table t1,t2; + +--echo # +--echo # 
MDEV-36592: If the join_condition is specified via USING (column_list), the query plan depends ... +--echo # +CREATE TABLE t1 ( + id int(11), + f1 char(255), + PRIMARY KEY (id) +); +INSERT INTO t1 (id) VALUES (1),(2),(3); +UPDATE t1 SET f1=REPEAT('a',250); + +CREATE TABLE t2 (id int(11), f2 INT NOT NULL); +INSERT INTO t2 select seq, seq from seq_1_to_20; + +ANALYZE TABLE t1, t2; + +--echo # In both queries, t1 should use type=index, not type=ALL: +EXPLAIN SELECT count(*) FROM t2 JOIN t1 USING (id); +EXPLAIN SELECT count(*) FROM t1 JOIN t2 USING (id); + +DROP TABLE t1,t2; + +--echo # End of 10.11 tests + diff -Nru mariadb-10.11.11/mysql-test/main/join_cache.result mariadb-10.11.13/mysql-test/main/join_cache.result --- mariadb-10.11.11/mysql-test/main/join_cache.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_cache.result 2025-05-19 16:14:24.000000000 +0000 @@ -6443,3 +6443,29 @@ # # End of 10.5 tests # +# +# MDEV-36165: BKA join cache buffer is employed despite join_cache_level=3 (flat BNLH) +# +CREATE TABLE t1(a INT); +INSERT INTO t1 VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +CREATE TABLE t2(a INT, b INT); +INSERT INTO t2 SELECT a, a from t1; +CREATE TABLE t3(a INT, b INT, c INT, key (a,b)); +INSERT INTO t3 select a, a, a FROM t1; +SET optimizer_switch = 'join_cache_hashed=off,join_cache_bka=on,mrr=on'; +SET join_cache_level = 3; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t2 ALL NULL NULL NULL NULL 10 Using where +1 SIMPLE t3 ref a a 5 test.t2.a 1 Using index condition +SET join_cache_level = 4; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t2 ALL NULL NULL NULL NULL 10 Using where +1 SIMPLE t3 ref a a 5 test.t2.a 1 Using index condition +SET join_cache_level = default; +SET optimizer_switch = default; +DROP TABLE t1, 
t2, t3; +# +# End of 10.11 tests +# diff -Nru mariadb-10.11.11/mysql-test/main/join_cache.test mariadb-10.11.13/mysql-test/main/join_cache.test --- mariadb-10.11.11/mysql-test/main/join_cache.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_cache.test 2025-05-19 16:14:24.000000000 +0000 @@ -4321,3 +4321,30 @@ --echo # --echo # End of 10.5 tests --echo # + +--echo # +--echo # MDEV-36165: BKA join cache buffer is employed despite join_cache_level=3 (flat BNLH) +--echo # +--source include/have_sequence.inc +CREATE TABLE t1(a INT); +INSERT INTO t1 VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +CREATE TABLE t2(a INT, b INT); +INSERT INTO t2 SELECT a, a from t1; +CREATE TABLE t3(a INT, b INT, c INT, key (a,b)); +INSERT INTO t3 select a, a, a FROM t1; + +SET optimizer_switch = 'join_cache_hashed=off,join_cache_bka=on,mrr=on'; + +SET join_cache_level = 3; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); + +SET join_cache_level = 4; +EXPLAIN SELECT * FROM t2, t3 WHERE t2.a=t3.a AND (t3.b+1 <= t2.b+1); + +SET join_cache_level = default; +SET optimizer_switch = default; +DROP TABLE t1, t2, t3; + +--echo # +--echo # End of 10.11 tests +--echo # diff -Nru mariadb-10.11.11/mysql-test/main/join_nested.result mariadb-10.11.13/mysql-test/main/join_nested.result --- mariadb-10.11.11/mysql-test/main/join_nested.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_nested.result 2025-05-19 16:14:24.000000000 +0000 @@ -2051,3 +2051,15 @@ DROP TABLE t1, t2, t3; set join_cache_level= @save_join_cache_level; # end of 10.3 tests +# +# MDEV-32084: Assertion in best_extension_by_limited_search(), or crash elsewhere in release +# +CREATE TABLE t1 (i int); +INSERT INTO t1 values (1),(2); +SELECT 1 FROM t1 WHERE i IN +(SELECT 1 FROM t1 c +LEFT JOIN (t1 a LEFT JOIN t1 b ON t1.i = b.i) ON c.i = t1.i); +1 +1 +DROP TABLE t1; +# end of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/join_nested.test 
mariadb-10.11.13/mysql-test/main/join_nested.test --- mariadb-10.11.11/mysql-test/main/join_nested.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_nested.test 2025-05-19 16:14:24.000000000 +0000 @@ -1458,3 +1458,16 @@ set join_cache_level= @save_join_cache_level; --echo # end of 10.3 tests + +--echo # +--echo # MDEV-32084: Assertion in best_extension_by_limited_search(), or crash elsewhere in release +--echo # +CREATE TABLE t1 (i int); +INSERT INTO t1 values (1),(2); + +SELECT 1 FROM t1 WHERE i IN + (SELECT 1 FROM t1 c + LEFT JOIN (t1 a LEFT JOIN t1 b ON t1.i = b.i) ON c.i = t1.i); + +DROP TABLE t1; +--echo # end of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/join_nested_jcl6.result mariadb-10.11.13/mysql-test/main/join_nested_jcl6.result --- mariadb-10.11.11/mysql-test/main/join_nested_jcl6.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/join_nested_jcl6.result 2025-05-19 16:14:24.000000000 +0000 @@ -2060,6 +2060,18 @@ DROP TABLE t1, t2, t3; set join_cache_level= @save_join_cache_level; # end of 10.3 tests +# +# MDEV-32084: Assertion in best_extension_by_limited_search(), or crash elsewhere in release +# +CREATE TABLE t1 (i int); +INSERT INTO t1 values (1),(2); +SELECT 1 FROM t1 WHERE i IN +(SELECT 1 FROM t1 c +LEFT JOIN (t1 a LEFT JOIN t1 b ON t1.i = b.i) ON c.i = t1.i); +1 +1 +DROP TABLE t1; +# end of 10.11 tests CREATE TABLE t5 (a int, b int, c int, PRIMARY KEY(a), KEY b_i (b)); CREATE TABLE t6 (a int, b int, c int, PRIMARY KEY(a), KEY b_i (b)); CREATE TABLE t7 (a int, b int, c int, PRIMARY KEY(a), KEY b_i (b)); diff -Nru mariadb-10.11.11/mysql-test/main/large_pages.opt mariadb-10.11.13/mysql-test/main/large_pages.opt --- mariadb-10.11.11/mysql-test/main/large_pages.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/large_pages.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---large-pages +--large-pages --loose-innodb-buffer-pool-size-max=16m diff -Nru 
mariadb-10.11.11/mysql-test/main/large_pages.result mariadb-10.11.13/mysql-test/main/large_pages.result --- mariadb-10.11.11/mysql-test/main/large_pages.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/large_pages.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ call mtr.add_suppression("\\[Warning\\] (mysqld|mariadbd): Couldn't allocate [0-9]+ bytes \\((Large/HugeTLB memory|MEMLOCK) page size [0-9]+\\).*"); +call mtr.add_suppression("\\[ERROR\\]*Lock Pages in memory access rights required.*"); create table t1 ( a int not null auto_increment, b char(16) not null, diff -Nru mariadb-10.11.11/mysql-test/main/large_pages.test mariadb-10.11.13/mysql-test/main/large_pages.test --- mariadb-10.11.11/mysql-test/main/large_pages.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/large_pages.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,11 +1,9 @@ # Test of large pages (or at least the fallback to conventional allocation) -# Windows needs SeLockMemoryPrivilege ---source include/not_windows.inc --source include/have_innodb.inc call mtr.add_suppression("\\[Warning\\] (mysqld|mariadbd): Couldn't allocate [0-9]+ bytes \\((Large/HugeTLB memory|MEMLOCK) page size [0-9]+\\).*"); - +call mtr.add_suppression("\\[ERROR\\]*Lock Pages in memory access rights required.*"); create table t1 ( a int not null auto_increment, b char(16) not null, diff -Nru mariadb-10.11.11/mysql-test/main/long_unique.result mariadb-10.11.13/mysql-test/main/long_unique.result --- mariadb-10.11.11/mysql-test/main/long_unique.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/long_unique.result 2025-05-19 16:14:24.000000000 +0000 @@ -1452,4 +1452,26 @@ # CREATE TABLE t1 (pk INT, a TEXT NOT NULL DEFAULT '', PRIMARY KEY (pk), b INT AUTO_INCREMENT, UNIQUE(b), UNIQUE (a,b)) ENGINE=myisam; ERROR HY000: AUTO_INCREMENT column `b` cannot be used in the UNIQUE index `a` +# +# MDEV-35620 UBSAN: runtime error: applying zero 
offset to null pointer in _ma_unique_hash, skip_trailing_space, my_hash_sort_mb_nopad_bin and my_strnncollsp_utf8mb4_bin +# +# Disable result log. The exact result is not important. +# We just need to make sure UBSAN nullptr-with-offset is not reported. +SELECT DISTINCT user,authentication_string FROM mysql.user; +SELECT DISTINCT USER,PASSWORD FROM mysql.user; +SELECT DISTINCT USER,plugin FROM mysql.user; +# Enabling result log again. +create or replace table t1 (t text) engine=aria; +insert into t1 values (''); +insert into t1 values (NULL); +select distinct t from t1; +t + +NULL +alter table t1 ENGINE=MyISAM; +select distinct t from t1; +t + +NULL +DROP TABLE t1; # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/long_unique.test mariadb-10.11.13/mysql-test/main/long_unique.test --- mariadb-10.11.11/mysql-test/main/long_unique.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/long_unique.test 2025-05-19 16:14:24.000000000 +0000 @@ -551,4 +551,26 @@ --error ER_NO_AUTOINCREMENT_WITH_UNIQUE CREATE TABLE t1 (pk INT, a TEXT NOT NULL DEFAULT '', PRIMARY KEY (pk), b INT AUTO_INCREMENT, UNIQUE(b), UNIQUE (a,b)) ENGINE=myisam; +--echo # +--echo # MDEV-35620 UBSAN: runtime error: applying zero offset to null pointer in _ma_unique_hash, skip_trailing_space, my_hash_sort_mb_nopad_bin and my_strnncollsp_utf8mb4_bin +--echo # + +--echo # Disable result log. The exact result is not important. +--echo # We just need to make sure UBSAN nullptr-with-offset is not reported. +--disable_result_log +SELECT DISTINCT user,authentication_string FROM mysql.user; +SELECT DISTINCT USER,PASSWORD FROM mysql.user; +SELECT DISTINCT USER,plugin FROM mysql.user; +--enable_result_log +--echo # Enabling result log again. 
+ +create or replace table t1 (t text) engine=aria; +insert into t1 values (''); +insert into t1 values (NULL); +select distinct t from t1; +alter table t1 ENGINE=MyISAM; +select distinct t from t1; +DROP TABLE t1; + + --echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/lowercase_table2.result mariadb-10.11.13/mysql-test/main/lowercase_table2.result --- mariadb-10.11.11/mysql-test/main/lowercase_table2.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/lowercase_table2.result 2025-05-19 16:14:24.000000000 +0000 @@ -185,7 +185,7 @@ select TABLE_SCHEMA,TABLE_NAME FROM information_schema.TABLES where TABLE_SCHEMA ='mysqltest_LC2'; TABLE_SCHEMA TABLE_NAME -mysqltest_lc2 myUC +mysqltest_LC2 myUC use test; drop database mysqltest_LC2; # diff -Nru mariadb-10.11.11/mysql-test/main/lowercase_view.result mariadb-10.11.13/mysql-test/main/lowercase_view.result --- mariadb-10.11.11/mysql-test/main/lowercase_view.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/lowercase_view.result 2025-05-19 16:14:24.000000000 +0000 @@ -16,29 +16,17 @@ create view v2aA as select * from v1aA; create view v3Aa as select v2Aa.col1 from v2aA,t2Aa where v2Aa.col1 = t2aA.col1; insert into v2Aa values ((select max(col1) from v1aA)); -ERROR HY000: The definition of table 'v1aA' prevents operation INSERT on table 'v2Aa' insert into t1aA values ((select max(col1) from v1Aa)); -ERROR HY000: The definition of table 'v1Aa' prevents operation INSERT on table 't1aA' insert into v2aA values ((select max(col1) from v1aA)); -ERROR HY000: The definition of table 'v1aA' prevents operation INSERT on table 'v2aA' insert into v2Aa values ((select max(col1) from t1Aa)); -ERROR HY000: The definition of table 'v2Aa' prevents operation INSERT on table 'v2Aa' insert into t1aA values ((select max(col1) from t1Aa)); -ERROR HY000: Table 't1aA' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v2aA 
values ((select max(col1) from t1aA)); -ERROR HY000: The definition of table 'v2aA' prevents operation INSERT on table 'v2aA' insert into v2Aa values ((select max(col1) from v2aA)); -ERROR HY000: Table 'v2Aa' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into t1Aa values ((select max(col1) from v2Aa)); -ERROR HY000: The definition of table 'v2Aa' prevents operation INSERT on table 't1Aa' insert into v2aA values ((select max(col1) from v2Aa)); -ERROR HY000: Table 'v2aA' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v3Aa (col1) values ((select max(col1) from v1Aa)); -ERROR HY000: The definition of table 'v1Aa' prevents operation INSERT on table 'v3Aa' insert into v3aA (col1) values ((select max(col1) from t1aA)); -ERROR HY000: The definition of table 'v3aA' prevents operation INSERT on table 'v3aA' insert into v3Aa (col1) values ((select max(col1) from v2aA)); -ERROR HY000: The definition of table 'v2aA' prevents operation INSERT on table 'v3Aa' drop view v3aA,v2Aa,v1aA; drop table t1Aa,t2Aa; create table t1Aa (col1 int); diff -Nru mariadb-10.11.11/mysql-test/main/lowercase_view.test mariadb-10.11.13/mysql-test/main/lowercase_view.test --- mariadb-10.11.11/mysql-test/main/lowercase_view.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/lowercase_view.test 2025-05-19 16:14:24.000000000 +0000 @@ -23,29 +23,17 @@ create view v1Aa as select * from t1aA; create view v2aA as select * from v1aA; create view v3Aa as select v2Aa.col1 from v2aA,t2Aa where v2Aa.col1 = t2aA.col1; --- error 1443 insert into v2Aa values ((select max(col1) from v1aA)); --- error 1443 insert into t1aA values ((select max(col1) from v1Aa)); --- error 1443 insert into v2aA values ((select max(col1) from v1aA)); --- error 1443 insert into v2Aa values ((select max(col1) from t1Aa)); --- error 1093 insert into t1aA values ((select max(col1) from t1Aa)); --- error 1443 insert into 
v2aA values ((select max(col1) from t1aA)); --- error 1093 insert into v2Aa values ((select max(col1) from v2aA)); --- error 1443 insert into t1Aa values ((select max(col1) from v2Aa)); --- error 1093 insert into v2aA values ((select max(col1) from v2Aa)); --- error 1443 insert into v3Aa (col1) values ((select max(col1) from v1Aa)); --- error 1443 insert into v3aA (col1) values ((select max(col1) from t1aA)); --- error 1443 insert into v3Aa (col1) values ((select max(col1) from v2aA)); drop view v3aA,v2Aa,v1aA; drop table t1Aa,t2Aa; diff -Nru mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.result mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.result --- mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,35 @@ +use mysql; +# run mysql_install_db with --service parameter +# Start service +# -- Upgrade service (online) -- +Phase 1/10: Stopping service +Phase 2/10: Start and stop server in the old version, to avoid crash recovery (skipped) +Phase 3/10: Fixing server config file +Phase 4/10: Starting mysqld for upgrade +Phase 5/10: Waiting for startup to complete +Phase 6/10: Running mysql_upgrade +Phase 7/10: Changing service configuration +Phase 8/10: Initiating server shutdown +Phase 9/10: Waiting for shutdown to complete +Phase 10/10: Starting service +Service 'SERVICE_NAME' successfully upgraded. 
+Log file is written to UPGRADE_LOG +# upgrade_success(online)=1 +# Service stopped +# -- Upgrade service (offline) -- +Phase 1/10: Stopping service +Phase 2/10: Start and stop server in the old version, to avoid crash recovery ,this can take some time +Phase 3/10: Fixing server config file +Phase 4/10: Starting mysqld for upgrade +Phase 5/10: Waiting for startup to complete +Phase 6/10: Running mysql_upgrade +Phase 7/10: Changing service configuration +Phase 8/10: Initiating server shutdown +Phase 9/10: Waiting for shutdown to complete +Phase 10/10: Starting service (skipped) +Service 'SERVICE_NAME' successfully upgraded. +Log file is written to UPGRADE_LOG +# upgrade_success(offline)=1 +# Delete service +connection default; +# restart diff -Nru mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.test mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.test --- mariadb-10.11.11/mysql-test/main/mariadb-upgrade-service.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mariadb-upgrade-service.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,113 @@ +source include/windows.inc; +let $datadir_name=data; +let $service_name_prefix=mariadb; +let $password=password; + +source include/check_windows_admin.inc; + +# The test uses return code from sc.exe utility, which are as follows +let $ERROR_SERVICE_DOES_NOT_EXIST= 1060; +let $ERROR_SERVICE_CANNOT_ACCEPT_CTRL=1061;# intermediate, during start or stop +let $ERROR_SERVICE_NOT_ACTIVE=1062;# service stopped +let $ERROR_INVALID_SERVICE_CONTROL=1052; # The requested control is not valid for this service + +let $sc_exe= C:\Windows\System32\sc.exe; +let $ddir= $MYSQLTEST_VARDIR/tmp/$datadir_name; +let $service_name=$service_name_prefix$MASTER_MYPORT; +let TMP= $MYSQLTEST_VARDIR/tmp; +let $upgrade_log=$TMP/mysql_upgrade_service.$service_name.log; + +use mysql; +error 0,1; +rmdir $ddir; + +--disable_result_log +error 0,$ERROR_SERVICE_DOES_NOT_EXIST; +exec $sc_exe delete $service_name; 
+--enable_result_log + +source include/shutdown_mysqld.inc; +echo # run mysql_install_db with --service parameter; +--disable_result_log +exec $MYSQL_INSTALL_DB_EXE --datadir=$ddir --port=$MASTER_MYPORT --password=$password --service=$service_name --verbose-bootstrap -R; +--enable_result_log + +echo # Start service; +--disable_result_log +exec $sc_exe start $service_name; +--enable_result_log + +enable_reconnect; +source include/wait_until_connected_again.inc; +disable_reconnect; + +echo # -- Upgrade service (online) --; +--replace_result $upgrade_log UPGRADE_LOG $service_name SERVICE_NAME +let $sys_errno=0; +let $upgrade_success = 1; +error 0,1; +exec $MARIADB_UPGRADE_SERVICE_EXE --service=$service_name; + +if($sys_errno != 0) +{ + let $upgrade_success = 0; +} + +echo # upgrade_success(online)=$upgrade_success; +file_exists $upgrade_log; +if ($upgrade_success == 0) +{ + echo --detailed error(online upgrade)--; + cat_file $upgrade_log; +} +# stop service +--disable_result_log +# Wait until stopped +let $sys_errno=0; +while($sys_errno != $ERROR_SERVICE_NOT_ACTIVE) +{ + --error 0,$ERROR_SERVICE_CANNOT_ACCEPT_CTRL,$ERROR_SERVICE_NOT_ACTIVE, $ERROR_INVALID_SERVICE_CONTROL + exec $sc_exe stop $service_name; + if($sys_errno != $ERROR_SERVICE_NOT_ACTIVE) + { + --real_sleep 0.1 + } +} +--enable_result_log +echo # Service stopped; + +echo # -- Upgrade service (offline) --; +--replace_result $upgrade_log UPGRADE_LOG $service_name SERVICE_NAME +let $sys_errno=0; +let $upgrade_success = 1; +error 0,1; +exec $MARIADB_UPGRADE_SERVICE_EXE --service=$service_name; + +if($sys_errno != 0) +{ + let $upgrade_success = 0; +} + +echo # upgrade_success(offline)=$upgrade_success; +file_exists $upgrade_log; +if ($upgrade_success == 0) +{ + echo --detailed error(online upgrade)--; + cat_file $upgrade_log; +} + +echo # Delete service; +let $sys_errno=0; +--disable_result_log +exec $sc_exe delete $service_name; +--enable_result_log + +# Cleanup +source include/wait_until_disconnected.inc; 
+rmdir $ddir; +remove_file $upgrade_log; +let TEMP=$old_temp; + +#restart original server +connection default; +source include/start_mysqld.inc; diff -Nru mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.result mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.result --- mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,21 @@ +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +INSERT INTO t VALUES ('a','b'); +DROP TABLE t; +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +DELETE FROM t; +DROP TABLE t; +CREATE TABLE t (a INT(1),d INT(1),b VARCHAR(1),c CHAR(1),c3 INT(1) GENERATED ALWAYS AS ((a + LENGTH (d))) STORED,c2 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,k1 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,PRIMARY KEY(b (1),a,d),KEY d (d),KEY a (a),KEY c_renamed (c (1),b (1)),KEY b (b (1),c (1),a),KEY k1 (k1),KEY a_2 (a,k1),KEY k1_2 (k1,d)) DEFAULT CHARSET=latin1 ENGINE=InnoDB; +DELETE FROM t; +DROP TABLE t; +CREATE TABLE t (a INT,ROW_START TIMESTAMP(6) AS ROW START,ROW_END TIMESTAMP(6) AS ROW END,PERIOD FOR SYSTEM_TIME(ROW_START,ROW_END),INDEX (ROW_START),INDEX (ROW_END),PRIMARY KEY(ROW_END,a,ROW_START),INDEX (ROW_END,ROW_START,a)) WITH SYSTEM VERSIONING ENGINE=InnoDB; +SHOW INDEX FROM t; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t 0 PRIMARY 1 ROW_END A 0 NULL NULL BTREE NO +t 0 PRIMARY 2 a A 0 NULL NULL BTREE NO +t 0 PRIMARY 3 ROW_START A 0 NULL NULL BTREE NO +t 1 ROW_START 1 ROW_START A 0 NULL NULL BTREE NO +t 1 ROW_END 1 ROW_END A 0 NULL NULL BTREE NO +t 1 ROW_END_2 1 ROW_END A 0 NULL NULL BTREE NO +t 1 ROW_END_2 2 ROW_START A 0 NULL NULL BTREE NO +t 1 ROW_END_2 3 a A 0 NULL NULL BTREE NO +DROP TABLE 
t; diff -Nru mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.test mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.test --- mariadb-10.11.11/mysql-test/main/mdev-35721-ubsan.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdev-35721-ubsan.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,22 @@ + +--source include/have_innodb.inc + +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +INSERT INTO t VALUES ('a','b'); + +DROP TABLE t; + +CREATE TABLE t (c1 VARCHAR(10),c2 VARCHAR(10),PRIMARY KEY(c1,c2),FULLTEXT KEY k (c2)) ENGINE=InnoDB; +DELETE FROM t; + +DROP TABLE t; + +CREATE TABLE t (a INT(1),d INT(1),b VARCHAR(1),c CHAR(1),c3 INT(1) GENERATED ALWAYS AS ((a + LENGTH (d))) STORED,c2 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,k1 CHAR(1) GENERATED ALWAYS AS (SUBSTR(b,0,0)) VIRTUAL,PRIMARY KEY(b (1),a,d),KEY d (d),KEY a (a),KEY c_renamed (c (1),b (1)),KEY b (b (1),c (1),a),KEY k1 (k1),KEY a_2 (a,k1),KEY k1_2 (k1,d)) DEFAULT CHARSET=latin1 ENGINE=InnoDB; +DELETE FROM t; + +DROP TABLE t; + +CREATE TABLE t (a INT,ROW_START TIMESTAMP(6) AS ROW START,ROW_END TIMESTAMP(6) AS ROW END,PERIOD FOR SYSTEM_TIME(ROW_START,ROW_END),INDEX (ROW_START),INDEX (ROW_END),PRIMARY KEY(ROW_END,a,ROW_START),INDEX (ROW_END,ROW_START,a)) WITH SYSTEM VERSIONING ENGINE=InnoDB; +SHOW INDEX FROM t; + +DROP TABLE t; diff -Nru mariadb-10.11.11/mysql-test/main/mdl_sync.result mariadb-10.11.13/mysql-test/main/mdl_sync.result --- mariadb-10.11.11/mysql-test/main/mdl_sync.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdl_sync.result 2025-05-19 16:14:24.000000000 +0000 @@ -2431,9 +2431,6 @@ create table t2 (a int) stats_persistent=0, engine=innodb; insert into t1 values (1); insert into t2 values (1); -connect con1, localhost, root; -start transaction with consistent snapshot; -connection default; SET DEBUG_SYNC= 'after_open_table_mdl_shared SIGNAL table_opened WAIT_FOR grlwait 
execute 2'; update t1,t2 set t1.a=2,t2.a=3; connection con2; @@ -2456,6 +2453,7 @@ SET DEBUG_SYNC= 'now WAIT_FOR table_opened'; SET DEBUG_SYNC= 'mdl_acquire_lock_wait SIGNAL grlwait'; FLUSH TABLES WITH READ LOCK; +InnoDB 0 transactions not purged SELECT LOCK_MODE, LOCK_TYPE, TABLE_SCHEMA, TABLE_NAME FROM information_schema.metadata_lock_info; LOCK_MODE LOCK_TYPE TABLE_SCHEMA TABLE_NAME MDL_BACKUP_FTWRL2 Backup lock @@ -2465,7 +2463,6 @@ SET DEBUG_SYNC= 'RESET'; drop table t1,t2; disconnect con2; -disconnect con1; # # Bug#50786 Assertion `thd->mdl_context.trans_sentinel() == __null' # failed in open_ltable() diff -Nru mariadb-10.11.11/mysql-test/main/mdl_sync.test mariadb-10.11.13/mysql-test/main/mdl_sync.test --- mariadb-10.11.11/mysql-test/main/mdl_sync.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mdl_sync.test 2025-05-19 16:14:24.000000000 +0000 @@ -3115,12 +3115,6 @@ insert into t1 values (1); insert into t2 values (1); -connect (con1, localhost, root); -# disable innodb purge thread, otherwise it might start purging t2, -# and will take an mdl, affecting metadata_lock_info output. 
-start transaction with consistent snapshot; -connection default; - SET DEBUG_SYNC= 'after_open_table_mdl_shared SIGNAL table_opened WAIT_FOR grlwait execute 2'; --send update t1,t2 set t1.a=2,t2.a=3 @@ -3156,6 +3150,7 @@ let $wait_condition= SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info; --source include/wait_condition.inc +--source ../suite/innodb/include/wait_all_purged.inc SELECT LOCK_MODE, LOCK_TYPE, TABLE_SCHEMA, TABLE_NAME FROM information_schema.metadata_lock_info; unlock tables; @@ -3166,7 +3161,6 @@ SET DEBUG_SYNC= 'RESET'; drop table t1,t2; disconnect con2; -disconnect con1; --echo # --echo # Bug#50786 Assertion `thd->mdl_context.trans_sentinel() == __null' diff -Nru mariadb-10.11.11/mysql-test/main/merge.result mariadb-10.11.13/mysql-test/main/merge.result --- mariadb-10.11.11/mysql-test/main/merge.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/merge.result 2025-05-19 16:14:24.000000000 +0000 @@ -3678,33 +3678,22 @@ insert into t1 (a) values (1); insert into t3 (b) values (1); insert into m1 (a) values ((select max(a) from m1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from m2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t3, m1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t3, m2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' 
and as a separate source for data insert into m1 (a) values ((select max(a) from t3, t1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from t3, t2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, m1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, m2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, t1)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from tmp, t2)); -ERROR HY000: Table 'm1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into m1 (a) values ((select max(a) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'm1' insert into m1 (a) values ((select max(a) from tmp, v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'm1' +select count(*) from m1; +count(*) +15 drop view v1; drop temporary table tmp; drop table t1, t2, t3, m1, m2; diff -Nru mariadb-10.11.11/mysql-test/main/merge.test mariadb-10.11.13/mysql-test/main/merge.test --- mariadb-10.11.11/mysql-test/main/merge.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/merge.test 2025-05-19 16:14:24.000000000 +0000 @@ -2670,37 +2670,24 @@ insert into t1 (a) values (1); insert into t3 (b) values (1); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from m1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from m2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values 
((select max(a) from t1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, m1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, m2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, t1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from t3, t2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, m1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, m2)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, t1)); ---error ER_UPDATE_TABLE_USED insert into m1 (a) values ((select max(a) from tmp, t2)); - ---error ER_VIEW_PREVENT_UPDATE + insert into m1 (a) values ((select max(a) from v1)); ---error ER_VIEW_PREVENT_UPDATE insert into m1 (a) values ((select max(a) from tmp, v1)); +select count(*) from m1; drop view v1; diff -Nru mariadb-10.11.11/mysql-test/main/multi_update.result mariadb-10.11.13/mysql-test/main/multi_update.result --- mariadb-10.11.11/mysql-test/main/multi_update.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/multi_update.result 2025-05-19 16:14:24.000000000 +0000 @@ -1389,3 +1389,23 @@ 12 5 8 drop table t1,t2,t3,t; # End of 10.4 tests +# +# MDEV-31647 Stack looping and SIGSEGV in Item_args::walk_args on UPDATE +# +create table t1 (c int, c2 int) engine=innodb; +update t1 set c=0 where c=( +select 1 from (select 1 as v1) as v2 +natural join t1) order by last_value (c2) over (order by c2); +ERROR HY000: Invalid use of group function +update t1 set c=0 where c=( +select 1 from (select 1 as v1) as v2 +natural join t1) order by last_value (c2) over (); +ERROR HY000: Invalid use of group function +update t1 set c=0 where c=( +select 1 from (select 1 as v1) as v2 +natural join t1) order by c2; +select 1 from (select 1 as v1) 
as v2 +natural join t1 order by last_value (c2) over (order by c2); +1 +drop table t1; +# End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/multi_update.test mariadb-10.11.13/mysql-test/main/multi_update.test --- mariadb-10.11.11/mysql-test/main/multi_update.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/multi_update.test 2025-05-19 16:14:24.000000000 +0000 @@ -1200,3 +1200,31 @@ drop table t1,t2,t3,t; --echo # End of 10.4 tests + +--echo # +--echo # MDEV-31647 Stack looping and SIGSEGV in Item_args::walk_args on UPDATE +--echo # +--source include/have_innodb.inc +create table t1 (c int, c2 int) engine=innodb; + +--error ER_INVALID_GROUP_FUNC_USE +update t1 set c=0 where c=( + select 1 from (select 1 as v1) as v2 + natural join t1) order by last_value (c2) over (order by c2); + +--error ER_INVALID_GROUP_FUNC_USE +update t1 set c=0 where c=( + select 1 from (select 1 as v1) as v2 + natural join t1) order by last_value (c2) over (); + +update t1 set c=0 where c=( + select 1 from (select 1 as v1) as v2 + natural join t1) order by c2; + +select 1 from (select 1 as v1) as v2 + natural join t1 order by last_value (c2) over (order by c2); + + +drop table t1; + +--echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.opt mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.opt --- mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--slOw_QuEry_loG=OFF diff -Nru mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.result mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.result --- mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,8 @@ 
+# +# MDEV-27126: my_getopt compares option names case sensitively +# +# Check if the variable is set correctly from options +SELECT @@GLOBAL.slow_query_log; +@@GLOBAL.slow_query_log +0 +# End of test. diff -Nru mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.test mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.test --- mariadb-10.11.11/mysql-test/main/my_getopt_case_insensitive.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/my_getopt_case_insensitive.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,8 @@ +--echo # +--echo # MDEV-27126: my_getopt compares option names case sensitively +--echo # + +--echo # Check if the variable is set correctly from options +SELECT @@GLOBAL.slow_query_log; + +--echo # End of test. diff -Nru mariadb-10.11.11/mysql-test/main/myisam-big.result mariadb-10.11.13/mysql-test/main/myisam-big.result --- mariadb-10.11.11/mysql-test/main/myisam-big.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/myisam-big.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,7 @@ drop table if exists t1,t2; +call mtr.add_suppression("Index.*try to repair it"); +call mtr.add_suppression("Disk got full"); +call mtr.add_suppression("Got an error from thread_id"); create table t1 (id int, sometext varchar(100)) engine=myisam; insert into t1 values (1, "hello"),(2, "hello2"),(4, "hello3"),(4, "hello4"); create table t2 like t1; @@ -43,4 +46,9 @@ connection con2; disconnect con2; connection default; +SET @saved_dbug = @@SESSION.debug_dbug; +SET debug_dbug='+d,simulate_file_pwrite_error'; +insert into t1 select * from t2; +ERROR HY000: Disk got full writing 'test.t1' (Errcode: 28 "No space left on device") +SET debug_dbug= @saved_dbug; drop table t1,t2; diff -Nru mariadb-10.11.11/mysql-test/main/myisam-big.test mariadb-10.11.13/mysql-test/main/myisam-big.test --- mariadb-10.11.11/mysql-test/main/myisam-big.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/myisam-big.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,12 +1,17 @@ # # Test bugs in the MyISAM code that require more space/time --source include/big_test.inc +--source include/have_debug.inc # Initialise --disable_warnings drop table if exists t1,t2; --enable_warnings +call mtr.add_suppression("Index.*try to repair it"); +call mtr.add_suppression("Disk got full"); +call mtr.add_suppression("Got an error from thread_id"); + # # BUG#925377: # Querying myisam table metadata while 'alter table..enable keys' is @@ -61,4 +66,12 @@ reap; disconnect con2; connection default; + +# +# Test error message from disk full +SET @saved_dbug = @@SESSION.debug_dbug; +SET debug_dbug='+d,simulate_file_pwrite_error'; +--error ER_DISK_FULL +insert into t1 select * from t2; +SET debug_dbug= @saved_dbug; drop table t1,t2; diff -Nru mariadb-10.11.11/mysql-test/main/mysql-interactive.result mariadb-10.11.13/mysql-test/main/mysql-interactive.result --- mariadb-10.11.11/mysql-test/main/mysql-interactive.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql-interactive.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ delimiter $ select 1; $ +exit Welcome to the MariaDB monitor. Commands end with ; or \g. 
Your MariaDB connection id is X Server version: Y @@ -21,4 +22,5 @@ +---+ 1 row in set -MariaDB [(none)]> \ No newline at end of file +MariaDB [(none)]> exit +Bye diff -Nru mariadb-10.11.11/mysql-test/main/mysql-interactive.test mariadb-10.11.13/mysql-test/main/mysql-interactive.test --- mariadb-10.11.11/mysql-test/main/mysql-interactive.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql-interactive.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,23 +6,16 @@ # this would need an instrumented ncurses library source include/not_msan.inc; -error 0,1; -exec $MYSQL -V|grep -q readline; -if ($sys_errno == 1) -{ - # strangely enough - skip does not work with libedit; -} - write_file $MYSQL_TMP_DIR/mysql_in; delimiter $ select 1; $ +exit EOF let TERM=dumb; replace_regex /id is \d+/id is X/ /Server version: .*/Server version: Y/ / \(\d+\.\d+ sec\)//; error 0,127; -exec socat EXEC:"$MYSQL",pty STDIO < $MYSQL_TMP_DIR/mysql_in; +exec socat -t10 EXEC:"$MYSQL",pty STDIO < $MYSQL_TMP_DIR/mysql_in; if ($sys_errno == 127) { remove_file $MYSQL_TMP_DIR/mysql_in; diff -Nru mariadb-10.11.11/mysql-test/main/mysql_upgrade-34014.result mariadb-10.11.13/mysql-test/main/mysql_upgrade-34014.result --- mariadb-10.11.11/mysql-test/main/mysql_upgrade-34014.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql_upgrade-34014.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,6 +12,8 @@ SHOW CREATE DATABASE sys; Database Create Database sys CREATE DATABASE `sys` /*!40100 DEFAULT CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_ci */ +Warnings: +Note 1105 Database 'sys' does not have a db.opt file. 
You can create one with ALTER DATABASE if needed Phase 1/8: Checking and upgrading mysql database Processing databases mysql diff -Nru mariadb-10.11.11/mysql-test/main/mysql_upgrade.result mariadb-10.11.13/mysql-test/main/mysql_upgrade.result --- mariadb-10.11.11/mysql-test/main/mysql_upgrade.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql_upgrade.result 2025-05-19 16:14:24.000000000 +0000 @@ -151,7 +151,8 @@ Phase 8/8: Running 'FLUSH PRIVILEGES' OK Run it again - should say already completed -This installation of MariaDB is already upgraded to VERSION.There is no need to run mysql_upgrade again for VERSION. +This installation of MariaDB is already upgraded to X.Y.Z-MariaDB. +There is no need to run mysql_upgrade again. You can use --force if you still want to run mysql_upgrade Force should run it regardless of whether it has been run before Phase 1/8: Checking and upgrading mysql database @@ -1911,11 +1912,11 @@ # # MDEV-27279: mariadb_upgrade add --check-if-upgrade-is-needed # -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . +This installation of MariaDB is already upgraded to X.Y.Z-MariaDB. +There is no need to run mysql_upgrade again. Looking for 'mariadb' as: mariadb -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . +This installation of MariaDB is already upgraded to X.Y.Z-MariaDB. +There is no need to run mysql_upgrade again. # # MDEV-27279: mariadb_upgrade check-if-upgrade absence is do it # @@ -1925,17 +1926,17 @@ # MDEV-27279: mariadb_upgrade check-if-upgrade with minor version change # Looking for 'mariadb' as: mariadb -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . -This installation of MariaDB is already upgraded to MariaDB . -There is no need to run mysql_upgrade again for MariaDB . 
+This installation of MariaDB is already upgraded to X.Y.0-MariaDB. +There is no need to run mysql_upgrade again for X.Y.Z-MariaDB, because they're both X.Y. +This installation of MariaDB is already upgraded to X.Y.0-MariaDB. +There is no need to run mysql_upgrade again for X.Y.Z-MariaDB, because they're both X.Y. You can use --force if you still want to run mysql_upgrade # # MDEV-27279: mariadb_upgrade check-if-upgrade with major version change # -Major version upgrade detected from MariaDB to MariaDB . Check required! +Major version upgrade detected from X.0.99 to X.Y.Z-MariaDB. Check required! Looking for 'mysql' as: mysql -Major version upgrade detected from MariaDB to MariaDB . Check required! +Major version upgrade detected from X.0.99 to X.Y.Z-MariaDB. Check required! drop table mysql.global_priv; rename table mysql.global_priv_bak to mysql.global_priv; # End of 10.2 tests diff -Nru mariadb-10.11.11/mysql-test/main/mysql_upgrade.test mariadb-10.11.13/mysql-test/main/mysql_upgrade.test --- mariadb-10.11.11/mysql-test/main/mysql_upgrade.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysql_upgrade.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,12 @@ +--source include/long_test.inc -- source include/mysql_upgrade_preparation.inc -- source include/have_working_dns.inc -- source include/have_innodb.inc -- source include/have_partition.inc --- source include/no_valgrind_without_big.inc + +let majorminor=`select substring_index(version(), '.', 2)`; +# for major upgrade test, see below +let major=`select substring_index(version(), '.', 1) - (version() like '%.0.%')`; set sql_mode=""; @@ -19,7 +23,7 @@ file_exists $MYSQLD_DATADIR/mysql_upgrade_info; --echo Run it again - should say already completed ---replace_regex /upgraded to [^\n].*/upgraded to VERSION./ /again for [^\n]*/again for VERSION./ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB --exec $MYSQL_UPGRADE 2>&1 # It should have created a file in the MySQL Servers 
datadir @@ -289,10 +293,11 @@ --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --silent ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB +--replace_regex /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --verbose @@ -320,16 +325,18 @@ my $file= $ENV{'DATADIR'} or die "MYSQLD_DATADIR not set"; $ver =~ s/^(\d*)\.(\d*).(\d*)(.*)/$1.$2.0$4/; open(FILE, ">$file/mysql_upgrade_info") or die "Failed to open $file"; + binmode FILE; print FILE "$ver\n"; close(FILE); EOF --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --silent ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $majorminor X.Y +--replace_regex /'mariadb.* as:[^\n]*/'mariadb' as: mariadb/ --error 1 --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --verbose ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $majorminor X.Y --exec $MYSQL_UPGRADE --remove_file $MYSQLD_DATADIR/mysql_upgrade_info @@ -344,16 +351,18 @@ perl; my $ver= $ENV{'MYSQL_SERVER_VERSION'} or die "MYSQL_SERVER_VERSION not set"; my $file= $ENV{'DATADIR'} or die "MYSQLD_DATADIR not set"; - $ver =~ s/^(\d*)\.(\d*).(\d*)(.*)/$1.0.$3$4/; + $ver =~ s/^(\d*)\.(\d*).(\d*)(.*)/$1.0.99/; open(FILE, ">$file/mysql_upgrade_info") or die "Failed to open $file"; + binmode FILE; print FILE "$ver\n"; close(FILE); EOF --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --silent ---replace_regex /\d\d\.\d*\.\d*[^ .\n]*/MariaDB / +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $major X --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed ---replace_regex 
/\d\d\.\d*\.\d*[^ .\n]*/MariaDB / /'mariadb.* as:[^\n]*/'mysql' as: mysql/ +--replace_result $MYSQL_SERVER_VERSION X.Y.Z-MariaDB $major X +--replace_regex /'mariadb.* as:[^\n]*/'mysql' as: mysql/ --exec $MYSQL_UPGRADE --check-if-upgrade-is-needed --verbose --remove_file $MYSQLD_DATADIR/mysql_upgrade_info drop table mysql.global_priv; diff -Nru mariadb-10.11.11/mysql-test/main/mysqld--help.result mariadb-10.11.13/mysql-test/main/mysqld--help.result --- mariadb-10.11.11/mysql-test/main/mysqld--help.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqld--help.result 2025-05-19 16:14:24.000000000 +0000 @@ -748,7 +748,8 @@ keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in - selectivity_for_indexes. selectivity_multiplier. This + selectivity_for_indexes. fix_derived_table_read_cost = + Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. Use 'ALL' to set all combinations. 
diff -Nru mariadb-10.11.11/mysql-test/main/mysqldump-system.result mariadb-10.11.13/mysql-test/main/mysqldump-system.result --- mariadb-10.11.11/mysql-test/main/mysqldump-system.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqldump-system.result 2025-05-19 16:14:24.000000000 +0000 @@ -650,21 +650,21 @@ /*M!100401 UNINSTALL PLUGIN IF EXIST cleartext_plugin_server */; INSTALL PLUGIN cleartext_plugin_server SONAME 'AUTH_TEST_PLUGIN_LIB'; DELIMITER | -/*M!100101 IF current_user()="'mariadb.sys'@'localhost'" THEN +/*M!100101 IF current_user()='''mariadb.sys''@''localhost''' THEN SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001, MESSAGE_TEXT="Don't remove current user 'mariadb.sys'@'localhost''"; END IF */| DELIMITER ; /*!50701 DROP USER IF EXISTS 'mariadb.sys'@'localhost' */; CREATE /*M!100103 OR REPLACE */ USER `mariadb.sys`@`localhost` PASSWORD EXPIRE; DELIMITER | -/*M!100101 IF current_user()="'root'@'localhost'" THEN +/*M!100101 IF current_user()='''root''@''localhost''' THEN SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001, MESSAGE_TEXT="Don't remove current user 'root'@'localhost''"; END IF */| DELIMITER ; /*!50701 DROP USER IF EXISTS 'root'@'localhost' */; CREATE /*M!100103 OR REPLACE */ USER `root`@`localhost`; DELIMITER | -/*M!100101 IF current_user()="'foobar'@'%'" THEN +/*M!100101 IF current_user()='''foobar''@''%''' THEN SIGNAL SQLSTATE '45000' SET MYSQL_ERRNO=30001, MESSAGE_TEXT="Don't remove current user 'foobar'@'%''"; END IF */| DELIMITER ; diff -Nru mariadb-10.11.11/mysql-test/main/mysqldump.result mariadb-10.11.13/mysql-test/main/mysqldump.result --- mariadb-10.11.11/mysql-test/main/mysqldump.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -6747,6 +6747,39 @@ /*!40101 SET character_set_client = @saved_cs_client */; ERROR at line 9: Not allowed in the sandbox mode drop table t1; +# +# MDEV-36268 mariadb-dump used wrong quoting character 
+# +create table t1 (a int); +create view `v'1"2` as select * from t1 with check option; +/*M!999999\- enable the sandbox mode */ +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!40101 SET character_set_client = utf8mb4 */; +CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci; +/*!40101 SET character_set_client = @saved_cs_client */; +SET @saved_cs_client = @@character_set_client; +SET character_set_client = utf8mb4; +/*!50001 CREATE VIEW `v'1"2` AS SELECT + 1 AS `a` */; +SET character_set_client = @saved_cs_client; +/*!50001 DROP VIEW IF EXISTS `v'1"2`*/; +/*!50001 SET @saved_cs_client = @@character_set_client */; +/*!50001 SET @saved_cs_results = @@character_set_results */; +/*!50001 SET @saved_col_connection = @@collation_connection */; +/*!50001 SET character_set_client = utf8mb3 */; +/*!50001 SET character_set_results = utf8mb3 */; +/*!50001 SET collation_connection = utf8mb3_general_ci */; +/*!50001 CREATE ALGORITHM=UNDEFINED */ +/*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ +/*!50001 VIEW `v'1"2` AS select `t1`.`a` AS `a` from `t1` */ +/*!50002 WITH CASCADED CHECK OPTION */; +/*!50001 SET character_set_client = @saved_cs_client */; +/*!50001 SET character_set_results = @saved_cs_results */; +/*!50001 SET collation_connection = @saved_col_connection */; +drop view `v'1"2`; +drop table t1; # End of 10.5 tests # # MDEV-16733 mysqldump --tab and --xml options are conflicting diff -Nru mariadb-10.11.11/mysql-test/main/mysqldump.test mariadb-10.11.13/mysql-test/main/mysqldump.test --- mariadb-10.11.11/mysql-test/main/mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ ---source include/no_valgrind_without_big.inc +--source include/long_test.inc --source include/have_utf8mb4.inc call mtr.add_suppression("@003f.frm' \\(errno: 22\\)"); @@ -3029,6 +3029,15 @@ --remove_file 
$MYSQLTEST_VARDIR/tmp/mdev33727.sql drop table t1; +--echo # +--echo # MDEV-36268 mariadb-dump used wrong quoting character +--echo # +create table t1 (a int); +create view `v'1"2` as select * from t1 with check option; # "' +--exec $MYSQL_DUMP --compact test +drop view `v'1"2`; # "' +drop table t1; + --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/mysqlslap.result mariadb-10.11.13/mysql-test/main/mysqlslap.result --- mariadb-10.11.11/mysql-test/main/mysqlslap.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqlslap.result 2025-05-19 16:14:24.000000000 +0000 @@ -260,3 +260,6 @@ # # Bug MDEV-15789 (Upstream: #80329): MYSQLSLAP OPTIONS --AUTO-GENERATE-SQL-GUID-PRIMARY and --AUTO-GENERATE-SQL-SECONDARY-INDEXES DONT WORK # +# +# Bug MDEV-34621: Fix division by zero in mariadb-slap when iterations=0 +# diff -Nru mariadb-10.11.11/mysql-test/main/mysqlslap.test mariadb-10.11.13/mysql-test/main/mysqlslap.test --- mariadb-10.11.11/mysql-test/main/mysqlslap.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqlslap.test 2025-05-19 16:14:24.000000000 +0000 @@ -88,3 +88,9 @@ --exec $MYSQL_SLAP --concurrency=1 --silent --iterations=1 --number-int-cols=2 --number-char-cols=3 --auto-generate-sql --auto-generate-sql-guid-primary --create-schema=slap --exec $MYSQL_SLAP --concurrency=1 --silent --iterations=1 --number-int-cols=2 --number-char-cols=3 --auto-generate-sql --auto-generate-sql-secondary-indexes=1 --create-schema=slap + +--echo # +--echo # Bug MDEV-34621: Fix division by zero in mariadb-slap when iterations=0 +--echo # + +--exec $MYSQL_SLAP -i0 --only-print diff -Nru mariadb-10.11.11/mysql-test/main/mysqltest.result mariadb-10.11.13/mysql-test/main/mysqltest.result --- mariadb-10.11.11/mysql-test/main/mysqltest.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqltest.result 2025-05-19 16:14:24.000000000 +0000 @@ -989,4 +989,13 @@ foo\"bar foo\"bar 
set sql_mode=default; +# +# MDEV-29344: engines/iuds.insert_time cannot run with PS protocol (syntax error) +# +SELECT 1 /* doesn't throw error */; +1 +1 +SELECT 1 /* doesn't throw error */; +1 +1 End of tests diff -Nru mariadb-10.11.11/mysql-test/main/mysqltest.test mariadb-10.11.13/mysql-test/main/mysqltest.test --- mariadb-10.11.11/mysql-test/main/mysqltest.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/mysqltest.test 2025-05-19 16:14:24.000000000 +0000 @@ -2954,6 +2954,12 @@ select "foo\""bar"; set sql_mode=default; +--echo # +--echo # MDEV-29344: engines/iuds.insert_time cannot run with PS protocol (syntax error) +--echo # +SELECT 1 /* doesn't throw error */; +SELECT 1 /* doesn't throw error */; + --echo End of tests # Wait till we reached the initial number of concurrent sessions diff -Nru mariadb-10.11.11/mysql-test/main/partition_myisam.result mariadb-10.11.13/mysql-test/main/partition_myisam.result --- mariadb-10.11.11/mysql-test/main/partition_myisam.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/partition_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -259,3 +259,24 @@ Table Op Msg_type Msg_text test.t1 check status OK DROP TABLE t1; +# +# MDEV-31122 Server crash in get_lock_data / mysql_lock_abort_for_thread +# +CREATE TABLE t1 (a INT); +CREATE TABLE t2 (b INT, c varchar(5)) +PARTITION BY RANGE COLUMNS(c) +SUBPARTITION by key(b) SUBPARTITIONS 2 ( +PARTITION p0 VALUES LESS THAN ('m'), +PARTITION p1 VALUES LESS THAN ('z') +); +connect con1,localhost,root,,; +HANDLER t1 OPEN; +SELECT b FROM t2 PARTITION (p0); +connection default; +SET lock_wait_timeout= 1; +ALTER TABLE t1 FORCE; +connection con1; +b +disconnect con1; +connection default; +DROP TABLE t2, t1; diff -Nru mariadb-10.11.11/mysql-test/main/partition_myisam.test mariadb-10.11.13/mysql-test/main/partition_myisam.test --- mariadb-10.11.11/mysql-test/main/partition_myisam.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/partition_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -249,3 +249,31 @@ ALTER TABLE `t1` REMOVE PARTITIONING; CHECK TABLE `t1` EXTENDED; DROP TABLE t1; + +--echo # +--echo # MDEV-31122 Server crash in get_lock_data / mysql_lock_abort_for_thread +--echo # +CREATE TABLE t1 (a INT); + +CREATE TABLE t2 (b INT, c varchar(5)) + PARTITION BY RANGE COLUMNS(c) + SUBPARTITION by key(b) SUBPARTITIONS 2 ( + PARTITION p0 VALUES LESS THAN ('m'), + PARTITION p1 VALUES LESS THAN ('z') + ); + +--connect (con1,localhost,root,,) +HANDLER t1 OPEN; +--send + SELECT b FROM t2 PARTITION (p0); + +--connection default +SET lock_wait_timeout= 1; +--error 0,ER_STATEMENT_TIMEOUT,ER_LOCK_WAIT_TIMEOUT +ALTER TABLE t1 FORCE; + +--connection con1 +--reap +--disconnect con1 +--connection default +DROP TABLE t2, t1; diff -Nru mariadb-10.11.11/mysql-test/main/query_cache.result mariadb-10.11.13/mysql-test/main/query_cache.result --- mariadb-10.11.11/mysql-test/main/query_cache.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/query_cache.result 2025-05-19 16:14:24.000000000 +0000 @@ -2241,6 +2241,29 @@ set global Query_cache_size=18446744073709547520; SET GLOBAL query_cache_size= @qc; # +# MDEV-34075 corruption when query cache cannot allocate block +# +set global query_cache_type=1; +create table t1 (c1 smallint null, c2 binary (25) not null, c3 tinyint(4) null, c4 binary (15) not null primary key, c5 smallint not null unique key,c6 decimal(10,8) not null default 3.141592) engine=innodb; +set global query_cache_size=81920; +select * from t1 where b=1 and c=1; +ERROR 42S22: Unknown column 'b' in 'WHERE' +set session query_cache_type=1; +drop table t1; +create table t1 (c1 int not null, c2 char(5)) engine=innodb partition by linear key(c1) partitions 99; +select * from t1 where c1 <='1998-12-29 00:00:00' order by c1,c2; +c1 c2 +select group_concat(a separator '###') as names from t1 having left(names, 1)='j'; +ERROR 42S22: Unknown 
column 'a' in 'SELECT' +select * from t1; +c1 c2 +select count(*) from t1; +count(*) +0 +select G.a, c.a from t1 c, t1 G; +ERROR 42S22: Unknown column 'G.a' in 'SELECT' +drop table t1; +# # End of 10.5 tests # # diff -Nru mariadb-10.11.11/mysql-test/main/query_cache.test mariadb-10.11.13/mysql-test/main/query_cache.test --- mariadb-10.11.11/mysql-test/main/query_cache.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/query_cache.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,8 @@ -- source include/long_test.inc -- source include/no_valgrind_without_big.inc -- source include/no_view_protocol.inc +-- source include/have_partition.inc +-- source include/have_innodb.inc --disable_ps2_protocol set @save_query_cache_size=@@query_cache_size; @@ -1853,6 +1855,26 @@ --enable_warnings --echo # +--echo # MDEV-34075 corruption when query cache cannot allocate block +--echo # +set global query_cache_type=1; +create table t1 (c1 smallint null, c2 binary (25) not null, c3 tinyint(4) null, c4 binary (15) not null primary key, c5 smallint not null unique key,c6 decimal(10,8) not null default 3.141592) engine=innodb; +set global query_cache_size=81920; +--error ER_BAD_FIELD_ERROR +select * from t1 where b=1 and c=1; +set session query_cache_type=1; +drop table t1; +create table t1 (c1 int not null, c2 char(5)) engine=innodb partition by linear key(c1) partitions 99; +select * from t1 where c1 <='1998-12-29 00:00:00' order by c1,c2; +--error ER_BAD_FIELD_ERROR +select group_concat(a separator '###') as names from t1 having left(names, 1)='j'; +select * from t1; +select count(*) from t1; +--error ER_BAD_FIELD_ERROR +select G.a, c.a from t1 c, t1 G; +drop table t1; + +--echo # --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/range_notembedded.result mariadb-10.11.13/mysql-test/main/range_notembedded.result --- mariadb-10.11.11/mysql-test/main/range_notembedded.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/main/range_notembedded.result 2025-05-19 16:14:24.000000000 +0000 @@ -247,3 +247,70 @@ id 5 DROP TABLE t1; +# +# MDEV-34620: Many index_merge variants made and discarded for a big OR +# +CREATE TABLE t1 ( +a1 int NOT NULL, +a2 int NOT NULL, +filler char(100), +KEY key1 (a1,a2), +KEY key2 (a2,a1) +); +insert into t1 (a1,a2) values (1,1),(2,2),(3,3); +set @query= concat( +"explain select * from t1 where\n", +(select +group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) +from seq_1_to_30) +); +set optimizer_trace=1; +prepare s from @query; +execute s; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL key1,key2 NULL NULL NULL 3 Using where +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); +# Observe that "key1" is a a part of several index_merge_union: +select json_pretty(json_search(@trace, 'all', 'key1')); +json_pretty(json_search(@trace, 'all', 'key1')) +[ + "$[0].potential_range_indexes[0].index", + "$[0].analyzing_range_alternatives.range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[0].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[0].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[1].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[0].indexes_to_merge[1].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[0].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[0].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[1].range_scan_alternatives[0].index", + 
"$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[1].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[2].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[1].indexes_to_merge[2].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[0].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[0].index_to_merge", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[1].range_scan_alternatives[0].index", + "$[0].analyzing_range_alternatives.analyzing_index_merge_union[2].indexes_to_merge[1].index_to_merge" +] +# +# Now, same as above but for a long IN-list +# +set @query= concat( +"explain select * from t1 where\n", +(select +group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) +from seq_1_to_120) +); +set optimizer_trace=1; +prepare s from @query; +execute s; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL key1,key2 NULL NULL NULL 3 Using where +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); +# Observe that there are NO index_merge_union candidates. 
Only one potential range scan: +select json_pretty(json_search(@trace, 'all', 'key1')); +json_pretty(json_search(@trace, 'all', 'key1')) +[ + "$[0].potential_range_indexes[0].index", + "$[0].analyzing_range_alternatives.range_scan_alternatives[0].index" +] +drop table t1; diff -Nru mariadb-10.11.11/mysql-test/main/range_notembedded.test mariadb-10.11.13/mysql-test/main/range_notembedded.test --- mariadb-10.11.11/mysql-test/main/range_notembedded.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/range_notembedded.test 2025-05-19 16:14:24.000000000 +0000 @@ -162,3 +162,51 @@ SELECT id FROM t1 WHERE id IS NULL OR id NOT BETWEEN 1 AND 4; DROP TABLE t1; +--echo # +--echo # MDEV-34620: Many index_merge variants made and discarded for a big OR +--echo # + +CREATE TABLE t1 ( + a1 int NOT NULL, + a2 int NOT NULL, + filler char(100), + KEY key1 (a1,a2), + KEY key2 (a2,a1) +); +insert into t1 (a1,a2) values (1,1),(2,2),(3,3); + + +set @query= concat( + "explain select * from t1 where\n", + (select + group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) + from seq_1_to_30) + ); + +set optimizer_trace=1; +prepare s from @query; +execute s; +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); + +--echo # Observe that "key1" is a a part of several index_merge_union: +select json_pretty(json_search(@trace, 'all', 'key1')); + +--echo # +--echo # Now, same as above but for a long IN-list +--echo # +set @query= concat( + "explain select * from t1 where\n", + (select + group_concat(concat("a1=", seq, " and a2=", seq, " ") separator "\nor " ) + from seq_1_to_120) + ); + +set optimizer_trace=1; +prepare s from @query; +execute s; +set @trace=json_extract((select trace from information_schema.optimizer_trace), '$**.range_analysis'); + +--echo # Observe that there are NO index_merge_union candidates. 
Only one potential range scan: +select json_pretty(json_search(@trace, 'all', 'key1')); +drop table t1; + diff -Nru mariadb-10.11.11/mysql-test/main/secondary_key_costs.result mariadb-10.11.13/mysql-test/main/secondary_key_costs.result --- mariadb-10.11.11/mysql-test/main/secondary_key_costs.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/secondary_key_costs.result 2025-05-19 16:14:24.000000000 +0000 @@ -177,4 +177,80 @@ drop table t1,t2; set global userstat=@save_userstat; set global innodb_stats_persistent_sample_pages=@save_ispsp; +# +# MDEV-35958: Cost estimates for materialized derived tables are poor +# +set optimizer_trace=1; +create table t1 ( +a int +); +insert into t1 select seq from seq_1_to_10000; +explain +select * +from +t1 as t1_base, +(select a from t1 limit 10000) as TBL; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1_base ALL NULL NULL NULL NULL 10000 +1 PRIMARY ALL NULL NULL NULL NULL 10000 Using join buffer (flat, BNL join) +2 DERIVED t1 ALL NULL NULL NULL NULL 10000 +set @trace=(select trace from information_schema.optimizer_trace); +# BEFORE, without fix_derived_table_read_cost: derived2 has cost=rows=10000 +select json_detailed( +json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') +) as Trace; +Trace +[ + { + "table": "t1_base", + "table_scan": + { + "rows": 10000, + "cost": 19.08984375 + } + }, + { + "table": "", + "table_scan": + { + "rows": 10000, + "cost": 10000 + } + } +] +set optimizer_adjust_secondary_key_costs='fix_derived_table_read_cost'; +explain +select * +from +t1 as t1_base, +(select a from t1 limit 10000) as TBL; +id select_type table type possible_keys key key_len ref rows Extra +1 PRIMARY t1_base ALL NULL NULL NULL NULL 10000 +1 PRIMARY ALL NULL NULL NULL NULL 10000 Using join buffer (flat, BNL join) +2 DERIVED t1 ALL NULL NULL NULL NULL 10000 +set @trace=(select trace from information_schema.optimizer_trace); +# AFTER, with 
fix_derived_table_read_cost: derived2 has more realistic cost +select json_detailed( +json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') +) as Trace; +Trace +[ + { + "table": "t1_base", + "table_scan": + { + "rows": 10000, + "cost": 19.08984375 + } + }, + { + "table": "", + "table_scan": + { + "rows": 10000, + "cost": 501 + } + } +] +drop table t1; set @@optimizer_adjust_secondary_key_costs=default; diff -Nru mariadb-10.11.11/mysql-test/main/secondary_key_costs.test mariadb-10.11.13/mysql-test/main/secondary_key_costs.test --- mariadb-10.11.11/mysql-test/main/secondary_key_costs.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/secondary_key_costs.test 2025-05-19 16:14:24.000000000 +0000 @@ -109,4 +109,41 @@ set global userstat=@save_userstat; set global innodb_stats_persistent_sample_pages=@save_ispsp; +--echo # +--echo # MDEV-35958: Cost estimates for materialized derived tables are poor +--echo # +set optimizer_trace=1; +create table t1 ( + a int +); +insert into t1 select seq from seq_1_to_10000; + +explain +select * +from + t1 as t1_base, + (select a from t1 limit 10000) as TBL; + +set @trace=(select trace from information_schema.optimizer_trace); +--echo # BEFORE, without fix_derived_table_read_cost: derived2 has cost=rows=10000 +select json_detailed( + json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') + ) as Trace; + +set optimizer_adjust_secondary_key_costs='fix_derived_table_read_cost'; + +explain +select * +from + t1 as t1_base, + (select a from t1 limit 10000) as TBL; + +set @trace=(select trace from information_schema.optimizer_trace); +--echo # AFTER, with fix_derived_table_read_cost: derived2 has more realistic cost +select json_detailed( + json_extract(json_extract(@trace, '$**.rows_estimation'), '$[1]') + ) as Trace; + +drop table t1; + set @@optimizer_adjust_secondary_key_costs=default; diff -Nru mariadb-10.11.11/mysql-test/main/skip_grants.result 
mariadb-10.11.13/mysql-test/main/skip_grants.result --- mariadb-10.11.11/mysql-test/main/skip_grants.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/skip_grants.result 2025-05-19 16:14:24.000000000 +0000 @@ -138,6 +138,14 @@ # End of 10.3 tests # # +# MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +# +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; +DROP PROCEDURE p; +DROP FUNCTION f; +# End of 10.5 tests +# # MDEV-24815 Show "--skip-grant-tables" state in SYSTEM VARIABLES # SELECT @@skip_grant_tables AS EXPECT_1; diff -Nru mariadb-10.11.11/mysql-test/main/skip_grants.test mariadb-10.11.13/mysql-test/main/skip_grants.test --- mariadb-10.11.11/mysql-test/main/skip_grants.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/skip_grants.test 2025-05-19 16:14:24.000000000 +0000 @@ -170,6 +170,17 @@ --echo # --echo # +--echo # MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +--echo # +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; + +DROP PROCEDURE p; +DROP FUNCTION f; + +--echo # End of 10.5 tests + +--echo # --echo # MDEV-24815 Show "--skip-grant-tables" state in SYSTEM VARIABLES --echo # diff -Nru mariadb-10.11.11/mysql-test/main/sp-bugs.result mariadb-10.11.13/mysql-test/main/sp-bugs.result --- mariadb-10.11.11/mysql-test/main/sp-bugs.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-bugs.result 2025-05-19 16:14:24.000000000 +0000 @@ -388,5 +388,14 @@ DROP PROCEDURE p2; DROP TABLE t1, t2; # +# MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +# +# This test is a duplicate of the one 
located in the file skip_grants.test +# and placed here to check the same test case against embedded-server +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; +DROP PROCEDURE p; +DROP FUNCTION f; +# # End of 10.5 tests # diff -Nru mariadb-10.11.11/mysql-test/main/sp-bugs.test mariadb-10.11.13/mysql-test/main/sp-bugs.test --- mariadb-10.11.11/mysql-test/main/sp-bugs.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-bugs.test 2025-05-19 16:14:24.000000000 +0000 @@ -415,5 +415,25 @@ DROP TABLE t1, t2; --echo # +--echo # MDEV-34501: SIGSEGV in pfs_start_mutex_wait_v1, __strlen_avx2, or __strlen_evex from safe_mutex_lock on CREATE DEFINER when using skip-grant-tables +--echo # +--echo # This test is a duplicate of the one located in the file skip_grants.test +--echo # and placed here to check the same test case against embedded-server + +# Disable warnings before running the following CREATE PROCEDURE/FUNCTION +# statement since the warning message +# "The user specified as a definer ('a'@'%') does not exist" +# is output in case the test be run against a regular server +# and isn't output if embedded server is used (@sa sp_process_definer() +# in sql_parse.cc). 
+--disable_warnings +CREATE DEFINER=a PROCEDURE p() SELECT 1; +CREATE DEFINER=a FUNCTION f() RETURNS INT RETURN 100; +--enable_warnings + +DROP PROCEDURE p; +DROP FUNCTION f; + +--echo # --echo # End of 10.5 tests --echo # diff -Nru mariadb-10.11.11/mysql-test/main/sp-row.result mariadb-10.11.13/mysql-test/main/sp-row.result --- mariadb-10.11.11/mysql-test/main/sp-row.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-row.result 2025-05-19 16:14:24.000000000 +0000 @@ -2313,3 +2313,44 @@ END; $$ ERROR 21000: Operand should contain 1 column(s) +# Start of 10.6 tests +# +# MDEV-36179 Assertion `0' failed in virtual bool Type_handler_row::Item_save_in_value(THD*, Item*, st_value*) const +# +CREATE PROCEDURE p0 (IN a ROW(a INT,b INT)) +BEGIN +SET a=ROW(0,0); +END; +/ +PREPARE s0 FROM 'CALL p0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' +DROP PROCEDURE p0; +CREATE PROCEDURE p0 (INOUT a ROW(a INT,b INT)) +BEGIN +SET a=ROW(0,0); +END; +/ +PREPARE s0 FROM 'CALL p0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' +DROP PROCEDURE p0; +CREATE PROCEDURE p0 (OUT a ROW(a INT,b INT)) +BEGIN +SET a=ROW(0,0); +END; +/ +PREPARE s0 FROM 'CALL p0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' +DROP PROCEDURE p0; +CREATE FUNCTION f0(a ROW(a INT,b INT)) RETURNS BOOLEAN +BEGIN +RETURN FALSE; +END; +/ +PREPARE s0 FROM 'SELECT f0(?)'; +EXECUTE s0 USING @a; +ERROR HY000: Illegal parameter data type row for operation 'EXECUTE ... USING ?' 
+DROP FUNCTION f0; +# End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/main/sp-row.test mariadb-10.11.13/mysql-test/main/sp-row.test --- mariadb-10.11.11/mysql-test/main/sp-row.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/sp-row.test 2025-05-19 16:14:24.000000000 +0000 @@ -1544,3 +1544,64 @@ END; $$ DELIMITER ;$$ + + +--echo # Start of 10.6 tests + + +--echo # +--echo # MDEV-36179 Assertion `0' failed in virtual bool Type_handler_row::Item_save_in_value(THD*, Item*, st_value*) const +--echo # + +DELIMITER /; +CREATE PROCEDURE p0 (IN a ROW(a INT,b INT)) +BEGIN + SET a=ROW(0,0); +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'CALL p0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP PROCEDURE p0; + + +DELIMITER /; +CREATE PROCEDURE p0 (INOUT a ROW(a INT,b INT)) +BEGIN + SET a=ROW(0,0); +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'CALL p0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP PROCEDURE p0; + + +DELIMITER /; +CREATE PROCEDURE p0 (OUT a ROW(a INT,b INT)) +BEGIN + SET a=ROW(0,0); +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'CALL p0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP PROCEDURE p0; + + +DELIMITER /; +CREATE FUNCTION f0(a ROW(a INT,b INT)) RETURNS BOOLEAN +BEGIN + RETURN FALSE; +END; +/ +DELIMITER ;/ +PREPARE s0 FROM 'SELECT f0(?)'; +--error ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION +EXECUTE s0 USING @a; +DROP FUNCTION f0; + +--echo # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/main/subselect.result mariadb-10.11.13/mysql-test/main/subselect.result --- mariadb-10.11.11/mysql-test/main/subselect.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect.result 2025-05-19 16:14:24.000000000 +0000 @@ -679,22 +679,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both 
as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -702,6 +704,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -711,6 +714,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -727,7 +731,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -795,13 +799,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect.test mariadb-10.11.13/mysql-test/main/subselect.test --- 
mariadb-10.11.11/mysql-test/main/subselect.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect.test 2025-05-19 16:14:24.000000000 +0000 @@ -419,7 +419,6 @@ create table t3 (b int); insert into t2 values (1); insert into t3 values (1),(2); --- error ER_UPDATE_TABLE_USED INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -- error ER_SUBQUERY_NO_1_ROW INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); @@ -454,7 +453,7 @@ insert into t2 values (1); insert into t3 values (1),(2); select * from t1; --- error ER_UPDATE_TABLE_USED +-- error ER_BAD_NULL_ERROR replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -- error ER_SUBQUERY_NO_1_ROW replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); @@ -494,10 +493,13 @@ --disable_prepare_warnings SELECT * FROM t2 WHERE id IN (SELECT 5 UNION SELECT 3); SELECT * FROM t2 WHERE id IN (SELECT 5 UNION SELECT 2); --- error ER_UPDATE_TABLE_USED +-- error ER_SUBQUERY_NO_1_ROW INSERT INTO t2 VALUES ((SELECT * FROM t2)); --- error ER_UPDATE_TABLE_USED +-- error ER_SUBQUERY_NO_1_ROW INSERT INTO t2 VALUES ((SELECT id FROM t2)); +select * from t2; +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_elimination.result mariadb-10.11.13/mysql-test/main/subselect_elimination.result --- mariadb-10.11.11/mysql-test/main/subselect_elimination.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_elimination.result 2025-05-19 16:14:24.000000000 +0000 @@ -136,12 +136,22 @@ # access within null pointer CREATE TABLE x (x INT) ENGINE=InnoDB; INSERT INTO x (x) VALUES (0); +select NULL IN (SELECT (SELECT x FROM (SELECT x FROM +(SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT +(SELECT 0 AS 
x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN +(SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) +AS x) IN (SELECT 0 AS x) AS x FROM x) as exp; +exp +NULL INSERT INTO x (x) VALUES (x IN (SELECT (SELECT x FROM (SELECT x FROM (SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT (SELECT 0 AS x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN (SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) AS x) IN (SELECT 0 AS x) AS x FROM x)); -ERROR HY000: Table 'x' is specified twice, both as a target for 'INSERT' and as a separate source for data +select * from x; +x +0 +NULL DROP TABLE x; # MDEV-28622: Item_subselect eliminated flag set but Item still # evaluated/used. diff -Nru mariadb-10.11.11/mysql-test/main/subselect_elimination.test mariadb-10.11.13/mysql-test/main/subselect_elimination.test --- mariadb-10.11.11/mysql-test/main/subselect_elimination.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_elimination.test 2025-05-19 16:14:24.000000000 +0000 @@ -133,12 +133,17 @@ CREATE TABLE x (x INT) ENGINE=InnoDB; INSERT INTO x (x) VALUES (0); ---error ER_UPDATE_TABLE_USED +select NULL IN (SELECT (SELECT x FROM (SELECT x FROM +(SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT +(SELECT 0 AS x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN +(SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) +AS x) IN (SELECT 0 AS x) AS x FROM x) as exp; INSERT INTO x (x) VALUES (x IN (SELECT (SELECT x FROM (SELECT x FROM (SELECT 0 IN (SELECT x=0 FROM (SELECT x FROM (SELECT (SELECT (SELECT (SELECT (SELECT 0 AS x) FROM x AS x) IN (SELECT 0 AS x) AS x) FROM x AS x) IN (SELECT x WHERE x=0) AS x FROM x AS x) AS x) AS x GROUP BY x) AS x FROM x) AS x) AS x) IN (SELECT 0 AS x) AS x FROM x)); +select * from x; DROP TABLE x; --echo # MDEV-28622: Item_subselect eliminated flag set but Item still diff -Nru 
mariadb-10.11.11/mysql-test/main/subselect_no_exists_to_in.result mariadb-10.11.13/mysql-test/main/subselect_no_exists_to_in.result --- mariadb-10.11.11/mysql-test/main/subselect_no_exists_to_in.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_exists_to_in.result 2025-05-19 16:14:24.000000000 +0000 @@ -683,22 +683,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -706,6 +708,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -715,6 +718,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -731,7 +735,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -799,13 +803,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and 
as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_mat.result mariadb-10.11.13/mysql-test/main/subselect_no_mat.result --- mariadb-10.11.11/mysql-test/main/subselect_no_mat.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_mat.result 2025-05-19 16:14:24.000000000 +0000 @@ -686,22 +686,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -709,6 +711,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -718,6 +721,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -734,7 +738,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT 
a+1 FROM t2)); @@ -802,13 +806,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_opts.result mariadb-10.11.13/mysql-test/main/subselect_no_opts.result --- mariadb-10.11.11/mysql-test/main/subselect_no_opts.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_opts.result 2025-05-19 16:14:24.000000000 +0000 @@ -682,22 +682,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -705,6 +707,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -714,6 +717,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -730,7 +734,7 @@ select * from t1; x y replace into t1 (x, y) VALUES 
((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -798,13 +802,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_scache.result mariadb-10.11.13/mysql-test/main/subselect_no_scache.result --- mariadb-10.11.11/mysql-test/main/subselect_no_scache.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_scache.result 2025-05-19 16:14:24.000000000 +0000 @@ -685,22 +685,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO 
t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -708,6 +710,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -717,6 +720,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -733,7 +737,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -801,13 +805,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/subselect_no_semijoin.result mariadb-10.11.13/mysql-test/main/subselect_no_semijoin.result --- mariadb-10.11.11/mysql-test/main/subselect_no_semijoin.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/subselect_no_semijoin.result 2025-05-19 16:14:24.000000000 +0000 @@ -682,22 +682,24 @@ insert into t2 values (1); insert into t3 values (1),(2); INSERT INTO t1 (x) VALUES ((SELECT x FROM t1)); -ERROR HY000: Table 
't1' is specified twice, both as a target for 'INSERT' and as a separate source for data INSERT INTO t1 (x) VALUES ((SELECT b FROM t3)); ERROR 21000: Subquery returns more than 1 row INSERT INTO t1 (x) VALUES ((SELECT a FROM t2)); select * from t1; x +NULL 1 insert into t2 values (1); INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 INSERT INTO t1 (x) select (SELECT SUM(a)+1 FROM t2) FROM t2; select * from t1; x +NULL 1 2 3 @@ -705,6 +707,7 @@ INSERT INTO t1 (x) select (SELECT SUM(x)+2 FROM t1) FROM t2; select * from t1; x +NULL 1 2 3 @@ -714,6 +717,7 @@ INSERT DELAYED INTO t1 (x) VALUES ((SELECT SUM(a) FROM t2)); select * from t1; x +NULL 1 2 3 @@ -730,7 +734,7 @@ select * from t1; x y replace into t1 (x, y) VALUES ((SELECT x FROM t1), (SELECT a+1 FROM t2)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 23000: Column 'x' cannot be null replace into t1 (x, y) VALUES ((SELECT a FROM t3), (SELECT a+1 FROM t2)); ERROR 21000: Subquery returns more than 1 row replace into t1 (x, y) VALUES ((SELECT a FROM t2), (SELECT a+1 FROM t2)); @@ -798,13 +802,21 @@ id 2 INSERT INTO t2 VALUES ((SELECT * FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row INSERT INTO t2 VALUES ((SELECT id FROM t2)); -ERROR HY000: Table 't2' is specified twice, both as a target for 'INSERT' and as a separate source for data +ERROR 21000: Subquery returns more than 1 row +select * from t2; +id +1 +2 +INSERT INTO t2 VALUES ((SELECT count(*) FROM t2)); +INSERT INTO t2 VALUES ((SELECT max(id) FROM t2)); SELECT * FROM t2; id 1 2 +2 +2 CREATE TABLE t1 (id int(11) default NULL, KEY id (id)) ENGINE=MyISAM CHARSET=latin1; INSERT INTO t1 values (1),(1); UPDATE t2 SET id=(SELECT * FROM t1); diff -Nru mariadb-10.11.11/mysql-test/main/temp_table_frm.result 
mariadb-10.11.13/mysql-test/main/temp_table_frm.result --- mariadb-10.11.11/mysql-test/main/temp_table_frm.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/temp_table_frm.result 2025-05-19 16:14:24.000000000 +0000 @@ -25,3 +25,9 @@ set @@use_stat_tables= @save_use_stat_tables; set @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; drop table t1; +# +# MDEV-36138 Server null-pointer crash at startup when tmptables left in --tmpdir +# +create table t1 (c int); +drop table t1; +# restart diff -Nru mariadb-10.11.11/mysql-test/main/temp_table_frm.test mariadb-10.11.13/mysql-test/main/temp_table_frm.test --- mariadb-10.11.11/mysql-test/main/temp_table_frm.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/temp_table_frm.test 2025-05-19 16:14:24.000000000 +0000 @@ -24,4 +24,15 @@ from information_schema.session_status join t1 using (variable_name); set @@use_stat_tables= @save_use_stat_tables; set @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; -drop table t1; \ No newline at end of file +drop table t1; + +--echo # +--echo # MDEV-36138 Server null-pointer crash at startup when tmptables left in --tmpdir +--echo # + +create table t1 (c int); +let $MYSQLD_TMPDIR=`SELECT @@tmpdir`; +let $MYSQLD_DATADIR=`SELECT @@datadir`; +--copy_file $MYSQLD_DATADIR/test/t1.frm $MYSQLD_TMPDIR/#sqlt1.frm +drop table t1; +--source include/restart_mysqld.inc diff -Nru mariadb-10.11.11/mysql-test/main/timezone.test mariadb-10.11.13/mysql-test/main/timezone.test --- mariadb-10.11.11/mysql-test/main/timezone.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/timezone.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,7 +8,7 @@ enable_query_log; # The following is because of daylight saving time ---replace_result MEST CET MET CET +--replace_result MEST CET MET CET CEST CET show variables like "system_time_zone"; --echo # diff -Nru 
mariadb-10.11.11/mysql-test/main/trigger_null.result mariadb-10.11.13/mysql-test/main/trigger_null.result --- mariadb-10.11.11/mysql-test/main/trigger_null.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/trigger_null.result 2025-05-19 16:14:24.000000000 +0000 @@ -399,4 +399,19 @@ Warning 1364 Field 'c5' doesn't have a default value drop table t1; set sql_mode=default; +# +# MDEV-36026 Problem with INSERT SELECT on NOT NULL columns while having BEFORE UPDATE trigger +# +create table t1 (b int(11) not null); +create trigger t1bu before update on t1 for each row begin end; +insert t1 (b) select 1 union select 2; +create trigger trgi before insert on t1 for each row set new.b=ifnull(new.b,10); +insert t1 (b) select NULL union select 11; +select * from t1; +b +1 +2 +10 +11 +drop table t1; # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/trigger_null.test mariadb-10.11.13/mysql-test/main/trigger_null.test --- mariadb-10.11.11/mysql-test/main/trigger_null.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/trigger_null.test 2025-05-19 16:14:24.000000000 +0000 @@ -425,4 +425,15 @@ drop table t1; set sql_mode=default; +--echo # +--echo # MDEV-36026 Problem with INSERT SELECT on NOT NULL columns while having BEFORE UPDATE trigger +--echo # +create table t1 (b int(11) not null); +create trigger t1bu before update on t1 for each row begin end; +insert t1 (b) select 1 union select 2; +create trigger trgi before insert on t1 for each row set new.b=ifnull(new.b,10); +insert t1 (b) select NULL union select 11; +select * from t1; +drop table t1; + --echo # End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/main/type_binary.result mariadb-10.11.13/mysql-test/main/type_binary.result --- mariadb-10.11.11/mysql-test/main/type_binary.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_binary.result 2025-05-19 16:14:24.000000000 +0000 @@ -397,3 +397,61 @@ DROP TABLE t2; DROP 
TABLE t1; SET note_verbosity=DEFAULT; +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2); +SELECT HEX(c1) FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +HEX(c1) +31000000000000000000000000000000 +32000000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +HEX(c1) +31000000000000000000000000000000 +32000000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +SELECT HEX(c1) FROM t1 WHERE '#' BETWEEN c1 AND 0; +HEX(c1) +2D310000000000000000000000000000 +2D320000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' 
+Warning 1292 Truncated incorrect DECIMAL value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +HEX(c1) +2D320000000000000000000000000000 +2D310000000000000000000000000000 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '-2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '-1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_binary.test mariadb-10.11.13/mysql-test/main/type_binary.test --- mariadb-10.11.11/mysql-test/main/type_binary.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_binary.test 2025-05-19 16:14:24.000000000 +0000 @@ -178,3 +178,14 @@ --source unusable_keys_joins.inc DROP TABLE t1; SET note_verbosity=DEFAULT; + +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +CREATE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES 
(-2),(-1),(1),(2); +SELECT HEX(c1) FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +SELECT HEX(c1) FROM t1 WHERE '#' BETWEEN c1 AND 0; +SELECT HEX(c1) FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_blob.result mariadb-10.11.13/mysql-test/main/type_blob.result --- mariadb-10.11.11/mysql-test/main/type_blob.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_blob.result 2025-05-19 16:14:24.000000000 +0000 @@ -1419,3 +1419,193 @@ DROP TABLE t2; DROP TABLE t1; SET note_verbosity=DEFAULT; +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))); +INSERT INTO t1 (c1) VALUES (1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE 3 BETWEEN 10*POW(-1,c1) AND (c1); +c1 +3 +5 +SELECT c1 FROM t1 WHERE 'a' BETWEEN 10*POW(-1,c1) AND (c1); +c1 +1 +3 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 
+3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 
0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +ALTER TABLE t1 engine=myisam; +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1)) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +3 +4 +5 +Warnings: +Warning 1292 Truncated 
incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_blob.test mariadb-10.11.13/mysql-test/main/type_blob.test --- mariadb-10.11.11/mysql-test/main/type_blob.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_blob.test 2025-05-19 16:14:24.000000000 +0000 @@ -808,3 +808,48 @@ --source unusable_keys_joins.inc DROP TABLE t1; SET note_verbosity=DEFAULT; + +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +# myisam has a special optimization for tables with one row +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (1); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +DROP TABLE t1; + +# This case shows that we don't transform the entire WHERE clause +# into a range condition. 
+CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))); +INSERT INTO t1 (c1) VALUES (1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE 3 BETWEEN 10*POW(-1,c1) AND (c1); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 10*POW(-1,c1) AND (c1); +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=myisam; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB NOT NULL); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1(2))) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +ALTER TABLE t1 engine=myisam; +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; + +CREATE TABLE t1 (c1 TINYBLOB, UNIQUE (c1)) engine=innodb; +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2),(3),(4),(5); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_num_innodb.result mariadb-10.11.13/mysql-test/main/type_num_innodb.result --- mariadb-10.11.11/mysql-test/main/type_num_innodb.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_num_innodb.result 2025-05-19 16:14:24.000000000 +0000 @@ -46,23 +46,70 @@ SELECT * FROM t1,t2 WHERE a=d; a b c pk d e Warnings: -Warning 1292 Truncated incorrect DECIMAL value: 'd' -Warning 1292 Truncated incorrect DECIMAL value: 'd' -Warning 1292 Truncated incorrect DECIMAL value: 'f' -Warning 1292 Truncated incorrect DECIMAL value: 'f' -Warning 1292 Truncated incorrect DECIMAL value: 'g' -Warning 1292 
Truncated incorrect DECIMAL value: 'k' -Warning 1292 Truncated incorrect DECIMAL value: 'm' -Warning 1292 Truncated incorrect DECIMAL value: 'm' -Warning 1292 Truncated incorrect DECIMAL value: 'm' -Warning 1292 Truncated incorrect DECIMAL value: 'o' -Warning 1292 Truncated incorrect DECIMAL value: 'q' -Warning 1292 Truncated incorrect DECIMAL value: 'r' -Warning 1292 Truncated incorrect DECIMAL value: 'u' -Warning 1292 Truncated incorrect DECIMAL value: 'w' -Warning 1292 Truncated incorrect DECIMAL value: 'x' -Warning 1292 Truncated incorrect DECIMAL value: 'x' -Warning 1292 Truncated incorrect DECIMAL value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated 
incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' ALTER TABLE t1 MODIFY a DOUBLE; SELECT * FROM 
t1,t2 WHERE a=d; a b c pk d e @@ -84,6 +131,53 @@ Warning 1292 Truncated incorrect DOUBLE value: 'x' Warning 1292 Truncated incorrect DOUBLE value: 'x' Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' +Warning 1292 Truncated incorrect DOUBLE value: 'w' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'x' +Warning 1292 Truncated incorrect DOUBLE value: 'y' +Warning 1292 Truncated 
incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'd' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'f' +Warning 1292 Truncated incorrect DOUBLE value: 'g' +Warning 1292 Truncated incorrect DOUBLE value: 'k' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'm' +Warning 1292 Truncated incorrect DOUBLE value: 'o' +Warning 1292 Truncated incorrect DOUBLE value: 'q' +Warning 1292 Truncated incorrect DOUBLE value: 'r' +Warning 1292 Truncated incorrect DOUBLE value: 'u' DROP TABLE t1,t2; # # End of 10.2 tests diff -Nru mariadb-10.11.11/mysql-test/main/type_varbinary.result mariadb-10.11.13/mysql-test/main/type_varbinary.result --- mariadb-10.11.11/mysql-test/main/type_varbinary.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_varbinary.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,42 @@ +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE TABLE t1 (c1 VARBINARY(10), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +c1 +1 +2 +Warnings: +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +Warning 1292 Truncated incorrect DOUBLE value: 'a' +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +c1 +-1 +-2 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 
1292 Truncated incorrect DECIMAL value: '#' +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +c1 +-2 +-1 +Warnings: +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +Warning 1292 Truncated incorrect DECIMAL value: '#' +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/type_varbinary.test mariadb-10.11.13/mysql-test/main/type_varbinary.test --- mariadb-10.11.11/mysql-test/main/type_varbinary.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/type_varbinary.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,10 @@ +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +CREATE TABLE t1 (c1 VARBINARY(10), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (-2),(-1),(1),(2); +SELECT c1 FROM t1 WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE 'a' BETWEEN 0 AND (c1); +SELECT c1 FROM t1 WHERE '#' BETWEEN c1 AND 0; +SELECT c1 FROM t1 IGNORE KEY(c1) WHERE '#' BETWEEN c1 AND 0; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/main/update.result mariadb-10.11.13/mysql-test/main/update.result --- mariadb-10.11.11/mysql-test/main/update.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/update.result 2025-05-19 16:14:24.000000000 +0000 @@ -765,3 +765,83 @@ u xxb drop table t1; # End of MariaDB 10.4 tests +# +# MDEV-35955 Wrong result for UPDATE ... 
ORDER BY LIMIT which uses tmp.table +# +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +insert into t1 (id, v) values (2,3),(1,4); +insert into t2 (id, v) values (5,5),(6,6); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +id v id v +1 4 5 5 +1 4 6 6 +UPDATE t1, t2 SET t1.v=-1, t2.v=-1 ORDER BY t1.id, t2.id LIMIT 2; +select * from t1; +id v +2 3 +1 -1 +select * from t2; +id v +5 -1 +6 -1 +drop table t1, t2; +create table t1 (id int primary key, v text) engine=myisam; +create table t2 (id int primary key, v text) engine=myisam; +insert into t1 (id, v) values (1,'b'),(2,'fo'),(3,'bar'),(4,'barr'),(5,'bazzz'); +insert into t2 (id, v) values (6,'quxqux'),(7,'foofoof'),(8,'barbarba'),(9,'quxquxqux'),(10,'bazbazbazb'); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +id v id v +1 b 6 quxqux +1 b 7 foofoof +update t1, t2 set t1.v='DELETED', t2.v='DELETED' order by t1.id, t2.id limit 2; +select * from t1; +id v +1 DELETED +2 fo +3 bar +4 barr +5 bazzz +select * from t2; +id v +6 DELETED +7 DELETED +8 barbarba +9 quxquxqux +10 bazbazbazb +drop table t1, t2; +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +create table t3 (id int primary key, v int); +insert into t1 (id, v) values (1, 1000), (2, 2000), (3, 3000), (4, 4000), (5, 5000); +insert into t2 (id, v) values (10, 100), (20, 200), (30, 300), (40, 400), (50, 500); +insert into t3 (id, v) values (11, 111), (22, 222), (33, 333), (44, 444), (55, 555); +select t1.*, t2.*, t3.* from t1, t2, t3 order by t1.id, t2.id, t3.id limit 3; +id v id v id v +1 1000 10 100 11 111 +1 1000 10 100 22 222 +1 1000 10 100 33 333 +UPDATE t1, t2, t3 SET t1.v=-1, t2.v=-2, t3.v=-3 ORDER BY t1.id, t2.id, t3.id LIMIT 3; +select * from t1; +id v +1 -1 +2 2000 +3 3000 +4 4000 +5 5000 +select * from t2; +id v +10 -2 +20 200 +30 300 +40 400 +50 500 +select * from t3; +id v +11 -3 +22 -3 +33 -3 +44 444 +55 555 +drop table t1, t2, t3; +# 
End of MariaDB 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/update.test mariadb-10.11.13/mysql-test/main/update.test --- mariadb-10.11.11/mysql-test/main/update.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/update.test 2025-05-19 16:14:24.000000000 +0000 @@ -707,3 +707,43 @@ drop table t1; --echo # End of MariaDB 10.4 tests + +--echo # +--echo # MDEV-35955 Wrong result for UPDATE ... ORDER BY LIMIT which uses tmp.table +--echo # + +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +insert into t1 (id, v) values (2,3),(1,4); +insert into t2 (id, v) values (5,5),(6,6); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +UPDATE t1, t2 SET t1.v=-1, t2.v=-1 ORDER BY t1.id, t2.id LIMIT 2; +select * from t1; +select * from t2; + +drop table t1, t2; +create table t1 (id int primary key, v text) engine=myisam; +create table t2 (id int primary key, v text) engine=myisam; +insert into t1 (id, v) values (1,'b'),(2,'fo'),(3,'bar'),(4,'barr'),(5,'bazzz'); +insert into t2 (id, v) values (6,'quxqux'),(7,'foofoof'),(8,'barbarba'),(9,'quxquxqux'),(10,'bazbazbazb'); +select t1.*, t2.* from t1, t2 order by t1.id, t2.id limit 2; +update t1, t2 set t1.v='DELETED', t2.v='DELETED' order by t1.id, t2.id limit 2; +select * from t1; +select * from t2; + +drop table t1, t2; +create table t1 (id int primary key, v int); +create table t2 (id int primary key, v int); +create table t3 (id int primary key, v int); +insert into t1 (id, v) values (1, 1000), (2, 2000), (3, 3000), (4, 4000), (5, 5000); +insert into t2 (id, v) values (10, 100), (20, 200), (30, 300), (40, 400), (50, 500); +insert into t3 (id, v) values (11, 111), (22, 222), (33, 333), (44, 444), (55, 555); +select t1.*, t2.*, t3.* from t1, t2, t3 order by t1.id, t2.id, t3.id limit 3; +UPDATE t1, t2, t3 SET t1.v=-1, t2.v=-2, t3.v=-3 ORDER BY t1.id, t2.id, t3.id LIMIT 3; +select * from t1; +select * from t2; +select * from t3; + +drop table t1, 
t2, t3; + +--echo # End of MariaDB 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/userstat.result mariadb-10.11.13/mysql-test/main/userstat.result --- mariadb-10.11.11/mysql-test/main/userstat.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/userstat.result 2025-05-19 16:14:24.000000000 +0000 @@ -247,6 +247,11 @@ ERROR 21000: Subquery returns more than 1 row set global userstat= 0; drop function f; -# # End of 10.2 tests # +# MDEV-36586 USER_STATISTICS.BUSY_TIME is in microseconds +# +select distinct busy_time>1e5, cpu_time>1e5 from information_schema.user_statistics; +busy_time>1e5 cpu_time>1e5 +0 0 +# End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/userstat.test mariadb-10.11.13/mysql-test/main/userstat.test --- mariadb-10.11.11/mysql-test/main/userstat.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/userstat.test 2025-05-19 16:14:24.000000000 +0000 @@ -135,6 +135,11 @@ drop function f; --enable_ps2_protocol ---echo # --echo # End of 10.2 tests + --echo # +--echo # MDEV-36586 USER_STATISTICS.BUSY_TIME is in microseconds +--echo # +select distinct busy_time>1e5, cpu_time>1e5 from information_schema.user_statistics; + +--echo # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/main/view.result mariadb-10.11.13/mysql-test/main/view.result --- mariadb-10.11.11/mysql-test/main/view.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view.result 2025-05-19 16:14:24.000000000 +0000 @@ -944,31 +944,19 @@ create view v2 as select * from v1; create view v3 as select v2.col1 from v2,t2 where v2.col1 = t2.col1; insert into v2 values ((select max(col1) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'v2' insert into t1 values ((select max(col1) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 't1' insert into v2 values ((select max(col1) from v1)); -ERROR HY000: The definition of 
table 'v1' prevents operation INSERT on table 'v2' insert into v2 values ((select max(col1) from t1)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v2' insert into t1 values ((select max(col1) from t1)); -ERROR HY000: Table 't1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v2 values ((select max(col1) from t1)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v2' insert into v2 values ((select max(col1) from v2)); -ERROR HY000: Table 'v2' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into t1 values ((select max(col1) from v2)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 't1' insert into v2 values ((select max(col1) from v2)); -ERROR HY000: Table 'v2' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into v3 (col1) values ((select max(col1) from v1)); -ERROR HY000: The definition of table 'v1' prevents operation INSERT on table 'v3' insert into v3 (col1) values ((select max(col1) from t1)); -ERROR HY000: The definition of table 'v3' prevents operation INSERT on table 'v3' insert into v3 (col1) values ((select max(col1) from v2)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v3' -insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2)); -ERROR HY000: The definition of table 'v2' prevents operation INSERT on table 'v3' +insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2 LIMIT 1)); +ERROR 22003: Out of range value for column 'col1' at row 3 insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); insert into t3 values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); ERROR 23000: Column 'col1' cannot be null @@ -978,6 +966,18 @@ select * from t1; col1 NULL +NULL +NULL +NULL +NULL +NULL +NULL 
+NULL +NULL +NULL +NULL +NULL +NULL 1 2 3 @@ -1332,9 +1332,26 @@ insert into v3 values (30); ERROR HY000: The target table v3 of the INSERT is not insertable-into create view v4 as select * from v2 where 20 < (select (s1) from t1); +select * from t1; +s1 insert into v4 values (30); -ERROR HY000: The target table v4 of the INSERT is not insertable-into -drop view v4, v3, v2, v1; +select * from t1; +s1 +30 +create view v5 as select * from v2 where s1 < (select min(s1) from t1) WITH CHECK OPTION; +# can't insert only less then minimum +insert into v5 values (40); +ERROR 44000: CHECK OPTION failed `test`.`v5` +# allow insert the new minimum +insert into v5 values (10); +# always emply view (can't be something less than minimum) +select * from v5; +s1 +select * from t1; +s1 +30 +10 +drop view v5, v4, v3, v2, v1; drop table t1; create table t1 (a int); create view v1 as select * from t1; diff -Nru mariadb-10.11.11/mysql-test/main/view.test mariadb-10.11.13/mysql-test/main/view.test --- mariadb-10.11.11/mysql-test/main/view.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view.test 2025-05-19 16:14:24.000000000 +0000 @@ -866,33 +866,21 @@ create view v1 as select * from t1; create view v2 as select * from v1; create view v3 as select v2.col1 from v2,t2 where v2.col1 = t2.col1; --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into t1 values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from t1)); --- error ER_UPDATE_TABLE_USED insert into t1 values ((select max(col1) from t1)); --- error ER_VIEW_PREVENT_UPDATE insert into v2 values ((select max(col1) from t1)); --- error ER_UPDATE_TABLE_USED insert into v2 values ((select max(col1) from v2)); --- error ER_VIEW_PREVENT_UPDATE insert into t1 values ((select max(col1) from v2)); --- 
error ER_UPDATE_TABLE_USED insert into v2 values ((select max(col1) from v2)); --- error ER_VIEW_PREVENT_UPDATE insert into v3 (col1) values ((select max(col1) from v1)); --- error ER_VIEW_PREVENT_UPDATE insert into v3 (col1) values ((select max(col1) from t1)); --- error ER_VIEW_PREVENT_UPDATE insert into v3 (col1) values ((select max(col1) from v2)); # check with TZ tables in list --- error ER_VIEW_PREVENT_UPDATE -insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2)); +--error ER_WARN_DATA_OUT_OF_RANGE +insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from v2 LIMIT 1)); insert into v3 (col1) values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); -- error ER_BAD_NULL_ERROR insert into t3 values ((select CONVERT_TZ('20050101000000','UTC','MET') from t2)); @@ -1210,9 +1198,19 @@ -- error ER_NON_INSERTABLE_TABLE insert into v3 values (30); create view v4 as select * from v2 where 20 < (select (s1) from t1); --- error ER_NON_INSERTABLE_TABLE +select * from t1; insert into v4 values (30); -drop view v4, v3, v2, v1; +select * from t1; +create view v5 as select * from v2 where s1 < (select min(s1) from t1) WITH CHECK OPTION; +--echo # can't insert only less then minimum +--error ER_VIEW_CHECK_FAILED +insert into v5 values (40); +--echo # allow insert the new minimum +insert into v5 values (10); +--echo # always emply view (can't be something less than minimum) +select * from v5; +select * from t1; +drop view v5, v4, v3, v2, v1; drop table t1; # diff -Nru mariadb-10.11.11/mysql-test/main/view_grant.result mariadb-10.11.13/mysql-test/main/view_grant.result --- mariadb-10.11.11/mysql-test/main/view_grant.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view_grant.result 2025-05-19 16:14:24.000000000 +0000 @@ -1982,6 +1982,52 @@ DROP VIEW v1; DROP USER foo; DROP USER FOO; +# +# MDEV-36380: User has unauthorized access to a sequence through +# a view with security 
invoker +# +create database db; +use db; +create sequence s; +create sql security invoker view vin as select nextval(s); +create sql security definer view vdn as select nextval(s); +create sql security invoker view vil as select lastval(s); +create sql security definer view vdl as select lastval(s); +create sql security invoker view vis as select setval(s,20); +create sql security definer view vds as select setval(s,30); +create user u@localhost; +grant select on db.vin to u@localhost; +grant select on db.vdn to u@localhost; +grant select on db.vil to u@localhost; +grant select on db.vdl to u@localhost; +grant select on db.vis to u@localhost; +grant select on db.vds to u@localhost; +connect con1,localhost,u,,db; +select nextval(s); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `db`.`s` +select * from vin; +ERROR HY000: View 'db.vin' references invalid table(s) or column(s) or function(s) or definer/invoker of view lack rights to use them +select * from vdn; +nextval(s) +1 +select lastval(s); +ERROR 42000: SELECT command denied to user 'u'@'localhost' for table `db`.`s` +select * from vil; +ERROR HY000: View 'db.vil' references invalid table(s) or column(s) or function(s) or definer/invoker of view lack rights to use them +select * from vdl; +lastval(s) +1 +select setval(s,10); +ERROR 42000: INSERT command denied to user 'u'@'localhost' for table `db`.`s` +select * from vis; +ERROR HY000: View 'db.vis' references invalid table(s) or column(s) or function(s) or definer/invoker of view lack rights to use them +select * from vds; +setval(s,30) +30 +disconnect con1; +connection default; +drop database db; +drop user u@localhost; # End of 10.5 tests # Check that a user without access to the schema 'foo' cannot query # a JSON_TABLE view in that schema. 
diff -Nru mariadb-10.11.11/mysql-test/main/view_grant.test mariadb-10.11.13/mysql-test/main/view_grant.test --- mariadb-10.11.11/mysql-test/main/view_grant.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/main/view_grant.test 2025-05-19 16:14:24.000000000 +0000 @@ -2237,6 +2237,53 @@ DROP USER foo; DROP USER FOO; +--echo # +--echo # MDEV-36380: User has unauthorized access to a sequence through +--echo # a view with security invoker +--echo # +create database db; +use db; +create sequence s; +create sql security invoker view vin as select nextval(s); +create sql security definer view vdn as select nextval(s); +create sql security invoker view vil as select lastval(s); +create sql security definer view vdl as select lastval(s); +create sql security invoker view vis as select setval(s,20); +create sql security definer view vds as select setval(s,30); +create user u@localhost; +grant select on db.vin to u@localhost; +grant select on db.vdn to u@localhost; +grant select on db.vil to u@localhost; +grant select on db.vdl to u@localhost; +grant select on db.vis to u@localhost; +grant select on db.vds to u@localhost; + +--connect (con1,localhost,u,,db) +--error ER_TABLEACCESS_DENIED_ERROR +select nextval(s); +--error ER_VIEW_INVALID +select * from vin; +--disable_ps2_protocol +select * from vdn; +--enable_ps2_protocol + +--error ER_TABLEACCESS_DENIED_ERROR +select lastval(s); +--error ER_VIEW_INVALID +select * from vil; +select * from vdl; + +--error ER_TABLEACCESS_DENIED_ERROR +select setval(s,10); +--error ER_VIEW_INVALID +select * from vis; +select * from vds; + +--disconnect con1 +--connection default +drop database db; +drop user u@localhost; + --echo # End of 10.5 tests --echo # Check that a user without access to the schema 'foo' cannot query diff -Nru mariadb-10.11.11/mysql-test/mariadb-test-run.pl mariadb-10.11.13/mysql-test/mariadb-test-run.pl --- mariadb-10.11.11/mysql-test/mariadb-test-run.pl 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/mariadb-test-run.pl 2025-05-19 16:14:24.000000000 +0000 @@ -130,6 +130,8 @@ our $path_current_testlog; our $path_testlog; +our $opt_open_files_limit; + our $default_vardir; our $opt_vardir; # Path to use for var/ dir our $plugindir; @@ -268,6 +270,9 @@ our $opt_skip_not_found= 0; our $opt_mem= $ENV{'MTR_MEM'}; our $opt_clean_vardir= $ENV{'MTR_CLEAN_VARDIR'}; +our $opt_catalogs= 0; +our $opt_catalog_name=""; +our $catalog_name="def"; our $opt_gcov; our $opt_gprof; @@ -1274,6 +1279,7 @@ 'list-options' => \$opt_list_options, 'skip-test-list=s' => \@opt_skip_test_list, 'xml-report=s' => \$opt_xml_report, + 'open-files-limit=i', => \$opt_open_files_limit, My::Debugger::options(), My::CoreDump::options(), @@ -2223,6 +2229,9 @@ { $ENV{'MYSQL_INSTALL_DB_EXE'}= mtr_exe_exists("$bindir/sql$multiconfig/mariadb-install-db", "$bindir/bin/mariadb-install-db"); + $ENV{'MARIADB_UPGRADE_SERVICE_EXE'}= mtr_exe_exists("$bindir/sql$multiconfig/mariadb-upgrade-service", + "$bindir/bin/mariadb-upgrade-service"); + $ENV{'MARIADB_UPGRADE_EXE'}= mtr_exe_exists("$path_client_bindir/mariadb-upgrade"); } my $client_config_exe= @@ -3945,6 +3954,23 @@ } } + # Set up things for catalogs + # The values of MARIADB_TOPDIR and MARIAD_DATADIR should + # be taken from the values used by the default (first) + # connection that is used by mariadb-test. + my ($mysqld, @servers); + @servers= all_servers(); + $mysqld= $servers[0]; + $ENV{'MARIADB_TOPDIR'}= $mysqld->value('datadir'); + if (!$opt_catalogs) + { + $ENV{'MARIADB_DATADIR'}= $mysqld->value('datadir'); + } + else + { + $ENV{'MARIADB_DATADIR'}= $mysqld->value('datadir') . "/" . 
$catalog_name; + } + # Write start of testcase to log mark_log($path_current_testlog, $tinfo); @@ -4458,14 +4484,13 @@ ( @global_suppressions, qr/error .*connecting to master/, - qr/InnoDB: Error: in ALTER TABLE `test`.`t[12]`/, - qr/InnoDB: Error: table `test`.`t[12]` .*does not exist in the InnoDB internal/, - qr/InnoDB: Warning: a long semaphore wait:/, qr/InnoDB: Dumping buffer pool.*/, qr/InnoDB: Buffer pool.*/, qr/InnoDB: Could not free any blocks in the buffer pool!/, - qr/InnoDB: Warning: Writer thread is waiting this semaphore:/, qr/InnoDB: innodb_open_files .* should not be greater than/, + qr/InnoDB: Trying to delete tablespace.*but there are.*pending/, + qr/InnoDB: Tablespace 1[0-9]* was not found at .*, and innodb_force_recovery was set/, + qr/InnoDB: Long wait \([0-9]+ seconds\) for double-write buffer flush/, qr/Slave: Unknown table 't1' .* 1051/, qr/Slave SQL:.*(Internal MariaDB error code: [[:digit:]]+|Query:.*)/, qr/slave SQL thread aborted/, @@ -5745,6 +5770,7 @@ append => 1, error => $path_current_testlog, verbose => $opt_verbose, + open_files_limit => $opt_open_files_limit, ); mtr_verbose("Started $proc"); return $proc; @@ -6043,6 +6069,8 @@ timediff With --timestamp, also print time passed since *previous* test started max-connections=N Max number of open connection to server in mysqltest + open-files-limit=N Max number of open files allowed for any of the children + of my_safe_process. Default is 1024. report-times Report how much time has been spent on different phases of test execution. 
stress=ARGS Run stress test, providing options to diff -Nru mariadb-10.11.11/mysql-test/std_data/galera_certs/galera.root.crt mariadb-10.11.13/mysql-test/std_data/galera_certs/galera.root.crt --- mariadb-10.11.11/mysql-test/std_data/galera_certs/galera.root.crt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/std_data/galera_certs/galera.root.crt 2025-05-19 16:14:24.000000000 +0000 @@ -2,7 +2,7 @@ MIIFlTCCA32gAwIBAgIUKCF88W+48rZzdfgYpE2dXVMGSKgwDQYJKoZIhvcNAQEL BQAwWjELMAkGA1UEBhMCRkkxETAPBgNVBAgMCEhlbHNpbmtpMREwDwYDVQQHDAhI ZWxzaW5raTEPMA0GA1UECgwGR2FsZXJhMRQwEgYDVQQDDAtnYWxlcmEucm9vdDAe -Fw0yMTAyMDQxMzE3MDJaFw0yMzExMjUxMzE3MDJaMFoxCzAJBgNVBAYTAkZJMREw +Fw0yMzEyMDExMzQzNDBaFw0zMzExMjgxMzQzNDBaMFoxCzAJBgNVBAYTAkZJMREw DwYDVQQIDAhIZWxzaW5raTERMA8GA1UEBwwISGVsc2lua2kxDzANBgNVBAoMBkdh bGVyYTEUMBIGA1UEAwwLZ2FsZXJhLnJvb3QwggIiMA0GCSqGSIb3DQEBAQUAA4IC DwAwggIKAoICAQDKqL45jbaq8RLOj+DeilPcEnBN5gn/y9V3IfZ0BQCd4bR09zLz @@ -18,15 +18,15 @@ F+XZTdTiaOWPEmvFFGLLUQxKl4w872hJaupqfteqdiZ+3ICVIUI8qnXHmwIDAQAB o1MwUTAdBgNVHQ4EFgQUs75v/MgjJ5RHGE6+0qdiVo4BwlowHwYDVR0jBBgwFoAU s75v/MgjJ5RHGE6+0qdiVo4BwlowDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0B -AQsFAAOCAgEAOVhBs28dwwvD5q2r7oVVcxLc+tb8zu4XxpXT1p6hiZYUyPguCh00 -GVdXCgR4JMI/NcyM5fBAbF3S8oK3+9rw2kW09afVV06Qf/8o3nIyOiDl7598tGIP -CCK4QsUW/dGajx5kvhtQ7qce+u9KfFTof6lq2xkYtFBBhmBdSv9A1jAZJMw2x3bc -nr99PS8XZMphS0MIExHKj6Ry5DdYm722zZHyIEiiEGyMViDm2m1iug5r/LPH5Z56 -BjQiH4VP+0y5mevBOUGuH8ID+J9Hu9BeoXLhkv+W2Ljs/S6wqzjinMBqVG+wwe0Y -a8F5pABkl5uX38nMQ7CikSbLxSbn7nRf+sux1sbzqjMldeCSqiv9mI5Ysq97+Ni1 -5qMxNxNc0u/wGRnrXH8fWfxBKPP5moA7DQfVcUWPgDGQwDpA8kn8RlJxFk3g4yaK -+NMwk5MORKyx3tz/A3Yhs9AUXk3okvmQCT2YVSHcKUB8PAU+TaKqbr3wk07Y/tL/ -jFPHS+t3eD91Y05KGUXjdtGi+33zpV0biHmTWAZT78VQowDNvEpTnXhkSx8HGHYR -nqSMU2m2LboHSatY113RYznx0LJ1azczRlJdGs8oyPWLPDD2JCesZaQqGZVRJoms -lK4EzYEb5mZTCRgtgoiO+iKcf6XifuOCrWZXoLm4FlLEfOQ3b8yAFlo= +AQsFAAOCAgEAKLV6mkWb88HEJXo1XlmAzznIYNfilrvvxwcjhceluDE8s8sPSpYM 
+Bz5ebWlHCgEkC/ezhA/PDtZsZlQKwv4jb++lAlFSlebT1GW77xKkdRBTKgkFAaOA +pF5eZao6IP8l76fA4OoI2Tttw5jeb23kOoklDp/8VS0JEAT3wm/hZiE20aUbAFC+ +kPiCucBztzaTHQud9CgtxRH/B3D9FaPuwae/H6FYrvQVNVjcaHTIUh9fTcyKRXYm +oYbvK7fIhCjZkG2LRWRU9Kirivb+ktO4POsuK4BgYrsFaOBf9HYsojA7llyGDopN +cfw9jtb27Qb/uMKJnClFg14u685CU5JAzY31E5OQPPUUx9PqP4Z9PgXRQ0xI6H/4 +sejlcQuqGCDKiL2lOzUjbT86EjO4ZfiKHR+lKOIuT5mXiR8cbS1JeyX3Mrv1Ds4r +UVcdtSXTy6/XYWFIzhu+MrsFon6VX0HkmSH1HjSoLMOZcHAZIFZZ/uAahLmMNaEG +lV15fD5+t5QRKwqmdFUW2ETiqSJxRs6Y++ptxpiiH38QVWPvBWeRgcPpf3A478Bl +iGO0xn0N57TnhFs3g0C0xyZgTBMozfVostYpps1Tqqz0VOhtmURxTZm9JZgTb7qv +nMURY0SIQKXpHCcJuNtxZcDSu8uxgUcMsLSSC7Zmk7/cSeUfmOgZVzU= -----END CERTIFICATE----- diff -Nru mariadb-10.11.11/mysql-test/suite/archive/archive-big.test mariadb-10.11.13/mysql-test/suite/archive/archive-big.test --- mariadb-10.11.11/mysql-test/suite/archive/archive-big.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/archive/archive-big.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,7 @@ --source include/big_test.inc -# Valgrind is to slow for this test +# Valgrind and msan is to slow for this test --source include/not_valgrind.inc +--source include/not_msan.inc --source include/have_archive.inc CREATE TABLE t1(a BLOB) ENGINE=ARCHIVE; --disable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/README.txt mariadb-10.11.13/mysql-test/suite/atomic/README.txt --- mariadb-10.11.11/mysql-test/suite/atomic/README.txt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/README.txt 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,7 @@ - Add # before --exec echo "restart" ... - Force $e (engine), $c (crash point) and $r (crash position) to the values - where things goes wrong. See comments in alter_table.test for how to do this. + where things goes wrong. See comments in alter_table.inc for how to do this. 
- start mariadbd in a debugger run the following in the debugger diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.inc mariadb-10.11.13/mysql-test/suite/atomic/alter_table.inc --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,198 @@ +--source include/long_test.inc +--source include/have_debug.inc +--source include/have_log_bin.inc + +if (!$BIG_TEST) +{ + --source include/not_valgrind.inc + --source include/not_msan.inc +} + +# +# Testing of atomic create table with crashes in a lot of different places +# +# Things tested: +# With myisam and InnoDB engines to ensure that cover both normal and +# online alter table paths. +# Alter table with new columns +# Alter table which only touches .frm +# Alter table disable keys (has it own code path) +# Alter table with rename +# Alter table with rename and only options that touches .frm +# Alter table with rename and add new columns +# Alter table with storage engine change (with and without column definition +# changes) +# Alter table with storage engine change and rename +# Alter table to another database + +--disable_query_log +call mtr.add_suppression("InnoDB: .* does not exist in the InnoDB internal"); +# Speed up wait_until_connected_again.inc +let NO_WSREP=1; +--enable_query_log +let $MYSQLD_DATADIR= `SELECT @@datadir`; + +create database test2; +RESET MASTER; + +if ($engine_count == "") +{ + let $engine_count=2; + let $engines='myisam','innodb'; +} +if ($extra_engine == "") +{ + let $extra_engine=aria; +} + +let $crash_count=13; +let $crash_points='ddl_log_alter_after_create_frm', 'ddl_log_alter_after_create_table', 'ddl_log_alter_after_prepare_inplace','ddl_log_alter_after_copy', 'ddl_log_alter_after_log', 'ddl_log_alter_after_rename_to_backup', 'ddl_log_alter_after_rename_to_backup_log', 'ddl_log_alter_rename_frm', 
'ddl_log_alter_after_rename_to_original', 'ddl_log_alter_before_rename_triggers', 'ddl_log_alter_after_rename_triggers', 'ddl_log_alter_after_delete_backup', 'ddl_log_alter_after_drop_original_table'; + +let $statement_count=16; +let $statements='ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new"', + 'ALTER TABLE t1 COMMENT "new"', + 'ALTER TABLE t1 change column a c int COMMENT "new"', + 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2', + 'ALTER TABLE t1 disable keys', + 'ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new"', + 'ALTER TABLE t1 rename t2', + 'ALTER TABLE t1 COMMENT "new", rename t2', + 'ALTER TABLE t1 change column a c int COMMENT "new", rename t2', + 'ALTER TABLE t1 ENGINE=$extra_engine, COMMENT "new"', + 'ALTER TABLE t1 change column a c int COMMENT "new", engine=$extra_engine', + 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=$extra_engine', + 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2', + 'ALTER TABLE t1 COMMENT "new", rename test2.t2', + 'ALTER TABLE t1 ADD key(b), COMMENT "new"', + 'ALTER TABLE t1 DROP INDEX a'; + +# If there is a need of testing one specific state (crash point and query), +# one can use the comments below to execute one specific test combination +#let $crash_count=1; +#let $crash_points='ddl_log_alter_after_create_frm'; +#let $statement_count= 1; +#let $statements='ALTER TABLE t1 ADD COLUMN c int, COMMENT "new"'; +#let $engine_count=1; +#let $engines='rocksdb'; +#--source include/have_rocksdb.inc + +let $old_debug=`select @@debug_dbug`; +let $e=0; +let $keep_include_silent=1; +let $grep_script=ALTER; +--disable_query_log + +while ($e < $engine_count) +{ + inc $e; + let $engine=`select ELT($e, $engines)`; + let $default_engine=$engine; + + --echo + --echo engine: $engine + --echo + + let $r=0; + while ($r < $statement_count) + { + inc $r; + let $statement=`select ELT($r, $statements)`; + --echo + --echo query: $statement + --echo + let $c=0; + while ($c < 
$crash_count) + { + inc $c; + let $crash=`select ELT($c, $crash_points)`; + + --eval create table t1 (a int, b int, key(a)) engine=$engine + insert into t1 values (1,1),(2,2); + commit; + flush tables; + + FLUSH BINARY LOGS; + --let $start_binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --echo crash point: $crash + if ($crash_count > 1) + { + --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect + } +# The following can be used for testing one specific failure +# if ($crash == "ddl_log_alter_after_log") +# { +# if ($r == 2) +# { +# --remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +# } +# } + --disable_reconnect + --eval set @@debug_dbug="+d,$crash",@debug_crash_counter=1 + let $errno=0; + --error 0,2013 + --eval $statement; + let $error=$errno; + --enable_reconnect + --source include/wait_until_connected_again.inc + --disable_query_log + --eval set @@debug_dbug="$old_debug" + + if ($error == 0) + { + echo "No crash!"; + } + if ($error != 0) + { + --list_files $MYSQLD_DATADIR/test t* + --list_files $MYSQLD_DATADIR/test *sql* + --list_files $MYSQLD_DATADIR/test2 t* + --list_files $MYSQLD_DATADIR/test2 *sql* + # Check which tables still exists + --error 0,1 + --file_exists $MYSQLD_DATADIR/test/t1.frm + let $error2=$errno; + if ($error2 == 0) + { + show create table t1; + select count(*) from t1; + } + if ($error2 == 1) + { + --error 0,1 + --file_exists $MYSQLD_DATADIR/test/t2.frm + let $error3=$errno; + if ($error3 == 0) + { + show create table t2; + select count(*) from t2; + } + if ($error3 == 1) + { + --echo "Table is in test2" + show create table test2.t2; + select count(*) from test2.t2; + } + } + --let $binlog_file=$start_binlog_file + --let $binlog_output_name=master-bin.000001 + + --source include/show_binlog_events.inc + if ($error) + { + --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_output_name=master-bin.000002 + if ($binlog_file != $start_binlog_file) + { + --source include/show_binlog_events.inc + 
} + } + } + --disable_warnings + drop table if exists t1,t2; + drop table if exists test2.t2; + --enable_warnings + } + } +} +drop database test2; +--enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.opt mariadb-10.11.13/mysql-test/suite/atomic/alter_table.opt --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-max-dirty-pages-pct=0 diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.result mariadb-10.11.13/mysql-test/suite/atomic/alter_table.result --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,3135 +0,0 @@ -create database test2; -RESET MASTER; - -engine: myisam - - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" - -query: ALTER TABLE t1 COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
- -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 
COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_drop_original_table -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT 
NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 - -query: ALTER TABLE t1 disable keys - -crash point: ddl_log_alter_after_create_frm -"No crash!" -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -"No crash!" -crash point: ddl_log_alter_after_log -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -query: ALTER TABLE t1 rename t2 - -crash point: ddl_log_alter_after_create_frm -"No crash!" -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -"No crash!" -crash point: ddl_log_alter_after_log -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_rename_triggers -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_before_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_delete_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_drop_original_table -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 
-crash point: ddl_log_alter_after_rename_to_backup_log -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) 
DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_drop_original_table -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 - -query: ALTER TABLE t1 COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.MYD -t2.MYI -t2.frm -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ADD key(b), COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" - -query: ALTER TABLE t1 DROP INDEX a - -crash point: ddl_log_alter_after_create_frm -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_rename_to_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_delete_backup -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_drop_original_table -t1.MYD -t1.MYI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a - -engine: innodb - - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
- -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 disable keys - -crash point: ddl_log_alter_after_create_frm -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_create_table -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_copy -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_log -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_backup_log -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_rename_frm -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_delete_backup -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -Warnings: -Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option -"No crash!" - -query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" 
-crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" 
-crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" - -query: ALTER TABLE t1 rename t2 - -crash point: ddl_log_alter_after_create_frm -"No crash!" -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -"No crash!" -crash point: ddl_log_alter_after_log -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_rename_triggers -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -Table Create Table -t2 CREATE TABLE `t2` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" 
-crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) 
-2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" - -query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci 
-count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No crash!" -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria -crash point: ddl_log_alter_after_drop_original_table -t1.MAD -t1.MAI -t1.frm -Table Create Table -t1 CREATE TABLE `t1` ( - `c` int(11) DEFAULT NULL COMMENT 'new', - `b` int(11) DEFAULT NULL, - KEY `a` (`c`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_prepare_inplace -"No 
crash!" -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_to_backup_log -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_rename_frm -"No crash!" 
-crash point: ddl_log_alter_after_rename_to_original -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_before_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_rename_triggers -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_delete_backup -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria -crash point: ddl_log_alter_after_drop_original_table -t2.MAD -t2.MAI -t2.frm -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria - -query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" 
-crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - `c` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 COMMENT "new", rename test2.t2 - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" 
-crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_rename_triggers -t2.frm -t2.ibd -"Table is in test2" -Table Create Table -t2 CREATE TABLE `t2` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 -crash point: ddl_log_alter_after_delete_backup -"No crash!" 
-crash point: ddl_log_alter_after_drop_original_table -"No crash!" - -query: ALTER TABLE t1 ADD key(b), COMMENT "new" - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`), - KEY `b` (`b`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
- -query: ALTER TABLE t1 DROP INDEX a - -crash point: ddl_log_alter_after_create_frm -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_create_table -"No crash!" -crash point: ddl_log_alter_after_prepare_inplace -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_copy -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL, - KEY `a` (`a`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -crash point: ddl_log_alter_after_log -t1.frm -t1.ibd -Table Create Table -t1 CREATE TABLE `t1` ( - `a` int(11) DEFAULT NULL, - `b` int(11) DEFAULT NULL -) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -count(*) -2 -master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a -crash point: ddl_log_alter_after_rename_to_backup -"No crash!" -crash point: ddl_log_alter_after_rename_to_backup_log -"No crash!" -crash point: ddl_log_alter_rename_frm -"No crash!" -crash point: ddl_log_alter_after_rename_to_original -"No crash!" -crash point: ddl_log_alter_before_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_rename_triggers -"No crash!" -crash point: ddl_log_alter_after_delete_backup -"No crash!" -crash point: ddl_log_alter_after_drop_original_table -"No crash!" 
diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,198 +0,0 @@ ---source include/have_debug.inc ---source include/have_innodb.inc ---source include/have_log_bin.inc - -if (!$BIG_TEST) -{ - --source include/not_valgrind.inc - --source include/not_msan.inc -} - -# -# Testing of atomic create table with crashes in a lot of different places -# -# Things tested: -# With myisam and InnoDB engines to ensure that cover both normal and -# online alter table paths. -# Alter table with new columns -# Alter table which only touches .frm -# Alter table disable keys (has it own code path) -# Alter table with rename -# Alter table with rename and only options that touches .frm -# Alter table with rename and add new columns -# Alter table with storage engine change (with and without column definition -# changes) -# Alter table with storage engine change and rename -# Alter table to another database - ---disable_query_log -call mtr.add_suppression("InnoDB: .* does not exist in the InnoDB internal"); -# Speed up wait_until_connected_again.inc -let NO_WSREP=1; ---enable_query_log -let $MYSQLD_DATADIR= `SELECT @@datadir`; - -create database test2; -RESET MASTER; - -if ($engine_count == "") -{ - let $engine_count=2; - let $engines='myisam','innodb'; -} -if ($extra_engine == "") -{ - let $extra_engine=aria; -} - -let $crash_count=13; -let $crash_points='ddl_log_alter_after_create_frm', 'ddl_log_alter_after_create_table', 'ddl_log_alter_after_prepare_inplace','ddl_log_alter_after_copy', 'ddl_log_alter_after_log', 'ddl_log_alter_after_rename_to_backup', 'ddl_log_alter_after_rename_to_backup_log', 'ddl_log_alter_rename_frm', 'ddl_log_alter_after_rename_to_original', 'ddl_log_alter_before_rename_triggers', 
'ddl_log_alter_after_rename_triggers', 'ddl_log_alter_after_delete_backup', 'ddl_log_alter_after_drop_original_table'; - -let $statement_count=16; -let $statements='ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new"', - 'ALTER TABLE t1 COMMENT "new"', - 'ALTER TABLE t1 change column a c int COMMENT "new"', - 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2', - 'ALTER TABLE t1 disable keys', - 'ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new"', - 'ALTER TABLE t1 rename t2', - 'ALTER TABLE t1 COMMENT "new", rename t2', - 'ALTER TABLE t1 change column a c int COMMENT "new", rename t2', - 'ALTER TABLE t1 ENGINE=$extra_engine, COMMENT "new"', - 'ALTER TABLE t1 change column a c int COMMENT "new", engine=$extra_engine', - 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=$extra_engine', - 'ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2', - 'ALTER TABLE t1 COMMENT "new", rename test2.t2', - 'ALTER TABLE t1 ADD key(b), COMMENT "new"', - 'ALTER TABLE t1 DROP INDEX a'; - -# If there is a need of testing one specific state (crash point and query), -# one can use the comments below to execute one specific test combination -#let $crash_count=1; -#let $crash_points='ddl_log_alter_after_create_frm'; -#let $statement_count= 1; -#let $statements='ALTER TABLE t1 ADD COLUMN c int, COMMENT "new"'; -#let $engine_count=1; -#let $engines='rocksdb'; -#--source include/have_rocksdb.inc - -let $old_debug=`select @@debug_dbug`; -let $e=0; -let $keep_include_silent=1; -let $grep_script=ALTER; ---disable_query_log - -while ($e < $engine_count) -{ - inc $e; - let $engine=`select ELT($e, $engines)`; - let $default_engine=$engine; - - --echo - --echo engine: $engine - --echo - - let $r=0; - while ($r < $statement_count) - { - inc $r; - let $statement=`select ELT($r, $statements)`; - --echo - --echo query: $statement - --echo - let $c=0; - while ($c < $crash_count) - { - inc $c; - let $crash=`select ELT($c, $crash_points)`; - - --eval create 
table t1 (a int, b int, key(a)) engine=$engine - insert into t1 values (1,1),(2,2); - commit; - flush tables; - - FLUSH BINARY LOGS; - --let $start_binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) - --echo crash point: $crash - if ($crash_count > 1) - { - --exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect - } -# The following can be used for testing one specific failure -# if ($crash == "ddl_log_alter_after_log") -# { -# if ($r == 2) -# { -# --remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -# } -# } - --disable_reconnect - --eval set @@debug_dbug="+d,$crash",@debug_crash_counter=1 - let $errno=0; - --error 0,2013 - --eval $statement; - let $error=$errno; - --enable_reconnect - --source include/wait_until_connected_again.inc - --disable_query_log - --eval set @@debug_dbug="$old_debug" - - if ($error == 0) - { - echo "No crash!"; - } - if ($error != 0) - { - --list_files $MYSQLD_DATADIR/test t* - --list_files $MYSQLD_DATADIR/test *sql* - --list_files $MYSQLD_DATADIR/test2 t* - --list_files $MYSQLD_DATADIR/test2 *sql* - # Check which tables still exists - --error 0,1 - --file_exists $MYSQLD_DATADIR/test/t1.frm - let $error2=$errno; - if ($error2 == 0) - { - show create table t1; - select count(*) from t1; - } - if ($error2 == 1) - { - --error 0,1 - --file_exists $MYSQLD_DATADIR/test/t2.frm - let $error3=$errno; - if ($error3 == 0) - { - show create table t2; - select count(*) from t2; - } - if ($error3 == 1) - { - --echo "Table is in test2" - show create table test2.t2; - select count(*) from test2.t2; - } - } - --let $binlog_file=$start_binlog_file - --let $binlog_output_name=master-bin.000001 - - --source include/show_binlog_events.inc - if ($error) - { - --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) - --let $binlog_output_name=master-bin.000002 - if ($binlog_file != $start_binlog_file) - { - --source include/show_binlog_events.inc - } - } - } - --disable_warnings - drop table if exists t1,t2; - drop table if exists 
test2.t2; - --enable_warnings - } - } -} -drop database test2; ---enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_aria.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_aria.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_aria.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_aria.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,4 +4,4 @@ let $engine_count=1; let $engines='aria'; let $extra_engine=myisam; ---source alter_table.test +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.opt mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.opt --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--innodb-max-dirty-pages-pct=0 diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.result mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.result --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,1396 @@ +create database test2; +RESET MASTER; + +engine: innodb + + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 disable keys + +crash point: ddl_log_alter_after_create_frm +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_create_table +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_copy +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_log +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_backup_log +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_rename_frm +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_delete_backup +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +Warnings: +Note 1031 Storage engine InnoDB of the table `test`.`t1` doesn't have this option +"No crash!" + +query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +query: ALTER TABLE t1 rename t2 + +crash point: ddl_log_alter_after_create_frm +"No crash!" +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +"No crash!" +crash point: ddl_log_alter_after_log +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_rename_triggers +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" 
+crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) 
+2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci 
+count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No 
crash!" +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_before_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_delete_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_drop_original_table +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.frm +t2.ibd +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" 
+crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD key(b), COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" 
+ +query: ALTER TABLE t1 DROP INDEX a + +crash point: ddl_log_alter_after_create_frm +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.frm +t1.ibd +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" 
diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_innodb.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_innodb.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,7 @@ +# +# Test atomic alter table with InnoDB + +--source include/have_innodb.inc +let $engine_count=1; +let $engines='innodb'; +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.result mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.result --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,1741 @@ +create database test2; +RESET MASTER; + +engine: myisam + + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new" + +query: ALTER TABLE t1 COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" 
+ +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 
COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 +crash point: ddl_log_alter_after_drop_original_table +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT 
NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2 + +query: ALTER TABLE t1 disable keys + +crash point: ddl_log_alter_after_create_frm +"No crash!" +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +"No crash!" +crash point: ddl_log_alter_after_log +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, ALGORITHM=copy, COMMENT "new" + +query: ALTER TABLE t1 rename t2 + +crash point: ddl_log_alter_after_create_frm +"No crash!" +crash point: ddl_log_alter_after_create_table +"No crash!" +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +"No crash!" +crash point: ddl_log_alter_after_log +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_rename_triggers +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 change column a c int COMMENT "new", rename t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", rename t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ENGINE=aria, COMMENT "new" + +query: ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_before_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_delete_backup +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria +crash point: ddl_log_alter_after_drop_original_table +t1.MAD +t1.MAI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `c` int(11) DEFAULT NULL COMMENT 'new', + `b` int(11) DEFAULT NULL, + KEY `a` (`c`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 change column a c int COMMENT "new", engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_to_backup_log +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_rename_frm +"No crash!" 
+crash point: ddl_log_alter_after_rename_to_original +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_before_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_rename_triggers +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_delete_backup +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria +crash point: ddl_log_alter_after_drop_original_table +t2.MAD +t2.MAI +t2.frm +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=Aria 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci PAGE_CHECKSUM=1 COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename t2, engine=aria + +query: ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 
+crash point: ddl_log_alter_after_rename_to_backup_log +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) 
DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_drop_original_table +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `c` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD COLUMN c INT, COMMENT "new", rename test2.t2 + +query: ALTER TABLE t1 COMMENT "new", rename test2.t2 + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +"No crash!" 
+crash point: ddl_log_alter_after_prepare_inplace +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_to_backup +"No crash!" +crash point: ddl_log_alter_after_rename_to_backup_log +"No crash!" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +"No crash!" 
+crash point: ddl_log_alter_before_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_rename_triggers +t2.MYD +t2.MYI +t2.frm +"Table is in test2" +Table Create Table +t2 CREATE TABLE `t2` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 COMMENT "new", rename test2.t2 +crash point: ddl_log_alter_after_delete_backup +"No crash!" +crash point: ddl_log_alter_after_drop_original_table +"No crash!" + +query: ALTER TABLE t1 ADD key(b), COMMENT "new" + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`), + KEY `b` (`b`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci COMMENT='new' +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 ADD key(b), COMMENT "new" + +query: ALTER TABLE t1 DROP INDEX a + +crash point: ddl_log_alter_after_create_frm +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_create_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_prepare_inplace +"No crash!" 
+crash point: ddl_log_alter_after_copy +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + KEY `a` (`a`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +crash point: ddl_log_alter_after_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_rename_to_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_rename_to_backup_log +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_rename_frm +"No crash!" +crash point: ddl_log_alter_after_rename_to_original +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_before_rename_triggers +"No crash!" +crash point: ddl_log_alter_after_rename_triggers +"No crash!" 
+crash point: ddl_log_alter_after_delete_backup +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a +crash point: ddl_log_alter_after_drop_original_table +t1.MYD +t1.MYI +t1.frm +Table Create Table +t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +count(*) +2 +master-bin.000002 # Query # # use `test`; ALTER TABLE t1 DROP INDEX a diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_myisam.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,6 @@ +# +# Test atomic alter table with MyISAM + +let $engine_count=1; +let $engines='myisam'; +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_rocksdb.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_rocksdb.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_rocksdb.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_rocksdb.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,4 +3,4 @@ let $engine_count=1; let $engines='rocksdb'; set global rocksdb_flush_log_at_trx_commit=1; ---source alter_table.test +--source alter_table.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/alter_table_trigger.test mariadb-10.11.13/mysql-test/suite/atomic/alter_table_trigger.test --- mariadb-10.11.11/mysql-test/suite/atomic/alter_table_trigger.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/alter_table_trigger.test 2025-05-19 16:14:24.000000000 +0000 
@@ -7,7 +7,7 @@ # # Testing of atomic create table with crashes in a lot of different places # -# This is very similar to the alter_table.test, but includes testing of +# This is very similar to the alter_table.inc, but includes testing of # triggers in with ALTER TABLE .. RENAME. # diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/create_table.test mariadb-10.11.13/mysql-test/suite/atomic/create_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/create_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/create_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_debug.inc --source include/have_sequence.inc --source include/have_innodb.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/drop_table.test mariadb-10.11.13/mysql-test/suite/atomic/drop_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/drop_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/drop_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_debug.inc --source include/have_innodb.inc --source include/have_csv.inc diff -Nru mariadb-10.11.11/mysql-test/suite/atomic/rename_table.test mariadb-10.11.13/mysql-test/suite/atomic/rename_table.test --- mariadb-10.11.11/mysql-test/suite/atomic/rename_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/atomic/rename_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_debug.inc --source include/have_innodb.inc --source include/have_csv.inc diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_commit_fail.result mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_commit_fail.result --- mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_commit_fail.result 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_commit_fail.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,116 @@ +set @@session.gtid_domain_id=1; +set @save_gtid_stric_mode=@@global.gtid_strict_mode; +create table ta (a int) engine=aria; +create table ti (a int) engine=innodb; +create table ti_pk (a int primary key) engine=innodb; +create table t (a int) engine=innodb; +create function f_i() +returns integer +begin +insert into ti set a=1; +return 1; +end | +create function f_ia(arg int) +returns integer +begin +insert into ti_pk set a=1; +insert into ta set a=1; +insert into ti_pk set a=arg; +return 1; +end | +call mtr.add_suppression("Error writing file"); +select count(*) as zero from t; +zero +0 +select count(*) as zero from ta; +zero +0 +select count(*) as zero from ti; +zero +0 +# 1. simple Innodb test +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into t set a=1; +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# observe effective rollback +select count(*) as zero from t; +zero +0 +# 2. simple Aira test +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into ta values (1),(2); +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# note no rollback +select count(*) as '*NON-zero*' from ta; +*NON-zero* +2 +delete from ta; +# 3. multi-engine test +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into ta set a=f_i(); +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# note no rollback.. 
+select count(*) as one from ta; +one +1 +# ..except transactional engine +select count(*) as zero from ti; +zero +0 +delete from ta; +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +insert into t set a=f_ia(0); +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# note no rollback.. +select count(*) as one from ta; +one +1 +# ..except transactional engine +select count(*) as zero from t; +zero +0 +select count(*) as zero from ti_pk; +zero +0 +delete from ta; +# 4. create-table-select-f() +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +create table f_x (a int) select f_i() as a; +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +# rollback indeed takes place in the pure transactional case +select count(*) as zero from ti; +zero +0 +set @@global.gtid_strict_mode=0; +set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +create table t_x (a int) engine=aria select f_ia(0) as a; +ERROR HY000: An attempt was made to binlog GTID VALUE which would create an out-of-order sequence number with existing GTID VALUE, and gtid strict mode is enabled +select * from t_x; +ERROR 42S02: Table 'test.t_x' doesn't exist +# **TODO**: fix MDEV-36027 +# **TODO**: the empty binlog is buggy .. +include/show_binlog_events.inc +# .. as non-transactional `ta` (and `t_x` sic!) are modified +select count(*) as one from ta; +one +1 +select count(*) as zero from ti; +zero +0 +delete from ta; +#. 
+set @@global.gtid_strict_mode=@save_gtid_stric_mode; +drop function f_i; +drop function f_ia; +drop table t, ta, ti, ti_pk; diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result --- mariadb-10.11.11/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog/r/binlog_mysqlbinlog_warn_stop_position.result 2025-05-19 16:14:24.000000000 +0000 @@ -18,6 +18,51 @@ # Ensuring file offset of binlog_f2_mid < binlog_f1_end # # +# Test using --read-from-remote-server +# +connection default; +# +# --stop-position tests +# +# Case 1.a) With one binlog file, a --stop-position before the end of +# the file should not result in a warning +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f1_pre_rotate binlog_f1_full --result-file=tmp/warn_position_test_file.out 2>&1 +# +# Case 1.b) With one binlog file, a --stop-position at the exact end of +# the file should not result in a warning +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f1_end binlog_f1_full --result-file=tmp/warn_position_test_file.out 2>&1 +# +# Case 1.c) With one binlog file, a --stop-position past the end of the +# file should(!) 
result in a warning +# MYSQL_BINLOG --read-from-remote-server --short-form --stop-position=binlog_f1_over_eof binlog_f1_full --result-file=tmp/warn_position_test_file.out 2>&1 +WARNING: Did not reach stop position before end of input +# +# Case 2.a) With two binlog files, a --stop-position targeting b2 which +# exists in the size of b1 should: +# 1) not provide any warnings +# 2) not prevent b2 from outputting its desired events before the +# stop position +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f2_mid binlog_f1_full binlog_f2_full --result-file=tmp/warn_position_test_file.out 2>&1 +include/assert_grep.inc [Ensure all intended GTIDs are present] +include/assert_grep.inc [Ensure the next GTID binlogged is _not_ present] +# +# Case 2.b) With two binlog files, a --stop-position targeting the end +# of binlog 2 should: +# 1) not provide any warnings +# 2) not prevent b2 from outputting its entire binary log +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f2_end binlog_f1_full binlog_f2_full --result-file=tmp/warn_position_test_file.out 2>&1 +include/assert_grep.inc [Ensure a GTID exists for each transaction] +include/assert_grep.inc [Ensure the last GTID binlogged is present] +# +# Case 2.c) With two binlog files, a --stop-position targeting beyond +# the eof of binlog 2 should: +# 1) provide a warning that the stop position was not reached +# 2) not prevent b2 from outputting its entire binary log +# MYSQL_BINLOG --read-from-remote-server --stop-position=binlog_f2_over_eof binlog_f1_full binlog_f2_full --result-file=tmp/warn_position_test_file.out 2>&1 +WARNING: Did not reach stop position before end of input +include/assert_grep.inc [Ensure a GTID exists for each transaction] +# +# # Test using local binlog files # connection default; diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_commit_fail.test mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_commit_fail.test --- 
mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_commit_fail.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_commit_fail.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,135 @@ +# Tests of commit time failures. +# At committing of an auto-commit statement a failure to commit in its +# binlog branch should rollback at least the transactional part of the statement. +# +# References: +# MDEV-35506 commit policy of one-phase-commit even at errored-out binlogging leads to assert +# MDEV-36027 Errored-out CREATE-SELECT does not binlog results of non-transactional table modification + +source include/have_innodb.inc; +source include/have_binlog_format_row.inc; + +set @@session.gtid_domain_id=1; +set @save_gtid_stric_mode=@@global.gtid_strict_mode; + +create table ta (a int) engine=aria; +create table ti (a int) engine=innodb; +create table ti_pk (a int primary key) engine=innodb; +create table t (a int) engine=innodb; +delimiter |; +create function f_i() +returns integer +begin + insert into ti set a=1; +return 1; +end | +create function f_ia(arg int) +returns integer +begin + insert into ti_pk set a=1; + insert into ta set a=1; + insert into ti_pk set a=arg; + return 1; +end | +delimiter ;| + +call mtr.add_suppression("Error writing file"); + +# Naturally all empty now +select count(*) as zero from t; +select count(*) as zero from ta; +select count(*) as zero from ti; + +# Force manual value assignement to gtid::seq_no while in the strict mode +# so that the value is rejected. Despite the errorred out statement +# being at its commit phase it will eventually be rolled back. +# Side effects of non-transactional engines, like Aria, are displayed. +--echo # 1. 
simple Innodb test +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +# mask possible allowed seq_no shift +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into t set a=1; + +--echo # observe effective rollback +select count(*) as zero from t; + +--echo # 2. simple Aira test +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into ta values (1),(2); + +--echo # note no rollback +select count(*) as '*NON-zero*' from ta; +# local cleanup +delete from ta; + +--echo # 3. multi-engine test +# A. non-transactional top-level +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into ta set a=f_i(); +--echo # note no rollback.. +select count(*) as one from ta; +--echo # ..except transactional engine +select count(*) as zero from ti; +delete from ta; + +# B. non-transactional in the leaf +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +insert into t set a=f_ia(0); + +--echo # note no rollback.. +select count(*) as one from ta; +--echo # ..except transactional engine +select count(*) as zero from t; +select count(*) as zero from ti_pk; +delete from ta; + +--echo # 4. create-table-select-f() +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) +# A. 
two phase commit branch +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +create table f_x (a int) select f_i() as a; +--echo # rollback indeed takes place in the pure transactional case +select count(*) as zero from ti; + +# B. one phase commit branch +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) +set @@global.gtid_strict_mode=0; set @@session.gtid_seq_no=1; +set @@global.gtid_strict_mode=1; +--replace_regex /GTID 1-1-[0-9]+/GTID VALUE/ +--error ER_GTID_STRICT_OUT_OF_ORDER +create table t_x (a int) engine=aria select f_ia(0) as a; +--error ER_NO_SUCH_TABLE +select * from t_x; + +--echo # **TODO**: fix MDEV-36027 +--echo # **TODO**: the empty binlog is buggy .. +--source include/show_binlog_events.inc +--echo # .. as non-transactional `ta` (and `t_x` sic!) are modified +select count(*) as one from ta; +select count(*) as zero from ti; + +delete from ta; +--echo #. 
+ +# cleanup + +set @@global.gtid_strict_mode=@save_gtid_stric_mode; +drop function f_i; +drop function f_ia; +drop table t, ta, ti, ti_pk; diff -Nru mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test --- mariadb-10.11.11/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog/t/binlog_mysqlbinlog_warn_stop_position.test 2025-05-19 16:14:24.000000000 +0000 @@ -64,13 +64,12 @@ --die Mid point chosen to end in binlog 2 does not exist in earlier binlog } -#--echo # -#--echo # -#--echo # Test using --read-from-remote-server -#--echo # -#--let $read_from_remote_server= 1 -#--emit warning is not supported by --read-from-remote-server now -#--source binlog_mysqlbinlog_warn_stop_position.inc +--echo # +--echo # +--echo # Test using --read-from-remote-server +--echo # +--let $read_from_remote_server= 1 +--source binlog_mysqlbinlog_warn_stop_position.inc --echo # --echo # diff -Nru mariadb-10.11.11/mysql-test/suite/binlog_encryption/encrypted_master.test mariadb-10.11.13/mysql-test/suite/binlog_encryption/encrypted_master.test --- mariadb-10.11.11/mysql-test/suite/binlog_encryption/encrypted_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog_encryption/encrypted_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -18,6 +18,7 @@ # - with annotated events, default checksums and minimal binlog row image # +--source include/long_test.inc # The test can take very long time with valgrind --source include/not_valgrind.inc diff -Nru mariadb-10.11.11/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result mariadb-10.11.13/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result --- mariadb-10.11.11/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result 2025-01-30 
11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/binlog_encryption/rpl_parallel_innodb_lock_conflict.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,16 +1,15 @@ ***MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts *** include/master-slave.inc [connection master] -connection server_2; -SET sql_log_bin=0; +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CALL mtr.add_suppression("InnoDB: Transaction was aborted due to "); CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); -SET sql_log_bin=1; +connection server_2; SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; include/stop_slave.inc SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_use_gtid=slave_pos; connection server_1; -ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; CREATE TABLE t4 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB; INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6); connect con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,; diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/r/doublewrite_debug.result mariadb-10.11.13/mysql-test/suite/encryption/r/doublewrite_debug.result --- mariadb-10.11.11/mysql-test/suite/encryption/r/doublewrite_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/r/doublewrite_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -3,8 +3,9 @@ call mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); -create table t1 (f1 int primary key, f2 blob)page_compressed = 1 engine=innodb stats_persistent=0; -create table t2(f1 int primary key, f2 blob)engine=innodb stats_persistent=0; +create table t1 (f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=yes stats_persistent=0; +create table t2(f1 int primary key, f2 
blob)engine=innodb encrypted=yes stats_persistent=0; +create table t3(f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=no stats_persistent=0; start transaction; insert into t1 values(1, repeat('#',12)); insert into t1 values(2, repeat('+',12)); @@ -12,29 +13,37 @@ insert into t1 values(4, repeat('-',12)); insert into t1 values(5, repeat('.',12)); insert into t2 select * from t1; +insert into t3 select * from t1; commit work; SET GLOBAL innodb_fast_shutdown = 0; # restart: --debug_dbug=+d,ib_log_checkpoint_avoid_hard --innodb_flush_sync=0 select space into @t1_space_id from information_schema.innodb_sys_tablespaces where name='test/t1'; select space into @t2_space_id from information_schema.innodb_sys_tablespaces where name='test/t2'; +select space into @t3_space_id from information_schema.innodb_sys_tablespaces where name='test/t3'; begin; insert into t1 values (6, repeat('%', 400)); insert into t2 values (6, repeat('%', 400)); +insert into t3 values (6, repeat('%', 400)); # xtrabackup prepare set global innodb_saved_page_number_debug = 3; set global innodb_fil_make_page_dirty_debug = @t1_space_id; set global innodb_saved_page_number_debug = 3; set global innodb_fil_make_page_dirty_debug = @t2_space_id; +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t3_space_id; set global innodb_buf_flush_list_now = 1; # Kill the server # restart -FOUND 2 /InnoDB: Recovered page \[page id: space=[1-9]*, page number=3\]/ in mysqld.1.err +FOUND 3 /InnoDB: Recovered page \[page id: space=[1-9]*, page number=3\]/ in mysqld.1.err check table t1; Table Op Msg_type Msg_text test.t1 check status OK check table t2; Table Op Msg_type Msg_text test.t2 check status OK +check table t3; +Table Op Msg_type Msg_text +test.t3 check status OK select f1, f2 from t1; f1 f2 1 ############ @@ -49,6 +58,13 @@ 3 //////////// 4 ------------ 5 ............ 
+select f1, f2 from t3; +f1 f2 +1 ############ +2 ++++++++++++ +3 //////////// +4 ------------ +5 ............ SET GLOBAL innodb_fast_shutdown = 0; # shutdown server # remove datadir @@ -78,4 +94,4 @@ 3 //////////// 4 ------------ 5 ............ -drop table t2, t1; +drop table t3, t2, t1; diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.opt mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.opt --- mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,3 @@ --innodb-use-atomic-writes=0 ---innodb-encrypt-tables=FORCE +--innodb-encrypt-tables=on --innodb_sys_tablespaces diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.test mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.test --- mariadb-10.11.11/mysql-test/suite/encryption/t/doublewrite_debug.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/t/doublewrite_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -12,8 +12,9 @@ let MYSQLD_DATADIR=`select @@datadir`; let ALGO=`select @@innodb_checksum_algorithm`; -create table t1 (f1 int primary key, f2 blob)page_compressed = 1 engine=innodb stats_persistent=0; -create table t2(f1 int primary key, f2 blob)engine=innodb stats_persistent=0; +create table t1 (f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=yes stats_persistent=0; +create table t2(f1 int primary key, f2 blob)engine=innodb encrypted=yes stats_persistent=0; +create table t3(f1 int primary key, f2 blob)page_compressed=1 engine=innodb encrypted=no stats_persistent=0; start transaction; insert into t1 values(1, repeat('#',12)); @@ -22,6 +23,7 @@ insert into t1 values(4, repeat('-',12)); insert into t1 values(5, repeat('.',12)); insert into t2 select * from t1; +insert into t3 select * from t1; commit work; # 
Slow shutdown and restart to make sure ibuf merge is finished @@ -33,15 +35,17 @@ select space into @t1_space_id from information_schema.innodb_sys_tablespaces where name='test/t1'; select space into @t2_space_id from information_schema.innodb_sys_tablespaces where name='test/t2'; +select space into @t3_space_id from information_schema.innodb_sys_tablespaces where name='test/t3'; begin; insert into t1 values (6, repeat('%', 400)); insert into t2 values (6, repeat('%', 400)); +insert into t3 values (6, repeat('%', 400)); -# Copy the t1.ibd, t2.ibd file +# Copy the t1.ibd, t2.ibd, t3.ibd file let $targetdir=$MYSQLTEST_VARDIR/tmp/backup_1; --disable_result_log -exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir; +exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --skip-innodb-log-checkpoint-now --target-dir=$targetdir; --enable_result_log echo # xtrabackup prepare; @@ -54,8 +58,11 @@ set global innodb_saved_page_number_debug = 3; set global innodb_fil_make_page_dirty_debug = @t2_space_id; +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t3_space_id; + set global innodb_buf_flush_list_now = 1; ---let CLEANUP_IF_CHECKPOINT=drop table t1, t2, unexpected_checkpoint; +--let CLEANUP_IF_CHECKPOINT=drop table t1, t2, t3, unexpected_checkpoint; --source ../../suite/innodb/include/no_checkpoint_end.inc # Corrupt the page 3 in t1.ibd, t2.ibd file perl; @@ -103,6 +110,15 @@ sysseek(FILE, 3*$page_size, 0); print FILE chr(0) x ($ENV{'INNODB_PAGE_SIZE'}); close FILE; + +# Zero the complete page +my $fname= "$ENV{'MYSQLD_DATADIR'}test/t3.ibd"; +open(FILE, "+<", $fname) or die; +FILE->autoflush(1); +binmode FILE; +sysseek(FILE, 3*$page_size, 0); +print FILE chr(0) x ($ENV{'INNODB_PAGE_SIZE'}); +close FILE; EOF # Successful recover from doublewrite buffer @@ -114,8 +130,10 @@ check table t1; check table t2; +check table t3; select f1, f2 from t1; select f1, f2 from t2; +select f1, f2 
from t3; SET GLOBAL innodb_fast_shutdown = 0; let $shutdown_timeout=; @@ -220,4 +238,4 @@ --source ../../mariabackup/include/restart_and_restore.inc select * from t1; -drop table t2, t1; +drop table t3, t2, t1; diff -Nru mariadb-10.11.11/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt mariadb-10.11.13/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt --- mariadb-10.11.11/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/encryption/t/innodb_encrypt_temporary_tables.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1,2 @@ ---innodb_buffer_pool_size=5M +--innodb_buffer_pool_size=6M --innodb_encrypt_temporary_tables=1 diff -Nru mariadb-10.11.11/mysql-test/suite/engines/iuds/r/insert_time.result mariadb-10.11.13/mysql-test/suite/engines/iuds/r/insert_time.result --- mariadb-10.11.11/mysql-test/suite/engines/iuds/r/insert_time.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/engines/iuds/r/insert_time.result 2025-05-19 16:14:24.000000000 +0000 @@ -5073,10 +5073,14 @@ INSERT INTO t3(c1,c2) VALUES('34 9:23','34 9:23') /* throws error as row exists with c1='34 9:23',c2='34 9:23' */; ERROR 23000: Duplicate entry '825:23:00-825:23:00' for key 'idx' INSERT IGNORE INTO t1(c1,c2) VALUES('10:22:33','10:22:34') /* doesn't throw error */; +Warnings: +Warning 1062 Duplicate entry '10:22:33' for key 'PRIMARY' INSERT IGNORE INTO t2(c1,c2) VALUES('12:34:56.78','12:34:56.78') /*doesn't throw error */; Warnings: Warning 1062 Duplicate entry '12:34:56-12:34:56' for key 'PRIMARY' INSERT IGNORE INTO t1(c1,c2) VALUES('10:22:34','34 9:23') /*doesn't throw error */; +Warnings: +Warning 1062 Duplicate entry '825:23:00' for key 'c2' INSERT IGNORE INTO t3(c1,c2) VALUES('34 9:23','34 9:23') /*doesn't throw error */; Warnings: Warning 1062 Duplicate entry '825:23:00-825:23:00' for key 'idx' diff -Nru 
mariadb-10.11.11/mysql-test/suite/federated/federatedx.result mariadb-10.11.13/mysql-test/suite/federated/federatedx.result --- mariadb-10.11.11/mysql-test/suite/federated/federatedx.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx.result 2025-05-19 16:14:24.000000000 +0000 @@ -79,7 +79,7 @@ `name` varchar(32) NOT NULL default '' ) ENGINE="FEDERATED" DEFAULT CHARSET=latin1 -CONNECTION='mysql://root@127.0.0.1:SLAVE_PORT/federated/t1'; +CONNECTION='mariadb://root@127.0.0.1:SLAVE_PORT/federated/t1'; INSERT INTO federated.t1 (id, name) VALUES (1, 'foo'); INSERT INTO federated.t1 (id, name) VALUES (2, 'fee'); INSERT INTO federated.t1 (id, `group`) VALUES (3, 42); diff -Nru mariadb-10.11.11/mysql-test/suite/federated/federatedx.test mariadb-10.11.13/mysql-test/suite/federated/federatedx.test --- mariadb-10.11.11/mysql-test/suite/federated/federatedx.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx.test 2025-05-19 16:14:24.000000000 +0000 @@ -92,7 +92,7 @@ `name` varchar(32) NOT NULL default '' ) ENGINE="FEDERATED" DEFAULT CHARSET=latin1 - CONNECTION='mysql://root@127.0.0.1:$SLAVE_MYPORT/federated/t1'; + CONNECTION='mariadb://root@127.0.0.1:$SLAVE_MYPORT/federated/t1'; INSERT INTO federated.t1 (id, name) VALUES (1, 'foo'); INSERT INTO federated.t1 (id, name) VALUES (2, 'fee'); diff -Nru mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.result mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.result --- mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.result 2025-05-19 16:14:24.000000000 +0000 @@ -479,12 +479,12 @@ INSERT INTO federated.t3 VALUES (1),(2),(3); CREATE TABLE federated.t4 (a INT); connection master; -CREATE SERVER fedlink FOREIGN DATA WRAPPER mysql +CREATE SERVER fedlink FOREIGN 
DATA WRAPPER mariadb OPTIONS (USER 'root', HOST '127.0.0.1', DATABASE 'federated', PORT SLAVE_PORT); CREATE TABLE federated.t3 (a INT) ENGINE=FEDERATED -CONNECTION='mysql://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' +CONNECTION='mariadb://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' PARTITION BY list (a) (PARTITION p1 VALUES IN (1) CONNECTION='fedlink/t3', PARTITION p2 VALUES IN (2) CONNECTION='fedlink/t4'); diff -Nru mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.test mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.test --- mariadb-10.11.11/mysql-test/suite/federated/federatedx_create_handlers.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/federated/federatedx_create_handlers.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,9 +7,6 @@ set global federated_pushdown=1; -#Enable after fix MDEV-31846 or in v. 10.5 and later ---disable_cursor_protocol - connection slave; DROP TABLE IF EXISTS federated.t1; @@ -168,11 +165,13 @@ --sorted_result select * from federated.t4; +--disable_cursor_protocol select name into @var from federated.t1 where id=3 limit 1 ; select @var; --disable_ps2_protocol select name into outfile 'tmp.txt' from federated.t1; --enable_ps2_protocol +--enable_cursor_protocol let $path=`select concat(@@datadir, 'test/tmp.txt')`; remove_file $path; @@ -307,13 +306,13 @@ connection master; --replace_result $SLAVE_MYPORT SLAVE_PORT -eval CREATE SERVER fedlink FOREIGN DATA WRAPPER mysql +eval CREATE SERVER fedlink FOREIGN DATA WRAPPER mariadb OPTIONS (USER 'root', HOST '127.0.0.1', DATABASE 'federated', PORT $SLAVE_MYPORT); CREATE TABLE federated.t3 (a INT) ENGINE=FEDERATED - CONNECTION='mysql://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' + CONNECTION='mariadb://root@127.0.0.1:$SLAVE_MYPORT/federated/t3' PARTITION BY list (a) (PARTITION p1 VALUES IN (1) CONNECTION='fedlink/t3', PARTITION p2 VALUES IN (2) CONNECTION='fedlink/t4'); @@ -439,7 +438,5 @@ set global federated_pushdown=0; 
---enable_cursor_protocol - source include/federated_cleanup.inc; diff -Nru mariadb-10.11.11/mysql-test/suite/funcs_2/t/innodb_charset.test mariadb-10.11.13/mysql-test/suite/funcs_2/t/innodb_charset.test --- mariadb-10.11.11/mysql-test/suite/funcs_2/t/innodb_charset.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/funcs_2/t/innodb_charset.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ # Checking of other prerequisites is in charset_master.test # ################################################################################ ---source include/no_valgrind_without_big.inc +--source include/long_test.inc --source include/have_innodb.inc # Starting with MariaDB 10.6, ensure that DDL recovery will have completed diff -Nru mariadb-10.11.11/mysql-test/suite/galera/disabled.def mariadb-10.11.13/mysql-test/suite/galera/disabled.def --- mariadb-10.11.11/mysql-test/suite/galera/disabled.def 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/disabled.def 2025-05-19 16:14:24.000000000 +0000 @@ -10,5 +10,7 @@ # ############################################################################## -galera_sequences : MDEV-35934/MDEV-33850 For Galera, create sequence with low cache got signal 6 error: [ERROR] WSREP: FSM: no such a transition REPLICATING -> COMMITTED -MDEV-26266 : MDEV-26266 +galera_wan : MDEV-35940 Unallowed state transition: donor -> synced in galera_wan +galera_vote_rejoin_ddl : MDEV-35940 Unallowed state transition: donor -> synced in galera_wan +MW-329 : MDEV-35951 Complete freeze during MW-329 test +galera_vote_rejoin_dml : MDEV-35964 Assertion `ist_seqno >= cc_seqno' failed in galera_vote_rejoin_dml diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes.cnf 2025-05-19 
16:14:24.000000000 +0000 @@ -17,7 +17,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -28,7 +28,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_master.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_master.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_master.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_master.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -25,7 +25,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -38,7 +38,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_replica_2primary.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,7 +24,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -37,7 +37,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_slave.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_2nodes_as_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_2nodes_as_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,7 +24,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' 
wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -37,7 +37,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_3nodes_as_slave.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_3nodes_as_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_3nodes_as_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_3nodes_as_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,7 +24,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -37,7 +37,7 @@ #sst_port=@OPT.port wsrep_provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' @@ -50,7 +50,7 @@ #sst_port=@OPT.port wsrep-provider=@ENV.WSREP_PROVIDER wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.3.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.3.port wsrep_sst_receive_address='127.0.0.1:@mysqld.3.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/galera_4nodes.cnf mariadb-10.11.13/mysql-test/suite/galera/galera_4nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera/galera_4nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/galera_4nodes.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -18,7 +18,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' @@ -30,7 +30,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' @@ -42,7 +42,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.3.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.3.port wsrep_sst_receive_address='127.0.0.1:@mysqld.3.#sst_port' @@ -54,7 +54,7 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.4.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.4.port wsrep_sst_receive_address='127.0.0.1:@mysqld.4.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/auto_increment_offset_save.inc mariadb-10.11.13/mysql-test/suite/galera/include/auto_increment_offset_save.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/auto_increment_offset_save.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/auto_increment_offset_save.inc 2025-05-19 16:14:24.000000000 +0000 @@ -42,4 +42,3 @@ --connection $node_4 let $auto_increment_offset_node_4 = `SELECT @@global.auto_increment_offset`; } - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_dump_sr_table.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_dump_sr_table.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_dump_sr_table.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_dump_sr_table.inc 2025-05-19 16:14:24.000000000 +0000 @@ -25,4 +25,3 @@ --inc $seqno } - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_st_shutdown_slave.inc 2025-05-19 16:14:24.000000000 +0000 @@ -118,4 +118,3 @@ SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c 
NOT IN (5, 10)) AS a1; DROP TABLE t1; COMMIT; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_start_replication.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_start_replication.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_start_replication.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_start_replication.inc 2025-05-19 16:14:24.000000000 +0000 @@ -41,9 +41,9 @@ my $counter = 1000; #my $found = false - + while ($counter > 0) { - + open(FILE, "$logfile") or die("Unable to open $logfile : $!\n"); my $new_sync_count = () = grep(/Synchronized with group/g,); close(FILE); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/galera_wsrep_recover.inc mariadb-10.11.13/mysql-test/suite/galera/include/galera_wsrep_recover.inc --- mariadb-10.11.11/mysql-test/suite/galera/include/galera_wsrep_recover.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/galera_wsrep_recover.inc 2025-05-19 16:14:24.000000000 +0000 @@ -9,14 +9,14 @@ } --perl - use strict; + use strict; my $wsrep_start_position_str = "grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'"; my $wsrep_start_position = `grep -a 'WSREP: Recovered position:' $ENV{MYSQL_TMP_DIR}/galera_wsrep_recover.log | sed 's/.*WSREP\:\ Recovered\ position://' | sed 's/^[ \t]*//'`; chomp($wsrep_start_position); die if $wsrep_start_position eq ''; - open(FILE, ">", "$ENV{MYSQL_TMP_DIR}/galera_wsrep_start_position.inc") or die; + open(FILE, ">", "$ENV{MYSQL_TMP_DIR}/galera_wsrep_start_position.inc") or die; print FILE "--let \$galera_wsrep_start_position = $wsrep_start_position\n"; close FILE; EOF diff -Nru mariadb-10.11.11/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc mariadb-10.11.13/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc --- 
mariadb-10.11.11/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/include/wait_condition_with_debug_and_kill.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,35 @@ +# include/wait_condition_with_debug_and_kill.inc +# +# SUMMARY +# +# Waits until the passed statement returns true, or the operation +# times out. If the operation times out, the additional error +# statement will be executed and server is killed. +# +# USAGE +# +# let $wait_condition= +# SELECT c = 3 FROM t; +# let $wait_condition_on_error_output= select count(*) from t; +# [let $explicit_default_wait_timeout= N] # to override the default reset +# --source include/wait_condition_with_debug_and_kill.inc +# +# OR +# +# let $wait_timeout= 60; # Override default 30 seconds with 60. +# let $wait_condition= +# SELECT c = 3 FROM t; +# let $wait_condition_on_error_output= select count(*) from t; +# --source include/wait_condition_with_debug_and_kill.inc +# --echo Executed the test condition $wait_condition_reps times +# +# +# EXAMPLE +# events_bugs.test, events_time_zone.test +# + +--source include/wait_condition_with_debug.inc +if (!$success) +{ + --source include/kill_galera.inc +} diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/GAL-401.result mariadb-10.11.13/mysql-test/suite/galera/r/GAL-401.result --- mariadb-10.11.11/mysql-test/suite/galera/r/GAL-401.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/GAL-401.result 2025-05-19 16:14:24.000000000 +0000 @@ -24,6 +24,6 @@ PRIMARY KEY (`f1`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci DROP TABLE t1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); connection node_1; SET GLOBAL wsrep_provider_options = 'pc.ignore_sb=false'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20225.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20225.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20225.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20225.result 2025-05-19 16:14:24.000000000 +0000 @@ -15,7 +15,7 @@ SET GLOBAL debug_dbug = 'RESET'; SET DEBUG_SYNC = 'now SIGNAL signal.mdev_20225_continue'; SET DEBUG_SYNC = 'RESET'; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; connection node_2; SHOW TRIGGERS; Trigger Event Table Statement Timing Created sql_mode Definer character_set_client collation_connection Database Collation diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20793.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20793.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-20793.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-20793.result 2025-05-19 16:14:24.000000000 +0000 @@ -41,4 +41,4 @@ ERROR 40001: Deadlock found when trying to get lock; try restarting transaction SET debug_sync = "RESET"; DROP TABLE t1; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-21479.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-21479.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-21479.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-21479.result 2025-05-19 16:14:24.000000000 +0000 @@ -66,7 +66,7 @@ Variable_name Value wsrep_desync_count 0 SET @@global.wsrep_desync = 0; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. 
JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_1; # Wait until both nodes are back to cluster SET GLOBAL wsrep_provider_options = 'pc.ignore_sb=false'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-25389.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-25389.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-25389.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-25389.result 2025-05-19 16:14:24.000000000 +0000 @@ -15,3 +15,4 @@ SELECT @@wsrep_slave_threads; @@wsrep_slave_threads 1 +connection node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-26266.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-26266.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-26266.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-26266.result 2025-05-19 16:14:24.000000000 +0000 @@ -19,5 +19,5 @@ INSERT INTO t2 VALUES (4); INSERT INTO t2 VALUES (5); CREATE VIEW v1 AS SELECT c1 FROM t1 WHERE c1 IN (SELECT a FROM t2) GROUP BY c1; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +DROP VIEW v1; DROP TABLE t1,t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-33136.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-33136.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-33136.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-33136.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; connection node_1a; -TRUNCATE TABLE t1; +RENAME TABLE t1 TO tmp, tmp TO t1; SET SESSION wsrep_retry_autocommit = 0; SET DEBUG_SYNC = 'dict_stats_mdl_acquired SIGNAL may_toi WAIT_FOR bf_abort'; INSERT INTO t1 VALUES (1); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-34647.result 
mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-34647.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-34647.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-34647.result 2025-05-19 16:14:24.000000000 +0000 @@ -95,7 +95,6 @@ 4 d 5 d 6 d -set global wsrep_mode=default; connection node_1; drop table t1,t2,t3,t4,t5; set global wsrep_mode=default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35748.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35748.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35748.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35748.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,31 @@ +connection node_2; +connection node_1; +connection node_1; +INSTALL PLUGIN IF NOT EXISTS connect SONAME 'ha_connect'; +CREATE TABLE t1 (f INT) ENGINE=CONNECT; +Warnings: +Warning 1105 No table_type. Will be set to DOS +Warning 1105 No file name. Table will use t1.dos +CREATE TABLE t2 (f INT) ENGINE=ROCKSDB; +CREATE TABLE t3 (f INT) ENGINE=SEQUENCE; +ERROR 42000: This version of MariaDB doesn't yet support 'non-InnoDB sequences in Galera cluster' +show warnings; +Level Code Message +Error 1235 This version of MariaDB doesn't yet support 'non-InnoDB sequences in Galera cluster' +Note 1235 ENGINE=SEQUENCE not supported by Galera +connection node_2; +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `f` int(11) DEFAULT NULL +) ENGINE=CONNECT DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +show create table t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `f` int(11) DEFAULT NULL +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +show create table t3; +ERROR 42S02: Table 'test.t3' doesn't exist +connection node_1; +DROP TABLE t1, t2; +UNINSTALL PLUGIN IF EXISTS connect; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35946.result 
mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35946.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-35946.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-35946.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,16 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +SET SESSION wsrep_sync_wait=DEFAULT; +DELETE FROM mysql.wsrep_streaming_log; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +VARIABLE_VALUE +Primary +SET SESSION wsrep_sync_wait=DEFAULT; +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-36116.result mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-36116.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MDEV-36116.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MDEV-36116.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,22 @@ +connection node_2; +connection node_1; +connect con1,127.0.0.1,root,,test,$NODE_MYPORT_1; +connection node_1; +CALL mtr.add_suppression("CREATE TABLE isolation failure"); +SET DEBUG_SYNC = 'wsrep_kill_thd_before_enter_toi SIGNAL may_kill WAIT_FOR continue'; +CREATE TABLE t1 (a INT) ENGINE=InnoDB; +connection con1; +SET DEBUG_SYNC = 'now WAIT_FOR may_kill'; +SET DEBUG_SYNC = 'now SIGNAL continue'; +connection node_1; +Got one of the listed errors +connection node_2; +SHOW TABLES LIKE 't1'; +Tables_in_test (t1) +connection con1; +SHOW TABLES LIKE 't1'; +Tables_in_test (t1) +SET DEBUG_SYNC = 'RESET'; +disconnect con1; +disconnect node_2; 
+disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-284.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-284.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-284.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-284.result 2025-05-19 16:14:24.000000000 +0000 @@ -13,7 +13,7 @@ SELECT @@wsrep_on; @@wsrep_on 0 -call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use (server_errno=1047)"); +call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use \\(server_errno ?= ?1047\\)"); START SLAVE; include/wait_for_slave_param.inc [Slave_IO_Running] connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-329.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-329.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-329.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-329.result 2025-05-19 16:14:24.000000000 +0000 @@ -18,5 +18,6 @@ connection node_1; DROP PROCEDURE proc_insert; DROP TABLE t1; +disconnect node_1b; CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); set global innodb_status_output=Default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-329F.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-329F.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-329F.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-329F.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,25 @@ +connection node_2; +connection node_1; +CREATE TABLE t1 (f1 INTEGER, f2 CHAR(20) DEFAULT 'abc') ENGINE=InnoDB; +INSERT INTO t1 (f1) VALUES (1),(65535); +CREATE PROCEDURE proc_insert (repeat_count int) +BEGIN +DECLARE current_num int; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET current_num = 0; +SET SESSION wsrep_sync_wait = 0; +WHILE current_num < 
repeat_count do +INSERT INTO t1 (f1) VALUES (FLOOR( 1 + RAND( ) * 65535 )); +SELECT SLEEP(0.1); +SET current_num = current_num + 1; +END WHILE; +END| +connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connection node_1b; +connection node_1b; +connection node_1; +DROP PROCEDURE proc_insert; +DROP TABLE t1; +disconnect node_1b; +CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); +set global innodb_status_output=Default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/MW-416.result mariadb-10.11.13/mysql-test/suite/galera/r/MW-416.result --- mariadb-10.11.11/mysql-test/suite/galera/r/MW-416.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/MW-416.result 2025-05-19 16:14:24.000000000 +0000 @@ -20,13 +20,13 @@ Got one of the listed errors CREATE DATABASE db; Got one of the listed errors -CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; +CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; Got one of the listed errors CREATE FUNCTION fun1() RETURNS int RETURN(1); Got one of the listed errors CREATE FUNCTION fun1 RETURNS STRING SONAME 'funlib.so'; Got one of the listed errors -CREATE PROCEDURE proc1() BEGIN END; +CREATE PROCEDURE proc1() BEGIN END; Got one of the listed errors CREATE INDEX idx ON tbl(id); Got one of the listed errors @@ -100,3 +100,4 @@ performance_schema sys test +disconnect userMW416; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_2primary_replica.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_2primary_replica.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_2primary_replica.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_2primary_replica.result 2025-05-19 16:14:24.000000000 +0000 @@ -13,10 +13,13 @@ connect replica, 127.0.0.1, root, , test, $NODE_MYPORT_1; connection replica; connection node_2; +connection primary1; +connection primary2; connection replica; # Galera 
replica changing master to primary1 -SET @@default_master_connection='stream2'; +SET @@default_master_connection='stream1'; # Primary node changing master to primary2 +SET @@default_master_connection='stream2'; START ALL SLAVES; Warnings: Note 1937 SLAVE 'stream1' started diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_alter_engine_myisam.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_alter_engine_myisam.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_alter_engine_myisam.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_alter_engine_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -26,3 +26,4 @@ 1 DROP TABLE t1; connection node_1; +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_applier_ftwrl_table_alter.result 2025-05-19 16:14:24.000000000 +0000 @@ -13,7 +13,7 @@ SELECT 1 FROM DUAL; 1 1 -SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); COUNT(*) = 1 1 UNLOCK TABLES; @@ -25,7 +25,7 @@ `f2` int(11) DEFAULT NULL, PRIMARY KEY (`f1`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute 
in isolation%'); COUNT(*) = 0 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_as_slave_nonprim.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_as_slave_nonprim.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_as_slave_nonprim.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_as_slave_nonprim.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,7 +12,6 @@ connection node_4; INSERT INTO t1 VALUES (1),(2),(3),(4),(5); connection node_2; -connection node_1; expected_error 1 connection node_2; @@ -27,7 +26,7 @@ RESET SLAVE ALL; CALL mtr.add_suppression("Slave SQL: Error 'Unknown command' on query"); CALL mtr.add_suppression("Slave: Unknown command Error_code: 1047"); -CALL mtr.add_suppression("Transport endpoint is not connected"); +CALL mtr.add_suppression("(Transport endpoint|Socket) is not connected"); CALL mtr.add_suppression("Slave SQL: Error in Xid_log_event: Commit could not be completed, 'Deadlock found when trying to get lock; try restarting transaction', Error_code: 1213"); CALL mtr.add_suppression("Slave SQL: Node has dropped from cluster, Error_code: 1047"); connection node_4; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_group_commit.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,685 +0,0 @@ -SET SESSION wsrep_sync_wait = 0; -galera_sr_bf_abort_at_commit = 0 -after_replicate_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT 
INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,after_replicate_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=after_replicate_sync'; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -local_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,local_monitor_master_enter_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 
'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=local_monitor_master_enter_sync'; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -apply_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_master_enter_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_master_enter_sync'; -ERROR 40001: 
Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -commit_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync'; -INSERT INTO t1 VALUES (3); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; -ERROR 40001: Deadlock found when trying to get lock; try restarting transaction -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 
-SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -galera_sr_bf_abort_at_commit = 1 -after_replicate_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,after_replicate_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=after_replicate_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -local_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET 
SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,local_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=local_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -apply_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options 
= 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -commit_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 1; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 
'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -galera_sr_bf_abort_at_commit = 1 -after_replicate_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,after_replicate_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=after_replicate_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 
'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -local_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,local_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=local_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 
WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -apply_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -commit_monitor_master_enter_sync -CREATE TABLE t1 (f1 INTEGER PRIMARY 
KEY) ENGINE=InnoDB; -SET SESSION wsrep_trx_fragment_size = 0; -SET AUTOCOMMIT=OFF; -INSERT INTO t1 VALUES (1); -SELECT * FROM t1 FOR UPDATE; -f1 -1 -SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; -SET AUTOCOMMIT=ON; -INSERT INTO t1 VALUES (2); -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync'; -COMMIT; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; -SET SESSION wsrep_on = 0; -SET SESSION wsrep_on = 1; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; -ROLLBACK; -SET GLOBAL wsrep_provider_options = 'dbug='; -SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT * FROM t1; -f1 -1 -2 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 2; -COUNT(*) = 1 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log; -COUNT(*) = 0 -1 -SET AUTOCOMMIT=ON; -SET SESSION wsrep_trx_fragment_size = 0; -DELETE FROM t1; -DROP TABLE t1; -CALL mtr.add_suppression("WSREP: fragment replication failed: 1"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_lock_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ connection node_1; INSERT INTO t1 VALUES (2); connection node_2; +SET SESSION wsrep_sync_wait = 0; UNLOCK TABLES; COMMIT; SELECT COUNT(*) = 1 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_abort_mariabackup.result 2025-05-19 16:14:24.000000000 +0000 @@ -53,7 +53,7 @@ FOUND 1 /Server not desynched from group at BLOCK_DDL because WSREP_MODE_BF_MARIABACKUP is used./ in mysqld.2.err # Should return FOUND 1 as server did desync and pause at BLOCK_COMMIT FOUND 1 /Server desynched from group during BACKUP STAGE BLOCK_COMMIT./ in mysqld.2.err -SET GLOBAL wsrep_mode = ""; +SET GLOBAL wsrep_mode = DEFAULT; connection node_1; DROP TABLE t; disconnect node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- a/home/panda/mariadb-10.5/mysql-test/suite/galera/r/galera_bf_kill.result -+++ b/home/panda/mariadb-10.5/mysql-test/suite/galera/r/galera_bf_kill.reject +--- r/galera_bf_kill.result ++++ r/galera_bf_kill,debug.reject @@ -77,4 +77,34 @@ a b 5 2 disconnect node_2a; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill_debug.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill_debug.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_bf_kill_debug.result 2025-01-30 11:01:23.000000000 
+0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_bf_kill_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -40,18 +40,19 @@ disconnect node_2a; connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; connection node_2a; -CREATE TABLE t1 (i int primary key); +CREATE TABLE t1 (i int primary key) engine=innodb; SET DEBUG_SYNC = "before_wsrep_ordered_commit SIGNAL bwoc_reached WAIT_FOR bwoc_continue"; INSERT INTO t1 VALUES (1); connection node_2; SET DEBUG_SYNC = "now WAIT_FOR bwoc_reached"; SET DEBUG_SYNC = "now SIGNAL bwoc_continue"; -SET DEBUG_SYNC='RESET'; connection node_2a; connection node_2; +SET DEBUG_SYNC='RESET'; select * from t1; i 1 disconnect node_2a; +disconnect node_2b; connection node_1; drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_checksum.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_checksum.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_checksum.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_checksum.result 2025-05-19 16:14:24.000000000 +0000 @@ -27,4 +27,5 @@ 1 connection node_1; DROP TABLE t1; +SET @@global.wsrep_mode=DEFAULT; # End of tests. 
diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_binlog_stmt_autoinc.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,11 +1,11 @@ connection node_2; connection node_1; connection node_1; -SET GLOBAL auto_increment_offset=1; connection node_2; -SET GLOBAL auto_increment_offset=2; connection node_1; +SET GLOBAL auto_increment_offset=1; connection node_2; +SET GLOBAL auto_increment_offset=2; connection node_2; SET GLOBAL wsrep_forced_binlog_format='STATEMENT'; connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_circular_replication.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_circular_replication.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_circular_replication.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_circular_replication.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,6 +12,7 @@ connection replica1; connection node_2; connection primary2; +connection primary1; connection replica1; # Galera replica changing master to primary1 START SLAVE; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ddl_fk_conflict.result 2025-05-19 16:14:24.000000000 +0000 @@ -298,6 +298,7 @@ ###################################################################### connection node_1; SET SESSION wsrep_sync_wait=0; +FLUSH STATUS; CREATE TABLE p1 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); INSERT INTO p1 VALUES (1, 'INITIAL VALUE'); CREATE TABLE 
p2 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); @@ -491,6 +492,7 @@ ###################################################################### connection node_1; SET SESSION wsrep_sync_wait=0; +FLUSH STATUS; CREATE TABLE p1 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); INSERT INTO p1 VALUES (1, 'INITIAL VALUE'); CREATE TABLE p2 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); @@ -684,6 +686,7 @@ ###################################################################### connection node_1; SET SESSION wsrep_sync_wait=0; +FLUSH STATUS; CREATE TABLE p1 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); INSERT INTO p1 VALUES (1, 'INITIAL VALUE'); CREATE TABLE p2 (pk INTEGER PRIMARY KEY, f2 CHAR(30)); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_defaults.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_defaults.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_defaults.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_defaults.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,9 @@ connection node_2; connection node_1; # Correct Galera library found +SELECT COUNT(*) `expect 51` FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%'; +expect 51 +51 SELECT VARIABLE_NAME, VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_gcs_fragment.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_gcs_fragment.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_gcs_fragment.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_gcs_fragment.result 2025-05-19 16:14:24.000000000 +0000 @@ -22,7 +22,7 @@ connection node_1a; SET GLOBAL wsrep_provider_options = 'signal=gcs_core_after_frag_send'; connection node_1; -ERROR HY000: Got error 6 "No such device or address" during COMMIT +ERROR HY000: Error while appending streaming replication fragment(provider status: Not connected 
to Primary Component) INSERT INTO t1 VALUES (3, "cccccaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); SELECT * FROM t1; f1 f2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_inject_bf_long_wait.result 2025-05-19 16:14:24.000000000 +0000 @@ -3,8 +3,11 @@ CREATE TABLE t1(id int not null primary key, b int) engine=InnoDB; INSERT INTO t1 VALUES (0,0),(1,1),(2,2),(3,3); BEGIN; +SET DEBUG_SYNC = 'wsrep_after_statement_enter SIGNAL blocked'; UPDATE t1 set b = 100 where id between 1 and 2;; connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET DEBUG_SYNC = 'now WAIT_FOR blocked'; +SET DEBUG_SYNC = 'wsrep_after_statement_enter CLEAR'; connection node_1b; SET @save_dbug = @@SESSION.debug_dbug; SET @@SESSION.innodb_lock_wait_timeout=2; @@ -20,5 +23,6 @@ 1 100 2 100 3 3 +SET DEBUG_SYNC = 'RESET'; disconnect node_1b; DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28423,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- suite/galera/r/galera_ist_MDEV-28423.result 2022-06-13 09:40:33.073863796 +0300 -+++ suite/galera/r/galera_ist_MDEV-28423.reject 2022-06-13 09:58:59.936874991 +0300 +--- r/galera_ist_MDEV-28423.result ++++ r/galera_ist_MDEV-28423,debug.reject @@ -517,3 +517,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff 
mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_MDEV-28583,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- suite/galera/r/galera_ist_MDEV-28583.result 2022-06-11 10:48:16.875034382 +0300 -+++ suite/galera/r/galera_ist_MDEV-28583,debug.reject 2022-06-11 11:25:55.616481509 +0300 +--- r/galera_ist_MDEV-28583.result ++++ r/galera_ist_MDEV-28583,debug.reject @@ -517,3 +517,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,12 @@ --- r/galera_ist_mysqldump.result +++ r/galera_ist_mysqldump,debug.reject -@@ -354,11 +354,195 @@ +@@ -354,6 +354,190 @@ 1 DROP TABLE t1; COMMIT; +Performing State Transfer on a server that has been killed and restarted +while a DDL was in progress on it - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); ++connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; @@ -189,12 +188,6 @@ +DROP TABLE t1; +COMMIT; +SET GLOBAL debug_dbug = $debug_orig; -+connection node_1; -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); + connection node_1; + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables 
option so it cannot execute this statement' on query"); DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/galera_ist_mysqldump.result -+++ r/galera_ist_mysqldump.reject -@@ -355,10 +355,10 @@ - DROP TABLE t1; - COMMIT; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL 
mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ist_mysqldump.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ist_mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to "); +connection node_1; +connection node_2; Setting SST method to mysqldump ... call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to '127\\.0\\.0\\.1'"); call mtr.add_suppression("Failed to load slave replication state from table mysql\\.gtid_slave_pos"); @@ -9,9 +12,6 @@ SET GLOBAL wsrep_sst_auth = 'sst:'; connection node_2; SET GLOBAL wsrep_sst_method = 'mysqldump'; -call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to "); -connection node_1; -connection node_2; Performing State Transfer on a server that has been shut down cleanly and restarted connection node_1; CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; @@ -355,10 +355,10 @@ DROP TABLE t1; COMMIT; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL 
mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_nonPK_and_PA.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_nonPK_and_PA.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_nonPK_and_PA.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_nonPK_and_PA.result 2025-05-19 16:14:24.000000000 +0000 @@ -8,7 +8,7 @@ SET SESSION wsrep_sync_wait = 0; SET GLOBAL wsrep_slave_threads = 2; *************************************************************** -scenario 1, conflicting UPDATE +scenario 1, conflicting UPDATE *************************************************************** SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_slave_enter_sync'; connection node_1; @@ -31,7 +31,7 @@ SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_slave_enter_sync'; SET GLOBAL wsrep_provider_options = 'dbug='; *************************************************************** -scenario 2, conflicting DELETE +scenario 2, conflicting DELETE *************************************************************** SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_slave_enter_sync'; connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_apply_lock_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -10,10 +10,10 @@ 
INSERT INTO t2 VALUES (1); connection node_2a; SET SESSION wsrep_sync_wait=0; -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE 'Commit' or STATE = 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification'); EXPECT_1 1 -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE '%Waiting for table metadata lock%'; +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); EXPECT_1 1 SELECT COUNT(*) AS EXPECT_0 FROM t1; @@ -32,9 +32,8 @@ SELECT COUNT(*) AS EXPECT_1 FROM t2; EXPECT_1 1 -SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE '%committed%' or STATE = 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); EXPECT_2 2 -SET GLOBAL wsrep_slave_threads = 1;; DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_simple.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_simple.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_parallel_simple.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_parallel_simple.result 2025-05-19 16:14:24.000000000 +0000 @@ -34,6 +34,5 @@ SELECT COUNT(*) as expect_20 FROM t2; expect_20 20 -SET GLOBAL wsrep_slave_threads = 1;; DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_partitioned_tables.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_partitioned_tables.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_partitioned_tables.result 1970-01-01 
00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_partitioned_tables.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,176 @@ +connection node_2; +connection node_1; +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. Storage engine partition for table"); +# wsrep-mode= DEFAULT +SET GLOBAL wsrep_mode = ""; +SELECT @@wsrep_mode; +@@wsrep_mode + +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB +PARTITION BY KEY (v1) +PARTITIONS 2; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +ALTER TABLE t1 ADD COLUMN v2 int; +ALTER TABLE t2 ADD COLUMN v2 int; +INSERT INTO t1 VALUES (1,1),(2,2); +INSERT INTO t2 VALUES (1,1),(2,2); +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +UPDATE t1 SET v3 = 3; +UPDATE t2 SET v3 = 3; +CREATE INDEX xx1 ON t1(v2); +CREATE INDEX xx2 ON t2(v2); +DROP INDEX xx1 ON t1; +DROP INDEX xx2 ON t2; +TRUNCATE TABLE t1; +TRUNCATE TABLE t2; +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +CREATE VIEW x2 AS SELECT * FROM t2_v2; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 +AFTER INSERT ON t1_v2 FOR EACH ROW +UPDATE t1_v2 SET t1_v2.v3 = t1_v2.v3+1; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t2 +AFTER INSERT ON t2_v2 FOR EACH ROW +UPDATE t2_v2 SET t2_v2.v3 = t2_v2.v3+1; +connection node_2; +SHOW CREATE TABLE t1_v2; +Table Create Table +t1_v2 CREATE TABLE `t1_v2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + `v3` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE TABLE t2_v2; +Table Create Table +t2_v2 CREATE TABLE `t2_v2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + `v3` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=Aria DEFAULT CHARSET=latin1 
COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE VIEW x1; +View Create View character_set_client collation_connection +x1 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `x1` AS select `t1_v2`.`v1` AS `v1`,`t1_v2`.`v2` AS `v2`,`t1_v2`.`v3` AS `v3` from `t1_v2` latin1 latin1_swedish_ci +SHOW CREATE VIEW x2; +View Create View character_set_client collation_connection +x2 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `x2` AS select `t2_v2`.`v1` AS `v1`,`t2_v2`.`v2` AS `v2`,`t2_v2`.`v3` AS `v3` from `t2_v2` latin1 latin1_swedish_ci +SELECT * FROM t1_v2; +v1 v2 v3 +SELECT * FROM t2_v2; +v1 v2 v3 +connection node_1; +DROP VIEW x1; +DROP VIEW x2; +DROP TRIGGER increment_before_t1; +DROP TRIGGER increment_before_t2; +DROP TABLE t1_v2; +DROP TABLE t2_v2; +SET GLOBAL wsrep_mode = ""; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +# wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +@@wsrep_mode +STRICT_REPLICATION +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB +PARTITION BY KEY (v1) +PARTITIONS 2; +CREATE OR REPLACE TABLE t3 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +ERROR HY000: Galera replication not supported +ALTER TABLE t1 ADD COLUMN v2 int; +ALTER TABLE t2 ADD COLUMN v2 int; +ERROR HY000: Galera replication not supported +INSERT INTO t1 VALUES (1,1),(2,2); +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine partition for table 'test'.'t1' is not supported in Galera +INSERT INTO t2 VALUES (1),(2); +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. 
Storage engine partition for table 'test'.'t2' is not supported in Galera +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +ERROR HY000: Galera replication not supported +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +ERROR HY000: Galera replication not supported +UPDATE t1 SET v2 = v2 + 3; +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine partition for table 'test'.'t1' is not supported in Galera +UPDATE t2 SET v1 = v1 + 3; +Warnings: +Warning 1290 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine partition for table 'test'.'t2' is not supported in Galera +CREATE INDEX xx1 ON t1(v2); +CREATE INDEX xx2 ON t2(v2); +ERROR HY000: Galera replication not supported +DROP INDEX xx1 ON t1; +DROP INDEX xx2 on t2; +ERROR HY000: Galera replication not supported +TRUNCATE TABLE t1; +TRUNCATE TABLE t2; +ERROR HY000: Galera replication not supported +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +RENAME TABLE t2_v2 TO t2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +CREATE VIEW x2 AS SELECT * FROM t2; +ERROR HY000: Galera replication not supported +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 +AFTER INSERT ON t1_v2 FOR EACH ROW +UPDATE t1_v2 SET t1_v2.v2 = t1_v2.v2+1; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t2 +AFTER INSERT ON t2 FOR EACH ROW +UPDATE t2 SET t2.v1 = t2.v1+1; +ERROR HY000: Galera replication not supported +connection node_2; +SHOW CREATE TABLE t1_v2; +Table Create Table +t1_v2 CREATE TABLE `t1_v2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `v1` int(11) NOT NULL, + `v2` int(11) DEFAULT NULL, + PRIMARY KEY (`v1`) +) ENGINE=MyISAM DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci + PARTITION BY KEY (`v1`) +PARTITIONS 2 +SHOW CREATE VIEW x1; +View Create View 
character_set_client collation_connection +x1 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `x1` AS select `t1_v2`.`v1` AS `v1`,`t1_v2`.`v2` AS `v2` from `t1_v2` latin1 latin1_swedish_ci +SELECT * FROM t1_v2; +v1 v2 +SELECT * FROM t2; +v1 v2 +connection node_1; +DROP VIEW x1; +DROP TRIGGER increment_before_t1; +DROP TABLE t1_v2; +DROP TABLE t2; +SET GLOBAL wsrep_mode = ""; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM +PARTITION BY KEY (v1) +PARTITIONS 2; +# wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +@@wsrep_mode +STRICT_REPLICATION +ALTER TABLE t2 ENGINE=InnoDB; +DROP TABLE t2; +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_restart_replica.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_restart_replica.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_restart_replica.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_restart_replica.result 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; connection node_1; connection replica; +connection primary; connection replica; START SLAVE; connection primary; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequence_engine.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequence_engine.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequence_engine.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequence_engine.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,10 @@ connection node_2; connection node_1; +connection node_2; +SET GLOBAL wsrep_ignore_apply_errors=0; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_2a; +SET SESSION wsrep_sync_wait=0; SET GLOBAL wsrep_ignore_apply_errors=0; SET SESSION AUTOCOMMIT=0; SET SESSION 
max_error_count=0; @@ -8,5 +13,4 @@ connection node_2; SHOW CREATE TABLE t0; ERROR 42S02: Table 'test.t0' doesn't exist -connection node_1; -SET GLOBAL wsrep_ignore_apply_errors=DEFAULT; +disconnect node_2a; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences,binlogoff.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,11 @@ +--- r/galera_sequences.result ++++ r/galera_sequences,binlogoff.reject +@@ -313,7 +313,7 @@ + 7 4 + SELECT NEXTVAL(t); + NEXTVAL(t) +-42 ++2 + connection node_1; + DROP TABLE t1; + DROP SEQUENCE t; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences.result 2025-05-19 16:14:24.000000000 +0000 @@ -47,6 +47,9 @@ NEXT VALUE FOR Seq1_1 4 connection node_1; +SHOW CREATE SEQUENCE Seq1_1; +Table Create Table +Seq1_1 CREATE SEQUENCE `Seq1_1` start with 1 minvalue 1 maxvalue 9223372036854775806 increment by 1 nocache nocycle ENGINE=InnoDB DROP SEQUENCE Seq1_1; connection node_1; CREATE TABLE t2 (d CHAR(1)KEY); @@ -279,6 +282,9 @@ connection node_1; DROP TABLE t1; DROP SEQUENCE t; +connection node_2; +SET SESSION wsrep_sync_wait=15; +connection node_1; CREATE SEQUENCE t INCREMENT BY 0 CACHE=20 ENGINE=INNODB; CREATE TABLE t1(a int not null primary key default nextval(t), b int) engine=innodb; BEGIN; @@ -324,4 +330,14 @@ ALTER SEQUENCE IF EXISTS t MINVALUE=1; ERROR 42000: This version of MariaDB doesn't yet support 'CACHE without INCREMENT BY 0 in Galera cluster' DROP TABLE t; + +MDEV-32631: + +CREATE OR REPLACE 
TABLE t1(c INT ) ENGINE=ARIA; +SET SESSION WSREP_OSU_METHOD=RSU; +INSERT INTO t1 SELECT seq,concat(seq,1) FROM seq_1_to_100; +ERROR 42000: This version of MariaDB doesn't yet support 'RSU on this table engine' +SET SESSION WSREP_OSU_METHOD=TOI; +DROP TABLE t1; + End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_bf_kill.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_bf_kill.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_bf_kill.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_bf_kill.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,152 @@ +connection node_2; +connection node_1; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY, f2 INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, 0), (3, 0); +connection node_1; +START TRANSACTION; +INSERT INTO t1 VALUES (4, next value for s); +INSERT INTO t1 VALUES (5, next value for s); +INSERT INTO t1 VALUES (6, next value for s); +INSERT INTO t1 VALUES (7, next value for s); +INSERT INTO t1 VALUES (8, next value for s); +INSERT INTO t1 VALUES (9, next value for s); +INSERT INTO t1 VALUES (10, next value for s); +INSERT INTO t1 VALUES (11, next value for s); +INSERT INTO t1 VALUES (12, next value for s); +INSERT INTO t1 VALUES (13, next value for s); +INSERT INTO t1 VALUES (14, next value for s); +SELECT * FROM t1 WHERE f1 > 0 FOR UPDATE; +f1 f2 +1 0 +3 0 +4 1 +5 3 +6 5 +7 7 +8 9 +9 11 +10 13 +11 15 +12 17 +13 19 +14 21 +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync'; +connection node_2; +INSERT INTO t1 VALUES (2, 2); +connection node_1a; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 
'dbug=d,commit_monitor_master_enter_sync'; +connection node_1; +COMMIT; +connection node_1a; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 'dbug=d,abort_trx_end'; +SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 'signal=abort_trx_end'; +SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync'; +connection node_1; +wsrep_local_replays +1 +INSERT INTO t1 VALUES (22, next value for s); +INSERT INTO t1 VALUES (23, next value for s); +INSERT INTO t1 VALUES (24, next value for s); +INSERT INTO t1 VALUES (25, next value for s); +INSERT INTO t1 VALUES (26, next value for s); +INSERT INTO t1 VALUES (27, next value for s); +INSERT INTO t1 VALUES (28, next value for s); +INSERT INTO t1 VALUES (29, next value for s); +INSERT INTO t1 VALUES (30, next value for s); +INSERT INTO t1 VALUES (31, next value for s); +INSERT INTO t1 VALUES (32, next value for s); +INSERT INTO t1 VALUES (33, next value for s); +INSERT INTO t1 VALUES (34, next value for s); +INSERT INTO t1 VALUES (35, next value for s); +connection node_1; +SELECT * FROM t1; +f1 f2 +1 0 +2 2 +3 0 +4 1 +5 3 +6 5 +7 7 +8 9 +9 11 +10 13 +11 15 +12 17 +13 19 +14 21 +22 31 +23 33 +24 35 +25 37 +26 39 +27 41 +28 43 +29 45 +30 47 +31 49 +32 51 +33 53 +34 55 +35 57 +SELECT LASTVAL(s); +LASTVAL(s) +57 +connection node_2; +SELECT * FROM t1; +f1 f2 +1 0 +2 2 +3 0 +4 1 +5 3 +6 5 +7 7 +8 9 +9 11 +10 13 +11 15 +12 17 +13 19 +14 21 +22 31 +23 33 +24 35 +25 37 +26 39 +27 41 +28 43 +29 45 +30 47 +31 49 +32 51 +33 53 +34 55 +35 57 +SELECT LASTVAL(s); +LASTVAL(s) +NULL +connection node_1; +SELECT NEXTVAL(s); +NEXTVAL(s) +59 +connection node_2; +SELECT NEXTVAL(s); +NEXTVAL(s) +62 +DROP SEQUENCE s; +DROP TABLE t1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_transaction.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_transaction.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sequences_transaction.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sequences_transaction.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,350 @@ +connection node_2; +connection node_1; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_1; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_2; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_2a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_1a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO 
t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; +connection node_2; +SELECT LASTVAL(s); +LASTVAL(s) +40 +connection node_1; +SELECT LASTVAL(s); +LASTVAL(s) +19 +connection node_2a; +SELECT LASTVAL(s); +LASTVAL(s) +60 +connection node_1a; +SELECT LASTVAL(s); +LASTVAL(s) +79 +connection node_1; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +22 1 +24 1 +26 1 +28 1 +30 1 +32 1 +34 1 +36 1 +38 1 +40 1 +42 1 +44 1 +46 1 +48 1 +50 1 +52 1 +54 1 +56 1 +58 1 +60 1 +61 1 +63 1 +65 1 +67 1 +69 1 +71 1 +73 1 +75 1 +77 1 +79 1 +connection node_2; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +22 1 +24 1 +26 1 +28 1 +30 1 +32 1 +34 1 +36 1 +38 1 +40 1 +42 1 +44 1 +46 1 +48 1 +50 1 +52 1 +54 1 +56 1 +58 1 +60 1 +61 1 +63 1 +65 1 +67 1 +69 1 +71 1 +73 1 +75 1 +77 1 +79 1 +connection node_1; +DROP TABLE t1; +DROP SEQUENCE s; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; +connection node_1; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_2; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_2a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); 
+INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_1a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; +connection node_2; +SELECT LASTVAL(s); +LASTVAL(s) +20 +connection node_1; +SELECT LASTVAL(s); +LASTVAL(s) +19 +connection node_2a; +SELECT LASTVAL(s); +LASTVAL(s) +40 +connection node_1a; +SELECT LASTVAL(s); +LASTVAL(s) +39 +connection node_1; +SELECT * FROM t1; +f1 f2 +connection node_2; +SELECT * FROM t1; +f1 f2 +connection node_1; +DROP TABLE t1; +DROP SEQUENCE s; +connection node_1; +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; +connection node_1; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_1a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_2a; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) 
values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_2; +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +connection node_1; +COMMIT; +connection node_1a; +ROLLBACK; +connection node_2; +COMMIT; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +connection node_2a; +ROLLBACK; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +connection node_2; +SELECT LASTVAL(s); +LASTVAL(s) +40 +connection node_1; +SELECT LASTVAL(s); +LASTVAL(s) +19 +connection node_2a; +SELECT LASTVAL(s); +LASTVAL(s) +20 +connection node_1a; +SELECT LASTVAL(s); +LASTVAL(s) +39 +connection node_1; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +connection node_2; +SELECT * FROM t1; +f1 f2 +1 1 +3 1 +5 1 +7 1 +9 1 +11 1 +13 1 +15 1 +17 1 +19 1 +connection node_1; +DROP TABLE t1; +DROP SEQUENCE s; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_slave_replay.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_slave_replay.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_slave_replay.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_slave_replay.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ -connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; -connection node_2a; connection node_2; connection node_1; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_2a; ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3; connection node_3; diff 
-Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_split_brain.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_split_brain.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_split_brain.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_split_brain.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,7 @@ connection node_1; connection node_1; connection node_2; +connection node_2; call mtr.add_suppression("WSREP: TO isolation failed for: "); connection node_1; call mtr.add_suppression("CREATE TABLE isolation failure"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; +expect 0 +0 SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; VARIABLE_VALUE = 'Synced' 1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_cipher.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_cipher.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_cipher.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_cipher.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,30 @@ +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_1; +connection node_2; +SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +VARIABLE_VALUE = 'Synced' +1 +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS 
WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_1; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_2; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_1; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +VARIABLE_VALUE = 2 +1 +connection node_2; +connection node_1; +call mtr.add_suppression("WSREP: write_handler\\(\\)"); +connection node_2; +call mtr.add_suppression("WSREP: write_handler\\(\\)"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_compression.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_compression.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_compression.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_compression.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; +expect 0 +0 SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; VARIABLE_VALUE = 'Synced' 1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_upgrade.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_upgrade.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_ssl_upgrade.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_ssl_upgrade.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,8 @@ connection node_2; connection node_1; +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; +expect 0 +0 connection node_1; connection node_2; connection node_1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ ---- galera/r/galera_sst_mariabackup.result 2024-04-11 09:53:12.950512316 +0300 -+++ galera/r/galera_sst_mariabackup,debug.reject 2024-04-11 10:00:36.771144955 +0300 -@@ -524,6 +524,190 @@ +--- r/galera_sst_mariabackup.result ++++ r/galera_sst_mariabackup,debug.reject +@@ -516,5 +516,189 @@ 1 DROP TABLE t1; COMMIT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_force_recovery,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ ---- r/galera_sst_mariabackup.result -+++ r/galera_sst_mariabackup,debug.reject +--- r/galera_sst_mariabackup_force_recovery.result ++++ r/galera_sst_mariabackup_force_recovery,debug.reject @@ -516,5 +516,189 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,210 @@ +--- r/galera_sst_mariabackup_gtid.result ++++ r/galera_sst_mariabackup_gtid,debug.reject +@@ -516,19 +516,203 @@ + 1 + DROP 
TABLE t1; + COMMIT; ++Performing State Transfer on a server that has been killed and restarted ++while a DDL was in progress on it ++connection node_1; ++CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 VALUES (1,'node1_committed_before'); ++INSERT INTO t1 VALUES (2,'node1_committed_before'); ++INSERT INTO t1 VALUES (3,'node1_committed_before'); ++INSERT INTO t1 VALUES (4,'node1_committed_before'); ++INSERT INTO t1 VALUES (5,'node1_committed_before'); ++connection node_2; ++START TRANSACTION; ++INSERT INTO t1 VALUES (6,'node2_committed_before'); ++INSERT INTO t1 VALUES (7,'node2_committed_before'); ++INSERT INTO t1 VALUES (8,'node2_committed_before'); ++INSERT INTO t1 VALUES (9,'node2_committed_before'); ++INSERT INTO t1 VALUES (10,'node2_committed_before'); ++COMMIT; ++SET GLOBAL debug_dbug = 'd,sync.alter_opened_table'; ++connection node_1; ++ALTER TABLE t1 ADD COLUMN f2 INTEGER; ++connection node_2; ++SET wsrep_sync_wait = 0; ++Killing server ... 
++connection node_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (11,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (12,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (13,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (14,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (15,'node1_committed_during'); ++COMMIT; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (16,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (17,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (18,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (19,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (20,'node1_to_be_committed_after'); ++connect node_1a_galera_st_kill_slave_ddl, 127.0.0.1, root, , test, $NODE_MYPORT_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (21,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (22,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (23,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (24,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (25,'node1_to_be_rollbacked_after'); ++connection node_2; ++Performing --wsrep-recover ... ++connection node_2; ++Starting server ... ++Using --wsrep-start-position when starting mysqld ... 
++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (26,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (27,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (28,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (29,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (30,'node2_committed_after'); ++COMMIT; ++connection node_1; ++INSERT INTO t1 (id,f1) VALUES (31,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (32,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (33,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (34,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (35,'node1_to_be_committed_after'); ++COMMIT; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (36,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (37,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (38,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (39,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (40,'node1_committed_after'); ++COMMIT; ++connection node_1a_galera_st_kill_slave_ddl; ++INSERT INTO t1 (id,f1) VALUES (41,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (42,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (43,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (44,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (45,'node1_to_be_rollbacked_after'); ++ROLLBACK; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before 
NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++COMMIT; ++connection node_1; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 
node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++DROP TABLE t1; ++COMMIT; ++SET GLOBAL debug_dbug = $debug_orig; + connection node_1; + # Node_1 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + connection node_2; + # Node_2 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + disconnect node_2; + disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_gtid.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,534 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +Performing State Transfer on a server that has been shut down cleanly and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES 
(1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_shutdown_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_shutdown_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that starts from a clean var directory +This is accomplished by shutting down node #2 and removing its var directory before restarting 
it +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +Cleaning var directory ... +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_clean_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_clean_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that has been killed and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET 
AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Killing server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_kill_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Performing --wsrep-recover ... +Starting server ... +Using --wsrep-start-position when starting mysqld ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_kill_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (46,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 
node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +connection node_1; +# Node_1 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +connection 
node_2; +# Node_2 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +disconnect node_2; +disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_logarchive,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ --- r/galera_sst_mariabackup_logarchive.result -+++ r/galera_sst_mariabackup_logarchive.reject ++++ r/galera_sst_mariabackup_logarchive,debug.reject @@ -516,5 +516,189 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mariabackup_use_memory.result 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,6 @@ Cleaning var directory ... connection node_2; Starting server ... 
-include/assert_grep.inc [mariabackup: Using 128974848 bytes for buffer pool \(set by --use-memory parameter\)] +include/assert_grep.inc [mariabackup: Using 134217728 bytes for buffer pool \(set by --use-memory parameter\)] disconnect node_2; disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,12 @@ --- r/galera_sst_mysqldump.result +++ r/galera_sst_mysqldump,debug.reject -@@ -698,11 +698,195 @@ +@@ -698,6 +698,190 @@ 1 DROP TABLE t1; COMMIT; +Performing State Transfer on a server that has been killed and restarted +while a DDL was in progress on it - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); ++connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; @@ -189,12 +188,6 @@ +DROP TABLE t1; +COMMIT; +SET GLOBAL debug_dbug = $debug_orig; -+connection node_1; -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); + connection node_1; + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables 
option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/galera_sst_mysqldump.result -+++ r/galera_sst_mysqldump.reject -@@ -699,10 +699,10 @@ - DROP TABLE t1; - COMMIT; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump.result 2025-01-30 
11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -699,10 +699,10 @@ DROP TABLE t1; COMMIT; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,12 @@ --- r/galera_sst_mysqldump_with_key.result +++ r/galera_sst_mysqldump_with_key,debug.reject -@@ -358,11 +358,195 @@ +@@ -358,6 +358,190 @@ 1 DROP TABLE t1; COMMIT; +Performing State Transfer on a server that has been killed and restarted +while a DDL was in progress on it - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); 
++connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; @@ -189,12 +188,6 @@ +DROP TABLE t1; +COMMIT; +SET GLOBAL debug_dbug = $debug_orig; -+connection node_1; -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); + connection node_1; + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/galera_sst_mysqldump_with_key.result -+++ r/galera_sst_mysqldump_with_key.reject -@@ -359,10 +359,10 @@ - DROP TABLE t1; - COMMIT; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The 
MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_mysqldump_with_key.result 2025-05-19 16:14:24.000000000 +0000 @@ -359,10 +359,10 @@ DROP TABLE t1; COMMIT; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone 
table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ --- galera_sst_rsync.result -+++ galera_sst_rsync.reject ++++ galera_sst_rsync,debug.reject @@ -516,3 +516,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,210 @@ +--- r/galera_sst_rsync_gtid.result ++++ r/galera_sst_rsync_gtid,debug.reject +@@ -516,19 +516,203 @@ + 1 + DROP TABLE t1; + COMMIT; ++Performing State Transfer on a server that has been killed and restarted ++while a DDL was in progress on it ++connection node_1; ++CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 VALUES (1,'node1_committed_before'); ++INSERT INTO t1 VALUES (2,'node1_committed_before'); ++INSERT INTO t1 VALUES (3,'node1_committed_before'); ++INSERT INTO t1 VALUES (4,'node1_committed_before'); ++INSERT INTO t1 VALUES (5,'node1_committed_before'); ++connection node_2; ++START TRANSACTION; ++INSERT INTO t1 VALUES (6,'node2_committed_before'); ++INSERT INTO t1 VALUES (7,'node2_committed_before'); ++INSERT INTO t1 VALUES (8,'node2_committed_before'); ++INSERT INTO t1 VALUES (9,'node2_committed_before'); ++INSERT INTO t1 VALUES (10,'node2_committed_before'); ++COMMIT; ++SET 
GLOBAL debug_dbug = 'd,sync.alter_opened_table'; ++connection node_1; ++ALTER TABLE t1 ADD COLUMN f2 INTEGER; ++connection node_2; ++SET wsrep_sync_wait = 0; ++Killing server ... ++connection node_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (11,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (12,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (13,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (14,'node1_committed_during'); ++INSERT INTO t1 (id,f1) VALUES (15,'node1_committed_during'); ++COMMIT; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (16,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (17,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (18,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (19,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (20,'node1_to_be_committed_after'); ++connect node_1a_galera_st_kill_slave_ddl, 127.0.0.1, root, , test, $NODE_MYPORT_1; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (21,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (22,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (23,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (24,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (25,'node1_to_be_rollbacked_after'); ++connection node_2; ++Performing --wsrep-recover ... ++connection node_2; ++Starting server ... ++Using --wsrep-start-position when starting mysqld ... 
++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (26,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (27,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (28,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (29,'node2_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (30,'node2_committed_after'); ++COMMIT; ++connection node_1; ++INSERT INTO t1 (id,f1) VALUES (31,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (32,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (33,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (34,'node1_to_be_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (35,'node1_to_be_committed_after'); ++COMMIT; ++SET AUTOCOMMIT=OFF; ++START TRANSACTION; ++INSERT INTO t1 (id,f1) VALUES (36,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (37,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (38,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (39,'node1_committed_after'); ++INSERT INTO t1 (id,f1) VALUES (40,'node1_committed_after'); ++COMMIT; ++connection node_1a_galera_st_kill_slave_ddl; ++INSERT INTO t1 (id,f1) VALUES (41,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (42,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (43,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (44,'node1_to_be_rollbacked_after'); ++INSERT INTO t1 (id,f1) VALUES (45,'node1_to_be_rollbacked_after'); ++ROLLBACK; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before 
NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++COMMIT; ++connection node_1; ++SET AUTOCOMMIT=ON; ++SET SESSION wsrep_sync_wait=15; ++SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; ++EXPECT_3 ++3 ++SELECT COUNT(*) AS EXPECT_35 FROM t1; ++EXPECT_35 ++35 ++SELECT * FROM t1; ++id f1 f2 ++1 node1_committed_before NULL ++2 node1_committed_before NULL ++3 node1_committed_before NULL ++4 node1_committed_before NULL ++5 node1_committed_before NULL ++6 node2_committed_before NULL ++7 node2_committed_before NULL ++8 node2_committed_before NULL ++9 node2_committed_before NULL ++10 node2_committed_before NULL ++11 node1_committed_during NULL ++12 node1_committed_during NULL ++13 node1_committed_during NULL ++14 node1_committed_during NULL ++15 node1_committed_during NULL ++16 node1_to_be_committed_after NULL ++17 node1_to_be_committed_after NULL ++18 node1_to_be_committed_after NULL ++19 node1_to_be_committed_after NULL ++20 
node1_to_be_committed_after NULL ++26 node2_committed_after NULL ++27 node2_committed_after NULL ++28 node2_committed_after NULL ++29 node2_committed_after NULL ++30 node2_committed_after NULL ++31 node1_to_be_committed_after NULL ++32 node1_to_be_committed_after NULL ++33 node1_to_be_committed_after NULL ++34 node1_to_be_committed_after NULL ++35 node1_to_be_committed_after NULL ++36 node1_committed_after NULL ++37 node1_committed_after NULL ++38 node1_committed_after NULL ++39 node1_committed_after NULL ++40 node1_committed_after NULL ++SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; ++COUNT(*) = 0 ++1 ++DROP TABLE t1; ++COMMIT; ++SET GLOBAL debug_dbug = $debug_orig; + connection node_1; + # Node_1 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + connection node_2; + # Node_2 + SHOW global variables like 'gtid%pos'; + Variable_name Value +-gtid_binlog_pos 100-10-24 +-gtid_current_pos 100-10-24 ++gtid_binlog_pos 100-10-33 ++gtid_current_pos 100-10-33 + gtid_slave_pos + disconnect node_2; + disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_gtid.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,534 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +Performing State Transfer on a server that has been shut down cleanly and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES 
(2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_shutdown_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_shutdown_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_15 FROM t1; +EXPECT_15 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that starts from a clean var directory +This is accomplished by shutting down node #2 and removing its var directory before restarting 
it +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Shutting down server ... +connection node_1; +Cleaning var directory ... +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_clean_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Starting server ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_clean_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (44,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after 
+26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * from t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +Performing State Transfer on a server that has been killed and restarted +connection node_1; +CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB; +SET 
AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (1,'node1_committed_before'); +INSERT INTO t1 VALUES (2,'node1_committed_before'); +INSERT INTO t1 VALUES (3,'node1_committed_before'); +INSERT INTO t1 VALUES (4,'node1_committed_before'); +INSERT INTO t1 VALUES (5,'node1_committed_before'); +COMMIT; +connection node_2; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (6,'node2_committed_before'); +INSERT INTO t1 VALUES (7,'node2_committed_before'); +INSERT INTO t1 VALUES (8,'node2_committed_before'); +INSERT INTO t1 VALUES (9,'node2_committed_before'); +INSERT INTO t1 VALUES (10,'node2_committed_before'); +COMMIT; +Killing server ... +connection node_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (11,'node1_committed_during'); +INSERT INTO t1 VALUES (12,'node1_committed_during'); +INSERT INTO t1 VALUES (13,'node1_committed_during'); +INSERT INTO t1 VALUES (14,'node1_committed_during'); +INSERT INTO t1 VALUES (15,'node1_committed_during'); +COMMIT; +START TRANSACTION; +INSERT INTO t1 VALUES (16,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (17,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (18,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (19,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (20,'node1_to_be_committed_after'); +connect node_1a_galera_st_kill_slave, 127.0.0.1, root, , test, $NODE_MYPORT_1; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (21,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (22,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (23,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (24,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (25,'node1_to_be_rollbacked_after'); +connection node_2; +Performing --wsrep-recover ... +Starting server ... +Using --wsrep-start-position when starting mysqld ... 
+SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (26,'node2_committed_after'); +INSERT INTO t1 VALUES (27,'node2_committed_after'); +INSERT INTO t1 VALUES (28,'node2_committed_after'); +INSERT INTO t1 VALUES (29,'node2_committed_after'); +INSERT INTO t1 VALUES (30,'node2_committed_after'); +COMMIT; +connection node_1; +INSERT INTO t1 VALUES (31,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (32,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (33,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (34,'node1_to_be_committed_after'); +INSERT INTO t1 VALUES (35,'node1_to_be_committed_after'); +COMMIT; +SET AUTOCOMMIT=OFF; +START TRANSACTION; +INSERT INTO t1 VALUES (36,'node1_committed_after'); +INSERT INTO t1 VALUES (37,'node1_committed_after'); +INSERT INTO t1 VALUES (38,'node1_committed_after'); +INSERT INTO t1 VALUES (39,'node1_committed_after'); +INSERT INTO t1 VALUES (40,'node1_committed_after'); +COMMIT; +connection node_1a_galera_st_kill_slave; +INSERT INTO t1 VALUES (41,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (42,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (43,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (45,'node1_to_be_rollbacked_after'); +INSERT INTO t1 VALUES (46,'node1_to_be_rollbacked_after'); +ROLLBACK; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 
node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +COMMIT; +connection node_1; +SET AUTOCOMMIT=ON; +SET SESSION wsrep_sync_wait=15; +SELECT COUNT(*) AS EXPECT_35 FROM t1; +EXPECT_35 +35 +SELECT * FROM t1; +id f1 +1 node1_committed_before +2 node1_committed_before +3 node1_committed_before +4 node1_committed_before +5 node1_committed_before +6 node2_committed_before +7 node2_committed_before +8 node2_committed_before +9 node2_committed_before +10 node2_committed_before +11 node1_committed_during +12 node1_committed_during +13 node1_committed_during +14 node1_committed_during +15 node1_committed_during +16 node1_to_be_committed_after +17 node1_to_be_committed_after +18 node1_to_be_committed_after +19 node1_to_be_committed_after +20 node1_to_be_committed_after +26 node2_committed_after +27 node2_committed_after +28 node2_committed_after +29 node2_committed_after +30 node2_committed_after +31 node1_to_be_committed_after +32 node1_to_be_committed_after +33 node1_to_be_committed_after +34 node1_to_be_committed_after +35 node1_to_be_committed_after +36 node1_committed_after +37 node1_committed_after +38 node1_committed_after +39 node1_committed_after +40 node1_committed_after +SELECT COUNT(*) = 0 FROM (SELECT COUNT(*) AS c, f1 FROM t1 GROUP BY f1 HAVING c NOT IN (5, 10)) AS a1; +COUNT(*) = 0 +1 +DROP TABLE t1; +COMMIT; +connection node_1; +# Node_1 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +connection 
node_2; +# Node_2 +SHOW global variables like 'gtid%pos'; +Variable_name Value +gtid_binlog_pos 100-10-24 +gtid_current_pos 100-10-24 +gtid_slave_pos +disconnect node_2; +disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_sst_rsync_recv_auto,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,5 @@ +--- r/galera_sst_rsync_recv_auto.result ++++ r/galera_sst_rsync_recv_auto,debug.reject @@ -516,3 +516,187 @@ 1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_innodb.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_innodb.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_innodb.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_innodb.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ connection node_2; connection node_1; -call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine .*"); +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. 
Storage engine "); CREATE TABLE t1(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=MYISAM; CREATE TABLE t3(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=ARIA; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_primary_key.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_primary_key.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_strict_require_primary_key.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_strict_require_primary_key.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ connection node_2; connection node_1; -call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled. Table .*"); +call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled\\. Table "); CREATE TABLE t1(a int, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int, b varchar(50)) ENGINE=MYISAM; CREATE TABLE t3(a int, b varchar(50)) ENGINE=MEMORY; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_toi_ddl_nonconflicting.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,29 +1,69 @@ connection node_2; connection node_1; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY AUTO_INCREMENT, f2 INTEGER); +INSERT INTO t1(f2) SELECT seq FROM seq_1_to_1000; +connection node_2a; +SET SESSION wsrep_sync_wait=0; +connection node_1a; +# Block the applier on node_1 and issue a ddl from node_2 +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 
'dbug=d,apply_monitor_slave_enter_sync'; connection node_2; -ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 123);; +# DDL 1 +ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 VALUES (NULL, 10000, 10000);; +connection node_1a; +SET SESSION wsrep_on = 0; +SET SESSION wsrep_on = 1; +SET GLOBAL wsrep_provider_options = 'dbug='; +# This will block on acquiring total order isolation connection node_1; +# DDL 2 CREATE UNIQUE INDEX i1 ON t1(f2);; +connection node_1a; +# Signal DDL 1 +SET GLOBAL wsrep_provider_options = 'dbug='; +SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync'; +connection node_2; +connection node_1; connection node_2; -INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 234); -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -COUNT(*) = 3 -1 -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -COUNT(*) = 2 -1 -SELECT COUNT(*) = 2 FROM t1; -COUNT(*) = 2 -1 +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +EXPECT_3 +3 +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; +EXPECT_2 +2 +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `f1` int(11) NOT NULL AUTO_INCREMENT, + `f2` int(11) DEFAULT NULL, + `f3` int(11) DEFAULT NULL, + PRIMARY KEY (`f1`), + UNIQUE KEY `i1` (`f2`) +) ENGINE=InnoDB AUTO_INCREMENT=2002 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +SELECT COUNT(*) AS EXPECT_1001 FROM t1; +EXPECT_1001 +1001 connection node_1; -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -COUNT(*) = 3 -1 -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -COUNT(*) = 2 -1 -SELECT COUNT(*) = 2 FROM t1; -COUNT(*) = 2 -1 +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +EXPECT_3 +3 +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; 
+EXPECT_2 +2 +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `f1` int(11) NOT NULL AUTO_INCREMENT, + `f2` int(11) DEFAULT NULL, + `f3` int(11) DEFAULT NULL, + PRIMARY KEY (`f1`), + UNIQUE KEY `i1` (`f2`) +) ENGINE=InnoDB AUTO_INCREMENT=2047 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci +SELECT COUNT(*) AS EXPECT_1001 FROM t1; +EXPECT_1001 +1001 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_replicate_myisam_on.result 2025-05-19 16:14:24.000000000 +0000 @@ -52,8 +52,8 @@ 0 DROP TABLE t1; connection node_1; -CREATE TABLE t1 (f1 INTEGER) ENGINE=MyISAM; -CREATE TABLE t2 (f1 INTEGER) ENGINE=InnoDB; +CREATE TABLE t1 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=MyISAM; +CREATE TABLE t2 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=InnoDB; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES (1); @@ -203,6 +203,9 @@ 3 200 4 5 connection node_2; +SELECT COUNT(*) FROM t1; +COUNT(*) +10 SELECT * FROM t1 ORDER BY id; id b 1 1 @@ -224,15 +227,29 @@ DROP TRIGGER tr1; DROP TRIGGER tr2; DROP TRIGGER tr3; -DROP TABLE t1,t2; +DROP TABLE t1, t2; +CREATE TABLE t1 (a INT, b INT, UNIQUE(a)) ENGINE=MyISAM; +CREATE TRIGGER tr1 BEFORE INSERT ON t1 FOR EACH ROW SET NEW.a=1; +INSERT INTO t1 (a,b) VALUES (10,20); +SELECT * from t1; +a b +1 20 +connection node_2; +SELECT * from t1; +a b +1 20 +connection node_1; +DROP TABLE t1; # # MDEV-11152: wsrep_replicate_myisam: SELECT gets replicated using TO # connection node_1; -CREATE TABLE t1 (i INT) ENGINE=INNODB; +CREATE TABLE t1 (i INT NOT NULL PRIMARY KEY) ENGINE=INNODB; INSERT INTO t1 VALUES(1); SELECT * FROM t1; i 1 DROP TABLE t1; -connection node_1; +SET GLOBAL wsrep_mode = DEFAULT; +connection node_2; +SET 
GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_slave_threads.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_slave_threads.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_var_slave_threads.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_var_slave_threads.result 2025-05-19 16:14:24.000000000 +0000 @@ -33,7 +33,6 @@ SELECT COUNT(*) FROM t2; COUNT(*) 70 -SET GLOBAL wsrep_slave_threads = 1; DROP TABLE t1; DROP TABLE t2; # diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_during_ist.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_during_ist.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_during_ist.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_during_ist.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,112 @@ +connection node_4; +connection node_3; +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_3; +connection node_4; +connection node_1; +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); +CREATE PROCEDURE p1(IN max INT) +BEGIN +DECLARE i INT; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET i = 0; +WHILE i < max DO +INSERT IGNORE INTO t1 VALUES (DEFAULT); +SET i = i + 1; +END WHILE; +END| +CALL p1(130); +connection node_4; +Shutting down server 4... 
+connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +Server 4 left the cluster +connection node_1; +CALL p1(130); +connection node_1; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_2; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_3; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +INSERT INTO t2 VALUES (DEFAULT); +CALL p1(130); +connection node_1; +SET GLOBAL debug = "+d,sync.wsrep_sst_donor_after_donation"; +Restarting server 4 +Wait for server 1 to become a donor +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached"; +Server 1 got SST request from server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_sst_donor_after_donation_continue"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; +Waiting for server 4 to leave the cluster +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_4; +Server 4 left the cluster, killing it... +Killed server 4... +Restarting server 4... 
+connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SELECT count(*) AS expect1_390 FROM t1; +expect1_390 +390 +SELECT count(*) AS expect1_1 FROM t2; +expect1_1 +1 +connection node_2; +SELECT count(*) AS expect2_390 FROM t1; +expect2_390 +390 +SELECT count(*) AS expect2_1 FROM t2; +expect2_1 +1 +connection node_3; +SELECT count(*) AS expect3_390 FROM t1; +expect3_390 +390 +SELECT count(*) AS expect3_1 FROM t2; +expect3_1 +1 +connection node_4; +SELECT count(*) AS expect4_390 FROM t1; +expect4_390 +390 +SELECT count(*) AS expect4_1 FROM t2; +expect4_1 +1 +DROP TABLE t1; +DROP TABLE t2; +DROP PROCEDURE p1; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Failed on preordered"); +CALL mtr.add_suppression("Failed to apply write set"); +CALL mtr.add_suppression("Sending JOIN failed: -103"); +CALL mtr.add_suppression("Failed to JOIN the cluster after SST"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_apply.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_apply.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_apply.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_apply.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,94 @@ +connection node_4; +connection node_3; +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_3; +connection node_4; +connection node_1; +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); +CREATE PROCEDURE p1(IN max INT) +BEGIN +DECLARE i INT; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET i = 0; +WHILE i < max DO +INSERT IGNORE INTO t1 VALUES (DEFAULT); +SET i = i + 1; +END WHILE; +END| +CALL p1(130); +connection 
node_4; +Shutting down server 4... +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +SET GLOBAL debug = "+d,sync.wsrep_donor_state"; +connection node_4; +Restarting server 4... +connection node_1; +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached"; +Tables on server 1 flushed and locked for SST to server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; +Wait for the state snapshot to be copied to server 4 +SST script unlocked server 1 +connection node_1; +CALL p1(130); +connection node_1; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_2; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +connection node_3; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +INSERT INTO t2 VALUES (DEFAULT); +CALL p1(130); +Waiting for server 4 to leave the cluster +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_4; +Server 4 left the cluster, killing it... +Killed server 4... +Restarting server 4... 
+DROP TABLE t2; +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SELECT count(*) AS expect1_390 FROM t1; +expect1_390 +390 +connection node_2; +SELECT count(*) AS expect2_390 FROM t1; +expect2_390 +390 +connection node_3; +SELECT count(*) AS expect3_390 FROM t1; +expect3_390 +390 +connection node_4; +SELECT count(*) AS expect4_390 FROM t1; +expect4_390 +390 +DROP TABLE t1; +DROP PROCEDURE p1; +connection node_4; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Inconsistent by consensus"); +CALL mtr.add_suppression("Failed to apply write set: gtid:"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_skip.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_skip.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_vote_joined_skip.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_vote_joined_skip.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,102 @@ +connection node_4; +connection node_3; +connection node_2; +connection node_1; +# Correct Galera library found +connection node_1; +connection node_2; +connection node_3; +connection node_4; +connection node_1; +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); +CREATE PROCEDURE p1(IN max INT) +BEGIN +DECLARE i INT; +DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; +SET i = 0; +WHILE i < max DO +INSERT IGNORE INTO t1 VALUES (DEFAULT); +SET i = i + 1; +END WHILE; +END| +CALL p1(130); +connection node_4; +Shutting down server 4... +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +SET GLOBAL debug = "+d,sync.wsrep_donor_state"; +connection node_4; +Restarting server 4... 
+connection node_1; +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached"; +Tables on server 1 flushed and locked for SST to server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; +Wait for the state snapshot to be copied to server 4 +SST script unlocked server 1 +connection node_1; +CALL p1(130); +connection node_3; +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; +INSERT INTO t2 VALUES (DEFAULT); +SET SESSION wsrep_on = OFF; +connection node_1; +CALL p1(130); +Waiting for server 3 to leave the cluster +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_2; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_4; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +Server 3 left the cluster, killing it... +Killed server 3. +Restarting server 3... +Waiting for server 3 to rejoin the cluster +connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_3; +sleeping for 20 +Waiting ready +Server 3 restarted. 
+connection node_1; +SET SESSION wsrep_on = ON; +SET SESSION wsrep_sync_wait = 15; +connection node_1; +SELECT count(*) AS expect1_390 FROM t1; +expect1_390 +390 +connection node_2; +SELECT count(*) AS expect2_390 FROM t1; +expect2_390 +390 +connection node_3; +SELECT count(*) AS expect3_390 FROM t1; +expect3_390 +390 +connection node_4; +SELECT count(*) AS expect4_390 FROM t1; +expect4_390 +390 +DROP TABLE t1; +DROP PROCEDURE p1; +connection node_1; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +connection node_2; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +connection node_3; +CALL mtr.add_suppression("Vote 0 \\(success\\) on .+ is inconsistent with group"); +connection node_4; +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_wan.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_wan.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_wan.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_wan.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,9 @@ connection node_2; connection node_1; -CALL mtr.add_suppression("WSREP: Stray state UUID msg:"); -CALL mtr.add_suppression("Sending JOIN failed: "); -CALL mtr.add_suppression("WSREP: .* sending install message failed: Socket is not connected"); -CALL mtr.add_suppression("There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); +CALL mtr.add_suppression("WSREP: Stray state UUID msg: "); +CALL mtr.add_suppression("WSREP: .*Sending JOIN failed: "); +CALL mtr.add_suppression("WSREP: .*sending 
install message failed: (Transport endpoint|Socket) is not connected"); +CALL mtr.add_suppression("WSREP: .*There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); SELECT VARIABLE_VALUE = 4 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; VARIABLE_VALUE = 4 1 @@ -36,8 +36,8 @@ 1 DROP TABLE t1; connection node_1; -call mtr.add_suppression("WSREP: read_completion_condition.*"); -call mtr.add_suppression("WSREP: read_handler.*"); +call mtr.add_suppression("WSREP: read_completion_condition"); +call mtr.add_suppression("WSREP: read_handler"); disconnect node_3; disconnect node_4; disconnect node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_provider_options_syntax.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ connection node_2; connection node_1; -call mtr.add_suppression("WSREP\: Unknown parameter 'gmcasts\\.segment'"); -call mtr.add_suppression("WSREP\: Set options returned 7"); +call mtr.add_suppression("WSREP: Unknown parameter 'gmcasts\\.segment'"); +call mtr.add_suppression("WSREP: Set options returned 7"); SET GLOBAL wsrep_provider_options="gmcasts.segment=1"; ERROR HY000: Incorrect arguments to SET Unhandled exceptions: 0 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result --- mariadb-10.11.11/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/galera_wsrep_schema_detached.result 2025-05-19 16:14:24.000000000 +0000 @@ -3,10 
+3,17 @@ connection node_1; connection node_2; connection node_1; -call mtr.add_suppression("WSREP:.*"); +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); SET @wsrep_provider_options_orig = @@GLOBAL.wsrep_provider_options; SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true;pc.weight=2'; connection node_2; +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); SET @wsrep_cluster_address_orig = @@GLOBAL.wsrep_cluster_address; SET GLOBAL WSREP_ON=0; SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mdev-29775.result mariadb-10.11.13/mysql-test/suite/galera/r/mdev-29775.result --- mariadb-10.11.11/mysql-test/suite/galera/r/mdev-29775.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mdev-29775.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,84 @@ +connection node_2; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; 
+INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; +INSERT INTO t VALUES(); +SELECT * FROM t; +f0 +NULL +connection node_2; +SELECT * FROM t; +f0 +NULL +DROP TABLE t; +connection node_1; +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +ERROR HY000: wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_mode = REPLICATE_ARIA; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_mode = REPLICATE_ARIA; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL 
wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +ERROR HY000: wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] can't be enabled if wsrep_forced_binlog != [NONE|ROW] +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL wsrep_mode=DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mdev-30653.result mariadb-10.11.13/mysql-test/suite/galera/r/mdev-30653.result --- mariadb-10.11.11/mysql-test/suite/galera/r/mdev-30653.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mdev-30653.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ create table t2 (id serial, val int) engine=aria; insert into t1 values(1, 23); insert into t2 values(2, 42); -call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental. Storage engine Aria for table 'test'.'t2' is not supported in Galera"); +call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental\\. 
Storage engine Aria for table 'test'\\.'t2' is not supported in Galera"); begin; update t1 set val=24 where id=1; update t2 set val=41 where id=2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#198.result mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#198.result --- mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#198.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#198.result 2025-05-19 16:14:24.000000000 +0000 @@ -31,3 +31,6 @@ test.t2 repair note The storage engine for the table doesn't support repair DROP TABLE t1; DROP TABLE t2; +connection node_1; +disconnect node_2a; +disconnect node_2b; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,debug.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ --- r/mysql-wsrep#33.result +++ r/mysql-wsrep#33,debug.reject -@@ -698,12 +698,196 @@ +@@ -698,6 +698,190 @@ 1 DROP TABLE t1; COMMIT; @@ -190,12 +190,4 @@ +SET GLOBAL debug_dbug = $debug_orig; connection node_2; connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL 
mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); + CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff --- mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33,release.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ ---- r/mysql-wsrep#33.result -+++ r/mysql-wsrep#33.reject -@@ -700,10 +700,10 @@ - COMMIT; - connection node_2; - connection node_1; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - DROP USER sst; - connection node_2; --CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); -+CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); - CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); - CALL mtr.add_suppression("Can't open and lock time zone table"); - CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33.result mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33.result --- 
mariadb-10.11.11/mysql-test/suite/galera/r/mysql-wsrep#33.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/mysql-wsrep#33.result 2025-05-19 16:14:24.000000000 +0000 @@ -700,10 +700,10 @@ COMMIT; connection node_2; connection node_1; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); DROP USER sst; connection node_2; -CALL mtr.add_suppression("Slave SQL: Error 'The MySQL server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); +CALL mtr.add_suppression("Slave SQL: Error 'The MariaDB server is running with the --skip-grant-tables option so it cannot execute this statement' on query"); CALL mtr.add_suppression("InnoDB: Error: Table \"mysql\"\\.\"innodb_index_stats\" not found"); CALL mtr.add_suppression("Can't open and lock time zone table"); CALL mtr.add_suppression("Can't open and lock privilege tables"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result mariadb-10.11.13/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result --- mariadb-10.11.11/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/r/wsrep_mode_strict_replication.result 2025-05-19 16:14:24.000000000 +0000 @@ -32,6 +32,8 @@ Level Code Message Error 4165 Galera replication not supported Warning 1031 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine MyISAM not supported. +Error 4165 Galera replication not supported +Warning 1031 WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine MyISAM not supported. 
SHOW CREATE TABLE t2; Table Create Table t2 CREATE TABLE `t2` ( diff -Nru mariadb-10.11.11/mysql-test/suite/galera/suite.pm mariadb-10.11.13/mysql-test/suite/galera/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera/suite.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -10,61 +10,61 @@ push @::global_suppressions, ( - qr(WSREP: wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), - qr(WSREP: last inactive check more than .* skipping check), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: last inactive check more than .+ skipping check), + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. 
Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Maximum writeset size exceeded by .*), - qr(WSREP: transaction size exceeded.*), - qr(WSREP: RBR event .*), - qr(WSREP: Ignoring error for TO isolated action: .*), - qr(WSREP: transaction size limit .*), - qr(WSREP: rbr write fail, .*), - qr(WSREP: .*Backend not supported: foo.*), - qr(WSREP: .*Failed to initialize backend using .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: Maximum writeset size exceeded by ), + qr(WSREP: transaction size exceeded), + qr(WSREP: RBR event ), + qr(WSREP: Ignoring error for TO isolated action: ), + qr(WSREP: transaction size limit ), + qr(WSREP: rbr write fail, ), + qr(WSREP: .*Backend not supported: foo), + qr(WSREP: .*Failed to initialize backend using ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Socket type not supported), qr(WSREP: failed to open gcomm backend connection: 110: failed to reach primary view: 110 .*), - qr(WSREP: .*Failed to open backend connection: -110 .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: .*Failed to open backend connection: 
-110 ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Connection timed out), qr|WSREP: wsrep::connect\(.*\) failed: 7|, - qr(WSREP: SYNC message from member .* in non-primary configuration. Ignored.), + qr(WSREP: SYNC message from member .+ ?in non-primary configuration\. Ignored\.), qr(WSREP: Could not find peer:), - qr(WSREP: TO isolation failed for: .*), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, - qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled. Expect abort.|, + qr(WSREP: TO isolation failed for: ), + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, + qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled\. Expect abort\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr(WSREP: discarding established .*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr|WSREP: JOIN message from member .* in non-primary configuration. Ignored.|, - qr|Query apply failed:*|, - qr(WSREP: Ignoring error*), - qr(WSREP: Failed to remove page file .*), - qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to .*), - qr|WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). 
Will retry in new primary component.|, - qr|WSREP: Send action \{.* STATE_REQUEST} returned -107 \(Transport endpoint is not connected\)|, + qr(WSREP: discarding established ), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr|WSREP: .*Query apply failed:|, + qr(WSREP: Ignoring error), + qr(WSREP: Failed to remove page file ), + qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to ), + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, qr|WSREP: Trying to continue unpaused monitor|, qr|WSREP: Wait for gtid returned error 3 while waiting for prior transactions to commit before setting position|, qr|WSREP: Failed to report last committed|, diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/GAL-401.test mariadb-10.11.13/mysql-test/suite/galera/t/GAL-401.test --- mariadb-10.11.11/mysql-test/suite/galera/t/GAL-401.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/GAL-401.test 2025-05-19 16:14:24.000000000 +0000 @@ -48,7 +48,7 @@ SET SESSION wsrep_sync_wait=15; SHOW CREATE TABLE t1; DROP TABLE t1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/GCF-939.test mariadb-10.11.13/mysql-test/suite/galera/t/GCF-939.test --- mariadb-10.11.11/mysql-test/suite/galera/t/GCF-939.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/GCF-939.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --exec rm -rf $MYSQLTEST_VARDIR/mysqld.2/data/GRA_*.log @@ -30,5 +31,6 @@ DROP TABLE t1; CALL mtr.add_suppression("Ignoring error 'Unknown table 'test\\.t1'' on query"); + --connection node_2 CALL mtr.add_suppression("Error 'Unknown table 'test\\.t1'' on query"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-10715.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-10715.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-10715.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-10715.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,13 @@ log-bin=mysqld-bin log-slave-updates binlog-format=ROW + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 -wsrep_gtid_domain_id=1 \ No newline at end of file +wsrep_gtid_domain_id=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-15443.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-15443.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-15443.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-15443.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] wsrep_auto_increment_control=OFF + [mysqld.2] wsrep_auto_increment_control=OFF diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-18832.test 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-18832.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-18832.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-18832.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc CREATE SEQUENCE Seq1_1 START WITH 1 INCREMENT BY 1 NOCACHE; CREATE TABLE t1 (Id int(11) NOT NULL, PRIMARY KEY (Id)); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20225.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20225.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20225.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20225.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,7 +41,7 @@ SET GLOBAL debug_dbug = 'RESET'; SET DEBUG_SYNC = 'now SIGNAL signal.mdev_20225_continue'; SET DEBUG_SYNC = 'RESET'; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; --connection node_2 # Trigger should now be dropped on node_2. 
diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20793.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20793.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-20793.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-20793.test 2025-05-19 16:14:24.000000000 +0000 @@ -99,4 +99,4 @@ SET debug_sync = "RESET"; DROP TABLE t1; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-21479.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-21479.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-21479.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-21479.test 2025-05-19 16:14:24.000000000 +0000 @@ -77,7 +77,7 @@ --let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; --source include/wait_condition.inc -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender (.*) is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); --connection node_1 --echo # Wait until both nodes are back to cluster diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22227.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22227.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22227.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22227.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,7 +13,7 @@ --connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 --connection node_1b SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = 'Waiting for table level lock' +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table level lock' --source include/wait_condition.inc --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22708.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22708.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-22708.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-22708.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ !include ../galera_2nodes.cnf [mysqld] -log-bin \ No newline at end of file +log-bin diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24143.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24143.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24143.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24143.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,4 +21,3 @@ ALTER TABLE t1 DROP COLUMN c2; SELECT get_lock ('test', 1.5); DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24327.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24327.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-24327.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-24327.cnf 
2025-05-19 16:14:24.000000000 +0000 @@ -3,4 +3,3 @@ [mysqld.1] log-bin=mariadb-bin log-slave-updates=OFF - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-25389.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-25389.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-25389.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-25389.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,8 @@ --source ../galera/include/auto_increment_offset_save.inc --connection node_2 +--let $wsrep_slave_threads_orig = `SELECT @@wsrep_slave_threads` + call mtr.add_suppression("WSREP: Failed to create/initialize system thread"); SET GLOBAL debug_dbug='+d,wsrep_simulate_failed_connection_1'; --error ER_WRONG_ARGUMENTS @@ -21,4 +23,9 @@ # issue is fixed. --source include/restart_mysqld.inc +--connection node_2 +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log + --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26266.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26266.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26266.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26266.test 2025-05-19 16:14:24.000000000 +0000 @@ -31,7 +31,6 @@ INSERT INTO t2 VALUES (3); INSERT INTO t2 VALUES (4); INSERT INTO t2 VALUES (5); ---error ER_LOCK_DEADLOCK CREATE VIEW v1 AS SELECT c1 FROM t1 WHERE c1 IN (SELECT a FROM t2) GROUP BY c1; - +DROP VIEW v1; DROP TABLE t1,t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26597.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26597.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-26597.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-26597.test 2025-05-19 16:14:24.000000000 +0000 @@ -28,5 +28,3 @@ --source 
../../galera/include/auto_increment_offset_restore.inc --connection node_1 DROP TABLE t3; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.opt mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---partition=ON \ No newline at end of file +--partition=ON diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27001.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27001.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,4 +4,4 @@ CREATE TABLE t3 (c INT) PARTITION BY RANGE (c) (PARTITION p1 VALUES LESS THAN (1000)); CREATE TABLE tp2 (c INT); ALTER TABLE t3 CONVERT TABLE tp2 TO PARTITION p2 VALUES LESS THAN (2000); -DROP TABLE t3; \ No newline at end of file +DROP TABLE t3; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27123.opt mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27123.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27123.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27123.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1 @@ --wsrep_auto_increment_control=OFF --auto_increment_increment=3 --auto_increment_offset=3 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27862.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27862.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-27862.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-27862.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc --disable_ps2_protocol diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-28053.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-28053.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-28053.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-28053.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,6 +39,7 @@ --disable_result_log --eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3; START SLAVE; + --eval SELECT MASTER_GTID_WAIT('$gtid', 600) --enable_result_log --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29293.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29293.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29293.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29293.test 2025-05-19 16:14:24.000000000 +0000 @@ -38,4 +38,3 @@ --reap DROP TABLE t1; SET DEBUG_SYNC= 'RESET'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29512.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29512.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-29512.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-29512.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,4 @@ max-binlog-size=4096 expire-logs-days=1 - [mysqld.2] - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-32549.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-32549.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-32549.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-32549.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # statement is rolled back # --source include/galera_cluster.inc +--source include/have_aria.inc CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) engine=innodb; CREATE TABLE t2 (f1 INTEGER PRIMARY KEY) engine=aria; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33136.test 
mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33136.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33136.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33136.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,7 @@ # transaction in the MDL conflict handling code. --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_debug_sync.inc --source include/have_debug.inc @@ -19,8 +20,8 @@ CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; --connection node_1a -TRUNCATE TABLE t1; -# TRUNCATE forces the next statement to re-read statistics from persistent storage, +RENAME TABLE t1 TO tmp, tmp TO t1; +# RENAME forces the next statement to re-read statistics from persistent storage, # which will acquire MDL locks on the statistics tables in InnoDB. SET SESSION wsrep_retry_autocommit = 0; SET DEBUG_SYNC = 'dict_stats_mdl_acquired SIGNAL may_toi WAIT_FOR bf_abort'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -2,3 +2,12 @@ [mysqld] log-bin +log-slave-updates + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-33828.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-33828.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,5 @@ --source include/galera_cluster.inc --source include/have_innodb.inc ---source include/have_aria.inc SET AUTOCOMMIT=ON; SELECT @@autocommit; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,13 @@ +!include ../galera_2nodes.cnf + +[mysqld] +log-bin +log-slave-updates + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-34647.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-34647.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc create table t1(id serial, val varchar(100)) engine=myisam; @@ -38,14 +39,12 @@ insert into t5 select null, 'd' from t5; select * from t2; - --connection node_2 select * from t1; select * from t2; select * from t3; select * from t4; select * from t5; -set global wsrep_mode=default; --connection node_1 drop table t1,t2,t3,t4,t5; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.opt mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--plugin-load=$HA_ROCKSDB_SO diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35748.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35748.test 2025-05-19 16:14:24.000000000 +0000 
@@ -0,0 +1,22 @@ +--source include/galera_cluster.inc +--source include/have_sequence.inc +--source include/have_rocksdb.inc + +--connection node_1 +INSTALL PLUGIN IF NOT EXISTS connect SONAME 'ha_connect'; + +CREATE TABLE t1 (f INT) ENGINE=CONNECT; +CREATE TABLE t2 (f INT) ENGINE=ROCKSDB; +--error ER_NOT_SUPPORTED_YET +CREATE TABLE t3 (f INT) ENGINE=SEQUENCE; +show warnings; + +--connection node_2 +show create table t1; +show create table t2; +--error ER_NO_SUCH_TABLE +show create table t3; + +--connection node_1 +DROP TABLE t1, t2; +UNINSTALL PLUGIN IF EXISTS connect; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35946.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35946.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-35946.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-35946.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,39 @@ +# +# MDEV-35946: Assertion `thd->is_error()' failed in Sql_cmd_dml::prepare +# +--source include/have_innodb.inc +--source include/galera_cluster.inc + +# Save original auto_increment_offset values. 
+--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +# +# Disconnect from the cluster +# +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'non-Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc +SET SESSION wsrep_sync_wait=DEFAULT; + +# +# If bug is present, assertion will fire +# during the execution of the following DELETE +# +--error ER_LOCK_WAIT_TIMEOUT +DELETE FROM mysql.wsrep_streaming_log; + +# +# Reconnect to the cluster +# +SET SESSION wsrep_sync_wait=0; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +SET SESSION wsrep_sync_wait=DEFAULT; + +--source include/auto_increment_offset_restore.inc +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender .+ ?is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-36116.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-36116.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-36116.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-36116.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,43 @@ +# +# MDEV-36116: TOI crashes in debug assert if executing thread is killed. +# + +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/have_debug.inc + +--connect con1,127.0.0.1,root,,test,$NODE_MYPORT_1 + +# Start TOI operation and wait for the thread to be killed. 
+--connection node_1 +CALL mtr.add_suppression("CREATE TABLE isolation failure"); + +--let $connection_id = `SELECT CONNECTION_ID()` +SET DEBUG_SYNC = 'wsrep_kill_thd_before_enter_toi SIGNAL may_kill WAIT_FOR continue'; +--send + CREATE TABLE t1 (a INT) ENGINE=InnoDB; + +# Kill the thread and let it continue. +--connection con1 +SET DEBUG_SYNC = 'now WAIT_FOR may_kill'; +--disable_query_log +--eval KILL CONNECTION $connection_id +--enable_query_log +SET DEBUG_SYNC = 'now SIGNAL continue'; + +--connection node_1 +--error 2013,2026 +--reap + +# Verify no tables created on either nodes. +--connection node_2 +SHOW TABLES LIKE 't1'; + +--connection con1 +SHOW TABLES LIKE 't1'; + +# Cleanup +SET DEBUG_SYNC = 'RESET'; +--disconnect con1 +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,8 @@ !include ../galera_2nodes_as_slave.cnf +[mysqld.1] +wsrep-slave-threads=10 + [mysqld.2] slave-parallel-threads=2 slave-parallel-mode=optimistic -[mysqld.1] -wsrep-slave-threads=10 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.test mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MDEV-6860.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MDEV-6860.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ --connection node_2 --disable_query_log ---eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3, MASTER_USE_GTID=slave_pos; +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3, master_use_gtid=slave_pos; --enable_query_log START SLAVE; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/MW-259.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-259.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-259.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-259.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,4 +39,3 @@ # Cleanup SET DEBUG_SYNC= 'RESET'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-284.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-284.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-284.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-284.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,15 +2,16 @@ # MW-284 Slave I/O retry on ER_COM_UNKNOWN_ERROR # ---source include/have_log_bin.inc --source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_log_bin.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 call mtr.add_suppression("\\[ERROR\\] Error reading packet from server: WSREP has not yet prepared node for application use "); call mtr.add_suppression("WSREP has not yet prepared node for application use"); --disable_query_log ---eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_PORT=$NODE_MYPORT_1, MASTER_USER='root', MASTER_CONNECT_RETRY=1; +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_1, master_connect_retry=1; --enable_query_log --connection node_1 @@ -29,7 +30,7 @@ --connection node_3 SELECT @@wsrep_on; --sleep 1 -call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use (server_errno=1047)"); +call mtr.add_suppression("Error reading packet from server: WSREP has not yet prepared node for application use \\(server_errno ?= ?1047\\)"); START SLAVE; --let $slave_param= Slave_IO_Running --let $slave_param_value= Connecting diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-313.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-313.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/MW-313.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-313.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,3 @@ [mysqld.2] log-bin log-slave-updates - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,3 @@ wsrep-retry-autocommit=0 [mysqld.2] - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ # -# #MW-329 Fix incorrect affected rows count after replay +# MW-329 Fix incorrect affected rows count after replay. 
# --source include/galera_cluster.inc @@ -11,7 +11,7 @@ INSERT INTO t1 (f1) VALUES (1),(65535); # -# Run concurrent INSERTs +# Run concurrent INSERTs # DELIMITER |; @@ -86,6 +86,10 @@ --eval KILL CONNECTION $connection_id --enable_query_log +# +# getting execution results for --send +# + --connection node_1b --error 0,1317,2013,2026 --reap @@ -96,6 +100,8 @@ DROP PROCEDURE proc_insert; DROP TABLE t1; +--disconnect node_1b + # Due to MW-330, Multiple "conflict state 3 after post commit" warnings if table is dropped while SP is running CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,6 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +wsrep-retry-autocommit=0 + +[mysqld.2] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-329F.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-329F.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,105 @@ +# +# MW-329F Fix incorrect affected rows count after replay. +# +# This is a version of MW-329 without the infinite loop that +# in the original test is closed by killing the connection. 
+# + +--source include/galera_cluster.inc +--source include/have_innodb.inc + +CREATE TABLE t1 (f1 INTEGER, f2 CHAR(20) DEFAULT 'abc') ENGINE=InnoDB; + +# We start with a populated table +INSERT INTO t1 (f1) VALUES (1),(65535); + +# +# Run concurrent INSERTs +# + +DELIMITER |; +CREATE PROCEDURE proc_insert (repeat_count int) +BEGIN + DECLARE current_num int; + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; + SET current_num = 0; + SET SESSION wsrep_sync_wait = 0; + WHILE current_num < repeat_count do + INSERT INTO t1 (f1) VALUES (FLOOR( 1 + RAND( ) * 65535 )); + SELECT SLEEP(0.1); + SET current_num = current_num + 1; + END WHILE; +END| +DELIMITER ;| + +--connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connection node_1b +--let $connection_id = `SELECT CONNECTION_ID()` +--disable_query_log +--disable_result_log +--send CALL proc_insert(500); + +# +# Run concurrent UPDATEs. We expect that each UPDATE will report that +# some rows were matched and updated +# + +--connection node_2 +--let $count = 2 +--let $wsrep_local_replays_old = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` + +while ($count) +{ + --let $signature = `SELECT LEFT(MD5(RAND()), 10)` + --disable_query_log + --error 0,ER_LOCK_DEADLOCK + --eval UPDATE t1 SET f2 = '$signature' + --enable_query_log + --let $row_count = `SELECT ROW_COUNT()` + if (`SELECT @@error_count = 0`) { + if (`SELECT $row_count = 0`) { + --die ROW_COUNT() = 0 + } + } + + # + # Ensure at least one replay happens + # + + --let $wsrep_replays = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` + --disable_query_log + if (`SELECT $wsrep_replays - $wsrep_local_replays_old > 0`) { + --dec $count + } + --enable_query_log +} + +# +# Confirm that some transaction replays occurred +# + +--let $wsrep_local_replays_new = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 
'wsrep_local_replays'` +--disable_query_log +--eval SELECT $wsrep_local_replays_new - $wsrep_local_replays_old > 0 AS wsrep_local_replays; +--enable_query_log + +# +# getting execution results for --send +# + +--connection node_1b +--error 0,1317,2013,2026 +--reap +--enable_query_log +--enable_result_log + +--connection node_1 +DROP PROCEDURE proc_insert; +DROP TABLE t1; + +--disconnect node_1b + +# Due to MW-330, Multiple "conflict state 3 after post commit" warnings if table is dropped while SP is running +CALL mtr.add_suppression("WSREP: .* conflict state after post commit "); + +set global innodb_status_output=Default; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-360-master.opt mariadb-10.11.13/mysql-test/suite/galera/t/MW-360-master.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-360-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-360-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1 @@ --gtid-domain-id=1 --log-bin --log-slave-updates - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-369.inc mariadb-10.11.13/mysql-test/suite/galera/t/MW-369.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-369.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-369.inc 2025-05-19 16:14:24.000000000 +0000 @@ -80,5 +80,3 @@ SET GLOBAL DEBUG_DBUG = ""; SET DEBUG_SYNC = 'RESET'; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-416.test mariadb-10.11.13/mysql-test/suite/galera/t/MW-416.test --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-416.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-416.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,73 +21,71 @@ #ALTER INSTANCE ROTATE INNODB MASTER KEY; --error 1044,1227,1370 ALTER PROCEDURE proc1 COMMENT 'foo'; ---error 1044,1227,1370 +--error 1044,1227 ALTER SERVER srv OPTIONS (USER 'sally'); ---error 1044,1142,1227,1370 +--error 1044,1142,1227 ALTER TABLE tbl 
DROP COLUMN col; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 ALTER VIEW vw AS SELECT 1; ---error 1044,1227,1370 +--error 1044,1227 CREATE DATABASE db; ---error 1044,1227,1370 -CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; +--error 1044,1227 +CREATE EVENT ev1 ON SCHEDULE AT CURRENT_TIMESTAMP DO SELECT 1; --error 1044,1227,1370 CREATE FUNCTION fun1() RETURNS int RETURN(1); --error 1044,1227,1370 CREATE FUNCTION fun1 RETURNS STRING SONAME 'funlib.so'; --error 1044,1227,1370 -CREATE PROCEDURE proc1() BEGIN END; ---error 1044,1142,1227,1370 +CREATE PROCEDURE proc1() BEGIN END; +--error 1044,1142,1227 CREATE INDEX idx ON tbl(id); ---error 1044,1142,1227,1370 +--error 1044,1227 CREATE SERVER srv FOREIGN DATA WRAPPER 'fdw' OPTIONS (USER 'user'); ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CREATE TABLE t (i int); ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CREATE TRIGGER trg BEFORE UPDATE ON t FOR EACH ROW BEGIN END; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CREATE VIEW vw AS SELECT 1; - - ---error 1044,1142,1227,1370 +--error 1044,1227 DROP DATABASE db; ---error 1044,1142,1227,1370 +--error 1044,1227 DROP EVENT ev; ---error 1044,1142,1227,1370 +--error 1044,1227,1370 DROP FUNCTION fun1; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 DROP INDEX idx ON t0; ---error 1044,1142,1227,1370 +--error 1044,1227,1370 DROP PROCEDURE proc1; ---error 1044,1142,1227,1370 +--error 1044,1227 DROP SERVEr srv; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 DROP TABLE t0; ---error 1044,1142,1227,1360,1370 +--error 1044,1227,1360 DROP TRIGGER trg; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 DROP VIEW vw; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 RENAME TABLE t0 TO t1; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 TRUNCATE TABLE t0; # DCL # account management ---error 1044,1142,1227,1370,1064 +--error 1044,1227,1064 ALTER USER myuser PASSWORD EXPIRE; ---error 1044,1142,1227,1370 +--error 1044,1227 
CREATE USER myuser IDENTIFIED BY 'pass'; ---error 1044,1142,1227,1370 +--error 1044,1227 DROP USER myuser; ---error 1044,1045,1142,1227,1370 +--error 1044,1045,1227 GRANT ALL ON *.* TO 'myuser'; ---error 1044,1142,1227,1370 +--error 1044,1227 RENAME USER myuser TO mariauser; --error 1044,1142,1227,1370 REVOKE SELECT ON test FROM myuser; @@ -97,24 +95,25 @@ REVOKE PROXY ON myuser FROM myuser; # table maintenance ---error 1044,1142,1227,1370 +--error 1044,1142,1227 ANALYZE TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CHECK TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 CHECKSUM TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 OPTIMIZE TABLE db.tbl; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 REPAIR TABLE db.tbl; # plugin and user defined functions ---error 1044,1142,1227,1370 +--error 1044,1142,1227 INSTALL PLUGIN plg SONAME 'plg.so'; ---error 1044,1142,1227,1370 +--error 1044,1142,1227 UNINSTALL PLUGIN plg; --connection node_1 DROP USER 'userMW416'@'localhost'; SHOW DATABASES; +--disconnect userMW416 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/MW-86-wait8.cnf mariadb-10.11.13/mysql-test/suite/galera/t/MW-86-wait8.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/MW-86-wait8.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/MW-86-wait8.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] log-bin log-slave-updates - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/binlog_checksum.test mariadb-10.11.13/mysql-test/suite/galera/t/binlog_checksum.test --- mariadb-10.11.11/mysql-test/suite/galera/t/binlog_checksum.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/binlog_checksum.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --echo # On node_1 --connection node_1 diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/create.test mariadb-10.11.13/mysql-test/suite/galera/t/create.test --- mariadb-10.11.11/mysql-test/suite/galera/t/create.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/create.test 2025-05-19 16:14:24.000000000 +0000 @@ -86,4 +86,3 @@ --source include/galera_end.inc --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera#414.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera#414.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera#414.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera#414.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcs.max_packet_size=2' +wsrep_provider_options='gcs.max_packet_size=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcs.max_packet_size=2' +wsrep_provider_options='gcs.max_packet_size=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera#500.test mariadb-10.11.13/mysql-test/suite/galera/t/galera#500.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera#500.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera#500.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,12 @@ # thrown from gcomm background thread, the provider terminates properly # and wsrep_ready becomes 0. # +# Not to be run with ASAN. Provider leaks memory when gcomm +# thread is aborted forcifully and ASAN crashes during leak report +# after provider is unloaded. 
+# +--source include/not_asan.inc --source include/have_innodb.inc --source include/galera_cluster.inc --source include/galera_have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_2primary_replica.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_2primary_replica.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_2primary_replica.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_2primary_replica.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,17 +41,19 @@ --let $node_1 = replica --let $node_2 = node_2 +--let $node_3 = primary1 +--let $node_4 = primary2 --source include/auto_increment_offset_save.inc --connection replica --echo # Galera replica changing master to primary1 ---disable_query_log SET @@default_master_connection='stream1'; +--disable_query_log --eval CHANGE MASTER 'stream1' TO master_host='127.0.0.1', master_user='repl', master_password='repl', master_port=$NODE_MYPORT_3, master_use_gtid=slave_pos; --enable_query_log -SET @@default_master_connection='stream2'; --echo # Primary node changing master to primary2 +SET @@default_master_connection='stream2'; --disable_query_log --eval CHANGE MASTER 'stream2' TO master_host='127.0.0.1', master_user='repl2', master_password='repl2', master_port=$NODE_MYPORT_4, master_use_gtid=slave_pos; --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_MDEV-29512.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_MDEV-29512.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_MDEV-29512.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_MDEV-29512.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,4 @@ max-binlog-size=4096 expire-logs-days=1 - [mysqld.2] - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_alter_engine_myisam.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_alter_engine_myisam.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_alter_engine_myisam.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_alter_engine_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc # @@ -35,7 +36,4 @@ DROP TABLE t1; --connection node_1 ---disable_query_log SET GLOBAL wsrep_mode = DEFAULT; ---enable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -20,7 +20,7 @@ --connection node_1a SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc SELECT COUNT(*) = 0 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,3 @@ lock_wait_timeout=5 innodb_lock_wait_timeout=5 wait_timeout=5 - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_applier_ftwrl_table_alter.test 2025-05-19 16:14:24.000000000 +0000 @@ -27,16 +27,16 @@ --connection node_1 SELECT 1 FROM DUAL; # Wait ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc -SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); UNLOCK TABLES; SET SESSION wsrep_sync_wait = 15; SHOW CREATE TABLE t1; -SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock'; +SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_ctas.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_ctas.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_ctas.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_ctas.test 2025-05-19 16:14:24.000000000 +0000 @@ -73,4 +73,3 @@ --connection node_3 RESET MASTER; - diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_nonprim.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_nonprim.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_as_slave_nonprim.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_as_slave_nonprim.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,7 +2,7 @@ # Test the behavior of a Galera async slave if it goes non-prim. Async replication # should abort with an error but it should be possible to restart it. # -# The galera/galera_2node_slave.cnf describes the setup of the nodes +# The galera_3nodes_as_slave.cnf describes the setup of the nodes # --source include/have_innodb.inc @@ -17,9 +17,10 @@ --connection node_2 --disable_query_log ---eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_PORT=$NODE_MYPORT_4, MASTER_USER='root'; +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_4; --enable_query_log START SLAVE; + SET SESSION wsrep_sync_wait = 0; --connection node_4 @@ -44,9 +45,8 @@ INSERT INTO t1 VALUES (1),(2),(3),(4),(5); --connection node_2 ---sleep 5 +wait_for_slave_to_stop; --let $value = query_get_value(SHOW SLAVE STATUS, Last_SQL_Error, 1) ---connection node_1 --disable_query_log --eval SELECT "$value" IN ("Error 'Unknown command' on query. Default database: 'test'. 
Query: 'BEGIN'", "Node has dropped from cluster") AS expected_error --enable_query_log @@ -74,7 +74,6 @@ --connection node_4 DROP TABLE t1; ---sleep 2 --connection node_2 --let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc @@ -84,7 +83,7 @@ CALL mtr.add_suppression("Slave SQL: Error 'Unknown command' on query"); CALL mtr.add_suppression("Slave: Unknown command Error_code: 1047"); -CALL mtr.add_suppression("Transport endpoint is not connected"); +CALL mtr.add_suppression("(Transport endpoint|Socket) is not connected"); CALL mtr.add_suppression("Slave SQL: Error in Xid_log_event: Commit could not be completed, 'Deadlock found when trying to get lock; try restarting transaction', Error_code: 1213"); CALL mtr.add_suppression("Slave SQL: Node has dropped from cluster, Error_code: 1047"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_autoinc_sst_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ wsrep_sst_auth="root:" [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=10M;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=10M;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_backup_stage.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_backup_stage.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_backup_stage.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_backup_stage.test 2025-05-19 16:14:24.000000000 +0000 @@ -56,7 +56,7 @@ # reach commit stage. In the unlikely case the interleaving is different, the # result of the test should not change. --connection node_1c ---let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (State='Commit' OR State='Waiting for certification') AND ID=$insert_id +--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification') AND ID=$insert_id --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info WHERE TABLE_NAME='t1' AND THREAD_ID=$insert_id --source include/wait_condition.inc @@ -83,11 +83,11 @@ # wait for insert to get blocked --connection node_1c ---let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (State='Commit' OR State='Waiting for certification') AND ID=$insert_id +--let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.processlist WHERE (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification') AND ID=$insert_id --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*)=1 FROM information_schema.metadata_lock_info WHERE TABLE_NAME='t1' AND THREAD_ID=$insert_id --source include/wait_condition.inc ---let $wait_condition = SELECT COUNT(*)=2 FROM information_schema.processlist WHERE Info like 'INSERT INTO t1 (f1) values("node1%")' AND (State = 'Commit' OR State='Waiting for certification') +--let $wait_condition = SELECT COUNT(*)=2 FROM information_schema.processlist WHERE Info like 'INSERT INTO 
t1 (f1) values("node1%")' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification') --source include/wait_condition.inc # nothing after BLOCK_DDL is applied diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_at_after_statement.test 2025-05-19 16:14:24.000000000 +0000 @@ -55,4 +55,3 @@ --disconnect node_2a --disconnect node_2b - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_flush_for_export.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,12 +17,12 @@ --connection node_2 SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc UNLOCK TABLES; ---let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc COMMIT; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_lock_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -16,13 +16,16 @@ INSERT INTO t1 VALUES (2); --connection node_2 ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' ---source include/wait_condition.inc +SET SESSION wsrep_sync_wait = 0; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); +--let $wait_condition_on_error_output = SELECT * FROM INFORMATION_SCHEMA.PROCESSLIST +--source include/wait_condition_with_debug.inc UNLOCK TABLES; ---let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE = 'Waiting for table metadata lock' ---source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); +--let $wait_condition_on_error_output = SELECT * FROM INFORMATION_SCHEMA.PROCESSLIST +--source include/wait_condition_with_debug.inc COMMIT; SELECT COUNT(*) = 1 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ 
-6,10 +6,10 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_mariabackup.test 2025-05-19 16:14:24.000000000 +0000 @@ -129,7 +129,7 @@ let SEARCH_PATTERN = Server desynched from group during BACKUP STAGE BLOCK_COMMIT.; --source include/search_pattern_in_file.inc -SET GLOBAL wsrep_mode = ""; +SET GLOBAL wsrep_mode = DEFAULT; --connection node_1 DROP TABLE t; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_abort_ps.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,4 @@ !include ../galera_2nodes.cnf + [mysqltest] -ps-protocol \ No newline at end of file +ps-protocol diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,3 @@ [mysqld.2] innodb_stats_persistent=ON - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_background_statistics.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_background_statistics.test 2025-05-19 16:14:24.000000000 +0000 @@ -46,4 +46,3 @@ --enable_query_log DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill.test 2025-05-19 16:14:24.000000000 +0000 @@ -113,7 +113,7 @@ --connection node_2b SET SESSION wsrep_sync_wait=0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = 'Waiting for table metadata lock'; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'; --source include/wait_condition.inc --connection node_2a diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill_debug.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill_debug.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_kill_debug.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_kill_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -110,7 
+110,7 @@ --connection node_2a --let $connection_id = `SELECT CONNECTION_ID()` -CREATE TABLE t1 (i int primary key); +CREATE TABLE t1 (i int primary key) engine=innodb; # Set up sync point SET DEBUG_SYNC = "before_wsrep_ordered_commit SIGNAL bwoc_reached WAIT_FOR bwoc_continue"; @@ -129,17 +129,17 @@ --enable_query_log SET DEBUG_SYNC = "now SIGNAL bwoc_continue"; -SET DEBUG_SYNC='RESET'; --connection node_2a --error 0,1213,2013,2026 --reap --connection node_2 +SET DEBUG_SYNC='RESET'; # victim was able to complete the INSERT select * from t1; --disconnect node_2a +--disconnect node_2b --connection node_1 drop table t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_lock_wait.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_lock_wait.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_bf_lock_wait.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_bf_lock_wait.test 2025-05-19 16:14:24.000000000 +0000 @@ -97,4 +97,3 @@ --disconnect node_1_p2 --disconnect node_2_p1 --disconnect node_2_p2 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,5 +9,3 @@ binlog-checksum=CRC32 master-verify-checksum=1 slave-sql-verify-checksum=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_checksum.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_checksum.test 2025-05-19 16:14:24.000000000 +0000 @@ -38,8 +38,6 @@ --connection node_1 DROP TABLE t1; ---disable_query_log SET 
@@global.wsrep_mode=DEFAULT; ---enable_query_log --echo # End of tests. diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_max.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,3 @@ binlog-row-event-max-size=4294967040 [mysqld.2] - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,3 @@ binlog-row-event-max-size=256 [mysqld.2] - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_event_max_size_min.test 2025-05-19 16:14:24.000000000 +0000 @@ -12,4 +12,3 @@ SELECT COUNT(*) = 1 FROM t1 WHERE f1 = REPEAT('x', 1000); DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_row_image.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_row_image.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_row_image.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_row_image.test 2025-05-19 16:14:24.000000000 +0000 @@ -94,7 +94,3 @@ DROP TABLE t1; DROP TABLE t2; - - - - diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_binlog_stmt_autoinc.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,15 +5,15 @@ --source include/galera_cluster.inc --source include/force_restart.inc +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + --connection node_1 SET GLOBAL auto_increment_offset=1; --connection node_2 SET GLOBAL auto_increment_offset=2; ---let $node_1=node_1 ---let $node_2=node_2 ---source include/auto_increment_offset_save.inc - ## ## Verify the correct operation of the auto-increment when the binlog ## format artificially set to the 'STATEMENT' (although this mode is diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_cache_index.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_cache_index.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_cache_index.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_cache_index.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc CREATE TABLE t1 (c1 int, UNIQUE INDEX (c1)) engine=innodb; INSERT INTO t1 VALUES (1),(2),(3),(4),(5); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_can_run_toi.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_can_run_toi.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_can_run_toi.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_can_run_toi.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc # # MDEV-24833 : Signal 11 on wsrep_can_run_in_toi at wsrep_mysqld.cc:1994 # diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_change_user.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_change_user.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_change_user.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_change_user.test 2025-05-19 16:14:24.000000000 +0000 @@ -26,4 +26,3 @@ --connection node_1 DROP TABLE t1; DROP USER user1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_circular_replication.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_circular_replication.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_circular_replication.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_circular_replication.test 2025-05-19 16:14:24.000000000 +0000 @@ -45,6 +45,7 @@ --let $node_1 = replica1 --let $node_2 = node_2 --let $node_3 = primary2 +--let $node_4 = primary1 --source include/auto_increment_offset_save.inc --connection replica1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_concurrent_ctas.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_concurrent_ctas.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_concurrent_ctas.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_concurrent_ctas.test 2025-05-19 16:14:24.000000000 +0000 @@ -98,4 +98,3 @@ --source include/galera_end.inc --echo # End of test - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_create_trigger.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_create_trigger.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_create_trigger.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_create_trigger.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,4 +41,3 @@ DROP TABLE definer_default; DROP USER 'user1'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ctas.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ctas.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ctas.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ctas.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,6 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_aria.inc --connection node_1 create table t1_Aria(a int, count int, b int, key(b)) engine=Aria; @@ -36,4 +38,3 @@ DROP TABLE t2, t3,t4; DROP TABLE t1_MyISAM, t1_Aria,t1_InnoDB; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,5 +7,3 @@ [mysqld.2] wsrep-debug=1 loose-galera-ddl-fk-conflict=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_fk_conflict.test 2025-05-19 16:14:24.000000000 +0000 @@ -43,4 +43,3 @@ --source galera_ddl_fk_conflict_with_tmp.inc # CHECK and ANALYZE are not affected - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_multiline.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_multiline.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ddl_multiline.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ddl_multiline.test 2025-05-19 16:14:24.000000000 +0000 @@ -51,4 +51,3 @@ --connection node_1 DROP TABLE t1, t2, t3, t4, t5, t6; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.cnf 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_defaults.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_defaults.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,11 +13,13 @@ --source include/force_restart.inc # Make sure that the test is operating on the right version of galera library. 
---let $galera_version=26.4.11 +--let $galera_version=26.4.21 source ../wsrep/include/check_galera_version.inc; # Global Variables +SELECT COUNT(*) `expect 51` FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%'; + SELECT VARIABLE_NAME, VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_disallow_local_gtid.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_disallow_local_gtid.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_disallow_local_gtid.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_disallow_local_gtid.test 2025-05-19 16:14:24.000000000 +0000 @@ -101,4 +101,3 @@ DROP TABLE tab1; DROP TABLE tab2; DROP TABLE tab3; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_encrypt_tmp_files.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ !include ../galera_2nodes.cnf -[mysqld] +[mysqld] encrypt-tmp-files = 1 plugin-load-add= @ENV.FILE_KEY_MANAGEMENT_SO file-key-management diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_fk_truncate.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_fk_truncate.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_fk_truncate.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_fk_truncate.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +auto_increment_offset=1 +auto_increment_increment=1 + +[mysqld.2] +auto_increment_offset=2 +auto_increment_increment=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_flush_local.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_flush_local.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_flush_local.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_flush_local.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # PXC-391 --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_query_cache.inc --disable_warnings @@ -72,7 +73,6 @@ SELECT COUNT(*) AS EXPECT_10000 FROM t2; SELECT COUNT(*) AS EXPECT_10 FROM x2; - --connection node_1 DROP TABLE t1, t2, x1, x2; CREATE TABLE t1 (f1 INTEGER); @@ -144,4 +144,3 @@ --disable_query_log SET GLOBAL wsrep_mode = DEFAULT; --enable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_forced_binlog_format.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_forced_binlog_format.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_forced_binlog_format.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_forced_binlog_format.test 2025-05-19 16:14:24.000000000 +0000 @@ -49,4 +49,3 @@ #--source include/galera_end.inc --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.recover=yes;pc.ignore_sb=true' +wsrep_provider_options='gcache.recover=yes;gcache.size=128M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;pc.wait_prim_timeout=PT60S' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.recover=yes' 
+wsrep_provider_options='gcache.recover=yes;gcache.size=128M;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;pc.wait_prim_timeout=PT60S' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/big_test.inc CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,7 @@ [mysqld.1] max_allowed_packet=10M innodb_log_file_size=220M -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.recover=yes;pc.ignore_sb=true;gcache.size=10M' +wsrep_provider_options='gcache.recover=yes;gcache.size=10M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;pc.wait_prim_timeout=PT60S' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.recover=yes;pc.ignore_sb=true;gcache.size=10M' +wsrep_provider_options='gcache.recover=yes;gcache.size=10M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;pc.wait_prim_timeout=PT60S' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_full_gcache.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/big_test.inc SET SESSION wsrep_sync_wait = 0; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -2,8 +2,8 @@ [mysqld.1] innodb_log_file_size=220M -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.recover=yes;pc.ignore_sb=true;' +wsrep_provider_options='gcache.recover=yes;gcache.size=128M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;pc.wait_prim_timeout=PT60S' [mysqld.2] innodb_log_file_size=220M -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.recover=yes;pc.ignore_sb=true;' +wsrep_provider_options='gcache.recover=yes;gcache.size=128M;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;pc.wait_prim_timeout=PT60S' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcache_recover_manytrx.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,6 +5,7 @@ --source include/galera_cluster.inc --source include/big_test.inc +--source include/have_innodb.inc --source include/have_log_bin.inc SET SESSION wsrep_sync_wait = 0; 
diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_fragment.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_fragment.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_fragment.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_fragment.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcs.max_packet_size=64' +wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcs.max_packet_size=64' +wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gcs_max_packet_size.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcs.max_packet_size=64;gcache.size=10M' +wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcs.max_packet_size=64;gcache.size=10M' 
+wsrep_provider_options='gcs.max_packet_size=64;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] log-bin log-slave-updates - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_server_id.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_server_id.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_server_id.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_server_id.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -13,4 +13,3 @@ server-id=12 log_slave_updates log_bin - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,13 +4,16 @@ log-bin=mysqld-bin log-slave-updates binlog-format=ROW + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.3] gtid-domain-id=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.cnf 
2025-05-19 16:14:24.000000000 +0000 @@ -5,14 +5,16 @@ log-slave-updates binlog-format=ROW wsrep_sst_method=rsync + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.3] gtid-domain-id=2 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_slave_sst_rsync.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,6 +13,7 @@ # As node #3 is not a Galera node, and galera_cluster.inc does not open connetion to it # we open the node_3 connection here --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + --echo #Connection 2 --connection node_2 --disable_query_log @@ -30,6 +31,7 @@ SELECT @@global.gtid_binlog_state; --source include/save_master_gtid.inc + --echo #Connection 2 --connection node_2 --source include/sync_with_master_gtid.inc @@ -39,6 +41,7 @@ INSERT INTO t2 VALUES(5,55); INSERT INTO t2 VALUES(6,66); SELECT @@global.gtid_binlog_state; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME= 't2'; @@ -60,6 +63,7 @@ INSERT INTO t1 VALUES ('node1_committed_before'); COMMIT; --source include/save_master_gtid.inc + --echo #Connection 2 --connection node_2 --source include/sync_with_master_gtid.inc @@ -68,6 +72,7 @@ INSERT INTO t1 VALUES ('node2_committed_before'); INSERT INTO t1 VALUES ('node2_committed_before'); COMMIT; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME= 't1'; @@ -77,10 +82,12 @@ --let $node_1= node_1 --let $node_2= node_2 --source include/auto_increment_offset_save.inc + --echo #Connection 2 --connection node_2 
--echo Shutting down server ... --source include/shutdown_mysqld.inc + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' @@ -90,6 +97,7 @@ INSERT INTO t1 VALUES ('node1_committed_during'); INSERT INTO t1 VALUES ('node1_committed_during'); COMMIT; + --echo #Connection 2 --connection node_2 --echo Starting server ... @@ -103,11 +111,13 @@ INSERT INTO t1 VALUES ('node2_committed_after'); INSERT INTO t1 VALUES ('node2_committed_after'); COMMIT; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 8 FROM t1; --source include/wait_condition.inc Select * from t1 order by f1; + --echo #Connection 2 --connection node_2 Select * from t1 order by f1; @@ -153,12 +163,14 @@ INSERT INTO t1 VALUES ('node2_slave_started'); SELECT count(*) from t1; SELECT @@global.gtid_binlog_state; + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 12 FROM t1; --source include/wait_condition.inc SELECT count(*) from t1; SELECT @@global.gtid_binlog_state; + --echo #Connection 3 --connection node_3 DROP TABLE t2,t1; @@ -173,10 +185,12 @@ --connection node_2 --let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; --source include/wait_condition.inc + --echo #Connection 1 --connection node_1 --let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc + --echo #Connection 2 --connection node_2 STOP SLAVE; @@ -194,6 +208,7 @@ set global wsrep_on=OFF; reset master; set global wsrep_on=ON; + --echo #Connection 3 --connection node_3 reset master; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf 2025-01-30 11:01:23.000000000 
+0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_gtid_trx_conflict.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,10 +4,12 @@ log-bin=mysqld-bin log-slave-updates binlog-format=ROW + [mysqld.1] gtid-domain-id=1 wsrep_gtid_mode=1 wsrep_gtid_domain_id=1 + [mysqld.2] gtid-domain-id=1 wsrep_gtid_mode=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_inject_bf_long_wait.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,9 +6,14 @@ INSERT INTO t1 VALUES (0,0),(1,1),(2,2),(3,3); BEGIN; +SET DEBUG_SYNC = 'wsrep_after_statement_enter SIGNAL blocked'; --send UPDATE t1 set b = 100 where id between 1 and 2; --connect node_1b, 127.0.0.1, root, , test, $NODE_MYPORT_1 + +SET DEBUG_SYNC = 'now WAIT_FOR blocked'; +SET DEBUG_SYNC = 'wsrep_after_statement_enter CLEAR'; + --connection node_1b SET @save_dbug = @@SESSION.debug_dbug; SET @@SESSION.innodb_lock_wait_timeout=2; @@ -21,5 +26,6 @@ --reap COMMIT; SELECT * FROM t1; +SET DEBUG_SYNC = 'RESET'; --disconnect node_1b DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_ignore.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_ignore.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_ignore.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_ignore.test 2025-05-19 16:14:24.000000000 +0000 @@ -57,4 +57,3 @@ DROP TABLE t2; DROP TABLE t3; --eval SET GLOBAL wsrep_sync_wait = $wsrep_sync_wait_orig - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_multi.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_multi.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_insert_multi.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_insert_multi.test 2025-05-19 16:14:24.000000000 +0000 @@ -113,10 +113,3 @@ SELECT COUNT(*) = 2 FROM t1; DROP TABLE t1; - - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28423.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -35,10 +35,10 @@ log_bin=binlog [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_MDEV-28583.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -35,10 +35,10 @@ log_bin=binlog [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ wsrep_sst_auth=root: [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_innodb_flush_logs.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,10 +7,10 @@ innodb_flush_log_at_trx_commit=0 [mysqld.1] 
-wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mariabackup_verify_ca.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,11 +9,11 @@ ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' loose-innodb-log-file-buffering [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' loose-innodb-log-file-buffering [sst] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ # causes the first MTR connection to be forefully dropped by Galera, which in turn confuses MTR [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -2,14 +2,14 @@ --source include/galera_cluster.inc --source include/have_innodb.inc ---source suite/galera/include/galera_sst_set_mysqldump.inc - call mtr.add_suppression("WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to "); --let $node_1=node_1 --let $node_2=node_2 --source include/auto_increment_offset_save.inc +--source suite/galera/include/galera_sst_set_mysqldump.inc + # mysql-wsrep#33 - nnoDB: Failing assertion: xid_seqno > trx_sys_cur_xid_seqno in trx_sys_update_wsrep_checkpoint with mysqldump IST # --source suite/galera/include/galera_st_disconnect_slave.inc diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_progress.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_progress.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_progress.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_progress.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,4 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' - - - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_recv_bind.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;ist.recv_bind=127.0.0.1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;ist.recv_bind=127.0.0.1;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;ist.recv_bind=127.0.0.1' - +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;ist.recv_bind=127.0.0.1;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_restart_joiner.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,9 +4,9 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sync_wait=1 [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sync_wait=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ist_rsync_verify_ca.cnf 
2025-05-19 16:14:24.000000000 +0000 @@ -8,10 +8,10 @@ ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] ssl-mode=VERIFY_CA diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_applier.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_applier.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_applier.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_applier.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,9 @@ !include ../galera_2nodes.cnf [mysqld.1] +wsrep_slave_threads=1 wsrep-debug=1 [mysqld.2] +wsrep_slave_threads=1 wsrep-debug=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_smallchanges.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_smallchanges.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_kill_smallchanges.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_kill_smallchanges.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc # Save original auto_increment_offset values. 
--let $node_1=node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_load_data.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_load_data.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_load_data.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_load_data.test 2025-05-19 16:14:24.000000000 +0000 @@ -397,4 +397,3 @@ --connection node_1 use test; drop database cardtest02; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_log_bin_opt.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_log_bin_opt.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_log_bin_opt.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_log_bin_opt.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ wsrep_sst_auth="root:" [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_many_rows.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_many_rows.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_many_rows.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_many_rows.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/big_test.inc --source include/galera_cluster.inc +--source 
include/have_innodb.inc # Save original auto_increment_offset values. --let $node_1=node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,3 @@ [mysqld.2] innodb-stats-persistent=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_13787.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_13787.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,6 @@ --source include/galera_cluster.inc --source include/have_innodb.inc + --connection node_1 create table t(a int); insert into t select 1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_15611.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_15611.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdev_15611.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdev_15611.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ !include ../galera_2nodes.cnf + [mysqld.1] [mysqld.2] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdl_race.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdl_race.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_mdl_race.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_mdl_race.test 2025-05-19 16:14:24.000000000 +0000 @@ -91,4 +91,3 @@ --disconnect node_1a --disconnect node_1b --disconnect node_1c - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_nonPK_and_PA.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_nonPK_and_PA.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_nonPK_and_PA.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_nonPK_and_PA.test 2025-05-19 16:14:24.000000000 +0000 @@ -26,7 +26,6 @@ --source include/have_debug_sync.inc --source include/galera_have_debug_sync.inc - # Setup CREATE TABLE t1 (f1 VARCHAR(32) NOT NULL) ENGINE=InnoDB; @@ -44,7 +43,7 @@ SET GLOBAL wsrep_slave_threads = 2; --echo *************************************************************** ---echo scenario 1, conflicting UPDATE +--echo scenario 1, conflicting UPDATE --echo *************************************************************** # Set up a synchronization point to catch the first transaction @@ -99,9 +98,8 @@ --source include/galera_signal_sync_point.inc --source include/galera_clear_sync_point.inc - --echo *************************************************************** ---echo scenario 2, conflicting DELETE +--echo scenario 2, conflicting DELETE --echo *************************************************************** # Set up a synchronization point to catch the first transaction @@ -164,5 +162,6 @@ DROP TABLE t1; DROP TABLE t2; + --connection node_2 SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_nopk_unicode.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_nopk_unicode.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_nopk_unicode.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_nopk_unicode.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,5 +39,4 @@ SELECT f1 = 'текÑÑ‚2' FROM t1; SELECT f1 = 'текÑÑ‚2' FROM t1 WHERE f1 = 'текÑÑ‚2'; - DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_apply_lock_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -29,11 +29,13 @@ --let $galera_connection_name = node_2a --let $galera_server_number = 2 --source include/galera_connect.inc + --connection node_2a --sleep 1 SET SESSION wsrep_sync_wait=0; -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE 'Commit' or STATE = 'Waiting for certification'); -SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE '%Waiting for table metadata lock%'; +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); + SELECT COUNT(*) AS EXPECT_0 FROM t1; SELECT COUNT(*) AS EXPECT_0 FROM t2; @@ -44,8 +46,11 @@ --eval SET SESSION wsrep_sync_wait = $wsrep_sync_wait_orig; SELECT COUNT(*) AS EXPECT_1 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t2; -SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE '%committed%' or STATE = 'Waiting for certification'); +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); +--disable_query_log --eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log + DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test 
2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_largetrx.test 2025-05-19 16:14:24.000000000 +0000 @@ -67,4 +67,3 @@ --connection default DROP TABLE t1; DROP TABLE ten; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_autoinc_manytrx.test 2025-05-19 16:14:24.000000000 +0000 @@ -91,5 +91,3 @@ DROP TABLE t1; DROP TABLE ten; DROP PROCEDURE p1; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_simple.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_simple.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_parallel_simple.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_parallel_simple.test 2025-05-19 16:14:24.000000000 +0000 @@ -48,10 +48,10 @@ --connection node_2 SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table metadata lock%'; +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in isolation%'); --source include/wait_condition.inc ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE (STATE LIKE 'Commit' or STATE = 'Waiting for certification'); +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committing%' OR STATE LIKE 'Commit' OR STATE LIKE 'Waiting for certification'); --source include/wait_condition.inc UNLOCK TABLES; @@ -61,7 +61,9 @@ SELECT COUNT(*) as expect_20 FROM t1; 
SELECT COUNT(*) as expect_20 FROM t2; +--disable_query_log --eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_partitioned_tables.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_partitioned_tables.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_partitioned_tables.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_partitioned_tables.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,133 @@ +--source include/galera_cluster.inc +--source include/have_partition.inc +--source include/have_innodb.inc +--source include/have_aria.inc + +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. Storage engine partition for table"); + +--echo # wsrep-mode= DEFAULT +SET GLOBAL wsrep_mode = ""; +SELECT @@wsrep_mode; +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB + PARTITION BY KEY (v1) + PARTITIONS 2; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +ALTER TABLE t1 ADD COLUMN v2 int; +ALTER TABLE t2 ADD COLUMN v2 int; +INSERT INTO t1 VALUES (1,1),(2,2); +INSERT INTO t2 VALUES (1,1),(2,2); +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +UPDATE t1 SET v3 = 3; +UPDATE t2 SET v3 = 3; +CREATE INDEX xx1 ON t1(v2); +CREATE INDEX xx2 ON t2(v2); +DROP INDEX xx1 ON t1; +DROP INDEX xx2 ON t2; +TRUNCATE TABLE t1; +TRUNCATE TABLE t2; +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +CREATE VIEW x2 AS SELECT * FROM t2_v2; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 + AFTER INSERT ON t1_v2 FOR EACH ROW + UPDATE t1_v2 SET t1_v2.v3 = t1_v2.v3+1; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t2 + AFTER INSERT ON t2_v2 FOR EACH ROW + UPDATE t2_v2 SET t2_v2.v3 = t2_v2.v3+1; + 
+--connection node_2 +SHOW CREATE TABLE t1_v2; +SHOW CREATE TABLE t2_v2; +SHOW CREATE VIEW x1; +SHOW CREATE VIEW x2; + +SELECT * FROM t1_v2; +SELECT * FROM t2_v2; + +--connection node_1 +DROP VIEW x1; +DROP VIEW x2; +DROP TRIGGER increment_before_t1; +DROP TRIGGER increment_before_t2; +DROP TABLE t1_v2; +DROP TABLE t2_v2; + +SET GLOBAL wsrep_mode = ""; +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +--echo # wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +CREATE OR REPLACE TABLE t1 (v1 INT NOT NULL PRIMARY KEY) ENGINE=InnoDB + PARTITION BY KEY (v1) + PARTITIONS 2; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE OR REPLACE TABLE t3 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +ALTER TABLE t1 ADD COLUMN v2 int; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +ALTER TABLE t2 ADD COLUMN v2 int; +INSERT INTO t1 VALUES (1,1),(2,2); +INSERT INTO t2 VALUES (1),(2); +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +ALTER TABLE t1 ADD COLUMN v3 int, ENGINE=MyISAM; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +ALTER TABLE t2 ADD COLUMN v3 int, ENGINE=Aria; +UPDATE t1 SET v2 = v2 + 3; +UPDATE t2 SET v1 = v1 + 3; +CREATE INDEX xx1 ON t1(v2); +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE INDEX xx2 ON t2(v2); +DROP INDEX xx1 ON t1; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +DROP INDEX xx2 on t2; +TRUNCATE TABLE t1; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +TRUNCATE TABLE t2; +# At the moment can't restrict rename +RENAME TABLE t1 TO t1_v2; +RENAME TABLE t2 TO t2_v2; +RENAME TABLE t2_v2 TO t2; +CREATE VIEW x1 AS SELECT * FROM t1_v2; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE VIEW x2 AS SELECT * FROM t2; +CREATE DEFINER=`root`@`localhost` TRIGGER increment_before_t1 + AFTER INSERT ON t1_v2 FOR EACH ROW + UPDATE t1_v2 SET t1_v2.v2 = t1_v2.v2+1; +--error ER_GALERA_REPLICATION_NOT_SUPPORTED +CREATE 
DEFINER=`root`@`localhost` TRIGGER increment_before_t2 + AFTER INSERT ON t2 FOR EACH ROW + UPDATE t2 SET t2.v1 = t2.v1+1; + +--connection node_2 +SHOW CREATE TABLE t1_v2; +SHOW CREATE TABLE t2; +SHOW CREATE VIEW x1; + +SELECT * FROM t1_v2; +SELECT * FROM t2; + +--connection node_1 +DROP VIEW x1; +DROP TRIGGER increment_before_t1; +DROP TABLE t1_v2; +# We allow dropping table +DROP TABLE t2; +SET GLOBAL wsrep_mode = ""; + +CREATE OR REPLACE TABLE t2 (v1 INT NOT NULL PRIMARY KEY) ENGINE=MyISAM + PARTITION BY KEY (v1) + PARTITIONS 2; +--echo # wsrep-mode= STRICT_REPLICATION +SET GLOBAL wsrep_mode = "STRICT_REPLICATION"; +SELECT @@wsrep_mode; +ALTER TABLE t2 ENGINE=InnoDB; +DROP TABLE t2; + +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_ignore_sb.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,8 +4,7 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_recovery.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_recovery.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_pc_recovery.test 
2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_pc_recovery.test 2025-05-19 16:14:24.000000000 +0000 @@ -33,8 +33,8 @@ # Perform --wsrep-recover and preserve the positions into variables by placing them in $MYSQL_TMP_DIR/galera_wsrep_start_position.inc and then --source'ing it ---exec $MYSQLD --defaults-group-suffix=.1 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.1.log > $MYSQL_TMP_DIR/galera_wsrep_recover.1.log 2>&1 ---exec $MYSQLD --defaults-group-suffix=.2 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --innodb --wsrep-recover --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.2.log > $MYSQL_TMP_DIR/galera_wsrep_recover.2.log 2>&1 +--exec $MYSQLD --defaults-group-suffix=.1 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --wsrep-recover --loose-innodb --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.1.log > $MYSQL_TMP_DIR/galera_wsrep_recover.1.log 2>&1 +--exec $MYSQLD --defaults-group-suffix=.2 --defaults-file=$MYSQLTEST_VARDIR/my.cnf --wsrep-recover --loose-innodb --log-error=$MYSQL_TMP_DIR/galera_wsrep_recover.2.log > $MYSQL_TMP_DIR/galera_wsrep_recover.2.log 2>&1 --perl use strict; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] query_cache_type=1 query_cache_size=1355776 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_invalidate.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_invalidate.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_invalidate.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_invalidate.test 
2025-05-19 16:14:24.000000000 +0000 @@ -29,7 +29,7 @@ --connection node_3 --disable_query_log ---eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', MASTER_PORT=$NODE_MYPORT_1, master_use_gtid=current_pos +--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_1, master_use_gtid=current_pos; --enable_query_log START SLAVE; --source include/wait_for_slave_to_start.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_query_cache_sync_wait.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,4 +7,3 @@ [mysqld.2] query_cache_type=1 query_cache_size=1355776 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_read_only.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_read_only.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_read_only.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_read_only.test 2025-05-19 16:14:24.000000000 +0000 @@ -48,4 +48,3 @@ SET GLOBAL read_only=FALSE; DROP TABLE t1; DROP USER foo@localhost; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_repl_key_format_flat16.test 2025-05-19 16:14:24.000000000 +0000 @@ -25,7 +25,6 @@ SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 234; SELECT COUNT(*) = 1 FROM t2 WHERE f1 = REPEAT('b', 256); - --disable_query_log --eval SET GLOBAL wsrep_provider_options = '$wsrep_provider_options_orig'; --enable_query_log diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_nochanges.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_nochanges.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_nochanges.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_nochanges.test 2025-05-19 16:14:24.000000000 +0000 @@ -37,4 +37,3 @@ --source include/auto_increment_offset_restore.inc --source include/galera_end.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -17,4 +17,3 @@ wsrep_gtid_domain_id=16 gtid_domain_id=11 gtid_strict_mode=1 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_restart_replica.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_restart_replica.test 2025-05-19 16:14:24.000000000 +0000 @@ -40,6 +40,7 @@ --let $node_1 = node_1 --let $node_2 = replica +--let $node_3 = primary --source include/auto_increment_offset_save.inc --connection replica diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_savepoint_replay.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_savepoint_replay.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_savepoint_replay.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_savepoint_replay.test 2025-05-19 16:14:24.000000000 +0000 @@ -83,4 +83,3 @@ SELECT COUNT(*) = 1 FROM t1 WHERE f2 = 'c'; DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequence_engine.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequence_engine.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequence_engine.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequence_engine.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,13 @@ --source include/galera_cluster.inc --source include/have_sequence.inc +--connection node_2 +let $restore_wsrep_ignore_apply_errors=`SELECT @@GLOBAL.wsrep_ignore_apply_errors`; +SET GLOBAL wsrep_ignore_apply_errors=0; + +--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 +--connection node_2a +SET SESSION wsrep_sync_wait=0; SET GLOBAL wsrep_ignore_apply_errors=0; SET SESSION AUTOCOMMIT=0; SET SESSION max_error_count=0; @@ -11,6 +18,8 @@ --error ER_NO_SUCH_TABLE SHOW CREATE TABLE t0; ---connection node_1 -SET GLOBAL wsrep_ignore_apply_errors=DEFAULT; +--disable_query_log +--eval SET GLOBAL wsrep_ignore_apply_errors=$restore_wsrep_ignore_apply_errors +--enable_query_log +--disconnect node_2a diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,13 +1,9 @@ !include ../galera_2nodes.cnf [mysqld.1] -log-bin -log-slave-updates auto-increment-increment=2 auto-increment-offset=1 [mysqld.2] -log-bin -log-slave-updates auto-increment-increment=2 auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.combinations mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.combinations --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] 
+log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,9 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc +--source include/have_aria.inc +--disable_ps2_protocol # # MDEV-19353 : Alter Sequence do not replicate to another nodes with in Galera Cluster # @@ -45,6 +48,7 @@ select NEXT VALUE FOR Seq1_1; --connection node_1 +SHOW CREATE SEQUENCE Seq1_1; DROP SEQUENCE Seq1_1; # @@ -316,6 +320,12 @@ DROP TABLE t1; DROP SEQUENCE t; +--connection node_2 +--let $wsrep_sync_wait_orig_2 = `SELECT @@wsrep_sync_wait` +SET SESSION wsrep_sync_wait=15; + +--connection node_1 + CREATE SEQUENCE t INCREMENT BY 0 CACHE=20 ENGINE=INNODB; CREATE TABLE t1(a int not null primary key default nextval(t), b int) engine=innodb; # @@ -338,6 +348,10 @@ SELECT * FROM t1; SELECT NEXTVAL(t); +--disable_query_log +--eval SET SESSION wsrep_sync_wait = $wsrep_sync_wait_orig_2 +--enable_query_log + --connection node_1 DROP TABLE t1; DROP SEQUENCE t; @@ -355,4 +369,17 @@ ALTER SEQUENCE IF EXISTS t MINVALUE=1; DROP TABLE t; + +--echo +--echo MDEV-32631: +--echo + +CREATE OR REPLACE TABLE t1(c INT ) ENGINE=ARIA; +SET SESSION WSREP_OSU_METHOD=RSU; +--error ER_NOT_SUPPORTED_YET +INSERT INTO t1 SELECT seq,concat(seq,1) FROM seq_1_to_100; +SET SESSION WSREP_OSU_METHOD=TOI; +DROP TABLE t1; + +--echo --echo End of 10.5 tests diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] +log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_bf_kill.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_bf_kill.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,115 @@ +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/galera_have_debug_sync.inc + +--disable_ps2_protocol +# +# We create InnoDB seqeuence with small cache that is then +# used as default value for column in table. 
+# +--connection node_1 +--let $wsrep_local_replays_old = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY, f2 INT) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1, 0), (3, 0); +--connection node_1 +START TRANSACTION; +INSERT INTO t1 VALUES (4, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (5, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (6, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (7, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (8, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (9, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (10, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (11, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (12, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (13, next value for s); # No conflict in cert +INSERT INTO t1 VALUES (14, next value for s); # No conflict in cert +SELECT * FROM t1 WHERE f1 > 0 FOR UPDATE; # Should cause GAP lock between 1 and 3 + +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +SET SESSION wsrep_sync_wait=0; +# Block the applier on node #1 and issue a conflicting update on node #2 +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_set_sync_point.inc + +# +# Send conflicting INSERT +# +--connection node_2 +INSERT INTO t1 VALUES (2, 2); # This should BF abort because of GAP lock + +--connection node_1a +--source include/galera_wait_sync_point.inc +--source include/galera_clear_sync_point.inc + +# Block the commit, send the COMMIT and wait until it gets blocked +--let $galera_sync_point = commit_monitor_master_enter_sync +--source include/galera_set_sync_point.inc + +--connection node_1 +--send COMMIT + +--connection node_1a + +--let $galera_sync_point = 
apply_monitor_slave_enter_sync commit_monitor_master_enter_sync +--source include/galera_wait_sync_point.inc +--source include/galera_clear_sync_point.inc + +--let $galera_sync_point = abort_trx_end +--source include/galera_set_sync_point.inc +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_signal_sync_point.inc +--let $galera_sync_point = abort_trx_end commit_monitor_master_enter_sync +--source include/galera_wait_sync_point.inc + +# Let the transactions proceed +--source include/galera_clear_sync_point.inc +--let $galera_sync_point = abort_trx_end +--source include/galera_signal_sync_point.inc +--let $galera_sync_point = commit_monitor_master_enter_sync +--source include/galera_signal_sync_point.inc + +# Commit succeeds +--connection node_1 +--reap + +# wsrep_local_replays has increased by 1 +--let $wsrep_local_replays_new = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_replays'` +--disable_query_log +--eval SELECT $wsrep_local_replays_new - $wsrep_local_replays_old = 1 AS wsrep_local_replays; +--enable_query_log + +INSERT INTO t1 VALUES (22, next value for s); +INSERT INTO t1 VALUES (23, next value for s); +INSERT INTO t1 VALUES (24, next value for s); +INSERT INTO t1 VALUES (25, next value for s); +INSERT INTO t1 VALUES (26, next value for s); +INSERT INTO t1 VALUES (27, next value for s); +INSERT INTO t1 VALUES (28, next value for s); +INSERT INTO t1 VALUES (29, next value for s); +INSERT INTO t1 VALUES (30, next value for s); +INSERT INTO t1 VALUES (31, next value for s); +INSERT INTO t1 VALUES (32, next value for s); +INSERT INTO t1 VALUES (33, next value for s); +INSERT INTO t1 VALUES (34, next value for s); +INSERT INTO t1 VALUES (35, next value for s); + +--connection node_1 +SELECT * FROM t1; +SELECT LASTVAL(s); + +--connection node_2 +SELECT * FROM t1; +SELECT LASTVAL(s); + +--connection node_1 +SELECT NEXTVAL(s); + +--connection node_2 +SELECT NEXTVAL(s); + +DROP 
SEQUENCE s; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +!include ../galera_2nodes.cnf + +[mysqld.1] +auto-increment-increment=2 +auto-increment-offset=1 + +[mysqld.2] +auto-increment-increment=2 +auto-increment-offset=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.combinations mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.combinations --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] +log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sequences_transaction.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sequences_transaction.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,255 @@ +--source include/galera_cluster.inc +--source include/have_sequence.inc + +--disable_ps2_protocol +# +# Case 1: Separate transactions from few connections +# +--connection node_1 +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; + +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 + +--connection node_1 +BEGIN; +INSERT INTO 
t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_2 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_2a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_1a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +COMMIT; + +--connection node_2 +SELECT LASTVAL(s); +--connection node_1 +SELECT LASTVAL(s); +--connection node_2a +SELECT LASTVAL(s); +--connection node_1a +SELECT LASTVAL(s); + +--connection node_1 +SELECT * FROM t1; +--connection node_2 +SELECT * FROM t1; + +--connection node_1 +DROP TABLE t1; +DROP SEQUENCE s; + +# +# Case 2: All rollback +# +--connection node_1 +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; + +--connection node_1 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO 
t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_2 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_2a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_1a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +ROLLBACK; + +--connection node_2 +SELECT LASTVAL(s); +--connection node_1 +SELECT LASTVAL(s); +--connection node_2a +SELECT LASTVAL(s); +--connection node_1a +SELECT LASTVAL(s); + +--connection node_1 +SELECT * FROM t1; +--connection node_2 +SELECT * FROM t1; + +--connection node_1 +DROP TABLE t1; +DROP SEQUENCE s; +# +# Case 3: Mixed transactions +# +--connection node_1 +CREATE SEQUENCE s INCREMENT=0 CACHE=5 ENGINE=InnoDB; +CREATE TABLE t1 (f1 INT PRIMARY KEY DEFAULT NEXTVAL(s), f2 INT) ENGINE=InnoDB; + +--connection node_1 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); 
+INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_1a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_2a +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_2 +BEGIN; +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); +INSERT INTO t1(f2) values (1); + +--connection node_1 +COMMIT; +--connection node_1a +ROLLBACK; +--connection node_2 +--error ER_LOCK_DEADLOCK +COMMIT; +--connection node_2a +--error ER_LOCK_DEADLOCK +ROLLBACK; + +--connection node_2 +SELECT LASTVAL(s); +--connection node_1 +SELECT LASTVAL(s); +--connection node_2a +SELECT LASTVAL(s); +--connection node_1a +SELECT LASTVAL(s); + +--connection node_1 +SELECT * FROM t1; +--connection node_2 +SELECT * FROM t1; + +--connection node_1 +DROP TABLE t1; +DROP SEQUENCE s; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_server.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_server.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_server.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_server.test 2025-05-19 16:14:24.000000000 +0000 @@ -25,4 +25,3 @@ --source include/galera_end.inc --echo # End of test - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_set_position_after_cert_failure.test 2025-05-19 16:14:24.000000000 +0000 @@ -95,4 +95,5 @@ --connection node_1 DROP TABLE t1; SET GLOBAL wsrep_slave_threads = DEFAULT; + --source include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_slave_replay.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_slave_replay.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_slave_replay.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_slave_replay.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,6 +6,7 @@ # or rollback and replay (depending on the nature of lock conflict). 
# +--source include/galera_cluster.inc --source include/have_innodb.inc --source include/have_log_bin.inc --source include/have_debug.inc @@ -13,9 +14,7 @@ --source include/galera_have_debug_sync.inc --connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 - --connection node_2a ---source include/galera_cluster.inc ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sp_bf_abort.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_sp_bf_abort.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sp_bf_abort.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sp_bf_abort.inc 2025-05-19 16:14:24.000000000 +0000 @@ -35,4 +35,3 @@ --source include/galera_signal_sync_point.inc --let $galera_sync_point = after_replicate_sync --source include/galera_signal_sync_point.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_split_brain.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_split_brain.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_split_brain.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_split_brain.test 2025-05-19 16:14:24.000000000 +0000 @@ -13,6 +13,7 @@ --let $node_2=node_2 --source include/auto_increment_offset_save.inc +--connection node_2 call mtr.add_suppression("WSREP: TO isolation failed for: "); --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sql_log_bin_zero.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,7 +17,6 @@ INSERT INTO t1 VALUES (2); - --connection node_2 SELECT COUNT(*) = 2 FROM t1; SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=10M;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=10M;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,10 @@ --source include/galera_cluster.inc --source include/have_innodb.inc --source include/big_test.inc +--source include/have_perfschema.inc + +# Verify that SSL is handled by the provider. 
+SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,11 @@ +!include ../galera_2nodes.cnf + +[mysqld] +loose-galera-ssl-cipher=1 +wsrep-debug=1 + +[mysqld.1] +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;cert.log_conflicts=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + +[mysqld.2] +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;cert.log_conflicts=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_cipher.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_cipher.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,82 @@ +# +# Test upgrading the SSL cipher +# + +--source 
include/galera_cluster.inc +--source include/have_ssl_communication.inc +--source include/have_openssl.inc +--source include/force_restart.inc + +# +# Lowest supported Galera library version +# +--let $galera_version=26.4.21 +source ../wsrep/include/check_galera_version.inc; + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +# Setup galera ports +--connection node_1 +--source suite/galera/include/galera_base_port.inc +--let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT + +--connection node_2 +--source suite/galera/include/galera_base_port.inc +--let $NODE_GALERAPORT_2 = $_NODE_GALERAPORT + +SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 2. Restart node #1 with a socket.ssl_cipher + +--connection node_1 +--source include/shutdown_mysqld.inc +--let $restart_noprint = 1 +--let $start_mysqld_params = --wsrep-cluster-address=gcomm://127.0.0.1:$NODE_GALERAPORT_2 --wsrep_provider_options=base_port=$NODE_GALERAPORT_1;socket.ssl=yes;socket.ssl_ca=$MYSQL_TEST_DIR/std_data/galera-upgrade-ca-cert.pem;socket.ssl_cert=$MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=$MYSQL_TEST_DIR/std_data/galera-key.pem;socket.ssl_cipher=AES256-SHA +--source include/start_mysqld.inc +--source include/wait_until_connected_again.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 3. 
Restart node #2 with the new socket.ssl_ca , socket.ssl_cert, socket.ssl_key and socket.ssl_cipher + +--connection node_2 +--source include/shutdown_mysqld.inc +--let $start_mysqld_params = --wsrep_provider_options=base_port=$NODE_GALERAPORT_2;socket.ssl=yes;socket.ssl_ca=$MYSQL_TEST_DIR/std_data/galera-upgrade-ca-cert.pem;socket.ssl_cert=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-cert.pem;socket.ssl_key=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-key.pem;socket.ssl_cipher=AES256-SHA +--source include/start_mysqld.inc +--source include/wait_until_connected_again.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 4. Restart node #1 with the new socket.ssl_ca , socket.ssl_cert, socket.ssl_key and socket.ssl_cipher + +--connection node_1 +--source include/shutdown_mysqld.inc +--let $start_mysqld_params = --wsrep-cluster-address=gcomm://127.0.0.1:$NODE_GALERAPORT_2 --wsrep_provider_options=base_port=$NODE_GALERAPORT_1;socket.ssl=yes;socket.ssl_ca=$MYSQL_TEST_DIR/std_data/galera-upgrade-ca-cert.pem;socket.ssl_cert=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-cert.pem;socket.ssl_key=$MYSQL_TEST_DIR/std_data/galera-upgrade-server-key.pem;socket.ssl_cipher=AES256-SHA +--source include/start_mysqld.inc +--source include/wait_until_connected_again.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; +--source include/wait_condition.inc +SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; + +# 5. Make sure node_2 is ready as well +--connection node_2 +--source include/galera_wait_ready.inc + +# Upgrade complete. 
Both nodes now use the new key and certificate + +# Restore original auto_increment_offset values. +--source include/auto_increment_offset_restore.inc + +--connection node_1 +call mtr.add_suppression("WSREP: write_handler\\(\\)"); +--connection node_2 +call mtr.add_suppression("WSREP: write_handler\\(\\)"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/cakey.pem;socket.ssl_compression=YES;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_compression.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_compression.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,10 @@ --source include/galera_cluster.inc --source include/have_innodb.inc --source include/big_test.inc +--source include/have_perfschema.inc + +# Verify that SSL is handled by the provider. +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ wsrep-debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem' 
+wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_ssl_upgrade.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_ssl_upgrade.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,9 @@ --source include/have_openssl.inc --source include/force_restart.inc +# Verify that SSL is handled by the provider. +SELECT COUNT(*) `expect 0` FROM performance_schema.socket_instances WHERE EVENT_NAME LIKE '%wsrep%'; + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_encrypted.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_encrypted.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_encrypted.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_encrypted.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -11,7 +11,7 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,12 +6,12 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' innodb_fast_shutdown=0 innodb_undo_tablespaces=0 [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' innodb_fast_shutdown=0 innodb_undo_tablespaces=3 loose_innodb_log_file_buffering diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_data_dir.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,11 +6,11 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/data_dir_test -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_force_recovery.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,10 +6,10 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,28 @@ +!include ../galera_2nodes.cnf + +[mysqld] +wsrep_sst_method=mariabackup +wsrep_sst_auth="root:" +gtid_strict_mode=ON +wsrep-gtid_mode=ON +log-bin +log-slave_updates +loose-galera-sst-mariabackup-gtid=1 + +[mysqld.1] +wsrep_provider_options='pc.weight=2;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 + +[mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 + +[sst] +transferfmt=@ENV.MTR_GALERA_TFMT +streamfmt=mbstream diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_gtid.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,29 @@ +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc +--source include/force_restart.inc + +# Save original auto_increment_offset values. 
+--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +--source suite/galera/include/galera_st_shutdown_slave.inc +--source suite/galera/include/galera_st_clean_slave.inc + +--source suite/galera/include/galera_st_kill_slave.inc +--source suite/galera/include/galera_st_kill_slave_ddl.inc + +# Restore original auto_increment_offset values. +--source include/auto_increment_offset_restore.inc + +--connection node_1 +--echo # Node_1 +SHOW global variables like 'gtid%pos'; + +--connection node_2 +--echo # Node_2 +SHOW global variables like 'gtid%pos'; + +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_logarchive.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,10 +6,10 @@ wsrep_debug=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_lost_found.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ wsrep_sst_auth="root:" [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_table_options.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,10 +7,10 @@ innodb-file-per-table=ON [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [sst] transferfmt=@ENV.MTR_GALERA_TFMT diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,4 +5,4 @@ wsrep_sst_auth="root:" [mariabackup] -use_memory=123m +use_memory=129m diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mariabackup_use_memory.test 2025-05-19 16:14:24.000000000 +0000 @@ -40,8 +40,8 @@ --source include/wait_condition.inc # Confirm that IST did not take place ---let $assert_text = mariabackup: Using 128974848 bytes for buffer pool \(set by --use-memory parameter\) ---let $assert_select = mariabackup: Using 128974848 bytes for buffer pool \(set by --use-memory parameter\) +--let $assert_text = mariabackup: Using 134217728 bytes for buffer pool \(set by --use-memory parameter\) +--let $assert_select = mariabackup: Using 134217728 bytes for buffer pool \(set by --use-memory parameter\) --let $assert_count = 1 --let $assert_file = $MYSQLTEST_VARDIR/mysqld.2/data/mariabackup.prepare.log --let $assert_only_after = Starting InnoDB instance for recovery diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ # causes the first MTR connection to be forefully dropped by Galera, which in turn confuses MTR [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_mysqldump_with_key.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,10 +9,10 @@ loose-galera_sst_mysqldump_with_key=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [client] ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,7 +4,7 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync2.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync2.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync2.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync2.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,11 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=@ENV.MYSQLTEST_VARDIR/mysqld.1/server1_binlog log_bin_index=@ENV.MYSQLTEST_VARDIR/tmp/server1_binlog_index.index [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=@ENV.MYSQLTEST_VARDIR/mysqld.2/server2_binlog log_bin_index=@ENV.MYSQLTEST_VARDIR/tmp/server2_binlog_index.index diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_binlogname.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,9 +4,9 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=server1_binlog [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_bin=server2_binlog diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_data_dir.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,11 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' innodb_data_home_dir=@ENV.MYSQL_TMP_DIR/rsync_test_2 -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' [sst] backup_threads=2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_capath.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,7 @@ ssl-mode=VERIFY_CA [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_key.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -8,7 +8,7 @@ tcert=@ENV.MYSQL_TEST_DIR/std_data/server-cert.pem [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_encrypt_with_server.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,7 +10,7 @@ ssl-mode=VERIFY_CA [mysqld.1] 
-wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,23 @@ +!include ../galera_2nodes.cnf + +[mysqld] +wsrep_sst_method=rsync +gtid_strict_mode=ON +wsrep-gtid_mode=ON +log-bin +log-slave_updates +loose-galera-sst-rsync-gtid=1 + +[mysqld.1] +wsrep_provider_options='pc.weight=2;pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 + +[mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +gtid_domain_id=10 +wsrep_gtid_domain_id=100 +wsrep_slave_threads=4 +server-id=10 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_gtid.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,29 @@ +--source include/big_test.inc +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_mariabackup.inc +--source include/force_restart.inc + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +--source suite/galera/include/galera_st_shutdown_slave.inc +--source suite/galera/include/galera_st_clean_slave.inc + +--source suite/galera/include/galera_st_kill_slave.inc +--source suite/galera/include/galera_st_kill_slave_ddl.inc + +# Restore original auto_increment_offset values. +--source include/auto_increment_offset_restore.inc + +--connection node_1 +--echo # Node_1 +SHOW global variables like 'gtid%pos'; + +--connection node_2 +--echo # Node_2 +SHOW global variables like 'gtid%pos'; + +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_logbasename.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,11 +4,11 @@ wsrep_sst_method=rsync [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_basename=server1 log_bin [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' log_basename=server2 log_bin diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sst_rsync_recv_auto.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,14 +5,14 @@ bind-address=:: [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;gcache.size=1;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_incoming_address='[::1]:@mysqld.1.port' wsrep_node_address=::1 wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' [mysqld.2] +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' +wsrep_node_incoming_address='[::1]:@mysqld.2.port' wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;gcache.size=1;pc.ignore_sb=true' wsrep_node_address=::1 -wsrep_node_incoming_address='[::1]:@mysqld.2.port' wsrep_sst_receive_address=AUTO diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_cluster.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_cluster.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_cluster.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_cluster.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,5 +14,3 @@ SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_index.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_index.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_index.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_index.test 2025-05-19 16:14:24.000000000 +0000 @@ -12,7 +12,6 @@ --connection node_2 INSERT INTO wsrep_local_indexes VALUES ((SELECT variable_value FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE variable_name = 'wsrep_local_index')); - --connection node_1 SELECT COUNT(*) = 2 FROM wsrep_local_indexes; SELECT COUNT(DISTINCT wsrep_local_index) = 2 FROM wsrep_local_indexes; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_state.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_state.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_status_local_state.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_status_local_state.test 2025-05-19 16:14:24.000000000 
+0000 @@ -22,7 +22,3 @@ --source include/wait_condition.inc SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_innodb.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_innodb.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_innodb.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_innodb.test 2025-05-19 16:14:24.000000000 +0000 @@ -15,9 +15,10 @@ # In both cases apply flood control if >= 10 same warning # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc -call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled. Storage engine .*"); +call mtr.add_suppression("WSREP: wsrep_mode = STRICT_REPLICATION enabled\\. Storage engine "); CREATE TABLE t1(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int NOT NULL PRIMARY KEY, b varchar(50)) ENGINE=MYISAM; @@ -114,4 +115,3 @@ SET GLOBAL log_warnings=DEFAULT; SET GLOBAL wsrep_mode=DEFAULT; --disable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_primary_key.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_primary_key.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_strict_require_primary_key.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_strict_require_primary_key.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,9 +14,10 @@ # In both cases apply flood control if >= 10 same warning # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc -call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled. Table .*"); +call mtr.add_suppression("WSREP: wsrep_mode = REQUIRED_PRIMARY_KEY enabled\\. 
Table "); CREATE TABLE t1(a int, b varchar(50)) ENGINE=INNODB; CREATE TABLE t2(a int, b varchar(50)) ENGINE=MYISAM; @@ -140,4 +141,3 @@ SET GLOBAL log_warnings=DEFAULT; SET GLOBAL wsrep_mode=DEFAULT; --disable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_suspend_slave.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_suspend_slave.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_suspend_slave.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_suspend_slave.test 2025-05-19 16:14:24.000000000 +0000 @@ -67,4 +67,3 @@ # Restore original auto_increment_offset values. --let $node_2=node_2a --source include/auto_increment_offset_restore.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---wsrep-sync-wait=0 --wsrep-causal-reads=OFF \ No newline at end of file +--wsrep-sync-wait=0 --wsrep-causal-reads=OFF diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_sync_wait_upto.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_sync_wait_upto.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,8 +3,8 @@ # --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_debug.inc ---source include/have_debug_sync.inc CREATE TABLE t1 (f1 INTEGER) ENGINE=InnoDB; INSERT INTO t1 VALUES (1); @@ -44,7 +44,6 @@ --eval SELECT WSREP_SYNC_WAIT_UPTO_GTID('$wsrep_last_committed_gtid') AS WSREP_SYNC_WAIT_UPTO; --enable_query_log - # Timeout 
if GTID is not received on time --disable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_table_with_hyphen.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_table_with_hyphen.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_table_with_hyphen.inc 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_table_with_hyphen.inc 2025-05-19 16:14:24.000000000 +0000 @@ -45,4 +45,3 @@ --connection node_2 --eval drop table `$fk_child` --eval drop table `$fk_parent` - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_temporary_sequences.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_temporary_sequences.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_temporary_sequences.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_temporary_sequences.test 2025-05-19 16:14:24.000000000 +0000 @@ -30,7 +30,6 @@ SHOW CREATE TABLE seq1; SHOW CREATE TABLE seq2; - --connection node_1 DROP TABLE t; DROP SEQUENCE seq1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -13,5 +13,3 @@ log_slave_updates=ON wsrep_sst_method=rsync thread_handling = pool-of-threads - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_threadpool.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_threadpool.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,6 @@ --let $node_1 = node_1 --let $node_2 = node_2 - --source ../galera/include/auto_increment_offset_save.inc # diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ddl_nonconflicting.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,43 +1,81 @@ --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/have_sequence.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/galera_have_debug_sync.inc # # In this test, we simultaneously send two non-conflicting ALTER TABLE statements # +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 +--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 +--connection node_1 CREATE TABLE t1 (f1 INTEGER PRIMARY KEY AUTO_INCREMENT, f2 INTEGER); +INSERT INTO t1(f2) SELECT seq FROM seq_1_to_1000; ---connection node_2 +--connection node_2a +SET SESSION wsrep_sync_wait=0; --let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; --source include/wait_condition.inc ---send ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 123); +--let $wait_condition = SELECT COUNT(*) = 1000 FROM t1; +--source include/wait_condition.inc + +--connection node_1a +--echo # Block the applier on node_1 and issue a ddl from node_2 +SET SESSION wsrep_sync_wait=0; +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_set_sync_point.inc +--connection node_2 +--echo # DDL 1 +--send ALTER TABLE t1 ADD COLUMN f3 INTEGER; INSERT INTO t1 VALUES (NULL, 10000, 10000); + +--connection node_1a +--source include/galera_wait_sync_point.inc +--source include/galera_clear_sync_point.inc + +--echo # This will block on acquiring total order isolation --connection node_1 +--echo # DDL 2 --send CREATE UNIQUE INDEX i1 ON t1(f2); 
+--connection node_1a +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'acquiring total order%' or STATE LIKE 'Waiting for table metadata%' +--source include/wait_condition.inc + +--echo # Signal DDL 1 +--source include/galera_clear_sync_point.inc +--let $galera_sync_point = apply_monitor_slave_enter_sync +--source include/galera_signal_sync_point.inc + --connection node_2 --reap -INSERT INTO t1 (f1, f2) VALUES (DEFAULT, 234); +--connection node_1 +--reap + +--connection node_2 --let $wait_condition = SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM t1; +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; +SHOW CREATE TABLE t1; +SELECT COUNT(*) AS EXPECT_1001 FROM t1; --connection node_1 ---reap - --let $wait_condition = SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; --source include/wait_condition.inc -SELECT COUNT(*) = 3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; -SELECT COUNT(*) = 2 FROM t1; +SELECT COUNT(*) AS EXPECT_3 FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 't1'; +SELECT COUNT(*) AS EXPECT_2 FROM INFORMATION_SCHEMA.STATISTICS WHERE TABLE_NAME = 't1'; +SHOW CREATE TABLE t1; +SELECT COUNT(*) AS EXPECT_1001 FROM t1; DROP TABLE t1; diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ftwrl.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ftwrl.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_toi_ftwrl.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_toi_ftwrl.test 2025-05-19 16:14:24.000000000 +0000 @@ -19,4 +19,3 @@ SHOW CREATE TABLE t1; DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_transaction_read_only.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_transaction_read_only.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_transaction_read_only.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_transaction_read_only.test 2025-05-19 16:14:24.000000000 +0000 @@ -55,4 +55,3 @@ --enable_query_log DROP TABLE t1; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_udf.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_udf.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_udf.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_udf.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,10 +6,3 @@ [mysqld.2] query_cache_type=1 - - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_unicode_identifiers.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_unicode_identifiers.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_unicode_identifiers.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_unicode_identifiers.test 2025-05-19 16:14:24.000000000 +0000 @@ -75,4 +75,3 @@ DROP DATABASE `база`; DROP DATABASE `втора база`; --eval SET GLOBAL wsrep_sync_wait = $wsrep_sync_wait_orig - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_v1_row_events.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_v1_row_events.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_v1_row_events.cnf 2025-01-30 11:01:23.000000000 
+0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_v1_row_events.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,10 +4,3 @@ log-bin-use-v1-row-events=1 [mysqld.2] - - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_OSU_method2.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_OSU_method2.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_OSU_method2.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_OSU_method2.test 2025-05-19 16:14:24.000000000 +0000 @@ -44,4 +44,3 @@ --connection node_1a SET DEBUG_SYNC= 'RESET'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_off.test 2025-05-19 16:14:24.000000000 +0000 @@ -94,11 +94,13 @@ --eval SET GLOBAL wsrep_auto_increment_control = $auto_increment_control_orig --eval SET GLOBAL auto_increment_increment = $auto_increment_increment_node1 --eval SET GLOBAL auto_increment_offset = $auto_increment_offset_node1 +--disconnect node_1a --connection node_2 --eval SET GLOBAL wsrep_auto_increment_control = $auto_increment_control_orig --eval SET GLOBAL auto_increment_increment = $auto_increment_increment_node2 --eval SET GLOBAL auto_increment_offset = $auto_increment_offset_node2 +--disconnect node_2a --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_auto_inc_control_on.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,9 
+5,3 @@ [mysqld.2] wsrep-auto-increment-control=ON - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_ignore_apply_errors.test 2025-05-19 16:14:24.000000000 +0000 @@ -22,6 +22,8 @@ DROP TABLE t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc SHOW TABLES; # Drop schema that does not exist @@ -33,6 +35,8 @@ DROP SCHEMA s1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME LIKE 's1'; +--source include/wait_condition.inc SHOW SCHEMAS; # Drop index that does not exist using DROP INDEX @@ -45,6 +49,10 @@ DROP INDEX idx1 ON t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_INDEXES WHERE NAME LIKE 'idx1'; +--source include/wait_condition.inc SHOW CREATE TABLE t1; DROP TABLE t1; @@ -58,6 +66,10 @@ ALTER TABLE t1 DROP INDEX idx1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_INDEXES WHERE NAME LIKE 'idx1'; +--source include/wait_condition.inc SHOW CREATE TABLE t1; DROP TABLE t1; @@ -71,6 +83,11 @@ ALTER TABLE t1 DROP COLUMN f2; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source 
include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS WHERE NAME LIKE 'f2'; +--source include/wait_condition.inc + SHOW CREATE TABLE t1; DROP TABLE t1; @@ -93,6 +110,10 @@ SELECT COUNT(*) AS expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 0 FROM t1; +--source include/wait_condition.inc SELECT COUNT(*) AS expect_0 FROM t1; DROP TABLE t1; @@ -112,6 +133,10 @@ SELECT COUNT(*) AS expect_1 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc SELECT COUNT(*) AS expect_1 FROM t1; DROP TABLE t1; @@ -136,6 +161,8 @@ SELECT COUNT(*) AS expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM t1; --source include/wait_condition.inc SELECT VARIABLE_VALUE expect_Primary FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; @@ -171,6 +198,8 @@ SELECT COUNT(*) AS expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/t1'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM t1; --source include/wait_condition.inc SELECT VARIABLE_VALUE expect_Primary FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; @@ -202,6 +231,8 @@ SELECT COUNT(*) expect_0 FROM t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE 
NAME LIKE 'test/t1'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM t1; --source include/wait_condition.inc SELECT VARIABLE_VALUE = 'Primary' FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; @@ -219,6 +250,10 @@ INSERT INTO child VALUES (1,1),(2,2),(3,3); --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/parent'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/child'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 3 FROM child; --source include/wait_condition.inc @@ -233,6 +268,10 @@ SELECT COUNT(*) AS expect_0 FROM child; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/parent'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES WHERE NAME LIKE 'test/child'; +--source include/wait_condition.inc --let $wait_condition = SELECT COUNT(*) = 0 FROM child; --source include/wait_condition.inc SELECT VARIABLE_VALUE = 'Primary' FROM performance_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_notify_ssl_ipv6.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ ssl-ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem [mysqld.1] 
-wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;repl.causal_read_timeout=PT90S;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_incoming_address='[::1]:@mysqld.1.port' wsrep_node_address=[::1]:@mysqld.1.#galera_port wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' @@ -14,7 +14,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;repl.causal_read_timeout=PT90S;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_incoming_address='[::1]:@mysqld.2.port' wsrep_node_address=[::1]:@mysqld.2.#galera_port wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_off.test 
2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,6 @@ # --source include/galera_cluster.inc ---source include/have_innodb.inc --source include/have_aria.inc CREATE TABLE t1 (f1 INT PRIMARY KEY) Engine=Aria; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_aria_on.test 2025-05-19 16:14:24.000000000 +0000 @@ -234,4 +234,3 @@ --connection node_2 SET GLOBAL wsrep_mode = DEFAULT; --enable_query_log - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_off.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,7 +3,6 @@ # --source include/galera_cluster.inc ---source include/have_innodb.inc CREATE TABLE t1 (f1 INT PRIMARY KEY) Engine=MyISAM; INSERT INTO t1 VALUES (1); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_replicate_myisam_on.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,6 +21,11 @@ INSERT INTO t1 SELECT 4 FROM DUAL UNION ALL SELECT 5 FROM DUAL; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 5 FROM t1; +--source 
include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_5 FROM t1; DROP TABLE t1; @@ -36,6 +41,13 @@ REPLACE INTO t1 SELECT 3, 'yyy' FROM DUAL; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 3 FROM t1; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1 WHERE f1 = 3 AND f2 = 'yyy'; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_3 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t1 WHERE f1 = 1 AND f2 = 'klm'; SELECT COUNT(*) AS EXPECT_1 FROM t1 WHERE f1 = 2 AND f2 = 'xyz'; @@ -49,6 +61,9 @@ UPDATE t1 SET f2 = 'zzz' WHERE f2 = 'yyy'; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1 WHERE f2 = 'zzz'; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_1 FROM t1 WHERE f2 = 'zzz'; # @@ -59,6 +74,9 @@ DELETE FROM t1 WHERE f2 = 'zzz'; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM t1 WHERE f2 = 'zzz'; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_0 FROM t1 WHERE f2 = 'zzz'; # @@ -69,6 +87,9 @@ TRUNCATE TABLE t1; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 0 FROM t1; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_0 FROM t1; DROP TABLE t1; @@ -77,8 +98,8 @@ # --connection node_1 -CREATE TABLE t1 (f1 INTEGER) ENGINE=MyISAM; -CREATE TABLE t2 (f1 INTEGER) ENGINE=InnoDB; +CREATE TABLE t1 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=MyISAM; +CREATE TABLE t2 (f1 INTEGER NOT NULL PRIMARY KEY) ENGINE=InnoDB; SET AUTOCOMMIT=OFF; START TRANSACTION; INSERT INTO t1 VALUES (1); @@ -86,6 +107,15 @@ COMMIT; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; +--source 
include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t2; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_1 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t2; @@ -100,6 +130,11 @@ ROLLBACK; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 2 FROM t1; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t2; +--source include/wait_condition.inc + SELECT COUNT(*) AS EXPECT_2 FROM t1; SELECT COUNT(*) AS EXPECT_1 FROM t2; @@ -119,13 +154,20 @@ INSERT INTO t2 VALUES (1); --connection node_2 -# The MyISAM update is replicated immediately, so a duplicate key error happens even before the COMMIT +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc +# The MyISAM update is replicated when executed, so a duplicate key error happens even before the COMMIT --error ER_DUP_ENTRY INSERT INTO t1 VALUES (1); --connection node_1 COMMIT; DROP TABLE t1, t2; + # # Test prepared staments # @@ -146,6 +188,10 @@ SELECT * FROM t1 ORDER BY id; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 11 FROM t1; +--source include/wait_condition.inc SELECT * FROM t1 ORDER BY id; DROP TABLE t1; @@ -172,6 +218,10 @@ SELECT * FROM t1 ORDER BY id; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 11 FROM t1; +--source 
include/wait_condition.inc SELECT * FROM t1 ORDER BY id; DROP PROCEDURE proc; @@ -195,26 +245,46 @@ SELECT * FROM t2 ORDER BY id; --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't2'; +--source include/wait_condition.inc +SELECT COUNT(*) FROM t1; +--let $wait_condition = SELECT COUNT(*) = 10 FROM t1; +--source include/wait_condition.inc SELECT * FROM t1 ORDER BY id; SELECT * FROM t2 ORDER BY id; DROP TRIGGER tr1; DROP TRIGGER tr2; DROP TRIGGER tr3; -DROP TABLE t1,t2; +DROP TABLE t1, t2; + +CREATE TABLE t1 (a INT, b INT, UNIQUE(a)) ENGINE=MyISAM; +CREATE TRIGGER tr1 BEFORE INSERT ON t1 FOR EACH ROW SET NEW.a=1; +INSERT INTO t1 (a,b) VALUES (10,20); +SELECT * from t1; + +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1'; +--source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) = 1 FROM t1; +--source include/wait_condition.inc + +SELECT * from t1; +--connection node_1 +DROP TABLE t1; --echo # --echo # MDEV-11152: wsrep_replicate_myisam: SELECT gets replicated using TO --echo # --connection node_1 -CREATE TABLE t1 (i INT) ENGINE=INNODB; +CREATE TABLE t1 (i INT NOT NULL PRIMARY KEY) ENGINE=INNODB; INSERT INTO t1 VALUES(1); # This command should not get replicated. 
SELECT * FROM t1; DROP TABLE t1; ---connection node_1 ---disable_query_log SET GLOBAL wsrep_mode = DEFAULT; + --connection node_2 SET GLOBAL wsrep_mode = DEFAULT; ---enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_slave_threads.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_slave_threads.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_slave_threads.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_slave_threads.test 2025-05-19 16:14:24.000000000 +0000 @@ -15,11 +15,15 @@ --connection node_1 --let $wsrep_slave_threads_orig = `SELECT @@wsrep_slave_threads` + CREATE TABLE t1 (f1 INT PRIMARY KEY) Engine=InnoDB; CREATE TABLE t2 (f1 INT AUTO_INCREMENT PRIMARY KEY) Engine=InnoDB; --connection node_2 +--let $wsrep_slave_threads_orig_2 = `SELECT @@wsrep_slave_threads` + CALL mtr.add_suppression("WSREP: Refusing exit for the last slave thread\\."); + # Setting wsrep_slave_threads to zero triggers a warning SET GLOBAL wsrep_slave_threads = 0; SHOW WARNINGS; @@ -74,7 +78,9 @@ --let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_applier_thread_count'; --source include/wait_condition.inc ---eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig_2; +--enable_query_log DROP TABLE t1; DROP TABLE t2; @@ -94,6 +100,11 @@ SET GLOBAL wsrep_slave_threads = 1; --connection node_1 + +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig; +--enable_query_log + INSERT INTO t1 VALUES (DEFAULT); INSERT INTO t1 VALUES (DEFAULT); INSERT INTO t1 VALUES (DEFAULT); @@ -106,6 +117,10 @@ --connection node_2 +--disable_query_log +--eval SET GLOBAL wsrep_slave_threads = $wsrep_slave_threads_orig_2; +--enable_query_log + # Wait until above DDL is replicated # # make sure that we are left with exactly one 
applier thread before we leaving the test diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_wsrep_mode.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_wsrep_mode.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_var_wsrep_mode.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_var_wsrep_mode.test 2025-05-19 16:14:24.000000000 +0000 @@ -57,9 +57,3 @@ # reset SET GLOBAL wsrep_mode=DEFAULT; - - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,20 @@ +!include ../galera_4nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 + +[mysqld.1] +wsrep_node_name='node_1' + +[mysqld.2] +wsrep_node_name='node_2' + +[mysqld.3] +wsrep_node_name='node_3' + +[mysqld.4] +wsrep_node_name='node_4' +wsrep_sst_donor='node_1' + +[ENV] +galera_cluster_size=4 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_during_ist.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_during_ist.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,165 @@ +# +# Test a case where a joiner encounters an error during IST +# Instead of voting it should assume error and bail out. +# + +--source include/galera_cluster.inc +--source include/big_test.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +# Make sure that the test is operating on the right version of galera library. 
+--let $galera_version=26.4.19 +source ../wsrep/include/check_galera_version.inc; + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--let $node_4=node_4 +--source ../include/auto_increment_offset_save.inc + +# create table t1 and procedure p1 to generate wirtesets +--connection node_1 +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); + +DELIMITER |; +CREATE PROCEDURE p1(IN max INT) +BEGIN + DECLARE i INT; + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; + + SET i = 0; + WHILE i < max DO + INSERT IGNORE INTO t1 VALUES (DEFAULT); + SET i = i + 1; + END WHILE; +END| +DELIMITER ;| + +CALL p1(130); + +--connection node_4 +--echo Shutting down server 4... +--let $node_4_server_id= `SELECT @@server_id` +--let $node_4_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_4_server_id.expect +--let $node_4_pid_file= `SELECT @@pid_file` +--source include/shutdown_mysqld.inc + +# Wait for node #4 to leave cluster +--let $members = 3 +--connection node_1 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_3 +--source include/wsrep_wait_membership.inc +--echo Server 4 left the cluster + +# Create some writesets for IST +--connection node_1 +CALL p1(130); + +# Create a writeset that node 4 won't be able to apply by creating a table +# that won't be present in the replication stream +--connection node_1 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_2 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_3 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +# This should cause error during IST +INSERT INTO t2 VALUES (DEFAULT); + +# make sure nodes 1,2,3 progress far enough for commit cut update +CALL p1(130); + +--connection node_1 +# prepare to stop SST donor 
thread when it receives a request from starting node #4 +SET GLOBAL debug = "+d,sync.wsrep_sst_donor_after_donation"; + +--echo Restarting server 4 +# Need to use this form instead of start_mysqld.inc because the latter is blocking +--exec echo "restart:$start_mysqld_params" > $node_4_expect_file_name + +--echo Wait for server 1 to become a donor +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached"; +--echo Server 1 got SST request from server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_sst_donor_after_donation_continue"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; + +# +# After this point node #4 shall proceed to IST and bail out +# + +--echo Waiting for server 4 to leave the cluster +--let $members = 3 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_3 +--source include/wsrep_wait_membership.inc + +--connection node_4 +--echo Server 4 left the cluster, killing it... + +# Kill the connected server +--exec echo "wait" > $node_4_expect_file_name +--let KILL_NODE_PIDFILE = $node_4_pid_file +--perl + my $pid_filename = $ENV{'KILL_NODE_PIDFILE'}; + my $mysqld_pid = `cat $pid_filename`; + chomp($mysqld_pid); + system("kill -9 $mysqld_pid"); + exit(0); +EOF +--echo Killed server 4... +--source include/wait_until_disconnected.inc +--echo Restarting server 4... 
+--source include/start_mysqld.inc +--source include/galera_wait_ready.inc + +# Confirm node #4 has rejoined +--connection node_1 +--let $members = 4 +--source include/wsrep_wait_membership.inc + +# Confirm that all is good and all nodes have identical data + +--connection node_1 +SELECT count(*) AS expect1_390 FROM t1; +SELECT count(*) AS expect1_1 FROM t2; + +--connection node_2 +SELECT count(*) AS expect2_390 FROM t1; +SELECT count(*) AS expect2_1 FROM t2; + +--connection node_3 +SELECT count(*) AS expect3_390 FROM t1; +SELECT count(*) AS expect3_1 FROM t2; + +--connection node_4 +SELECT count(*) AS expect4_390 FROM t1; +SELECT count(*) AS expect4_1 FROM t2; + +DROP TABLE t1; +DROP TABLE t2; +DROP PROCEDURE p1; + +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Failed on preordered"); +CALL mtr.add_suppression("Failed to apply write set"); +CALL mtr.add_suppression("Sending JOIN failed: -103"); +CALL mtr.add_suppression("Failed to JOIN the cluster after SST"); + +--source ../include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,21 @@ +!include ../galera_4nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 + +[mysqld.1] +wsrep_node_name='node_1' + +[mysqld.2] +wsrep_node_name='node_2' + +[mysqld.3] +wsrep_node_name='node_3' + +[mysqld.4] +wsrep_node_name='node_4' +wsrep_sst_donor='node_1' + +[ENV] +galera_cluster_size=4 +MTR_SST_JOINER_DELAY=20 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.test 
mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_apply.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_apply.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,73 @@ +# +# Test a case where a vote happens in JOINED state after SST on a writeset +# that should be applied. +# + +--source galera_vote_joined_begin.inc +# +# At this point state snapshot has been copied, node 1 is operational and +# we have about 10 seconds while everything we do will go into the replication +# queue on node 4 which it will have to apply on top of the snapshot. +# + +# Increase replication queue on node_4 +--connection node_1 +CALL p1(130); + +# Create a writeset that node 4 won't be able to apply by creating a table +# that won't be present in the replication stream +--connection node_1 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_2 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +--connection node_3 +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +# This should cause node #4 to initiate a vote and leave the cluster +INSERT INTO t2 VALUES (DEFAULT); + +# make sure nodes 1,2,3 progress far enough for commit cut update +CALL p1(130); + +--echo Waiting for server 4 to leave the cluster +--let $members = 3 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_1 +--source include/wsrep_wait_membership.inc + +--connection node_4 +--echo Server 4 left the cluster, killing it... 
+# Kill the connected server +--exec echo "wait" > $node_4_expect_file_name +--let KILL_NODE_PIDFILE = $node_4_pid_file +--perl + my $pid_filename = $ENV{'KILL_NODE_PIDFILE'}; + my $mysqld_pid = `cat $pid_filename`; + chomp($mysqld_pid); + system("kill -9 $mysqld_pid"); + exit(0); +EOF +--echo Killed server 4... +--source include/wait_until_disconnected.inc +--echo Restarting server 4... +--source include/start_mysqld.inc +--source include/galera_wait_ready.inc +DROP TABLE t2; + +--source galera_vote_joined_end.inc + +--connection node_4 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); +CALL mtr.add_suppression("Inconsistency detected: Inconsistent by consensus"); +CALL mtr.add_suppression("Failed to apply write set: gtid:"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_begin.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_begin.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_begin.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_begin.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,79 @@ +# This file purpose is to set up node 4 to require SST which is artificaially +# prolonged and as a result accumulate sufficient relication queue. +# The contents of the qeuee are controlled in the sourcing test files. + +--source include/galera_cluster.inc +--source include/big_test.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc + +# Make sure that the test is operating on the right version of galera library. 
+--let $galera_version=26.4.19 +source ../wsrep/include/check_galera_version.inc; + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--let $node_4=node_4 +--source ../include/auto_increment_offset_save.inc + +# create table t1 and procedure p1 to generate wirtesets +--connection node_1 +CREATE TABLE t1(pk INT AUTO_INCREMENT PRIMARY KEY); + +DELIMITER |; +CREATE PROCEDURE p1(IN max INT) +BEGIN + DECLARE i INT; + DECLARE CONTINUE HANDLER FOR SQLEXCEPTION BEGIN END; + + SET i = 0; + WHILE i < max DO + INSERT IGNORE INTO t1 VALUES (DEFAULT); + SET i = i + 1; + END WHILE; +END| +DELIMITER ;| + +# 130 events move the commit cut, it is essential in voting +CALL p1(130); + +--connection node_4 +--echo Shutting down server 4... +--let $node_4_server_id= `SELECT @@server_id` +--let $node_4_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_4_server_id.expect +--let $node_4_pid_file= `SELECT @@pid_file` +--source include/shutdown_mysqld.inc +# enforce SST +--exec rm -rf $MYSQLTEST_VARDIR/mysqld.4/data/grastate.dat + +# Wait for node #4 to leave cluster +--connection node_1 +--let $members = 3 +--source include/wsrep_wait_membership.inc + +# prepare to stop SST donor thread when node is in donor state +SET GLOBAL debug = "+d,sync.wsrep_donor_state"; + +--connection node_4 +--echo Restarting server 4... 
+# Need to use this form instead of start_mysqld.inc because the latter is blocking +--exec echo "restart:$start_mysqld_params" > $node_4_expect_file_name + +# Wait for node #1 to become a donor +--connection node_1 +SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_donor_state_reached"; +--echo Tables on server 1 flushed and locked for SST to server 4 +SET SESSION DEBUG_SYNC = "now SIGNAL signal.wsrep_donor_state"; +SET GLOBAL debug = ""; +SET DEBUG_SYNC='RESET'; + +--echo Wait for the state snapshot to be copied to server 4 +--source include/galera_wait_ready.inc +--echo SST script unlocked server 1 + +# +# At this point state snapshot has been copied, node 1 is operational and +# we have about 20 seconds while everything we do will go into the replication +# queue on node 4 which it will have to apply on top of the snapshot. +# diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_end.inc mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_end.inc --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_end.inc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_end.inc 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,33 @@ +# Confirm node #4 has rejoined +--connection node_1 +--let $members = 4 +--source include/wsrep_wait_membership.inc +#DROP TABLE IF EXISTS t2; + +# Confirm that all is good and all nodes have identical data + +--connection node_1 +SELECT count(*) AS expect1_390 FROM t1; + +#CALL mtr.add_suppression("Replica SQL: Could not execute Delete_rows"); +#CALL mtr.add_suppression("Event 3 Delete_rows apply failed: 120, seqno [0-9]+"); + +--connection node_2 +SELECT count(*) AS expect2_390 FROM t1; + +#CALL mtr.add_suppression("mysqld: Can't find record in 't1'"); +#CALL mtr.add_suppression("Replica SQL: Could not execute Delete_rows"); +#CALL mtr.add_suppression("Event 3 Delete_rows apply failed: 120, seqno seqno [0-9]+"); + +--connection node_3 +SELECT count(*) AS 
expect3_390 FROM t1; + +--connection node_4 +SELECT count(*) AS expect4_390 FROM t1; + +DROP TABLE t1; +DROP PROCEDURE p1; + +#CALL mtr.add_suppression("inconsistent with group"); + +--source ../include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,21 @@ +!include ../galera_4nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 + +[mysqld.1] +wsrep_node_name='node_1' + +[mysqld.2] +wsrep_node_name='node_2' + +[mysqld.3] +wsrep_node_name='node_3' + +[mysqld.4] +wsrep_node_name='node_4' +wsrep_sst_donor='node_1' + +[ENV] +galera_cluster_size=4 +MTR_SST_JOINER_DELAY=20 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_joined_skip.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_joined_skip.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,100 @@ +# +# Test a case where a vote happens in JOINED state after SST on a writeset +# that should be skipped. I.e. JOINED node should continue operation. +# + +--source galera_vote_joined_begin.inc +# +# At this point state snapshot has been copied, node 1 is operational and +# we have about 10 seconds while everything we do will go into the replication +# queue on node 4 which it will have to apply on top of the snapshot. 
+# + +# Increase replication queue on node_4 +--connection node_1 +CALL p1(130); + +# +# Create a writeset that node 4 won't be able to apply by making node 3 +# inconsisitent +# +--connection node_3 +--let $node_3_server_id= `SELECT @@server_id` +--let $node_3_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$node_3_server_id.expect +--let $node_3_pid_file= `SELECT @@pid_file` +SET SESSION wsrep_on = OFF; +CREATE TABLE t2(pk INT AUTO_INCREMENT PRIMARY KEY); +SET SESSION wsrep_on = ON; + +# This should cause nodes #1 and #2 to initiate a vote and kick node #3 +# out of the cluster, node #4 should recover the vote when fails to apply +# the event and continue +INSERT INTO t2 VALUES (DEFAULT); +SET SESSION wsrep_on = OFF; + +# make sure nodes 1,2 progress far enough for commit cut update +--connection node_1 +CALL p1(130); + +--let $members = 3 +--echo Waiting for server 3 to leave the cluster +--connection node_1 +--source include/wsrep_wait_membership.inc +--connection node_2 +--source include/wsrep_wait_membership.inc +--connection node_4 +# need to wait for extra SST delay on joiner +--sleep $MTR_SST_JOINER_DELAY +--sleep $MTR_SST_JOINER_DELAY +--enable_reconnect +--let $wait_timeout = 60 +--source include/wsrep_wait_membership.inc + +--connection node_3 +--echo Server 3 left the cluster, killing it... +# Kill the connected server +--exec echo "wait" > $node_3_expect_file_name +--let KILL_NODE_PIDFILE = $node_3_pid_file +--perl + my $pid_filename = $ENV{'KILL_NODE_PIDFILE'}; + my $mysqld_pid = `cat $pid_filename`; + chomp($mysqld_pid); + system("kill -9 $mysqld_pid"); + exit(0); +EOF +--echo Killed server 3. +--source include/wait_until_disconnected.inc +--echo Restarting server 3... 
+--exec echo "restart:$start_mysqld_params" > $node_3_expect_file_name + +--echo Waiting for server 3 to rejoin the cluster +--connection node_1 +--let $members = 3 +--source include/wsrep_wait_membership.inc + +--connection node_3 +--echo sleeping for $MTR_SST_JOINER_DELAY +# need to wait for extra SST delay on joiner +--sleep $MTR_SST_JOINER_DELAY +--sleep $MTR_SST_JOINER_DELAY +--echo Waiting ready +--enable_reconnect +--source include/galera_wait_ready.inc +--echo Server 3 restarted. + +--source galera_vote_joined_end.inc + +--connection node_1 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); + +--connection node_2 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); + +--connection node_3 +CALL mtr.add_suppression("Vote 0 \\(success\\) on .+ is inconsistent with group"); + +--connection node_4 +CALL mtr.add_suppression("BF applier thread=.+ failed to open_and_lock_tables for Table "); +CALL mtr.add_suppression("Event 3 Write_rows_v1 apply failed: 1146"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_vote_rejoin_ddl.test 2025-05-19 16:14:24.000000000 +0000 @@ -91,10 +91,6 @@ DROP TABLE t2; ---let $node_3=node_3 ---let $auto_increment_offset_node_3 = 3; ---let $node_4=node_4 ---let $auto_increment_offset_node_4 = 4; --source suite/galera/include/auto_increment_offset_restore.inc --disconnect node_3 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.cnf --- 
mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,14 +4,13 @@ loose-galera-wan=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=10M;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=10M;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gcache.size=10M;gmcast.segment=2' +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.4] -wsrep_provider_options='base_port=@mysqld.4.#galera_port;gcache.size=10M;gmcast.segment=3' - +wsrep_provider_options='gmcast.segment=3;repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,10 +10,10 @@ --source include/have_innodb.inc --source include/force_restart.inc -CALL 
mtr.add_suppression("WSREP: Stray state UUID msg:"); -CALL mtr.add_suppression("Sending JOIN failed: "); -CALL mtr.add_suppression("WSREP: .* sending install message failed: Socket is not connected"); -CALL mtr.add_suppression("There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); +CALL mtr.add_suppression("WSREP: Stray state UUID msg: "); +CALL mtr.add_suppression("WSREP: .*Sending JOIN failed: "); +CALL mtr.add_suppression("WSREP: .*sending install message failed: (Transport endpoint|Socket) is not connected"); +CALL mtr.add_suppression("WSREP: .*There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside"); --let $wait_condition = SELECT VARIABLE_VALUE = 4 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; --source include/wait_condition.inc @@ -42,8 +42,8 @@ DROP TABLE t1; --connection node_1 -call mtr.add_suppression("WSREP: read_completion_condition.*"); -call mtr.add_suppression("WSREP: read_handler.*"); +call mtr.add_suppression("WSREP: read_completion_condition"); +call mtr.add_suppression("WSREP: read_handler"); --disconnect node_3 --disconnect node_4 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_ist.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,14 +4,13 @@ loose-galera-wan-restart-ist=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] 
-wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gmcast.segment=2' +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.4] -wsrep_provider_options='base_port=@mysqld.4.#galera_port;gmcast.segment=2' - +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wan_restart_sst.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,14 +4,13 @@ loose-galera-wan-restart-sst=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.segment=1' +wsrep_provider_options='gmcast.segment=1;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' 
[mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gmcast.segment=2' +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.4] -wsrep_provider_options='base_port=@mysqld.4.#galera_port;gmcast.segment=2' - +wsrep_provider_options='gmcast.segment=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_log_conficts.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,8 +5,3 @@ [mysqld.2] wsrep_log_conflicts=ON - - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_mode.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_mode.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_mode.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_mode.test 2025-05-19 16:14:24.000000000 +0000 @@ -16,7 +16,6 @@ DROP TABLE t1; SET GLOBAL wsrep_mode = default; - # MDEV-25698 SIGSEGV in wsrep_should_replicate_ddl SET GLOBAL wsrep_mode = STRICT_REPLICATION; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_provider_options_syntax.test 2025-05-19 
16:14:24.000000000 +0000 @@ -3,10 +3,11 @@ # --source include/galera_cluster.inc --source include/have_innodb.inc + --let LOGF=$MYSQLTEST_VARDIR/log/mysqld.1.err --disable_info -call mtr.add_suppression("WSREP\: Unknown parameter 'gmcasts\\.segment'"); -call mtr.add_suppression("WSREP\: Set options returned 7"); +call mtr.add_suppression("WSREP: Unknown parameter 'gmcasts\\.segment'"); +call mtr.add_suppression("WSREP: Set options returned 7"); --error ER_WRONG_ARGUMENTS SET GLOBAL wsrep_provider_options="gmcasts.segment=1"; # Search for unhandled exception message. diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test --- mariadb-10.11.11/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/galera_wsrep_schema_detached.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,11 +6,22 @@ --source include/auto_increment_offset_save.inc --connection node_1 -call mtr.add_suppression("WSREP:.*"); + +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); + SET @wsrep_provider_options_orig = @@GLOBAL.wsrep_provider_options; SET GLOBAL wsrep_provider_options ='pc.ignore_sb=true;pc.weight=2'; --connection node_2 + +call mtr.add_suppression("WSREP: async IST sender failed to serve"); +call mtr.add_suppression("WSREP: Failed to establish connection: Connection refused"); +call mtr.add_suppression("WSREP: IST failed: IST sender, failed to connect"); +call mtr.add_suppression("WSREP: .*State transfer.* failed: Protocol error"); + SET @wsrep_cluster_address_orig = @@GLOBAL.wsrep_cluster_address; SET GLOBAL WSREP_ON=0; SELECT COUNT(*) AS 
EXPECT_0 FROM mysql.wsrep_streaming_log; @@ -22,6 +33,7 @@ SET GLOBAL wsrep_cluster_address = @wsrep_cluster_address_orig; SELECT 1; DELETE FROM mysql.wsrep_allowlist; + --connection node_2 --source include/kill_galera.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mdev-29775.test mariadb-10.11.13/mysql-test/suite/galera/t/mdev-29775.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mdev-29775.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mdev-29775.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,81 @@ +--source include/galera_cluster.inc +--source include/have_aria.inc + +# +# MDEV-29775 : Assertion `0' failed in void Protocol::end_statement() when adding data to the MyISAM table after setting wsrep_mode=replicate_myisam +# +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=MyISAM; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +SET GLOBAL wsrep_forced_binlog_format=ROW; +CREATE TABLE t (f0 CHAR(0)) ENGINE=Aria; +INSERT INTO t VALUES(); +SELECT * FROM t; +--connection node_2 +SELECT * FROM t; +DROP TABLE t; + +--connection node_1 +SET GLOBAL wsrep_mode=REPLICATE_MYISAM; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; + +SET GLOBAL wsrep_mode=REPLICATE_ARIA; +--error ER_WRONG_ARGUMENTS +SET GLOBAL 
wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; + +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_ARIA; + +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_ARIA; + +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL wsrep_mode=DEFAULT; +SET GLOBAL wsrep_forced_binlog_format=MIXED; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; +SET GLOBAL wsrep_forced_binlog_format=STATEMENT; +--error ER_WRONG_ARGUMENTS +SET GLOBAL wsrep_mode = REPLICATE_MYISAM; + +SET GLOBAL wsrep_forced_binlog_format=DEFAULT; +SET GLOBAL wsrep_mode=DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mdev-30653.test mariadb-10.11.13/mysql-test/suite/galera/t/mdev-30653.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mdev-30653.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mdev-30653.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_aria.inc create table t1 (id serial, val int) engine=innodb; @@ -6,7 +7,8 @@ insert into t1 values(1, 23); insert into t2 values(2, 42); -call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental. Storage engine Aria for table 'test'.'t2' is not supported in Galera"); + +call mtr.add_suppression("WSREP: Replication of non-transactional engines is experimental\\. 
Storage engine Aria for table 'test'\\.'t2' is not supported in Galera"); begin; update t1 set val=24 where id=1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mdev-31285.test mariadb-10.11.13/mysql-test/suite/galera/t/mdev-31285.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mdev-31285.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mdev-31285.test 2025-05-19 16:14:24.000000000 +0000 @@ -11,5 +11,3 @@ --connection node_2 --error ER_NO_SUCH_TABLE SHOW CREATE TABLE t; - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.cnf mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,4 @@ !include ../galera_2nodes.cnf -[mysqld.1] +[mysqld] log-bin -wsrep-debug=1 - -[mysqld.1] -log-bin -wsrep-debug=1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#198.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#198.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,5 @@ --source include/galera_cluster.inc --source include/have_innodb.inc ---source include/force_restart.inc CREATE TABLE t1 (id INT PRIMARY KEY) ENGINE=InnoDB; CREATE TABLE t2 (id INT PRIMARY KEY) ENGINE=InnoDB; @@ -21,8 +20,9 @@ --connection node_2 SET SESSION wsrep_sync_wait = 0; ---let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = 'Waiting for table metadata lock' ---source include/wait_condition.inc +--let $wait_condition = SELECT COUNT(*) BETWEEN 1 AND 2 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for table metadata lock%' OR STATE LIKE 'Waiting to execute in 
isolation%'; +--let $wait_condition_on_error_output = SELECT * FROM INFORMATION_SCHEMA.PROCESSLIST +--source include/wait_condition_with_debug_and_kill.inc --connection node_1 INSERT INTO t2 VALUES (1); @@ -38,3 +38,8 @@ DROP TABLE t1; DROP TABLE t2; + +--connection node_1 + +--disconnect node_2a +--disconnect node_2b diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#201.cnf mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#201.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#201.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#201.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,3 @@ [mysqld.2] query_cache_type=1 - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#247.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#247.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#247.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#247.test 2025-05-19 16:14:24.000000000 +0000 @@ -20,4 +20,3 @@ --sleep 1 DROP TABLE t1; SHOW VARIABLES LIKE 'wsrep_desync'; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#31.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#31.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#31.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#31.test 2025-05-19 16:14:24.000000000 +0000 @@ -49,5 +49,3 @@ --source include/auto_increment_offset_restore.inc --source include/galera_end.inc - - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#33.cnf mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#33.cnf --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#33.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#33.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,7 @@ !include ../galera_2nodes.cnf [mysqld.1] 
-wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.ignore_sb=true' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;pc.ignore_sb=true' - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#332.test mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#332.test --- mariadb-10.11.11/mysql-test/suite/galera/t/mysql-wsrep#332.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/mysql-wsrep#332.test 2025-05-19 16:14:24.000000000 +0000 @@ -216,4 +216,3 @@ DROP TABLE c; DROP TABLE p1; DROP TABLE p2; - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/rename.test mariadb-10.11.13/mysql-test/suite/galera/t/rename.test --- mariadb-10.11.11/mysql-test/suite/galera/t/rename.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/rename.test 2025-05-19 16:14:24.000000000 +0000 @@ -50,4 +50,3 @@ DROP TABLE t2; --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/view.test mariadb-10.11.13/mysql-test/suite/galera/t/view.test --- mariadb-10.11.11/mysql-test/suite/galera/t/view.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/view.test 2025-05-19 16:14:24.000000000 +0000 @@ -47,4 +47,3 @@ DROP TABLE t1; --echo # End of tests - diff -Nru mariadb-10.11.11/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test mariadb-10.11.13/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test --- 
mariadb-10.11.11/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera/t/wsrep_mode_strict_replication.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_aria.inc call mtr.add_suppression("WSREP: ALTER TABLE isolation failure"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/disabled.def mariadb-10.11.13/mysql-test/suite/galera_3nodes/disabled.def --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/disabled.def 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/disabled.def 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,3 @@ # Do not use any TAB characters for whitespace. # ############################################################################## - -galera_2_cluster : MDEV-32631 galera_2_cluster: before_rollback(): Assertion `0' failed -galera_nbo_master_phase_two_crash : MENT-2215 Test failure on galera_3nodes.galera_nbo_master_non_prim_failure diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_2x3nodes.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -24,6 +24,7 @@ #sst_port=@OPT.port wsrep_cluster_address=gcomm:// wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_3nodes.cnf 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_3nodes.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/galera_3nodes.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/galera_3nodes.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -19,10 +19,11 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address=gcomm:// -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.1.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.1.port wsrep_sst_receive_address='127.0.0.1:@mysqld.1.#sst_port' +wsrep_node_name=node1 [mysqld.2] wsrep-on=1 @@ -30,10 +31,11 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.2.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.2.port wsrep_sst_receive_address='127.0.0.1:@mysqld.2.#sst_port' +wsrep_node_name=node2 [mysqld.3] wsrep-on=1 @@ -41,10 +43,11 @@ #ist_port=@OPT.port #sst_port=@OPT.port wsrep_cluster_address='gcomm://127.0.0.1:@mysqld.1.#galera_port' 
-wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;gcache.size=10M' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_node_address='127.0.0.1:@mysqld.3.#galera_port' wsrep_node_incoming_address=127.0.0.1:@mysqld.3.port wsrep_sst_receive_address='127.0.0.1:@mysqld.3.#sst_port' +wsrep_node_name=node3 [sst] sst-log-archive-dir=@ENV.MYSQLTEST_VARDIR/log diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/MDEV-36360.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/MDEV-36360.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/MDEV-36360.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/MDEV-36360.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,61 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +CREATE TABLE parent ( +id INT PRIMARY KEY +) ENGINE=InnoDB; +CREATE TABLE child ( +id INT PRIMARY KEY, +parent_id INT, +KEY (parent_id), +CONSTRAINT FOREIGN KEY (parent_id) REFERENCES parent(id) +) ENGINE=InnoDB; +INSERT INTO parent VALUES (1), (2); +connection node_3; +SET SESSION wsrep_on = OFF; +DELETE FROM parent WHERE id = 1; +SET SESSION wsrep_on = ON; +Restarting server 3 with one applier thread having FK and UK checks disabled +SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_after_write_row'; +connection node_1; +INSERT INTO child VALUES (1, 1); +connection node_3; +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_after_write_row_reached'; +SET GLOBAL DEBUG_DBUG = ''; +SET wsrep_sync_wait = 0; +SET DEBUG_SYNC = 'ib_after_row_insert SIGNAL signal.wsrep_after_write_row'; +INSERT INTO child VALUES (2, 2); +SET DEBUG_SYNC = 'RESET'; +include/assert_grep.inc [no 
FK constraint failure] +Server 3 +SELECT COUNT(*) AS EXPECT_1 FROM parent; +EXPECT_1 +1 +SELECT COUNT(*) AS EXPECT_2 FROM child; +EXPECT_2 +2 +connection node_1; +Server 1 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +EXPECT_2 +2 +SELECT COUNT(*) AS EXPECT_2 FROM child; +EXPECT_2 +2 +connection node_2; +Server 2 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +EXPECT_2 +2 +SELECT COUNT(*) AS EXPECT_2 FROM child; +EXPECT_2 +2 +DROP TABLE child; +DROP TABLE parent; +disconnect node_2; +disconnect node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera-features#115.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera-features#115.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera-features#115.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera-features#115.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,41 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_2; +SET GLOBAL wsrep_on=OFF; +DROP SCHEMA test; +connection node_3; +SET GLOBAL wsrep_on=OFF; +CREATE TABLE t1 (f1 INTEGER); +connection node_1; +CREATE TABLE t1 (f1 INTEGER); +connection node_1; +SET SESSION wsrep_sync_wait=0; +connection node_2; +SET SESSION wsrep_sync_wait=0; +connection node_3; +SET SESSION wsrep_sync_wait=0; +connection node_1; +SET GLOBAL wsrep_provider_options='pc.bootstrap=YES'; +connection node_2; +disconnect node_2; +connect node_2, 127.0.0.1, root, , mysql, $NODE_MYPORT_2; +# restart +connection node_3; +# restart +connection node_1; +DROP TABLE test.t1; +connection node_2; +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1049"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, 
leaving cluster\\.\\.\\."); +connection node_3; +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1050"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_2_cluster.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,9 @@ connection node_2; connection node_1; +connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6; connect node_5, 127.0.0.1, root, , test, $NODE_MYPORT_5; connect node_4, 127.0.0.1, root, , test, $NODE_MYPORT_4; connection node_4; -CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_1, master_use_gtid=current_pos;; START SLAVE; include/wait_for_slave_to_start.inc connection node_1; @@ -21,7 +21,6 @@ SELECT COUNT(*) = 1 FROM t1; COUNT(*) = 1 1 -connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6; connection node_6; SELECT COUNT(*) = 1 FROM t1; COUNT(*) = 1 @@ -51,18 +50,30 @@ COUNT(*) = 3 1 connection node_2; +connection node_1; +connection node_3; +connection node_4; +connection node_5; +connection node_6; +connection node_2; OPTIMIZE TABLE t1; Table Op Msg_type Msg_text test.t1 optimize note Table does not support optimize, doing recreate + analyze instead test.t1 optimize status OK +Warnings: +Note 1592 Unsafe statement written to the binary log using statement format since BINLOG_FORMAT = STATEMENT. 
Statement is unsafe because it uses a system variable that may have a different value on the slave connection node_1; +connection node_3; connection node_4; +connection node_5; connection node_6; connection node_1; DROP TABLE t1; connection node_4; STOP SLAVE; RESET SLAVE; +Warnings: +Note 4190 RESET SLAVE is implicitly changing the value of 'Using_Gtid' from 'Current_Pos' to 'Slave_Pos' SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; @@ -75,19 +86,33 @@ SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); connection node_3; SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); connection node_5; SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); connection node_6; SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +connection node_1; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_2; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_3; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_4; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_5; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using 
statement format since "); +connection node_6; CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,8 +2,6 @@ connection node_1; connection node_1; connection node_2; -connection node_1; -connection node_2; connection node_3; Killing node #3 to free ports for garbd ... connection node_3; @@ -26,8 +24,8 @@ Restarting node #3 to satisfy MTR's end-of-test checks connection node_3; connection node_1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_2; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_3; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_garbd_backup.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,6 @@ connection node_2; connection node_1; connection node_1; -connection node_1; connection node_2; connection node_3; connection node_1; @@ -12,7 +11,6 @@ CREATE TABLE ten (f1 INTEGER) ENGINE=InnoDB; INSERT INTO ten VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10); INSERT INTO t1 (f2) SELECT REPEAT('x', 1024) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4; -connection node_2; Killing node #3 to free ports for garbd ... connection node_3; connection node_1; @@ -34,8 +32,8 @@ connection node_3; connection node_1; connection node_1; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_2; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); connection node_3; -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_gtid_2_cluster.result 2025-05-19 16:14:24.000000000 +0000 @@ -35,7 +35,7 @@ Variable_name Value wsrep_cluster_size 3 connection node_1; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_4, master_use_gtid=current_pos, ignore_server_ids=(12,13);; +--- ignore_server_ids=(12,13) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -45,7 +45,7 @@ @@gtid_slave_pos connection node_4; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_1, master_use_gtid=current_pos, ignore_server_ids=(22,23);; +--- ignore_server_ids=(22,23) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -262,7 +262,7 @@ reset master; set global wsrep_on=ON; connection node_1; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_6, master_use_gtid=current_pos, ignore_server_ids=(12,13);; +--- ignore_server_ids=(12,13) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -272,7 +272,7 @@ @@gtid_slave_pos connection node_4; -change master to master_host='127.0.0.1', master_user='root', master_port=NODE_MYPORT_3, master_use_gtid=current_pos, ignore_server_ids=(22,23);; +--- ignore_server_ids=(22,23) start slave; include/wait_for_slave_to_start.inc select @@gtid_binlog_state; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result 2025-01-30 11:01:23.000000000 +0000 
+++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_A.result 2025-05-19 16:14:24.000000000 +0000 @@ -77,8 +77,8 @@ SET GLOBAL wsrep_provider_options = 'dbug='; connection node_1; DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_2; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_3; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_B.result 2025-05-19 16:14:24.000000000 +0000 @@ -87,11 +87,11 @@ SET GLOBAL wsrep_provider_options = 'dbug='; connection node_1; DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_2; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_3; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer 
required\\."); disconnect node_1a; disconnect node_3; disconnect node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_join_with_cc_C.result 2025-05-19 16:14:24.000000000 +0000 @@ -94,9 +94,9 @@ SET GLOBAL wsrep_provider_options = 'signal=after_shift_to_joining'; connection node_1; DROP TABLE t1; -call mtr.add_suppression("WSREP: Send action {(.*), STATE_REQUEST} returned -107 \\(Transport endpoint is not connected\\)"); -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Send action {.* STATE_REQUEST} returned -107 \\((Transport endpoint|Socket) is not connected\\)"); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_2; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); connection node_3; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_parallel_apply_3nodes.result 2025-05-19 16:14:24.000000000 +0000 @@ -26,7 +26,7 @@ SELECT f1 = 111 FROM t1; f1 = 111 1 -SELECT 
COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE LIKE '%committed%'; +SELECT COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); COUNT(*) IN (1, 2) 1 SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_pc_weight.result 2025-05-19 16:14:24.000000000 +0000 @@ -157,10 +157,10 @@ CALL mtr.add_suppression('WSREP: gcs_caused\\(\\) returned -1'); connection node_2; CALL mtr.add_suppression('SYNC message from member'); -CALL mtr.add_suppression('user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('WSREP: user message in state LEAVING'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); connection node_3; CALL mtr.add_suppression('WSREP: user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result --- 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_safe_to_bootstrap.result 2025-05-19 16:14:24.000000000 +0000 @@ -47,7 +47,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); connection node_3; CALL mtr.add_suppression("WSREP: no nodes coming from prim view, prim not possible"); CALL mtr.add_suppression("WSREP: It may not be safe to bootstrap the cluster from this node"); @@ -61,7 +61,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); SHOW CREATE TABLE t1; Table Create Table t1 CREATE TABLE `t1` ( diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_sst_donor_non_prim.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,26 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_3; +connection node_2; +connection node_1; +SET GLOBAL debug_dbug = '+d,sync.wsrep_sst_donor_after_donation'; +connection 
node_2; +# restart +connection node_1; +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached'; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +SET DEBUG_SYNC = 'now SIGNAL signal.wsrep_sst_donor_after_donation_continue'; +SET DEBUG_SYNC = 'RESET'; +SET GLOBAL debug_dbug = ''; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +SET SESSION wsrep_sync_wait=15; +connection node_1; +connection node_2; +connection node_3; +connection node_1; +connection node_1; +CALL mtr.add_suppression("WSREP: sst sent called when not SST donor, state CONNECTED"); +CALL mtr.add_suppression("WSREP: .* returned an error: Not connected to Primary Component"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/r/galera_vote_rejoin_mysqldump.result 2025-05-19 16:14:24.000000000 +0000 @@ -57,7 +57,6 @@ t1 CREATE TABLE `t1` ( `f1` int(11) DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci -CALL mtr.add_suppression("is inconsistent with group"); connection node_3; SHOW CREATE TABLE t1; Table Create Table @@ -80,4 +79,5 @@ CALL mtr.add_suppression("Table 'mysql\\.gtid_slave_pos' doesn't exist"); connection node_2; # restart +CALL mtr.add_suppression("WSREP: .+ is inconsistent with group"); connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/suite.pm mariadb-10.11.13/mysql-test/suite/galera_3nodes/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/suite.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -9,69 +9,71 @@ push @::global_suppressions, ( - qr(WSREP: 
wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), - qr(WSREP: last inactive check more than .* skipping check), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: last inactive check more than .+ skipping check), + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. 
Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Maximum writeset size exceeded by .*), - qr(WSREP: transaction size exceeded.*), - qr(WSREP: RBR event .*), - qr(WSREP: Ignoring error for TO isolated action: .*), - qr(WSREP: transaction size limit .*), - qr(WSREP: rbr write fail, .*), - qr(WSREP: .*Backend not supported: foo.*), - qr(WSREP: .*Failed to initialize backend using .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: Maximum writeset size exceeded by ), + qr(WSREP: transaction size exceeded), + qr(WSREP: RBR event ), + qr(WSREP: Ignoring error for TO isolated action: ), + qr(WSREP: transaction size limit ), + qr(WSREP: rbr write fail, ), + qr(WSREP: .*Backend not supported: foo), + qr(WSREP: .*Failed to initialize backend using ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Socket type not supported), qr(WSREP: failed to open gcomm backend connection: 110: failed to reach primary view: 110 .*), - qr(WSREP: .*Failed to open backend connection: -110 .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: .*Failed to open backend connection: 
-110 ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Connection timed out), qr|WSREP: wsrep::connect\(.*\) failed: 7|, - qr(WSREP: SYNC message from member .* in non-primary configuration. Ignored.), + qr(WSREP: SYNC message from member .+ ?in non-primary configuration\. Ignored\.), qr(WSREP: Could not find peer:), - qr(WSREP: TO isolation failed for: .*), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, - qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled. Expect abort.|, + qr(WSREP: TO isolation failed for: ), + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, + qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled\. Expect abort\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr(WSREP: discarding established .*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr|WSREP: JOIN message from member .* in non-primary configuration. Ignored.|, - qr|Query apply failed:*|, - qr(WSREP: Ignoring error*), - qr(WSREP: Failed to remove page file .*), - qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to .*), - qr|WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). 
Will retry in new primary component.|, + qr(WSREP: discarding established ), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr|WSREP: .*Query apply failed:|, + qr(WSREP: Ignoring error), + qr(WSREP: Failed to remove page file ), + qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to ), + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, qr|WSREP: Trying to continue unpaused monitor|, qr|WSREP: Wait for gtid returned error 3 while waiting for prior transactions to commit before setting position|, + qr|WSREP: Failed to report last committed|, ); sub which($) { return `sh -c "command -v $_[0]"` } sub skip_combinations { my %skip = (); - $skip{'include/have_mariabackup.inc'} = 'Need ss' + $skip{'include/have_mariabackup.inc'} = 'Need socket statistics utility' unless which("lsof") || which("sockstat") || which("ss"); %skip; } diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GAL-501.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GAL-501.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GAL-501.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GAL-501.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// wsrep_node_address=[::1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_incoming_address='[::1]:@mysqld.1.port' bind-address=:: @@ -14,7 +14,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' wsrep_node_address=[::1] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_incoming_address='[::1]:@mysqld.2.port' bind-address=:: @@ -22,7 +22,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' wsrep_node_address=[::1] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_incoming_address='[::1]:@mysqld.3.port' bind-address=:: diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GCF-354.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GCF-354.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/GCF-354.cnf 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/GCF-354.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,10 +7,10 @@ wsrep-debug=1 [mysqld.1] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1G;pc.weight=4' +wsrep_provider_options='repl.causal_read_timeout=PT90S;pc.weight=4;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=256M' [mysqld.2] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1G' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=256M' [mysqld.3] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1G' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=256M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/MDEV-36360.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/MDEV-36360.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/MDEV-36360.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/MDEV-36360.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,110 @@ +# +# MDEV-36360: Don't grab table-level X locks for applied inserts. 
+# +# It prevents a debug crash in wsrep_report_error() which happened when appliers would run +# with FK and UK checks disabled and erroneously execute plain inserts as bulk inserts. +# +# Moreover, in release builds such a behavior could lead to deadlocks between two applier +# threads if a thread waiting for a table-level lock was ordered before the lock holder. +# In that case the lock holder would proceed to commit order and wait forever for the +# now-blocked other applier thread to commit before. +# + +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/have_debug.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +# Create parent and child tables. +--connection node_1 +CREATE TABLE parent ( + id INT PRIMARY KEY +) ENGINE=InnoDB; + +CREATE TABLE child ( + id INT PRIMARY KEY, + parent_id INT, + KEY (parent_id), + CONSTRAINT FOREIGN KEY (parent_id) REFERENCES parent(id) +) ENGINE=InnoDB; + +# Fill the parent table with rows that will later be used by the child. +INSERT INTO parent VALUES (1), (2); + +# Wait until the rows are replicated on node #3. +--connection node_3 +--let $wait_condition = SELECT COUNT(*) = 2 FROM parent +--source include/wait_condition.inc + +# Delete one row from the parent table on node #3 and rejoin the cluster. +SET SESSION wsrep_on = OFF; +DELETE FROM parent WHERE id = 1; +SET SESSION wsrep_on = ON; +--echo Restarting server 3 with one applier thread having FK and UK checks disabled +--source include/shutdown_mysqld.inc +--let $start_mysqld_params = --wsrep_slave_FK_checks=0 --wsrep_slave_UK_checks=0 +--source ../galera/include/start_mysqld.inc + +# Stop the applier after writing a row into the child table. 
+SET GLOBAL DEBUG_DBUG = 'd,sync.wsrep_after_write_row'; + +# Insert a child row that will be applied on node #3, but should not +# grab table-level X-lock. +--connection node_1 +INSERT INTO child VALUES (1, 1); + +--connection node_3 +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_after_write_row_reached'; +# Now that the applier has hit the global sync point wait, reset it +# so that the upcoming insert avoids it. +SET GLOBAL DEBUG_DBUG = ''; +# Don't wait for applied insert to commit. +SET wsrep_sync_wait = 0; +SET DEBUG_SYNC = 'ib_after_row_insert SIGNAL signal.wsrep_after_write_row'; +# The insert should pass the sync point, as otherwise if the applied insert +# grabs table-level X-lock, they'll both deadlock forever. +INSERT INTO child VALUES (2, 2); +SET DEBUG_SYNC = 'RESET'; + +--let $assert_select = foreign key constraint fails +--let $assert_count = 0 +--let $assert_text = no FK constraint failure +--let $assert_only_after = CURRENT_TEST +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.3.err +--source include/assert_grep.inc + +# Child row insert is applied even though there's no parent row. +--echo Server 3 +SELECT COUNT(*) AS EXPECT_1 FROM parent; +SELECT COUNT(*) AS EXPECT_2 FROM child; + +# Check other nodes have both parent and child rows. +--connection node_1 +--echo Server 1 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +SELECT COUNT(*) AS EXPECT_2 FROM child; + +--connection node_2 +--echo Server 2 +SET wsrep_sync_wait = 15; +SELECT COUNT(*) AS EXPECT_2 FROM parent; +SELECT COUNT(*) AS EXPECT_2 FROM child; + +DROP TABLE child; +DROP TABLE parent; + +# Restore original auto_increment_offset values. 
+--source ../galera/include/auto_increment_offset_restore.inc + +--source include/galera_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,4 @@ +!include ../galera_3nodes.cnf + +[mysqld] +wsrep-ignore-apply-errors=0 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#115.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#115.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,89 @@ +# +# This test tests that one successful node wins over two nodes that fail for +# different reasons +# +--source include/galera_cluster.inc +--source include/have_innodb.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source suite/galera/include/auto_increment_offset_save.inc + +# create inconsistency on node 2 +--connection node_2 +SET GLOBAL wsrep_on=OFF; +DROP SCHEMA test; + +# create inconsistency on node 3 +--connection node_3 +SET GLOBAL wsrep_on=OFF; +CREATE TABLE t1 (f1 INTEGER); + +--connection node_1 +CREATE TABLE t1 (f1 INTEGER); + +# check that nodes 2 and 3 leave the cluster, and node_1 is Primary by itself + +--connection node_1 +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +--source include/wait_condition.inc + +--connection node_2 +SET 
SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Disconnected' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +--connection node_3 +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Disconnected' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +--connection node_1 +# this is a workaround for "sending install message failed" BUG: +# https://github.com/codership/galera/issues/174 +# When it happens, node_1 becomes non-prim +SET GLOBAL wsrep_provider_options='pc.bootstrap=YES'; +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +# restart nodes 2 and 3, since they failed + +--connection node_2 +# need to reinitialize connection due to a "Bad handshake" bug. +# we reconnect using the 'mysql' database as 'test' was dropped. 
+--disconnect node_2 +--connect node_2, 127.0.0.1, root, , mysql, $NODE_MYPORT_2 + --source include/restart_mysqld.inc + +--connection node_3 + --source include/restart_mysqld.inc + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +--source include/wait_condition.inc +--let $wait_condition = SELECT VARIABLE_VALUE = 'Primary' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_status'; +--source include/wait_condition.inc + +DROP TABLE test.t1; + +--source suite/galera/include/auto_increment_offset_restore.inc + +--connection node_2 +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1049"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); + +--connection node_3 +CALL mtr.add_suppression("Inconsistent by consensus\\."); +CALL mtr.add_suppression("Error_code: 1050"); +CALL mtr.add_suppression("WSREP: Failed to apply trx: source: "); +CALL mtr.add_suppression("WSREP: Failed to apply app buffer"); +CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#119.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#119.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera-features#119.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera-features#119.test 2025-05-19 16:14:24.000000000 +0000 @@ -66,6 +66,5 @@ CALL mtr.add_suppression("WSREP: Node consistency compromized, leaving cluster\\.\\.\\."); CALL mtr.add_suppression("WSREP: Failed to apply write set: "); - # Restore original auto_increment_offset values. 
--source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,25 +1,34 @@ !include ../galera_2x3nodes.cnf +[mysqld] +wsrep-debug=1 + [mysqld.1] wsrep_gtid_domain_id=1 server-id=11 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M;pc.weight=2' [mysqld.2] wsrep_gtid_domain_id=1 server-id=12 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.3] wsrep_gtid_domain_id=1 server-id=13 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.4] wsrep_gtid_domain_id=2 server-id=21 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.4.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.5] wsrep_gtid_domain_id=2 server-id=22 +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.5.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' [mysqld.6] wsrep_gtid_domain_id=2 server-id=23 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.6.#galera_port;evs.suspect_timeout=PT20S;evs.inactive_timeout=PT30S;evs.install_timeout=PT25S;pc.wait_prim_timeout=PT60S;gcache.size=128M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,5 @@ +[binlogon] +log-bin +log-slave-updates + +[binlogoff] diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_2_cluster.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,14 +9,17 @@ --source include/big_test.inc --source include/galera_cluster.inc --source include/have_innodb.inc +--source include/force_restart.inc +--connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6 --connect node_5, 127.0.0.1, root, , test, $NODE_MYPORT_5 - --connect node_4, 127.0.0.1, root, , test, $NODE_MYPORT_4 + --connection node_4 ---replace_result $NODE_MYPORT_1 NODE_MYPORT_1 +--disable_query_log --eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_1, master_use_gtid=current_pos; +--enable_query_log START SLAVE; --source include/wait_for_slave_to_start.inc @@ -42,7 +45,6 @@ SELECT COUNT(*) = 1 FROM t1; ---connect node_6, 127.0.0.1, root, , test, $NODE_MYPORT_6 --connection node_6 SELECT COUNT(*) = 1 FROM t1; @@ -81,23 +83,46 @@ # --connection node_2 +--let $wsrep_last_committed_before_2 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS 
WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_1 +--let $wsrep_last_committed_before_1 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_3 +--let $wsrep_last_committed_before_3 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_4 +--let $wsrep_last_committed_before_4 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` ---let $wsrep_last_committed_before = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` +--connection node_5 +--let $wsrep_last_committed_before_5 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_6 +--let $wsrep_last_committed_before_6 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'` + +--connection node_2 OPTIMIZE TABLE t1; +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_2 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--source include/wait_condition.inc --connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_1 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--source include/wait_condition.inc ---let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--connection node_3 +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_3 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' --source include/wait_condition.inc --connection node_4 +--let $wait_condition = SELECT VARIABLE_VALUE >= 
$wsrep_last_committed_before_4 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--source include/wait_condition.inc ---let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--connection node_5 +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_5 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' --source include/wait_condition.inc --connection node_6 - ---let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' +--let $wait_condition = SELECT VARIABLE_VALUE >= $wsrep_last_committed_before_6 + 1 FROM INFORMATION_SCHEMA.SESSION_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed' --source include/wait_condition.inc # @@ -115,6 +140,7 @@ SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +--source include/wait_until_ready.inc SET GLOBAL GTID_SLAVE_POS=""; --connection node_1 @@ -122,35 +148,56 @@ SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +--source include/wait_until_ready.inc --connection node_2 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; - -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +--source include/wait_until_ready.inc --connection node_3 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; - -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +--source include/wait_until_ready.inc --connection node_5 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; - -CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +--source include/wait_until_ready.inc --connection node_6 SET GLOBAL wsrep_on = OFF; RESET MASTER; SET GLOBAL wsrep_on = ON; +--source include/wait_until_ready.inc + +connection 
node_1; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_2; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_3; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_4; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); + +connection node_5; +CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); +connection node_6; CALL mtr.add_suppression("Ignoring server id .* for non bootstrap node"); +CALL mtr.add_suppression("Unsafe statement written to the binary log using statement format since "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_allowlist.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -7,7 +7,7 @@ wsrep_allowlist="127.0.0.1,127.0.0.2,127.0.0.3" [mysqld.2] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=127.0.0.2;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=127.0.0.2;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' # Variable is only used on bootstrap node, so this will be ignored wsrep_allowlist="127.0.0.1,127.0.0.2,127.0.0.3,127.0.0.4,127.0.0.5" @@ -18,9 +18,9 @@ wsrep_sst_receive_address='127.0.0.2:@mysqld.2.#sst_port' [mysqld.3] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=127.0.0.3;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_quorum=TRUE;pc.wait_prim=FALSE' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=127.0.0.3;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_quorum=TRUE;pc.wait_prim=FALSE;gcache.size=10M' wsrep_node_address=127.0.0.3 wsrep_sst_receive_address=127.0.0.3:@mysqld.3.#sst_port wsrep_node_incoming_address=127.0.0.3:@mysqld.3.port -wsrep_sst_receive_address='127.0.0.3:@mysqld.3.#sst_port' \ No newline at end of file +wsrep_sst_receive_address='127.0.0.3:@mysqld.3.#sst_port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_certification_ccc.test 2025-05-19 16:14:24.000000000 +0000 @@ -50,4 +50,3 @@ --source ../galera/include/auto_increment_offset_restore.inc --source include/galera_end.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test --- 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_duplicate_primary_value.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc --source include/have_debug.inc --source include/have_debug_sync.inc --source include/big_test.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,12 +5,12 @@ [mysqld.1] wsrep_node_name='node.1' -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] wsrep_node_name='node.2' -wsrep_provider_options='base_port=@mysqld.2.#galera_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] wsrep_node_name='node.3' 
-wsrep_provider_options='base_port=@mysqld.3.#galera_port;socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true' +wsrep_provider_options='socket.ssl=yes;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/galera-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/galera-key.pem;socket.dynamic=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_dynamic_protocol.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,6 @@ --let $node_3 = node_3 --source ../galera/include/auto_increment_offset_save.inc - --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 'Synced' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_state_comment'; --source include/wait_condition.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_evs_suspect_timeout.test 2025-05-19 16:14:24.000000000 +0000 @@ -87,5 +87,6 @@ --source include/wait_condition.inc DROP TABLE t1; + # Restore original auto_increment_offset values. 
--source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,14 +9,9 @@ --source include/big_test.inc # Save galera ports ---connection node_1 --source suite/galera/include/galera_base_port.inc --let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT ---connection node_2 ---source suite/galera/include/galera_base_port.inc ---let $NODE_GALERAPORT_2 = $_NODE_GALERAPORT - --let $galera_connection_name = node_3 --let $galera_server_number = 3 --source include/galera_connect.inc @@ -81,10 +76,10 @@ # Workaround for galera#101 --connection node_1 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_2 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_3 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. 
Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_garbd_backup.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,11 +10,9 @@ --source include/have_debug.inc --source include/have_debug_sync.inc ---connection node_1 -# Save original auto_increment_offset values. ---let $node_1=node_1 ---let $node_2=node_2 ---let $node_3=node_3 +# Save galera ports +--source suite/galera/include/galera_base_port.inc +--let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT --let $galera_connection_name = node_3 --let $galera_server_number = 3 @@ -22,12 +20,13 @@ --source suite/galera/include/galera_base_port.inc --let $NODE_GALERAPORT_3 = $_NODE_GALERAPORT +# Save original auto_increment_offset values. +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 --source ../galera/include/auto_increment_offset_save.inc -# Save galera ports --connection node_1 ---source suite/galera/include/galera_base_port.inc ---let $NODE_GALERAPORT_1 = $_NODE_GALERAPORT --let $datadir= `SELECT @@datadir` --let $innodb_max_dirty_pages_pct = `SELECT @@innodb_max_dirty_pages_pct` @@ -41,10 +40,6 @@ CREATE TABLE ten (f1 INTEGER) ENGINE=InnoDB; INSERT INTO ten VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10); INSERT INTO t1 (f2) SELECT REPEAT('x', 1024) FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4; - ---connection node_2 ---source suite/galera/include/galera_base_port.inc ---let $NODE_GALERAPORT_2 = $_NODE_GALERAPORT --echo Killing node #3 to free ports for garbd ... --connection node_3 @@ -124,13 +119,16 @@ --eval SET GLOBAL innodb_max_dirty_pages_pct_lwm = $innodb_max_dirty_pages_pct_lwm --enable_query_log +# Restore original auto_increment_offset values. 
--source ../galera/include/auto_increment_offset_restore.inc +# Workaround for galera#101 + --connection node_1 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_2 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); --connection node_3 -CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0 (.*) is not in state transfer \\(SYNCED\\)"); +CALL mtr.add_suppression("WSREP: Protocol violation\\. JOIN message sender 1\\.0( \\(.*\\))? is not in state transfer \\(SYNCED\\)\\. Message ignored\\."); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,7 @@ # following tests such as galera_3nodes.galera_var_dirty_reads2 !include ../galera_2x3nodes.cnf + [mysqld.1] wsrep_gtid_domain_id=1 server-id=11 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_gtid_2_cluster.test 2025-05-19 16:14:24.000000000 +0000 @@ -42,8 +42,10 @@ SHOW STATUS LIKE 
'wsrep_cluster_size'; #--disable_parsing --connection node_1 ---replace_result $NODE_MYPORT_4 NODE_MYPORT_4 +--echo --- ignore_server_ids=(12,13) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_4, master_use_gtid=current_pos, ignore_server_ids=(12,13); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -51,8 +53,10 @@ #--query_vertical SHOW SLAVE STATUS; --connection node_4 ---replace_result $NODE_MYPORT_1 NODE_MYPORT_1 +--echo --- ignore_server_ids=(22,23) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_1, master_use_gtid=current_pos, ignore_server_ids=(22,23); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -73,6 +77,8 @@ --echo cluster 2 node 1 --connection node_4 +--let $wait_condition = SELECT COUNT(*) = 1 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 21, 1); select @@gtid_binlog_state; @@ -81,11 +87,16 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 1 node 2 --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc + select @@gtid_binlog_state; insert into t1 values (1, 12, 3); select @@gtid_binlog_state; @@ -95,10 +106,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 1 node 3 --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select 
@@gtid_binlog_state; insert into t1 values (1, 13, 4); select @@gtid_binlog_state; @@ -108,10 +123,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 2 --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 22, 2); select @@gtid_binlog_state; @@ -121,37 +140,55 @@ --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 3 --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 23, 3); select @@gtid_binlog_state; --echo #wait for sync cluster 2 and 1 --connection node_4 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo # check other nodes are consistent --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 
order by 1, 2, 3; --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --echo cluster 1 node 1 --connection node_1 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; drop table t1; stop slave; @@ -210,8 +247,10 @@ # Then we will kill node D and set up the replication between A and E # To see whether fail over works or not. --connection node_1 ---replace_result $NODE_MYPORT_6 NODE_MYPORT_6 +--echo --- ignore_server_ids=(12,13) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_6, master_use_gtid=current_pos, ignore_server_ids=(12,13); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -219,8 +258,10 @@ #--query_vertical SHOW SLAVE STATUS; --connection node_4 ---replace_result $NODE_MYPORT_3 NODE_MYPORT_3 +--echo --- ignore_server_ids=(22,23) +--disable_query_log --eval change master to master_host='127.0.0.1', master_user='root', master_port=$NODE_MYPORT_3, master_use_gtid=current_pos, ignore_server_ids=(22,23); +--enable_query_log start slave; --source include/wait_for_slave_to_start.inc select @@gtid_binlog_state; @@ -242,6 +283,8 @@ --sleep 2 --echo cluster 2 node 1 --connection node_4 +--let $wait_condition = SELECT COUNT(*) = 1 FROM test.t1; +--source include/wait_condition.inc insert into t1 values (2, 21, 1); select @@gtid_binlog_state; @@ -250,11 +293,16 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc + select * from t1 order by 1, 2, 3; --echo cluster 1 node 2 --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 2 FROM test.t1; +--source include/wait_condition.inc select 
@@gtid_binlog_state; insert into t1 values (1, 12, 3); select @@gtid_binlog_state; @@ -264,10 +312,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 1 node 3 --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 3 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (1, 13, 4); select @@gtid_binlog_state; @@ -277,10 +329,14 @@ --source include/save_master_gtid.inc --connection node_4 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 2 --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 4 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 22, 2); select @@gtid_binlog_state; @@ -290,10 +346,14 @@ --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo cluster 2 node 3 --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 5 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; insert into t1 values (2, 23, 3); select @@gtid_binlog_state; @@ -303,24 +363,36 @@ --source include/save_master_gtid.inc --connection node_1 --source include/sync_with_master_gtid.inc +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select * from t1 order by 1, 2, 3; --echo # check other nodes are consistent --connection node_2 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order 
by 1, 2, 3; --connection node_3 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_5 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --connection node_6 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; select * from t1 order by 1, 2, 3; --echo cluster 1 node 1 --connection node_1 +--let $wait_condition = SELECT COUNT(*) = 6 FROM test.t1; +--source include/wait_condition.inc select @@gtid_binlog_state; drop table t1; stop slave; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -6,7 +6,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -15,7 +15,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' 
-wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -25,7 +25,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mariabackup_section.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -9,7 +9,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -18,7 +18,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -28,7 +28,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf 
2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -13,7 +13,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -21,7 +21,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' 
+wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -20,7 +20,6 @@ CREATE USER 'sst'; GRANT ALL PRIVILEGES ON *.* TO 'sst'; ---let $wsrep_sst_auth_orig = `SELECT @@wsrep_sst_auth` SET GLOBAL wsrep_sst_auth = 'sst:'; --connection node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// -wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 
wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -13,7 +13,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -21,7 +21,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ipv6_rsync_section.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -10,7 +10,7 @@ [mysqld.1] wsrep-cluster-address=gcomm:// 
-wsrep_provider_options='base_host=[::1];base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1]:@mysqld.1.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.1.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.1.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.1.port' @@ -18,7 +18,7 @@ [mysqld.2] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1]:@mysqld.2.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.2.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.2.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.2.port' @@ -26,7 +26,7 @@ [mysqld.3] wsrep_cluster_address='gcomm://[::1]:@mysqld.1.#galera_port' -wsrep_provider_options='base_host=[::1];base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1]:@mysqld.3.#ist_port' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;gmcast.listen_addr=tcp://[::]:@mysqld.3.#galera_port;ist.recv_addr=[::1];evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' wsrep_sst_receive_address='[::1]:@mysqld.3.#sst_port' wsrep_node_address=::1 wsrep_node_incoming_address='[::1]:@mysqld.3.port' diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ist_gcache_rollover.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ !include ../galera_3nodes.cnf [mysqld.1] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_sb=true;gcache.size=1M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1M' auto_increment_increment=1 auto_increment_offset=1 # this will force server restarts before this test @@ -9,14 +9,14 @@ wsrep-debug=1 [mysqld.2] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_sb=true;gcache.size=1M' +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1M' auto_increment_increment=2 auto_increment_offset=2 loose-galera-ist-gcache-rollover=2 wsrep-debug=1 [mysqld.3] -wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.ignore_sb=true;gcache.size=1M' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=1M' auto_increment_increment=3 auto_increment_offset=3 loose-galera-ist-gcache-rollover=3 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_A.test 2025-05-19 16:14:24.000000000 +0000 @@ -259,12 +259,12 @@ DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_2 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_3 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_B.test 2025-05-19 16:14:24.000000000 +0000 @@ -270,13 +270,13 @@ DROP TABLE t1; -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call 
mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_2 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_3 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --disconnect node_1a diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_join_with_cc_C.test 2025-05-19 16:14:24.000000000 +0000 @@ -295,13 +295,13 @@ DROP TABLE t1; -call mtr.add_suppression("WSREP: Send action {(.*), STATE_REQUEST} returned -107 \\(Transport endpoint is not connected\\)"); -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Send action {.* STATE_REQUEST} returned -107 \\((Transport endpoint|Socket) is not connected\\)"); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_2 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --connection node_3 -call mtr.add_suppression("WSREP: Rejecting JOIN message from (.*): new State Transfer required\\."); +call mtr.add_suppression("WSREP: Rejecting JOIN message from .+: new State Transfer required\\."); --source ../galera/include/auto_increment_offset_restore.inc diff -Nru 
mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_parallel_apply_3nodes.test 2025-05-19 16:14:24.000000000 +0000 @@ -65,7 +65,7 @@ --connection node_3 SELECT f1 = 111 FROM t1; -SELECT COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND STATE LIKE '%committed%'; +SELECT COUNT(*) IN (1, 2) FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'system user' AND (STATE LIKE '%committed%' OR STATE LIKE 'Waiting for certification'); SET GLOBAL wsrep_slave_threads = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_bootstrap.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,7 +17,6 @@ --let $node_1 = node_1 --let $node_2 = node_2 --let $node_3 = node_3 - --source ../galera/include/auto_increment_offset_save.inc --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_pc_weight.test 2025-05-19 16:14:24.000000000 +0000 @@ -132,11 +132,11 @@ --connection node_2 CALL mtr.add_suppression('SYNC message from member'); -CALL mtr.add_suppression('user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket 
is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('WSREP: user message in state LEAVING'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); --connection node_3 CALL mtr.add_suppression('WSREP: user message in state LEAVING'); -CALL mtr.add_suppression('sending install message failed: (Transport endpoint is not connected|Socket is not connected)'); -CALL mtr.add_suppression('overriding reported weight for'); +CALL mtr.add_suppression('sending install message failed: (Transport endpoint|Socket) is not connected'); +CALL mtr.add_suppression('overriding reported weight for '); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_safe_to_bootstrap.test 2025-05-19 16:14:24.000000000 +0000 @@ -14,7 +14,6 @@ --let $node_1 = node_1 --let $node_2 = node_2 --let $node_3 = node_3 - --source ../galera/include/auto_increment_offset_save.inc --connection node_1 @@ -195,7 +194,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); --connection node_3 CALL mtr.add_suppression("WSREP: no nodes coming from prim view, prim not possible"); @@ -210,7 +209,7 @@ CALL mtr.add_suppression("Plugin 'wsrep' registration as a STORAGE ENGINE failed\\."); CALL 
mtr.add_suppression("Plugin 'wsrep' registration as a FUNCTION failed\\."); CALL mtr.add_suppression("Failed to initialize plugins\\."); -CALL mtr.add_suppression("WSREP: gcs/src/gcs_core.cpp:core_handle_uuid_msg\\(\\)"); +CALL mtr.add_suppression("WSREP: gcs/src/gcs_core\\.cpp:core_handle_uuid_msg\\(\\)"); SHOW CREATE TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_ssl_reload.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,10 +5,10 @@ loose-galera-ssl-reload=1 [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem' 
+wsrep_provider_options='socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.3] -wsrep_provider_options='base_port=@mysqld.3.#galera_port;socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem' +wsrep_provider_options='socket.ssl=yes;socket.ssl_ca=@ENV.MYSQL_TEST_DIR/std_data/cacert.pem;socket.ssl_cert=@ENV.MYSQL_TEST_DIR/std_data/client-cert.pem;socket.ssl_key=@ENV.MYSQL_TEST_DIR/std_data/client-key.pem;repl.causal_read_timeout=PT90S;base_port=@mysqld.3.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,4 @@ +!include ../galera_3nodes.cnf + +[mysqld.2] +wsrep_sst_donor=node1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_sst_donor_non_prim.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,64 @@ +# +# 
Construct a situation where Donor node partitions in the +# middle of SST. The Donor should stay in non-Primary state instead of +# crashing in assertion in wsrep-lib. +# +# In the test, node_2 is restarted and node_1 configured to be +# the donor. Node_1 execution is stopped before sst_sent() is +# called and node_1 is made to partition from the cluster. +# + +--source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc +--source include/big_test.inc + +--let $galera_connection_name = node_3 +--let $galera_server_number = 3 +--source include/galera_connect.inc + +--let $node_1=node_1 +--let $node_2=node_2 +--let $node_3=node_3 +--source ../galera/include/auto_increment_offset_save.inc + +--connection node_2 +--source include/shutdown_mysqld.inc +--remove_file $MYSQLTEST_VARDIR/mysqld.2/data/grastate.dat + +--connection node_1 +SET GLOBAL debug_dbug = '+d,sync.wsrep_sst_donor_after_donation'; + +--connection node_2 +--source include/start_mysqld.inc + +--connection node_1 +SET DEBUG_SYNC = 'now WAIT_FOR sync.wsrep_sst_donor_after_donation_reached'; +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=1'; +SET SESSION wsrep_sync_wait=0; +--let $wait_condition = SELECT VARIABLE_VALUE = 'non-Primary' FROM information_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_status' +--source include/wait_condition.inc + +SET DEBUG_SYNC = 'now SIGNAL signal.wsrep_sst_donor_after_donation_continue'; +SET DEBUG_SYNC = 'RESET'; +SET GLOBAL debug_dbug = ''; + +SET GLOBAL wsrep_provider_options = 'gmcast.isolate=0'; +SET SESSION wsrep_sync_wait=15; + +--let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM information_schema.global_status WHERE VARIABLE_NAME = 'wsrep_cluster_size' +--connection node_1 +--source include/wait_condition.inc +--connection node_2 +--source include/wait_condition.inc +--connection node_3 + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 'ON' FROM information_schema.global_status 
WHERE VARIABLE_NAME = 'wsrep_ready' +--source include/wait_condition.inc + +--source ../galera/include/auto_increment_offset_restore.inc + +--connection node_1 +CALL mtr.add_suppression("WSREP: sst sent called when not SST donor, state CONNECTED"); +CALL mtr.add_suppression("WSREP: .* returned an error: Not connected to Primary Component"); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_vote_rejoin_mysqldump.test 2025-05-19 16:14:24.000000000 +0000 @@ -69,7 +69,6 @@ --connection node_2 SHOW CREATE TABLE t1; -CALL mtr.add_suppression("is inconsistent with group"); --connection node_3 SHOW CREATE TABLE t1; @@ -83,6 +82,7 @@ # restart node so we don't fail on WSREP_START_POSITION internal check --source include/restart_mysqld.inc --source include/wait_until_connected_again.inc +CALL mtr.add_suppression("WSREP: .+ is inconsistent with group"); --connection node_1 --let $wait_condition = SELECT VARIABLE_VALUE = 3 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connection node_1 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/galera_wsrep_schema_init.test 2025-05-19 16:14:24.000000000 +0000 @@ -10,6 +10,7 @@ --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connection node_1 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 @@ -55,4 +56,3 @@ SELECT cluster_uuid = (SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_state_uuid') FROM mysql.wsrep_cluster_members; --source ../galera/include/auto_increment_offset_restore.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes/t/inconsistency_shutdown.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -5,5 +5,4 @@ wsrep-ignore-apply-errors=0 [ENV] -galera_cluster_size = 3 - +galera_cluster_size=3 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/MDEV-26707.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,7 +2,7 @@ connection node_1; connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; -connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3; +connect node_3, 127.0.0.1, 
root, , test, $NODE_MYPORT_3; connect node_3a, 127.0.0.1, root, , test, $NODE_MYPORT_3; connection node_1; connection node_2; @@ -45,7 +45,7 @@ SET SESSION wsrep_sync_wait = DEFAULT; SET DEBUG_SYNC = 'now SIGNAL continue'; connection node_2; -ERROR HY000: Got error 6 "No such device or address" during COMMIT +ERROR HY000: Error while appending streaming replication fragment(provider status: Not connected to Primary Component) connection node_2a; SET DEBUG_SYNC = 'RESET'; connection node_1a; @@ -74,15 +74,15 @@ SET SESSION wsrep_sync_wait = 0; SET SESSION wsrep_sync_wait = DEFAULT; connection node_1a; -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 connection node_2a; -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 connection node_3a; -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 connection node_1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/r/galera_sr_kill_slave_before_apply.result 2025-05-19 16:14:24.000000000 +0000 @@ -44,7 +44,7 @@ SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; EXPECT_0 0 -call mtr.add_suppression("WSREP: node uuid:.*"); +call mtr.add_suppression("WSREP: node uuid:"); connection node_1; DROP TABLE t1; DROP TABLE t2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/suite.pm mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/suite.pm 2025-01-30 11:01:23.000000000 +0000 
+++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -9,38 +9,39 @@ push @::global_suppressions, ( - qr(WSREP: wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), qr(WSREP: last inactive check more than .* skipping check), qr(WSREP: SQL statement was ineffective), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. 
Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). Will retry in new primary component.), qr(WSREP: Could not find peer:), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr(WSREP: JOIN message from member .* in non-primary configuration. 
Ignored.), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, ); bless { }; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-606.test 2025-05-19 16:14:24.000000000 +0000 @@ -85,4 +85,5 @@ --connection node_2 CALL mtr.add_suppression("WSREP: failed to send SR rollback for "); + --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-817.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,6 +5,7 @@ --source include/galera_cluster.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/GCF-832.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ --source include/force_restart.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/MDEV-26707.test 2025-05-19 16:14:24.000000000 +0000 @@ -21,7 +21,7 @@ --connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 --connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 ---connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 +--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connect node_3a, 127.0.0.1, root, , test, $NODE_MYPORT_3 # Save original auto_increment_offset values. 
@@ -158,15 +158,15 @@ --connection node_1a --let $wait_condition = SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log --source include/wait_condition.inc -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; --connection node_2a --let $wait_condition = SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log --source include/wait_condition.inc -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; --connection node_3a --let $wait_condition = SELECT COUNT(*) = 0 FROM mysql.wsrep_streaming_log --source include/wait_condition.inc -SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; +SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; --connection node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_isolate_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,6 +6,7 @@ # Test the effect of gmcast.isolate on master during an SR transaction # --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_join_slave.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 --connection node_1 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -6,6 +6,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_after_apply_rollback2.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. --let $node_1=node_1 --let $node_2=node_2 @@ -65,4 +66,5 @@ --connection node_1 --disconnect node_1a DROP TABLE t1; + --source ../galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_kill_slave_before_apply.test 2025-05-19 16:14:24.000000000 +0000 @@ -9,6 +9,7 @@ --source include/force_restart.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 @@ -88,7 +89,7 @@ --connection node_2 SELECT COUNT(*) AS EXPECT_0 FROM mysql.wsrep_streaming_log; # As noted above sometimes node delivers the same view twice -call mtr.add_suppression("WSREP: node uuid:.*"); +call mtr.add_suppression("WSREP: node uuid:"); --connection node_1 DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,4 @@ !include ../galera_3nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.weight=3' - +wsrep_provider_options='pc.weight=3;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_sr_threeway_split.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,7 @@ --source include/have_innodb.inc --connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3 + # Save original auto_increment_offset values. 
--let $node_1=node_1 --let $node_2=node_2 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt --- mariadb-10.11.11/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_3nodes_sr/t/galera_vote_sr-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1 @@ --wsrep-ignore-apply-errors=0 - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/disabled.def mariadb-10.11.13/mysql-test/suite/galera_sr/disabled.def --- mariadb-10.11.11/mysql-test/suite/galera_sr/disabled.def 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/disabled.def 2025-05-19 16:14:24.000000000 +0000 @@ -9,7 +9,3 @@ # Do not use any TAB characters for whitespace. # ############################################################################## - -GCF-1060 : MDEV-32160 GCF-1060 test failure due to wsrep MDL conflict -# Links to below failures in MDEV-30172 -MDEV-25718 : timeout related to wsrep_sync_wait and DEBUG_SYNC diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/MENT-2042.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/MENT-2042.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/MENT-2042.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/MENT-2042.result 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,9 @@ +connection node_2; +connection node_1; +connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1; +connection node_1; +CREATE TABLE t1 (f1 INTEGER PRIMARY KEY); +XA START 'a'; +ERROR 42000: This version of MariaDB doesn't yet support 'XA transactions with Galera replication' +DROP TABLE t1; +disconnect node_1a; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result --- 
mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_cc_master.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ connection node_2; connection node_1; -CALL mtr.add_suppression("WSREP: discarding established.*"); +CALL mtr.add_suppression("WSREP: discarding established"); connection node_1; connection node_2; connection node_2; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_kill_all_norecovery.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,6 +2,7 @@ connection node_1; connection node_1; connection node_2; +connection node_1; CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; SET SESSION wsrep_trx_fragment_size = 1; SET AUTOCOMMIT=OFF; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_myisam.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_myisam.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/galera_sr_myisam.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/galera_sr_myisam.result 2025-05-19 16:14:24.000000000 +0000 @@ -14,3 +14,4 @@ 1 DROP TABLE t1; connection node_1; +SET GLOBAL wsrep_mode = DEFAULT; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result mariadb-10.11.13/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result --- mariadb-10.11.11/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/r/mysql-wsrep-features#148.result 2025-05-19 16:14:24.000000000 +0000 @@ -25,7 +25,7 @@ connection node_1; Got one of the 
listed errors connection node_2; -SET GLOBAL wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; SET GLOBAL debug_dbug = ''; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/suite.pm mariadb-10.11.13/mysql-test/suite/galera_sr/suite.pm --- mariadb-10.11.11/mysql-test/suite/galera_sr/suite.pm 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/suite.pm 2025-05-19 16:14:24.000000000 +0000 @@ -9,62 +9,64 @@ push @::global_suppressions, ( - qr(WSREP: wsrep_sst_receive_address is set to '127.0.0.1), - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr(WSREP: Gap in state sequence. Need state transfer.), + qr(WSREP: wsrep_sst_receive_address is set to '127\.0\.0\.1), + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr(WSREP: Gap in state sequence\. Need state transfer\.), qr(WSREP: Failed to prepare for incremental state transfer:), - qr(WSREP:.*down context.*), + qr(WSREP: .*down context.*), qr(WSREP: Failed to send state UUID:), - qr(WSREP: last inactive check more than .* skipping check), - qr(WSREP: Releasing seqno [0-9]* before [0-9]* was assigned.), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: last inactive check more than .+ skipping check), + qr(WSREP: Releasing seqno [0-9]+ before [0-9]+ was assigned\.), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, qr(WSREP: Quorum: No node with complete state), qr(WSREP: Initial position was provided by configuration or SST, avoiding override), - qr|WSREP: discarding established \(time wait\) .*|, - qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside. 
Will use that one.), + qr|WSREP: discarding established \(time wait\) |, + qr(WSREP: There are no nodes in the same segment that will ever be able to become donors, yet there is a suitable donor outside\. Will use that one\.), qr(WSREP: evs::proto.*), - qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:.*|, + qr|WSREP: Ignoring possible split-brain \(allowed by configuration\) from view:|, qr(WSREP: no nodes coming from prim view, prim not possible), - qr(WSREP: Member .* requested state transfer from .* but it is impossible to select State Transfer donor: Resource temporarily unavailable), + qr(WSREP: Member .+ ?requested state transfer from .+ but it is impossible to select State Transfer donor: Resource temporarily unavailable), qr(WSREP: user message in state LEAVING), - qr(WSREP: .* sending install message failed: Transport endpoint is not connected), + qr(WSREP: .* sending install message failed: (Transport endpoint|Socket) is not connected), qr(WSREP: .* sending install message failed: Resource temporarily unavailable), - qr(WSREP: Maximum writeset size exceeded by .*), - qr(WSREP: transaction size exceeded.*), - qr(WSREP: RBR event .*), - qr(WSREP: Ignoring error for TO isolated action: .*), - qr(WSREP: transaction size limit .*), - qr(WSREP: rbr write fail, .*), - qr(WSREP: .*Backend not supported: foo.*), - qr(WSREP: .*Failed to initialize backend using .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: Maximum writeset size exceeded by ), + qr(WSREP: transaction size exceeded), + qr(WSREP: RBR event ), + qr(WSREP: Ignoring error for TO isolated action: ), + qr(WSREP: transaction size limit ), + qr(WSREP: rbr write fail, ), + qr(WSREP: .*Backend not supported: foo), + qr(WSREP: .*Failed to initialize backend using ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Socket type not supported), qr(WSREP: failed to open gcomm backend connection: 110: failed 
to reach primary view: 110 .*), - qr(WSREP: .*Failed to open backend connection: -110 .*), - qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at .*), + qr(WSREP: .*Failed to open backend connection: -110 ), + qr(WSREP: .*Failed to open channel 'my_wsrep_cluster' at ), qr(WSREP: gcs connect failed: Connection timed out), qr|WSREP: wsrep::connect\(.*\) failed: 7|, - qr(WSREP: SYNC message from member .* in non-primary configuration. Ignored.), + qr(WSREP: SYNC message from member .+ ?in non-primary configuration\. Ignored\.), qr(WSREP: Could not find peer:), - qr(WSREP: TO isolation failed for: .*), - qr|WSREP: gcs_caused\(\) returned .*|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(SYNCED\). Message ignored.|, - qr|WSREP: Protocol violation. JOIN message sender .* is not in state transfer \(JOINED\). Message ignored.|, - qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled. Expect abort.|, + qr(WSREP: TO isolation failed for: ), + qr|WSREP: gcs_caused\(\) returned |, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(SYNCED\)\. Message ignored\.|, + qr|WSREP: Protocol violation\. JOIN message sender .+ ?is not in state transfer \(JOINED\)\. Message ignored\.|, + qr|WSREP: Unsupported protocol downgrade: incremental data collection disabled\. Expect abort\.|, qr(WSREP: Action message in non-primary configuration from member [0-9]*), qr(WSREP: Last Applied Action message in non-primary configuration from member [0-9]*), - qr(WSREP: discarding established .*), - qr|WSREP: .*core_handle_uuid_msg.*|, - qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0. WSREP_SYNC_WAIT_BEFORE_READ is on), - qr|WSREP: JOIN message from member .* in non-primary configuration. 
Ignored.|, - qr|Query apply failed:*|, - qr(WSREP: Ignoring error*), - qr(WSREP: Failed to remove page file .*), - qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to .*), - qr|WSREP: Sending JOIN failed: -107 \(Transport endpoint is not connected\). Will retry in new primary component.|, + qr(WSREP: discarding established ), + qr|WSREP: .*core_handle_uuid_msg|, + qr(WSREP: --wsrep-causal-reads=ON takes precedence over --wsrep-sync-wait=0\. WSREP_SYNC_WAIT_BEFORE_READ is on), + qr|WSREP: JOIN message from member .+ ?in non-primary configuration\. Ignored\.|, + qr|WSREP: .*Query apply failed:|, + qr(WSREP: Ignoring error), + qr(WSREP: Failed to remove page file ), + qr(WSREP: wsrep_sst_method is set to 'mysqldump' yet mysqld bind_address is set to ), + qr+WSREP: Sending JOIN failed: -107 \((Transport endpoint|Socket) is not connected\)\. Will retry in new primary component\.+, + qr+WSREP: Send action \{.* STATE_REQUEST\} returned -107 \((Transport endpoint|Socket) is not connected\)+, qr|WSREP: Trying to continue unpaused monitor|, qr|WSREP: Wait for gtid returned error 3 while waiting for prior transactions to commit before setting position|, + qr|WSREP: Failed to report last committed|, ); bless { }; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-27615.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-27615.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-27615.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-27615.test 2025-05-19 16:14:24.000000000 +0000 @@ -69,5 +69,4 @@ --disconnect node_2 --connect node_2, 127.0.0.1, root, , test, $NODE_MYPORT_2 - --source suite/galera/include/auto_increment_offset_restore.inc diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-28971.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-28971.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/MDEV-28971.test 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/galera_sr/t/MDEV-28971.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ # --source include/galera_cluster.inc +--source include/have_sequence.inc CREATE SEQUENCE SEQ NOCACHE ENGINE=InnoDB; SET SESSION wsrep_trx_fragment_size=1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/MENT-2042.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/MENT-2042.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/MENT-2042.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/MENT-2042.test 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,23 @@ +# +# MENT-2042 Assertion `bf_aborted()' failed in wsrep::transaction::xa_replay_common() +# + +--source include/galera_cluster.inc +--source include/have_debug_sync.inc + +--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1 + +--connection node_1 +--let connection_id = `SELECT CONNECTION_ID()` + +CREATE TABLE t1 (f1 INTEGER PRIMARY KEY); + +# +# Execute XA transaction up to COMMIT +# + +--error ER_NOT_SUPPORTED_YET +XA START 'a'; + +DROP TABLE t1; +--disconnect node_1a diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_cc_master.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,7 +7,7 @@ # leave the cluster. # -CALL mtr.add_suppression("WSREP: discarding established.*"); +CALL mtr.add_suppression("WSREP: discarding established"); # Save original auto_increment_offset values. 
--let $node_1=node_1 diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_gtid-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ - --log-bin --log-slave-updates --loose-galera-sr-gtid-unique +--log-bin --log-slave-updates --loose-galera-sr-gtid-unique diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,8 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.recovery=false' +wsrep_provider_options='pc.recovery=false;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' + auto_increment_offset=1 [mysqld.2] diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_all_norecovery.test 2025-05-19 16:14:24.000000000 +0000 @@ -11,6 +11,8 @@ --let $node_2=node_2 --source ../../galera/include/auto_increment_offset_save.inc +--connection node_1 + CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB; SET SESSION wsrep_trx_fragment_size = 1; SET AUTOCOMMIT=OFF; 
@@ -26,7 +28,6 @@ --let $wait_condition = SELECT COUNT(*) > 0 FROM mysql.wsrep_streaming_log; --source include/wait_condition.inc - # # Kill the entire cluster and restart # diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_kill_slave.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ !include ../galera_2nodes.cnf [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;pc.weight=2' +wsrep_provider_options='pc.weight=2;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_myisam.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_myisam.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_myisam.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_myisam.test 2025-05-19 16:14:24.000000000 +0000 @@ -22,6 +22,4 @@ DROP TABLE t1; --connection node_1 ---disable_query_log SET GLOBAL wsrep_mode = DEFAULT; ---enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -4,8 +4,7 @@ # causes the first MTR connection to be forefully dropped by Galera, which in turn confuses MTR [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true' 
+wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' [mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true' - +wsrep_provider_options='pc.ignore_sb=true;repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=10M' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_mysqldump_sst.test 2025-05-19 16:14:24.000000000 +0000 @@ -85,4 +85,3 @@ # Restore original auto_increment_offset values. --source ../galera/include/auto_increment_offset_restore.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_shutdown_slave.test 2025-05-19 16:14:24.000000000 +0000 @@ -8,6 +8,7 @@ --let $node_1=node_1 --let $node_2=node_2 --source ../galera/include/auto_increment_offset_save.inc + --connection node_2 call mtr.add_suppression("WSREP: Failed to scan the last segment to the end\\. Last events may be missing\\. 
Last recovered event: "); diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/galera_sr_small_gcache.cnf 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,7 @@ !include ../galera_2nodes.cnf + [mysqld.1] -wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=16K' -[mysqld.2] -wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=16K' +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.1.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=16K' +[mysqld.2] +wsrep_provider_options='repl.causal_read_timeout=PT90S;base_port=@mysqld.2.#galera_port;evs.suspect_timeout=PT10S;evs.inactive_timeout=PT30S;evs.install_timeout=PT15S;pc.wait_prim_timeout=PT60S;gcache.size=16K' diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#14.test 2025-05-19 16:14:24.000000000 +0000 @@ -18,4 +18,3 @@ --connection node_2 --source include/galera_wait_ready.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#148.test 2025-05-19 16:14:24.000000000 +0000 @@ -47,7 +47,7 @@ --reap --connection node_2 -SET GLOBAL 
wsrep_slave_threads = 1; +SET GLOBAL wsrep_slave_threads = DEFAULT; SET GLOBAL debug_dbug = ''; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; SET DEBUG_SYNC='now SIGNAL signal.wsrep_apply_cb'; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#22.test 2025-05-19 16:14:24.000000000 +0000 @@ -40,7 +40,6 @@ --connection node_1 SELECT COUNT(*) = 6 FROM t1; - --connection node_2 SELECT COUNT(*) = 6 FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test --- mariadb-10.11.11/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/galera_sr/t/mysql-wsrep-features#96.test 2025-05-19 16:14:24.000000000 +0000 @@ -39,7 +39,3 @@ --connection node_1 DROP TABLE t1; DROP TABLE t2; - - - - diff -Nru mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_basic.result mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_basic.result --- mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_basic.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_basic.result 2025-05-19 16:14:24.000000000 +0000 @@ -86,6 +86,8 @@ DROP INDEX idx1 ON t; DROP INDEX idx2 ON t; DROP TABLE t; +# restart +set default_storage_engine=innodb; /* Test large BLOB data */ CREATE TABLE `t` ( `a` BLOB, diff -Nru mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_stats.result mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_stats.result --- mariadb-10.11.11/mysql-test/suite/gcol/r/innodb_virtual_stats.result 2025-01-30 11:01:23.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/gcol/r/innodb_virtual_stats.result 2025-05-19 16:14:24.000000000 +0000 @@ -38,6 +38,10 @@ idxa n_diff_pfx02 a,DB_ROW_ID idxa n_leaf_pages Number of leaf pages in the index idxa size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index vidxcd n_diff_pfx01 c vidxcd n_diff_pfx02 c,d vidxcd n_diff_pfx03 c,d,DB_ROW_ID @@ -54,6 +58,14 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxcd n_diff_pfx01 d +vidxcd n_diff_pfx02 d,DB_ROW_ID +vidxcd n_leaf_pages Number of leaf pages in the index +vidxcd size Number of pages in the index ALTER TABLE t ADD INDEX vidxe (e), ALGORITHM=INPLACE; select count(*) from t; count(*) @@ -65,6 +77,18 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxcd n_diff_pfx01 d +vidxcd n_diff_pfx02 d,DB_ROW_ID +vidxcd n_leaf_pages Number of leaf pages in the index +vidxcd size Number of pages in the index +vidxe n_diff_pfx01 e +vidxe n_diff_pfx02 e,DB_ROW_ID +vidxe n_leaf_pages Number of leaf pages in the index +vidxe size Number of pages in the index ALTER TABLE t ADD COLUMN f INT GENERATED ALWAYS AS(a + a), ADD INDEX vidxf (f), ALGORITHM=INPLACE; select count(*) from t; count(*) @@ -76,6 +100,22 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID 
+idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxcd n_diff_pfx01 d +vidxcd n_diff_pfx02 d,DB_ROW_ID +vidxcd n_leaf_pages Number of leaf pages in the index +vidxcd size Number of pages in the index +vidxe n_diff_pfx01 e +vidxe n_diff_pfx02 e,DB_ROW_ID +vidxe n_leaf_pages Number of leaf pages in the index +vidxe size Number of pages in the index +vidxf n_diff_pfx01 f +vidxf n_diff_pfx02 f,DB_ROW_ID +vidxf n_leaf_pages Number of leaf pages in the index +vidxf size Number of pages in the index ALTER TABLE t DROP INDEX vidxcd; SELECT index_name, stat_name, stat_description FROM mysql.innodb_index_stats @@ -84,4 +124,16 @@ GEN_CLUST_INDEX n_diff_pfx01 DB_ROW_ID GEN_CLUST_INDEX n_leaf_pages Number of leaf pages in the index GEN_CLUST_INDEX size Number of pages in the index +idxb n_diff_pfx01 b +idxb n_diff_pfx02 b,DB_ROW_ID +idxb n_leaf_pages Number of leaf pages in the index +idxb size Number of pages in the index +vidxe n_diff_pfx01 e +vidxe n_diff_pfx02 e,DB_ROW_ID +vidxe n_leaf_pages Number of leaf pages in the index +vidxe size Number of pages in the index +vidxf n_diff_pfx01 f +vidxf n_diff_pfx02 f,DB_ROW_ID +vidxf n_leaf_pages Number of leaf pages in the index +vidxf size Number of pages in the index DROP TABLE t; diff -Nru mariadb-10.11.11/mysql-test/suite/gcol/t/innodb_virtual_basic.test mariadb-10.11.13/mysql-test/suite/gcol/t/innodb_virtual_basic.test --- mariadb-10.11.11/mysql-test/suite/gcol/t/innodb_virtual_basic.test 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/gcol/t/innodb_virtual_basic.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ --source include/have_innodb.inc --source include/have_partition.inc ---source include/big_test.inc +--source include/not_embedded.inc call mtr.add_suppression("\\[Warning\\] InnoDB: Compute virtual"); @@ -66,6 +66,41 @@ DROP INDEX idx2 ON t; DROP TABLE t; +let MYSQLD_DATADIR=`select @@datadir`; +let PAGE_SIZE=`select @@innodb_page_size`; 
+--source include/shutdown_mysqld.inc +perl; +do "$ENV{MTR_SUITE_DIR}/../innodb/include/crc32.pl"; +my $file = "$ENV{MYSQLD_DATADIR}/ibdata1"; +open(FILE, "+<$file") || die "Unable to open $file"; +binmode FILE; +my $ps= $ENV{PAGE_SIZE}; +my $page; +die "Unable to read $file" unless sysread(FILE, $page, $ps) == $ps; +my $full_crc32 = unpack("N",substr($page,54,4)) & 0x10; # FIL_SPACE_FLAGS +sysseek(FILE, 7*$ps, 0) || die "Unable to seek $file\n"; +die "Unable to read $file" unless sysread(FILE, $page, $ps) == $ps; +substr($page,54,4)=pack("N",0xc001cafe); # 32 MSB of 64-bit DICT_HDR_INDEX_ID +my $polynomial = 0x82f63b78; # CRC-32C +if ($full_crc32) +{ + my $ck = mycrc32(substr($page, 0, $ps-4), 0, $polynomial); + substr($page, $ps-4, 4) = pack("N", $ck); +} +else +{ + my $ck= pack("N",mycrc32(substr($page, 4, 22), 0, $polynomial) ^ + mycrc32(substr($page, 38, $ps - 38 - 8), 0, $polynomial)); + substr($page,0,4)=$ck; + substr($page,$ps-8,4)=$ck; +} +sysseek(FILE, 7*$ps, 0) || die "Unable to rewind $file\n"; +syswrite(FILE, $page, $ps)==$ps || die "Unable to write $file\n"; +close(FILE) || die "Unable to close $file"; +EOF +--source include/start_mysqld.inc +set default_storage_engine=innodb; + /* Test large BLOB data */ CREATE TABLE `t` ( `a` BLOB, diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/alter_copy_bulk.result mariadb-10.11.13/mysql-test/suite/innodb/r/alter_copy_bulk.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/alter_copy_bulk.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/alter_copy_bulk.result 2025-05-19 16:14:24.000000000 +0000 @@ -91,3 +91,24 @@ ALTER TABLE t1 FORCE, ALGORITHM=COPY; DROP TABLE t1; SET GLOBAL innodb_stats_persistent=@default_stats_persistent; +# +# MDEV-36504 Memory leak after insert into empty table +# +CREATE TABLE t1 (k INT PRIMARY KEY)ENGINE=InnoDB; +INSERT INTO t1 SET k= 1; +START TRANSACTION; +INSERT INTO t1 SET k= 2; +SELECT COUNT(*) > 0 FROM mysql.innodb_index_stats LOCK IN 
SHARE MODE; +COUNT(*) > 0 +1 +connect con1,localhost,root,,,; +SET innodb_lock_wait_timeout=0; +CREATE TABLE t2(f1 INT DEFAULT 1 PRIMARY KEY) +STATS_PERSISTENT= 1 ENGINE=InnoDB as SELECT k FROM t1; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction +disconnect con1; +connection default; +SET innodb_lock_wait_timeout=default; +DROP TABLE t1; +DROP TABLE IF EXISTS t2; +# restart diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/alter_partitioned_debug.result mariadb-10.11.13/mysql-test/suite/innodb/r/alter_partitioned_debug.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/alter_partitioned_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/alter_partitioned_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ CREATE TABLE t1 (a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=0 PARTITION BY RANGE(a) (PARTITION pa VALUES LESS THAN (3), PARTITION pb VALUES LESS THAN (5)); @@ -19,9 +20,30 @@ ERROR 23000: Duplicate entry '2-two' for key 'a' connection default; DELETE FROM t1; -disconnect ddl; SET DEBUG_SYNC = 'RESET'; CHECK TABLE t1; Table Op Msg_type Msg_text test.t1 check status OK -DROP TABLE t1; +CREATE TABLE t(a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=1; +RENAME TABLE t TO u; +DELETE FROM mysql.innodb_table_stats WHERE table_name='u'; +DELETE FROM mysql.innodb_index_stats WHERE table_name='u'; +SET STATEMENT debug_dbug='+d,dict_stats_save_exit_notify_and_wait' FOR +SELECT * FROM u; +connection ddl; +SET DEBUG_SYNC='open_tables_after_open_and_process_table +WAIT_FOR dict_stats_save_finished'; +ALTER TABLE t1 EXCHANGE PARTITION pb WITH TABLE u; +connect sync,localhost,root; +SET DEBUG_SYNC='now SIGNAL dict_stats_save_unblock'; +disconnect sync; +connection default; +a b +connection ddl; +disconnect ddl; +connection default; +SELECT * FROM u; +a b +SET DEBUG_SYNC = 'RESET'; +DROP TABLE t1,u; diff -Nru 
mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist,desc.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,6 @@ -@@ -13,212 +13,212 @@ +--- autoinc_persist.result ++++ autoinc_persist.result,desc +@@ -13,224 +13,224 @@ # # Pre-create several tables SET SQL_MODE='STRICT_ALL_TABLES'; @@ -296,8 +298,7 @@ +2 +1 +CREATE TABLE t11(a FLOAT AUTO_INCREMENT, PRIMARY KEY(a DESC)) ENGINE = InnoDB; - INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), - (20), (30), (31); + INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t11; a --10 @@ -310,7 +311,7 @@ -20 -30 31 --CREATE TABLE t12(a DOUBLE AUTO_INCREMENT KEY) ENGINE = InnoDB; +-CREATE TABLE t11u(a FLOAT UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +30 +20 +5 @@ -320,9 +321,30 @@ +1 +-1 +-10 ++CREATE TABLE t11u(a FLOAT UNSIGNED AUTO_INCREMENT, PRIMARY KEY(a DESC)) ENGINE = InnoDB; + INSERT INTO t11u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); + ERROR 22003: Out of range value for column 'a' at row 5 + INSERT INTO t11u VALUES(0), (0), (0), (0), (0), (20), (30), (31); + SELECT * FROM t11u; + a +-11 +-12 +-13 +-14 +-15 +-20 +-30 + 31 +-CREATE TABLE t12(a DOUBLE AUTO_INCREMENT KEY) ENGINE = InnoDB; ++30 ++20 ++15 ++14 ++13 ++12 ++11 +CREATE TABLE t12(a DOUBLE AUTO_INCREMENT, PRIMARY KEY(a DESC)) ENGINE = InnoDB; - INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), - (20), (30), (31); + INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t12; a --10 @@ -344,10 +366,10 @@ +1 +-1 +-10 - # Scenario 1: Normal restart, to test if the counters are persisted - # Scenario 2: Delete some values, to test the counters should not be the - # one which is the largest in 
current table -@@ -242,14 +242,14 @@ + CREATE TABLE t12u(a DOUBLE UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; + INSERT INTO t12u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); + ERROR 22003: Out of range value for column 'a' at row 5 +@@ -268,14 +268,14 @@ SELECT MAX(a) AS `Expect 100000000000` FROM t9; Expect 100000000000 100000000000 @@ -364,7 +386,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=1234 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t13 VALUES(0); SELECT a AS `Expect 1234` FROM t13; -@@ -464,28 +464,28 @@ +@@ -490,28 +490,28 @@ INSERT INTO t1 VALUES(0), (0); SELECT * FROM t1; a @@ -398,7 +420,7 @@ # Ensure that all changes before the server is killed are persisted. set global innodb_flush_log_at_trx_commit=1; TRUNCATE TABLE t1; -@@ -498,63 +498,63 @@ +@@ -524,63 +524,63 @@ INSERT INTO t19 VALUES(0), (0); SELECT * FROM t19; a @@ -481,7 +503,7 @@ DELETE FROM t3 WHERE a > 300; SELECT MAX(a) AS `Expect 200` FROM t3; Expect 200 -@@ -566,7 +566,7 @@ +@@ -592,7 +592,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -490,7 +512,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=201 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t3 VALUES(0); SELECT MAX(a) AS `Expect 201` FROM t3; -@@ -579,7 +579,7 @@ +@@ -605,7 +605,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -499,7 +521,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=500 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t3 VALUES(0); SELECT MAX(a) AS `Expect 500` FROM t3; -@@ -591,13 +591,13 @@ +@@ -617,13 +617,13 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -515,7 +537,7 @@ INSERT INTO t3 VALUES(150), (180); UPDATE t3 SET a = 200 WHERE a = 150; INSERT INTO t3 VALUES(220); -@@ -607,7 +607,7 @@ +@@ -633,7 +633,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -524,7 +546,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=221 DEFAULT 
CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t3 VALUES(0); SELECT MAX(a) AS `Expect 221` FROM t3; -@@ -619,7 +619,7 @@ +@@ -645,7 +645,7 @@ Table Create Table t3 CREATE TABLE `t3` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -533,7 +555,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=120 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # MDEV-6076: Test adding an AUTO_INCREMENT COLUMN CREATE TABLE mdev6076a (b INT) ENGINE=InnoDB; -@@ -669,18 +669,18 @@ +@@ -695,18 +695,18 @@ INSERT INTO t_inplace SELECT * FROM t3; SELECT * FROM t_inplace; a @@ -559,7 +581,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=211 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This will keep the autoinc counter ALTER TABLE t_inplace AUTO_INCREMENT = 250, ALGORITHM = INPLACE; -@@ -689,7 +689,7 @@ +@@ -715,7 +715,7 @@ Table Create Table t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -568,7 +590,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=250 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should keep the autoinc counter as well ALTER TABLE t_inplace ADD COLUMN b INT, ALGORITHM = INPLACE; -@@ -699,16 +699,16 @@ +@@ -725,16 +725,16 @@ t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -590,7 +612,7 @@ # This should reset the autoinc counter to the one specified # Since it's smaller than current one but bigger than existing # biggest counter in the table -@@ -719,7 +719,7 @@ +@@ -745,7 +745,7 @@ t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -599,7 +621,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=180 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should reset the autoinc counter to the next value of # current max counter in the table, since the specified value -@@ -730,7 +730,7 @@ +@@ -756,7 +756,7 @@ Table Create Table t_inplace CREATE TABLE `t_inplace` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -608,7 +630,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=123 
DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t_inplace VALUES(0), (0); SELECT MAX(a) AS `Expect 124` FROM t_inplace; -@@ -757,18 +757,18 @@ +@@ -783,18 +783,18 @@ INSERT INTO t_copy SELECT * FROM t3; SELECT * FROM t_copy; a @@ -634,7 +656,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=211 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This will keep the autoinc counter ALTER TABLE t_copy AUTO_INCREMENT = 250, ALGORITHM = COPY; -@@ -777,7 +777,7 @@ +@@ -803,7 +803,7 @@ Table Create Table t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -643,7 +665,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=250 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should keep the autoinc counter as well ALTER TABLE t_copy ADD COLUMN b INT, ALGORITHM = COPY; -@@ -787,16 +787,16 @@ +@@ -813,16 +813,16 @@ t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -665,7 +687,7 @@ # This should reset the autoinc counter to the one specified # Since it's smaller than current one but bigger than existing # biggest counter in the table -@@ -807,7 +807,7 @@ +@@ -833,7 +833,7 @@ t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, `b` int(11) DEFAULT NULL, @@ -674,7 +696,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=180 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci # This should reset the autoinc counter to the next value of # current max counter in the table, since the specified value -@@ -818,7 +818,7 @@ +@@ -844,7 +844,7 @@ Table Create Table t_copy CREATE TABLE `t_copy` ( `a` smallint(6) NOT NULL AUTO_INCREMENT, @@ -683,7 +705,7 @@ ) ENGINE=InnoDB AUTO_INCREMENT=123 DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci INSERT INTO t_copy VALUES(0), (0); SELECT MAX(a) AS `Expect 124` FROM t_copy; -@@ -842,7 +842,7 @@ +@@ -868,7 +868,7 @@ 126 DROP TABLE t_copy, it_copy; # Scenario 9: Test the sql_mode = NO_AUTO_VALUE_ON_ZERO @@ -692,7 +714,7 @@ set SQL_MODE = NO_AUTO_VALUE_ON_ZERO; INSERT INTO t30 VALUES(NULL, 
1), (200, 2), (0, 3); INSERT INTO t30(b) VALUES(4), (5), (6), (7); -@@ -869,20 +869,20 @@ +@@ -895,20 +895,20 @@ set global innodb_flush_log_at_trx_commit=1; CREATE TABLE t31 (a INT) ENGINE = InnoDB; INSERT INTO t31 VALUES(1), (2); @@ -719,7 +741,7 @@ INSERT INTO t32 VALUES(0), (0); # Ensure that all changes before the server is killed are persisted. set global innodb_flush_log_at_trx_commit=1; -@@ -897,7 +897,7 @@ +@@ -923,7 +923,7 @@ # increasing the counter CREATE TABLE t33 ( a BIGINT NOT NULL PRIMARY KEY, @@ -728,7 +750,7 @@ INSERT INTO t33 VALUES(1, NULL); INSERT INTO t33 VALUES(2, NULL); INSERT INTO t33 VALUES(2, NULL); -@@ -920,13 +920,13 @@ +@@ -946,13 +946,13 @@ INSERT INTO t31(a) VALUES(6), (0); SELECT * FROM t31; a b @@ -748,7 +770,7 @@ DROP TABLE t31; set SQL_MODE = NO_AUTO_VALUE_ON_ZERO; DELETE FROM t30 WHERE a = 0; -@@ -965,7 +965,7 @@ +@@ -991,7 +991,7 @@ DROP TABLE t33; CREATE TABLE t33 ( a BIGINT NOT NULL PRIMARY KEY, @@ -757,7 +779,7 @@ ALTER TABLE t33 DISCARD TABLESPACE; restore: t33 .ibd and .cfg files ALTER TABLE t33 IMPORT TABLESPACE; -@@ -975,7 +975,7 @@ +@@ -1001,8 +1001,8 @@ 4 SELECT * FROM t33; a b @@ -766,4 +788,5 @@ 3 4 +2 2 +10 1 - DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t30, t32, t33; + DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t11u, t12u, + t30, t32, t33; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist.result mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/autoinc_persist.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/autoinc_persist.result 2025-05-19 16:14:24.000000000 +0000 @@ -190,8 +190,7 @@ 100000000000 100000000006 CREATE TABLE t11(a FLOAT AUTO_INCREMENT KEY) ENGINE = InnoDB; -INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t11; a -10 @@ 
-204,9 +203,22 @@ 20 30 31 +CREATE TABLE t11u(a FLOAT UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +INSERT INTO t11u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +ERROR 22003: Out of range value for column 'a' at row 5 +INSERT INTO t11u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t11u; +a +11 +12 +13 +14 +15 +20 +30 +31 CREATE TABLE t12(a DOUBLE AUTO_INCREMENT KEY) ENGINE = InnoDB; -INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t12; a -10 @@ -219,6 +231,20 @@ 20 30 31 +CREATE TABLE t12u(a DOUBLE UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +INSERT INTO t12u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +ERROR 22003: Out of range value for column 'a' at row 5 +INSERT INTO t12u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t12u; +a +11 +12 +13 +14 +15 +20 +30 +31 # Scenario 1: Normal restart, to test if the counters are persisted # Scenario 2: Delete some values, to test the counters should not be the # one which is the largest in current table @@ -978,4 +1004,5 @@ 10 1 2 2 3 4 -DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t30, t32, t33; +DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t11u, t12u, +t30, t32, t33; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/buf_pool_resize_oom.result mariadb-10.11.13/mysql-test/suite/innodb/r/buf_pool_resize_oom.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/buf_pool_resize_oom.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/buf_pool_resize_oom.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -# -# Bug #21348684 SIGABRT DURING RESIZING THE INNODB BUFFER POOL -# ONLINE WITH MEMORY FULL CONDITION -# -call mtr.add_suppression("InnoDB: failed to allocate the chunk array"); -SET GLOBAL debug_dbug='+d,buf_pool_resize_chunk_null'; -SET GLOBAL 
innodb_buffer_pool_size=@@innodb_buffer_pool_size + 1048576; -# restart diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/doublewrite.result mariadb-10.11.13/mysql-test/suite/innodb/r/doublewrite.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/doublewrite.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/doublewrite.result 2025-05-19 16:14:24.000000000 +0000 @@ -11,9 +11,11 @@ commit work; SET GLOBAL innodb_fast_shutdown = 0; # restart +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +SET GLOBAL innodb_max_dirty_pages_pct=99; connect dml,localhost,root,,; XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); +insert into t1 values(6, repeat('%', @@innodb_page_size/2)); XA END 'x'; XA PREPARE 'x'; disconnect dml; @@ -23,7 +25,6 @@ # restart FOUND 1 /InnoDB: Recovered page \[page id: space=[1-9][0-9]*, page number=0\]/ in mysqld.1.err # restart -XA ROLLBACK 'x'; check table t1; Table Op Msg_type Msg_text test.t1 check status OK @@ -34,18 +35,13 @@ 3 //////////// 4 ------------ 5 ............ 
-connect dml,localhost,root,,; -XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); -XA END 'x'; -XA PREPARE 'x'; -disconnect dml; -connection default; -flush table t1 for export; +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +SET GLOBAL innodb_max_dirty_pages_pct=99; +XA ROLLBACK 'x'; +FLUSH TABLE t1 FOR EXPORT; # Kill the server # restart FOUND 4 /InnoDB: Recovered page \[page id: space=[1-9][0-9]*, page number=[03]\]/ in mysqld.1.err -XA ROLLBACK 'x'; check table t1; Table Op Msg_type Msg_text test.t1 check status OK diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/foreign_key.result mariadb-10.11.13/mysql-test/suite/innodb/r/foreign_key.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/foreign_key.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/foreign_key.result 2025-05-19 16:14:24.000000000 +0000 @@ -155,7 +155,6 @@ FLUSH TABLES; # restart disconnect incomplete; -SET @save_stats_persistent = @@GLOBAL.innodb_stats_persistent; SET GLOBAL innodb_stats_persistent = 0; INSERT INTO child SET a=0; INSERT INTO child SET a=1; @@ -1182,6 +1181,25 @@ ALTER TABLE t2 ADD KEY(b), ALGORITHM=NOCOPY; DELETE FROM t1; DROP TABLE t2, t1; +# +# MDEV-33167 ASAN errors after failing to load foreign key +# relation for the table +# +call mtr.add_suppression("InnoDB: Load table `test`.`t3` failed, the table has missing foreign key indexes. 
Turn off 'foreign_key_checks' and try again."); +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t1(f1 VARCHAR(8), +FOREIGN KEY(f1) REFERENCES test.t3(f1))ENGINE=InnoDB; +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t2(f1 VARCHAR(8), +FOREIGN KEY(f1) REFERENCES test.t3(f1)) +ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t3(f1 VARCHAR(8) PRIMARY KEY) +ENGINE=InnoDB DEFAULT CHARSET=latin1; +set GLOBAL innodb_fast_shutdown=0; +# restart +ALTER TABLE t2 FORCE; +DROP TABLE t2, t1, t3; # End of 10.6 tests CREATE TABLE t1 ( @@ -1204,5 +1222,4 @@ ADD UNIQUE INDEX(f3); ERROR HY000: Cannot delete rows from table which is parent in a foreign key constraint 't1_ibfk_1' of table 't1' drop table t1, t2; -SET GLOBAL innodb_stats_persistent = @save_stats_persistent; # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb-index-online.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb-index-online.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb-index-online.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb-index-online.result 2025-05-19 16:14:24.000000000 +0000 @@ -534,7 +534,6 @@ ROLLBACK; SET DEBUG_SYNC = 'now SIGNAL inserted'; connection con1; -disconnect con1; connection default; SELECT * FROM t1; a b @@ -543,6 +542,31 @@ Table Op Msg_type Msg_text test.t1 check status OK DROP TABLE t1; +# +# MDEV-36281 DML aborts during online virtual index +# +CREATE TABLE t1(f1 INT NOT NULL PRIMARY KEY, f2 INT NOT NULL, +f3 INT NOT NULL, f4 INT AS (f3) VIRTUAL, +f5 INT AS (f1) VIRTUAL, INDEX(f4))ENGINE=InnoDB; +INSERT INTO t1(f1, f2, f3) VALUES(1, 2, 3); +SET DEBUG_SYNC = 'innodb_inplace_alter_table_enter SIGNAL dml_start WAIT_FOR dml_finish'; +ALTER TABLE t1 ADD INDEX v1(f5, f2, f4), ADD INDEX v2(f3, f5); +connection con1; +set DEBUG_SYNC="now WAIT_FOR dml_start"; +UPDATE t1 SET f3= f3 + 1; +set DEBUG_SYNC="now SIGNAL dml_finish"; 
+disconnect con1; +connection default; +CHECK TABLE t1 EXTENDED; +Table Op Msg_type Msg_text +test.t1 check status OK +SELECT f5, f2, f4 FROM t1 USE INDEX(v1); +f5 f2 f4 +1 2 4 +SELECT f3, f5 FROM t1 USE INDEX(v2); +f3 f5 +4 1 +DROP TABLE t1; SET DEBUG_SYNC = 'RESET'; SET GLOBAL innodb_file_per_table = @global_innodb_file_per_table_orig; SET GLOBAL innodb_monitor_enable = default; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_fail.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ -call mtr.add_suppression("InnoDB: Cannot allocate memory for the buffer pool"); +call mtr.add_suppression("InnoDB: Cannot map innodb_buffer_pool_size_max="); call mtr.add_suppression("InnoDB: Plugin initialization aborted at srv0start.cc.*"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error."); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed."); @@ -6,4 +6,4 @@ # MDEV-25019 memory allocation failures during startup cause server failure in different, confusing ways # # restart: --debug_dbug=+d,ib_buf_chunk_init_fails -FOUND 1 /\[ERROR\] InnoDB: Cannot allocate memory for the buffer pool/ in mysqld.1.err +FOUND 1 /\[ERROR\] InnoDB: Cannot map innodb_buffer_pool_size_max=16m/ in mysqld.1.err diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,32 +1,51 @@ +# +# MDEV-29445: Reorganize buffer pool (and 
remove chunks) +# set global innodb_adaptive_hash_index=ON; select @@innodb_buffer_pool_size; @@innodb_buffer_pool_size 8388608 +set global innodb_buffer_pool_size = 9437184; set global innodb_buffer_pool_size = 10485760; select @@innodb_buffer_pool_size; @@innodb_buffer_pool_size 10485760 -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view view0 as select 1 union all select 1; -set @`v_id` := 0; -set @`v_val` := 0; -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; -set global innodb_buffer_pool_size = 64 * 1024 * 1024 + 512 * 1024; -Warnings: -Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '67633152' -select @@innodb_buffer_pool_size; -@@innodb_buffer_pool_size -68157440 +create table t1 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED; +create table t2 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=$kbs; +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t1 SELECT seq*4,seq*4 FROM seq_1_to_262144; +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t2 SELECT seq*4,seq*4 FROM seq_1_to_16384; +set global innodb_buffer_pool_size = 7340032; select count(val) from t1; count(val) 262144 +select count(val) from t2; +count(val) +16384 set global innodb_adaptive_hash_index=OFF; -set global innodb_buffer_pool_size = 25165824; +set global innodb_buffer_pool_size = 24117248; +set global innodb_buffer_pool_size = 26214400; +Warnings: +Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '26214400' select @@innodb_buffer_pool_size; @@innodb_buffer_pool_size 25165824 select count(val) from 
t1; count(val) 262144 -drop table t1; -drop view view0; +select count(val) from t2; +count(val) +16384 +drop table t1,t2; +SET GLOBAL innodb_max_purge_lag_wait = 0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; +SET GLOBAL innodb_buffer_pool_size = @old_innodb_buffer_pool_size; +SET GLOBAL innodb_adaptive_hash_index = @old_innodb_adaptive_hash_index; +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_bigtest.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -SET @save_size=@@innodb_buffer_pool_size; -# -# MDEV-27891: Delayed SIGSEGV in InnoDB buffer pool resize -# after or during DROP TABLE -# -select @@innodb_buffer_pool_chunk_size; -@@innodb_buffer_pool_chunk_size -1048576 -CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; -SET GLOBAL innodb_buffer_pool_size=256*1024*1024; -DROP TABLE t1; -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size + @@innodb_buffer_pool_chunk_size; -# End of 10.6 tests -SET GLOBAL innodb_buffer_pool_size=@save_size; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_temporary.result 2025-05-19 
16:14:24.000000000 +0000 @@ -4,7 +4,32 @@ SET GLOBAL innodb_buffer_pool_size=16777216; CREATE TEMPORARY TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t1 SELECT seq FROM seq_1_to_200; +SET GLOBAL innodb_max_purge_lag_wait=0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +Variable_name Value +Innodb_buffer_pool_resize_status +connect con1,localhost,root; +SET DEBUG_SYNC='buf_pool_shrink_before_wakeup SIGNAL blocked WAIT_FOR go'; SET GLOBAL innodb_buffer_pool_size=8388608; +connection default; +SET DEBUG_SYNC='now WAIT_FOR blocked'; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +Variable_name Value +Innodb_buffer_pool_resize_status Withdrawing blocks. (505/505). +SET DEBUG_SYNC='now SIGNAL go'; +connection con1; +disconnect con1; +connection default; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +Variable_name Value +Innodb_buffer_pool_resize_status +SET DEBUG_SYNC=RESET; +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; SELECT COUNT(*),MIN(a),MAX(a) FROM t1; COUNT(*) MIN(a) MAX(a) 200 1 200 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_buffer_pool_resize_with_chunks.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -select @@innodb_buffer_pool_chunk_size; -@@innodb_buffer_pool_chunk_size -4194304 -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view 
view0 as select 1 union all select 1; -set @`v_id` := 0; -set @`v_val` := 0; -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; -set global innodb_buffer_pool_size = 7340032; -Warnings: -Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '7340032' -select count(val) from t1; -count(val) -262144 -set global innodb_buffer_pool_size = 16777216; -select count(val) from t1; -count(val) -262144 -drop table t1; -drop view view0; -set global innodb_buffer_pool_size = 2*1048576; -Warnings: -Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '2097152' -select @@innodb_buffer_pool_size; -@@innodb_buffer_pool_size -4194304 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_bug52663.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_bug52663.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_bug52663.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_bug52663.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,10 +1,11 @@ +SET @save_innodb_timeout=@@innodb_lock_wait_timeout; +SET GLOBAL innodb_lock_wait_timeout=1; set session transaction isolation level read committed; create table innodb_bug52663 (what varchar(5), id integer, count integer, primary key (what, id)) engine=innodb; insert into innodb_bug52663 values ('total', 0, 0); begin; connect addconroot, localhost, root,,; -connection addconroot; set session transaction isolation level read committed; begin; connection default; @@ -31,3 +32,4 @@ what id count total 0 2 drop table innodb_bug52663; +SET GLOBAL innodb_lock_wait_timeout=@save_innodb_timeout; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result 
mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_row_lock_time_ms.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,10 +1,18 @@ CREATE TABLE `t`(`id` INT, PRIMARY KEY(`id`)) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t VALUES (1); -SET GLOBAL innodb_monitor_reset = "module_innodb"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time_max"; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time'; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time_max'; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time_max"; BEGIN; SELECT * FROM t FOR UPDATE; id 1 +SELECT @innodb_row_lock_time_before := variable_value +FROM information_schema.global_status +WHERE LOWER(variable_name) = 'innodb_row_lock_time'; connect con1,localhost,root,,; SET innodb_lock_wait_timeout = 1; SELECT * FROM t FOR UPDATE; @@ -12,29 +20,27 @@ disconnect con1; connection default; COMMIT; -SELECT variable_value > 100 FROM information_schema.global_status +SELECT variable_value - @innodb_row_lock_time_before > 100 +FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time'; -variable_value > 100 +variable_value - @innodb_row_lock_time_before > 100 1 -SELECT variable_value > 100 FROM information_schema.global_status +SELECT variable_value > 100 +FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time_max'; variable_value > 100 1 -SELECT variable_value > 100 FROM information_schema.global_status -WHERE LOWER(variable_name) = 'innodb_row_lock_time_avg'; -variable_value > 100 -1 -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS -WHERE NAME="lock_row_lock_time"; -count_reset > 100 -1 -SELECT count_reset > 100 FROM 
INFORMATION_SCHEMA.INNODB_METRICS -WHERE NAME="lock_row_lock_time_max"; +SELECT count_reset > 100 +FROM INFORMATION_SCHEMA.INNODB_METRICS +WHERE NAME='lock_row_lock_time'; count_reset > 100 1 -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS -WHERE NAME="lock_row_lock_time_avg"; +SELECT count_reset > 100 +FROM INFORMATION_SCHEMA.INNODB_METRICS +WHERE NAME='lock_row_lock_time_max'; count_reset > 100 1 DROP TABLE t; -SET GLOBAL innodb_monitor_reset=default; +SET GLOBAL innodb_monitor_enable=default; +SET GLOBAL innodb_monitor_disable=default; +SET GLOBAL innodb_monitor_reset_all=default; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_auto_recalc_on_nonexistent.result 2025-05-19 16:14:24.000000000 +0000 @@ -5,13 +5,13 @@ SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 3 SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 0 +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 1 @@ -25,13 +25,13 @@ SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 3 SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE 
table_name = 't'; COUNT(*) 0 +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 1 @@ -45,13 +45,13 @@ SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 3 SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 SELECT COUNT(*) FROM mysql.innodb_index_stats WHERE table_name = 't'; COUNT(*) 0 +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; SELECT COUNT(*) FROM mysql.innodb_table_stats WHERE table_name = 't'; COUNT(*) 0 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_fetch.result mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_fetch.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/innodb_stats_fetch.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/innodb_stats_fetch.result 2025-05-19 16:14:24.000000000 +0000 @@ -125,7 +125,7 @@ table_name = 'test_ps_fetch' AND index_name = 'idx' AND stat_name = 'n_diff_pfx02'; -FLUSH TABLE test_ps_fetch; +RENAME TABLE test_ps_fetch TO tmp, tmp TO test_ps_fetch; SELECT seq_in_index, column_name, cardinality FROM information_schema.statistics WHERE table_name = 'test_ps_fetch' ORDER BY index_name, seq_in_index; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug,redundant.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,9 @@ -@@ -527,6 +527,6 @@ +@@ -576,7 +576,7 @@ FROM information_schema.global_status WHERE variable_name = 'innodb_instant_alter_column'; instants -37 +38 - SET 
GLOBAL innodb_stats_persistent = @save_stats_persistent; - # End of 10.6 tests + CREATE TABLE t1(f1 INT, f2 TEXT)ENGINE=InnoDB; + INSERT INTO t1 VALUES(1, 'a'); + ALTER TABLE t1 ADD COLUMN f3 TEXT FIRST; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug.result mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/instant_alter_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/instant_alter_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -575,5 +575,16 @@ WHERE variable_name = 'innodb_instant_alter_column'; instants 37 +CREATE TABLE t1(f1 INT, f2 TEXT)ENGINE=InnoDB; +INSERT INTO t1 VALUES(1, 'a'); +ALTER TABLE t1 ADD COLUMN f3 TEXT FIRST; +SET STATEMENT DEBUG_DBUG="+d,instant_insert_fail" FOR +ALTER TABLE t1 DROP COLUMN f1; +ERROR HY000: Internal error: InnoDB: Insert into SYS_COLUMNS failed +ALTER TABLE t1 DROP COLUMN f1; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +DROP TABLE t1; SET GLOBAL innodb_stats_persistent = @save_stats_persistent; # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/lock_isolation.result mariadb-10.11.13/mysql-test/suite/innodb/r/lock_isolation.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/lock_isolation.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/lock_isolation.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,3 +1,6 @@ +connect disable_purging,localhost,root; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; # # MDEV-26642 Weird SELECT view when a record is # modified to the same value by two transactions @@ -52,15 +55,17 @@ # MDEV-26643 Inconsistent behaviors of UPDATE under # READ UNCOMMITTED and READ COMMITTED isolation level # -CREATE TABLE t(a INT, b INT) ENGINE=InnoDB; +CREATE TABLE t(a INT, b INT) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t VALUES(NULL, 1), (2, 2); SET 
TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; BEGIN; UPDATE t SET a = 10; connection consistent; SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 20 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; connection consistent; SELECT * FROM t; @@ -74,8 +79,10 @@ UPDATE t SET a = 10; connection consistent; SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 20 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; connection consistent; SELECT * FROM t; @@ -89,8 +96,10 @@ UPDATE t SET a = 10; connection con_weird; SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 20 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; SELECT * FROM t; a b 10 1 @@ -113,8 +122,10 @@ connection consistent; SET TRANSACTION ISOLATION LEVEL READ COMMITTED; BEGIN; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; UPDATE t SET b = 2 WHERE a; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; UPDATE t SET a = 1; COMMIT; connection consistent; @@ -128,20 +139,25 @@ # # MDEV-33802 Weird read view after ROLLBACK of other transactions # -CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB; -INSERT INTO t SET a=1; -BEGIN; -INSERT INTO t SET a=2; +CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB STATS_PERSISTENT=0; connection consistent; START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +INSERT INTO t SET a=1; +connection consistent; SAVEPOINT sp1; SELECT * FROM t FORCE INDEX (b) FOR UPDATE; ERROR HY000: Record has changed since last read in table 't' SAVEPOINT sp1; +connection default; +BEGIN; +INSERT INTO t SET a=2; connection con_weird; START TRANSACTION WITH CONSISTENT SNAPSHOT; +SET 
DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; SELECT * FROM t FORCE INDEX (b) FOR UPDATE; connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; ROLLBACK; connection con_weird; a b @@ -149,12 +165,74 @@ SELECT * FROM t FORCE INDEX (b) FOR UPDATE; a b 1 NULL +COMMIT; disconnect con_weird; connection consistent; SELECT * FROM t FORCE INDEX (b) FOR UPDATE; a b 1 NULL +COMMIT; +connection default; +TRUNCATE TABLE t; +# +# MDEV-36639 innodb_snapshot_isolation=1 gives error for not comitted row changes +# +INSERT INTO t VALUES (1,1),(2,2); +connection default; +# Case 1: Transaction A modifies a record, transaction B with snapshot +# isolation level is blocked by A, then A is committed. +# Expected behaviour: B gets ER_CHECKREAD. +BEGIN; +UPDATE t SET b=3 WHERE a = 1; +connection consistent; +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t; +a b +1 1 +2 2 +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +SELECT * FROM t WHERE a=1 FOR UPDATE; +connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +COMMIT; +connection consistent; +ERROR HY000: Record has changed since last read in table 't' +# Case 2: Transaction A modifies a record, transaction B with snapshot +# isolation level is blocked by A, then A is rolled back. +# Expected behaviour: B continues execution. +connection default; +BEGIN; +UPDATE t SET b=4 WHERE a=1; +connection consistent; +BEGIN; +SELECT * FROM t; +a b +2 2 +1 3 +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +SELECT * FROM t WHERE a=1 FOR UPDATE; +connection default; +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +ROLLBACK; +connection consistent; +a b +1 3 +ROLLBACK; +# Case 3: Transaction B with snapshot isolation level started with +# consistent snapshot. Transaction A modifies a record and is committed. +# Both B tries to read modified by A record. +# Expected behavior: B gets ER_CHECKREAD. 
+connection consistent; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +UPDATE t SET b=4 WHERE a=1; +connection consistent; +SELECT * FROM t WHERE a=1 FOR UPDATE; +ERROR HY000: Record has changed since last read in table 't' disconnect consistent; +disconnect disable_purging; connection default; +SET DEBUG_SYNC="RESET"; DROP TABLE t; # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/lock_memory_debug.result mariadb-10.11.13/mysql-test/suite/innodb/r/lock_memory_debug.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/lock_memory_debug.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/lock_memory_debug.result 2025-05-19 16:14:24.000000000 +0000 @@ -5,7 +5,7 @@ CREATE TABLE t1 (col1 INT) ENGINE=InnoDB; INSERT INTO t1 VALUES (1),(2),(3),(4),(5); SET STATEMENT debug_dbug='+d,innodb_skip_lock_bitmap' FOR -INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g LIMIT 45000; +INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g; ERROR HY000: The total number of locks exceeds the lock table size SELECT COUNT(*) FROM t1; COUNT(*) diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/log_upgrade_101_flags.result mariadb-10.11.13/mysql-test/suite/innodb/r/log_upgrade_101_flags.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/log_upgrade_101_flags.result 2025-01-30 11:01:23.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/log_upgrade_101_flags.result 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,7 @@ call mtr.add_suppression("InnoDB: The change buffer is corrupted"); call mtr.add_suppression("InnoDB: Tablespace size stored in header is 768 pages, but the sum of data file sizes is 384 pages"); call mtr.add_suppression("InnoDB: adjusting FSP_SPACE_FLAGS of file"); -# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-force-recovery=5 --innodb-log-file-size=4m 
--innodb_page_size=32k --innodb_buffer_pool_size=10M +# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_upgrade --innodb-force-recovery=5 --innodb-log-file-size=4m --innodb_page_size=32k --innodb_buffer_pool_size=11M SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' AND support IN ('YES', 'DEFAULT', 'ENABLED'); diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure,32bit.rdiff 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,11 @@ +--- mem_pressure.result ++++ mem_pressure,32bit.result +@@ -11,7 +11,7 @@ + @@GLOBAL.innodb_buffer_pool_size_auto_min, + @@GLOBAL.innodb_buffer_pool_size_max; + @@GLOBAL.innodb_buffer_pool_size @@GLOBAL.innodb_buffer_pool_size_auto_min @@GLOBAL.innodb_buffer_pool_size_max +-17825792 16777216 25165824 ++17825792 16777216 18874368 + CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; + SET GLOBAL innodb_limit_optimistic_insert_debug=2; + SET STATEMENT unique_checks=0, foreign_key_checks=0 FOR diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure.result mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/mem_pressure.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/mem_pressure.result 2025-05-19 16:14:24.000000000 +0000 @@ -4,23 +4,34 @@ set @save_dbug=@@debug_dbug; set @save_limit=@@GLOBAL.innodb_limit_optimistic_insert_debug; set GLOBAL innodb_max_purge_lag_wait=0; +SET @innodb_buffer_pool_size= @@GLOBAL.innodb_buffer_pool_size; +SET @innodb_buffer_pool_size_min= @@GLOBAL.innodb_buffer_pool_size_auto_min; +SELECT +@@GLOBAL.innodb_buffer_pool_size, 
+@@GLOBAL.innodb_buffer_pool_size_auto_min, +@@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size @@GLOBAL.innodb_buffer_pool_size_auto_min @@GLOBAL.innodb_buffer_pool_size_max +17825792 16777216 25165824 CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; SET GLOBAL innodb_limit_optimistic_insert_debug=2; SET STATEMENT unique_checks=0, foreign_key_checks=0 FOR INSERT INTO t1 SELECT * FROM seq_1_to_1000; SET GLOBAL innodb_limit_optimistic_insert_debug=@save_limit; DROP TABLE t1; -SELECT CAST(VARIABLE_VALUE AS INTEGER) INTO @dirty_prev -FROM INFORMATION_SCHEMA.GLOBAL_STATUS -WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; -set debug_dbug="d,trigger_garbage_collection"; -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size; -FOUND 1 /[Mm]emory pressure.*/ in mysqld.1.err -SELECT CAST(VARIABLE_VALUE AS INTEGER) < @dirty_prev AS LESS_DIRTY_IS_GOOD -FROM INFORMATION_SCHEMA.GLOBAL_STATUS -WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; -LESS_DIRTY_IS_GOOD +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size=@innodb_buffer_pool_size; +FOUND 1 /Memory pressure event disregarded.*/ in mysqld.1.err +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size_auto_min= +CAST(@innodb_buffer_pool_size/2 AS UNSIGNED), +innodb_buffer_pool_size=@innodb_buffer_pool_size; +Warnings: +Warning 1292 Truncated incorrect innodb_buffer_pool_size_auto_min value: '8912896' +select @@global.innodb_buffer_pool_size < @innodb_buffer_pool_size; +@@global.innodb_buffer_pool_size < @innodb_buffer_pool_size 1 -FOUND 1 /InnoDB: Memory pressure event freed.*/ in mysqld.1.err +FOUND 1 /InnoDB: Memory pressure event shrunk.*/ in mysqld.1.err set debug_dbug=@save_dbug; +SET GLOBAL innodb_buffer_pool_size= @innodb_buffer_pool_size; +SET GLOBAL innodb_buffer_pool_size_auto_min=@innodb_buffer_pool_size_min; # End of 10.11 tests diff -Nru 
mariadb-10.11.11/mysql-test/suite/innodb/r/page_cleaner.result mariadb-10.11.13/mysql-test/suite/innodb/r/page_cleaner.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/page_cleaner.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/page_cleaner.result 2025-05-19 16:14:24.000000000 +0000 @@ -2,8 +2,21 @@ SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; SET GLOBAL innodb_max_dirty_pages_pct_lwm=0.0; SET GLOBAL innodb_max_dirty_pages_pct=0.0; +CREATE TABLE t(a INT) ENGINE=InnoDB STATS_PERSISTENT=0; +connect prevent_purge,localhost,root; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +SET GLOBAL innodb_max_purge_lag_wait=0; SET GLOBAL innodb_max_dirty_pages_pct=90.0; -CREATE TABLE t ENGINE=InnoDB SELECT * FROM seq_1_to_10000; +SELECT variable_value INTO @log_writes FROM information_schema.global_status +WHERE variable_name='innodb_log_writes'; +BEGIN; +ROLLBACK; +SELECT if(variable_value-@log_writes<500,'ok',variable_value-@log_writes) +FROM information_schema.global_status WHERE variable_name='innodb_log_writes'; +if(variable_value-@log_writes<500,'ok',variable_value-@log_writes) +ok +disconnect prevent_purge; SELECT variable_value>0 FROM information_schema.global_status WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; variable_value>0 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/recovery_memory.result mariadb-10.11.13/mysql-test/suite/innodb/r/recovery_memory.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/recovery_memory.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/recovery_memory.result 2025-05-19 16:14:24.000000000 +0000 @@ -12,7 +12,7 @@ connect con1,localhost,root,,,; CALL dorepeat(); connection default; -# restart: --innodb_buffer_pool_size=5242880 +# restart: --innodb_buffer_pool_size=6m DROP TABLE t1; DROP PROCEDURE dorepeat; # diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,16k.rdiff 
mariadb-10.11.13/mysql-test/suite/innodb/r/restart,16k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,16k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,16k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '5242879' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 5242880 for innodb_page_size=16384 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '5242879' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,32k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,32k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,32k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,32k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '10485759' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 10485760 for innodb_page_size=32768 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '10485759' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,4k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,4k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,4k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,4k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '2097151' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 2097152 for innodb_page_size=4096 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '2097151' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,64k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,64k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,64k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,64k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '20971519' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 20971520 for innodb_page_size=65536 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '20971519' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart,8k.rdiff mariadb-10.11.13/mysql-test/suite/innodb/r/restart,8k.rdiff --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart,8k.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart,8k.rdiff 1970-01-01 00:00:00.000000000 +0000 @@ -1,16 +0,0 @@ ---- ./suite/innodb/r/restart.result -+++ suite/innodb/r/restart.reject -@@ -32,10 +32,10 @@ - SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; - SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); --ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of '3145727' - SHOW WARNINGS; - Level Code Message --Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE --Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -+Warning 1210 innodb_buffer_pool_size must be at least 3145728 for innodb_page_size=8192 -+Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of '3145727' - EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/restart.result mariadb-10.11.13/mysql-test/suite/innodb/r/restart.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/restart.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/restart.result 2025-05-19 16:14:24.000000000 +0000 @@ -30,19 +30,6 @@ a DROP TABLE tr,tc,td; # -# MDEV-27467 innodb to enfore the minimum innodb_buffer_pool_size in SET (resize) the same as startup -# -SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; -SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); -ERROR 42000: Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -SHOW WARNINGS; -Level Code Message -Warning 1210 innodb_buffer_pool_size must be at least MIN_VAL for innodb_page_size=PAGE_SIZE -Error 1231 Variable 'innodb_buffer_pool_size' can't be set to the value of 'WRONG_VALUE' -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size); -SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; -# # MDEV-27882 Innodb - recognise MySQL-8.0 innodb flags and give a specific error message # FOUND 1 /InnoDB: MySQL-8\.0 tablespace in \./ibdata1/ in attempted_start.err diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/stat_tables.result mariadb-10.11.13/mysql-test/suite/innodb/r/stat_tables.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/stat_tables.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/stat_tables.result 2025-05-19 16:14:24.000000000 +0000 @@ -101,3 +101,13 @@ CREATE TABLE t1 (c1 INT) ENGINE=InnoDB STATS_PERSISTENT 1; DROP TABLE t1; # End of 10.6 tests +# +# MDEV-36373 Warning: ... 
persistent statistics storage is corrupted +# +CREATE TABLE t1 (c INT) ENGINE=InnoDB; +SET STATEMENT tx_read_only=1 FOR ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +DROP TABLE t1; +# End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/r/stats_persistent.result mariadb-10.11.13/mysql-test/suite/innodb/r/stats_persistent.result --- mariadb-10.11.11/mysql-test/suite/innodb/r/stats_persistent.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/r/stats_persistent.result 2025-05-19 16:14:24.000000000 +0000 @@ -17,3 +17,13 @@ test.t1 analyze status OK SET DEBUG_SYNC= 'RESET'; DROP TABLE t1; +# +# MDEV-36649 dict_acquire_mdl_shared() aborts when table +# mode is DICT_TABLE_OP_OPEN_ONLY_IF_CACHED +# +set @old_defragment_stats_accuracy= @@innodb_defragment_stats_accuracy; +SET GLOBAL innodb_defragment_stats_accuracy=1; +CREATE TABLE t (a INT ) ENGINE=INNODB; +INSERT INTO t SELECT * FROM seq_1_to_1000; +DROP TABLE t; +set global innodb_defragment_stats_accuracy= @old_defragment_stats_accuracy; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/alter_copy_bulk.test mariadb-10.11.13/mysql-test/suite/innodb/t/alter_copy_bulk.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/alter_copy_bulk.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/alter_copy_bulk.test 2025-05-19 16:14:24.000000000 +0000 @@ -109,3 +109,24 @@ ALTER TABLE t1 FORCE, ALGORITHM=COPY; DROP TABLE t1; SET GLOBAL innodb_stats_persistent=@default_stats_persistent; + +--echo # +--echo # MDEV-36504 Memory leak after insert into empty table +--echo # +CREATE TABLE t1 (k INT PRIMARY KEY)ENGINE=InnoDB; +INSERT INTO t1 SET k= 1; +START TRANSACTION; +INSERT INTO t1 SET k= 2; +SELECT COUNT(*) > 0 FROM mysql.innodb_index_stats LOCK IN SHARE MODE; + +connect(con1,localhost,root,,,); +SET innodb_lock_wait_timeout=0; +--error 
ER_LOCK_WAIT_TIMEOUT +CREATE TABLE t2(f1 INT DEFAULT 1 PRIMARY KEY) + STATS_PERSISTENT= 1 ENGINE=InnoDB as SELECT k FROM t1; +disconnect con1; +connection default; +SET innodb_lock_wait_timeout=default; +DROP TABLE t1; +DROP TABLE IF EXISTS t2; +--source include/restart_mysqld.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/alter_partitioned_debug.test mariadb-10.11.13/mysql-test/suite/innodb/t/alter_partitioned_debug.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/alter_partitioned_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/alter_partitioned_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -4,6 +4,7 @@ --source include/have_debug_sync.inc CREATE TABLE t1 (a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=0 PARTITION BY RANGE(a) (PARTITION pa VALUES LESS THAN (3), PARTITION pb VALUES LESS THAN (5)); @@ -26,9 +27,46 @@ connection default; DELETE FROM t1; -disconnect ddl; SET DEBUG_SYNC = 'RESET'; CHECK TABLE t1; -DROP TABLE t1; + +CREATE TABLE t(a INT, b VARCHAR(10)) ENGINE=InnoDB +STATS_PERSISTENT=1 STATS_AUTO_RECALC=1; +RENAME TABLE t TO u; +DELETE FROM mysql.innodb_table_stats WHERE table_name='u'; +DELETE FROM mysql.innodb_index_stats WHERE table_name='u'; + +send SET STATEMENT debug_dbug='+d,dict_stats_save_exit_notify_and_wait' FOR +SELECT * FROM u; + +connection ddl; +SET DEBUG_SYNC='open_tables_after_open_and_process_table +WAIT_FOR dict_stats_save_finished'; +send ALTER TABLE t1 EXCHANGE PARTITION pb WITH TABLE u; + +connect sync,localhost,root; +let $wait_condition= + select count(*) = 1 from information_schema.processlist + where state = 'debug sync point: now' + and info like 'SET STATEMENT debug_dbug%SELECT * FROM u'; +--source include/wait_condition.inc +let $wait_condition= + select count(*) = 1 from information_schema.processlist + where state = 'Waiting for table metadata lock' + and info like 'ALTER TABLE t1 EXCHANGE PARTITION pb WITH TABLE u'; +--source 
include/wait_condition.inc +SET DEBUG_SYNC='now SIGNAL dict_stats_save_unblock'; +disconnect sync; + +connection default; +reap; +connection ddl; +reap; +disconnect ddl; +connection default; +SELECT * FROM u; +SET DEBUG_SYNC = 'RESET'; + +DROP TABLE t1,u; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/autoinc_persist.test mariadb-10.11.13/mysql-test/suite/innodb/t/autoinc_persist.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/autoinc_persist.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/autoinc_persist.test 2025-05-19 16:14:24.000000000 +0000 @@ -95,15 +95,25 @@ SELECT * FROM t10; eval CREATE TABLE t11(a FLOAT $AUTO_INCREMENT_KEY_a) ENGINE = InnoDB; -INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t11 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t11; +eval CREATE TABLE t11u(a FLOAT UNSIGNED $AUTO_INCREMENT_KEY_a) ENGINE = InnoDB; +--error ER_WARN_DATA_OUT_OF_RANGE +INSERT INTO t11u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +INSERT INTO t11u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t11u; + eval CREATE TABLE t12(a DOUBLE $AUTO_INCREMENT_KEY_a) ENGINE = InnoDB; -INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), -(20), (30), (31); +INSERT INTO t12 VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); SELECT * FROM t12; +CREATE TABLE t12u(a DOUBLE UNSIGNED AUTO_INCREMENT KEY) ENGINE = InnoDB; +--error ER_WARN_DATA_OUT_OF_RANGE +INSERT INTO t12u VALUES(0), (0), (0), (0), (-1), (-10), (0), (20), (30), (31); +INSERT INTO t12u VALUES(0), (0), (0), (0), (0), (20), (30), (31); +SELECT * FROM t12u; + --echo # Scenario 1: Normal restart, to test if the counters are persisted --echo # Scenario 2: Delete some values, to test the counters should not be the --echo # one which is the largest in current table @@ -566,4 +576,5 @@ SELECT MAX(b) AS `Expect 4` FROM t33; SELECT * FROM t33; -DROP TABLE 
t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t30, t32, t33; +DROP TABLE t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t11u, t12u, +t30, t32, t33; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-buffer-pool-size=8m --innodb-buffer-pool-chunk-size=1m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.test mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/buf_pool_resize_oom.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/buf_pool_resize_oom.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,27 +0,0 @@ ---source include/have_innodb.inc ---source include/have_debug.inc ---source include/not_embedded.inc - ---echo # ---echo # Bug #21348684 SIGABRT DURING RESIZING THE INNODB BUFFER POOL ---echo # ONLINE WITH MEMORY FULL CONDITION ---echo # - -call mtr.add_suppression("InnoDB: failed to allocate the chunk array"); - -SET GLOBAL debug_dbug='+d,buf_pool_resize_chunk_null'; - ---disable_warnings -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size + 1048576; ---enable_warnings - -let $wait_timeout = 60; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 27) = 'Resizing buffer pool failed' - FROM information_schema.global_status - WHERE variable_name = 'INNODB_BUFFER_POOL_RESIZE_STATUS'; - ---source include/wait_condition.inc -# Restart the server, because the buffer pool would not necessarily be -# shrunk afterwards even if we request it. 
---source include/restart_mysqld.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.combinations mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.combinations --- mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.combinations 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.combinations 2025-05-19 16:14:24.000000000 +0000 @@ -1,7 +1,9 @@ [strict_crc32] --innodb-checksum-algorithm=strict_crc32 --innodb-use-atomic-writes=0 +--innodb-undo-tablespaces=0 [strict_full_crc32] --innodb-checksum-algorithm=strict_full_crc32 --innodb-use-atomic-writes=0 +--innodb-undo-tablespaces=0 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.test mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/doublewrite.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/doublewrite.test 2025-05-19 16:14:24.000000000 +0000 @@ -42,10 +42,17 @@ SET GLOBAL innodb_fast_shutdown = 0; let $shutdown_timeout=; --source include/restart_mysqld.inc +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +let $wait_condition = +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; +--source include/wait_condition.inc +SET GLOBAL innodb_max_dirty_pages_pct=99; --source ../include/no_checkpoint_start.inc connect (dml,localhost,root,,); XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); +insert into t1 values(6, repeat('%', @@innodb_page_size/2)); XA END 'x'; XA PREPARE 'x'; disconnect dml; @@ -53,10 +60,12 @@ flush table t1 for export; -let $restart_parameters=; ---let CLEANUP_IF_CHECKPOINT=XA COMMIT 'x';drop table t1; +--let CLEANUP_IF_CHECKPOINT=drop table t1, unexpected_checkpoint; --source ../include/no_checkpoint_end.inc +--copy_file $MYSQLD_DATADIR/ibdata1 $MYSQLD_DATADIR/ibdata1.bak +--copy_file 
$MYSQLD_DATADIR/ib_logfile0 $MYSQLD_DATADIR/ib_logfile0.bak + perl; use IO::Handle; do "$ENV{MTR_SUITE_DIR}/include/crc32.pl"; @@ -145,6 +154,12 @@ --source include/shutdown_mysqld.inc let $shutdown_timeout=; # Corrupt the file in a better way. + +--remove_file $MYSQLD_DATADIR/ibdata1 +--remove_file $MYSQLD_DATADIR/ib_logfile0 +--move_file $MYSQLD_DATADIR/ibdata1.bak $MYSQLD_DATADIR/ibdata1 +--move_file $MYSQLD_DATADIR/ib_logfile0.bak $MYSQLD_DATADIR/ib_logfile0 + perl; use IO::Handle; my $fname= "$ENV{'MYSQLD_DATADIR'}test/t1.ibd"; @@ -157,22 +172,23 @@ close FILE; EOF --source include/start_mysqld.inc -XA ROLLBACK 'x'; check table t1; select f1, f2 from t1; +SET GLOBAL innodb_max_dirty_pages_pct_lwm=0,innodb_max_dirty_pages_pct=0; +let $wait_condition = +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; +--source include/wait_condition.inc +SET GLOBAL innodb_max_dirty_pages_pct=99; --source ../include/no_checkpoint_start.inc -connect (dml,localhost,root,,); -XA START 'x'; -insert into t1 values (6, repeat('%', @@innodb_page_size/2)); -XA END 'x'; -XA PREPARE 'x'; -disconnect dml; -connection default; - -flush table t1 for export; +XA ROLLBACK 'x'; +FLUSH TABLE t1 FOR EXPORT; -let $restart_parameters=; +# If we are skipping the test at this point due to an unexpected +# checkpoint, we will already have tested a part of this functionality. 
+--let CLEANUP_IF_CHECKPOINT=drop table t1; --source ../include/no_checkpoint_end.inc # Zero out the first page in file and try to recover from dblwr @@ -186,7 +202,6 @@ --source include/start_mysqld.inc let SEARCH_PATTERN=InnoDB: Recovered page \\[page id: space=[1-9][0-9]*, page number=[03]\\]; --source include/search_pattern_in_file.inc -XA ROLLBACK 'x'; check table t1; select f1, f2 from t1; drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/foreign_key.test mariadb-10.11.13/mysql-test/suite/innodb/t/foreign_key.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/foreign_key.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/foreign_key.test 2025-05-19 16:14:24.000000000 +0000 @@ -133,7 +133,6 @@ --let $shutdown_timeout= disconnect incomplete; -SET @save_stats_persistent = @@GLOBAL.innodb_stats_persistent; SET GLOBAL innodb_stats_persistent = 0; INSERT INTO child SET a=0; @@ -1245,6 +1244,33 @@ DELETE FROM t1; DROP TABLE t2, t1; +--echo # +--echo # MDEV-33167 ASAN errors after failing to load foreign key +--echo # relation for the table +--echo # +call mtr.add_suppression("InnoDB: Load table `test`.`t3` failed, the table has missing foreign key indexes. Turn off 'foreign_key_checks' and try again."); +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t1(f1 VARCHAR(8), + FOREIGN KEY(f1) REFERENCES test.t3(f1))ENGINE=InnoDB; + +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t2(f1 VARCHAR(8), + FOREIGN KEY(f1) REFERENCES test.t3(f1)) + ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; + +SET STATEMENT FOREIGN_KEY_CHECKS = 0 FOR +CREATE TABLE t3(f1 VARCHAR(8) PRIMARY KEY) + ENGINE=InnoDB DEFAULT CHARSET=latin1; + +set GLOBAL innodb_fast_shutdown=0; +--let $shutdown_timeout= +--source include/restart_mysqld.inc +# Error encountered while loading the foreign key +# constraint for t3. 
t1 wasn't loaded into memory yet +# t2 failed to find index for foreign key relation +ALTER TABLE t2 FORCE; +DROP TABLE t2, t1, t3; + --echo # End of 10.6 tests CREATE TABLE t1 @@ -1270,7 +1296,5 @@ ALTER TABLE t2 ADD FOREIGN KEY (f2) REFERENCES t2 (f2), ADD UNIQUE INDEX(f3); drop table t1, t2; -SET GLOBAL innodb_stats_persistent = @save_stats_persistent; --echo # End of 10.11 tests ---source include/wait_until_count_sessions.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,5 @@ --loose-innodb-sort-buffer-size=64k --loose-innodb-online-alter-log-max-size=128k ---loose-innodb-buffer-pool-size=5M +--loose-innodb-buffer-pool-size=6M --loose-innodb-sys-indexes --loose-innodb-sys-fields diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-index-online.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-index-online.test 2025-05-19 16:14:24.000000000 +0000 @@ -510,12 +510,35 @@ connection con1; reap; -disconnect con1; connection default; SELECT * FROM t1; CHECK TABLE t1; DROP TABLE t1; + +--echo # +--echo # MDEV-36281 DML aborts during online virtual index +--echo # +CREATE TABLE t1(f1 INT NOT NULL PRIMARY KEY, f2 INT NOT NULL, + f3 INT NOT NULL, f4 INT AS (f3) VIRTUAL, + f5 INT AS (f1) VIRTUAL, INDEX(f4))ENGINE=InnoDB; +INSERT INTO t1(f1, f2, f3) VALUES(1, 2, 3); +SET DEBUG_SYNC = 'innodb_inplace_alter_table_enter SIGNAL dml_start WAIT_FOR dml_finish'; +send ALTER TABLE t1 ADD INDEX v1(f5, f2, f4), ADD INDEX v2(f3, f5); + +connection con1; +set DEBUG_SYNC="now 
WAIT_FOR dml_start"; +UPDATE t1 SET f3= f3 + 1; +set DEBUG_SYNC="now SIGNAL dml_finish"; + +disconnect con1; +connection default; +reap; +CHECK TABLE t1 EXTENDED; +SELECT f5, f2, f4 FROM t1 USE INDEX(v1); +SELECT f3, f5 FROM t1 USE INDEX(v2); +DROP TABLE t1; + SET DEBUG_SYNC = 'RESET'; # Check that all connections opened by test cases in this file are really diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-table-online-master.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-table-online-master.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb-table-online-master.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb-table-online-master.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---innodb-sort-buffer-size=64k --innodb-online-alter-log-max-size=512k --innodb-buffer-pool-size=5M +--innodb-sort-buffer-size=64k --innodb-online-alter-log-max-size=512k --innodb-buffer-pool-size=6M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--innodb-buffer-pool-size-max=16m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_fail.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,6 +1,6 @@ --source include/have_innodb.inc --source include/have_debug.inc -call mtr.add_suppression("InnoDB: Cannot allocate memory for the buffer pool"); +call mtr.add_suppression("InnoDB: Cannot map innodb_buffer_pool_size_max="); 
call mtr.add_suppression("InnoDB: Plugin initialization aborted at srv0start.cc.*"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error."); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed."); @@ -10,5 +10,5 @@ let restart_parameters=--debug_dbug=+d,ib_buf_chunk_init_fails; --source include/restart_mysqld.inc let SEARCH_FILE = $MYSQLTEST_VARDIR/log/mysqld.1.err; -let SEARCH_PATTERN=\[ERROR\] InnoDB: Cannot allocate memory for the buffer pool; +let SEARCH_PATTERN=\[ERROR\] InnoDB: Cannot map innodb_buffer_pool_size_max=16m; --source include/search_pattern_in_file.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,2 +1,3 @@ --innodb-buffer-pool-size=8M +--innodb-buffer-pool-size-max=25M --innodb-page-size=4k diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,17 +1,13 @@ -# -# WL6117 : Resize the InnoDB Buffer Pool Online -# - --source include/have_innodb.inc ---source include/big_test.inc +--source include/have_sequence.inc -let $wait_timeout = 180; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; +--echo # +--echo # MDEV-29445: Reorganize buffer pool (and remove chunks) +--echo # 
--disable_query_log +call mtr.add_suppression("InnoDB: Over 67 percent of the buffer pool is occupied by lock heaps"); +call mtr.add_suppression("innodb_buffer_pool_size change aborted"); set @old_innodb_buffer_pool_size = @@innodb_buffer_pool_size; set @old_innodb_adaptive_hash_index = @@innodb_adaptive_hash_index; --enable_query_log @@ -21,52 +17,63 @@ select @@innodb_buffer_pool_size; # Expand buffer pool +set global innodb_buffer_pool_size = 9437184; set global innodb_buffer_pool_size = 10485760; ---source include/wait_condition.inc - select @@innodb_buffer_pool_size; +let $kbs=`SELECT CAST(@@innodb_page_size / 1024 AS INT)`; # fill buffer pool --disable_query_log SET @save_innodb_read_only_compressed=@@GLOBAL.innodb_read_only_compressed; SET GLOBAL innodb_read_only_compressed=OFF; --enable_query_log -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view view0 as select 1 union all select 1; +create table t1 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED; +evalp create table t2 (id int primary key, val int not null) +ENGINE=InnoDB ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=$kbs; + +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t1 SELECT seq*4,seq*4 FROM seq_1_to_262144; +SET STATEMENT foreign_key_checks=0, unique_checks=0 FOR +INSERT INTO t2 SELECT seq*4,seq*4 FROM seq_1_to_16384; -set @`v_id` := 0; -set @`v_val` := 0; - -# 2^18 == 262144 records -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; --disable_query_log SET GLOBAL innodb_read_only_compressed=@save_innodb_read_only_compressed; --enable_query_log -# Shrink buffer pool -set global innodb_buffer_pool_size = 
64 * 1024 * 1024 + 512 * 1024; ---source include/wait_condition.inc - -select @@innodb_buffer_pool_size; +# Attempt to shrink the buffer pool. This may occasionally fail. +--error 0,ER_WRONG_USAGE +set global innodb_buffer_pool_size = 7340032; select count(val) from t1; +select count(val) from t2; set global innodb_adaptive_hash_index=OFF; -# Expand buffer pool to 24MB -set global innodb_buffer_pool_size = 25165824; ---source include/wait_condition.inc +# Expand buffer pool to 23 and then 24 MiB (requesting 25 MiB) +set global innodb_buffer_pool_size = 24117248; +set global innodb_buffer_pool_size = 26214400; select @@innodb_buffer_pool_size; select count(val) from t1; +select count(val) from t2; -drop table t1; -drop view view0; +drop table t1,t2; ---disable_query_log -set global innodb_adaptive_hash_index = @old_innodb_adaptive_hash_index; -set global innodb_buffer_pool_size = @old_innodb_buffer_pool_size; ---enable_query_log +SET GLOBAL innodb_max_purge_lag_wait = 0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; + +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; +let $wait_condition = +SELECT variable_value = 0 +FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; --source include/wait_condition.inc +SET GLOBAL innodb_buffer_pool_size = @old_innodb_buffer_pool_size; +SET GLOBAL innodb_adaptive_hash_index = @old_innodb_adaptive_hash_index; +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ ---innodb-buffer-pool-chunk-size=1M ---loose-skip-innodb-disable-resize_buffer_pool_debug diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_bigtest.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ ---source include/have_innodb.inc ---source include/big_test.inc - -SET @save_size=@@innodb_buffer_pool_size; - -let $wait_timeout = 60; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE variable_name = 'INNODB_BUFFER_POOL_RESIZE_STATUS'; - ---echo # ---echo # MDEV-27891: Delayed SIGSEGV in InnoDB buffer pool resize ---echo # after or during DROP TABLE ---echo # - -select @@innodb_buffer_pool_chunk_size; -CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; -SET GLOBAL innodb_buffer_pool_size=256*1024*1024; -DROP TABLE t1; ---source include/wait_condition.inc -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size + @@innodb_buffer_pool_chunk_size; ---source include/wait_condition.inc - ---echo # End of 10.6 tests - -SET GLOBAL innodb_buffer_pool_size=@save_size; ---source include/wait_condition.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_debug.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ 
---innodb-buffer-pool-size=8M --innodb-buffer-pool-chunk-size=2M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1 @@ +--innodb-buffer-pool-size-max=16m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_temporary.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,24 +1,43 @@ --source include/have_innodb.inc --source include/have_sequence.inc --source include/have_debug.inc +--source include/have_debug_sync.inc SET @save_limit=@@GLOBAL.innodb_limit_optimistic_insert_debug; SET @save_size=@@GLOBAL.innodb_buffer_pool_size; SET GLOBAL innodb_limit_optimistic_insert_debug=2; - SET GLOBAL innodb_buffer_pool_size=16777216; CREATE TEMPORARY TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t1 SELECT seq FROM seq_1_to_200; -SET GLOBAL innodb_buffer_pool_size=8388608; +# Flush the buffer pool to prevent +# "innodb_buffer_pool_size change aborted" error with ./mtr --repeat=3 +SET GLOBAL innodb_max_purge_lag_wait=0; +SET @save_pct= @@GLOBAL.innodb_max_dirty_pages_pct; +SET @save_pct_lwm= @@GLOBAL.innodb_max_dirty_pages_pct_lwm; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = 0.0; +SET GLOBAL innodb_max_dirty_pages_pct = 0.0; + +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +connect con1,localhost,root; +SET DEBUG_SYNC='buf_pool_shrink_before_wakeup SIGNAL blocked WAIT_FOR go'; 
+send SET GLOBAL innodb_buffer_pool_size=8388608; +connection default; +SET DEBUG_SYNC='now WAIT_FOR blocked'; +# adjust for 32-bit and SUX_LOCK_GENERIC +--replace_regex /(5..)\/\1/505\/505/ +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +SET DEBUG_SYNC='now SIGNAL go'; +connection con1; +reap; +disconnect con1; +connection default; +SHOW STATUS LIKE 'innodb_buffer_pool_resize_status'; +SET DEBUG_SYNC=RESET; -let $wait_timeout = 60; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE variable_name = 'INNODB_BUFFER_POOL_RESIZE_STATUS'; ---source include/wait_condition.inc +SET GLOBAL innodb_max_dirty_pages_pct = @save_pct; +SET GLOBAL innodb_max_dirty_pages_pct_lwm = @save_pct_lwm; SELECT COUNT(*),MIN(a),MAX(a) FROM t1; DROP TEMPORARY TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1,3 +0,0 @@ ---innodb-buffer-pool-size=16M ---innodb-buffer-pool-chunk-size=4M ---innodb-page-size=4k diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_buffer_pool_resize_with_chunks.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,61 +0,0 @@ -# -# WL6117 : Resize the InnoDB Buffer Pool Online -# (innodb_buffer_pool_chunk_size used case) -# - ---source include/have_innodb.inc ---source 
include/big_test.inc - -let $wait_timeout = 180; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; - ---disable_query_log -set @old_innodb_buffer_pool_size = @@innodb_buffer_pool_size; ---enable_query_log - -select @@innodb_buffer_pool_chunk_size; - -# fill buffer pool ---disable_query_log -SET @save_innodb_read_only_compressed=@@GLOBAL.innodb_read_only_compressed; -SET GLOBAL innodb_read_only_compressed=OFF; ---enable_query_log -create table t1 (id int not null, val int not null default '0', primary key (id)) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; -create or replace view view0 as select 1 union all select 1; - -set @`v_id` := 0; -set @`v_val` := 0; - -# 2^18 == 262144 records -replace into t1 select (@`v_id` := (@`v_id` + 4) mod 4294967296) as id, (@`v_val` := (@`v_val` + 4) mod 4294967296) as val from view0 v0, view0 v1, view0 v2, view0 v3, view0 v4, view0 v5, view0 v6, view0 v7, view0 v8, view0 v9, view0 v10, view0 v11, view0 v12, view0 v13, view0 v14, view0 v15, view0 v16, view0 v17; ---disable_query_log -SET GLOBAL innodb_read_only_compressed=@save_innodb_read_only_compressed; ---enable_query_log - -# Shrink buffer pool to 7MB -set global innodb_buffer_pool_size = 7340032; ---source include/wait_condition.inc - -select count(val) from t1; - -# Expand buffer pool to 16MB -set global innodb_buffer_pool_size = 16777216; ---source include/wait_condition.inc - -select count(val) from t1; - -drop table t1; -drop view view0; - -# Try to shrink buffer pool to smaller than chunk size -set global innodb_buffer_pool_size = 2*1048576; ---source include/wait_condition.inc -select @@innodb_buffer_pool_size; - ---disable_query_log -set global innodb_buffer_pool_size = @old_innodb_buffer_pool_size; ---enable_query_log ---source include/wait_condition.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_bug52663.test 
mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_bug52663.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_bug52663.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_bug52663.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,5 +1,7 @@ --source include/have_innodb.inc +SET @save_innodb_timeout=@@innodb_lock_wait_timeout; +SET GLOBAL innodb_lock_wait_timeout=1; set session transaction isolation level read committed; create table innodb_bug52663 (what varchar(5), id integer, count integer, primary key @@ -8,7 +10,6 @@ begin; connect (addconroot, localhost, root,,); -connection addconroot; set session transaction isolation level read committed; begin; @@ -32,3 +33,4 @@ connection default; select * from innodb_bug52663; drop table innodb_bug52663; +SET GLOBAL innodb_lock_wait_timeout=@save_innodb_timeout; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_row_lock_time_ms.test 2025-05-19 16:14:24.000000000 +0000 @@ -5,11 +5,26 @@ INSERT INTO t VALUES (1); -SET GLOBAL innodb_monitor_reset = "module_innodb"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_disable="lock_row_lock_time_max"; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time'; +SET GLOBAL innodb_monitor_reset_all='lock_row_lock_time_max'; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time"; +SET GLOBAL innodb_monitor_enable="lock_row_lock_time_max"; BEGIN; SELECT * FROM t FOR UPDATE; +# We can't predict (innodb/lock)_row_lock_time_avg value, because it's counted +# as the whole waiting time divided by the amount of waits. The +# corresponding counters in lock_sys can't be reset with any query. 
+ +--disable_result_log +SELECT @innodb_row_lock_time_before := variable_value + FROM information_schema.global_status + WHERE LOWER(variable_name) = 'innodb_row_lock_time'; +--enable_result_log + --connect(con1,localhost,root,,) SET innodb_lock_wait_timeout = 1; --error ER_LOCK_WAIT_TIMEOUT @@ -19,24 +34,28 @@ --connection default COMMIT; -SELECT variable_value > 100 FROM information_schema.global_status +SELECT variable_value - @innodb_row_lock_time_before > 100 + FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time'; -SELECT variable_value > 100 FROM information_schema.global_status +# We can't use 'variable_value - @innodb_row_lock_time_max_before' trick for +# innodb_row_lock_time_max, because we can't reset it, and we don't know the +# initial value at the moment of the test execution. +SELECT variable_value > 100 + FROM information_schema.global_status WHERE LOWER(variable_name) = 'innodb_row_lock_time_max'; -SELECT variable_value > 100 FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_row_lock_time_avg'; - -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS - WHERE NAME="lock_row_lock_time"; -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS - WHERE NAME="lock_row_lock_time_max"; -SELECT count_reset > 100 FROM INFORMATION_SCHEMA.INNODB_METRICS - WHERE NAME="lock_row_lock_time_avg"; +SELECT count_reset > 100 + FROM INFORMATION_SCHEMA.INNODB_METRICS + WHERE NAME='lock_row_lock_time'; +SELECT count_reset > 100 + FROM INFORMATION_SCHEMA.INNODB_METRICS + WHERE NAME='lock_row_lock_time_max'; DROP TABLE t; --disable_warnings -SET GLOBAL innodb_monitor_reset=default; +SET GLOBAL innodb_monitor_enable=default; +SET GLOBAL innodb_monitor_disable=default; +SET GLOBAL innodb_monitor_reset_all=default; --enable_warnings --source include/wait_until_count_sessions.inc diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test 
mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_auto_recalc_on_nonexistent.test 2025-05-19 16:14:24.000000000 +0000 @@ -17,9 +17,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open and close the table SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; @@ -27,7 +25,8 @@ -- eval $check_stats1 -- eval $check_stats2 -# open the table, causing stats recalc/save +# rename and open the table, causing stats recalc/save +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; -- eval $check_stats1 @@ -43,9 +42,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open and close the table SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; @@ -53,7 +50,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open the table, causing stats recalc/save +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; -- eval $check_stats1 @@ -69,9 +66,7 @@ -- eval $check_stats1 -- eval $check_stats2 -# open and close the table SELECT * FROM t; -FLUSH TABLE t; DELETE FROM mysql.innodb_index_stats WHERE table_name = 't'; DELETE FROM mysql.innodb_table_stats WHERE table_name = 't'; @@ -79,7 +74,8 @@ -- eval $check_stats1 -- eval $check_stats2 -# open the table, stats should not be present, since autorecalc is disabled +# rename the table, stats should not be present, since autorecalc is disabled +RENAME TABLE t TO tmp, tmp TO t; SELECT * FROM t; -- eval $check_stats1 diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_fetch.test mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_fetch.test --- 
mariadb-10.11.11/mysql-test/suite/innodb/t/innodb_stats_fetch.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/innodb_stats_fetch.test 2025-05-19 16:14:24.000000000 +0000 @@ -69,7 +69,7 @@ index_name = 'idx' AND stat_name = 'n_diff_pfx02'; -FLUSH TABLE test_ps_fetch; +RENAME TABLE test_ps_fetch TO tmp, tmp TO test_ps_fetch; SELECT seq_in_index, column_name, cardinality FROM information_schema.statistics WHERE table_name = 'test_ps_fetch' diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/instant_alter_debug.test mariadb-10.11.13/mysql-test/suite/innodb/t/instant_alter_debug.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/instant_alter_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/instant_alter_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -657,11 +657,19 @@ SET DEBUG_SYNC=RESET; --echo # End of 10.5 tests - SELECT variable_value-@old_instant instants FROM information_schema.global_status WHERE variable_name = 'innodb_instant_alter_column'; -SET GLOBAL innodb_stats_persistent = @save_stats_persistent; +CREATE TABLE t1(f1 INT, f2 TEXT)ENGINE=InnoDB; +INSERT INTO t1 VALUES(1, 'a'); +ALTER TABLE t1 ADD COLUMN f3 TEXT FIRST; +--error ER_INTERNAL_ERROR +SET STATEMENT DEBUG_DBUG="+d,instant_insert_fail" FOR +ALTER TABLE t1 DROP COLUMN f1; +ALTER TABLE t1 DROP COLUMN f1; +CHECK TABLE t1; +DROP TABLE t1; +SET GLOBAL innodb_stats_persistent = @save_stats_persistent; --echo # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/lock_isolation.test mariadb-10.11.13/mysql-test/suite/innodb/t/lock_isolation.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/lock_isolation.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/lock_isolation.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,9 +1,16 @@ --source include/have_innodb.inc +--source include/count_sessions.inc +--source include/have_debug.inc +--source 
include/have_debug_sync.inc --disable_query_log call mtr.add_suppression("InnoDB: Transaction was aborted due to "); --enable_query_log +--connect disable_purging,localhost,root +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default --echo # --echo # MDEV-26642 Weird SELECT view when a record is --echo # modified to the same value by two transactions @@ -41,22 +48,18 @@ --echo # READ UNCOMMITTED and READ COMMITTED isolation level --echo # -CREATE TABLE t(a INT, b INT) ENGINE=InnoDB; +CREATE TABLE t(a INT, b INT) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t VALUES(NULL, 1), (2, 2); SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; BEGIN; UPDATE t SET a = 10; --connection consistent SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; --send UPDATE t SET b = 20 WHERE a --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Updating' - and info = 'UPDATE t SET b = 20 WHERE a'; ---source include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; --connection consistent @@ -70,14 +73,11 @@ --connection consistent SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; --send UPDATE t SET b = 20 WHERE a --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where info = 'UPDATE t SET b = 20 WHERE a'; ---source include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; COMMIT; --connection consistent @@ -91,15 +91,11 @@ --connection con_weird SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; send UPDATE t SET b = 20 WHERE a; --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Updating' - and info = 'UPDATE t SET b = 20 WHERE a'; ---source 
include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; SELECT * FROM t; COMMIT; @@ -123,14 +119,11 @@ BEGIN; # As semi-consistent read is disabled for innodb_snapshot_isolation=ON, the # following UPDATE must be blocked on the first record. +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; --send UPDATE t SET b = 2 WHERE a --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Updating' and info = 'UPDATE t SET b = 2 WHERE a'; ---source include/wait_condition.inc - +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; UPDATE t SET a = 1; COMMIT; --connection consistent @@ -149,13 +142,15 @@ --echo # MDEV-33802 Weird read view after ROLLBACK of other transactions --echo # -CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB; -INSERT INTO t SET a=1; - -BEGIN; INSERT INTO t SET a=2; +CREATE TABLE t(a INT PRIMARY KEY, b INT UNIQUE) ENGINE=InnoDB STATS_PERSISTENT=0; --connection consistent START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default +INSERT INTO t SET a=1; + +--connection consistent SAVEPOINT sp1; --disable_ps2_protocol --error ER_CHECKREAD @@ -163,29 +158,100 @@ --enable_ps2_protocol SAVEPOINT sp1; +--connection default +BEGIN; INSERT INTO t SET a=2; + --connection con_weird START TRANSACTION WITH CONSISTENT SNAPSHOT; -send -SELECT * FROM t FORCE INDEX (b) FOR UPDATE; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +--send SELECT * FROM t FORCE INDEX (b) FOR UPDATE --connection default -let $wait_condition= - select count(*) = 1 from information_schema.processlist - where state = 'Sending data' - and info LIKE 'SELECT * FROM t %'; ---source include/wait_condition.inc +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; ROLLBACK; --connection con_weird --reap SELECT * FROM t FORCE INDEX (b) FOR UPDATE; +COMMIT; --disconnect con_weird --connection consistent SELECT * FROM t FORCE INDEX (b) FOR UPDATE; +COMMIT; + +--connection default 
+TRUNCATE TABLE t; + +--echo # +--echo # MDEV-36639 innodb_snapshot_isolation=1 gives error for not comitted row changes +--echo # +INSERT INTO t VALUES (1,1),(2,2); + +--connection default +--echo # Case 1: Transaction A modifies a record, transaction B with snapshot +--echo # isolation level is blocked by A, then A is committed. +--echo # Expected behaviour: B gets ER_CHECKREAD. +BEGIN; +UPDATE t SET b=3 WHERE a = 1; + +--connection consistent +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +--send SELECT * FROM t WHERE a=1 FOR UPDATE + +--connection default +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +COMMIT; + +--connection consistent +--error ER_CHECKREAD +--reap + +--echo # Case 2: Transaction A modifies a record, transaction B with snapshot +--echo # isolation level is blocked by A, then A is rolled back. +--echo # Expected behaviour: B continues execution. + +--connection default +BEGIN; +UPDATE t SET b=4 WHERE a=1; + +--connection consistent +BEGIN; +SELECT * FROM t; +SET DEBUG_SYNC="lock_wait_before_suspend SIGNAL select_blocked"; +--send SELECT * FROM t WHERE a=1 FOR UPDATE + +--connection default +SET DEBUG_SYNC="now WAIT_FOR select_blocked"; +ROLLBACK; + +--connection consistent +--reap +ROLLBACK; + +--echo # Case 3: Transaction B with snapshot isolation level started with +--echo # consistent snapshot. Transaction A modifies a record and is committed. +--echo # Both B tries to read modified by A record. +--echo # Expected behavior: B gets ER_CHECKREAD. 
+ +--connection consistent +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default +UPDATE t SET b=4 WHERE a=1; + +--connection consistent +--error ER_CHECKREAD +SELECT * FROM t WHERE a=1 FOR UPDATE; --disconnect consistent +--disconnect disable_purging --connection default +SET DEBUG_SYNC="RESET"; DROP TABLE t; +--source include/wait_until_count_sessions.inc --echo # End of 10.6 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.opt mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---innodb_buffer_pool_size=5M +--innodb_buffer_pool_size=6M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.test mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/lock_memory_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/lock_memory_debug.test 2025-05-19 16:14:24.000000000 +0000 @@ -15,7 +15,7 @@ --error ER_LOCK_TABLE_FULL SET STATEMENT debug_dbug='+d,innodb_skip_lock_bitmap' FOR -INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g LIMIT 45000; +INSERT INTO t1 SELECT a.* FROM t1 a, t1 b, t1 c, t1 d, t1 e, t1 f, t1 g; SELECT COUNT(*) FROM t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/log_upgrade_101_flags.test mariadb-10.11.13/mysql-test/suite/innodb/t/log_upgrade_101_flags.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/log_upgrade_101_flags.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/log_upgrade_101_flags.test 2025-05-19 16:14:24.000000000 +0000 @@ -73,7 +73,7 @@ close OUT or die; EOF ---let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m --innodb_page_size=32k 
--innodb_buffer_pool_size=10M +--let $restart_parameters= $dirs --innodb-force-recovery=5 --innodb-log-file-size=4m --innodb_page_size=32k --innodb_buffer_pool_size=11M --source include/start_mysqld.inc SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/mdev-15707.opt mariadb-10.11.13/mysql-test/suite/innodb/t/mdev-15707.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/mdev-15707.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/mdev-15707.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1 +1 @@ ---innodb --innodb-buffer-pool-size=5MB --innodb-read-io-threads=1 --innodb-doublewrite=0 --innodb-flush-log-at-trx-commit=0 \ No newline at end of file +--innodb --innodb-buffer-pool-size=6MB --innodb-read-io-threads=1 --innodb-doublewrite=0 --innodb-flush-log-at-trx-commit=0 \ No newline at end of file diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.opt mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.opt 2025-05-19 16:14:24.000000000 +0000 @@ -0,0 +1,3 @@ +--loose-innodb-buffer-pool-size-auto-min=17m +--innodb-buffer-pool-size-max=17m +--innodb-buffer-pool-size=17m diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.test mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/mem_pressure.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/mem_pressure.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,8 +1,8 @@ --source include/have_debug.inc ---source include/have_cgroupv2.inc --source include/not_embedded.inc --source include/have_innodb.inc --source include/have_sequence.inc +--source include/word_size.inc --echo # --echo # MDEV-24670 avoid OOM by linux kernel co-operative memory 
management @@ -15,6 +15,13 @@ # This is not an actual parameter, so there is no need to restore it. set GLOBAL innodb_max_purge_lag_wait=0; +SET @innodb_buffer_pool_size= @@GLOBAL.innodb_buffer_pool_size; +SET @innodb_buffer_pool_size_min= @@GLOBAL.innodb_buffer_pool_size_auto_min; +SELECT +@@GLOBAL.innodb_buffer_pool_size, +@@GLOBAL.innodb_buffer_pool_size_auto_min, +@@GLOBAL.innodb_buffer_pool_size_max; + CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; SET GLOBAL innodb_limit_optimistic_insert_debug=2; SET STATEMENT unique_checks=0, foreign_key_checks=0 FOR @@ -24,32 +31,31 @@ DROP TABLE t1; ---disable_cursor_protocol -SELECT CAST(VARIABLE_VALUE AS INTEGER) INTO @dirty_prev -FROM INFORMATION_SCHEMA.GLOBAL_STATUS -WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; ---enable_cursor_protocol - -set debug_dbug="d,trigger_garbage_collection"; -SET GLOBAL innodb_buffer_pool_size=@@innodb_buffer_pool_size; +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size=@innodb_buffer_pool_size; let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; -# either a fail or the pressure event -let SEARCH_PATTERN= [Mm]emory pressure.*; +let SEARCH_PATTERN= Memory pressure event disregarded.*; +let SEARCH_WAIT= FOUND; --source include/search_pattern_in_file.inc +SET STATEMENT debug_dbug="d,trigger_garbage_collection" FOR +SET GLOBAL innodb_buffer_pool_size_auto_min= +CAST(@innodb_buffer_pool_size/2 AS UNSIGNED), +innodb_buffer_pool_size=@innodb_buffer_pool_size; + # The garbage collection happens asynchronously after trigger, in a background # thread. So wait for it to happen to avoid sporadic failure. 
let $wait_condition= - SELECT CAST(VARIABLE_VALUE AS INTEGER) < @dirty_prev AS LESS_DIRTY_IS_GOOD - FROM INFORMATION_SCHEMA.GLOBAL_STATUS - WHERE VARIABLE_NAME='Innodb_buffer_pool_pages_dirty'; + select @@global.innodb_buffer_pool_size < @innodb_buffer_pool_size; --source include/wait_condition.inc eval $wait_condition; -let SEARCH_PATTERN= InnoDB: Memory pressure event freed.*; +let SEARCH_PATTERN= InnoDB: Memory pressure event shrunk.*; let SEARCH_WAIT= FOUND; --source include/search_pattern_in_file.inc set debug_dbug=@save_dbug; +SET GLOBAL innodb_buffer_pool_size= @innodb_buffer_pool_size; +SET GLOBAL innodb_buffer_pool_size_auto_min=@innodb_buffer_pool_size_min; --echo # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/page_cleaner.test mariadb-10.11.13/mysql-test/suite/innodb/t/page_cleaner.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/page_cleaner.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/page_cleaner.test 2025-05-19 16:14:24.000000000 +0000 @@ -7,6 +7,12 @@ SET GLOBAL innodb_max_dirty_pages_pct_lwm=0.0; SET GLOBAL innodb_max_dirty_pages_pct=0.0; +CREATE TABLE t(a INT) ENGINE=InnoDB STATS_PERSISTENT=0; +--connect (prevent_purge,localhost,root) +START TRANSACTION WITH CONSISTENT SNAPSHOT; +--connection default +SET GLOBAL innodb_max_purge_lag_wait=0; + let $wait_condition = SELECT variable_value = 0 FROM information_schema.global_status @@ -15,7 +21,24 @@ SET GLOBAL innodb_max_dirty_pages_pct=90.0; -CREATE TABLE t ENGINE=InnoDB SELECT * FROM seq_1_to_10000; +--disable_cursor_protocol +SELECT variable_value INTO @log_writes FROM information_schema.global_status +WHERE variable_name='innodb_log_writes'; +--enable_cursor_protocol + +BEGIN; +--disable_query_log +let $N=500; +while ($N) { + INSERT INTO t SELECT * FROM seq_1_to_10; + dec $N; +} +--enable_query_log +ROLLBACK; + +SELECT if(variable_value-@log_writes<500,'ok',variable_value-@log_writes) +FROM information_schema.global_status 
WHERE variable_name='innodb_log_writes'; +--disconnect prevent_purge SELECT variable_value>0 FROM information_schema.global_status WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/purge_secondary.opt mariadb-10.11.13/mysql-test/suite/innodb/t/purge_secondary.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/purge_secondary.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/purge_secondary.opt 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,4 @@ --innodb-sys-tablestats ---innodb_buffer_pool_size=5M +--innodb_buffer_pool_size=6M --innodb_monitor_enable=module_buffer --skip-innodb-stats-persistent diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/recovery_memory.test mariadb-10.11.13/mysql-test/suite/innodb/t/recovery_memory.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/recovery_memory.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/recovery_memory.test 2025-05-19 16:14:24.000000000 +0000 @@ -22,7 +22,7 @@ connection default; sleep 10; let $shutdown_timeout=0; -let $restart_parameters=--innodb_buffer_pool_size=5242880; +let $restart_parameters=--innodb_buffer_pool_size=6m; --source include/restart_mysqld.inc DROP TABLE t1; DROP PROCEDURE dorepeat; @@ -33,11 +33,11 @@ --echo # if ($have_debug) { SET DEBUG_DBUG="+d,ib_log_checkpoint_avoid_hard"; -let $restart_parameters=--innodb_buffer_pool_size=5242880 --debug_dbug=+d,ibuf_init_corrupt; +let $restart_parameters=--innodb_buffer_pool_size=6m --debug_dbug=+d,ibuf_init_corrupt; } if (!$have_debug) { --echo SET DEBUG_DBUG="+d,ib_log_checkpoint_avoid_hard"; -let $restart_parameters=--innodb_buffer_pool_size=5242880; +let $restart_parameters=--innodb_buffer_pool_size=6m; } CREATE TABLE t1(f1 INT NOT NULL)ENGINE=InnoDB; INSERT INTO t1 SELECT * FROM seq_1_to_65536; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/restart.opt mariadb-10.11.13/mysql-test/suite/innodb/t/restart.opt 
--- mariadb-10.11.11/mysql-test/suite/innodb/t/restart.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/restart.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ ---loose-innodb_disable_resize_buffer_pool_debug=0 ---innodb-buffer-pool-chunk-size=1M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/restart.test mariadb-10.11.13/mysql-test/suite/innodb/t/restart.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/restart.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/restart.test 2025-05-19 16:14:24.000000000 +0000 @@ -93,31 +93,6 @@ DROP TABLE tr,tc,td; --echo # ---echo # MDEV-27467 innodb to enfore the minimum innodb_buffer_pool_size in SET (resize) the same as startup ---echo # - -let $wait_timeout = 180; -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; - ---disable_cursor_protocol -SELECT @@innodb_buffer_pool_size INTO @innodb_buffer_pool_size_orig; -SELECT CEILING((256 + 64) * @@innodb_page_size / 1048576) * 1048576 INTO @min_pool_size; ---enable_cursor_protocol ---error ER_WRONG_VALUE_FOR_VAR -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' USING (@min_pool_size -1); - -SHOW WARNINGS; - -EXECUTE IMMEDIATE 'SET GLOBAL innodb_buffer_pool_size = ?' 
USING (@min_pool_size); - ---source include/wait_condition.inc - -SET GLOBAL innodb_buffer_pool_size = @innodb_buffer_pool_size_orig; - ---echo # --echo # MDEV-27882 Innodb - recognise MySQL-8.0 innodb flags and give a specific error message --echo # diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/stat_tables.test mariadb-10.11.13/mysql-test/suite/innodb/t/stat_tables.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/stat_tables.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/stat_tables.test 2025-05-19 16:14:24.000000000 +0000 @@ -110,3 +110,12 @@ DROP TABLE t1; --echo # End of 10.6 tests + +--echo # +--echo # MDEV-36373 Warning: ... persistent statistics storage is corrupted +--echo # +CREATE TABLE t1 (c INT) ENGINE=InnoDB; +SET STATEMENT tx_read_only=1 FOR ANALYZE TABLE t1; +DROP TABLE t1; + +--echo # End of 10.11 tests diff -Nru mariadb-10.11.11/mysql-test/suite/innodb/t/stats_persistent.test mariadb-10.11.13/mysql-test/suite/innodb/t/stats_persistent.test --- mariadb-10.11.11/mysql-test/suite/innodb/t/stats_persistent.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/stats_persistent.test 2025-05-19 16:14:24.000000000 +0000 @@ -1,4 +1,5 @@ --source include/have_innodb.inc +--source include/have_sequence.inc --source include/have_debug.inc --source include/have_debug_sync.inc --source include/count_sessions.inc @@ -26,3 +27,14 @@ DROP TABLE t1; --source include/wait_until_count_sessions.inc + +--echo # +--echo # MDEV-36649 dict_acquire_mdl_shared() aborts when table +--echo # mode is DICT_TABLE_OP_OPEN_ONLY_IF_CACHED +--echo # +set @old_defragment_stats_accuracy= @@innodb_defragment_stats_accuracy; +SET GLOBAL innodb_defragment_stats_accuracy=1; +CREATE TABLE t (a INT ) ENGINE=INNODB; +INSERT INTO t SELECT * FROM seq_1_to_1000; +DROP TABLE t; +set global innodb_defragment_stats_accuracy= @old_defragment_stats_accuracy; diff -Nru 
mariadb-10.11.11/mysql-test/suite/innodb/t/update_time-master.opt mariadb-10.11.13/mysql-test/suite/innodb/t/update_time-master.opt --- mariadb-10.11.11/mysql-test/suite/innodb/t/update_time-master.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb/t/update_time-master.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-buffer-pool-size=5M diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/r/index_table.result mariadb-10.11.13/mysql-test/suite/innodb_fts/r/index_table.result --- mariadb-10.11.11/mysql-test/suite/innodb_fts/r/index_table.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/r/index_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -5,6 +5,9 @@ title VARCHAR(200), content TEXT ) ENGINE= InnoDB; +SET STATEMENT debug_dbug='+d,innodb_report_deadlock' FOR +CREATE FULLTEXT INDEX idx ON articles (title, content); +ERROR HY000: Got error 11 "Resource temporarily unavailable" from storage engine InnoDB CREATE FULLTEXT INDEX idx ON articles (title, content); INSERT INTO articles (title, content) VALUES ('MySQL Tutorial','DBMS stands for MySQL DataBase ...'), diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result mariadb-10.11.13/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result --- mariadb-10.11.11/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/r/innodb_ft_aux_table.result 2025-05-19 16:14:24.000000000 +0000 @@ -118,4 +118,13 @@ SELECT @@GLOBAL.innodb_ft_aux_table; @@GLOBAL.innodb_ft_aux_table test/t1 +CREATE TABLE t(a INT) ENGINE=InnoDB; +SET GLOBAL innodb_ft_aux_table='test/t'; +ERROR 42000: Variable 'innodb_ft_aux_table' can't be set to the value of 'test/t' +DROP TABLE t; +SET GLOBAL innodb_ft_aux_table='test/t'; +ERROR 42000: Variable 'innodb_ft_aux_table' can't be set to the value of 'test/t' +SELECT @@GLOBAL.innodb_ft_aux_table; 
+@@GLOBAL.innodb_ft_aux_table +test/t1 SET GLOBAL innodb_ft_aux_table = @save_ft_aux_table; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/t/index_table.test mariadb-10.11.13/mysql-test/suite/innodb_fts/t/index_table.test --- mariadb-10.11.11/mysql-test/suite/innodb_fts/t/index_table.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/t/index_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -3,6 +3,9 @@ -- source include/have_innodb.inc -- source include/have_debug.inc +--disable_query_log +call mtr.add_suppression("InnoDB: \\(Deadlock\\) writing `use_stopword'"); +--enable_query_log SET @optimize=@@GLOBAL.INNODB_OPTIMIZE_FULLTEXT_ONLY; SET GLOBAL INNODB_OPTIMIZE_FULLTEXT_ONLY=1; @@ -14,6 +17,9 @@ content TEXT ) ENGINE= InnoDB; +--error ER_GET_ERRNO +SET STATEMENT debug_dbug='+d,innodb_report_deadlock' FOR +CREATE FULLTEXT INDEX idx ON articles (title, content); CREATE FULLTEXT INDEX idx ON articles (title, content); INSERT INTO articles (title, content) VALUES diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test mariadb-10.11.13/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test --- mariadb-10.11.11/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_fts/t/innodb_ft_aux_table.test 2025-05-19 16:14:24.000000000 +0000 @@ -41,4 +41,13 @@ SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE; SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_CONFIG; SELECT @@GLOBAL.innodb_ft_aux_table; + +CREATE TABLE t(a INT) ENGINE=InnoDB; +--error ER_WRONG_VALUE_FOR_VAR +SET GLOBAL innodb_ft_aux_table='test/t'; +DROP TABLE t; +--error ER_WRONG_VALUE_FOR_VAR +SET GLOBAL innodb_ft_aux_table='test/t'; +SELECT @@GLOBAL.innodb_ft_aux_table; + SET GLOBAL innodb_ft_aux_table = @save_ft_aux_table; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_gis/r/rollback.result mariadb-10.11.13/mysql-test/suite/innodb_gis/r/rollback.result --- 
mariadb-10.11.11/mysql-test/suite/innodb_gis/r/rollback.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_gis/r/rollback.result 2025-05-19 16:14:25.000000000 +0000 @@ -412,3 +412,16 @@ ERROR HY000: Lost connection to server during query insert into t1 values(5, point(5,5), point(5,5), 5); drop table t1; +# +# MDEV-35420 Server aborts while deleting the record +# in spatial index +# +CREATE TABLE t1 (c POINT NOT NULL, SPATIAL(c)) engine=InnoDB; +CHECK TABLE t1; +Table Op Msg_type Msg_text +test.t1 check status OK +SET STATEMENT unique_checks=0,foreign_key_checks=0 FOR +START TRANSACTION; +INSERT INTO t1 SELECT ST_GeomFromText('POINT(114368751 656950466)') FROM seq_1_to_512; +ROLLBACK; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rollback.test mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rollback.test --- mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rollback.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rollback.test 2025-05-19 16:14:25.000000000 +0000 @@ -8,6 +8,7 @@ # Avoid CrashReporter popup on Mac --source include/not_crashrep.inc --source include/have_innodb_16k.inc +--source include/have_sequence.inc CREATE TABLE t4 (id bigint(12) unsigned NOT NULL auto_increment, c2 varchar(15) collate utf8_bin default NULL, @@ -475,3 +476,15 @@ insert into t1 values(5, point(5,5), point(5,5), 5); drop table t1; + +--echo # +--echo # MDEV-35420 Server aborts while deleting the record +--echo # in spatial index +--echo # +CREATE TABLE t1 (c POINT NOT NULL, SPATIAL(c)) engine=InnoDB; +CHECK TABLE t1; +SET STATEMENT unique_checks=0,foreign_key_checks=0 FOR +START TRANSACTION; +INSERT INTO t1 SELECT ST_GeomFromText('POINT(114368751 656950466)') FROM seq_1_to_512; +ROLLBACK; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rtree_purge.test mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rtree_purge.test --- 
mariadb-10.11.11/mysql-test/suite/innodb_gis/t/rtree_purge.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/innodb_gis/t/rtree_purge.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,6 @@ # This test case will test R-tree purge. +--source include/long_test.inc --source include/innodb_page_size.inc --source include/have_sequence.inc --source include/not_valgrind.inc diff -Nru mariadb-10.11.11/mysql-test/suite/json/r/json_no_table.result mariadb-10.11.13/mysql-test/suite/json/r/json_no_table.result --- mariadb-10.11.11/mysql-test/suite/json/r/json_no_table.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/json/r/json_no_table.result 2025-05-19 16:14:25.000000000 +0000 @@ -2886,7 +2886,7 @@ ["a", "b", "c"] select charset(json_unquote('"abc"')); charset(json_unquote('"abc"')) -utf8mb3 +utf8mb4 select json_quote(convert(X'e68891' using utf8)); json_quote(convert(X'e68891' using utf8)) "我" diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.result mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,7 +1,12 @@ CREATE TABLE t(i INT) ENGINE INNODB; INSERT INTO t VALUES(1); +SET GLOBAL innodb_max_purge_lag_wait=0; # xtrabackup backup NOT FOUND /InnoDB: Allocated tablespace ID/ in backup.log +SELECT variable_value FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; +variable_value +0 INSERT INTO t VALUES(2); # xtrabackup prepare # shutdown server diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.test mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/full_backup.test 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/mariabackup/full_backup.test 2025-05-19 16:14:25.000000000 +0000 @@ -2,6 +2,7 @@ CREATE TABLE t(i INT) ENGINE INNODB; INSERT INTO t VALUES(1); +SET GLOBAL innodb_max_purge_lag_wait=0; echo # xtrabackup backup; let $targetdir=$MYSQLTEST_VARDIR/tmp/backup; --let $backup_log=$MYSQLTEST_VARDIR/tmp/backup.log @@ -18,6 +19,8 @@ --source include/search_pattern_in_file.inc --remove_file $backup_log +SELECT variable_value FROM information_schema.global_status +WHERE variable_name = 'INNODB_BUFFER_POOL_PAGES_DIRTY'; INSERT INTO t VALUES(2); diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.result mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.result 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,9 @@ # CREATE TABLE t (pk INT PRIMARY KEY) ENGINE=InnoDB ROW_FORMAT=COMPRESSED; ALTER TABLE t PARTITION BY KEY(pk); +# Incremental backup +# Prepare fullbackup +# Prepare incremental backup # shutdown server # remove datadir # xtrabackup move back diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.test mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/incremental_compressed.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/incremental_compressed.test 2025-05-19 16:14:25.000000000 +0000 @@ -16,12 +16,18 @@ ALTER TABLE t PARTITION BY KEY(pk); +--echo # Incremental backup --exec $XTRABACKUP --backup --target-dir=$incremental_dir --incremental-basedir=$basedir --protocol=tcp --port=$MASTER_MYPORT --user=root > $incremental_dir.log 2>&1 +--echo # Prepare fullbackup --exec $XTRABACKUP --prepare --target-dir=$basedir --user=root > $MYSQL_TMP_DIR/backup_prepare_0.log 2>&1 
---exec $XTRABACKUP --prepare --target-dir=$basedir --incremental-dir=$incremental_dir --user=root > $MYSQL_TMP_DIR/backup_prepare_1.log ---cat_file $MYSQL_TMP_DIR/backup_prepare_1.log +--echo # Prepare incremental backup +--exec $XTRABACKUP --prepare --target-dir=$basedir --incremental-dir=$incremental_dir --user=root > $MYSQL_TMP_DIR/backup_prepare_1.log 2>&1 let $targetdir=$basedir; -- source include/restart_and_restore.inc - SHOW CREATE TABLE t; DROP TABLE t; +remove_file $incremental_dir.log; +remove_file $MYSQL_TMP_DIR/backup_prepare_0.log; +remove_file $MYSQL_TMP_DIR/backup_prepare_1.log; +rmdir $basedir; +rmdir $incremental_dir; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/log_page_corruption.test mariadb-10.11.13/mysql-test/suite/mariabackup/log_page_corruption.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/log_page_corruption.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/log_page_corruption.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,5 @@ +--source include/long_test.inc --source include/have_debug.inc ---source include/no_valgrind_without_big.inc --source include/innodb_undo_tablespaces.inc --echo ######## diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partial.result mariadb-10.11.13/mysql-test/suite/mariabackup/partial.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/partial.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/partial.result 2025-05-19 16:14:25.000000000 +0000 @@ -4,8 +4,8 @@ INSERT INTO t21 VALUES(1); CREATE TABLE t2(i int) ENGINE INNODB; # xtrabackup backup -t1.new -t21.new +t1.ibd +t21.ibd # xtrabackup prepare t1.cfg t21.cfg diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partial_exclude.result mariadb-10.11.13/mysql-test/suite/mariabackup/partial_exclude.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/partial_exclude.result 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/mariabackup/partial_exclude.result 2025-05-19 16:14:25.000000000 +0000 @@ -14,7 +14,7 @@ INSERT INTO test.t2 VALUES(20); # xtrabackup backup COMMIT; -t1.new +t1.ibd DROP TABLE t1; DROP TABLE t2; DROP DATABASE db2; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.result mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,11 @@ +# +# MDEV-36437 mariabackup - confusing error message when running out of file handles with partitioned MyISAM +# +create table t1 ( +id bigint(20) not null auto_increment, +primary key (id) +) engine=myisam +partition by hash (id) +partitions 600; +FOUND 1 /Error 24 on file ./test/t1#P#p\d+\.MY[DI] open during `test`.`t1` table copy: Too many open files/ in backup.log +drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.test mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.test --- mariadb-10.11.11/mysql-test/suite/mariabackup/partition_notwin.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/partition_notwin.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,25 @@ +source include/not_windows.inc; +source include/have_partition.inc; +let $targetdir=$MYSQLTEST_VARDIR/tmp/backup; +let $log=$MYSQL_TMP_DIR/backup.log; + +--echo # +--echo # MDEV-36437 mariabackup - confusing error message when running out of file handles with partitioned MyISAM +--echo # + +create table t1 ( + id bigint(20) not null auto_increment, + primary key (id) +) engine=myisam + partition by hash (id) + partitions 600; + +error 1; +exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir > $log 2>&1; +let SEARCH_FILE=$log; +let SEARCH_PATTERN=Error 
24 on file ./test/t1#P#p\d+\.MY[DI] open during `test`.`t1` table copy: Too many open files; +source include/search_pattern_in_file.inc; + +rmdir $targetdir; +#remove_file $log; +drop table t1; diff -Nru mariadb-10.11.11/mysql-test/suite/mariabackup/unsupported_redo.result mariadb-10.11.13/mysql-test/suite/mariabackup/unsupported_redo.result --- mariadb-10.11.11/mysql-test/suite/mariabackup/unsupported_redo.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/mariabackup/unsupported_redo.result 2025-05-19 16:14:25.000000000 +0000 @@ -22,8 +22,8 @@ ALTER TABLE t21 FORCE, ALGORITHM=INPLACE; # Create partial backup (excluding table t21), Ignore the # unsupported redo log for the table t21. -t1.new -t2.new +t1.ibd +t2.ibd # Prepare the full backup t1.ibd t2.ibd diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.opt mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.opt --- mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--master-info-file=$MYSQL_TMP_DIR/master_info_file.txt diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.result mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.result --- mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,18 @@ +CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_1; +CHANGE MASTER 'named' TO master_host='localhost', master_user='test', master_port=SERVER_MYPORT_2; +--list_files @@datadir *.info +relay-log-named.info +relay-log.info +--list_files MYSQL_TMP_DIR *.txt +master_info_file-named.txt +master_info_file.txt +multi-master_info_file.txt 
+--cat_file MYSQL_TMP_DIR/multi-master_info_file.txt +named +FOUND 1 matches in master_info_file.txt +FOUND 1 matches in master_info_file.txt +FOUND 1 matches in master_info_file.txt +FOUND 1 matches in master_info_file-named.txt +FOUND 1 matches in master_info_file-named.txt +FOUND 1 matches in master_info_file-named.txt +RESET REPLICA 'named' ALL; diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.test mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.test --- mariadb-10.11.11/mysql-test/suite/multi_source/master_info_file.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/master_info_file.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,38 @@ +# MDEV-36238: Test `--master-info-file` +# +# Other tests (such as `info_logs`) work explicitly with `(multi-)master.info`. +# This test sees that `--master-info-file` moves/renames this file. + +--source include/not_embedded.inc +--replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1 +--eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=$SERVER_MYPORT_1 +--replace_result $SERVER_MYPORT_2 SERVER_MYPORT_2 +--eval CHANGE MASTER 'named' TO master_host='localhost', master_user='test', master_port=$SERVER_MYPORT_2 + +--let $datadir = `SELECT @@datadir` +--echo --list_files @@datadir *.info +--list_files $datadir *.info +--echo --list_files MYSQL_TMP_DIR *.txt +--list_files $MYSQL_TMP_DIR *.txt + +--echo --cat_file MYSQL_TMP_DIR/multi-master_info_file.txt +--cat_file $MYSQL_TMP_DIR/multi-master_info_file.txt +--let SEARCH_OUTPUT= count + +--let SEARCH_FILE= $MYSQL_TMP_DIR/master_info_file.txt +--let SEARCH_PATTERN= \\n127.0.0.1\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\nroot\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\n$SERVER_MYPORT_1\\n +--source include/search_pattern_in_file.inc + +--let SEARCH_FILE= $MYSQL_TMP_DIR/master_info_file-named.txt +--let SEARCH_PATTERN= 
\\nlocalhost\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\ntest\\n +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN= \\n$SERVER_MYPORT_2\\n +--source include/search_pattern_in_file.inc + +RESET REPLICA 'named' ALL; diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.cnf mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.cnf --- mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,13 @@ +!include ./my.cnf + +[mysqld.1] +show-slave-auth-info + +[mysqld.4] +server-id=4 +log-warnings=2 +report-user=my_user +report-password=my_password + +[ENV] +SERVER_MYPORT_4= @mysqld.4.port diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.result mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.result --- mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,45 @@ +# Setup +connect master1,127.0.0.1,root,,,$SERVER_MYPORT_1; +connect master2,127.0.0.1,root,,,$SERVER_MYPORT_2; +connect slave1,127.0.0.1,root,,,$SERVER_MYPORT_3; +connect slave2,127.0.0.1,root,,,$SERVER_MYPORT_4; +connection slave2; +CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_1; +CHANGE MASTER 'control sample' TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_2; +START ALL SLAVES; +connection slave1; +CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_1; +CHANGE MASTER 'control sample' TO master_host='127.0.0.1', master_user='root', master_port=SERVER_MYPORT_2; +START ALL SLAVES; +# Test +connection master2; +SHOW SLAVE HOSTS; 
+Server_id Host Port Master_id +3 localhost SERVER_MYPORT_3 2 +4 localhost SERVER_MYPORT_4 2 +connection master1; +SHOW SLAVE HOSTS; +Server_id Host User Password Port Master_id +3 localhost SERVER_MYPORT_3 1 +4 localhost my_user my_password SERVER_MYPORT_4 1 +SHOW REPLICA HOSTS; +Server_id Host User Password Port Master_id +3 localhost SERVER_MYPORT_3 1 +4 localhost my_user my_password SERVER_MYPORT_4 1 +# Cleanup +connection slave2; +STOP ALL SLAVES; +include/wait_for_slave_to_stop.inc +SET @@SESSION.default_master_connection= 'control sample'; +include/wait_for_slave_to_stop.inc +RESET SLAVE ALL; +connection slave1; +STOP ALL SLAVES; +include/wait_for_slave_to_stop.inc +SET @@SESSION.default_master_connection= 'control sample'; +include/wait_for_slave_to_stop.inc +RESET SLAVE ALL; +disconnect master1; +disconnect master2; +disconnect slave1; +disconnect slave2; diff -Nru mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.test mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.test --- mariadb-10.11.11/mysql-test/suite/multi_source/show_slave_auth_info.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/multi_source/show_slave_auth_info.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,83 @@ +# MDEV-36238: Test `--show-slave-auth-info` (and `--report-user`/`password`) +# +# `rpl.rpl_show_slave_hosts` and `rpl.rpl_slave_alias_replica` +# (and several others) test SHOW SLAVE HOSTS without `--show-slave-auth-info`. +# This test supplements them with a comparison between with and without. + +# SHOW SLAVE HOSTS is agnostic of binlog formats +--source include/have_binlog_format_mixed.inc + +--echo # Setup + +# This server has `--show-slave-auth-info`. +--connect (master1,127.0.0.1,root,,,$SERVER_MYPORT_1) +# This `--show-slave-auth-info`-less server asserts that it is per-master. +--connect (master2,127.0.0.1,root,,,$SERVER_MYPORT_2) +# This is a non-reporting slave. 
+--connect (slave1,127.0.0.1,root,,,$SERVER_MYPORT_3) +# This is a self-reporting slave. +--connect (slave2,127.0.0.1,root,,,$SERVER_MYPORT_4) + +--let $rpl_server_number= 2 +while ($rpl_server_number) +{ + --connection slave$rpl_server_number + + --replace_result $SERVER_MYPORT_1 SERVER_MYPORT_1 + --eval CHANGE MASTER TO master_host='127.0.0.1', master_user='root', master_port=$SERVER_MYPORT_1 + --replace_result $SERVER_MYPORT_2 SERVER_MYPORT_2 + --eval CHANGE MASTER 'control sample' TO master_host='127.0.0.1', master_user='root', master_port=$SERVER_MYPORT_2 + --disable_warnings + START ALL SLAVES; + --enable_warnings + + --dec $rpl_server_number +} + +--echo # Test + +--let $rpl_server_number= 2 +while ($rpl_server_number) +{ + --connection master$rpl_server_number + + # Make sure the master's synced up + --let $show_statement= SHOW SLAVE HOSTS + --let $field= Server_id + --let $condition= =3 + --source include/wait_show_condition.inc + --let $condition= =4 + --source include/wait_show_condition.inc + + --replace_result $SERVER_MYPORT_3 SERVER_MYPORT_3 $SERVER_MYPORT_4 SERVER_MYPORT_4 + SHOW SLAVE HOSTS; + + --dec $rpl_server_number +} + +# MDEV-20601 Make REPLICA a synonym for SLAVE in SQL statements +--replace_result $SERVER_MYPORT_3 SERVER_MYPORT_3 $SERVER_MYPORT_4 SERVER_MYPORT_4 +SHOW REPLICA HOSTS; + +--echo # Cleanup + +--let $rpl_server_number= 2 +while ($rpl_server_number) +{ + --connection slave$rpl_server_number + + --disable_warnings + STOP ALL SLAVES; + --enable_warnings + --source include/wait_for_slave_to_stop.inc + SET @@SESSION.default_master_connection= 'control sample'; + --source include/wait_for_slave_to_stop.inc + RESET SLAVE ALL; + + --dec $rpl_server_number +} + +--disconnect master1 +--disconnect master2 +--disconnect slave1 +--disconnect slave2 diff -Nru mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_innodb.test mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_innodb.test --- 
mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_innodb.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_innodb.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_innodb.inc --source include/have_partition.inc --source include/have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_memory.test mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_memory.test --- mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_memory.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_memory.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_partition.inc --source include/have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_myisam.test mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_myisam.test --- mariadb-10.11.11/mysql-test/suite/parts/t/partition_exchange_myisam.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/parts/t/partition_exchange_myisam.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_partition.inc --source include/have_debug_sync.inc diff -Nru mariadb-10.11.11/mysql-test/suite/perfschema/r/threads_innodb.result mariadb-10.11.13/mysql-test/suite/perfschema/r/threads_innodb.result --- mariadb-10.11.11/mysql-test/suite/perfschema/r/threads_innodb.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/perfschema/r/threads_innodb.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,10 +1,10 @@ SELECT name, type, processlist_user, processlist_host, processlist_db, -processlist_command, processlist_time, processlist_state, processlist_info, +processlist_command, processlist_time, processlist_info, parent_thread_id, role, 
instrumented FROM performance_schema.threads WHERE name LIKE 'thread/innodb/%' GROUP BY name; -name type processlist_user processlist_host processlist_db processlist_command processlist_time processlist_state processlist_info parent_thread_id role instrumented -thread/innodb/page_cleaner_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL NULL YES -thread/innodb/page_encrypt_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL NULL YES -thread/innodb/thread_pool_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL NULL YES +name type processlist_user processlist_host processlist_db processlist_command processlist_time processlist_info parent_thread_id role instrumented +thread/innodb/page_cleaner_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL YES +thread/innodb/page_encrypt_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL YES +thread/innodb/thread_pool_thread BACKGROUND NULL NULL NULL NULL NULL NULL NULL NULL YES diff -Nru mariadb-10.11.11/mysql-test/suite/perfschema/t/threads_innodb.test mariadb-10.11.13/mysql-test/suite/perfschema/t/threads_innodb.test --- mariadb-10.11.11/mysql-test/suite/perfschema/t/threads_innodb.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/perfschema/t/threads_innodb.test 2025-05-19 16:14:25.000000000 +0000 @@ -14,7 +14,7 @@ # We suppress here duplicates rows with the goal to avoid that the test fails # in case some defaults are changed. 
SELECT name, type, processlist_user, processlist_host, processlist_db, - processlist_command, processlist_time, processlist_state, processlist_info, + processlist_command, processlist_time, processlist_info, parent_thread_id, role, instrumented FROM performance_schema.threads WHERE name LIKE 'thread/innodb/%' diff -Nru mariadb-10.11.11/mysql-test/suite/plugins/r/server_audit.result mariadb-10.11.13/mysql-test/suite/plugins/r/server_audit.result --- mariadb-10.11.11/mysql-test/suite/plugins/r/server_audit.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/plugins/r/server_audit.result 2025-05-19 16:14:25.000000000 +0000 @@ -20,6 +20,9 @@ set global server_audit_incl_users=null; set global server_audit_file_path='server_audit.log'; set global server_audit_output_type=file; +set global server_audit_file_path=REPEAT(REPEAT('new_file_name', 50), 50); +Warnings: +Warning 1 server_audit_file_path can't exceed FN_LEN characters. set global server_audit_logging=on; set global server_audit_incl_users= repeat("'root',", 10000); ERROR 42000: Variable 'server_audit_incl_users' can't be set to the value of ''root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','root','...' 
diff -Nru mariadb-10.11.11/mysql-test/suite/plugins/t/server_audit.test mariadb-10.11.13/mysql-test/suite/plugins/t/server_audit.test --- mariadb-10.11.11/mysql-test/suite/plugins/t/server_audit.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/plugins/t/server_audit.test 2025-05-19 16:14:25.000000000 +0000 @@ -20,6 +20,10 @@ set global server_audit_incl_users=null; set global server_audit_file_path='server_audit.log'; set global server_audit_output_type=file; + +--replace_regex /[1-9][0-9][0-9]+/FN_LEN/ +set global server_audit_file_path=REPEAT(REPEAT('new_file_name', 50), 50); + set global server_audit_logging=on; --error ER_WRONG_VALUE_FOR_VAR diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result mariadb-10.11.13/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/parallel_backup_xa_debug.result 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,7 @@ CREATE TABLE t (a INT) ENGINE = innodb; connection slave; include/stop_slave.inc +SET STATEMENT sql_log_bin= 0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads; SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode; SET @@global.slave_parallel_threads= 2; @@ -19,6 +20,7 @@ connection slave; SET @@global.debug_dbug="+d,hold_worker_on_schedule"; start slave; +SET debug_sync = 'now WAIT_FOR reached_pause'; connection slave1; backup stage start; backup stage block_commit; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_create_select_row.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_create_select_row.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_create_select_row.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_create_select_row.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,158 @@ 
+include/master-slave.inc +[connection master] +connection master; +set @max_binlog_cache_size = @@global.max_binlog_cache_size; +set @binlog_cache_size = @@global.binlog_cache_size; +set @@global.max_binlog_cache_size = 4096; +set @@global. binlog_cache_size = 4096; +# +# MDEV-35207 ignored error at binlogging by CREATE-TABLE-SELECT leads to assert +# +connect conn_err,localhost,root,,; +call mtr.add_suppression("Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage"); +create table t engine=myisam select repeat ('a',4096*3) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +create table t engine=innodb select repeat ('a',4096*3) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +create table t (a int unique, b char) select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +ERROR 23000: Duplicate entry '1' for key 'a' +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +disconnect conn_err; +connection master; + +# +# MDEV-35499 errored CREATE-OR-REPLACE-SELECT does not DROP table in binlog +# +# +# Engine = innodb +# +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a int primary key, b char) engine=innodb select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement 
binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=innodb select repeat ('a',1024) AS a union select repeat ('a',3*4096) AS a union select repeat ('a',3*4096) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=innodb select repeat ('a',4096*3) AS a;; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +# +# Engine = myisam +# +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a int primary key, b char) engine=myisam select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' 
+include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=myisam select repeat ('a',1024) AS a union select repeat ('a',3*4096) AS a union select repeat ('a',3*4096) AS a; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +set statement binlog_format=statement for create table t (a int) select 1 as a; +set statement binlog_format=row for create or replace table t (a text) engine=myisam select repeat ('a',4096*3) AS a;; +ERROR HY000: Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage; increase this mariadbd variable and try again +select * from t; +ERROR 42S02: Table 'test.t' doesn't exist +# +# Prove an expected lonely `DROP table t' +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Query # # use `test`; DROP TABLE IF EXISTS `test`.`t`/* Generated to handle failed CREATE OR REPLACE */ +master-bin.000001 # Query # # ROLLBACK +create table ti_pk (a int primary key) engine=innodb; +create table ta (a int) engine=aria; +create function f_ia(arg int) 
+returns integer +begin +insert into ti_pk set a=1; +insert into ta set a=1; +insert into ti_pk set a=arg; +return 1; +end | +set statement binlog_format = ROW for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t_y; +ERROR 42S02: Table 'test.t_y' doesn't exist +# correct execution: `ta` is modified and its new record is binlogged +include/show_binlog_events.inc +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 # Gtid # # BEGIN GTID #-#-# +master-bin.000001 # Table_map # # table_id: # (test.ta) +master-bin.000001 # Write_rows_v1 # # table_id: # flags: STMT_END_F +master-bin.000001 # Query # # COMMIT +select * from ta; +a +1 +select * from ti_pk; +a +connection slave; +include/diff_tables.inc [master:ta,slave:ta] +connection master; +delete from ta; +connection slave; +connection master; +set statement binlog_format = STATEMENT for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +select * from t_y; +ERROR 42S02: Table 'test.t_y' doesn't exist +# ***TODO: fix MDEV-36027***. As of now `ta` is modified but that's not binlogged +include/show_binlog_events.inc +select *,'on_master' from ta; +a on_master +1 on_master +select * from ti_pk; +a +connection slave; +select *,'on_slave' from ta; +a on_slave +connection master; +drop function f_ia; +drop table ti_pk, ta; +SET @@global.max_binlog_cache_size = @max_binlog_cache_size; +SET @@global. 
binlog_cache_size = @binlog_cache_size; +connection slave; +End of the tests +include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_gtid_crash.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_gtid_crash.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_gtid_crash.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_gtid_crash.result 2025-05-19 16:14:25.000000000 +0000 @@ -12,6 +12,8 @@ connection server_2; SET sql_log_bin=0; call mtr.add_suppression('Master command COM_REGISTER_SLAVE failed: failed registering on master, reconnecting to try again'); +call mtr.add_suppression('Slave I/O: .*Lost connection to server during query'); +call mtr.add_suppression("Slave I/O thread couldn't register on master"); SET sql_log_bin=1; include/stop_slave.inc CHANGE MASTER TO master_host = '127.0.0.1', master_port = MASTER_PORT, diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_master_pos_wait.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_master_pos_wait.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_master_pos_wait.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_master_pos_wait.result 2025-05-19 16:14:25.000000000 +0000 @@ -43,6 +43,9 @@ select master_pos_wait('master-bin.000001',1000000,1,"my_slave"); master_pos_wait('master-bin.000001',1000000,1,"my_slave") -1 +select master_pos_wait('master-bin.000001',1000000,1,"MY_SLAVE"); +master_pos_wait('master-bin.000001',1000000,1,"MY_SLAVE") +-1 STOP SLAVE 'my_slave'; RESET SLAVE 'my_slave' ALL; change master to master_port=MASTER_MYPORT, master_host='127.0.0.1', master_user='root'; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_parallel_innodb_lock_conflict.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,16 +1,15 @@ ***MDEV-5914: Parallel replication deadlock due to InnoDB lock conflicts *** include/master-slave.inc [connection master] -connection server_2; -SET sql_log_bin=0; +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CALL mtr.add_suppression("InnoDB: Transaction was aborted due to "); CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); -SET sql_log_bin=1; +connection server_2; SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; include/stop_slave.inc SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_use_gtid=slave_pos; connection server_1; -ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; CREATE TABLE t4 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB; INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6); connect con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_master_disable_with_slave.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,41 @@ +# Set up Semi-Sync with rpl_semi_sync_master_wait_no_slave=0 +include/master-slave.inc +[connection master] +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; +SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= 0; +connection slave; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; +include/start_slave.inc +connection master; +connection slave; +connection master; +SELECT ID INTO @binlog_dump_tid +FROM information_schema.PROCESSLIST WHERE COMMAND = 'Binlog Dump'; +# Control State +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = 
@binlog_dump_tid; +STATE +Master has sent all binlog to slave; waiting for more updates +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 1 +# Disable Semi-Sync while the dump thread is still connected to its slave +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid; +STATE +Master has sent all binlog to slave; waiting for more updates +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 1 +# Disconnect the slave and wait until the master's dump thread is gone +connection slave; +STOP SLAVE; +connection master; +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 0 +# Cleanup +SET @@GLOBAL.rpl_semi_sync_master_enabled= 0; +SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= 1; +connection slave; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_semi_sync_ssl_stop.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,53 @@ +# Skip starting the slave because we manually start with SSL later +include/master-slave.inc +[connection master] +# +# Setup +connection master; +CREATE USER replssl@localhost; +GRANT REPLICATION SLAVE on *.* to replssl@localhost REQUIRE SSL; +set @orig_master_enabled= @@GLOBAL.rpl_semi_sync_master_enabled; +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; +connection slave; +CHANGE MASTER TO +master_user='replssl', +master_password='', +master_ssl=1, +master_ssl_ca='MYSQL_TEST_DIR/std_data/cacert.pem', +master_ssl_cert='MYSQL_TEST_DIR/std_data/client-cert.pem', 
+master_ssl_key='MYSQL_TEST_DIR/std_data/client-key.pem'; +set @orig_slave_enabled= @@GLOBAL.rpl_semi_sync_slave_enabled; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; +include/start_slave.inc +connection master; +# Verify Semi-Sync is active +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 1 +# Create some table so slave can be seen as up-to-date and working +connection master; +CREATE TABLE t1 (a INT); +connection slave; +# Disconnect the slave and wait until the master's dump thread is gone +connection slave; +STOP SLAVE; +connection master; +# MDEV-36663: Verifying dump thread connection is killed.. +# ..done +# Cleanup +connection master; +SET @@GLOBAL.rpl_semi_sync_master_enabled= @orig_master_enabled; +DROP USER replssl@localhost; +DROP TABLE t1; +connection slave; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= @orig_slave_enabled; +CHANGE MASTER TO +master_user='root', +master_ssl=0, +master_ssl_ca='', +master_ssl_cert='', +master_ssl_key=''; +connection slave; +include/start_slave.inc +include/rpl_end.inc +# End of rpl_semi_sync_ssl_stop.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result --- mariadb-10.11.11/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/r/rpl_xa_2pc_multi_engine.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,26 @@ +include/master-slave.inc +[connection master] +connection master; +create table t1 (a int primary key, b int) engine=innodb; +insert t1 values (1,1),(3,3),(5,5),(7,7); +create table t2 (m int) engine=aria; +# Create multi-engine, two-phase XA transaction (T1) +xa start '1'; +insert t2 values (1); +update t1 set b=50 where b=5; +xa end '1'; +xa prepare '1'; +# Create T2 +connection server_1; +update t1 set b=10 where a=5; +connection master; +xa commit '1'; +connection server_1; 
+include/save_master_gtid.inc +# This would hang prior to MDEV-21117 +connection slave; +include/sync_with_master_gtid.inc +connection master; +drop table t1, t2; +include/rpl_end.inc +# End of rpl_xa_2pc_multi_engine.test diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test mariadb-10.11.13/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/parallel_backup_xa_debug.test 2025-05-19 16:14:25.000000000 +0000 @@ -11,6 +11,7 @@ --sync_slave_with_master --source include/stop_slave.inc +SET STATEMENT sql_log_bin= 0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads; SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode; SET @@global.slave_parallel_threads= 2; @@ -28,20 +29,21 @@ --connection slave SET @@global.debug_dbug="+d,hold_worker_on_schedule"; start slave; +SET debug_sync = 'now WAIT_FOR reached_pause'; --let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to commit" --source include/wait_condition.inc --connection slave1 backup stage start; ---send backup stage block_commit +backup stage block_commit; --connection slave --let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for backup lock" SET debug_sync = 'now SIGNAL continue_worker'; +--source include/wait_condition.inc SET debug_sync = RESET; --connection slave1 -reap; backup stage end; --connection master diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_create_select_row.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_create_select_row.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_create_select_row.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_create_select_row.test 2025-05-19 
16:14:25.000000000 +0000 @@ -0,0 +1,161 @@ +--source include/have_binlog_format_row.inc +--source include/have_innodb.inc +--source include/master-slave.inc + +--connection master +set @max_binlog_cache_size = @@global.max_binlog_cache_size; +set @binlog_cache_size = @@global.binlog_cache_size; +set @@global.max_binlog_cache_size = 4096; +set @@global. binlog_cache_size = 4096; + +--echo # +--echo # MDEV-35207 ignored error at binlogging by CREATE-TABLE-SELECT leads to assert +--echo # +# fix the current (write) binlog position +--let $binlog_file_0= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start_0 = query_get_value(SHOW MASTER STATUS, Position, 1) + +# use a separate connection also to validate its close will be clean +connect (conn_err,localhost,root,,); + +call mtr.add_suppression("Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage"); +--error ER_TRANS_CACHE_FULL +create table t engine=myisam select repeat ('a',4096*3) AS a; + +--error ER_TRANS_CACHE_FULL +create table t engine=innodb select repeat ('a',4096*3) AS a; + +--error ER_DUP_ENTRY +create table t (a int unique, b char) select 1 AS a, 'b' as b union select 1 as a, 'c' as b; +--error ER_NO_SUCH_TABLE +select * from t; + +--disconnect conn_err + +--connection master +--let $binlog_file_1= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start_1= query_get_value(SHOW MASTER STATUS, Position, 1) + +--let $cmp = `select strcmp('$binlog_file_1', '$binlog_file_0') <> 0 OR $binlog_start_1 <> $binlog_start_0` +if (!$cmp) +{ + --echo *** Error: unexpected advance of binlog position + --die +} + +--echo +--echo # +--echo # MDEV-35499 errored CREATE-OR-REPLACE-SELECT does not DROP table in binlog +--echo # +--let $i = 2 +while ($i) +{ + --let $engine=`select if($i % 2, "myisam", "innodb")` + --echo # + --echo # Engine = $engine + --echo # + set statement binlog_format=statement for create table t (a int) select 1 as a; + --let $binlog_file= 
query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + --error ER_DUP_ENTRY + --eval set statement binlog_format=row for create or replace table t (a int primary key, b char) engine=$engine select 1 AS a, 'b' as b union select 1 as a, 'c' as b + --error ER_NO_SUCH_TABLE + select * from t; + --echo # + --echo # Prove an expected lonely `DROP table t' + --source include/show_binlog_events.inc + + # error before stmt commit + set statement binlog_format=statement for create table t (a int) select 1 as a; + --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + --error ER_TRANS_CACHE_FULL + --eval set statement binlog_format=row for create or replace table t (a text) engine=$engine select repeat ('a',1024) AS a union select repeat ('a',3*4096) AS a union select repeat ('a',3*4096) AS a + --error ER_NO_SUCH_TABLE + select * from t; + --echo # + --echo # Prove an expected lonely `DROP table t' + --source include/show_binlog_events.inc + + # error at stmt commit + set statement binlog_format=statement for create table t (a int) select 1 as a; + --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) + --let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + --error ER_TRANS_CACHE_FULL + --eval set statement binlog_format=row for create or replace table t (a text) engine=$engine select repeat ('a',4096*3) AS a; + --error ER_NO_SUCH_TABLE + select * from t; + --echo # + --echo # Prove an expected lonely `DROP table t' + --source include/show_binlog_events.inc + +--dec $i +} + +# Tests of mixed engines to demonstrate non-transaction table updates +# are binlogged or otherwise MDEV-36027. 
+create table ti_pk (a int primary key) engine=innodb; +create table ta (a int) engine=aria; +delimiter |; +create function f_ia(arg int) +returns integer +begin + insert into ti_pk set a=1; + insert into ta set a=1; + insert into ti_pk set a=arg; + return 1; +end | +delimiter ;| + +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) + +--error ER_DUP_ENTRY +set statement binlog_format = ROW for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +--error ER_NO_SUCH_TABLE +select * from t_y; + +--echo # correct execution: `ta` is modified and its new record is binlogged +--source include/show_binlog_events.inc +select * from ta; +select * from ti_pk; + +--sync_slave_with_master +--let $diff_tables=master:ta,slave:ta +--source include/diff_tables.inc + +--connection master +delete from ta; +--sync_slave_with_master + +--connection master +# MDEV-36027 Errored-out CREATE-SELECT does not binlog results of any function modifying non-transactional table +--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1) +--let $binlog_start = query_get_value(SHOW MASTER STATUS, Position, 1) +--error ER_DUP_ENTRY +set statement binlog_format = STATEMENT for create table t_y (a int) engine=aria select f_ia(1 /* err */) as a; +--error ER_NO_SUCH_TABLE +select * from t_y; + +--echo # ***TODO: fix MDEV-36027***. As of now `ta` is modified but that's not binlogged +--source include/show_binlog_events.inc +select *,'on_master' from ta; +select * from ti_pk; + +--sync_slave_with_master +select *,'on_slave' from ta; + +# Cleanup +--connection master +drop function f_ia; +drop table ti_pk, ta; + +SET @@global.max_binlog_cache_size = @max_binlog_cache_size; +SET @@global. 
binlog_cache_size = @binlog_cache_size; + +# test that binlog replicates correctly to slave +# --connection slave +--sync_slave_with_master + +--echo End of the tests +--source include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash-slave.opt 2025-05-19 16:14:25.000000000 +0000 @@ -1 +1 @@ ---master-retry-count=100 --slave-net-timeout=10 +--master-retry-count=500 --slave-net-timeout=10 diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_gtid_crash.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_gtid_crash.test 2025-05-19 16:14:25.000000000 +0000 @@ -24,6 +24,8 @@ --sync_with_master SET sql_log_bin=0; call mtr.add_suppression('Master command COM_REGISTER_SLAVE failed: failed registering on master, reconnecting to try again'); +call mtr.add_suppression('Slave I/O: .*Lost connection to server during query'); +call mtr.add_suppression("Slave I/O thread couldn't register on master"); SET sql_log_bin=1; --source include/stop_slave.inc --replace_result $MASTER_MYPORT MASTER_PORT diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_heartbeat_basic.test 2025-05-19 16:14:25.000000000 +0000 @@ -9,6 +9,7 @@ # * Various states of master and heartbeat # * Circular replication ############################################################# +--source include/long_test.inc --source include/master-slave.inc # # The test 
runs long and does not have any specifics to diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_master_pos_wait.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_master_pos_wait.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_master_pos_wait.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_master_pos_wait.test 2025-05-19 16:14:25.000000000 +0000 @@ -48,6 +48,7 @@ --echo # Call with a valid connection name -- hangs before MDEV-7130 fix (expected -1) select master_pos_wait('master-bin.000001',1000000,1,"my_slave"); +select master_pos_wait('master-bin.000001',1000000,1,"MY_SLAVE"); STOP SLAVE 'my_slave'; RESET SLAVE 'my_slave' ALL; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_parallel_innodb_lock_conflict.test 2025-05-19 16:14:25.000000000 +0000 @@ -5,21 +5,19 @@ --source include/have_debug_sync.inc --source include/master-slave.inc ---disable_query_log -call mtr.add_suppression("InnoDB: Transaction was aborted due to "); ---enable_query_log +ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; +CALL mtr.add_suppression("InnoDB: Transaction was aborted due to "); +CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); +--save_master_pos --connection server_2 -SET sql_log_bin=0; -CALL mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); -SET sql_log_bin=1; +--sync_with_master SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads; --source include/stop_slave.inc SET GLOBAL slave_parallel_threads=10; CHANGE MASTER TO master_use_gtid=slave_pos; --connection server_1 -ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; CREATE TABLE t4 (a INT PRIMARY 
KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB; INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6); --connect (con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,) diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_row_drop_create_temp_table.test 2025-05-19 16:14:25.000000000 +0000 @@ -3,6 +3,7 @@ # tables. Specifically when drop temporary tables and create temporary tables # are used. ################################################################################### +--source include/long_test.inc --source include/have_binlog_format_row.inc --source include/have_innodb.inc --source include/master-slave.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync.test 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,7 @@ # Please check all dependent tests after modifying it # +source include/long_test.inc; source include/not_embedded.inc; source include/have_innodb.inc; source include/master-slave.inc; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_binlog_format_statement.inc set global rpl_semi_sync_master_wait_point=AFTER_SYNC; source rpl_semi_sync.test; diff -Nru 
mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_after_sync_row.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/long_test.inc --source include/have_binlog_format_row.inc set global rpl_semi_sync_master_wait_point=AFTER_SYNC; source rpl_semi_sync.test; diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_master_disable_with_slave.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,68 @@ +# MDEV-36359: Master crashes when reverting to async after Semi-Sync disabled. 
+# +# Assert behavior of turning Semi-Sync off on +# the master when still connected to a slave + +--source include/have_binlog_format_mixed.inc # format-agnostic + +--echo # Set up Semi-Sync with rpl_semi_sync_master_wait_no_slave=0 +--let $rpl_skip_start_slave= 1 +--source include/master-slave.inc + +--let $orig_master_enabled=`SELECT @@GLOBAL.rpl_semi_sync_master_enabled` +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; +--let $orig_wait_no_slave=`SELECT @@GLOBAL.rpl_semi_sync_master_wait_no_slave` +SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= 0; + +--connection slave +--let $orig_slave_enabled=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled` +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; +--source include/start_slave.inc + +--connection master +# Make sure Semi-Sync is active +--let $status_var= Rpl_semi_sync_master_status +--let $status_var_value= ON +--source include/wait_for_status_var.inc + +--sync_slave_with_master +--connection master + +--disable_cursor_protocol +SELECT ID INTO @binlog_dump_tid + FROM information_schema.PROCESSLIST WHERE COMMAND = 'Binlog Dump'; +--enable_cursor_protocol + +--echo # Control State +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid; +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Disable Semi-Sync while the dump thread is still connected to its slave +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +--let $status_var_value= OFF +--source include/wait_for_status_var.inc + +SELECT STATE FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid; +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Disconnect the slave and wait until the master's dump thread is gone +--connection slave +STOP SLAVE; +# Starting with MDEV-13073, +# Semi-Sync STOP SLAVE also terminates its dump thread on the master. 
+--connection master + +# MDEV-36359: The disconnection would crash the master and leave the wait with +# error 2013 'Lost connection to server during query' +--let $wait_condition= SELECT COUNT(*)=0 FROM information_schema.PROCESSLIST WHERE ID = @binlog_dump_tid +--source include/wait_condition.inc +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Cleanup +--eval SET @@GLOBAL.rpl_semi_sync_master_enabled= $orig_master_enabled +--eval SET @@GLOBAL.rpl_semi_sync_master_wait_no_slave= $orig_wait_no_slave +--connection slave +--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled= $orig_slave_enabled + +--let $rpl_only_running_threads= 1 +--source include/rpl_end.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_semi_sync_ssl_stop.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,100 @@ +# +# This test verifies that semi-sync setups configured to use SSL can kill +# the replication connection when the IO thread is stopped (e.g. from +# STOP SLAVE). The way it should happen, is that the IO thread creates a new +# connection to the primary which issues KILL on the connection id of the +# replication connection. MDEV-36663 reported an issue where this new +# kill-oriented connection could not connect to a primary when it requires +# connections to use SSL. +# +# This test sets up a semi-sync SSL master-slave topology, and stops the +# slave IO thread. It then validates that the connection was killed by using +# the wait_condition.inc utility to wait for the binlog dump thread to die, +# and also validates that the status variable Rpl_semi_sync_master_clients +# reports as 0. 
+# +# References: +# MDEV-36663: Semi-sync Replica Can't Kill Dump Thread When Using SSL +# +--source include/have_binlog_format_mixed.inc # format-agnostic +--source include/have_ssl_communication.inc + +--echo # Skip starting the slave because we manually start with SSL later +--let $rpl_skip_start_slave= 1 +--source include/master-slave.inc + +--echo # +--echo # Setup +--connection master +CREATE USER replssl@localhost; +GRANT REPLICATION SLAVE on *.* to replssl@localhost REQUIRE SSL; + +set @orig_master_enabled= @@GLOBAL.rpl_semi_sync_master_enabled; +SET @@GLOBAL.rpl_semi_sync_master_enabled= 1; + +--connection slave +--replace_result $MYSQL_TEST_DIR MYSQL_TEST_DIR +eval CHANGE MASTER TO + master_user='replssl', + master_password='', + master_ssl=1, + master_ssl_ca='$MYSQL_TEST_DIR/std_data/cacert.pem', + master_ssl_cert='$MYSQL_TEST_DIR/std_data/client-cert.pem', + master_ssl_key='$MYSQL_TEST_DIR/std_data/client-key.pem'; + +set @orig_slave_enabled= @@GLOBAL.rpl_semi_sync_slave_enabled; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 1; + +--source include/start_slave.inc + +--connection master +--echo # Verify Semi-Sync is active +--let $status_var= Rpl_semi_sync_master_clients +--let $status_var_value= 1 +--source include/wait_for_status_var.inc +SHOW STATUS LIKE 'Rpl_semi_sync_master_clients'; + +--echo # Create some table so slave can be seen as up-to-date and working +--connection master +CREATE TABLE t1 (a INT); +--sync_slave_with_master + +--echo # Disconnect the slave and wait until the master's dump thread is gone +--connection slave +STOP SLAVE; +--connection master + +--echo # MDEV-36663: Verifying dump thread connection is killed.. +# Prior to MDEV-36663 fixes, this would time out and +# Rpl_semi_sync_master_clients would remain 1. 
+--let $wait_condition= SELECT COUNT(*)=0 FROM information_schema.PROCESSLIST WHERE USER = 'replssl' +--source include/wait_condition.inc + +--let $n_master_clients= query_get_value(SHOW STATUS LIKE 'Rpl_semi_sync_master_clients', Value, 1) +if ($n_master_clients) +{ + --echo # Rpl_semi_sync_master_clients: $n_master_clients + --die Semi-sync dump thread connection not killed +} +--echo # ..done + +--echo # Cleanup +--connection master +SET @@GLOBAL.rpl_semi_sync_master_enabled= @orig_master_enabled; +DROP USER replssl@localhost; +DROP TABLE t1; + +--connection slave +SET @@GLOBAL.rpl_semi_sync_slave_enabled= @orig_slave_enabled; +CHANGE MASTER TO + master_user='root', + master_ssl=0, + master_ssl_ca='', + master_ssl_cert='', + master_ssl_key=''; + +--connection slave +--source include/start_slave.inc + +--source include/rpl_end.inc +--echo # End of rpl_semi_sync_ssl_stop.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_typeconv.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_typeconv.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_typeconv.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_typeconv.test 2025-05-19 16:14:25.000000000 +0000 @@ -4,6 +4,7 @@ # Please check all dependent tests after modifying it # +--source include/long_test.inc --source include/have_binlog_format_row.inc --source include/master-slave.inc diff -Nru mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test --- mariadb-10.11.11/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/rpl/t/rpl_xa_2pc_multi_engine.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,63 @@ +# +# This test ensures binlog order is correct for multi-engine, two-phase XA +# transactions. 
MDEV-26652 exposed a race condition which would allow +# concurrent transactions which modify the same table record to binlog in +# the "opposite" order, i.e. what _should_ be: +# T1 XA PREPARE +# T1 XA COMMIT +# T2 +# +# was binlogged as +# T1 XA PREPARE +# T2 +# T1 XA COMMIT +# +# which would break replication. +# +# Note that the actual fix for this issue was done with MDEV-21117. +# +# References: +# MDEV-26652: xa transactions binlogged in wrong order +# MDEV-21117: refine the server binlog-based recovery for semisync +# +source include/have_binlog_format_row.inc; +source include/have_innodb.inc; +source include/master-slave.inc; + +--connection master +create table t1 (a int primary key, b int) engine=innodb; +insert t1 values (1,1),(3,3),(5,5),(7,7); +create table t2 (m int) engine=aria; + + +--echo # Create multi-engine, two-phase XA transaction (T1) +xa start '1'; +insert t2 values (1); +update t1 set b=50 where b=5; +xa end '1'; + +# Aria doesn't support XA PREPARE, so disable warnings +--disable_warnings +xa prepare '1'; +--enable_warnings + +--echo # Create T2 +--connection server_1 +--send update t1 set b=10 where a=5 + +--connection master +xa commit '1'; + +--connection server_1 +--reap +--source include/save_master_gtid.inc + +--echo # This would hang prior to MDEV-21117 +--connection slave +--source include/sync_with_master_gtid.inc + +--connection master +drop table t1, t2; + +--source include/rpl_end.inc +--echo # End of rpl_xa_2pc_multi_engine.test diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.opt mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.opt --- mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--innodb-sys-tables diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.result mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.result --- 
mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.result 2025-05-19 16:14:25.000000000 +0000 @@ -166,6 +166,32 @@ select next value for t1; next value for t1 11 +$check_innodb_flags; +is_sequence +12288 +alter table t1 sequence=0; +begin; +delete from t1; +rollback; +$check_innodb_flags; +is_sequence +0 +alter table t1 sequence=1; +$check_innodb_flags; +is_sequence +12288 +alter table t1 sequence=0, algorithm=copy; +$check_innodb_flags; +is_sequence +0 +alter table t1 sequence=1, algorithm=inplace; +ERROR 0A000: ALGORITHM=INPLACE is not supported. Reason: SEQUENCE. Try ALGORITHM=COPY +alter table t1 sequence=1, algorithm=copy; +$check_innodb_flags; +is_sequence +12288 +alter table t1 sequence=0, algorithm=inplace; +ERROR 0A000: ALGORITHM=INPLACE is not supported. Reason: SEQUENCE. Try ALGORITHM=COPY drop sequence t1; # # ALTER TABLE diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.test mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/alter.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/alter.test 2025-05-19 16:14:25.000000000 +0000 @@ -80,6 +80,25 @@ show create sequence t1; select * from t1; select next value for t1; +let $check_innodb_flags = +select flag & 12288 is_sequence from information_schema.innodb_sys_tables +where name='test/t1'; +evalp $check_innodb_flags; +alter table t1 sequence=0; +begin; +delete from t1; +rollback; +evalp $check_innodb_flags; +alter table t1 sequence=1; +evalp $check_innodb_flags; +alter table t1 sequence=0, algorithm=copy; +evalp $check_innodb_flags; +--error ER_ALTER_OPERATION_NOT_SUPPORTED_REASON +alter table t1 sequence=1, algorithm=inplace; +alter table t1 sequence=1, algorithm=copy; +evalp $check_innodb_flags; +--error ER_ALTER_OPERATION_NOT_SUPPORTED_REASON +alter table t1 sequence=0, algorithm=inplace; drop 
sequence t1; --echo # diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.result mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.result 2025-05-19 16:14:25.000000000 +0000 @@ -47,14 +47,57 @@ 11 1 9223372036854775806 1 1 1000 0 0 connection only_alter; select next value for s1; -ERROR 42000: INSERT command denied to user 'only_alter'@'localhost' for table `mysqltest_1`.`s1` +ERROR 42000: SELECT, INSERT command denied to user 'only_alter'@'localhost' for table `mysqltest_1`.`s1` alter sequence s1 restart= 11; select * from s1; ERROR 42000: SELECT command denied to user 'only_alter'@'localhost' for table `mysqltest_1`.`s1` connection default; -drop database mysqltest_1; drop user 'normal'@'%'; drop user 'read_only'@'%'; drop user 'read_write'@'%'; drop user 'alter'@'%'; drop user 'only_alter'@'%'; +drop sequence s1; +# +# MDEV-36413 User without any privileges to a sequence can read from +# it and modify it via column default +# +create sequence s1; +create sequence s2; +select * from s2; +next_not_cached_value minimum_value maximum_value start_value increment cache_size cycle_option cycle_count +1 1 9223372036854775806 1 1 1000 0 0 +create table t2 (a int not null default(nextval(s1))); +insert into t2 values(); +create user u; +grant create, insert, select, drop on mysqltest_1.t1 to u; +grant insert, select on mysqltest_1.s1 to u; +grant select on mysqltest_1.t2 to u; +connect con1,localhost,u,,mysqltest_1; +select nextval(s2); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +show create sequence s2; +ERROR 42000: SHOW command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +create table t1 (a int not null default(nextval(s1))); +drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from t2; +insert into 
t1 values(); +select * from t1; +a +1 +2 +drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from (select t2.a from t2,t2 as t3 where t2.a=t3.a) as t4; +drop table t1; +create table t1 (a int not null default(nextval(s2))); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +create table t1 (a int not null default(nextval(s1)), +b int not null default(nextval(s2))); +ERROR 42000: SELECT, INSERT command denied to user 'u'@'localhost' for table `mysqltest_1`.`s2` +disconnect con1; +connection default; +drop user u; +drop database mysqltest_1; +# +# End of 10.11 tests +# diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.test mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/grant.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/grant.test 2025-05-19 16:14:25.000000000 +0000 @@ -60,10 +60,58 @@ # connection default; -drop database mysqltest_1; drop user 'normal'@'%'; drop user 'read_only'@'%'; drop user 'read_write'@'%'; drop user 'alter'@'%'; drop user 'only_alter'@'%'; +drop sequence s1; + +--echo # +--echo # MDEV-36413 User without any privileges to a sequence can read from +--echo # it and modify it via column default +--echo # + +create sequence s1; +create sequence s2; +select * from s2; +create table t2 (a int not null default(nextval(s1))); +insert into t2 values(); + +create user u; +grant create, insert, select, drop on mysqltest_1.t1 to u; +grant insert, select on mysqltest_1.s1 to u; +grant select on mysqltest_1.t2 to u; + +--connect(con1,localhost,u,,mysqltest_1) +--error ER_TABLEACCESS_DENIED_ERROR +select nextval(s2); +--error ER_TABLEACCESS_DENIED_ERROR +show create sequence s2; + +create table t1 (a int not null default(nextval(s1))); +drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from t2; +insert into t1 values(); +select * from t1; 
+drop table t1; +create table t1 (a int not null default(nextval(s1))) select a from (select t2.a from t2,t2 as t3 where t2.a=t3.a) as t4; +drop table t1; +--error ER_TABLEACCESS_DENIED_ERROR +create table t1 (a int not null default(nextval(s2))); +--error ER_TABLEACCESS_DENIED_ERROR +create table t1 (a int not null default(nextval(s1)), + b int not null default(nextval(s2))); +--disconnect con1 +--connection default +drop user u; + +# +# Cleanup +# + +drop database mysqltest_1; +--echo # +--echo # End of 10.11 tests +--echo # diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/gtid.result mariadb-10.11.13/mysql-test/suite/sql_sequence/gtid.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/gtid.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/gtid.result 2025-05-19 16:14:25.000000000 +0000 @@ -174,7 +174,7 @@ drop sequence s_db.s2; connection m_normal_2; select next value for s_db.s1; -ERROR 42000: INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` +ERROR 42000: SELECT, INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` create sequence s_db.s2; ERROR 42000: CREATE command denied to user 'normal_2'@'localhost' for table `s_db`.`s2` connection m_normal_1; diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/other.result mariadb-10.11.13/mysql-test/suite/sql_sequence/other.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/other.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/other.result 2025-05-19 16:14:25.000000000 +0000 @@ -48,7 +48,6 @@ insert into s1 (next_not_cached_value, minimum_value) values (100,1000); ERROR HY000: Field 'maximum_value' doesn't have a default value insert into s1 values (next value for s1, 1,9223372036854775806,1,1,1000,0,0); -ERROR HY000: Table 's1' is specified twice, both as a target for 'INSERT' and as a separate source for data insert into s1 
values(1000,9223372036854775806,1,1,1,1000,0,0); ERROR HY000: Sequence 'test.s1' has out of range value for options insert into s1 values(0,9223372036854775806,1,1,1,1000,0,0); diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/other.test mariadb-10.11.13/mysql-test/suite/sql_sequence/other.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/other.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/other.test 2025-05-19 16:14:25.000000000 +0000 @@ -38,7 +38,6 @@ create sequence s2; --error ER_NO_DEFAULT_FOR_FIELD insert into s1 (next_not_cached_value, minimum_value) values (100,1000); ---error ER_UPDATE_TABLE_USED insert into s1 values (next value for s1, 1,9223372036854775806,1,1,1000,0,0); --error ER_SEQUENCE_INVALID_DATA insert into s1 values(1000,9223372036854775806,1,1,1,1000,0,0); diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/replication.result mariadb-10.11.13/mysql-test/suite/sql_sequence/replication.result --- mariadb-10.11.11/mysql-test/suite/sql_sequence/replication.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/replication.result 2025-05-19 16:14:25.000000000 +0000 @@ -285,7 +285,7 @@ drop sequence s_db.s2; connection m_normal_2; select NEXT VALUE for s_db.s1; -ERROR 42000: INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` +ERROR 42000: SELECT, INSERT command denied to user 'normal_2'@'localhost' for table `s_db`.`s1` create sequence s_db.s2; ERROR 42000: CREATE command denied to user 'normal_2'@'localhost' for table `s_db`.`s2` connection m_normal_1; diff -Nru mariadb-10.11.11/mysql-test/suite/sql_sequence/view.test mariadb-10.11.13/mysql-test/suite/sql_sequence/view.test --- mariadb-10.11.11/mysql-test/suite/sql_sequence/view.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sql_sequence/view.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,4 @@ --source include/have_sequence.inc ---source 
include/have_innodb.inc # # Test sequences with views diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/innodb_buffer_pool_size_basic.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,16 +1,17 @@ SET @start_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size; -'#---------------------BS_STVARS_022_01----------------------#' -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); -COUNT(@@GLOBAL.innodb_buffer_pool_size) -1 -1 Expected '#---------------------BS_STVARS_022_02----------------------#' -SET @@GLOBAL.innodb_buffer_pool_size=10485760; -Expected succeeded -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); -COUNT(@@GLOBAL.innodb_buffer_pool_size) +SELECT @@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size_max +8388608 +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max +1 +SET GLOBAL innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max + 1048576; +Warnings: +Warning 1292 Truncated incorrect innodb_buffer_pool_size value: '9437184' +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +@@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max 1 -1 Expected '#---------------------BS_STVARS_022_03----------------------#' SELECT @@GLOBAL.innodb_buffer_pool_size = VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES @@ -18,10 +19,6 @@ @@GLOBAL.innodb_buffer_pool_size = VARIABLE_VALUE 1 1 Expected -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); -COUNT(@@GLOBAL.innodb_buffer_pool_size) -1 -1 Expected SELECT COUNT(VARIABLE_VALUE) FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE 
VARIABLE_NAME='innodb_buffer_pool_size'; @@ -50,4 +47,7 @@ 1 Expected SELECT innodb_buffer_pool_size = @@SESSION.innodb_buffer_pool_size; ERROR 42S22: Unknown column 'innodb_buffer_pool_size' in 'SELECT' -# restart +SET GLOBAL innodb_buffer_pool_size = @start_buffer_pool_size; +SELECT @@innodb_buffer_pool_size = @start_buffer_pool_size; +@@innodb_buffer_pool_size = @start_buffer_pool_size +1 diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff 2025-05-19 16:14:25.000000000 +0000 @@ -9,7 +9,7 @@ VARIABLE_COMMENT Number of InnoDB Adaptive Hash Index Partitions (default 8) NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 512 -@@ -71,7 +71,7 @@ +@@ -83,7 +83,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -18,20 +18,20 @@ VARIABLE_COMMENT The AUTOINC lock modes supported by InnoDB: 0 => Old style AUTOINC locking (for backward compatibility); 1 => New style AUTOINC locking; 2 => No AUTOINC locking (unsafe for SBR) NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -83,10 +83,10 @@ +@@ -95,10 +95,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED - VARIABLE_COMMENT Size of a single memory chunk for resizing buffer pool. Online buffer pool resizing happens at this granularity. 0 means autosize this variable based on buffer pool size. 
+ VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 0 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -119,7 +119,7 @@ +@@ -131,7 +131,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 25 VARIABLE_SCOPE GLOBAL @@ -40,7 +40,50 @@ VARIABLE_COMMENT Dump only the hottest N% of each buffer pool, defaults to 25 NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 100 -@@ -203,7 +203,7 @@ +@@ -203,10 +203,10 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 134217728 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT The size of the memory buffer InnoDB uses to cache data and indexes of its tables. + NUMERIC_MIN_VALUE 2097152 +-NUMERIC_MAX_VALUE 18446744073701163008 ++NUMERIC_MAX_VALUE 4292870144 + NUMERIC_BLOCK_SIZE 1048576 + ENUM_VALUE_LIST NULL + READ_ONLY NO +@@ -215,11 +215,11 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 0 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT Minimum innodb_buffer_pool_size for dynamic shrinking on memory pressure + NUMERIC_MIN_VALUE 0 +-NUMERIC_MAX_VALUE 18446744073701163008 +-NUMERIC_BLOCK_SIZE 8388608 ++NUMERIC_MAX_VALUE 4292870144 ++NUMERIC_BLOCK_SIZE 2097152 + ENUM_VALUE_LIST NULL + READ_ONLY NO + COMMAND_LINE_ARGUMENT REQUIRED +@@ -227,11 +227,11 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 0 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT Maximum innodb_buffer_pool_size + NUMERIC_MIN_VALUE 0 +-NUMERIC_MAX_VALUE 18446744073701163008 +-NUMERIC_BLOCK_SIZE 8388608 ++NUMERIC_MAX_VALUE 4292870144 ++NUMERIC_BLOCK_SIZE 2097152 + ENUM_VALUE_LIST NULL + READ_ONLY YES + COMMAND_LINE_ARGUMENT REQUIRED +@@ -239,7 +239,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -49,7 +92,7 @@ VARIABLE_COMMENT A number between [0, 100] that tells how oftern buffer pool dump status in percentages should be printed. E.g. 
10 means that buffer pool dump status is printed when every 10% of number of buffer pool pages are dumped. Default is 0 (only start and end status is printed). NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -323,7 +323,7 @@ +@@ -359,7 +359,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 5 VARIABLE_SCOPE GLOBAL @@ -58,7 +101,7 @@ VARIABLE_COMMENT If the compression failure rate of a table is greater than this number more padding is added to the pages to reduce the failures. A value of zero implies no padding NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -347,7 +347,7 @@ +@@ -383,7 +383,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 50 VARIABLE_SCOPE GLOBAL @@ -67,7 +110,7 @@ VARIABLE_COMMENT Percentage of empty space on a data page that can be reserved to make the page compressible. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 75 -@@ -623,7 +623,7 @@ +@@ -671,7 +671,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 600 VARIABLE_SCOPE GLOBAL @@ -76,7 +119,7 @@ VARIABLE_COMMENT Maximum number of seconds that semaphore times out in InnoDB. NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 4294967295 -@@ -671,7 +671,7 @@ +@@ -719,7 +719,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL @@ -85,7 +128,7 @@ VARIABLE_COMMENT Number of iterations over which the background flushing is averaged. NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 1000 -@@ -695,7 +695,7 @@ +@@ -743,7 +743,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -94,7 +137,7 @@ VARIABLE_COMMENT Controls the durability/speed trade-off for commits. Set to 0 (write and flush redo log to disk only once per second), 1 (flush to disk at each commit), 2 (write to log at commit but flush to disk only once per second) or 3 (flush to disk at prepare and at commit, slower and usually redundant). 1 and 3 guarantees that after a crash, committed transactions will not be lost and will be consistent with the binlog and other transactional engines. 
2 can get inconsistent and lose transactions if there is a power failure or kernel crash but not if mysqld crashes. 0 has no guarantees in case of crash. 0 and 2 can be faster than 1 or 3. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 3 -@@ -719,7 +719,7 @@ +@@ -767,7 +767,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -103,7 +146,7 @@ VARIABLE_COMMENT Set to 0 (don't flush neighbors from buffer pool), 1 (flush contiguous neighbors from buffer pool) or 2 (flush neighbors from buffer pool), when flushing a block NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -755,7 +755,7 @@ +@@ -803,7 +803,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -112,7 +155,7 @@ VARIABLE_COMMENT Helps to save your data in case the disk image of the database becomes corrupt. Value 5 can return bogus data, and 6 can permanently corrupt data. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 6 -@@ -779,10 +779,10 @@ +@@ -827,10 +827,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 8000000 VARIABLE_SCOPE GLOBAL @@ -125,7 +168,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -815,7 +815,7 @@ +@@ -863,7 +863,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 84 VARIABLE_SCOPE GLOBAL @@ -134,7 +177,7 @@ VARIABLE_COMMENT InnoDB Fulltext search maximum token size in characters NUMERIC_MIN_VALUE 10 NUMERIC_MAX_VALUE 84 -@@ -827,7 +827,7 @@ +@@ -875,7 +875,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 3 VARIABLE_SCOPE GLOBAL @@ -143,7 +186,7 @@ VARIABLE_COMMENT InnoDB Fulltext search minimum token size in characters NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 16 -@@ -839,7 +839,7 @@ +@@ -887,7 +887,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000 VARIABLE_SCOPE GLOBAL @@ -152,7 +195,7 @@ VARIABLE_COMMENT InnoDB Fulltext search number of words to optimize for each optimize table call NUMERIC_MIN_VALUE 1000 NUMERIC_MAX_VALUE 10000 -@@ -851,10 +851,10 @@ +@@ -899,10 +899,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000000000 VARIABLE_SCOPE GLOBAL @@ -165,7 +208,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ 
-875,7 +875,7 @@ +@@ -923,7 +923,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2 VARIABLE_SCOPE GLOBAL @@ -174,7 +217,7 @@ VARIABLE_COMMENT InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 16 -@@ -887,10 +887,10 @@ +@@ -935,10 +935,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 640000000 VARIABLE_SCOPE GLOBAL @@ -187,7 +230,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -935,22 +935,22 @@ +@@ -983,7 +983,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 200 VARIABLE_SCOPE GLOBAL @@ -195,27 +238,17 @@ +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Number of IOPs the server can do. Tunes the background IO rate NUMERIC_MIN_VALUE 100 --NUMERIC_MAX_VALUE 18446744073709551615 -+NUMERIC_MAX_VALUE 4294967295 - NUMERIC_BLOCK_SIZE 0 - ENUM_VALUE_LIST NULL - READ_ONLY NO - COMMAND_LINE_ARGUMENT REQUIRED - VARIABLE_NAME INNODB_IO_CAPACITY_MAX + NUMERIC_MAX_VALUE 4294967295 +@@ -995,7 +995,7 @@ SESSION_VALUE NULL --DEFAULT_VALUE 18446744073709551615 -+DEFAULT_VALUE 4294967295 + DEFAULT_VALUE 4294967295 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Limit to which innodb_io_capacity can be inflated. 
NUMERIC_MIN_VALUE 100 --NUMERIC_MAX_VALUE 18446744073709551615 -+NUMERIC_MAX_VALUE 4294967295 - NUMERIC_BLOCK_SIZE 0 - ENUM_VALUE_LIST NULL - READ_ONLY NO -@@ -1043,10 +1043,10 @@ + NUMERIC_MAX_VALUE 4294967295 +@@ -1115,10 +1115,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 32 VARIABLE_SCOPE GLOBAL @@ -228,7 +261,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1055,10 +1055,10 @@ +@@ -1127,10 +1127,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 1536 VARIABLE_SCOPE GLOBAL @@ -241,7 +274,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1091,10 +1091,10 @@ +@@ -1163,10 +1163,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -254,7 +287,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1103,7 +1103,7 @@ +@@ -1175,7 +1175,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -263,7 +296,7 @@ VARIABLE_COMMENT Maximum delay of user threads in micro-seconds NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 10000000 -@@ -1235,10 +1235,10 @@ +@@ -1307,10 +1307,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -276,7 +309,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -1259,7 +1259,7 @@ +@@ -1331,7 +1331,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 16384 VARIABLE_SCOPE GLOBAL @@ -285,16 +318,16 @@ VARIABLE_COMMENT Page size to use for all InnoDB tablespaces. NUMERIC_MIN_VALUE 4096 NUMERIC_MAX_VALUE 65536 -@@ -1295,7 +1295,7 @@ +@@ -1367,7 +1367,7 @@ SESSION_VALUE NULL - DEFAULT_VALUE 1000 + DEFAULT_VALUE 127 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Number of UNDO log pages to purge in one batch from the history list. 
NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 5000 -@@ -1307,7 +1307,7 @@ +@@ -1379,7 +1379,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 128 VARIABLE_SCOPE GLOBAL @@ -303,7 +336,7 @@ VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 128 -@@ -1343,7 +1343,7 @@ +@@ -1415,7 +1415,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 56 VARIABLE_SCOPE GLOBAL @@ -312,7 +345,7 @@ VARIABLE_COMMENT Number of pages that must be accessed sequentially for InnoDB to trigger a readahead. NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 64 -@@ -1427,7 +1427,7 @@ +@@ -1499,7 +1499,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1048576 VARIABLE_SCOPE GLOBAL @@ -321,7 +354,7 @@ VARIABLE_COMMENT Memory buffer size for index creation NUMERIC_MIN_VALUE 65536 NUMERIC_MAX_VALUE 67108864 -@@ -1595,10 +1595,10 @@ +@@ -1667,10 +1667,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_innodb.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_innodb.result 2025-05-19 16:14:25.000000000 +0000 @@ -96,7 +96,7 @@ DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT Size of a single memory chunk for resizing buffer pool. Online buffer pool resizing happens at this granularity. 0 means autosize this variable based on buffer pool size. +VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 18446744073709551615 NUMERIC_BLOCK_SIZE 1048576 @@ -206,11 +206,35 @@ VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT The size of the memory buffer InnoDB uses to cache data and indexes of its tables. 
NUMERIC_MIN_VALUE 2097152 -NUMERIC_MAX_VALUE 9223372036854775807 +NUMERIC_MAX_VALUE 18446744073701163008 NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_BUFFER_POOL_SIZE_AUTO_MIN +SESSION_VALUE NULL +DEFAULT_VALUE 0 +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT Minimum innodb_buffer_pool_size for dynamic shrinking on memory pressure +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 18446744073701163008 +NUMERIC_BLOCK_SIZE 8388608 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_BUFFER_POOL_SIZE_MAX +SESSION_VALUE NULL +DEFAULT_VALUE 0 +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT Maximum innodb_buffer_pool_size +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 18446744073701163008 +NUMERIC_BLOCK_SIZE 8388608 +ENUM_VALUE_LIST NULL +READ_ONLY YES +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_BUF_DUMP_STATUS_FREQUENCY SESSION_VALUE NULL DEFAULT_VALUE 0 @@ -962,19 +986,19 @@ VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT Number of IOPs the server can do. Tunes the background IO rate NUMERIC_MIN_VALUE 100 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_IO_CAPACITY_MAX SESSION_VALUE NULL -DEFAULT_VALUE 18446744073709551615 +DEFAULT_VALUE 4294967295 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BIGINT UNSIGNED VARIABLE_COMMENT Limit to which innodb_io_capacity can be inflated. 
NUMERIC_MIN_VALUE 100 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO @@ -1020,7 +1044,7 @@ DEFAULT_VALUE OFF VARIABLE_SCOPE GLOBAL VARIABLE_TYPE BOOLEAN -VARIABLE_COMMENT Force checkpoint now +VARIABLE_COMMENT Write back dirty pages from the buffer pool and update the log checkpoint NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL @@ -1068,7 +1092,7 @@ DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL VARIABLE_TYPE INT UNSIGNED -VARIABLE_COMMENT Delay between log buffer spin lock polls (0 to use a blocking latch) +VARIABLE_COMMENT Deprecated parameter with no effect NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 6000 NUMERIC_BLOCK_SIZE 0 @@ -1571,10 +1595,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 20 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT The number of leaf index pages to sample when calculating persistent statistics (by ANALYZE, default 20) NUMERIC_MIN_VALUE 1 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO @@ -1595,10 +1619,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 8 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT The number of leaf index pages to sample when calculating transient statistics (if persistent statistics are not used, default 8) NUMERIC_MIN_VALUE 1 -NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_MAX_VALUE 4294967295 NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result 2025-05-19 16:14:25.000000000 +0000 @@ -2325,11 +2325,11 @@ VARIABLE_NAME 
OPTIMIZER_ADJUST_SECONDARY_KEY_COSTS VARIABLE_SCOPE SESSION VARIABLE_TYPE SET -VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. selectivity_multiplier. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. +VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. fix_derived_table_read_cost = Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. 
NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier +ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier,fix_derived_table_read_cost READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME OPTIMIZER_EXTRA_PRUNING_DEPTH diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result 2025-05-19 16:14:25.000000000 +0000 @@ -2495,11 +2495,11 @@ VARIABLE_NAME OPTIMIZER_ADJUST_SECONDARY_KEY_COSTS VARIABLE_SCOPE SESSION VARIABLE_TYPE SET -VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. selectivity_multiplier. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. +VARIABLE_COMMENT A bit field with the following values: adjust_secondary_key_cost = Update secondary key costs for ranges to be at least 5x of clustered primary key costs. 
disable_max_seek = Disable 'max_seek optimization' for secondary keys and slight adjustment of filter cost. disable_forced_index_in_group_by = Disable automatic forced index in GROUP BY. fix_innodb_cardinality = Disable doubling of the Cardinality for InnoDB secondary keys. fix_reuse_range_for_ref = Do a better job at reusing range access estimates when estimating ref access. fix_card_multiplier = Fix the computation in selectivity_for_indexes. fix_derived_table_read_cost = Fix the cost of reading materialized derived table. This variable will be deleted in MariaDB 11.0 as it is not needed with the new 11.0 optimizer. NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier +ENUM_VALUE_LIST adjust_secondary_key_cost,disable_max_seek,disable_forced_index_in_group_by,fix_innodb_cardinality,fix_reuse_range_for_ref,fix_card_multiplier,fix_derived_table_read_cost READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME OPTIMIZER_EXTRA_PRUNING_DEPTH diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_forced_binlog_format_basic.result 1970-01-01 00:00:00.000000000 +0000 @@ -1,51 +0,0 @@ -# -# wsrep_forced_binlog_format -# -# save the initial value -SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; -# default -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -NONE - -# scope -SELECT @@session.wsrep_forced_binlog_format; -ERROR HY000: Variable 'wsrep_forced_binlog_format' is a GLOBAL variable -SET 
@@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -STATEMENT - -# valid values -SET @@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -STATEMENT -SET @@global.wsrep_forced_binlog_format=ROW; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -ROW -SET @@global.wsrep_forced_binlog_format=MIXED; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -MIXED -SET @@global.wsrep_forced_binlog_format=NONE; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -NONE -SET @@global.wsrep_forced_binlog_format=default; -SELECT @@global.wsrep_forced_binlog_format; -@@global.wsrep_forced_binlog_format -NONE - -# invalid values -SET @@global.wsrep_forced_binlog_format=NULL; -ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'NULL' -SET @@global.wsrep_forced_binlog_format='junk'; -ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'junk' -SET @@global.wsrep_forced_binlog_format=ON; -ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'ON' - -# restore the initial value -SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; -# End of test diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result --- mariadb-10.11.11/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/r/wsrep_replicate_myisam_basic.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,15 @@ +# +# wsrep_replicate_myisam +# +# save the initial value +SET @wsrep_mode_saved = @@global.wsrep_mode; + +# scope and valid values +SET 
@@global.wsrep_mode=REPLICATE_MYISAM; +SELECT @@global.wsrep_mode; +@@global.wsrep_mode +REPLICATE_MYISAM + +# restore the initial value +SET @@global.wsrep_mode = @wsrep_mode_saved; +# End of test diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic-master.opt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ ---innodb-buffer-pool-chunk-size=2M diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--innodb-buffer-pool-size-max=8m diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/innodb_buffer_pool_size_basic.test 2025-05-19 16:14:25.000000000 +0000 @@ -24,35 +24,19 @@ --source include/have_innodb.inc -let $wait_condition = - SELECT SUBSTR(variable_value, 1, 30) = 'Completed resizing buffer pool' - FROM information_schema.global_status - WHERE LOWER(variable_name) = 'innodb_buffer_pool_resize_status'; - SET @start_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size; ---echo '#---------------------BS_STVARS_022_01----------------------#' 
-#################################################################### -# Displaying default value # -#################################################################### -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); ---echo 1 Expected - - --echo '#---------------------BS_STVARS_022_02----------------------#' #################################################################### # Check if Value can set # #################################################################### -SET @@GLOBAL.innodb_buffer_pool_size=10485760; ---echo Expected succeeded ---source include/wait_condition.inc - -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); ---echo 1 Expected - - - +--enable_warnings +SELECT @@GLOBAL.innodb_buffer_pool_size_max; +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +SET GLOBAL innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max + 1048576; +SELECT @@GLOBAL.innodb_buffer_pool_size = @@GLOBAL.innodb_buffer_pool_size_max; +--disable_warnings --echo '#---------------------BS_STVARS_022_03----------------------#' ################################################################# @@ -66,9 +50,6 @@ --enable_warnings --echo 1 Expected -SELECT COUNT(@@GLOBAL.innodb_buffer_pool_size); ---echo 1 Expected - --disable_warnings SELECT COUNT(VARIABLE_VALUE) FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES @@ -76,8 +57,6 @@ --enable_warnings --echo 1 Expected - - --echo '#---------------------BS_STVARS_022_04----------------------#' ################################################################################ # Check if accessing variable with and without GLOBAL point to same variable # @@ -111,4 +90,6 @@ # Restore the original buffer pool size. 
---source include/restart_mysqld.inc +SET GLOBAL innodb_buffer_pool_size = @start_buffer_pool_size; + +SELECT @@innodb_buffer_pool_size = @start_buffer_pool_size; diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.opt mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.opt --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.opt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.opt 2025-05-19 16:14:25.000000000 +0000 @@ -1,2 +1,4 @@ ---loose-innodb-flush-log-at-timeout=3 +--innodb +--innodb-purge-rseg-truncate-frequency=64 +--innodb-flush-log-at-timeout=3 --table_open_cache=200 diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/sysvars_innodb.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/sysvars_innodb.test 2025-05-19 16:14:25.000000000 +0000 @@ -3,6 +3,10 @@ --source include/not_valgrind.inc --source include/word_size.inc +--disable_query_log +call mtr.add_suppression("'innodb-purge-rseg-truncate-frequency' was removed"); +--enable_query_log + --vertical_results --replace_regex /^\/\S+/PATH/ /\.\//PATH/ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYPE, VARIABLE_COMMENT, NUMERIC_MIN_VALUE, NUMERIC_MAX_VALUE, NUMERIC_BLOCK_SIZE, ENUM_VALUE_LIST, READ_ONLY, COMMAND_LINE_ARGUMENT from information_schema.system_variables diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_forced_binlog_format_basic.test 1970-01-01 00:00:00.000000000 +0000 @@ -1,46 +0,0 @@ ---source 
include/have_wsrep.inc - ---echo # ---echo # wsrep_forced_binlog_format ---echo # - ---echo # save the initial value -SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; - ---echo # default -SELECT @@global.wsrep_forced_binlog_format; - ---echo ---echo # scope ---error ER_INCORRECT_GLOBAL_LOCAL_VAR -SELECT @@session.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; - ---echo ---echo # valid values -SET @@global.wsrep_forced_binlog_format=STATEMENT; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=ROW; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=MIXED; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=NONE; -SELECT @@global.wsrep_forced_binlog_format; -SET @@global.wsrep_forced_binlog_format=default; -SELECT @@global.wsrep_forced_binlog_format; - ---echo ---echo # invalid values ---error ER_WRONG_VALUE_FOR_VAR -SET @@global.wsrep_forced_binlog_format=NULL; ---error ER_WRONG_VALUE_FOR_VAR -SET @@global.wsrep_forced_binlog_format='junk'; ---error ER_WRONG_VALUE_FOR_VAR -SET @@global.wsrep_forced_binlog_format=ON; - ---echo ---echo # restore the initial value -SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; - ---echo # End of test diff -Nru mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test --- mariadb-10.11.11/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/sys_vars/t/wsrep_replicate_myisam_basic.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,19 @@ +--source include/have_wsrep.inc + +--echo # +--echo # wsrep_replicate_myisam +--echo # + +--echo # save the initial value +SET @wsrep_mode_saved = @@global.wsrep_mode; + 
+--echo +--echo # scope and valid values +SET @@global.wsrep_mode=REPLICATE_MYISAM; +SELECT @@global.wsrep_mode; + +--echo +--echo # restore the initial value +SET @@global.wsrep_mode = @wsrep_mode_saved; + +--echo # End of test diff -Nru mariadb-10.11.11/mysql-test/suite/versioning/r/partition.result mariadb-10.11.13/mysql-test/suite/versioning/r/partition.result --- mariadb-10.11.11/mysql-test/suite/versioning/r/partition.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/versioning/r/partition.result 2025-05-19 16:14:25.000000000 +0000 @@ -3445,6 +3445,20 @@ create table t (a int) with system versioning partition by system_time partitions 3; ERROR HY000: Maybe missing parameters: no rotation condition for multiple HISTORY partitions. # +# MDEV-36115 InnoDB: assertion: node->pcur->rel_pos == BTR_PCUR_ON +# in row_update_for_mysql +# +create table t (a int key) engine=innodb +with system versioning +partition by key() partitions 3; +start transaction; +insert into t values (1),(2),(3),(4),(5),(6),(7),(8); +set timestamp=+1; +delete from t; +insert into t values (1),(2); +DELETE from t; +drop table t; +# # End of 10.5 tests # # @@ -3470,4 +3484,25 @@ # # End of 10.9 tests # +# +# MDEV-34775 Wrong reopen of already open routine due to auto-create in SP +# +create table t (a int) with system versioning +partition by system_time +interval 1 minute auto; +create function f() +returns int +begin +replace into t select * from t; +return 0; +end $ +set timestamp= @@timestamp + 61; +select f(); +f() +0 +drop table t; +drop function f; +# +# End of 10.11 tests +# set global innodb_stats_persistent= @save_persistent; diff -Nru mariadb-10.11.11/mysql-test/suite/versioning/t/partition.test mariadb-10.11.13/mysql-test/suite/versioning/t/partition.test --- mariadb-10.11.11/mysql-test/suite/versioning/t/partition.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/versioning/t/partition.test 2025-05-19 16:14:25.000000000 
+0000 @@ -2676,6 +2676,22 @@ create table t (a int) with system versioning partition by system_time partitions 3; --echo # +--echo # MDEV-36115 InnoDB: assertion: node->pcur->rel_pos == BTR_PCUR_ON +--echo # in row_update_for_mysql +--echo # +create table t (a int key) engine=innodb +with system versioning +partition by key() partitions 3; + +start transaction; +insert into t values (1),(2),(3),(4),(5),(6),(7),(8); +set timestamp=+1; +delete from t; +insert into t values (1),(2); +DELETE from t; +drop table t; + +--echo # --echo # End of 10.5 tests --echo # @@ -2717,5 +2733,32 @@ --echo # End of 10.9 tests --echo # +--echo # +--echo # MDEV-34775 Wrong reopen of already open routine due to auto-create in SP +--echo # + +create table t (a int) with system versioning +partition by system_time +interval 1 minute auto; + +--delimiter $ +create function f() +returns int +begin + replace into t select * from t; + return 0; +end $ +--delimiter ; + +set timestamp= @@timestamp + 61; +select f(); + +drop table t; +drop function f; + +--echo # +--echo # End of 10.11 tests +--echo # + set global innodb_stats_persistent= @save_persistent; --source suite/versioning/common_finish.inc diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/README mariadb-10.11.13/mysql-test/suite/wsrep/README --- mariadb-10.11.11/mysql-test/suite/wsrep/README 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/README 2025-05-19 16:14:25.000000000 +0000 @@ -4,4 +4,3 @@ * As these tests are specific to wsrep-related functionalities, they must skip on server built without wsrep patch (vanilla). 
(-DWITH_WSREP=OFF) See : include/have_wsrep.inc, include/have_wsrep_enabled.inc, not_wsrep.inc - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/include/check_galera_version.inc mariadb-10.11.13/mysql-test/suite/wsrep/include/check_galera_version.inc --- mariadb-10.11.11/mysql-test/suite/wsrep/include/check_galera_version.inc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/include/check_galera_version.inc 2025-05-19 16:14:25.000000000 +0000 @@ -44,4 +44,3 @@ } --echo # Correct Galera library found - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/plugin.result mariadb-10.11.13/mysql-test/suite/wsrep/r/plugin.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/plugin.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/plugin.result 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,3 @@ -SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins where plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; +SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins WHERE plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; plugin_name plugin_version plugin_maturity wsrep 1.0 Stable diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid-nobinlog.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,18 @@ +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; +# Case 1: Server goes through graceful shutdown and is restarted +connection default; +INSERT INTO t1 VALUES (1); +Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... 
+Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +SELECT * FROM t1; +f1 +1 +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-gtid.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,65 @@ +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; +# Case 1: Server goes through graceful shutdown and is restarted +connection default; +INSERT INTO t1 VALUES (1); +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... +Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +SELECT @@GLOBAL.gtid_binlog_pos; +@@GLOBAL.gtid_binlog_pos +100-10-2 +SELECT * FROM t1; +f1 +1 +# Case 2: Server is killed after the transaction gets prepared +# but before it is written into binlog. As there is not GTID assigned, +# the transaction must be rolled back during recovery. +connect con, localhost, root; +SET DEBUG_SYNC = "ha_commit_trans_after_prepare SIGNAL reached WAIT_FOR continue"; +INSERT INTO t1 VALUES (2); +connection default; +SET DEBUG_SYNC = "now WAIT_FOR reached"; +# Kill the server +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... +Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-2 +SELECT @@GLOBAL.gtid_binlog_pos; +@@GLOBAL.gtid_binlog_pos +100-10-2 +Expect 1 +SELECT * FROM t1; +f1 +1 +disconnect con; +# Case 3: Server is killed after the transaction gets written into binlog +# but before it is committed in storage engine. In this case the +# transaction must be committed during recovery as it had a valid +# GTID assigned. 
+connect con, localhost, root; +SET DEBUG_SYNC = "commit_before_get_LOCK_commit_ordered SIGNAL reached WAIT_FOR continue"; +INSERT INTO t1 VALUES (3); +connection default; +SET DEBUG_SYNC = "now WAIT_FOR reached"; +# Kill the server +Performing --wsrep-recover ... +Using --wsrep-start-position when starting mysqld ... +Expect 100-10-3 +SELECT WSREP_LAST_SEEN_GTID(); +WSREP_LAST_SEEN_GTID() +100-10-3 +SELECT @@GLOBAL.gtid_binlog_pos; +@@GLOBAL.gtid_binlog_pos +100-10-3 +Expect 1 3 +SELECT * FROM t1; +f1 +1 +3 +disconnect con; +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep-recover-v25,binlogon.rdiff 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,5 @@ --- r/wsrep-recover-v25.result -+++ r/wsrep-recover-v25.reject ++++ r/wsrep-recover-v25,binlogoin.reject @@ -12,4 +12,16 @@ SELECT VARIABLE_VALUE `expect 6` FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_last_committed'; expect 6 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_forced_binlog_format.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,51 @@ +# +# wsrep_forced_binlog_format +# +# save the initial value +SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; +# default +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +NONE + +# scope +SELECT @@session.wsrep_forced_binlog_format; +ERROR HY000: Variable 'wsrep_forced_binlog_format' is a GLOBAL variable +SET 
@@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +STATEMENT + +# valid values +SET @@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +STATEMENT +SET @@global.wsrep_forced_binlog_format=ROW; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +ROW +SET @@global.wsrep_forced_binlog_format=MIXED; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +MIXED +SET @@global.wsrep_forced_binlog_format=NONE; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +NONE +SET @@global.wsrep_forced_binlog_format=default; +SELECT @@global.wsrep_forced_binlog_format; +@@global.wsrep_forced_binlog_format +NONE + +# invalid values +SET @@global.wsrep_forced_binlog_format=NULL; +ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'NULL' +SET @@global.wsrep_forced_binlog_format='junk'; +ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'junk' +SET @@global.wsrep_forced_binlog_format=ON; +ERROR 42000: Variable 'wsrep_forced_binlog_format' can't be set to the value of 'ON' + +# restore the initial value +SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; +# End of test diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result --- mariadb-10.11.11/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/r/wsrep_mixed_case_cmd_arg.result 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,8 @@ +# +# MDEV-27126: my_getopt compares option names case sensitively +# +# Check if the variable is set correctly from options +SELECT @@GLOBAL.wsrep_slave_uk_checks; +@@GLOBAL.wsrep_slave_uk_checks 
+1 +# End of test. diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/suite.pm mariadb-10.11.13/mysql-test/suite/wsrep/suite.pm --- mariadb-10.11.11/mysql-test/suite/wsrep/suite.pm 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/suite.pm 2025-05-19 16:14:25.000000000 +0000 @@ -9,9 +9,9 @@ push @::global_suppressions, ( - qr(WSREP: Could not open saved state file for reading: .*), - qr(WSREP: Could not open state file for reading: .*), - qr|WSREP: access file\(.*gvwstate.dat\) failed\(No such file or directory\)|, + qr(WSREP: Could not open saved state file for reading: ), + qr(WSREP: Could not open state file for reading: ), + qr|WSREP: access file\(.*gvwstate.dat\) failed ?\(No such file or directory\)|, ); bless { }; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/binlog_format.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/binlog_format.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/binlog_format.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/binlog_format.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -5,4 +5,3 @@ wsrep-provider=@ENV.WSREP_PROVIDER wsrep-cluster-address=gcomm:// innodb_autoinc_lock_mode=2 - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/foreign_key.test mariadb-10.11.13/mysql-test/suite/wsrep/t/foreign_key.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/foreign_key.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/foreign_key.test 2025-05-19 16:14:25.000000000 +0000 @@ -17,4 +17,3 @@ # Cleanup DROP TABLE c; DROP TABLE p; - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_10186.test mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_10186.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_10186.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_10186.test 2025-05-19 16:14:25.000000000 +0000 @@ -9,4 +9,3 @@ SELECT @@wsrep_on; SET @@GLOBAL.wsrep_cluster_address='gcomm://'; - diff 
-Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_7798.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_7798.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/mdev_7798.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/mdev_7798.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -4,4 +4,3 @@ wsrep-on=ON wsrep-provider=@ENV.WSREP_PROVIDER wsrep-cluster-address=gcomm:// - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/plugin.test mariadb-10.11.13/mysql-test/suite/wsrep/t/plugin.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/plugin.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/plugin.test 2025-05-19 16:14:25.000000000 +0000 @@ -5,4 +5,4 @@ # MDEV-7604: wsrep plugin lists its status as Unknown # -SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins where plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; \ No newline at end of file +SELECT plugin_name,plugin_version,plugin_maturity FROM information_schema.plugins WHERE plugin_name like 'wsrep' ORDER BY plugin_maturity,plugin_name; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/pool_of_threads.test mariadb-10.11.13/mysql-test/suite/wsrep/t/pool_of_threads.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/pool_of_threads.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/pool_of_threads.test 2025-05-19 16:14:25.000000000 +0000 @@ -1,3 +1,4 @@ +--source include/have_innodb.inc --source include/have_wsrep_enabled.inc --source include/have_binlog_format_row.inc diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/variables.test mariadb-10.11.13/mysql-test/suite/wsrep/t/variables.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/variables.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/variables.test 2025-05-19 16:14:25.000000000 +0000 @@ -23,4 +23,3 @@ --echo # variables SELECT VARIABLE_NAME FROM 
INFORMATION_SCHEMA.SESSION_VARIABLES WHERE VARIABLE_NAME LIKE "wsrep%" ORDER BY VARIABLE_NAME; - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/variables_debug.test mariadb-10.11.13/mysql-test/suite/wsrep/t/variables_debug.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/variables_debug.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/variables_debug.test 2025-05-19 16:14:25.000000000 +0000 @@ -8,7 +8,7 @@ --let $galera_version=26.4.21 source include/check_galera_version.inc; -source include/galera_variables_ok.inc; +source include/galera_variables_ok_debug.inc; --replace_column 2 # SHOW GLOBAL STATUS LIKE 'wsrep%'; @@ -25,4 +25,3 @@ --echo # variables SELECT VARIABLE_NAME FROM INFORMATION_SCHEMA.SESSION_VARIABLES WHERE VARIABLE_NAME LIKE "wsrep%" ORDER BY VARIABLE_NAME; - diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,10 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-provider=@ENV.WSREP_PROVIDER +wsrep-cluster-address=gcomm:// +binlog-format=ROW +wsrep-gtid-domain-id=100 +server-id=10 +innodb-autoinc-lock-mode=2 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid-nobinlog.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,28 @@ +# Test wsrep GTID recovery with binlog off. 
The test restarts the server +# and verifies that the GTID returned by SELECT WSREP_LAST_SEEN_GTID() +# gets initialized properly during server restart. +# +--source include/have_wsrep.inc +--source include/have_wsrep_provider.inc +--source include/have_innodb.inc +--source include/have_debug_sync.inc + +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; + +--echo # Case 1: Server goes through graceful shutdown and is restarted +--connection default +INSERT INTO t1 VALUES (1); + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); + +--source include/shutdown_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT * FROM t1; + +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,14 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-provider=@ENV.WSREP_PROVIDER +wsrep-cluster-address=gcomm:// +binlog-format=ROW +log-bin +log-slave-updates +gtid-domain-id=10 +gtid-strict-mode=ON +wsrep-gtid-mode=ON +wsrep-gtid-domain-id=100 +server-id=10 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover-gtid.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,73 @@ +# Test wsrep recovery with gtid_mode=ON. 
The test crashes the server +# in different commit stages and verifies that the GTID returned by +# SELECT WSREP_LAST_SEEN_GTID() and @@GLOBAL.gtid_binlog_pos get +# initialized properly during server restart. +# +--source include/have_wsrep.inc +--source include/have_wsrep_provider.inc +--source include/have_innodb.inc +--source include/have_log_bin.inc +--source include/have_debug_sync.inc + +CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB; + +--echo # Case 1: Server goes through graceful shutdown and is restarted +--connection default +INSERT INTO t1 VALUES (1); +--source include/shutdown_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT @@GLOBAL.gtid_binlog_pos; +SELECT * FROM t1; + +--echo # Case 2: Server is killed after the transaction gets prepared +--echo # but before it is written into binlog. As there is not GTID assigned, +--echo # the transaction must be rolled back during recovery. +--connect con, localhost, root +SET DEBUG_SYNC = "ha_commit_trans_after_prepare SIGNAL reached WAIT_FOR continue"; +--send INSERT INTO t1 VALUES (2) + +--connection default +SET DEBUG_SYNC = "now WAIT_FOR reached"; +--source include/kill_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc +--source include/wait_wsrep_ready.inc + +--echo Expect 100-10-2 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT @@GLOBAL.gtid_binlog_pos; +--echo Expect 1 +SELECT * FROM t1; +--disconnect con + +--echo # Case 3: Server is killed after the transaction gets written into binlog +--echo # but before it is committed in storage engine. In this case the +--echo # transaction must be committed during recovery as it had a valid +--echo # GTID assigned. 
+ +--connect con, localhost, root +SET DEBUG_SYNC = "commit_before_get_LOCK_commit_ordered SIGNAL reached WAIT_FOR continue"; +--send INSERT INTO t1 VALUES (3) + +--connection default +SET DEBUG_SYNC = "now WAIT_FOR reached"; +--source include/kill_mysqld.inc +--let $galera_wsrep_recover_server_id = 1 +--source suite/galera/include/galera_wsrep_recover.inc +--source suite/galera/include/start_mysqld.inc +--source include/wait_wsrep_ready.inc +--echo Expect 100-10-3 +SELECT WSREP_LAST_SEEN_GTID(); +SELECT @@GLOBAL.gtid_binlog_pos; +--echo Expect 1 3 +SELECT * FROM t1; + +--disconnect con + +DROP TABLE t1; diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep-recover.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep-recover.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -6,4 +6,4 @@ innodb-flush-log-at-trx-commit=1 wsrep-cluster-address=gcomm:// wsrep-provider=@ENV.WSREP_PROVIDER -innodb-autoinc-lock-mode=2 \ No newline at end of file +innodb-autoinc-lock-mode=2 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,7 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-cluster-address=gcomm:// +wsrep-provider=@ENV.WSREP_PROVIDER +binlog-format=ROW diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_forced_binlog_format.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,48 @@ +--source include/have_innodb.inc +--source include/have_wsrep_provider.inc +--source include/have_binlog_format_row.inc + +--echo # +--echo # wsrep_forced_binlog_format +--echo # + +--echo # save the initial value +SET @wsrep_forced_binlog_format_global_saved = @@global.wsrep_forced_binlog_format; + +--echo # default +SELECT @@global.wsrep_forced_binlog_format; + +--echo +--echo # scope +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +SELECT @@session.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; + +--echo +--echo # valid values +SET @@global.wsrep_forced_binlog_format=STATEMENT; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=ROW; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=MIXED; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=NONE; +SELECT @@global.wsrep_forced_binlog_format; +SET @@global.wsrep_forced_binlog_format=default; +SELECT @@global.wsrep_forced_binlog_format; + +--echo +--echo # invalid values +--error ER_WRONG_VALUE_FOR_VAR +SET @@global.wsrep_forced_binlog_format=NULL; +--error ER_WRONG_VALUE_FOR_VAR +SET @@global.wsrep_forced_binlog_format='junk'; +--error ER_WRONG_VALUE_FOR_VAR +SET @@global.wsrep_forced_binlog_format=ON; + +--echo +--echo # restore the initial value +SET @@global.wsrep_forced_binlog_format = @wsrep_forced_binlog_format_global_saved; + +--echo # End of test diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.cnf 2025-05-19 16:14:25.000000000 
+0000 @@ -0,0 +1,6 @@ +!include ../my.cnf + +[mysqld.1] +wsrep-on=ON +wsrep-provider=@ENV.WSREP_PROVIDER +wsrep-cluster-address=gcomm:// diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.opt 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1 @@ +--wsrep-slave-uk-checks=1 diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_mixed_case_cmd_arg.test 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,11 @@ +--source include/have_innodb.inc +--source include/have_wsrep_provider.inc +--source include/have_binlog_format_row.inc +--echo # +--echo # MDEV-27126: my_getopt compares option names case sensitively +--echo # + +--echo # Check if the variable is set correctly from options +SELECT @@GLOBAL.wsrep_slave_uk_checks; + +--echo # End of test. diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_rpl.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_rpl.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_rpl.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_rpl.test 2025-05-19 16:14:25.000000000 +0000 @@ -41,4 +41,3 @@ --source include/rpl_end.inc --echo # End of test. 
- diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_sst_method.test 2025-05-19 16:14:25.000000000 +0000 @@ -44,7 +44,6 @@ SELECT @@global.wsrep_sst_method; SHOW WARNINGS; - --disable_query_log SET @@global.wsrep_sst_method = @wsrep_sst_method_saved; --enable_query_log diff -Nru mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf --- mariadb-10.11.11/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysql-test/suite/wsrep/t/wsrep_variables_wsrep_off.cnf 2025-05-19 16:14:25.000000000 +0000 @@ -9,4 +9,3 @@ #galera_port=@OPT.port #ist_port=@OPT.port #sst_port=@OPT.port - diff -Nru mariadb-10.11.11/mysys/CMakeLists.txt mariadb-10.11.13/mysys/CMakeLists.txt --- mariadb-10.11.11/mysys/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -46,7 +46,8 @@ my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c my_rdtsc.c psi_noop.c my_atomic_writes.c my_cpu.c my_likely.c my_largepage.c - file_logger.c my_dlerror.c crc32/crc32c.cc) + file_logger.c my_dlerror.c crc32/crc32c.cc + my_virtual_mem.c) IF (WIN32) SET (MYSYS_SOURCES ${MYSYS_SOURCES} @@ -170,7 +171,7 @@ ENDIF(HAVE_BFD_H) IF (WIN32) - TARGET_LINK_LIBRARIES(mysys iphlpapi dbghelp) + TARGET_LINK_LIBRARIES(mysys iphlpapi dbghelp ws2_32 synchronization) ENDIF(WIN32) # Need explicit pthread for gcc -fsanitize=address diff -Nru mariadb-10.11.11/mysys/mf_keycache.c mariadb-10.11.13/mysys/mf_keycache.c --- mariadb-10.11.11/mysys/mf_keycache.c 2025-01-30 11:01:24.000000000 +0000 +++ 
mariadb-10.11.13/mysys/mf_keycache.c 2025-05-19 16:14:25.000000000 +0000 @@ -3762,10 +3762,11 @@ static int cmp_sec_link(const void *_a, const void *_b) { - BLOCK_LINK *const *a= _a; - BLOCK_LINK *const *b= _b; - return (((*a)->hash_link->diskpos < (*b)->hash_link->diskpos) ? -1 : - ((*a)->hash_link->diskpos > (*b)->hash_link->diskpos) ? 1 : 0); + const BLOCK_LINK *a= *(const BLOCK_LINK **)_a; + const BLOCK_LINK *b= *(const BLOCK_LINK **)_b; + + return (a->hash_link->diskpos < b->hash_link->diskpos) ? -1 : + (a->hash_link->diskpos > b->hash_link->diskpos) ? 1 : 0; } diff -Nru mariadb-10.11.11/mysys/my_default.c mariadb-10.11.13/mysys/my_default.c --- mariadb-10.11.11/mysys/my_default.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_default.c 2025-05-19 16:14:25.000000000 +0000 @@ -318,6 +318,9 @@ } if (! my_defaults_group_suffix) + my_defaults_group_suffix= getenv("MARIADB_GROUP_SUFFIX"); + + if (! my_defaults_group_suffix) my_defaults_group_suffix= getenv("MYSQL_GROUP_SUFFIX"); if (my_defaults_extra_file && my_defaults_extra_file != extra_file_buffer) diff -Nru mariadb-10.11.11/mysys/my_getopt.c mariadb-10.11.13/mysys/my_getopt.c --- mariadb-10.11.11/mysys/my_getopt.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_getopt.c 2025-05-19 16:14:25.000000000 +0000 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1002,7 +1003,7 @@ for (;s != end ; s++, t++) { - if ((*s != '-' ? *s : '_') != (*t != '-' ? *t : '_')) + if ((*s != '-' ? tolower(*s) : '_') != (*t != '-' ? tolower(*t) : '_')) DBUG_RETURN(1); } DBUG_RETURN(0); diff -Nru mariadb-10.11.11/mysys/my_largepage.c mariadb-10.11.13/mysys/my_largepage.c --- mariadb-10.11.11/mysys/my_largepage.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_largepage.c 2025-05-19 16:14:25.000000000 +0000 @@ -35,17 +35,11 @@ #endif /* __sun__ ... 
*/ #endif /* HAVE_SOLARIS_LARGE_PAGES */ -#if defined(_WIN32) -static size_t my_large_page_size; -#define HAVE_LARGE_PAGES -#elif defined(HAVE_MMAP) -#define HAVE_LARGE_PAGES -#endif -#ifdef HAVE_LARGE_PAGES -static my_bool my_use_large_pages= 0; -#else -#define my_use_large_pages 0 +my_bool my_use_large_pages; + +#ifdef _WIN32 +static size_t my_large_page_size; #endif #if defined(HAVE_GETPAGESIZES) || defined(__linux__) @@ -172,7 +166,7 @@ @retval a large page size that is valid on this system or 0 if no large page size possible. */ -#if defined(HAVE_MMAP) && !defined(_WIN32) +#ifndef _WIN32 static size_t my_next_large_page_size(size_t sz, int *start) { DBUG_ENTER("my_next_large_page_size"); @@ -188,11 +182,12 @@ } DBUG_RETURN(0); } -#endif /* defined(MMAP) || !defined(_WIN32) */ +#endif -int my_init_large_pages(my_bool super_large_pages) +int my_init_large_pages(void) { + my_use_large_pages= 1; #ifdef _WIN32 if (!my_obtain_privilege(SE_LOCK_MEMORY_NAME)) { @@ -200,19 +195,15 @@ "Lock Pages in memory access rights required for use with" " large-pages, see https://mariadb.com/kb/en/library/" "mariadb-memory-allocation/#huge-pages", MYF(MY_WME)); + my_use_large_pages= 0; } my_large_page_size= GetLargePageMinimum(); #endif - my_use_large_pages= 1; my_get_large_page_sizes(my_large_page_sizes); -#ifndef HAVE_LARGE_PAGES - my_printf_error(EE_OUTOFMEMORY, "No large page support on this platform", - MYF(MY_WME)); -#endif - #ifdef HAVE_SOLARIS_LARGE_PAGES + extern my_bool opt_super_large_pages; /* tell the kernel that we want to use 4/256MB page for heap storage and also for the stack. We use 4 MByte as default and if the @@ -222,9 +213,15 @@ measured in a number of GBytes. We use as big pages as possible which isn't bigger than the above desired page sizes. + + Note: This refers to some implementations of the SPARC ISA, + where the supported page sizes are + 8KiB, 64KiB, 512KiB, 4MiB, 32MiB, 256MiB, 2GiB, and 16GiB. 
+ On implementations of the AMD64 ISA, the available page sizes + should be 4KiB, 2MiB, and 1GiB. */ int nelem= 0; - size_t max_desired_page_size= (super_large_pages ? 256 : 4) * 1024 * 1024; + size_t max_desired_page_size= opt_super_large_pages ? 256 << 20 : 4 << 20; size_t max_page_size= my_next_large_page_size(max_desired_page_size, &nelem); if (max_page_size > 0) @@ -426,6 +423,78 @@ DBUG_RETURN(ptr); } +#ifndef _WIN32 +/** + Special large pages allocator, with possibility to commit to allocating + more memory later. + Every implementation returns a zero filled buffer here. +*/ +char *my_large_virtual_alloc(size_t *size) +{ + char *ptr; + DBUG_ENTER("my_large_virtual_alloc"); + + if (my_use_large_pages) + { + size_t large_page_size; + int page_i= 0; + + while ((large_page_size= my_next_large_page_size(*size, &page_i)) != 0) + { + int mapflag= MAP_PRIVATE | +# ifdef MAP_POPULATE + MAP_POPULATE | +# endif +# if defined MAP_HUGETLB /* linux 2.6.32 */ + MAP_HUGETLB | +# if defined MAP_HUGE_SHIFT /* Linux-3.8+ */ + my_bit_log2_size_t(large_page_size) << MAP_HUGE_SHIFT | +# else +# warning "No explicit large page (HUGETLB pages) support in Linux < 3.8" +# endif +# elif defined MAP_ALIGNED + MAP_ALIGNED(my_bit_log2_size_t(large_page_size)) | +# if defined MAP_ALIGNED_SUPER + MAP_ALIGNED_SUPER | +# endif +# endif + OS_MAP_ANON; + + size_t aligned_size= MY_ALIGN(*size, (size_t) large_page_size); + ptr= mmap(NULL, aligned_size, PROT_READ | PROT_WRITE, mapflag, -1, 0); + if (ptr == (void*) -1) + { + ptr= NULL; + /* try next smaller memory size */ + if (errno == ENOMEM) + continue; + + /* other errors are more serious */ + break; + } + else /* success */ + { + /* + we do need to record the adjustment so that munmap gets called with + the right size. This is only the case for HUGETLB pages. 
+ */ + *size= aligned_size; + DBUG_RETURN(ptr); + } + } + } + + ptr= mmap(NULL, *size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | OS_MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + ptr= NULL; + } + + DBUG_RETURN(ptr); +} +#endif /** General large pages deallocator. @@ -482,7 +551,7 @@ #endif /* memory_sanitizer */ #else my_free_lock(ptr); -#endif /* HAVE_MMMAP */ +#endif /* HAVE_MMAP */ DBUG_VOID_RETURN; } diff -Nru mariadb-10.11.11/mysys/my_pread.c mariadb-10.11.13/mysys/my_pread.c --- mariadb-10.11.11/mysys/my_pread.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/mysys/my_pread.c 2025-05-19 16:14:25.000000000 +0000 @@ -158,6 +158,15 @@ #else writtenbytes= pwrite(Filedes, Buffer, Count, offset); #endif + + DBUG_EXECUTE_IF ("simulate_file_pwrite_error", + if (writtenbytes == Count && + my_seek(Filedes, 0, SEEK_END, MYF(0)) > 1024*1024L) + { + errno= ENOSPC; + writtenbytes= (size_t) -1; + }); + if (writtenbytes == Count) break; my_errno= errno; diff -Nru mariadb-10.11.11/mysys/my_virtual_mem.c mariadb-10.11.13/mysys/my_virtual_mem.c --- mariadb-10.11.11/mysys/my_virtual_mem.c 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/mysys/my_virtual_mem.c 2025-05-19 16:14:25.000000000 +0000 @@ -0,0 +1,201 @@ +/* Copyright (c) 2025, MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include +#include +#include +#include +#ifdef _AIX +# include +#endif + +/* + Functionality for handling virtual memory + + - reserve range, + - commit memory (within reserved range) + - decommit previously commited memory + - release range + + Not every OS has a "reserve" functionality, i.e it is not always + possible to reserve memory larger than swap or RAM for example. + + We try to respect use_large_pages setting, on Windows and Linux +*/ +#ifdef _WIN32 +char *my_virtual_mem_reserve(size_t *size) +{ + DWORD flags= my_use_large_pages + ? MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT + : MEM_RESERVE; + char *ptr= VirtualAlloc(NULL, *size, flags, PAGE_READWRITE); + if (!ptr && (flags & MEM_LARGE_PAGES)) + { + /* Try without large pages */ + ptr= VirtualAlloc(NULL, *size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), *size); + } + return ptr; +} +#endif + +#if defined _WIN32 && !defined DBUG_OFF +static my_bool is_memory_committed(char *ptr, size_t size) +{ + MEMORY_BASIC_INFORMATION mbi; + if (VirtualQuery(ptr, &mbi, sizeof mbi) == 0) + DBUG_ASSERT(0); + return !!(mbi.State & MEM_COMMIT); +} +#endif + +char *my_virtual_mem_commit(char *ptr, size_t size) +{ + DBUG_ASSERT(ptr); +#ifdef _WIN32 + if (my_use_large_pages) + { + DBUG_ASSERT(is_memory_committed(ptr, size)); + } + else + { + void *p= VirtualAlloc(ptr, size, MEM_COMMIT, PAGE_READWRITE); + DBUG_ASSERT(p == ptr); + if (!p) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + return NULL; + } + } +#else + if (my_use_large_pages) + /* my_large_virtual_alloc() already created a read/write mapping. */; + else + { +# ifdef _AIX + /* + MAP_FIXED does not not work on IBM AIX in the way does works elsewhere. 
+ Apparently, it is not possible to mmap(2) a range that is already in use, + at least not by default. + + mprotect(2) is the fallback, it can't communicate out-of-memory + conditions, but it looks like overcommitting is not possible on + AIX anyway. + */ + if (mprotect(ptr, size, PROT_READ | PROT_WRITE)) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + return NULL; + } +# else + void *p= 0; + const int flags= +# ifdef MAP_POPULATE + MAP_POPULATE | +# endif + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED; + p= mmap(ptr, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (p == MAP_FAILED) + { + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + return NULL; + } + DBUG_ASSERT(p == ptr); +# if defined MADV_FREE_REUSABLE && defined MADV_FREE_REUSE /* Apple macOS */ + madvise(ptr, size, MADV_FREE_REUSE); /* cancel MADV_FREE_REUSABLE */ +# endif +# endif + } +#endif + update_malloc_size(size, 0); + return ptr; +} + +void my_virtual_mem_decommit(char *ptr, size_t size) +{ +#ifdef _WIN32 + DBUG_ASSERT(is_memory_committed(ptr, size)); +# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT +# error "VirtualFree(MEM_DECOMMIT) will not allow subsequent reads!" +# endif + if (!my_use_large_pages) + { + if (!VirtualFree(ptr, size, MEM_DECOMMIT)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, + GetLastError()); + DBUG_ASSERT(0); + } + } +#else + const int prot= +# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* + In InnoDB, buf_pool_t::page_guess() may deference pointers to + this, assuming that either the original contents or zeroed + contents is available. + */ + PROT_READ +# else + /* We will explicitly mark the memory unaccessible. 
*/ + PROT_NONE +# endif + ; +# ifdef _AIX + disclaim(ptr, size, DISCLAIM_ZEROMEM); +# elif defined __linux__ || defined __osf__ + madvise(ptr, size, MADV_DONTNEED); /* OSF/1, Linux mimicing AIX disclaim() */ +# elif defined MADV_FREE_REUSABLE && defined MADV_FREE_REUSE + /* Mac OS X 10.9; undocumented in Apple macOS */ + madvise(ptr, size, MADV_FREE_REUSABLE); /* macOS mimicing AIX disclaim() */ +# elif defined MADV_PURGE /* Illumos */ + madvise(ptr, size, MADV_PURGE); /* Illumos mimicing AIX disclaim() */ +# elif defined MADV_FREE + /* FreeBSD, NetBSD, OpenBSD, Dragonfly BSD, OpenSolaris, Apple macOS */ + madvise(ptr, size, MADV_FREE); /* allow lazy zeroing out */ +# elif defined MADV_DONTNEED +# warning "It is unclear if madvise(MADV_DONTNEED) works as intended" + madvise(ptr, size, MADV_DONTNEED); +# else +# warning "Do not know how to decommit memory" +# endif + if (mprotect(ptr, size, prot)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno); + DBUG_ASSERT(0); + } +#endif + update_malloc_size(-(longlong) size, 0); +} + +void my_virtual_mem_release(char *ptr, size_t size) +{ +#ifdef _WIN32 + DBUG_ASSERT(my_use_large_pages || !is_memory_committed(ptr, size)); + if (!VirtualFree(ptr, 0, MEM_RELEASE)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, + GetLastError()); + DBUG_ASSERT(0); + } +#else + if (munmap(ptr, size)) + { + my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno); + DBUG_ASSERT(0); + } +#endif +} diff -Nru mariadb-10.11.11/plugin/auth_examples/auth_0x0100.c mariadb-10.11.13/plugin/auth_examples/auth_0x0100.c --- mariadb-10.11.11/plugin/auth_examples/auth_0x0100.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/auth_examples/auth_0x0100.c 2025-05-19 16:14:25.000000000 +0000 @@ -56,6 +56,10 @@ }; #endif +/* function-type-mismatch ignore */ +#if defined(__clang__) +__attribute__((no_sanitize("undefined"))) +#endif static int do_auth_0x0100(MYSQL_PLUGIN_VIO *vio, 
MYSQL_SERVER_AUTH_INFO *info) { info->password_used= 1; diff -Nru mariadb-10.11.11/plugin/server_audit/server_audit.c mariadb-10.11.13/plugin/server_audit/server_audit.c --- mariadb-10.11.11/plugin/server_audit/server_audit.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/server_audit/server_audit.c 2025-05-19 16:14:25.000000000 +0000 @@ -2855,6 +2855,18 @@ { char *new_name= (*(char **) save) ? *(char **) save : empty_str; + if (strlen(new_name) + 4 > FN_REFLEN) + { + error_header(); + fprintf(stderr, + "server_audit_file_path can't exceed %d characters.\n", + FN_REFLEN - 4); + fprintf(stderr, "Log filename remains unchanged '%s'.\n", file_path); + CLIENT_ERROR(1, "server_audit_file_path can't exceed %d characters.", + MYF(ME_WARNING), FN_REFLEN - 4); + return; + } + ADD_ATOMIC(internal_stop_logging, 1); error_header(); fprintf(stderr, "Log file name was changed to '%s'.\n", new_name); diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.result 2025-05-19 16:14:25.000000000 +0000 @@ -2407,3 +2407,26 @@ DROP TABLE t1; SET max_sort_length=DEFAULT; # End of 10.8 tests +# +# MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +# +CREATE OR REPLACE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000001); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000002); +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +CAST(c1 AS INET6) +::1 +::2 +SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +CAST(c1 AS INET6) +::1 +::2 +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +CAST(c1 AS INET6) +::1 +::2 
+SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +CAST(c1 AS INET6) +::1 +::2 +DROP TABLE t1; diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.test mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.test --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6.test 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6.test 2025-05-19 16:14:25.000000000 +0000 @@ -1741,3 +1741,15 @@ SET max_sort_length=DEFAULT; --echo # End of 10.8 tests + +--echo # +--echo # MDEV-36235 Incorrect result for BETWEEN over unique blob prefix +--echo # +CREATE OR REPLACE TABLE t1 (c1 BINARY(16), UNIQUE (c1)); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000001); +INSERT INTO t1 (c1) VALUES (0x00000000000000000000000000000002); +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::1' BETWEEN CAST('::1' AS INET6) AND c1; +SELECT CAST(c1 AS INET6) FROM t1 WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +SELECT CAST(c1 AS INET6) FROM t1 IGNORE KEY(c1) WHERE '::2' BETWEEN c1 AND CAST('::2' AS INET6); +DROP TABLE t1; diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_engines.inc 2025-05-19 16:14:25.000000000 +0000 @@ -36,3 +36,16 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=CAST('::ff' AS INET6); DROP TABLE t1; + +--echo # +--echo # MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +--echo # Type_handler_fbt::Field_fbt::store_native, +--echo # Assertion `item->null_value' failed in Type_handler::Item_send_str +--echo # 
+ +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +DROP TABLE t1; + +--echo # End of 10.5 tests diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_innodb.result 2025-05-19 16:14:25.000000000 +0000 @@ -88,6 +88,18 @@ Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = INET6'::ff' DROP TABLE t1; # +# MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +# Type_handler_fbt::Field_fbt::store_native, +# Assertion `item->null_value' failed in Type_handler::Item_send_str +# +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +cast('::' AS INET6) min(1) +:: NULL +DROP TABLE t1; +# End of 10.5 tests +# # MDEV-26742 Assertion `field->type_handler() == this' failed in FixedBinTypeBundle::Type_handler_fbt::stored_field_cmp_to_item # CREATE TABLE t1 (pk inet6, c text) engine=myisam; diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_memory.result 2025-05-19 16:14:25.000000000 +0000 @@ -155,5 +155,17 @@ Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = INET6'::ff' DROP TABLE t1; # +# MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +# 
Type_handler_fbt::Field_fbt::store_native, +# Assertion `item->null_value' failed in Type_handler::Item_send_str +# +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +cast('::' AS INET6) min(1) +:: NULL +DROP TABLE t1; +# End of 10.5 tests +# # End of 10.5 tests # diff -Nru mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result --- mariadb-10.11.11/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/type_inet/mysql-test/type_inet/type_inet6_myisam.result 2025-05-19 16:14:25.000000000 +0000 @@ -88,6 +88,18 @@ Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = INET6'::ff' DROP TABLE t1; # +# MDEV-34922: Assertion `value.length() == FbtImpl::binary_length()' failed in +# Type_handler_fbt::Field_fbt::store_native, +# Assertion `item->null_value' failed in Type_handler::Item_send_str +# +CREATE TABLE t1 (a datetime); +INSERT INTO t1 VALUES (NULL); +SELECT * FROM (SELECT cast('::' AS INET6),min(1) FROM t1 WHERE if(uuid_short(), a,1)) dt; +cast('::' AS INET6) min(1) +:: NULL +DROP TABLE t1; +# End of 10.5 tests +# # MDEV-26742 Assertion `field->type_handler() == this' failed in FixedBinTypeBundle::Type_handler_fbt::stored_field_cmp_to_item # CREATE TABLE t1 (c varchar(64), key(c)) engine=myisam; diff -Nru mariadb-10.11.11/plugin/userstat/client_stats.cc mariadb-10.11.13/plugin/userstat/client_stats.cc --- mariadb-10.11.11/plugin/userstat/client_stats.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/userstat/client_stats.cc 2025-05-19 16:14:25.000000000 +0000 @@ -45,8 +45,8 @@ table->field[j++]->store((longlong)user_stats->total_connections,TRUE); table->field[j++]->store((longlong)user_stats->concurrent_connections, TRUE); 
table->field[j++]->store((longlong)user_stats->connected_time, TRUE); - table->field[j++]->store((double)user_stats->busy_time); - table->field[j++]->store((double)user_stats->cpu_time); + table->field[j++]->store((double)user_stats->busy_time/1e6); + table->field[j++]->store((double)user_stats->cpu_time/1e6); table->field[j++]->store((longlong)user_stats->bytes_received, TRUE); table->field[j++]->store((longlong)user_stats->bytes_sent, TRUE); table->field[j++]->store((longlong)user_stats->binlog_bytes_written, TRUE); diff -Nru mariadb-10.11.11/plugin/versioning/versioning.cc mariadb-10.11.13/plugin/versioning/versioning.cc --- mariadb-10.11.11/plugin/versioning/versioning.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/plugin/versioning/versioning.cc 2025-05-19 16:14:25.000000000 +0000 @@ -150,7 +150,6 @@ { { C_STRING_WITH_LEN("TRT_TRX_ID") }, BUILDER(Create_func_trt)}, { { C_STRING_WITH_LEN("TRT_TRX_SEES") }, BUILDER(Create_func_trt_trx_sees)}, { { C_STRING_WITH_LEN("TRT_TRX_SEES_EQ") }, BUILDER(Create_func_trt_trx_sees)}, - { {0, 0}, NULL} }; diff -Nru mariadb-10.11.11/scripts/mysqlhotcopy.sh mariadb-10.11.13/scripts/mysqlhotcopy.sh --- mariadb-10.11.11/scripts/mysqlhotcopy.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/mysqlhotcopy.sh 2025-05-19 16:14:25.000000000 +0000 @@ -208,7 +208,7 @@ else { $dsn .= "host=" . $opt{host}; - if ($opt{host} ne "localhost") + if ($opt{host} ne "localhost" and $opt{port}) { $dsn .= ";port=". 
$opt{port}; } diff -Nru mariadb-10.11.11/scripts/wsrep_sst_common.sh mariadb-10.11.13/scripts/wsrep_sst_common.sh --- mariadb-10.11.11/scripts/wsrep_sst_common.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_common.sh 2025-05-19 16:14:25.000000000 +0000 @@ -1910,4 +1910,17 @@ SST_PID="$DATA/wsrep_sst.pid" +if [ -n "${MTR_SST_JOINER_DELAY:-}" ]; then + MTR_SST_JOINER_DELAY=$(trim_string "$MTR_SST_JOINER_DELAY") +fi + +simulate_long_sst() +{ + # Delay for MTR tests if needed to simulate long SST/IST: + if [ ${MTR_SST_JOINER_DELAY:-0} -gt 0 ]; then + wsrep_log_info "Sleeping $MTR_SST_JOINER_DELAY seconds for MTR test" + sleep $MTR_SST_JOINER_DELAY + fi +} + wsrep_log_info "$WSREP_METHOD $WSREP_TRANSFER_TYPE started on $WSREP_SST_OPT_ROLE" diff -Nru mariadb-10.11.11/scripts/wsrep_sst_mariabackup.sh mariadb-10.11.13/scripts/wsrep_sst_mariabackup.sh --- mariadb-10.11.11/scripts/wsrep_sst_mariabackup.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_mariabackup.sh 2025-05-19 16:14:25.000000000 +0000 @@ -1513,6 +1513,8 @@ exit 2 fi + simulate_long_sst + # use donor magic file, if present # if IST was used, donor magic file was not created # Remove special tags from the magic file, and from the output: diff -Nru mariadb-10.11.11/scripts/wsrep_sst_mysqldump.sh mariadb-10.11.13/scripts/wsrep_sst_mysqldump.sh --- mariadb-10.11.11/scripts/wsrep_sst_mysqldump.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_mysqldump.sh 2025-05-19 16:14:25.000000000 +0000 @@ -184,5 +184,9 @@ echo "$SET_START_POSITION" | $MYSQL || exit $? 
fi +if [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]; then + simulate_long_sst +fi + wsrep_log_info "$WSREP_METHOD $WSREP_TRANSFER_TYPE completed on $WSREP_SST_OPT_ROLE" exit 0 diff -Nru mariadb-10.11.11/scripts/wsrep_sst_rsync.sh mariadb-10.11.13/scripts/wsrep_sst_rsync.sh --- mariadb-10.11.11/scripts/wsrep_sst_rsync.sh 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/scripts/wsrep_sst_rsync.sh 2025-05-19 16:14:25.000000000 +0000 @@ -915,6 +915,8 @@ fi fi + simulate_long_sst + # Remove special tags from the magic file, and from the output: coords=$(head -n1 "$MAGIC_FILE") wsrep_log_info "Galera co-ords from recovery: $coords" diff -Nru mariadb-10.11.11/sql/filesort.cc mariadb-10.11.13/sql/filesort.cc --- mariadb-10.11.11/sql/filesort.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/filesort.cc 2025-05-19 16:14:25.000000000 +0000 @@ -640,26 +640,16 @@ } #ifndef DBUG_OFF -/* - Print table's current row into a buffer and return a pointer to it. - This is intended to be used from gdb: - - (gdb) p dbug_print_table_row(table) - $33 = "SUBQUERY2_t1(col_int_key,col_varchar_nokey)=(7,c)" - (gdb) +static char dbug_row_print_buf[4096]; - Only columns in table->read_set are printed -*/ -const char* dbug_print_row(TABLE *table, const uchar *rec, bool print_names) +String dbug_format_row(TABLE *table, const uchar *rec, bool print_names) { Field **pfield; - const size_t alloc_size= 512; - char *row_buff= (char *) alloc_root(&table->mem_root, alloc_size); - char *row_buff_tmp= (char *) alloc_root(&table->mem_root, alloc_size); - String tmp(row_buff_tmp, alloc_size, &my_charset_bin); - String output(row_buff, alloc_size, &my_charset_bin); + char row_buff_tmp[512]; + String tmp(row_buff_tmp, sizeof(row_buff_tmp), &my_charset_bin); + String output(dbug_row_print_buf, sizeof(dbug_row_print_buf), &my_charset_bin); auto move_back_lambda= [table, rec]() mutable { table->move_fields(table->field, table->record[0], rec); @@ -672,7 +662,7 @@ move_back_guard.engage(); } - 
SCOPE_VALUE(table->read_set, (table->read_set && table->write_set) ? + SCOPE_VALUE(table->read_set, (table->reginfo.lock_type >= TL_WRITE_ALLOW_WRITE) ? table->write_set : table->read_set); output.length(0); @@ -724,10 +714,35 @@ } output.append(')'); - return output.c_ptr_safe(); + return output; } +/** + A function to display a row in debugger. + + Example usage: + (gdb) p dbug_print_row(table, table->record[1]) +*/ +const char *dbug_print_row(TABLE *table, const uchar *rec) +{ + String row= dbug_format_row(table, table->record[0]); + if (row.length() > sizeof dbug_row_print_buf - 1) + return "Couldn't fit into buffer"; + memcpy(dbug_row_print_buf, row.c_ptr(), row.length()); + return dbug_row_print_buf; +} +/** + Print table's current row into a buffer and return a pointer to it. + + This is intended to be used from gdb: + + (gdb) p dbug_print_table_row(table) + $33 = "SUBQUERY2_t1(col_int_key,col_varchar_nokey)=(7,c)" + (gdb) + + Only columns in table->read_set are printed +*/ const char* dbug_print_table_row(TABLE *table) { return dbug_print_row(table, table->record[0]); diff -Nru mariadb-10.11.11/sql/ha_partition.cc mariadb-10.11.13/sql/ha_partition.cc --- mariadb-10.11.11/sql/ha_partition.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/ha_partition.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2141,7 +2141,9 @@ m_added_file[i]->extra(HA_EXTRA_BEGIN_ALTER_COPY); error= copy_partitions(copied, deleted); for (i= 0; i < part_count; i++) - m_added_file[i]->extra(HA_EXTRA_END_ALTER_COPY); + m_added_file[i]->extra(error + ? HA_EXTRA_ABORT_ALTER_COPY + : HA_EXTRA_END_ALTER_COPY); if (unlikely(error)) { /* @@ -4404,31 +4406,19 @@ DBUG_ENTER("ha_partition::store_lock"); DBUG_ASSERT(thd == current_thd); - /* - This can be called from get_lock_data() in mysql_lock_abort_for_thread(), - even when thd != table->in_use. In that case don't use partition pruning, - but use all partitions instead to avoid using another threads structures. 
- */ - if (thd != table->in_use) + MY_BITMAP *used_partitions= lock_type == TL_UNLOCK || + lock_type == TL_IGNORE ? + &m_locked_partitions : + &m_part_info->lock_partitions; + + for (i= bitmap_get_first_set(used_partitions); + i < m_tot_parts; + i= bitmap_get_next_set(used_partitions, i)) { - for (i= 0; i < m_tot_parts; i++) - to= m_file[i]->store_lock(thd, to, lock_type); + DBUG_PRINT("info", ("store lock %u iteration", i)); + to= m_file[i]->store_lock(thd, to, lock_type); } - else - { - MY_BITMAP *used_partitions= lock_type == TL_UNLOCK || - lock_type == TL_IGNORE ? - &m_locked_partitions : - &m_part_info->lock_partitions; - for (i= bitmap_get_first_set(used_partitions); - i < m_tot_parts; - i= bitmap_get_next_set(used_partitions, i)) - { - DBUG_PRINT("info", ("store lock %u iteration", i)); - to= m_file[i]->store_lock(thd, to, lock_type); - } - } DBUG_RETURN(to); } @@ -4755,7 +4745,6 @@ } - m_last_part= new_part_id; start_part_bulk_insert(thd, new_part_id); DBUG_ASSERT(!m_file[new_part_id]->row_logging); if (new_part_id == old_part_id) @@ -4790,6 +4779,8 @@ goto exit; } + m_last_part= new_part_id; + exit: /* if updating an auto_increment column, update @@ -9478,6 +9469,7 @@ case HA_EXTRA_STARTING_ORDERED_INDEX_SCAN: case HA_EXTRA_BEGIN_ALTER_COPY: case HA_EXTRA_END_ALTER_COPY: + case HA_EXTRA_ABORT_ALTER_COPY: DBUG_RETURN(loop_partitions(extra_cb, &operation)); default: { diff -Nru mariadb-10.11.11/sql/ha_sequence.cc mariadb-10.11.13/sql/ha_sequence.cc --- mariadb-10.11.11/sql/ha_sequence.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/ha_sequence.cc 2025-05-19 16:14:25.000000000 +0000 @@ -353,6 +353,12 @@ return(COMPATIBLE_DATA_YES); } +enum_alter_inplace_result +ha_sequence::check_if_supported_inplace_alter(TABLE *altered_table, + Alter_inplace_info *ai) +{ + return file->check_if_supported_inplace_alter(altered_table, ai); +} int ha_sequence::external_lock(THD *thd, int lock_type) { diff -Nru mariadb-10.11.11/sql/ha_sequence.h 
mariadb-10.11.13/sql/ha_sequence.h --- mariadb-10.11.11/sql/ha_sequence.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/ha_sequence.h 2025-05-19 16:14:25.000000000 +0000 @@ -94,6 +94,9 @@ /* For ALTER ONLINE TABLE */ bool check_if_incompatible_data(HA_CREATE_INFO *create_info, uint table_changes) override; + enum_alter_inplace_result + check_if_supported_inplace_alter(TABLE *altered_table, + Alter_inplace_info *ai) override; void write_lock() { write_locked= 1;} void unlock() { write_locked= 0; } bool is_locked() { return write_locked; } diff -Nru mariadb-10.11.11/sql/handle_connections_win.cc mariadb-10.11.13/sql/handle_connections_win.cc --- mariadb-10.11.11/sql/handle_connections_win.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/handle_connections_win.cc 2025-05-19 16:14:25.000000000 +0000 @@ -595,11 +595,8 @@ void handle_connections_win() { - int n_waits; - create_shutdown_event(); wait_events.push_back(hEventShutdown); - n_waits= 1; for (size_t i= 0; i < all_listeners.size(); i++) { diff -Nru mariadb-10.11.11/sql/handler.cc mariadb-10.11.13/sql/handler.cc --- mariadb-10.11.11/sql/handler.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/handler.cc 2025-05-19 16:14:25.000000000 +0000 @@ -499,7 +499,7 @@ SETMSG(HA_ERR_INDEX_COL_TOO_LONG, ER_DEFAULT(ER_INDEX_COLUMN_TOO_LONG)); SETMSG(HA_ERR_INDEX_CORRUPT, ER_DEFAULT(ER_INDEX_CORRUPT)); SETMSG(HA_FTS_INVALID_DOCID, "Invalid InnoDB FTS Doc ID"); - SETMSG(HA_ERR_DISK_FULL, ER_DEFAULT(ER_DISK_FULL)); + SETMSG(HA_ERR_DISK_FULL, "Disk got full writing '%s'"); SETMSG(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE, "Too many words in a FTS phrase or proximity search"); SETMSG(HA_ERR_FK_DEPTH_EXCEEDED, "Foreign key cascade delete/update exceeds"); SETMSG(HA_ERR_TABLESPACE_MISSING, ER_DEFAULT(ER_TABLESPACE_MISSING)); @@ -672,6 +672,8 @@ DBUG_EXECUTE_IF("unstable_db_type", { static int i= (int) DB_TYPE_FIRST_DYNAMIC; + while (installed_htons[i]) + i++; hton->db_type= (enum 
legacy_db_type)++i; }); @@ -1899,6 +1901,8 @@ } #endif /* WITH_WSREP */ error= ha_commit_one_phase(thd, all); + if (error) + goto err; #ifdef WITH_WSREP // Here in case of error we must return 2 for inconsistency if (run_wsrep_hooks && !error) @@ -2139,7 +2143,7 @@ if (ha_info) { - int err; + int err= 0; if (has_binlog_hton(ha_info) && (err= binlog_commit(thd, all, @@ -2147,6 +2151,8 @@ { my_error(ER_ERROR_DURING_COMMIT, MYF(0), err); error= 1; + + goto err; } for (; ha_info; ha_info= ha_info_next) { @@ -2182,7 +2188,7 @@ if (count >= 2) statistic_increment(transactions_multi_engine, LOCK_status); } - + err: DBUG_RETURN(error); } @@ -2291,7 +2297,7 @@ "conf %d wsrep_err %s SQL %s", thd->thread_id, thd->query_id, thd->wsrep_trx().state(), wsrep::to_c_string(thd->wsrep_cs().current_error()), - thd->query()); + wsrep_thd_query(thd)); } #endif /* WITH_WSREP */ } @@ -2307,7 +2313,7 @@ if (WSREP(thd) && thd->is_error()) { WSREP_DEBUG("ha_rollback_trans(%lld, %s) rolled back: msg %s is_real %d wsrep_err %s", - thd->thread_id, all? "TRUE" : "FALSE", + thd->thread_id, all ? 
"TRUE" : "FALSE", thd->get_stmt_da()->message(), is_real_trans, wsrep::to_c_string(thd->wsrep_cs().current_error())); } @@ -2800,6 +2806,7 @@ } if (IF_WSREP((wsrep_emulate_bin_log && wsrep_is_wsrep_xid(info->list + i) && + !wsrep_is_xid_gtid_undefined(info->list + i) && x <= wsrep_limit), false) || tc_heuristic_recover == TC_HEURISTIC_RECOVER_COMMIT) { @@ -4455,8 +4462,12 @@ break; case ENOSPC: case HA_ERR_DISK_FULL: - textno= ER_DISK_FULL; SET_FATAL_ERROR; // Ensure error is logged + my_printf_error(ER_DISK_FULL, "Disk got full writing '%s.%s' (Errcode: %M)", + MYF(errflag | ME_ERROR_LOG), + table_share->db.str, table_share->table_name.str, + error); + DBUG_VOID_RETURN; break; case HA_ERR_KEY_NOT_FOUND: case HA_ERR_NO_ACTIVE_RECORD: @@ -7718,7 +7729,10 @@ }); #endif /* WITH_WSREP */ if ((error= ha_check_overlaps(NULL, buf))) + { + DEBUG_SYNC_C("ha_write_row_end"); DBUG_RETURN(error); + } /* NOTE: this != table->file is true in 3 cases: @@ -7739,6 +7753,7 @@ if (table->next_number_field && buf == table->record[0]) if (int err= update_auto_increment()) error= err; + DEBUG_SYNC_C("ha_write_row_end"); DBUG_RETURN(error); } } @@ -7749,7 +7764,8 @@ TABLE_IO_WAIT(tracker, PSI_TABLE_WRITE_ROW, MAX_KEY, error, { error= write_row(buf); }) - DBUG_PRINT("dml", ("INSERT: %s = %d", dbug_print_row(table, buf, false), error)); + DBUG_PRINT("dml", ("INSERT: %s = %d", + dbug_format_row(table, buf, false).c_ptr_safe(), error)); MYSQL_INSERT_ROW_DONE(error); if (likely(!error)) @@ -7760,14 +7776,12 @@ Log_func *log_func= Write_rows_log_event::binlog_row_logging_function; error= binlog_log_row(table, 0, buf, log_func); } + #ifdef WITH_WSREP - if (WSREP_NNULL(ha_thd()) && table_share->tmp_table == NO_TMP_TABLE && - ht->flags & HTON_WSREP_REPLICATION && - !error && (error= wsrep_after_row(ha_thd()))) - { - DEBUG_SYNC_C("ha_write_row_end"); - DBUG_RETURN(error); - } + THD *thd= ha_thd(); + if (WSREP_NNULL(thd) && table_share->tmp_table == NO_TMP_TABLE && + ht->flags & 
HTON_WSREP_REPLICATION && !error) + error= wsrep_after_row(thd); #endif /* WITH_WSREP */ } @@ -7811,8 +7825,10 @@ TABLE_IO_WAIT(tracker, PSI_TABLE_UPDATE_ROW, active_index, 0, { error= update_row(old_data, new_data);}) - DBUG_PRINT("dml", ("UPDATE: %s => %s = %d", dbug_print_row(table, old_data, false), - dbug_print_row(table, new_data, false), error)); + DBUG_PRINT("dml", ("UPDATE: %s => %s = %d", + dbug_format_row(table, old_data, false).c_ptr_safe(), + dbug_format_row(table, new_data, false).c_ptr_safe(), + error)); MYSQL_UPDATE_ROW_DONE(error); if (likely(!error)) @@ -7892,7 +7908,8 @@ TABLE_IO_WAIT(tracker, PSI_TABLE_DELETE_ROW, active_index, error, { error= delete_row(buf);}) - DBUG_PRINT("dml", ("DELETE: %s = %d", dbug_print_row(table, buf, false), error)); + DBUG_PRINT("dml", ("DELETE: %s = %d", + dbug_format_row(table, buf, false).c_ptr_safe(), error)); MYSQL_DELETE_ROW_DONE(error); if (likely(!error)) { @@ -8236,16 +8253,6 @@ VERSIONING functions ******************************************************************************/ -bool Vers_parse_info::is_start(const char *name) const -{ - DBUG_ASSERT(name); - return as_row.start && as_row.start.streq(name); -} -bool Vers_parse_info::is_end(const char *name) const -{ - DBUG_ASSERT(name); - return as_row.end && as_row.end.streq(name); -} bool Vers_parse_info::is_start(const Create_field &f) const { return f.flags & VERS_ROW_START; @@ -8300,8 +8307,8 @@ return false; } -const Lex_ident Vers_parse_info::default_start= "row_start"; -const Lex_ident Vers_parse_info::default_end= "row_end"; +const Lex_ident Vers_parse_info::default_start= { STRING_WITH_LEN("row_start")}; +const Lex_ident Vers_parse_info::default_end= { STRING_WITH_LEN("row_end")}; bool Vers_parse_info::fix_implicit(THD *thd, Alter_info *alter_info) { @@ -8560,7 +8567,7 @@ if (alter_info->flags & ALTER_ADD_SYSTEM_VERSIONING) { - if (check_sys_fields(table_name, share->db, alter_info)) + if (check_sys_fields(share->table_name, share->db, alter_info)) 
return true; } @@ -8866,8 +8873,8 @@ } } - bool res= period_info.check_field(row_start, period.start.str) - || period_info.check_field(row_end, period.end.str); + bool res= period_info.check_field(row_start, period.start) + || period_info.check_field(row_end, period.end); if (res) return true; diff -Nru mariadb-10.11.11/sql/handler.h mariadb-10.11.13/sql/handler.h --- mariadb-10.11.11/sql/handler.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/handler.h 2025-05-19 16:14:25.000000000 +0000 @@ -2117,8 +2117,6 @@ } protected: - bool is_start(const char *name) const; - bool is_end(const char *name) const; bool is_start(const Create_field &f) const; bool is_end(const Create_field &f) const; bool fix_implicit(THD *thd, Alter_info *alter_info); @@ -5444,6 +5442,6 @@ bool versioned); #ifndef DBUG_OFF -const char* dbug_print_row(TABLE *table, const uchar *rec, bool print_names= true); +String dbug_format_row(TABLE *table, const uchar *rec, bool print_names= true); #endif /* DBUG_OFF */ #endif /* HANDLER_INCLUDED */ diff -Nru mariadb-10.11.11/sql/item.cc mariadb-10.11.13/sql/item.cc --- mariadb-10.11.11/sql/item.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item.cc 2025-05-19 16:14:25.000000000 +0000 @@ -5321,6 +5321,7 @@ double Item_copy_string::val_real() { + DBUG_ASSERT(copied_in); int err_not_used; char *end_not_used; return (null_value ? 0.0 : @@ -5331,6 +5332,7 @@ longlong Item_copy_string::val_int() { + DBUG_ASSERT(copied_in); int err; return null_value ? 
0 : str_value.charset()->strntoll(str_value.ptr(), str_value.length(), 10, @@ -5340,6 +5342,7 @@ int Item_copy_string::save_in_field(Field *field, bool no_conversions) { + DBUG_ASSERT(copied_in); return save_str_value_in_field(field, &str_value); } @@ -5350,11 +5353,15 @@ if (res && res != &str_value) str_value.copy(*res); null_value=item->null_value; +#ifndef DBUG_OFF + copied_in= 1; +#endif } /* ARGSUSED */ String *Item_copy_string::val_str(String *str) { + DBUG_ASSERT(copied_in); // Item_copy_string is used without fix_fields call if (null_value) return (String*) 0; @@ -5364,6 +5371,7 @@ my_decimal *Item_copy_string::val_decimal(my_decimal *decimal_value) { + DBUG_ASSERT(copied_in); // Item_copy_string is used without fix_fields call if (null_value) return (my_decimal *) 0; @@ -11067,8 +11075,8 @@ {} /** - Wrapper of hide_view_error call for Name_resolution_context error - processor. + Wrapper of replace_view_error_with_generic call for Name_resolution_context + error processor. @note hide view underlying tables details in error messages @@ -11076,7 +11084,7 @@ void view_error_processor(THD *thd, void *data) { - ((TABLE_LIST *)data)->hide_view_error(thd); + ((TABLE_LIST *)data)->replace_view_error_with_generic(thd); } diff -Nru mariadb-10.11.11/sql/item.h mariadb-10.11.13/sql/item.h --- mariadb-10.11.11/sql/item.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item.h 2025-05-19 16:14:25.000000000 +0000 @@ -757,6 +757,17 @@ virtual const String *const_ptr_string() const { return NULL; } }; +struct subselect_table_finder_param +{ + THD *thd; + /* + We're searching for different TABLE_LIST objects referring to the same + table as this one + */ + const TABLE_LIST *find; + /* NUL - not found, ERROR_TABLE - search error, or the found table reference */ + TABLE_LIST *dup; +}; /****************************************************************************/ @@ -1954,6 +1965,19 @@ */ virtual Item *clone_item(THD *thd) const { return nullptr; } + /* + @detail + 
The meaning of this function seems to be: + Check what the item would return if it was provided with two identical + non-NULL arguments. + It is not clear why it is defined for generic class Item or what its other + uses are. + + @return + COND_TRUE Would return true + COND_FALSE Would return false + COND_OK May return either, depending on the argument type. + */ virtual cond_result eq_cmp_result() const { return COND_OK; } inline uint float_length(uint decimals_par) const { return decimals < FLOATING_POINT_DECIMALS ? (DBL_DIG+2+decimals_par) : DBL_DIG+8;} @@ -2292,6 +2316,7 @@ set_extraction_flag(*(int16*)arg); return 0; } + virtual bool subselect_table_finder_processor(void *arg) { return 0; }; /* TRUE if the expression depends only on the table indicated by tab_map @@ -6673,8 +6698,15 @@ Type_std_attributes::set(item); name= item->name; set_handler(item->type_handler()); +#ifndef DBUG_OFF + copied_in= 0; +#endif } +#ifndef DBUG_OFF + bool copied_in; +#endif + public: /** @@ -6740,7 +6772,10 @@ double val_real() override; longlong val_int() override; bool get_date(THD *thd, MYSQL_TIME *ltime, date_mode_t fuzzydate) override - { return get_date_from_string(thd, ltime, fuzzydate); } + { + DBUG_ASSERT(copied_in); + return get_date_from_string(thd, ltime, fuzzydate); + } void copy() override; int save_in_field(Field *field, bool no_conversions) override; Item *do_get_copy(THD *thd) const override @@ -6770,9 +6805,13 @@ null_value= tmp.is_null(); m_value= tmp.is_null() ? Timestamp_or_zero_datetime() : Timestamp_or_zero_datetime(tmp); +#ifndef DBUG_OFF + copied_in=1; +#endif } int save_in_field(Field *field, bool) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); if (null_value) return set_field_to_null(field); @@ -6781,30 +6820,35 @@ } longlong val_int() override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? 
0 : m_value.to_datetime(current_thd).to_longlong(); } double val_real() override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? 0e0 : m_value.to_datetime(current_thd).to_double(); } String *val_str(String *to) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? NULL : m_value.to_datetime(current_thd).to_string(to, decimals); } my_decimal *val_decimal(my_decimal *to) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value ? NULL : m_value.to_datetime(current_thd).to_decimal(to); } bool get_date(THD *thd, MYSQL_TIME *ltime, date_mode_t fuzzydate) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); bool res= m_value.to_TIME(thd, ltime, fuzzydate); DBUG_ASSERT(!res); @@ -6812,6 +6856,7 @@ } bool val_native(THD *thd, Native *to) override { + DBUG_ASSERT(copied_in); DBUG_ASSERT(sane()); return null_value || m_value.to_native(to, decimals); } diff -Nru mariadb-10.11.11/sql/item_cmpfunc.h mariadb-10.11.13/sql/item_cmpfunc.h --- mariadb-10.11.11/sql/item_cmpfunc.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_cmpfunc.h 2025-05-19 16:14:25.000000000 +0000 @@ -1003,6 +1003,23 @@ class Item_func_between :public Item_func_opt_neg { + /* + If the types of the arguments to BETWEEN permit, then: + + WHERE const1 BETWEEN expr2 AND field1 + can be optimized as if it was just: + WHERE const1 <= field1 + + as expr2 could be an arbitrary expression. More generally, + this optimization is permitted if aggregation for comparison + for three expressions (const1,const2,field1) and for two + expressions (const1,field1) return the same type handler. + + @param [IN] field_item - This is a field from the right side + of the BETWEEN operator. 
+ */ + bool can_optimize_range_const(Item_field *field_item) const; + protected: SEL_TREE *get_func_mm_tree(RANGE_OPT_PARAM *param, Field *field, Item *value) override; @@ -2945,9 +2962,18 @@ TODO: We could still replace "expr1" to "const" in "expr1 LIKE expr2" in case of a "PAD SPACE" collation, but only if "expr2" has '%' - at the end. + at the end. */ - return compare_collation() == &my_charset_bin ? COND_TRUE : COND_OK; + if (compare_collation() == &my_charset_bin) + { + /* + 'foo' NOT LIKE 'foo' is false, + 'foo' LIKE 'foo' is true. + */ + return negated? COND_FALSE : COND_TRUE; + } + + return COND_OK; } void add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level, table_map usable_tables, SARGABLE_PARAM **sargables) diff -Nru mariadb-10.11.11/sql/item_func.cc mariadb-10.11.13/sql/item_func.cc --- mariadb-10.11.11/sql/item_func.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_func.cc 2025-05-19 16:14:25.000000000 +0000 @@ -7068,6 +7068,16 @@ /***************************************************************************** SEQUENCE functions *****************************************************************************/ +bool Item_func_nextval::check_access_and_fix_fields(THD *thd, Item **ref, + privilege_t want_access) +{ + table_list->sequence= false; + bool error= check_single_table_access(thd, want_access, table_list, false); + table_list->sequence= true; + if (error && table_list->belong_to_view) + table_list->replace_view_error_with_generic(thd); + return error || Item_longlong_func::fix_fields(thd, ref); +} longlong Item_func_nextval::val_int() { diff -Nru mariadb-10.11.11/sql/item_func.h mariadb-10.11.13/sql/item_func.h --- mariadb-10.11.11/sql/item_func.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_func.h 2025-05-19 16:14:25.000000000 +0000 @@ -4234,6 +4234,7 @@ protected: TABLE_LIST *table_list; TABLE *table; + bool check_access_and_fix_fields(THD *, Item **ref, privilege_t); public: 
Item_func_nextval(THD *thd, TABLE_LIST *table_list_arg): Item_longlong_func(thd), table_list(table_list_arg) {} @@ -4243,6 +4244,8 @@ static LEX_CSTRING name= {STRING_WITH_LEN("nextval") }; return name; } + bool fix_fields(THD *thd, Item **ref) override + { return check_access_and_fix_fields(thd, ref, INSERT_ACL | SELECT_ACL); } bool fix_length_and_dec(THD *thd) override { unsigned_flag= 0; @@ -4284,6 +4287,8 @@ public: Item_func_lastval(THD *thd, TABLE_LIST *table_list_arg): Item_func_nextval(thd, table_list_arg) {} + bool fix_fields(THD *thd, Item **ref) override + { return check_access_and_fix_fields(thd, ref, SELECT_ACL); } longlong val_int() override; LEX_CSTRING func_name_cstring() const override { @@ -4308,6 +4313,8 @@ : Item_func_nextval(thd, table_list_arg), nextval(nextval_arg), round(round_arg), is_used(is_used_arg) {} + bool fix_fields(THD *thd, Item **ref) override + { return check_access_and_fix_fields(thd, ref, INSERT_ACL); } longlong val_int() override; LEX_CSTRING func_name_cstring() const override { diff -Nru mariadb-10.11.11/sql/item_geofunc.cc mariadb-10.11.13/sql/item_geofunc.cc --- mariadb-10.11.11/sql/item_geofunc.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_geofunc.cc 2025-05-19 16:14:25.000000000 +0000 @@ -91,6 +91,15 @@ { String *str_ret= args[0]->val_str(str); null_value= args[0]->null_value; + if (!null_value && arg_count == 2 && !args[1]->null_value) { + srid= (uint32)args[1]->val_int(); + + if (str->copy(*str_ret)) + return 0; + + int4store(str->ptr(), srid); + return str; + } return str_ret; } @@ -2524,7 +2533,7 @@ String *arg2= args[1]->val_str(&bak2); double distance= 0.0; double sphere_radius= 6370986.0; // Default radius equals Earth radius - + null_value= (args[0]->null_value || args[1]->null_value); if (null_value) { @@ -2542,7 +2551,7 @@ } if (sphere_radius <= 0) { - my_error(ER_INTERNAL_ERROR, MYF(0), "Radius must be greater than zero."); + my_error(ER_GIS_UNSUPPORTED_ARGUMENT, MYF(0), func_name()); 
return 1; } } @@ -2554,26 +2563,27 @@ my_error(ER_GIS_INVALID_DATA, MYF(0), "ST_Distance_Sphere"); goto handle_errors; } -// Method allowed for points and multipoints + // Method allowed for points and multipoints if (!(g1->get_class_info()->m_type_id == Geometry::wkb_point || g1->get_class_info()->m_type_id == Geometry::wkb_multipoint) || !(g2->get_class_info()->m_type_id == Geometry::wkb_point || g2->get_class_info()->m_type_id == Geometry::wkb_multipoint)) { - // Generate error message in case different geometry is used? - my_error(ER_INTERNAL_ERROR, MYF(0), func_name()); + // Generate error message in case of unexpected geometry. + my_error(ER_GIS_UNSUPPORTED_ARGUMENT, MYF(0), func_name()); return 0; } distance= spherical_distance_points(g1, g2, sphere_radius); if (distance < 0) { - my_error(ER_INTERNAL_ERROR, MYF(0), "Returned distance cannot be negative."); + my_error(ER_INTERNAL_ERROR, MYF(0), + "Returned distance cannot be negative."); return 1; } return distance; - handle_errors: - return 0; +handle_errors: + return 0; } diff -Nru mariadb-10.11.11/sql/item_jsonfunc.cc mariadb-10.11.13/sql/item_jsonfunc.cc --- mariadb-10.11.11/sql/item_jsonfunc.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_jsonfunc.cc 2025-05-19 16:14:25.000000000 +0000 @@ -74,7 +74,8 @@ } -static bool append_simple(String *s, const char *a, size_t a_len) +static bool __attribute__((warn_unused_result)) +append_simple(String *s, const char *a, size_t a_len) { if (!s->realloc_with_extra_if_needed(s->length() + a_len)) { @@ -86,7 +87,8 @@ } -static inline bool append_simple(String *s, const uchar *a, size_t a_len) +static inline bool __attribute__((warn_unused_result)) +append_simple(String *s, const uchar *a, size_t a_len) { return append_simple(s, (const char *) a, a_len); } @@ -300,8 +302,10 @@ nice_js->length(0); nice_js->set_charset(je->s.cs); - nice_js->alloc(je->s.str_end - je->s.c_str + 32); + if (nice_js->alloc(je->s.str_end - je->s.c_str + 32)) + goto error; + 
DBUG_ASSERT(mode != Item_func_json_format::DETAILED || (tab_size >= 0 && tab_size <= TAB_SIZE_LIMIT)); @@ -347,7 +351,8 @@ goto error; nice_js->append('"'); - append_simple(nice_js, key_start, key_end - key_start); + if (append_simple(nice_js, key_start, key_end - key_start)) + goto error; nice_js->append(colon, colon_len); } /* now we have key value to handle, so no 'break'. */ @@ -851,7 +856,7 @@ bool Item_func_json_unquote::fix_length_and_dec(THD *thd) { - collation.set(&my_charset_utf8mb3_general_ci, + collation.set(&my_charset_utf8mb4_bin, DERIVATION_COERCIBLE, MY_REPERTOIRE_ASCII); max_length= args[0]->max_char_length() * collation.collation->mbmaxlen; set_maybe_null(); @@ -894,12 +899,12 @@ return js; str->length(0); - str->set_charset(&my_charset_utf8mb3_general_ci); + str->set_charset(&my_charset_utf8mb4_bin); if (str->realloc_with_extra_if_needed(je.value_len) || (c_len= json_unescape(js->charset(), je.value, je.value + je.value_len, - &my_charset_utf8mb3_general_ci, + &my_charset_utf8mb4_bin, (uchar *) str->ptr(), (uchar *) (str->ptr() + je.value_len))) < 0) goto error; @@ -2248,24 +2253,67 @@ str->set_charset(js->charset()); if (item_pos) { - if (append_simple(str, js->ptr(), item_pos - js->ptr()) || - (n_item > 0 && str->append(" ", 1)) || - append_json_value(str, args[n_arg+1], &tmp_val) || - str->append(",", 1) || - (n_item == 0 && str->append(" ", 1)) || - append_simple(str, item_pos, js->end() - item_pos)) + my_ptrdiff_t size= item_pos - js->ptr(); + if (append_simple(str, js->ptr(), size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); goto return_null; /* Out of memory. */ + } + if (n_item > 0 && str->append(" ", 1)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 1); + goto return_null; /* Out of memory. */ + } + if (append_json_value(str, args[n_arg+1], &tmp_val)) + { + my_error(ER_OUTOFMEMORY, MYF(0), tmp_val.length()); + goto return_null; /* Out of memory. 
*/ + } + if (str->append(",", 1)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 1); + goto return_null; /* Out of memory. */ + } + if (n_item == 0 && str->append(" ", 1)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 1); + goto return_null; /* Out of memory. */ + } + size= js->end() - item_pos; + if (append_simple(str, item_pos, size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); + goto return_null; /* Out of memory. */ + } } else { + my_ptrdiff_t size; /* Insert position wasn't found - append to the array. */ DBUG_ASSERT(je.state == JST_ARRAY_END); item_pos= (const char *) (je.s.c_str - je.sav_c_len); - if (append_simple(str, js->ptr(), item_pos - js->ptr()) || - (n_item > 0 && str->append(", ", 2)) || - append_json_value(str, args[n_arg+1], &tmp_val) || - append_simple(str, item_pos, js->end() - item_pos)) + size= item_pos - js->ptr(); + if (append_simple(str, js->ptr(), size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); + goto return_null; /* Out of memory. */ + } + if (n_item > 0 && str->append(", ", 2)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 2); goto return_null; /* Out of memory. */ + } + if (append_json_value(str, args[n_arg+1], &tmp_val)) + { + my_error(ER_OUTOFMEMORY, MYF(0), tmp_val.length()); + goto return_null; /* Out of memory. */ + } + size= js->end() - item_pos; + if (append_simple(str, item_pos, size)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int) size); + goto return_null; /* Out of memory. 
*/ + } } { @@ -4117,13 +4165,23 @@ goto error; if (je.value_type == JSON_VALUE_STRING) { - if (value2.realloc_with_extra_if_needed(je.value_len) || - (c_len= json_unescape(js->charset(), je.value, + if (value2.realloc_with_extra_if_needed(je.value_len)) + { + my_error(ER_OUTOFMEMORY, MYF(0), je.value_len); + goto error; + } + if ((c_len= json_unescape(js->charset(), je.value, je.value + je.value_len, - &my_charset_utf8mb3_general_ci, + &my_charset_utf8mb4_bin, (uchar *) value2.ptr(), (uchar *) (value2.ptr() + je.value_len))) < 0) + { + if (current_thd) + push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN, + ER_JSON_BAD_CHR, ER_THD(current_thd, ER_JSON_BAD_CHR), + 0, "comparison", (int)((const char *) je.s.c_str - js->ptr())); goto error; + } value2.length(c_len); js= &value2; @@ -4166,13 +4224,23 @@ if (type == JSON_VALUE_STRING) { - if (value1.realloc_with_extra_if_needed(value_len) || - (c_len= json_unescape(value1.charset(), (uchar *) value, + if (value1.realloc_with_extra_if_needed(value_len)) + { + my_error(ER_OUTOFMEMORY, MYF(0), value_len); + return 1; + } + if ((c_len= json_unescape(value1.charset(), (uchar *) value, (uchar *) value+value_len, - &my_charset_utf8mb3_general_ci, + &my_charset_utf8mb4_bin, (uchar *) value1.ptr(), (uchar *) (value1.ptr() + value_len))) < 0) + { + if (current_thd) + push_warning_printf(current_thd, Sql_condition::WARN_LEVEL_WARN, + ER_JSON_BAD_CHR, ER_THD(current_thd, ER_JSON_BAD_CHR), + 0, "equality comparison", 0); return 1; + } value1.length(c_len); res1= &value1; } diff -Nru mariadb-10.11.11/sql/item_strfunc.cc mariadb-10.11.13/sql/item_strfunc.cc --- mariadb-10.11.11/sql/item_strfunc.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_strfunc.cc 2025-05-19 16:14:25.000000000 +0000 @@ -56,7 +56,6 @@ #include "sql_statistics.h" /* fmtlib include (https://fmt.dev/). 
*/ -#define FMT_STATIC_THOUSANDS_SEPARATOR ',' #define FMT_HEADER_ONLY 1 #include "fmt/args.h" @@ -1403,6 +1402,13 @@ }; }; +struct fmt_locale_comma : std::numpunct +{ + char do_thousands_sep() const override { return ','; } + std::string do_grouping() const override { return "\3"; } +}; +static std::locale fmt_locale(std::locale(), new fmt_locale_comma); + /* SFORMAT(format_string, ...) This function receives a formatting specification string and N parameters @@ -1455,7 +1461,7 @@ /* Create the string output */ try { - auto text = fmt::vformat(fmt_arg->c_ptr_safe(), arg_store); + auto text = fmt::vformat(fmt_locale, fmt_arg->c_ptr_safe(), arg_store); res->length(0); res->set_charset(collation.collation); res->append(text.c_str(), text.size(), fmt_arg->charset()); diff -Nru mariadb-10.11.11/sql/item_subselect.cc mariadb-10.11.13/sql/item_subselect.cc --- mariadb-10.11.11/sql/item_subselect.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_subselect.cc 2025-05-19 16:14:25.000000000 +0000 @@ -7147,3 +7147,27 @@ for (uint i= 0; i < merge_keys_count; i++) partial_match_array_sizes[i]= merge_keys[i]->get_key_buff_elements(); } + + +/* + Check if somewhere inside this subselect we read the table. This means a + full read "(SELECT ... 
FROM tbl)", outside reference to tbl.column does not + count +*/ + +bool +Item_subselect::subselect_table_finder_processor(void *arg) +{ + subselect_table_finder_param *param= (subselect_table_finder_param *)arg; + for (SELECT_LEX *sl= unit->first_select(); sl; sl= sl->next_select()) + { + TABLE_LIST *dup; + if ((dup= sl->find_table(param->thd, ¶m->find->db, + ¶m->find->table_name))) + { + param->dup= dup; + return TRUE; + } + } + return FALSE; +}; diff -Nru mariadb-10.11.11/sql/item_subselect.h mariadb-10.11.13/sql/item_subselect.h --- mariadb-10.11.11/sql/item_subselect.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/item_subselect.h 2025-05-19 16:14:25.000000000 +0000 @@ -273,6 +273,7 @@ { return TRUE; } + bool subselect_table_finder_processor(void *arg) override; void register_as_with_rec_ref(With_element *with_elem); void init_expr_cache_tracker(THD *thd); diff -Nru mariadb-10.11.11/sql/lex_string.h mariadb-10.11.13/sql/lex_string.h --- mariadb-10.11.11/sql/lex_string.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/lex_string.h 2025-05-19 16:14:25.000000000 +0000 @@ -110,7 +110,7 @@ class Lex_cstring_strlen: public Lex_cstring { public: - Lex_cstring_strlen(const char *from) + explicit Lex_cstring_strlen(const char *from) :Lex_cstring(from, from ? strlen(from) : 0) { } }; diff -Nru mariadb-10.11.11/sql/log.cc mariadb-10.11.13/sql/log.cc --- mariadb-10.11.11/sql/log.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/log.cc 2025-05-19 16:14:25.000000000 +0000 @@ -322,6 +322,11 @@ incident= TRUE; } + void clear_incident(void) + { + incident= FALSE; + } + bool has_incident(void) { return(incident); @@ -1932,6 +1937,16 @@ if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE)) DBUG_RETURN(1); +#ifdef WITH_WSREP + /* Wsrep transaction was BF aborted but it must replay because certification + succeeded. The transaction must not be written into binlog yet, it will + be done during commit after the replay. 
*/ + if (WSREP(thd) && wsrep_must_replay(thd)) + { + DBUG_RETURN(0); + } +#endif /* WITH_WSREP */ + /* Doing a commit or a rollback including non-transactional tables, i.e., ending a transaction where we might write the transaction @@ -2530,6 +2545,18 @@ } +void binlog_clear_incident(THD *thd) +{ + binlog_cache_mngr *const cache_mngr= opt_bin_log ? + (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton) : 0; + if (cache_mngr) + { + cache_mngr->stmt_cache.clear_incident(); + cache_mngr->trx_cache.clear_incident(); + } +} + + void MYSQL_BIN_LOG::set_write_error(THD *thd, bool is_transactional) { DBUG_ENTER("MYSQL_BIN_LOG::set_write_error"); @@ -7971,7 +7998,12 @@ { DBUG_RETURN(0); } - else if (!(thd->variables.option_bits & OPTION_BIN_LOG)) + + if (!(thd->variables.option_bits & OPTION_BIN_LOG) +#ifdef WITH_WSREP + && !WSREP(thd) +#endif + ) { cache_mngr->need_unlog= false; DBUG_RETURN(0); @@ -8878,6 +8910,13 @@ bool has_xid= entry->end_event->get_type_code() == XID_EVENT; DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_or_stmt"); +#ifdef WITH_WSREP + if (WSREP(entry->thd) && + !(entry->thd->variables.option_bits & OPTION_BIN_LOG)) + { + DBUG_RETURN(0); + } +#endif /* WITH_WSREP */ /* An error in the trx_cache will truncate the cache to the last good diff -Nru mariadb-10.11.11/sql/log.h mariadb-10.11.13/sql/log.h --- mariadb-10.11.11/sql/log.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/log.h 2025-05-19 16:14:25.000000000 +0000 @@ -1186,6 +1186,7 @@ void make_default_log_name(char **out, const char* log_ext, bool once); void binlog_reset_cache(THD *thd); +void binlog_clear_incident(THD *thd); bool write_annotated_row(THD *thd); extern MYSQL_PLUGIN_IMPORT MYSQL_BIN_LOG mysql_bin_log; diff -Nru mariadb-10.11.11/sql/mysql_install_db.cc mariadb-10.11.13/sql/mysql_install_db.cc --- mariadb-10.11.11/sql/mysql_install_db.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysql_install_db.cc 2025-05-19 16:14:25.000000000 +0000 @@ -336,7 +336,7 @@ 
" --bootstrap" " --datadir=." " --tmpdir=." - " --loose-innodb-buffer-pool-size=20M" + " --loose-innodb-buffer-pool-size=21M" "\"" , mysqld_path, opt_verbose_bootstrap ? "--console" : ""); return cmdline; @@ -344,10 +344,29 @@ static char my_ini_path[MAX_PATH]; +/** + Wrapper for WritePrivateProfileStringA, with retries and sleeps + if file is locked by another process. +*/ +static BOOL write_private_profile_string_with_retries(const char *appname, + const char *key, const char *val, const char *filename) +{ + static constexpr int RETRIES=50; + static constexpr int SLEEP_MS=10; + for (int n= RETRIES;; n--) + { + if (WritePrivateProfileStringA(appname, key, val, filename)) + return TRUE; + if (GetLastError() != ERROR_ACCESS_DENIED || !n) + return FALSE; + Sleep(SLEEP_MS); + } +} + static void write_myini_str(const char *key, const char* val, const char *section="mysqld") { DBUG_ASSERT(my_ini_path[0]); - if (!WritePrivateProfileString(section, key, val, my_ini_path)) + if (!write_private_profile_string_with_retries(section, key, val, my_ini_path)) { die("Can't write to ini file key=%s, val=%s, section=%s, Windows error %u",key,val,section, GetLastError()); diff -Nru mariadb-10.11.11/sql/mysql_upgrade_service.cc mariadb-10.11.13/sql/mysql_upgrade_service.cc --- mariadb-10.11.11/sql/mysql_upgrade_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysql_upgrade_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -45,7 +45,6 @@ "OPTIONS:" static char mysqld_path[MAX_PATH]; -static char mysqladmin_path[MAX_PATH]; static char mysqlupgrade_path[MAX_PATH]; static char defaults_file_param[MAX_PATH + 16]; /*--defaults-file= */ @@ -302,13 +301,29 @@ our --skip-grant-tables do not work anymore after mysql_upgrade that does "flush privileges". Instead, the shutdown event is set. 
*/ +#define OPEN_EVENT_RETRY_SLEEP_MS 100 +#define OPEN_EVENT_MAX_RETRIES 50 + void initiate_mysqld_shutdown() { char event_name[32]; DWORD pid= GetProcessId(mysqld_process); sprintf_s(event_name, "MySQLShutdown%d", pid); - HANDLE shutdown_handle= OpenEvent(EVENT_MODIFY_STATE, FALSE, event_name); - if(!shutdown_handle) + + HANDLE shutdown_handle; + for (int i= 0;; i++) + { + shutdown_handle= OpenEvent(EVENT_MODIFY_STATE, FALSE, event_name); + if(shutdown_handle != nullptr || i == OPEN_EVENT_MAX_RETRIES) + break; + if (WaitForSingleObject(mysqld_process, OPEN_EVENT_RETRY_SLEEP_MS) != + WAIT_TIMEOUT) + { + die("server process exited before shutdown event was created"); + break; + } + } + if (!shutdown_handle) { die("OpenEvent() failed for shutdown event"); } @@ -403,6 +418,26 @@ } +/** + Waits until starting server can be connected to, via given named pipe, with timeout + Dies if either server process exited meanwhile, or when timeout was exceeded. +*/ +static void wait_for_server_startup(HANDLE process, const char *named_pipe, DWORD timeout_sec) +{ + unsigned long long end_time= GetTickCount64() + 1000ULL*timeout_sec; + for (;;) + { + if (WaitNamedPipe(named_pipe, 0)) + return; + + if (GetTickCount64() >= end_time) + die("Server did not startup after %lu seconds", timeout_sec); + + if (WaitForSingleObject(process, 100) != WAIT_TIMEOUT) + die("Server did not start"); + } +} + int main(int argc, char **argv) { @@ -419,8 +454,9 @@ /* Get full path to mysqld, we need it when changing service configuration. - Assume installation layout, i.e mysqld.exe, mysqladmin.exe, mysqlupgrade.exe - and mysql_upgrade_service.exe are in the same directory. + Assume mysqld.exe in the same directory as this program. 
+ mysql_upgrade.exe is either in the same directory, or pointed to by + MARIADB_UPGRADE_EXE environment variable (in case of MTR running it) */ GetModuleFileName(NULL, bindir, FN_REFLEN); p= strrchr(bindir, FN_LIBCHAR); @@ -429,15 +465,19 @@ *p= 0; } sprintf_s(mysqld_path, "%s\\mysqld.exe", bindir); - sprintf_s(mysqladmin_path, "%s\\mysqladmin.exe", bindir); sprintf_s(mysqlupgrade_path, "%s\\mysql_upgrade.exe", bindir); - char *paths[]= {mysqld_path, mysqladmin_path, mysqlupgrade_path}; - for(int i= 0; i< 3;i++) - { - if(GetFileAttributes(paths[i]) == INVALID_FILE_ATTRIBUTES) - die("File %s does not exist", paths[i]); + if (access(mysqld_path, 0)) + die("File %s does not exist", mysqld_path); + if (access(mysqlupgrade_path, 0)) + { + /* Try to get path from environment variable, set by MTR */ + char *alt_mysqlupgrade_path= getenv("MARIADB_UPGRADE_EXE"); + if (alt_mysqlupgrade_path) + sprintf_s(mysqlupgrade_path, "%s", alt_mysqlupgrade_path); } + if (access(mysqlupgrade_path, 0)) + die("File %s does not exist", mysqld_path); /* Messages written on stdout should not be buffered, GUI upgrade program @@ -482,6 +522,10 @@ DWORD start_duration_ms = 0; + char pipe_name[64]; + snprintf(pipe_name, sizeof(pipe_name), + "\\\\.\\pipe\\mysql_upgrade_service_%lu", GetCurrentProcessId()); + if (do_start_stop_server) { /* Start/stop server with --loose-innodb-fast-shutdown=1 */ @@ -493,37 +537,23 @@ { die("Cannot start mysqld.exe process, last error =%u", GetLastError()); } - char pipe_name[64]; - snprintf(pipe_name, sizeof(pipe_name), "\\\\.\\pipe\\mysql_upgrade_service_%lu", - GetCurrentProcessId()); - for (;;) - { - if (WaitForSingleObject(mysqld_process, 0) != WAIT_TIMEOUT) - die("mysqld.exe did not start"); - - if (WaitNamedPipe(pipe_name, 0)) - { - // Server started, shut it down. 
- initiate_mysqld_shutdown(); - if (WaitForSingleObject((HANDLE)mysqld_process, shutdown_timeout * 1000) != WAIT_OBJECT_0) - { - die("Could not shutdown server started with '--innodb-fast-shutdown=0'"); - } - DWORD exit_code; - if (!GetExitCodeProcess((HANDLE)mysqld_process, &exit_code)) - { - die("Could not get mysqld's exit code"); - } - if (exit_code) - { - die("Could not get successfully shutdown mysqld"); - } - CloseHandle(mysqld_process); - break; - } - Sleep(500); - start_duration_ms += 500; + wait_for_server_startup(mysqld_process, pipe_name, startup_timeout); + // Server started, shut it down. + initiate_mysqld_shutdown(); + if (WaitForSingleObject((HANDLE)mysqld_process, shutdown_timeout * 1000) != WAIT_OBJECT_0) + { + die("Could not shutdown server"); + } + DWORD exit_code; + if (!GetExitCodeProcess((HANDLE)mysqld_process, &exit_code)) + { + die("Could not get server's exit code"); + } + if (exit_code) + { + die("Could not get successfully shutdown server (exit code %u)",exit_code); } + CloseHandle(mysqld_process); } log("Phase %d/%d: Fixing server config file%s", ++phase, max_phases, @@ -550,22 +580,7 @@ } log("Phase %d/%d: Waiting for startup to complete",++phase,max_phases); - start_duration_ms= 0; - for(;;) - { - if (WaitForSingleObject(mysqld_process, 0) != WAIT_TIMEOUT) - die("mysqld.exe did not start"); - - if (run_tool(P_WAIT, mysqladmin_path, "--protocol=pipe", socket_param, - "ping", "--no-beep", NULL) == 0) - { - break; - } - if (start_duration_ms > startup_timeout*1000) - die("Server did not come up in %d seconds",startup_timeout); - Sleep(500); - start_duration_ms+= 500; - } + wait_for_server_startup(mysqld_process, pipe_name, startup_timeout); log("Phase %d/%d: Running mysql_upgrade",++phase,max_phases); int upgrade_err= (int) run_tool(P_WAIT, mysqlupgrade_path, diff -Nru mariadb-10.11.11/sql/mysqld.cc mariadb-10.11.13/sql/mysqld.cc --- mariadb-10.11.11/sql/mysqld.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysqld.cc 
2025-05-19 16:14:25.000000000 +0000 @@ -420,7 +420,9 @@ char* opt_secure_file_priv; my_bool lower_case_file_system= 0; my_bool opt_large_pages= 0; +#ifdef HAVE_SOLARIS_LARGE_PAGES my_bool opt_super_large_pages= 0; +#endif my_bool opt_myisam_use_mmap= 0; uint opt_large_page_size= 0; #if defined(ENABLED_DEBUG_SYNC) @@ -1396,11 +1398,6 @@ static int systemd_sock_activation; /* systemd socket activation */ - -/** wakeup listening(main) thread by writing to this descriptor */ -static int termination_event_fd= -1; - - C_MODE_START #ifdef WITH_PERFSCHEMA_STORAGE_ENGINE /** @@ -1453,9 +1450,14 @@ #endif /* OS specific variables */ - +#ifndef EMBEDDED_LIBRARY #ifdef _WIN32 +/** wakeup main thread by signaling this event */ HANDLE hEventShutdown; +#else +/** wakeup listening(main) thread by writing to this descriptor */ +static int termination_event_fd= -1; +#endif #endif @@ -3744,12 +3746,12 @@ #endif /* - When thread specific is set, both mysqld_server_initialized and thd - must be set, and we check that with DBUG_ASSERT. - - However, do not crash, if current_thd is NULL, in release version. + is_thread_specific is only relevant when a THD exist and the server + has fully started. is_thread_specific can be set during recovery by + Aria for functions that are normally only run in one thread. + However InnoDB sets thd early, so we can use it. 
*/ - DBUG_ASSERT(!is_thread_specific || (mysqld_server_initialized && thd)); + DBUG_ASSERT(!is_thread_specific || thd || !plugins_are_initialized); if (is_thread_specific && likely(thd)) /* If thread specific memory */ { @@ -4118,7 +4120,7 @@ if (opt_large_pages) { DBUG_PRINT("info", ("Large page set")); - if (my_init_large_pages(opt_super_large_pages)) + if (my_init_large_pages()) { return 1; } @@ -5337,7 +5339,7 @@ MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"), MARIADB_REMOVED_OPTION("innodb-lru-flush-size"), MARIADB_REMOVED_OPTION("innodb-page-cleaners"), - MARIADB_REMOVED_OPTION("innodb-purge-truncate-frequency"), + MARIADB_REMOVED_OPTION("innodb-purge-rseg-truncate-frequency"), MARIADB_REMOVED_OPTION("innodb-replication-delay"), MARIADB_REMOVED_OPTION("innodb-scrub-log"), MARIADB_REMOVED_OPTION("innodb-scrub-log-speed"), @@ -7872,7 +7874,9 @@ bzero((char*) &global_status_var, offsetof(STATUS_VAR, last_cleared_system_status_var)); opt_large_pages= 0; +#ifdef HAVE_SOLARIS_LARGE_PAGES opt_super_large_pages= 0; +#endif #if defined(ENABLED_DEBUG_SYNC) opt_debug_sync_timeout= 0; #endif /* defined(ENABLED_DEBUG_SYNC) */ @@ -8872,15 +8876,22 @@ bool is_log= opt_log || global_system_variables.sql_log_slow || opt_bin_log; bool is_debug= IF_DBUG(!strstr(MYSQL_SERVER_SUFFIX_STR, "-debug"), 0); const char *is_valgrind= -#ifdef HAVE_VALGRIND +#ifdef HAVE_valgrind !strstr(MYSQL_SERVER_SUFFIX_STR, "-valgrind") ? "-valgrind" : #endif ""; + const char *is_asan= +#ifdef __SANITIZE_ADDRESS__ + !strstr(MYSQL_SERVER_SUFFIX_STR, "-asan") ? "-asan" : +#endif + ""; + return strxnmov(buf, size - 1, MYSQL_SERVER_VERSION, MYSQL_SERVER_SUFFIX_STR, IF_EMBEDDED("-embedded", ""), is_valgrind, + is_asan, is_debug ? "-debug" : "", is_log ? 
"-log" : "", NullS); @@ -9303,6 +9314,7 @@ PSI_stage_info stage_purging_old_relay_logs= { 0, "Purging old relay logs", 0}; PSI_stage_info stage_query_end= { 0, "Query end", 0}; PSI_stage_info stage_starting_cleanup= { 0, "Starting cleanup", 0}; +PSI_stage_info stage_slave_sql_cleanup= { 0, "Slave SQL thread ending", 0}; PSI_stage_info stage_rollback= { 0, "Rollback", 0}; PSI_stage_info stage_rollback_implicit= { 0, "Rollback_implicit", 0}; PSI_stage_info stage_commit= { 0, "Commit", 0}; @@ -9544,6 +9556,7 @@ & stage_preparing, & stage_purging_old_relay_logs, & stage_starting_cleanup, + & stage_slave_sql_cleanup, & stage_query_end, & stage_queueing_master_event_to_the_relay_log, & stage_reading_event_from_the_relay_log, diff -Nru mariadb-10.11.11/sql/mysqld.h mariadb-10.11.13/sql/mysqld.h --- mariadb-10.11.11/sql/mysqld.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/mysqld.h 2025-05-19 16:14:25.000000000 +0000 @@ -612,6 +612,7 @@ extern PSI_stage_info stage_purging_old_relay_logs; extern PSI_stage_info stage_query_end; extern PSI_stage_info stage_starting_cleanup; +extern PSI_stage_info stage_slave_sql_cleanup; extern PSI_stage_info stage_rollback; extern PSI_stage_info stage_rollback_implicit; extern PSI_stage_info stage_commit; diff -Nru mariadb-10.11.11/sql/net_serv.cc mariadb-10.11.13/sql/net_serv.cc --- mariadb-10.11.11/sql/net_serv.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/net_serv.cc 2025-05-19 16:14:25.000000000 +0000 @@ -773,18 +773,22 @@ } #endif /* !defined(MYSQL_SERVER) */ net->error= 2; /* Close socket */ - net->last_errno= (interrupted ? 
ER_NET_WRITE_INTERRUPTED : - ER_NET_ERROR_ON_WRITE); -#ifdef MYSQL_SERVER - if (global_system_variables.log_warnings > 3) + + if (net->vio->state != VIO_STATE_SHUTDOWN || net->last_errno == 0) { - sql_print_warning("Could not write packet: fd: %lld state: %d " - "errno: %d vio_errno: %d length: %ld", - (longlong) vio_fd(net->vio), (int) net->vio->state, - vio_errno(net->vio), net->last_errno, - (ulong) (end-pos)); - } + net->last_errno= (interrupted ? ER_NET_WRITE_INTERRUPTED : + ER_NET_ERROR_ON_WRITE); +#ifdef MYSQL_SERVER + if (global_system_variables.log_warnings > 3) + { + sql_print_warning("Could not write packet: fd: %lld state: %d " + "errno: %d vio_errno: %d length: %ld", + (longlong) vio_fd(net->vio), (int) net->vio->state, + vio_errno(net->vio), net->last_errno, + (ulong) (end-pos)); + } #endif + } MYSQL_SERVER_my_error(net->last_errno, MYF(0)); break; } @@ -1097,6 +1101,7 @@ ER_NET_READ_INTERRUPTED : ER_NET_READ_ERROR); #ifdef MYSQL_SERVER + strmake_buf(net->last_error, ER(net->last_errno)); if (global_system_variables.log_warnings > 3) { /* Log things as a warning */ diff -Nru mariadb-10.11.11/sql/opt_range.cc mariadb-10.11.13/sql/opt_range.cc --- mariadb-10.11.11/sql/opt_range.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/opt_range.cc 2025-05-19 16:14:25.000000000 +0000 @@ -8351,56 +8351,58 @@ /* Build conjunction of all SEL_TREEs for a simple predicate applying equalities - + SYNOPSIS get_full_func_mm_tree() param PARAM from SQL_SELECT::test_quick_select field_item field in the predicate - value constant in the predicate (or a field already read from + value constant in the predicate (or a field already read from a table in the case of dynamic range access) (for BETWEEN it contains the number of the field argument, - for IN it's always 0) + for IN it's always 0) inv TRUE <> NOT cond_func is considered (makes sense only when cond_func is BETWEEN or IN) DESCRIPTION - For a simple SARGable predicate of the form (f op c), where f is a 
field and - c is a constant, the function builds a conjunction of all SEL_TREES that can - be obtained by the substitution of f for all different fields equal to f. + For a simple SARGable predicate of the form (f op c), where f is a field + and c is a constant, the function builds a conjunction of all SEL_TREES that + can be obtained by the substitution of f for all different fields equal to f. - NOTES + NOTES If the WHERE condition contains a predicate (fi op c), then not only SELL_TREE for this predicate is built, but the trees for the results of substitution of fi for each fj belonging to the same multiple equality as fi are built as well. - E.g. for WHERE t1.a=t2.a AND t2.a > 10 + E.g. for WHERE t1.a=t2.a AND t2.a > 10 a SEL_TREE for t2.a > 10 will be built for quick select from t2 - and + and a SEL_TREE for t1.a > 10 will be built for quick select from t1. - A BETWEEN predicate of the form (fi [NOT] BETWEEN c1 AND c2) is treated - in a similar way: we build a conjuction of trees for the results - of all substitutions of fi for equal fj. + A BETWEEN predicate of the form (fi [NOT] BETWEEN c1 AND c2), where fi + is some field, is treated in a similar way: we build a conjuction of + trees for the results of all substitutions of fi equal fj. + Yet a predicate of the form (c BETWEEN f1i AND f2i) is processed differently. It is considered as a conjuction of two SARGable - predicates (f1i <= c) and (f2i <=c) and the function get_full_func_mm_tree - is called for each of them separately producing trees for - AND j (f1j <=c ) and AND j (f2j <= c) + predicates (f1i <= c) and (c <= f2i) and the function get_full_func_mm_tree + is called for each of them separately producing trees for + AND j (f1j <= c) and AND j (c <= f2j) After this these two trees are united in one conjunctive tree. 
It's easy to see that the same tree is obtained for - AND j,k (f1j <=c AND f2k<=c) - which is equivalent to + AND j,k (f1j <= c AND c <= f2k) + which is equivalent to AND j,k (c BETWEEN f1j AND f2k). + The validity of the processing of the predicate (c NOT BETWEEN f1i AND f2i) which equivalent to (f1i > c OR f2i < c) is not so obvious. Here the - function get_full_func_mm_tree is called for (f1i > c) and (f2i < c) - producing trees for AND j (f1j > c) and AND j (f2j < c). Then this two - trees are united in one OR-tree. The expression + function get_full_func_mm_tree is called for (f1i > c) and called for + (f2i < c) producing trees for AND j (f1j > c) and AND j (f2j < c). Then + this two trees are united in one OR-tree. The expression (AND j (f1j > c) OR AND j (f2j < c) is equivalent to the expression - AND j,k (f1j > c OR f2k < c) - which is just a translation of + AND j,k (f1j > c OR f2k < c) + which is just a translation of AND j,k (c NOT BETWEEN f1j AND f2k) In the cases when one of the items f1, f2 is a constant c1 we do not create @@ -8413,9 +8415,9 @@ As to IN predicates only ones of the form (f IN (c1,...,cn)), where f1 is a field and c1,...,cn are constant, are considered as SARGable. We never try to narrow the index scan using predicates of - the form (c IN (c1,...,f,...,cn)). - - RETURN + the form (c IN (c1,...,f,...,cn)). 
+ + RETURN Pointer to the tree representing the built conjunction of SEL_TREEs */ @@ -8513,6 +8515,11 @@ SEL_TREE *tree= li.ref()[0]->get_mm_tree(param, li.ref()); if (param->statement_should_be_aborted()) DBUG_RETURN(NULL); + bool orig_disable_index_merge= param->disable_index_merge_plans; + + if (list.elements > MAX_OR_ELEMENTS_FOR_INDEX_MERGE) + param->disable_index_merge_plans= true; + if (tree) { if (tree->type == SEL_TREE::IMPOSSIBLE && @@ -8529,7 +8536,10 @@ { SEL_TREE *new_tree= li.ref()[0]->get_mm_tree(param, li.ref()); if (new_tree == NULL || param->statement_should_be_aborted()) + { + param->disable_index_merge_plans= orig_disable_index_merge; DBUG_RETURN(NULL); + } tree= tree_or(param, tree, new_tree); if (tree == NULL || tree->type == SEL_TREE::ALWAYS) { @@ -8561,6 +8571,7 @@ if (replace_cond) *cond_ptr= replacement_item; } + param->disable_index_merge_plans= orig_disable_index_merge; DBUG_RETURN(tree); } @@ -8614,6 +8625,19 @@ } +bool +Item_func_between::can_optimize_range_const(Item_field *field_item) const +{ + const Type_handler *fi_handler= field_item->type_handler_for_comparison(); + Type_handler_hybrid_field_type cmp(fi_handler); + if (cmp.aggregate_for_comparison(args[0]->type_handler_for_comparison()) || + cmp.type_handler() != m_comparator.type_handler()) + return false; // Cannot optimize range because of type mismatch. 
+ + return true; +} + + SEL_TREE * Item_func_between::get_mm_tree(RANGE_OPT_PARAM *param, Item **cond_ptr) { @@ -8639,6 +8663,8 @@ if (arguments()[i]->real_item()->type() == Item::FIELD_ITEM) { Item_field *field_item= (Item_field*) (arguments()[i]->real_item()); + if (!can_optimize_range_const(field_item)) + continue; SEL_TREE *tmp= get_full_func_mm_tree(param, field_item, (Item*)(intptr) i); if (negated) @@ -9952,6 +9978,8 @@ { bool must_be_ored= sel_trees_must_be_ored(param, tree1, tree2, ored_keys); no_imerge_from_ranges= must_be_ored; + if (param->disable_index_merge_plans) + no_imerge_from_ranges= true; if (no_imerge_from_ranges && no_merges1 && no_merges2) { @@ -16006,7 +16034,7 @@ Remember this key, and continue looking for a non-NULL key that satisfies some other condition. */ - memcpy(tmp_record, record, head->s->rec_buff_length); + memcpy(tmp_record, record, head->s->reclength); found_null= TRUE; continue; } @@ -16046,7 +16074,7 @@ */ if (found_null && result) { - memcpy(record, tmp_record, head->s->rec_buff_length); + memcpy(record, tmp_record, head->s->reclength); result= 0; } return result; @@ -16079,7 +16107,7 @@ ha_rkey_function find_flag; key_part_map keypart_map; QUICK_RANGE *cur_range; - int result; + int result= HA_ERR_KEY_NOT_FOUND; DBUG_ASSERT(min_max_ranges.elements > 0); @@ -16088,10 +16116,11 @@ get_dynamic(&min_max_ranges, (uchar*)&cur_range, range_idx - 1); /* - If the current value for the min/max argument is smaller than the left - boundary of cur_range, there is no need to check this range. + If the key has already been "moved" by a successful call to + ha_index_read_map, and the current value for the max argument + comes before the range, there is no need to check this range. 
*/ - if (range_idx != min_max_ranges.elements && + if (!result && !(cur_range->flag & NO_MIN_RANGE) && (key_cmp(min_max_arg_part, (const uchar*) cur_range->min_key, min_max_arg_len) == -1)) diff -Nru mariadb-10.11.11/sql/opt_range.h mariadb-10.11.13/sql/opt_range.h --- mariadb-10.11.11/sql/opt_range.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/opt_range.h 2025-05-19 16:14:25.000000000 +0000 @@ -39,6 +39,32 @@ class JOIN; class Item_sum; +/* + When processing an OR clause with more than MAX_OR_ELEMENTS_FOR_INDEX_MERGE + disjuncts (i.e. OR-parts), do not construct index_merge plans from it. + + Some users have OR clauses with extremely large number of disjuncts, like: + + (key1=1 AND key2=10) OR + (key1=2 AND key2=20) OR + (key1=3 AND key2=30) OR + ... + + When processing this, the optimizer would try to build a lot of potential + index_merge plans. Hypothetically this could be useful as the cheapest plan + could be to pick a specific index for each disjunct and build: + + index_merge(key1 IN (1,3,8,15...), key2 IN (20, 40, 50 ...)) + + In practice this causes combinatorial amount of time to be spent in the range + analyzer, and most variants will be discarded when the range optimizer tries + to avoid this combinatorial explosion (which may or may not work depending on + the form of the WHERE clause). + In practice, very long ORs are served well enough by just considering range + accesses on individual indexes. +*/ +const int MAX_OR_ELEMENTS_FOR_INDEX_MERGE=100; + struct KEY_PART { uint16 key,part; /* See KEY_PART_INFO for meaning of the next two: */ @@ -889,6 +915,9 @@ */ bool remove_false_where_parts; + /* If TRUE, do not construct index_merge plans */ + bool disable_index_merge_plans; + /* Which functions should give SQL notes for unusable keys. 
*/ diff -Nru mariadb-10.11.11/sql/rpl_injector.h mariadb-10.11.13/sql/rpl_injector.h --- mariadb-10.11.11/sql/rpl_injector.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_injector.h 2025-05-19 16:14:25.000000000 +0000 @@ -146,7 +146,6 @@ }; transaction() : m_thd(NULL) { } - transaction(transaction const&); ~transaction(); /* Clear transaction, i.e., make calls to 'good()' return false. */ diff -Nru mariadb-10.11.11/sql/rpl_mi.cc mariadb-10.11.13/sql/rpl_mi.cc --- mariadb-10.11.11/sql/rpl_mi.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_mi.cc 2025-05-19 16:14:25.000000000 +0000 @@ -21,6 +21,7 @@ #include "slave.h" #include "strfunc.h" #include "sql_repl.h" +#include #ifdef HAVE_REPLICATION @@ -1369,27 +1370,21 @@ Sql_condition::enum_warning_level warning) { Master_info *mi; - char buff[MAX_CONNECTION_NAME+1], *res; - size_t buff_length; DBUG_ENTER("get_master_info"); DBUG_PRINT("enter", ("connection_name: '%.*s'", (int) connection_name->length, connection_name->str)); - /* Make name lower case for comparison */ - res= strmake(buff, connection_name->str, connection_name->length); - my_casedn_str(system_charset_info, buff); - buff_length= (size_t) (res-buff); - + if (!connection_name->str) + connection_name= &empty_clex_str; mi= (Master_info*) my_hash_search(&master_info_hash, - (uchar*) buff, buff_length); + (uchar*) connection_name->str, + connection_name->length); if (!mi && warning != Sql_condition::WARN_LEVEL_NOTE) { my_error(WARN_NO_MASTER_INFO, - MYF(warning == Sql_condition::WARN_LEVEL_WARN ? ME_WARNING : - 0), - (int) connection_name->length, - connection_name->str); + MYF(warning == Sql_condition::WARN_LEVEL_WARN ? 
ME_WARNING : 0), + (int) connection_name->length, connection_name->str); } DBUG_RETURN(mi); } @@ -2074,4 +2069,52 @@ DBUG_RETURN(result); } +void setup_mysql_connection_for_master(MYSQL *mysql, Master_info *mi, + uint timeout) +{ + DBUG_ASSERT(mi); + DBUG_ASSERT(mi->mysql); + mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, (char *) &timeout); + mysql_options(mysql, MYSQL_OPT_READ_TIMEOUT, (char *) &timeout); + +#ifdef HAVE_OPENSSL + if (mi->ssl) + { + mysql_ssl_set(mysql, + mi->ssl_key[0]?mi->ssl_key:0, + mi->ssl_cert[0]?mi->ssl_cert:0, + mi->ssl_ca[0]?mi->ssl_ca:0, + mi->ssl_capath[0]?mi->ssl_capath:0, + mi->ssl_cipher[0]?mi->ssl_cipher:0); + mysql_options(mysql, MYSQL_OPT_SSL_CRL, + mi->ssl_crl[0] ? mi->ssl_crl : 0); + mysql_options(mysql, MYSQL_OPT_SSL_CRLPATH, + mi->ssl_crlpath[0] ? mi->ssl_crlpath : 0); + mysql_options(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, + &mi->ssl_verify_server_cert); + } +#endif + + /* + If server's default charset is not supported (like utf16, utf32) as client + charset, then set client charset to 'latin1' (default client charset). + */ + if (is_supported_parser_charset(default_charset_info)) + mysql_options(mysql, MYSQL_SET_CHARSET_NAME, default_charset_info->cs_name.str); + else + { + sql_print_information("'%s' can not be used as client character set. 
" + "'%s' will be used as default client character set " + "while connecting to master.", + default_charset_info->cs_name.str, + default_client_charset_info->cs_name.str); + mysql_options(mysql, MYSQL_SET_CHARSET_NAME, + default_client_charset_info->cs_name.str); + } + + /* Set MYSQL_PLUGIN_DIR in case master asks for an external authentication plugin */ + if (opt_plugin_dir_ptr && *opt_plugin_dir_ptr) + mysql_options(mysql, MYSQL_PLUGIN_DIR, opt_plugin_dir_ptr); +} + #endif /* HAVE_REPLICATION */ diff -Nru mariadb-10.11.11/sql/rpl_mi.h mariadb-10.11.13/sql/rpl_mi.h --- mariadb-10.11.11/sql/rpl_mi.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_mi.h 2025-05-19 16:14:25.000000000 +0000 @@ -487,5 +487,16 @@ uint any_slave_sql_running(bool already_locked); bool give_error_if_slave_running(bool already_lock); +/* + Sets up the basic options for a MYSQL connection, mysql, to connect to the + primary server described by the Master_info parameter, mi. The timeout must + be passed explicitly, as different types of connections created by the slave + will use different values. + + Assumes mysql_init() has already been called on the mysql connection object. 
+*/ +void setup_mysql_connection_for_master(MYSQL *mysql, Master_info *mi, + uint timeout); + #endif /* HAVE_REPLICATION */ #endif /* RPL_MI_H */ diff -Nru mariadb-10.11.11/sql/rpl_parallel.cc mariadb-10.11.13/sql/rpl_parallel.cc --- mariadb-10.11.11/sql/rpl_parallel.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/rpl_parallel.cc 2025-05-19 16:14:25.000000000 +0000 @@ -124,8 +124,8 @@ else if (cmp == 0 && rli->group_master_log_pos < qev->future_event_master_log_pos) rli->group_master_log_pos= qev->future_event_master_log_pos; - mysql_mutex_unlock(&rli->data_lock); mysql_cond_broadcast(&rli->data_cond); + mysql_mutex_unlock(&rli->data_lock); } @@ -153,14 +153,12 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id, rpl_parallel_entry *entry, rpl_group_info *rgi) { - THD *thd= rpt->thd; - wait_for_commit *wfc= &rgi->commit_orderer; - int err; - if (rgi->get_finish_event_group_called()) return; - thd->get_stmt_da()->set_overwrite_status(true); + THD *thd= rpt->thd; + wait_for_commit *wfc= &rgi->commit_orderer; + int err; if (unlikely(rgi->worker_error)) { @@ -320,10 +318,6 @@ wait_for_pending_deadlock_kill(thd, rgi); thd->clear_error(); thd->reset_killed(); - /* - Would do thd->get_stmt_da()->set_overwrite_status(false) here, but - reset_diagnostics_area() already does that. 
- */ thd->get_stmt_da()->reset_diagnostics_area(); wfc->wakeup_subsequent_commits(rgi->worker_error); rgi->did_mark_start_commit= false; @@ -1597,9 +1591,7 @@ else { delete qev->ev; - thd->get_stmt_da()->set_overwrite_status(true); err= thd->wait_for_prior_commit(); - thd->get_stmt_da()->set_overwrite_status(false); } end_of_group= diff -Nru mariadb-10.11.11/sql/semisync_master.cc mariadb-10.11.13/sql/semisync_master.cc --- mariadb-10.11.11/sql/semisync_master.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/semisync_master.cc 2025-05-19 16:14:25.000000000 +0000 @@ -565,12 +565,14 @@ { lock(); DBUG_ASSERT(rpl_semi_sync_master_clients > 0); - if (!(--rpl_semi_sync_master_clients) && !rpl_semi_sync_master_wait_no_slave) + if (!(--rpl_semi_sync_master_clients) && !rpl_semi_sync_master_wait_no_slave + && get_master_enabled()) { /* Signal transactions waiting in commit_trx() that they do not have to wait anymore. */ + DBUG_ASSERT(m_active_tranxs); m_active_tranxs->clear_active_tranx_nodes(NULL, 0, signal_waiting_transaction); } diff -Nru mariadb-10.11.11/sql/semisync_slave.cc mariadb-10.11.13/sql/semisync_slave.cc --- mariadb-10.11.11/sql/semisync_slave.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/semisync_slave.cc 2025-05-19 16:14:25.000000000 +0000 @@ -141,7 +141,7 @@ DBUG_ASSERT(!debug_sync_set_action(mi->io_thd, STRING_WITH_LEN(act))); };); #endif - kill_connection(mi->mysql); + kill_connection(mi); } set_slave_enabled(0); @@ -158,8 +158,9 @@ } -void Repl_semi_sync_slave::kill_connection(MYSQL *mysql) +void Repl_semi_sync_slave::kill_connection(Master_info *mi) { + MYSQL *mysql= mi->mysql; if (!mysql) return; @@ -168,8 +169,8 @@ size_t kill_buffer_length; kill_mysql = mysql_init(kill_mysql); - mysql_options(kill_mysql, MYSQL_OPT_CONNECT_TIMEOUT, &m_kill_conn_timeout); - mysql_options(kill_mysql, MYSQL_OPT_READ_TIMEOUT, &m_kill_conn_timeout); + + setup_mysql_connection_for_master(kill_mysql, mi, m_kill_conn_timeout); 
mysql_options(kill_mysql, MYSQL_OPT_WRITE_TIMEOUT, &m_kill_conn_timeout); bool ret= (!mysql_real_connect(kill_mysql, mysql->host, diff -Nru mariadb-10.11.11/sql/semisync_slave.h mariadb-10.11.13/sql/semisync_slave.h --- mariadb-10.11.11/sql/semisync_slave.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/semisync_slave.h 2025-05-19 16:14:25.000000000 +0000 @@ -92,7 +92,7 @@ void slave_stop(Master_info *mi); void slave_reconnect(Master_info *mi); int request_transmit(Master_info *mi); - void kill_connection(MYSQL *mysql); + void kill_connection(Master_info *mi); private: /* True when init_object has been called */ diff -Nru mariadb-10.11.11/sql/signal_handler.cc mariadb-10.11.13/sql/signal_handler.cc --- mariadb-10.11.11/sql/signal_handler.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/signal_handler.cc 2025-05-19 16:14:25.000000000 +0000 @@ -277,7 +277,7 @@ my_safe_printf_stderr("Status: %s\n", kreason); my_safe_printf_stderr("Query (%p): ", thd->query()); my_safe_print_str(thd->query(), MY_MIN(65536U, thd->query_length())); - my_safe_printf_stderr("%s", "Optimizer switch: "); + my_safe_printf_stderr("%s", "\nOptimizer switch: "); ulonglong optsw= thd->variables.optimizer_switch; for (uint i= 0; optimizer_switch_names[i+1]; i++, optsw >>= 1) { diff -Nru mariadb-10.11.11/sql/slave.cc mariadb-10.11.13/sql/slave.cc --- mariadb-10.11.11/sql/slave.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/slave.cc 2025-05-19 16:14:25.000000000 +0000 @@ -3213,21 +3213,23 @@ if (full) protocol->store(mi->connection_name.str, mi->connection_name.length, &my_charset_bin); + mysql_mutex_lock(&mi->run_lock); + THD *sql_thd= mi->rli.sql_driver_thd; + const char *slave_sql_running_state= + sql_thd ? sql_thd->get_proc_info() : ""; + THD *io_thd= mi->io_thd; + const char *slave_io_running_state= io_thd ? 
io_thd->get_proc_info() : ""; + mysql_mutex_unlock(&mi->run_lock); + if (full) - { /* Show what the sql driver replication thread is doing This is only meaningful if there is only one slave thread. */ - msg= (mi->rli.sql_driver_thd ? - mi->rli.sql_driver_thd->get_proc_info() : ""); - protocol->store_string_or_null(msg, &my_charset_bin); - } - msg= mi->io_thd ? mi->io_thd->get_proc_info() : ""; - protocol->store_string_or_null(msg, &my_charset_bin); + protocol->store_string_or_null(slave_sql_running_state, &my_charset_bin); - mysql_mutex_unlock(&mi->run_lock); + protocol->store_string_or_null(slave_io_running_state, &my_charset_bin); mysql_mutex_lock(&mi->data_lock); mysql_mutex_lock(&mi->rli.data_lock); @@ -3401,10 +3403,6 @@ protocol->store((uint32) mi->rli.get_sql_delay()); // SQL_Remaining_Delay - // THD::proc_info is not protected by any lock, so we read it once - // to ensure that we use the same value throughout this function. - const char *slave_sql_running_state= - mi->rli.sql_driver_thd ? mi->rli.sql_driver_thd->proc_info : ""; if (slave_sql_running_state == stage_sql_thd_waiting_until_delay.m_name) { time_t t= my_time(0), sql_delay_end= mi->rli.get_sql_delay_end(); @@ -5485,6 +5483,7 @@ THD *thd; /* needs to be first for thread_stack */ char saved_log_name[FN_REFLEN]; char saved_master_log_name[FN_REFLEN]; + bool thd_initialized= 0; my_off_t UNINIT_VAR(saved_log_pos); my_off_t UNINIT_VAR(saved_master_log_pos); String saved_skip_gtid_pos; @@ -5587,6 +5586,7 @@ thd->variables.alter_algorithm= (ulong) Alter_info::ALTER_TABLE_ALGORITHM_DEFAULT; server_threads.insert(thd); + thd_initialized= 1; /* We are going to set slave_running to 1. 
Assuming slave I/O thread is alive and connected, this is going to make Seconds_Behind_Master be 0 @@ -5966,7 +5966,7 @@ } THD_STAGE_INFO(thd, stage_waiting_for_slave_mutex_on_exit); thd->add_status_to_global(); - server_threads.erase(thd); + THD_STAGE_INFO(thd, stage_slave_sql_cleanup); mysql_mutex_lock(&rli->run_lock); err_during_init: @@ -5980,9 +5980,9 @@ rli->relay_log.description_event_for_exec= 0; rli->reset_inuse_relaylog(); /* Wake up master_pos_wait() */ - mysql_mutex_unlock(&rli->data_lock); DBUG_PRINT("info",("Signaling possibly waiting master_pos_wait() functions")); mysql_cond_broadcast(&rli->data_cond); + mysql_mutex_unlock(&rli->data_lock); rli->ignore_log_space_limit= 0; /* don't need any lock */ /* we die so won't remember charset - re-update them on next thread start */ thd->system_thread_info.rpl_sql_info->cached_charset_invalidate(); @@ -6037,6 +6037,8 @@ rpl_parallel_resize_pool_if_no_slaves(); delete serial_rgi; + if (thd_initialized) + server_threads.erase(thd); delete thd; DBUG_LEAVE; // Must match DBUG_ENTER() @@ -7616,50 +7618,10 @@ if (opt_slave_compressed_protocol) client_flag|= CLIENT_COMPRESS; /* We will use compression */ - mysql_options(mysql, MYSQL_OPT_CONNECT_TIMEOUT, (char *) &slave_net_timeout); - mysql_options(mysql, MYSQL_OPT_READ_TIMEOUT, (char *) &slave_net_timeout); + setup_mysql_connection_for_master(mi->mysql, mi, slave_net_timeout); mysql_options(mysql, MYSQL_OPT_USE_THREAD_SPECIFIC_MEMORY, (char*) &my_true); -#ifdef HAVE_OPENSSL - if (mi->ssl) - { - mysql_ssl_set(mysql, - mi->ssl_key[0]?mi->ssl_key:0, - mi->ssl_cert[0]?mi->ssl_cert:0, - mi->ssl_ca[0]?mi->ssl_ca:0, - mi->ssl_capath[0]?mi->ssl_capath:0, - mi->ssl_cipher[0]?mi->ssl_cipher:0); - mysql_options(mysql, MYSQL_OPT_SSL_CRL, - mi->ssl_crl[0] ? mi->ssl_crl : 0); - mysql_options(mysql, MYSQL_OPT_SSL_CRLPATH, - mi->ssl_crlpath[0] ? 
mi->ssl_crlpath : 0); - mysql_options(mysql, MYSQL_OPT_SSL_VERIFY_SERVER_CERT, - &mi->ssl_verify_server_cert); - } -#endif - - /* - If server's default charset is not supported (like utf16, utf32) as client - charset, then set client charset to 'latin1' (default client charset). - */ - if (is_supported_parser_charset(default_charset_info)) - mysql_options(mysql, MYSQL_SET_CHARSET_NAME, default_charset_info->cs_name.str); - else - { - sql_print_information("'%s' can not be used as client character set. " - "'%s' will be used as default client character set " - "while connecting to master.", - default_charset_info->cs_name.str, - default_client_charset_info->cs_name.str); - mysql_options(mysql, MYSQL_SET_CHARSET_NAME, - default_client_charset_info->cs_name.str); - } - - /* Set MYSQL_PLUGIN_DIR in case master asks for an external authentication plugin */ - if (opt_plugin_dir_ptr && *opt_plugin_dir_ptr) - mysql_options(mysql, MYSQL_PLUGIN_DIR, opt_plugin_dir_ptr); - /* we disallow empty users */ if (mi->user[0] == 0) { diff -Nru mariadb-10.11.11/sql/sp_head.cc mariadb-10.11.13/sql/sp_head.cc --- mariadb-10.11.11/sql/sp_head.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sp_head.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1531,7 +1531,7 @@ thd->wsrep_cs().reset_error(); /* Reset also thd->killed if it has been set during BF abort. */ if (killed_mask_hard(thd->killed) == KILL_QUERY) - thd->killed= NOT_KILLED; + thd->reset_killed(); /* if failed transaction was not replayed, must return with error from here */ if (!must_replay) err_status = 1; } @@ -2552,6 +2552,16 @@ if (!spvar) DBUG_RETURN(FALSE); + if (!spvar->field_def.type_handler()->is_scalar_type() && + dynamic_cast(arg_item)) + { + // Item_param cannot store values of non-scalar data types yet + my_error(ER_ILLEGAL_PARAMETER_DATA_TYPE_FOR_OPERATION, MYF(0), + spvar->field_def.type_handler()->name().ptr(), + "EXECUTE ... 
USING ?"); + DBUG_RETURN(true); + } + if (spvar->mode != sp_variable::MODE_IN) { Settable_routine_parameter *srp= diff -Nru mariadb-10.11.11/sql/sql_acl.cc mariadb-10.11.13/sql/sql_acl.cc --- mariadb-10.11.11/sql/sql_acl.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_acl.cc 2025-05-19 16:14:25.000000000 +0000 @@ -8433,19 +8433,13 @@ /* If sequence is used as part of NEXT VALUE, PREVIOUS VALUE or SELECT, - we need to modify the requested access rights depending on how the - sequence is used. + the privilege will be checked in ::fix_fields(). + Direct SELECT of a sequence table doesn't set t_ref->sequence, so + privileges will be checked normally, as for any table. */ if (t_ref->sequence && !(want_access & ~(SELECT_ACL | INSERT_ACL | UPDATE_ACL | DELETE_ACL))) - { - /* - We want to have either SELECT or INSERT rights to sequences depending - on how they are accessed - */ - orig_want_access= ((t_ref->lock_type >= TL_FIRST_WRITE) ? - INSERT_ACL : SELECT_ACL); - } + continue; const ACL_internal_table_access *access= get_cached_table_access(&t_ref->grant.m_internal, @@ -13111,6 +13105,9 @@ return dup; } + if (!initialized) + return dup; + if (lock) mysql_mutex_lock(&acl_cache->lock); if (find_acl_role(dup->user.str, false)) diff -Nru mariadb-10.11.11/sql/sql_base.cc mariadb-10.11.13/sql/sql_base.cc --- mariadb-10.11.11/sql/sql_base.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_base.cc 2025-05-19 16:14:25.000000000 +0000 @@ -19,6 +19,7 @@ #include "mariadb.h" #include "sql_base.h" // setup_table_map +#include "sql_list.h" #include "sql_priv.h" #include "unireg.h" #include "debug_sync.h" @@ -781,6 +782,7 @@ } } +#ifdef DBUG_ASSERT_EXISTS static inline bool check_field_pointers(const TABLE *table) { for (Field **pf= table->field; *pf; pf++) @@ -796,6 +798,7 @@ } return true; } +#endif int close_thread_tables_for_query(THD *thd) @@ -1173,7 +1176,6 @@ t_name= &table->table_name; t_alias= &table->alias; -retry: DBUG_PRINT("info", 
("real table: %s.%s", d_name->str, t_name->str)); for (TABLE_LIST *tl= table_list; tl ; tl= tl->next_global, res= 0) { @@ -1235,28 +1237,53 @@ DBUG_PRINT("info", ("found same copy of table or table which we should skip")); } - if (res && res->belong_to_derived) - { - /* - We come here for queries of type: - INSERT INTO t1 (SELECT tmp.a FROM (select * FROM t1) as tmp); + DBUG_RETURN(res); +} - Try to fix by materializing the derived table - */ - TABLE_LIST *derived= res->belong_to_derived; - if (derived->is_merged_derived() && !derived->derived->is_excluded()) + +TABLE_LIST* unique_table_in_select_list(THD *thd, TABLE_LIST *table, SELECT_LEX *sel) +{ + subselect_table_finder_param param= {thd, table, NULL}; + List_iterator_fast it(sel->item_list); + Item *item; + while ((item= it++)) + { + if (item->walk(&Item::subselect_table_finder_processor, FALSE, ¶m)) { - DBUG_PRINT("info", - ("convert merged to materialization to resolve the conflict")); - derived->change_refs_to_fields(); - derived->set_materialized_derived(); - goto retry; + if (param.dup == NULL) + return ERROR_TABLE; + return param.dup; } + DBUG_ASSERT(param.dup == NULL); } - DBUG_RETURN(res); + return NULL; } +typedef TABLE_LIST* (*find_table_callback)(THD *thd, TABLE_LIST *table, + TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel); + +static +TABLE_LIST* +find_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel, find_table_callback callback ); + +TABLE_LIST* unique_table_callback(THD *thd, TABLE_LIST *table, + TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel) +{ + return find_dup_table(thd, table, table_list, check_flag); +} + + +TABLE_LIST* unique_in_sel_table_callback(THD *thd, TABLE_LIST *table, + TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel) +{ + return unique_table_in_select_list(thd, table, sel); +} + /** Test that the subject table of INSERT/UPDATE/DELETE/CREATE or (in case of MyISAMMRG) one of its children are not used 
later @@ -1276,6 +1303,25 @@ unique_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, uint check_flag) { + return find_table(thd, table, table_list, check_flag, NULL, + &unique_table_callback); +} + + +TABLE_LIST* +unique_table_in_insert_returning_subselect(THD *thd, TABLE_LIST *table, SELECT_LEX *sel) +{ + return find_table(thd, table, NULL, 0, sel, + &unique_in_sel_table_callback); + +} + + +static +TABLE_LIST* +find_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, + uint check_flag, SELECT_LEX *sel, find_table_callback callback ) +{ TABLE_LIST *dup; table= table->find_table_for_update(); @@ -1306,12 +1352,12 @@ if (!tmp_parent) break; - if ((dup= find_dup_table(thd, child, child->next_global, check_flag))) + if ((dup= (*callback)(thd, child, child->next_global, check_flag, sel))) break; } } else - dup= find_dup_table(thd, table, table_list, check_flag); + dup= (*callback)(thd, table, table_list, check_flag, sel); return dup; } @@ -4561,6 +4607,7 @@ } thd->current_tablenr= 0; + sroutine_to_open= &thd->lex->sroutines_list.first; restart: /* @@ -4576,7 +4623,6 @@ has_prelocking_list= thd->lex->requires_prelocking(); table_to_open= start; - sroutine_to_open= &thd->lex->sroutines_list.first; *counter= 0; THD_STAGE_INFO(thd, stage_opening_tables); prelocking_strategy->reset(thd); @@ -4673,7 +4719,7 @@ elements from the table list (if MERGE tables are involved), */ close_tables_for_reopen(thd, start, ot_ctx.start_of_statement_svp(), - ot_ctx.remove_implicitly_used_deps()); + false); /* Here we rely on the fact that 'tables' still points to the valid @@ -4741,10 +4787,9 @@ /* F.ex. 
deadlock happened */ if (ot_ctx.can_recover_from_failed_open()) { - DBUG_ASSERT(ot_ctx.remove_implicitly_used_deps()); close_tables_for_reopen(thd, start, ot_ctx.start_of_statement_svp(), - ot_ctx.remove_implicitly_used_deps()); + true); if (ot_ctx.recover_from_failed_open()) goto error; @@ -4753,6 +4798,7 @@ goto error; error= FALSE; + sroutine_to_open= &thd->lex->sroutines_list.first; goto restart; } /* @@ -6034,19 +6080,19 @@ trying to reopen tables. NULL if no metadata locks were held and thus all metadata locks should be released. - @param[in] remove_implicit_deps True in case routines and tables implicitly + @param[in] remove_indirect True in case routines and tables implicitly used by a statement should be removed. */ void close_tables_for_reopen(THD *thd, TABLE_LIST **tables, const MDL_savepoint &start_of_statement_svp, - bool remove_implicit_deps) + bool remove_indirect) { - TABLE_LIST *first_not_own_table= thd->lex->first_not_own_table(); TABLE_LIST *tmp; - if (remove_implicit_deps) + if (remove_indirect) { + TABLE_LIST *first_not_own_table= thd->lex->first_not_own_table(); /* If table list consists only from tables from prelocking set, table list for new attempt should be empty, so we have to update list's root pointer. @@ -7412,82 +7458,83 @@ if (!found) continue; // No matching field + /* Restore field_2 to point to the field which was a match for field_1. */ + field_2= nj_col_2->field(); + /* field_1 and field_2 have the same names. Check if they are in the USING clause (if present), mark them as common fields, and add a new equi-join condition to the ON clause. */ - if (nj_col_2) - { - /* - Create non-fixed fully qualified field and let fix_fields to - resolve it. 
- */ - Item *item_1= nj_col_1->create_item(thd); - Item *item_2= nj_col_2->create_item(thd); - Item_ident *item_ident_1, *item_ident_2; - Item_func_eq *eq_cond; - if (!item_1 || !item_2) - goto err; // out of memory + /* + Create non-fixed fully qualified field and let fix_fields to + resolve it. + */ + Item *item_1= nj_col_1->create_item(thd); + Item *item_2= nj_col_2->create_item(thd); + Item_ident *item_ident_1, *item_ident_2; + Item_func_eq *eq_cond; - /* - The following assert checks that the two created items are of - type Item_ident. - */ - DBUG_ASSERT(!thd->lex->current_select->no_wrap_view_item); - /* - In the case of no_wrap_view_item == 0, the created items must be - of sub-classes of Item_ident. - */ - DBUG_ASSERT(item_1->type() == Item::FIELD_ITEM || - item_1->type() == Item::REF_ITEM); - DBUG_ASSERT(item_2->type() == Item::FIELD_ITEM || - item_2->type() == Item::REF_ITEM); + if (!item_1 || !item_2) + goto err; // out of memory - /* - We need to cast item_1,2 to Item_ident, because we need to hook name - resolution contexts specific to each item. - */ - item_ident_1= (Item_ident*) item_1; - item_ident_2= (Item_ident*) item_2; - /* - Create and hook special name resolution contexts to each item in the - new join condition . We need this to both speed-up subsequent name - resolution of these items, and to enable proper name resolution of - the items during the execute phase of PS. - */ - if (set_new_item_local_context(thd, item_ident_1, nj_col_1->table_ref) || - set_new_item_local_context(thd, item_ident_2, nj_col_2->table_ref)) - goto err; + /* + The following assert checks that the two created items are of + type Item_ident. + */ + DBUG_ASSERT(!thd->lex->current_select->no_wrap_view_item); + /* + In the case of no_wrap_view_item == 0, the created items must be + of sub-classes of Item_ident. 
+ */ + DBUG_ASSERT(item_1->type() == Item::FIELD_ITEM || + item_1->type() == Item::REF_ITEM); + DBUG_ASSERT(item_2->type() == Item::FIELD_ITEM || + item_2->type() == Item::REF_ITEM); - if (!(eq_cond= new (thd->mem_root) Item_func_eq(thd, item_ident_1, item_ident_2))) - goto err; /* Out of memory. */ + /* + We need to cast item_1,2 to Item_ident, because we need to hook name + resolution contexts specific to each item. + */ + item_ident_1= (Item_ident*) item_1; + item_ident_2= (Item_ident*) item_2; + /* + Create and hook special name resolution contexts to each item in the + new join condition . We need this to both speed-up subsequent name + resolution of these items, and to enable proper name resolution of + the items during the execute phase of PS. + */ + if (set_new_item_local_context(thd, item_ident_1, nj_col_1->table_ref) || + set_new_item_local_context(thd, item_ident_2, nj_col_2->table_ref)) + goto err; - /* - Add the new equi-join condition to the ON clause. Notice that - fix_fields() is applied to all ON conditions in setup_conds() - so we don't do it here. - */ - add_join_on(thd, (table_ref_1->outer_join & JOIN_TYPE_RIGHT ? - table_ref_1 : table_ref_2), - eq_cond); - - nj_col_1->is_common= nj_col_2->is_common= TRUE; - DBUG_PRINT ("info", ("%s.%s and %s.%s are common", - nj_col_1->safe_table_name(), - nj_col_1->name()->str, - nj_col_2->safe_table_name(), - nj_col_2->name()->str)); - - if (field_1) - update_field_dependencies(thd, field_1, field_1->table); - if (field_2) - update_field_dependencies(thd, field_2, field_2->table); + if (!(eq_cond= new (thd->mem_root) Item_func_eq(thd, item_ident_1, item_ident_2))) + goto err; /* Out of memory. */ - if (using_fields != NULL) - ++(*found_using_fields); - } + /* + Add the new equi-join condition to the ON clause. Notice that + fix_fields() is applied to all ON conditions in setup_conds() + so we don't do it here. + */ + add_join_on(thd, (table_ref_1->outer_join & JOIN_TYPE_RIGHT ? 
+ table_ref_1 : table_ref_2), + eq_cond); + + nj_col_1->is_common= nj_col_2->is_common= TRUE; + DBUG_PRINT ("info", ("%s.%s and %s.%s are common", + nj_col_1->safe_table_name(), + nj_col_1->name()->str, + nj_col_2->safe_table_name(), + nj_col_2->name()->str)); + + if (field_1) + update_field_dependencies(thd, field_1, field_1->table); + if (field_2) + update_field_dependencies(thd, field_2, field_2->table); + + if (using_fields != NULL) + ++(*found_using_fields); } if (leaf_1) leaf_1->is_join_columns_complete= TRUE; @@ -8392,7 +8439,7 @@ if (table_list->belong_to_view && !table_list->view && check_single_table_access(thd, access, table_list, FALSE)) { - tables->hide_view_error(thd); + tables->replace_view_error_with_generic(thd); DBUG_RETURN(TRUE); } access= want_access; @@ -8897,14 +8944,15 @@ } -static void unwind_stored_field_offsets(const List &fields, Field *end) +static void unwind_stored_field_offsets(const List &fields, Item_field *end) { - for (Item &item_field: fields) + for (Item &item: fields) { - Field *f= item_field.field_for_view_update()->field; - if (f == end) + Item_field *item_field= item.field_for_view_update(); + if (item_field == end) break; + Field *f= item_field->field; if (f->stored_in_db()) { TABLE *table= f->table; @@ -8948,7 +8996,7 @@ { List_iterator_fast f(fields),v(values); Item *value, *fld; - Item_field *field; + Item_field *field= NULL; Field *rfield; TABLE *table; bool only_unvers_fields= update && table_arg->versioned(); @@ -8966,11 +9014,8 @@ while ((fld= f++)) { - if (!(field= fld->field_for_view_update())) - { - my_error(ER_NONUPDATEABLE_COLUMN, MYF(0), fld->name.str); - goto err_unwind_fields; - } + field= fld->field_for_view_update(); + DBUG_ASSERT(field); // ensured by check_fields or check_view_insertability. 
value=v++; DBUG_ASSERT(value); rfield= field->field; @@ -9038,7 +9083,7 @@ DBUG_RETURN(thd->is_error()); err_unwind_fields: if (update && thd->variables.sql_mode & MODE_SIMULTANEOUS_ASSIGNMENT) - unwind_stored_field_offsets(fields, rfield); + unwind_stored_field_offsets(fields, field); err: DBUG_PRINT("error",("got error")); thd->abort_on_warning= save_abort_on_warning; @@ -9407,9 +9452,11 @@ memcpy(path_copy, path, path_len - ext_len); path_copy[path_len - ext_len]= 0; init_tmp_table_share(thd, &share, "", 0, "", path_copy); - handlerton *ht= share.db_type(); if (!open_table_def(thd, &share)) - ht->drop_table(share.db_type(), path_copy); + { + handlerton *ht= share.db_type(); + ht->drop_table(ht, path_copy); + } free_table_share(&share); } /* diff -Nru mariadb-10.11.11/sql/sql_base.h mariadb-10.11.13/sql/sql_base.h --- mariadb-10.11.11/sql/sql_base.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_base.h 2025-05-19 16:14:25.000000000 +0000 @@ -157,7 +157,7 @@ my_bool mysql_rm_tmp_tables(void); void close_tables_for_reopen(THD *thd, TABLE_LIST **tables, const MDL_savepoint &start_of_statement_svp, - bool remove_implicit_dependencies); + bool remove_indirect); bool table_already_fk_prelocked(TABLE_LIST *tl, LEX_CSTRING *db, LEX_CSTRING *table, thr_lock_type lock_type); TABLE_LIST *find_table_in_list(TABLE_LIST *table, @@ -296,6 +296,8 @@ bool lock_tables(THD *thd, TABLE_LIST *tables, uint counter, uint flags); int decide_logging_format(THD *thd, TABLE_LIST *tables); void close_thread_table(THD *thd, TABLE **table_ptr); +TABLE_LIST* +unique_table_in_insert_returning_subselect(THD *thd, TABLE_LIST *table, SELECT_LEX *sel); TABLE_LIST *unique_table(THD *thd, TABLE_LIST *table, TABLE_LIST *table_list, uint check_flag); bool is_equal(const LEX_CSTRING *a, const LEX_CSTRING *b); @@ -568,23 +570,6 @@ return m_timeout; } - /** - Return true in case tables and routines the statement implicilty - dependent on should be removed, else return false. 
- - @note The use case when routines and tables the statement implicitly - dependent on shouldn't be removed is the one when a new partition be - created on handling the INSERT statement against a versioning partitioned - table. For this case re-opening a versioning table would result in adding - implicitly dependent routines (e.g. table's triggers) that lead to - allocation of memory on PS mem_root and so leaking a memory until the PS - statement be deallocated. - */ - bool remove_implicitly_used_deps() const - { - return m_action != OT_ADD_HISTORY_PARTITION; - } - uint get_flags() const { return m_flags; } /** diff -Nru mariadb-10.11.11/sql/sql_cache.cc mariadb-10.11.13/sql/sql_cache.cc --- mariadb-10.11.11/sql/sql_cache.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_cache.cc 2025-05-19 16:14:25.000000000 +0000 @@ -3553,6 +3553,7 @@ if (table_block == 0) { DBUG_PRINT("qcache", ("Can't write table name to cache")); + node->parent= NULL; DBUG_RETURN(0); } Query_cache_table *header= table_block->table(); @@ -3576,6 +3577,7 @@ DBUG_PRINT("qcache", ("Can't insert table to hash")); // write_block_data return locked block free_memory_block(table_block); + node->parent= NULL; DBUG_RETURN(0); } char *db= header->db(); diff -Nru mariadb-10.11.11/sql/sql_class.cc mariadb-10.11.13/sql/sql_class.cc --- mariadb-10.11.11/sql/sql_class.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_class.cc 2025-05-19 16:14:25.000000000 +0000 @@ -8381,6 +8381,24 @@ } +void +wait_for_commit::prior_commit_error(THD *thd) +{ + /* + Only raise a "prior commit failed" error if we didn't already raise + an error. + + The ER_PRIOR_COMMIT_FAILED is just an internal mechanism to ensure that a + transaction does not commit successfully if a prior commit failed, so that + the parallel replication worker threads stop in an orderly fashion when + one of them get an error. 
Thus, if another worker already got another real + error, overriding it with ER_PRIOR_COMMIT_FAILED is not useful. + */ + if (!thd->get_stmt_da()->is_set()) + my_error(ER_PRIOR_COMMIT_FAILED, MYF(0)); +} + + /* Wakeup anyone waiting for us to have committed. diff -Nru mariadb-10.11.11/sql/sql_class.h mariadb-10.11.13/sql/sql_class.h --- mariadb-10.11.11/sql/sql_class.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_class.h 2025-05-19 16:14:25.000000000 +0000 @@ -2383,8 +2383,8 @@ return wait_for_prior_commit2(thd, allow_kill); else { - if (wakeup_error) - my_error(ER_PRIOR_COMMIT_FAILED, MYF(0)); + if (unlikely(wakeup_error)) + prior_commit_error(thd); return wakeup_error; } } @@ -2435,6 +2435,7 @@ void wakeup(int wakeup_error); int wait_for_prior_commit2(THD *thd, bool allow_kill); + void prior_commit_error(THD *thd); void wakeup_subsequent_commits2(int wakeup_error); void unregister_wait_for_prior_commit2(); diff -Nru mariadb-10.11.11/sql/sql_cmd.h mariadb-10.11.13/sql/sql_cmd.h --- mariadb-10.11.11/sql/sql_cmd.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_cmd.h 2025-05-19 16:14:25.000000000 +0000 @@ -141,6 +141,7 @@ handlerton **ha, bool tmp_table); bool is_set() { return m_storage_engine_name.str != NULL; } + const LEX_CSTRING *name() const { return &m_storage_engine_name; } }; diff -Nru mariadb-10.11.11/sql/sql_db.cc mariadb-10.11.13/sql/sql_db.cc --- mariadb-10.11.11/sql/sql_db.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_db.cc 2025-05-19 16:14:25.000000000 +0000 @@ -536,36 +536,53 @@ DESCRIPTION + create->default_table_charset is guaranteed to be alway set + Required by some callers + RETURN VALUES 0 File found - 1 No database file or could not open it - + -1 No database file (file was not found or 'empty' file was cached) + 1 Could not open it */ -bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create) +int load_db_opt(THD *thd, const char *path, Schema_specification_st 
*create) { File file; char buf[256+DATABASE_COMMENT_MAXLEN]; DBUG_ENTER("load_db_opt"); - bool error=1; + int error= 0; size_t nbytes; myf utf8_flag= thd->get_utf8_flag(); bzero((char*) create,sizeof(*create)); - create->default_table_charset= thd->variables.collation_server; /* Check if options for this database are already in the hash */ if (!get_dbopt(thd, path, create)) - DBUG_RETURN(0); + { + if (!create->default_table_charset) + error= -1; // db.opt did not exists + goto err1; + } /* Otherwise, load options from the .opt file */ if ((file= mysql_file_open(key_file_dbopt, path, O_RDONLY | O_SHARE, MYF(0))) < 0) + { + /* + Create an empty entry, to avoid doing an extra file open for every create + table. + */ + put_dbopt(path, create); + error= -1; goto err1; + } IO_CACHE cache; if (init_io_cache(&cache, file, IO_SIZE, READ_CACHE, 0, 0, MYF(0))) - goto err2; + { + error= 1; + goto err2; // Not cached + } while ((int) (nbytes= my_b_gets(&cache, (char*) buf, sizeof(buf))) > 0) { @@ -586,7 +603,7 @@ default-collation commands. */ if (!(create->default_table_charset= - get_charset_by_csname(pos+1, MY_CS_PRIMARY, MYF(utf8_flag))) && + get_charset_by_csname(pos+1, MY_CS_PRIMARY, MYF(utf8_flag))) && !(create->default_table_charset= get_charset_by_name(pos+1, MYF(utf8_flag)))) { @@ -621,10 +638,11 @@ err2: mysql_file_close(file, MYF(0)); err1: + if (!create->default_table_charset) // In case of error + create->default_table_charset= thd->variables.collation_server; DBUG_RETURN(error); } - /* Retrieve database options by name. Load database options file or fetch from cache. @@ -651,11 +669,12 @@ db_create_info right after that. RETURN VALUES (read NOTE!) 
- FALSE Success - TRUE Failed to retrieve options + 0 File found + -1 No database file (file was not found or 'empty' file was cached) + 1 Could not open it */ -bool load_db_opt_by_name(THD *thd, const char *db_name, +int load_db_opt_by_name(THD *thd, const char *db_name, Schema_specification_st *db_create_info) { char db_opt_path[FN_REFLEN + 1]; @@ -1951,8 +1970,7 @@ build_table_filename(path, sizeof(path)-1, old_db->str, "", MY_DB_OPT_FILE, 0); - if ((load_db_opt(thd, path, &create_info))) - create_info.default_table_charset= thd->variables.collation_server; + load_db_opt(thd, path, &create_info); length= build_table_filename(path, sizeof(path)-1, old_db->str, "", "", 0); if (length && path[length-1] == FN_LIBCHAR) diff -Nru mariadb-10.11.11/sql/sql_db.h mariadb-10.11.13/sql/sql_db.h --- mariadb-10.11.11/sql/sql_db.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_db.h 2025-05-19 16:14:25.000000000 +0000 @@ -37,8 +37,8 @@ bool my_dboptions_cache_init(void); void my_dboptions_cache_free(void); bool check_db_dir_existence(const char *db_name); -bool load_db_opt(THD *thd, const char *path, Schema_specification_st *create); -bool load_db_opt_by_name(THD *thd, const char *db_name, +int load_db_opt(THD *thd, const char *path, Schema_specification_st *create); +int load_db_opt_by_name(THD *thd, const char *db_name, Schema_specification_st *db_create_info); CHARSET_INFO *get_default_db_collation(THD *thd, const char *db_name); bool my_dbopt_init(void); diff -Nru mariadb-10.11.11/sql/sql_error.cc mariadb-10.11.13/sql/sql_error.cc --- mariadb-10.11.11/sql/sql_error.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_error.cc 2025-05-19 16:14:25.000000000 +0000 @@ -318,18 +318,16 @@ #endif get_warning_info()->clear_error_condition(); set_is_sent(false); - /** Tiny reset in debug mode to see garbage right away */ - if (!is_bulk_op()) - /* - For BULK DML operations (e.g. UPDATE) the data member m_status - has the value DA_OK_BULK. 
Keep this value in order to handle - m_affected_rows, m_statement_warn_count in correct way. Else, - the number of rows and the number of warnings affected by - the last statement executed as part of a trigger fired by the dml - (e.g. UPDATE statement fires a trigger on AFTER UPDATE) would counts - rows modified by trigger's statement. - */ - m_status= DA_EMPTY; + /* + For BULK DML operations (e.g. UPDATE) the data member m_status + has the value DA_OK_BULK. Keep this value in order to handle + m_affected_rows, m_statement_warn_count in correct way. Else, + the number of rows and the number of warnings affected by + the last statement executed as part of a trigger fired by the dml + (e.g. UPDATE statement fires a trigger on AFTER UPDATE) would counts + rows modified by trigger's statement. + */ + m_status= is_bulk_op() ? DA_OK_BULK : DA_EMPTY; DBUG_VOID_RETURN; } diff -Nru mariadb-10.11.11/sql/sql_insert.cc mariadb-10.11.13/sql/sql_insert.cc --- mariadb-10.11.11/sql/sql_insert.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_insert.cc 2025-05-19 16:14:25.000000000 +0000 @@ -57,6 +57,7 @@ */ #include "mariadb.h" /* NO_EMBEDDED_ACCESS_CHECKS */ +#include "sql_list.h" #include "sql_priv.h" #include "sql_insert.h" #include "sql_update.h" // compare_record @@ -728,6 +729,8 @@ Name_resolution_context_state ctx_state; SELECT_LEX *returning= thd->lex->has_returning() ? 
thd->lex->returning() : 0; unsigned char *readbuff= NULL; + List insert_values_cache; + bool cache_insert_values= FALSE; #ifndef EMBEDDED_LIBRARY char *query= thd->query(); @@ -785,7 +788,7 @@ if ((res= mysql_prepare_insert(thd, table_list, fields, values, update_fields, update_values, duplic, ignore, - &unused_conds, FALSE))) + &unused_conds, FALSE, &cache_insert_values))) { retval= thd->is_error(); if (res < 0) @@ -1033,8 +1036,41 @@ if (returning) fix_rownum_pointers(thd, thd->lex->returning(), &info.accepted_rows); + if (cache_insert_values) + { + insert_values_cache.empty(); + while ((values= its++)) + { + List *caches= new (thd->mem_root) List_item; + List_iterator_fast iv(*values); + Item *item; + if (caches == 0) + { + error= 1; + goto values_loop_end; + } + caches->empty(); + while((item= iv++)) + { + Item_cache *cache= item->get_cache(thd); + if (!cache) + { + error= 1; + goto values_loop_end; + } + cache->setup(thd, item); + caches->push_back(cache); + } + insert_values_cache.push_back(caches); + } + its.rewind(); + } + do { + List_iterator_fast itc(insert_values_cache); + List_iterator_fast *itr; + DBUG_PRINT("info", ("iteration %llu", iteration)); if (iteration && bulk_parameters_set(thd)) { @@ -1042,7 +1078,24 @@ goto values_loop_end; } - while ((values= its++)) + if (cache_insert_values) + { + List_item *caches; + while ((caches= itc++)) + { + List_iterator_fast ic(*caches); + Item_cache *cache; + while((cache= (Item_cache*) ic++)) + { + cache->cache_value(); + } + } + itc.rewind(); + itr= &itc; + } + else + itr= &its; + while ((values= (*itr)++)) { thd->get_stmt_da()->inc_current_row_for_warning(); if (fields.elements || !value_count) @@ -1146,7 +1199,7 @@ break; info.accepted_rows++; } - its.rewind(); + itr->rewind(); iteration++; } while (bulk_parameters_iterations(thd)); @@ -1657,6 +1710,7 @@ table_list Global/local table list where Where clause (for insert ... select) select_insert TRUE if INSERT ... 
SELECT statement + cache_insert_values insert's VALUES(...) has to be pre-computed TODO (in far future) In cases of: @@ -1679,7 +1733,7 @@ List &update_fields, List &update_values, enum_duplicates duplic, bool ignore, COND **where, - bool select_insert) + bool select_insert, bool * const cache_insert_values) { SELECT_LEX *select_lex= thd->lex->first_select_lex(); Name_resolution_context *context= &select_lex->context; @@ -1783,6 +1837,15 @@ thd->vers_insert_history(row_start); // check privileges } + /* + Check if we read from the same table we're inserting into. + Queries like INSERT INTO t1 VALUES ((SELECT ... FROM t1...)) have + to pre-compute the VALUES part. + Reading from the same table in the RETURNING clause is not allowed. + + INSERT...SELECT detects this case in select_insert::prepare and also + uses buffering to handle it correcly. + */ if (!select_insert) { Item *fake_conds= 0; @@ -1790,10 +1853,30 @@ if ((duplicate= unique_table(thd, table_list, table_list->next_global, CHECK_DUP_ALLOW_DIFFERENT_ALIAS))) { - update_non_unique_table_error(table_list, "INSERT", duplicate); - DBUG_RETURN(1); + /* + This is INSERT INTO ... VALUES (...) and it must pre-compute the + values to be inserted. 
+ */ + (*cache_insert_values)= true; } + else + (*cache_insert_values)= false; + select_lex->fix_prepare_information(thd, &fake_conds, &fake_conds); + + if ((*cache_insert_values) && thd->lex->has_returning()) + { + // Check if the table we're inserting into is also in RETURNING clause + TABLE_LIST *dup= + unique_table_in_insert_returning_subselect(thd, table_list, + thd->lex->returning()); + if (dup) + { + if (dup != ERROR_TABLE) + update_non_unique_table_error(table_list, "INSERT", duplicate); + DBUG_RETURN(1); + } + } } /* Only call prepare_for_posistion() if we are not performing a DELAYED @@ -3930,6 +4013,7 @@ int res; LEX *lex= thd->lex; SELECT_LEX *select_lex= lex->first_select_lex(); + bool cache_insert_values= false; DBUG_ENTER("mysql_insert_select_prepare"); /* @@ -3940,7 +4024,7 @@ if ((res= mysql_prepare_insert(thd, lex->query_tables, lex->field_list, 0, lex->update_list, lex->value_list, lex->duplicates, lex->ignore, - &select_lex->where, TRUE))) + &select_lex->where, TRUE, &cache_insert_values))) DBUG_RETURN(res); /* @@ -4227,6 +4311,7 @@ int select_insert::prepare2(JOIN *) { DBUG_ENTER("select_insert::prepare2"); + switch_to_nullable_trigger_fields(*fields, table); if (table->validate_default_values_of_unset_fields(thd)) DBUG_RETURN(1); if (thd->lex->describe) @@ -4348,7 +4433,11 @@ bool select_insert::prepare_eof() { int error; - bool const trans_table= table->file->has_transactions_and_rollback(); + // make sure any ROW format pending event is logged in the same binlog cache + bool const trans_table= (thd->is_current_stmt_binlog_format_row() && + table->file->row_logging) ? 
+ table->file->row_logging_has_trans : + table->file->has_transactions_and_rollback(); bool changed; bool binary_logged= 0; killed_state killed_status= thd->killed; @@ -4527,7 +4616,7 @@ table->file->ha_rnd_end(); table->file->extra(HA_EXTRA_NO_IGNORE_DUP_KEY); table->file->extra(HA_EXTRA_WRITE_CANNOT_REPLACE); - + table->file->extra(HA_EXTRA_ABORT_ALTER_COPY); /* If at least one row has been inserted/modified and will stay in the table (the table doesn't have transactions) we must write to @@ -4573,7 +4662,8 @@ query_cache_invalidate3(thd, table, 1); } DBUG_ASSERT(transactional_table || !changed || - thd->transaction->stmt.modified_non_trans_table); + (thd->transaction->stmt.modified_non_trans_table || + thd->transaction->all.modified_non_trans_table)); table->s->table_creation_was_logged|= binary_logged; table->file->ha_release_auto_increment(); @@ -5266,9 +5356,14 @@ /* Remember xid's for the case of row based logging */ ddl_log_update_xid(&ddl_log_state_create, thd->binlog_xid); ddl_log_update_xid(&ddl_log_state_rm, thd->binlog_xid); - trans_commit_stmt(thd); - if (!(thd->variables.option_bits & OPTION_GTID_BEGIN)) - trans_commit_implicit(thd); + if (trans_commit_stmt(thd) || + (!(thd->variables.option_bits & OPTION_GTID_BEGIN) && + trans_commit_implicit(thd))) + { + abort_result_set(); + DBUG_RETURN(true); + } + thd->binlog_xid= 0; #ifdef WITH_WSREP @@ -5388,7 +5483,13 @@ /* possible error of writing binary log is ignored deliberately */ (void) thd->binlog_flush_pending_rows_event(TRUE, TRUE); + /* + In the error case, we remove any partially created table. So clear any + incident event generates due to cache error, as it no longer relevant. 
+ */ + binlog_clear_incident(thd); + bool drop_table_was_logged= false; if (table) { bool tmp_table= table->s->tmp_table; @@ -5435,6 +5536,7 @@ create_info->db_type == partition_hton, &create_info->tabledef_version, tmp_table); + drop_table_was_logged= true; debug_crash_here("ddl_log_create_after_binlog"); thd->binlog_xid= 0; } @@ -5459,8 +5561,21 @@ if (create_info->table_was_deleted) { - /* Unlock locked table that was dropped by CREATE. */ - (void) trans_rollback_stmt(thd); + if (drop_table_was_logged) + { + /* for DROP binlogging the error status has to be canceled first */ + Diagnostics_area new_stmt_da(thd->query_id, false, true); + Diagnostics_area *old_stmt_da= thd->get_stmt_da(); + + thd->set_stmt_da(&new_stmt_da); + (void) trans_rollback_stmt(thd); + thd->set_stmt_da(old_stmt_da); + } + else + { + /* Unlock locked table that was dropped by CREATE. */ + (void) trans_rollback_stmt(thd); + } thd->locked_tables_list.unlock_locked_table(thd, create_info->mdl_ticket); } diff -Nru mariadb-10.11.11/sql/sql_insert.h mariadb-10.11.13/sql/sql_insert.h --- mariadb-10.11.11/sql/sql_insert.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_insert.h 2025-05-19 16:14:25.000000000 +0000 @@ -28,7 +28,7 @@ List &update_fields, List &update_values, enum_duplicates duplic, bool ignore, - COND **where, bool select_insert); + COND **where, bool select_insert, bool * const cache_results); bool mysql_insert(THD *thd,TABLE_LIST *table,List &fields, List &values, List &update_fields, List &update_values, enum_duplicates flag, diff -Nru mariadb-10.11.11/sql/sql_lex.cc mariadb-10.11.13/sql/sql_lex.cc --- mariadb-10.11.11/sql/sql_lex.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_lex.cc 2025-05-19 16:14:25.000000000 +0000 @@ -11179,7 +11179,8 @@ Field_pair *get_corresponding_field_pair(Item *item, List pair_list) { - DBUG_ASSERT(item->type() == Item::FIELD_ITEM || + DBUG_ASSERT(item->type() == Item::DEFAULT_VALUE_ITEM || + item->type() == 
Item::FIELD_ITEM || (item->type() == Item::REF_ITEM && ((((Item_ref *) item)->ref_type() == Item_ref::VIEW_REF) || (((Item_ref *) item)->ref_type() == Item_ref::REF)))); @@ -12244,6 +12245,48 @@ false; } +/** + Find the real table in prepared SELECT tree + + NOTE: all SELECT must be prepared (to have leaf table list). + + NOTE: it looks only for real tables (not view or derived) + + @param thd the current thread handle + @param db_name name of db of the table to look for + @param db_name name of db of the table to look for + + @return first found table, NULL or ERROR_TABLE +*/ + +TABLE_LIST *SELECT_LEX::find_table(THD *thd, + const LEX_CSTRING *db_name, + const LEX_CSTRING *table_name) +{ + uchar buff[STACK_BUFF_ALLOC]; // Max argument in function + if (check_stack_overrun(thd, STACK_MIN_SIZE, buff)) + return NULL; + + List_iterator_fast ti(leaf_tables); + TABLE_LIST *table; + while ((table= ti++)) + { + if (cmp(&table->db, db_name) == 0 && + cmp(&table->table_name, table_name) == 0) + return table; + } + + for (SELECT_LEX_UNIT *u= first_inner_unit(); u; u= u->next_unit()) + { + for (st_select_lex *sl= u->first_select(); sl; sl=sl->next_select()) + { + if ((table= sl->find_table(thd, db_name, table_name))) + return table; + } + } + return NULL; +} + bool st_select_lex::is_query_topmost(THD *thd) { diff -Nru mariadb-10.11.11/sql/sql_lex.h mariadb-10.11.13/sql/sql_lex.h --- mariadb-10.11.11/sql/sql_lex.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_lex.h 2025-05-19 16:14:25.000000000 +0000 @@ -1690,6 +1690,10 @@ void lex_start(LEX *plex); bool is_unit_nest() { return (nest_flags & UNIT_NEST_FL); } void mark_as_unit_nest() { nest_flags= UNIT_NEST_FL; } + + TABLE_LIST *find_table(THD *thd, + const LEX_CSTRING *db_name, + const LEX_CSTRING *table_name); }; typedef class st_select_lex SELECT_LEX; @@ -4681,7 +4685,7 @@ int add_period(Lex_ident name, Lex_ident_sys_st start, Lex_ident_sys_st end) { - if (check_period_name(name.str)) { + if 
(check_column_name(name)) { my_error(ER_WRONG_COLUMN_NAME, MYF(0), name.str); return 1; } diff -Nru mariadb-10.11.11/sql/sql_parse.cc mariadb-10.11.13/sql/sql_parse.cc --- mariadb-10.11.11/sql/sql_parse.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_parse.cc 2025-05-19 16:14:25.000000000 +0000 @@ -7296,18 +7296,9 @@ DBUG_PRINT("info", ("derived: %d view: %d", table_ref->derived != 0, table_ref->view != 0)); - if (table_ref->is_anonymous_derived_table()) + if (table_ref->is_anonymous_derived_table() || table_ref->sequence) continue; - if (table_ref->sequence) - { - /* We want to have either SELECT or INSERT rights to sequences depending - on how they are accessed - */ - want_access= ((table_ref->lock_type >= TL_FIRST_WRITE) ? - INSERT_ACL : SELECT_ACL); - } - if (check_access(thd, want_access, table_ref->get_db_name().str, &table_ref->grant.privilege, &table_ref->grant.m_internal, @@ -10389,7 +10380,13 @@ bool check_ident_length(const LEX_CSTRING *ident) { - if (check_string_char_length(ident, 0, NAME_CHAR_LEN, system_charset_info, 1)) + /* + string_char_length desite the names, goes into Well_formed_prefix_status + so this is more than just a length comparison. Things like a primary key + doesn't have a name, therefore no length. Also the ident grammar allows + empty backtick. Check quickly the length, and if 0, accept that. 
+ */ + if (ident->length && check_string_char_length(ident, 0, NAME_CHAR_LEN, system_charset_info, 1)) { my_error(ER_TOO_LONG_IDENT, MYF(0), ident->str); return 1; diff -Nru mariadb-10.11.11/sql/sql_prepare.cc mariadb-10.11.13/sql/sql_prepare.cc --- mariadb-10.11.11/sql/sql_prepare.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_prepare.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1304,6 +1304,7 @@ THD *thd= stmt->thd; List_iterator_fast its(values_list); List_item *values; + bool cache_results= FALSE; DBUG_ENTER("mysql_test_insert_common"); if (insert_precheck(thd, table_list)) @@ -1336,7 +1337,8 @@ if (mysql_prepare_insert(thd, table_list, fields, values, update_fields, update_values, duplic, ignore, - &unused_conds, FALSE)) + &unused_conds, FALSE, + &cache_results)) goto error; value_count= values->elements; diff -Nru mariadb-10.11.11/sql/sql_priv.h mariadb-10.11.13/sql/sql_priv.h --- mariadb-10.11.11/sql/sql_priv.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_priv.h 2025-05-19 16:14:25.000000000 +0000 @@ -281,6 +281,7 @@ #define OPTIMIZER_FIX_INNODB_CARDINALITY (8) #define OPTIMIZER_ADJ_FIX_REUSE_RANGE_FOR_REF (16) #define OPTIMIZER_ADJ_FIX_CARD_MULT (32) +#define OPTIMIZER_ADJ_FIX_DERIVED_TABLE_READ_COST (64) #define OPTIMIZER_ADJ_DEFAULT (OPTIMIZER_ADJ_FIX_REUSE_RANGE_FOR_REF | \ OPTIMIZER_ADJ_FIX_CARD_MULT) diff -Nru mariadb-10.11.11/sql/sql_reload.cc mariadb-10.11.13/sql/sql_reload.cc --- mariadb-10.11.11/sql/sql_reload.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_reload.cc 2025-05-19 16:14:25.000000000 +0000 @@ -618,7 +618,7 @@ if (table_list->belong_to_view && check_single_table_access(thd, PRIV_LOCK_TABLES, table_list, FALSE)) { - table_list->hide_view_error(thd); + table_list->replace_view_error_with_generic(thd); goto error_reset_bits; } if (table_list->is_view_or_derived()) diff -Nru mariadb-10.11.11/sql/sql_select.cc mariadb-10.11.13/sql/sql_select.cc --- mariadb-10.11.11/sql/sql_select.cc 
2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_select.cc 2025-05-19 16:14:25.000000000 +0000 @@ -3581,7 +3581,14 @@ continue; Item *item= new (thd->mem_root) Item_temptable_rowid(tab->table); item->fix_fields(thd, 0); - table_fields->push_back(item, thd->mem_root); + /* + table_fields points to JOIN::all_fields or JOIN::tmp_all_fields_*. + These lists start with "added" fields and then their suffix is shared + with JOIN::fields_list or JOIN::tmp_fields_list*. + Because of that, new elements can only be added to the front of the list, + not to the back. + */ + table_fields->push_front(item, thd->mem_root); cur->tmp_table_param->func_count++; } return 0; @@ -5994,7 +6001,10 @@ s->table->opt_range_condition_rows=s->records; } else + { + /* Update s->records and s->read_time */ s->scan_time(); + } if (s->table->is_splittable()) s->add_keyuses_for_splitting(); @@ -14049,6 +14059,36 @@ } +/* + Procedure of keys generation for result tables of materialized derived + tables/views. + + A key is generated for each equi-join pair {derived_table, some_other_table}. + Each generated key consists of fields of derived table used in equi-join. + Example: + + SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN + t1 ON tt.f1=t1.f3 and tt.f2.=t1.f4; + In this case for the derived table tt one key will be generated. It will + consist of two parts f1 and f2. + Example: + + SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN + t1 ON tt.f1=t1.f3 JOIN + t2 ON tt.f2=t2.f4; + In this case for the derived table tt two keys will be generated. + One key over f1 field, and another key over f2 field. + Currently optimizer may choose to use only one such key, thus the second + one will be dropped after range optimizer is finished. + See also JOIN::drop_unused_derived_keys function. + Example: + + SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN + t1 ON tt.f1=a_function(t1.f3); + In this case for the derived table tt one key will be generated. 
It will + consist of one field - f1. +*/ + static bool generate_derived_keys(DYNAMIC_ARRAY *keyuse_array) { @@ -14759,7 +14799,7 @@ } goto no_join_cache; } - if (cache_level > 4 && no_bka_cache) + if (cache_level < 5 || no_bka_cache) goto no_join_cache; if ((flags & HA_MRR_NO_ASSOCIATION) && @@ -15461,6 +15501,7 @@ double JOIN_TAB::scan_time() { double res; + THD *thd= join->thd; if (table->is_created()) { if (table->is_filled_at_execution()) @@ -15481,10 +15522,53 @@ } res= read_time; } - else + else if (!(thd->variables.optimizer_adjust_secondary_key_costs & + OPTIMIZER_ADJ_FIX_DERIVED_TABLE_READ_COST)) { + /* + Old code, do not merge into 11.0+: + */ found_records= records=table->stat_records(); - read_time= found_records ? (double)found_records: 10.0;// TODO:fix this stub + read_time= found_records ? (double)found_records: 10.0; + res= read_time; + } + else + { + bool using_heap= 0; + TABLE_SHARE *share= table->s; + found_records= records= table->stat_records(); + + if (share->db_type() == heap_hton) + { + /* Check that the rows will fit into the heap table */ + ha_rows max_rows; + max_rows= (ha_rows) ((MY_MIN(thd->variables.tmp_memory_table_size, + thd->variables.max_heap_table_size)) / + MY_ALIGN(share->reclength, sizeof(char*))); + if (records <= max_rows) + { + /* The rows will fit into the heap table */ + using_heap= 1; + } + } + + /* + Code for the following is taken from the heap and aria storage engine. 
+ In 11.# this is done without explict engine code + */ + if (using_heap) + read_time= (records / 20.0) + 1; + else + { + handler *file= table->file; + file->stats.data_file_length= share->reclength * records; + /* + Call the default scan_time() method as this is the cost for the + scan when heap is converted to Aria + */ + read_time= file->handler::scan_time(); + file->stats.data_file_length= 0; + } res= read_time; } return res; @@ -18544,6 +18628,8 @@ prev_table->dep_tables|= used_tables; if (prev_table->on_expr) { + /* If the ON expression is still there, it's an outer join */ + DBUG_ASSERT(prev_table->outer_join); prev_table->dep_tables|= table->on_expr_dep_tables; table_map prev_used_tables= prev_table->nested_join ? prev_table->nested_join->used_tables : @@ -18558,11 +18644,59 @@ prevents update of inner table dependences. For example it might happen if RAND() function is used in JOIN ON clause. - */ - if (!((prev_table->on_expr->used_tables() & - ~(OUTER_REF_TABLE_BIT | RAND_TABLE_BIT)) & - ~prev_used_tables)) + */ + table_map prev_on_expr_deps= prev_table->on_expr->used_tables() & + ~(OUTER_REF_TABLE_BIT | RAND_TABLE_BIT); + prev_on_expr_deps&= ~prev_used_tables; + + if (!prev_on_expr_deps) prev_table->dep_tables|= used_tables; + else + { + /* + Another possible case is when prev_on_expr_deps!=0 but it depends + on a table outside this join nest. SQL name resolution don't allow + this but it is possible when LEFT JOIN is inside a subquery which + is converted into a semi-join nest, Example: + + t1 SEMI JOIN ( + t2 + LEFT JOIN (t3 LEFT JOIN t4 ON t4.col=t1.col) ON expr + ) ON ... + + here, we would have prev_table=t4, table=t3. The condition + "ON t4.col=t1.col" depends on tables {t1, t4}. To make sure the + optimizer puts t3 before t4 we need to make sure t4.dep_tables + includes t3. + */ + + DBUG_ASSERT(table->embedding == prev_table->embedding); + if (table->embedding) + { + /* + Find what are the "peers" of "table" in the join nest. 
Normally, + it is table->embedding->nested_join->used_tables, but here we are + in the process of recomputing that value. + So, we walk the join list and collect the bitmap of peers: + */ + table_map peers= 0; + List_iterator_fast li(*join_list); + TABLE_LIST *peer; + while ((peer= li++)) + { + table_map curmap= peer->nested_join + ? peer->nested_join->used_tables + : peer->get_map(); + peers|= curmap; + } + /* + If prev_table doesn't depend on any of its peers, add a + dependency on nearest peer, that is, on 'table'. + */ + if (!(prev_on_expr_deps & peers)) + prev_table->dep_tables|= used_tables; + } + } } } prev_table= table; @@ -22354,6 +22488,8 @@ */ clear_tables(join, &cleared_tables); } + if (join->tmp_table_param.copy_funcs.elements) + copy_fields(&join->tmp_table_param); if (!join->having || join->having->val_bool()) { List *columns_list= (procedure ? &join->procedure_fields_list : @@ -27021,9 +27157,13 @@ original field name, we should additionally check if we have conflict for this name (in case if we would perform lookup in all tables). */ - if (resolution == RESOLVED_BEHIND_ALIAS && - order_item->fix_fields_if_needed_for_order_by(thd, order->item)) - return TRUE; + if (resolution == RESOLVED_BEHIND_ALIAS) + { + if (order_item->fix_fields_if_needed_for_order_by(thd, order->item)) + return TRUE; + // fix_fields may have replaced order->item, reset local variable. + order_item= *order->item; + } /* Lookup the current GROUP field in the FROM clause. 
*/ order_item_type= order_item->type(); @@ -30489,7 +30629,7 @@ */ if (top_level || item->is_explicit_name() || - !check_column_name(item->name.str)) + !check_column_name(item->name)) item->print_item_w_name(str, query_type); else item->print(str, query_type); diff -Nru mariadb-10.11.11/sql/sql_show.cc mariadb-10.11.13/sql/sql_show.cc --- mariadb-10.11.11/sql/sql_show.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_show.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1435,7 +1435,14 @@ DBUG_RETURN(TRUE); } - load_db_opt_by_name(thd, dbname->str, &create); + if (load_db_opt_by_name(thd, dbname->str, &create) < 0) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_UNKNOWN_ERROR, + "Database '%.192s' does not have a db.opt file. " + "You can create one with ALTER DATABASE if needed", + dbname->str); + } } mysqld_show_create_db_get_fields(thd, &field_list); @@ -2943,25 +2950,27 @@ while (thread_info *thd_info= arg.thread_infos.get()) { + const char *str; + ulonglong start_time; + CSET_STRING query; + protocol->prepare_for_resend(); protocol->store(thd_info->thread_id); protocol->store(thd_info->user, strlen(thd_info->user), system_charset_info); protocol->store(thd_info->host, strlen(thd_info->host), system_charset_info); protocol->store_string_or_null(thd_info->db, system_charset_info); - if (thd_info->proc_info) - protocol->store(thd_info->proc_info, strlen(thd_info->proc_info), - system_charset_info); + if ((str= thd_info->proc_info)) + protocol->store(str, strlen(str), system_charset_info); else protocol->store(&command_name[thd_info->command], system_charset_info); - if (thd_info->start_time && now > thd_info->start_time) - protocol->store_long((now - thd_info->start_time) / HRTIME_RESOLUTION); + if ((start_time= thd_info->start_time) && now > start_time) + protocol->store_long((now - start_time) / HRTIME_RESOLUTION); else protocol->store_null(); protocol->store_string_or_null(thd_info->state_info, system_charset_info); - if 
(thd_info->query_string.length()) - protocol->store(thd_info->query_string.str(), - thd_info->query_string.length(), - thd_info->query_string.charset()); + query= thd_info->query_string; + if (query.length() && query.str()) + protocol->store(query.str(), query.length(), query.charset()); else protocol->store_null(); if (!(thd->variables.old_behavior & OLD_MODE_NO_PROGRESS_INFO)) @@ -4339,7 +4348,7 @@ break; } - if (lower_case_table_names && !rc) + if (lower_case_table_names == 1 && !rc) { /* We can safely do in-place upgrades here since all of the above cases diff -Nru mariadb-10.11.11/sql/sql_statistics.cc mariadb-10.11.13/sql/sql_statistics.cc --- mariadb-10.11.11/sql/sql_statistics.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_statistics.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2077,12 +2077,9 @@ for (i= 0, state= calc_state; i < prefixes; i++, state++) { - if (i < prefixes) - { - double val= state->prefix_count == 0 ? - 0 : (double) state->entry_count / state->prefix_count; - index_info->collected_stats->set_avg_frequency(i, val); - } + double val= state->prefix_count == 0 ? + 0 : (double) state->entry_count / state->prefix_count; + index_info->collected_stats->set_avg_frequency(i, val); } } }; @@ -3142,7 +3139,7 @@ double avg_frequency= pk_read_stats->get_avg_frequency(j-1); set_if_smaller(avg_frequency, 1); double val= (pk_read_stats->get_avg_frequency(j) / - avg_frequency); + avg_frequency > 0 ? 
avg_frequency : 1); index_statistics->set_avg_frequency (l, val); } } diff -Nru mariadb-10.11.11/sql/sql_string.h mariadb-10.11.13/sql/sql_string.h --- mariadb-10.11.11/sql/sql_string.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_string.h 2025-05-19 16:14:25.000000000 +0000 @@ -909,6 +909,8 @@ :Charset(cs), Binary_string(str, len) { } String(const String &str) = default; + String(String &&str) noexcept + :Charset(std::move(str)), Binary_string(std::move(str)){} void set(String &str,size_t offset,size_t arg_length) { diff -Nru mariadb-10.11.11/sql/sql_table.cc mariadb-10.11.13/sql/sql_table.cc --- mariadb-10.11.11/sql/sql_table.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_table.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1587,12 +1587,19 @@ else { #ifdef WITH_WSREP - if (WSREP(thd) && hton && !wsrep_should_replicate_ddl(thd, hton)) + if (WSREP(thd) && hton) { - error= 1; - goto err; + handlerton *ht= hton; + // For partitioned tables resolve underlying handlerton + if (table->table && table->table->file->partition_ht()) + ht= table->table->file->partition_ht(); + if (!wsrep_should_replicate_ddl(thd, ht)) + { + error= 1; + goto err; + } } -#endif +#endif /* WITH_WSREP */ if (thd->locked_tables_mode == LTM_LOCK_TABLES || thd->locked_tables_mode == LTM_PRELOCKED_UNDER_LOCK_TABLES) @@ -1863,18 +1870,6 @@ if (non_temp_tables_count) query_cache_invalidate3(thd, tables, 0); - /* - We are always logging drop of temporary tables. - The reason is to handle the following case: - - Use statement based replication - - CREATE TEMPORARY TABLE foo (logged) - - set row based replication - - DROP TEMPORARY TABLE foo (needs to be logged) - This should be fixed so that we remember if creation of the - temporary table was logged and only log it if the creation was - logged. 
- */ - if (non_trans_tmp_table_deleted || trans_tmp_table_deleted || non_tmp_table_deleted) { @@ -3112,7 +3107,7 @@ DBUG_ASSERT(sql_field->charset); - if (check_column_name(sql_field->field_name.str)) + if (check_column_name(sql_field->field_name)) { my_error(ER_WRONG_COLUMN_NAME, MYF(0), sql_field->field_name.str); DBUG_RETURN(TRUE); @@ -3750,7 +3745,7 @@ key_part_info++; } - if (!key_info->name.str || check_column_name(key_info->name.str)) + if (!key_info->name.str || check_column_name(key_info->name)) { my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key_info->name.str); DBUG_RETURN(TRUE); @@ -4989,9 +4984,26 @@ // In Galera cluster we support only InnoDB sequences if (db_type != DB_TYPE_INNODB) { - my_error(ER_NOT_SUPPORTED_YET, MYF(0), - "non-InnoDB sequences in Galera cluster"); - return(true); + // Currently any dynamic storage engine is not possible to identify + // using DB_TYPE_XXXX and ENGINE=SEQUENCE is one of them. + // Therefore, we get storage engine name from lex. + const LEX_CSTRING *tb_name= thd->lex->m_sql_cmd->option_storage_engine_name()->name(); + // (1) CREATE TABLE ... ENGINE=SEQUENCE OR + // (2) ALTER TABLE ... ENGINE= OR + // Note in ALTER TABLE table->s->sequence != nullptr + // (3) CREATE SEQUENCE ... 
ENGINE= + if ((thd->lex->sql_command == SQLCOM_CREATE_TABLE && + lex_string_eq(tb_name, STRING_WITH_LEN("SEQUENCE"))) || + (thd->lex->sql_command == SQLCOM_ALTER_TABLE) || + (thd->lex->sql_command == SQLCOM_CREATE_SEQUENCE)) + { + my_error(ER_NOT_SUPPORTED_YET, MYF(0), + "non-InnoDB sequences in Galera cluster"); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_NOT_SUPPORTED_YET, + "ENGINE=%s not supported by Galera", tb_name->str); + return(true); + } } // In Galera cluster it is best to use INCREMENT BY 0 with CACHE @@ -6223,7 +6235,7 @@ } else if (drop->type == Alter_drop::PERIOD) { - if (table->s->period.name.streq(drop->name)) + if (table->s->period.name.streq(Lex_ident(drop->name))) remove_drop= FALSE; } else /* Alter_drop::KEY and Alter_drop::FOREIGN_KEY */ @@ -9215,7 +9227,7 @@ for (bool found= false; !found && (drop= drop_it++); ) { found= drop->type == Alter_drop::PERIOD && - table->s->period.name.streq(drop->name); + table->s->period.name.streq(Lex_ident(drop->name)); } if (drop) @@ -9258,7 +9270,7 @@ } } - if (share->period.constr_name.streq(check->name.str)) + if (share->period.constr_name.streq(check->name)) { if (!drop_period && !keep) { @@ -10514,10 +10526,21 @@ if (WSREP(thd) && table && (thd->lex->sql_command == SQLCOM_ALTER_TABLE || thd->lex->sql_command == SQLCOM_CREATE_INDEX || - thd->lex->sql_command == SQLCOM_DROP_INDEX) && - !wsrep_should_replicate_ddl(thd, table->s->db_type())) - DBUG_RETURN(true); -#endif /* WITH_WSREP */ + thd->lex->sql_command == SQLCOM_DROP_INDEX)) + { + handlerton *ht= table->s->db_type(); + + // If alter used ENGINE= we use that + if (create_info->used_fields & HA_CREATE_USED_ENGINE) + ht= create_info->db_type; + // For partitioned tables resolve underlying handlerton + else if (table->file->partition_ht()) + ht= table->file->partition_ht(); + + if (!wsrep_should_replicate_ddl(thd, ht)) + DBUG_RETURN(true); + } +#endif DEBUG_SYNC(thd, "alter_table_after_open_tables"); @@ -11609,7 +11632,8 @@ - Neither 
old or new engine uses files from another engine The above is mainly true for the sequence and the partition engine. */ - engine_changed= ((new_table->file->ht != table->file->ht) && + engine_changed= ((new_table->file->storage_ht() != + table->file->storage_ht()) && ((!(new_table->file->ha_table_flags() & HA_FILE_BASED) || !(table->file->ha_table_flags() & HA_FILE_BASED))) && !(table->file->ha_table_flags() & HA_REUSES_FILE_NAMES) && @@ -11644,7 +11668,7 @@ debug_crash_here("ddl_log_alter_after_copy"); // Use old table /* - We are new ready to use the new table. Update the state in the + We are now ready to use the new table. Update the state in the ddl log so that we recovery know that the new table is ready and in case of crash it should use the new one and log the query to the binary log. @@ -12354,6 +12378,7 @@ if (alt_error > 0) { error= alt_error; + to->file->extra(HA_EXTRA_ABORT_ALTER_COPY); copy_data_error_ignore(error, false, to, thd, alter_ctx); } } diff -Nru mariadb-10.11.11/sql/sql_trigger.cc mariadb-10.11.13/sql/sql_trigger.cc --- mariadb-10.11.11/sql/sql_trigger.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_trigger.cc 2025-05-19 16:14:25.000000000 +0000 @@ -622,7 +622,12 @@ table= tables->table; #ifdef WITH_WSREP - if (WSREP(thd) && !wsrep_should_replicate_ddl(thd, table->s->db_type())) + /* Resolve should we replicate creation of the trigger. + It should be replicated if storage engine(s) associated + to trigger are replicated by Galera. 
+ */ + if (WSREP(thd) && + !wsrep_should_replicate_ddl_iterate(thd, tables)) goto end; #endif diff -Nru mariadb-10.11.11/sql/sql_truncate.cc mariadb-10.11.13/sql/sql_truncate.cc --- mariadb-10.11.11/sql/sql_truncate.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_truncate.cc 2025-05-19 16:14:25.000000000 +0000 @@ -303,7 +303,7 @@ bool Sql_cmd_truncate_table::lock_table(THD *thd, TABLE_LIST *table_ref, bool *hton_can_recreate) { - handlerton *hton; + const handlerton *hton; bool versioned; bool sequence= false; TABLE *table= NULL; @@ -336,8 +336,15 @@ versioned= table->versioned(); hton= table->file->ht; #ifdef WITH_WSREP + /* Resolve should we replicate truncate. It should + be replicated if storage engine(s) associated + are replicated by Galera. If this is partitioned + table we need to find out default partition + handlerton. + */ if (WSREP(thd) && - !wsrep_should_replicate_ddl(thd, hton)) + !wsrep_should_replicate_ddl(thd, table->file->partition_ht() ? + table->file->partition_ht() : hton)) DBUG_RETURN(TRUE); #endif @@ -359,12 +366,26 @@ sequence= share->table_type == TABLE_TYPE_SEQUENCE; hton= share->db_type(); #ifdef WITH_WSREP - if (WSREP(thd) && - hton != view_pseudo_hton && - !wsrep_should_replicate_ddl(thd, hton)) + if (WSREP(thd) && hton != view_pseudo_hton) { - tdc_release_share(share); - DBUG_RETURN(TRUE); + /* Resolve should we replicate truncate. It should + be replicated if storage engine(s) associated + are replicated by Galera. If this is partitioned + table we need to find out default partition + handlerton. + */ + const handlerton* const ht= +#ifdef WITH_PARTITION_STORAGE_ENGINE + share->default_part_plugin ? 
+ plugin_hton(share->default_part_plugin) : +#endif + hton; + + if (ht && !wsrep_should_replicate_ddl(thd, ht)) + { + tdc_release_share(share); + DBUG_RETURN(TRUE); + } } #endif diff -Nru mariadb-10.11.11/sql/sql_update.cc mariadb-10.11.13/sql/sql_update.cc --- mariadb-10.11.11/sql/sql_update.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_update.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1704,7 +1704,7 @@ if (multi_update_check_table_access(thd, tbl, tables_for_update, &updated)) { - tbl->hide_view_error(thd); + tbl->replace_view_error_with_generic(thd); return true; } } @@ -2356,7 +2356,8 @@ if (unlikely((thd->variables.option_bits & OPTION_SAFE_UPDATES) && error_if_full_join(join))) DBUG_RETURN(1); - if (join->implicit_grouping) + if (join->implicit_grouping || + join->select_lex->have_window_funcs()) { my_error(ER_INVALID_GROUP_FUNC_USE, MYF(0)); DBUG_RETURN(1); diff -Nru mariadb-10.11.11/sql/sql_view.cc mariadb-10.11.13/sql/sql_view.cc --- mariadb-10.11.11/sql/sql_view.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_view.cc 2025-05-19 16:14:25.000000000 +0000 @@ -183,7 +183,7 @@ for (uint column_no= 1; (item= it++); column_no++) { - if (item->is_explicit_name() || !check_column_name(item->name.str)) + if (item->is_explicit_name() || !check_column_name(item->name)) continue; name_len= my_snprintf(buff, NAME_LEN, "Name_exp_%u", column_no); item->orig_name= item->name.str; @@ -341,7 +341,7 @@ { if (check_single_table_access(thd, SELECT_ACL, tbl, FALSE)) { - tbl->hide_view_error(thd); + tbl->replace_view_error_with_generic(thd); goto err; } } @@ -452,8 +452,6 @@ lex->link_first_table_back(view, link_to_local); view->open_type= OT_BASE_ONLY; - WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); - /* ignore lock specs for CREATE statement */ @@ -471,13 +469,20 @@ } #ifdef WITH_WSREP - if(!wsrep_should_replicate_ddl_iterate(thd, static_cast(tables))) + /* Resolve should we replicate creation of the view. 
+ It should be replicated if storage engine(s) associated + to view are replicated by Galera. + */ + if (WSREP(thd) && + !wsrep_should_replicate_ddl_iterate(thd, tables)) { res= TRUE; goto err_no_relink; } #endif + WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); + view= lex->unlink_first_table(&link_to_local); if (check_db_dir_existence(view->db.str)) diff -Nru mariadb-10.11.11/sql/sql_yacc.yy mariadb-10.11.13/sql/sql_yacc.yy --- mariadb-10.11.11/sql/sql_yacc.yy 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sql_yacc.yy 2025-05-19 16:14:25.000000000 +0000 @@ -9107,7 +9107,7 @@ if ($4.str) { if (unlikely(Lex->sql_command == SQLCOM_CREATE_VIEW && - check_column_name($4.str))) + check_column_name($4))) my_yyabort_error((ER_WRONG_COLUMN_NAME, MYF(0), $4.str)); $2->base_flags|= item_base_t::IS_EXPLICIT_NAME; $2->set_name(thd, $4); diff -Nru mariadb-10.11.11/sql/structs.h mariadb-10.11.13/sql/structs.h --- mariadb-10.11.11/sql/structs.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/structs.h 2025-05-19 16:14:25.000000000 +0000 @@ -236,7 +236,7 @@ LEX_CSTRING user, host; void init() { memset(this, 0, sizeof(*this)); } void copy(MEM_ROOT *root, const LEX_CSTRING *usr, const LEX_CSTRING *host); - bool is_role() const { return user.str[0] && !host.str[0]; } + bool is_role() const { return user.str[0] && (!host.str || !host.str[0]); } void set_lex_string(LEX_CSTRING *l, char *buf) { if (is_role()) diff -Nru mariadb-10.11.11/sql/sys_vars.cc mariadb-10.11.13/sql/sys_vars.cc --- mariadb-10.11.11/sql/sys_vars.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/sys_vars.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2982,7 +2982,7 @@ { "adjust_secondary_key_cost", "disable_max_seek", "disable_forced_index_in_group_by", "fix_innodb_cardinality", "fix_reuse_range_for_ref", - "fix_card_multiplier", 0 + "fix_card_multiplier", "fix_derived_table_read_cost", 0 }; @@ -2999,8 +2999,9 @@ "secondary keys. 
" "fix_reuse_range_for_ref = Do a better job at reusing range access estimates " "when estimating ref access. " - "fix_card_multiplier = Fix the computation in selectivity_for_indexes." - " selectivity_multiplier. " + "fix_card_multiplier = Fix the computation in selectivity_for_indexes. " + "fix_derived_table_read_cost = Fix the cost of reading materialized " + "derived table. " "This variable will be deleted in MariaDB 11.0 as it is not needed with the " "new 11.0 optimizer.", @@ -6309,7 +6310,9 @@ static Sys_var_enum Sys_wsrep_forced_binlog_format( "wsrep_forced_binlog_format", "binlog format to take effect over user's choice", GLOBAL_VAR(wsrep_forced_binlog_format), CMD_LINE(REQUIRED_ARG), - wsrep_binlog_format_names, DEFAULT(BINLOG_FORMAT_UNSPEC)); + wsrep_binlog_format_names, DEFAULT(BINLOG_FORMAT_UNSPEC), + NO_MUTEX_GUARD, NOT_IN_BINLOG, + ON_CHECK(wsrep_forced_binlog_format_check)); static Sys_var_mybool Sys_wsrep_recover_datadir( "wsrep_recover", "Recover database state after crash and exit", diff -Nru mariadb-10.11.11/sql/table.cc mariadb-10.11.13/sql/table.cc --- mariadb-10.11.11/sql/table.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/table.cc 2025-05-19 16:14:25.000000000 +0000 @@ -5320,9 +5320,10 @@ } -bool check_column_name(const char *name) +bool check_column_name(const Lex_ident &ident) { // name length in symbols + const char *name= ident.str, *end= ident.str + ident.length; size_t name_length= 0; bool last_char_is_space= TRUE; @@ -5332,9 +5333,7 @@ last_char_is_space= my_isspace(system_charset_info, *name); if (system_charset_info->use_mb()) { - int len=my_ismbchar(system_charset_info, name, - name+system_charset_info->mbmaxlen); - if (len) + if (int len= my_ismbchar(system_charset_info, name, end)) { name += len; name_length++; @@ -5354,12 +5353,6 @@ } -bool check_period_name(const char *name) -{ - return check_column_name(name); -} - - /** Checks whether a table is intact. Should be done *just* after the table has been opened. 
@@ -6360,9 +6353,9 @@ @pre This method can be called only if there is an error. */ -void TABLE_LIST::hide_view_error(THD *thd) +void TABLE_LIST::replace_view_error_with_generic(THD *thd) { - if ((thd->killed && !thd->is_error())|| thd->get_internal_handler()) + if ((thd->killed && !thd->is_error()) || thd->get_internal_handler()) return; /* Hide "Unknown column" or "Unknown function" error */ DBUG_ASSERT(thd->is_error()); @@ -9956,37 +9949,6 @@ return error; } -/* - Procedure of keys generation for result tables of materialized derived - tables/views. - - A key is generated for each equi-join pair derived table-another table. - Each generated key consists of fields of derived table used in equi-join. - Example: - - SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN - t1 ON tt.f1=t1.f3 and tt.f2.=t1.f4; - In this case for the derived table tt one key will be generated. It will - consist of two parts f1 and f2. - Example: - - SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN - t1 ON tt.f1=t1.f3 JOIN - t2 ON tt.f2=t2.f4; - In this case for the derived table tt two keys will be generated. - One key over f1 field, and another key over f2 field. - Currently optimizer may choose to use only one such key, thus the second - one will be dropped after range optimizer is finished. - See also JOIN::drop_unused_derived_keys function. - Example: - - SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN - t1 ON tt.f1=a_function(t1.f3); - In this case for the derived table tt one key will be generated. It will - consist of one field - f1. 
-*/ - - /* @brief diff -Nru mariadb-10.11.11/sql/table.h mariadb-10.11.13/sql/table.h --- mariadb-10.11.11/sql/table.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/table.h 2025-05-19 16:14:25.000000000 +0000 @@ -2192,7 +2192,7 @@ void init(vers_system_time_t _type, Vers_history_point _start= Vers_history_point(), Vers_history_point _end= Vers_history_point(), - Lex_ident _name= "SYSTEM_TIME") + Lex_ident _name= { STRING_WITH_LEN("SYSTEM_TIME") }) { type= _type; orig_type= _type; @@ -2207,7 +2207,7 @@ void set_all() { type= SYSTEM_TIME_ALL; - name= "SYSTEM_TIME"; + name= { STRING_WITH_LEN("SYSTEM_TIME") }; } void print(String *str, enum_query_type query_type) const; @@ -2572,7 +2572,7 @@ List *view_tables; /* most upper view this table belongs to */ TABLE_LIST *belong_to_view; - /* A derived table this table belongs to */ + /* A merged derived table this table belongs to */ TABLE_LIST *belong_to_derived; /* The view directly referencing this table @@ -2830,7 +2830,7 @@ bool check_single_table(TABLE_LIST **table, table_map map, TABLE_LIST *view); bool set_insert_values(MEM_ROOT *mem_root); - void hide_view_error(THD *thd); + void replace_view_error_with_generic(THD *thd); TABLE_LIST *find_underlying_table(TABLE *table); TABLE_LIST *first_leaf_for_name_resolution(); TABLE_LIST *last_leaf_for_name_resolution(); @@ -3078,6 +3078,8 @@ ulonglong m_table_ref_version; }; +#define ERROR_TABLE ((TABLE_LIST*) 0x1) + class Item; /* @@ -3388,8 +3390,7 @@ int db_errno); void update_create_info_from_table(HA_CREATE_INFO *info, TABLE *form); bool check_db_name(LEX_STRING *db); -bool check_column_name(const char *name); -bool check_period_name(const char *name); +bool check_column_name(const Lex_ident &name); bool check_table_name(const char *name, size_t length, bool check_for_path_chars); int rename_file_ext(const char * from,const char * to,const char * ext); char *get_field(MEM_ROOT *mem, Field *field); diff -Nru mariadb-10.11.11/sql/vers_string.h 
mariadb-10.11.13/sql/vers_string.h --- mariadb-10.11.11/sql/vers_string.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/vers_string.h 2025-05-19 16:14:25.000000000 +0000 @@ -62,7 +62,7 @@ { } Lex_cstring_with_compare(const LEX_CSTRING src) : Lex_cstring(src.str, src.length) { } - Lex_cstring_with_compare(const char *_str) : Lex_cstring(_str, strlen(_str)) + explicit Lex_cstring_with_compare(const char *_str) : Lex_cstring(_str, strlen(_str)) { } bool streq(const Lex_cstring_with_compare& b) const { diff -Nru mariadb-10.11.11/sql/wsrep_applier.cc mariadb-10.11.13/sql/wsrep_applier.cc --- mariadb-10.11.11/sql/wsrep_applier.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_applier.cc 2025-05-19 16:14:25.000000000 +0000 @@ -203,6 +203,21 @@ } } + if (LOG_EVENT_IS_WRITE_ROW(typ) || + LOG_EVENT_IS_UPDATE_ROW(typ) || + LOG_EVENT_IS_DELETE_ROW(typ)) + { + Rows_log_event* rle = static_cast(ev); + if (thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS)) + { + rle->set_flags(Rows_log_event::RELAXED_UNIQUE_CHECKS_F); + } + if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) + { + rle->set_flags(Rows_log_event::NO_FOREIGN_KEY_CHECKS_F); + } + } + /* Use the original server id for logging. */ thd->set_server_id(ev->server_id); thd->lex->current_select= 0; diff -Nru mariadb-10.11.11/sql/wsrep_client_service.cc mariadb-10.11.13/sql/wsrep_client_service.cc --- mariadb-10.11.11/sql/wsrep_client_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_client_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -304,6 +304,12 @@ replayer_service.replay_status(ret); } + // In Galera we allow only InnoDB sequences, thus + // sequence table updates are in writeset. + // Binlog cache needs reset so that binlog_close + // does not write cache to binlog file yet. 
+ binlog_reset_cache(m_thd); + replayer_thd->main_security_ctx = old_ctx; delete replayer_thd; DBUG_RETURN(ret); diff -Nru mariadb-10.11.11/sql/wsrep_high_priority_service.cc mariadb-10.11.13/sql/wsrep_high_priority_service.cc --- mariadb-10.11.11/sql/wsrep_high_priority_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_high_priority_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -610,7 +610,7 @@ int ret= apply_events(thd, m_rli, data, err, true); thd->close_temporary_tables(); - if (!ret && !(ws_meta.flags() & wsrep::provider::flag::commit)) + if (!ret && !wsrep::commits_transaction(ws_meta.flags())) { thd->wsrep_cs().fragment_applied(ws_meta.seqno()); } @@ -778,7 +778,7 @@ } ret= ret || apply_events(thd, m_rli, data, err, true); thd->close_temporary_tables(); - if (!ret && !(ws_meta.flags() & wsrep::provider::flag::commit)) + if (!ret && !wsrep::commits_transaction(ws_meta.flags())) { thd->wsrep_cs().fragment_applied(ws_meta.seqno()); } diff -Nru mariadb-10.11.11/sql/wsrep_mysqld.cc mariadb-10.11.13/sql/wsrep_mysqld.cc --- mariadb-10.11.11/sql/wsrep_mysqld.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_mysqld.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1,5 +1,5 @@ -/* Copyright (c) 2008, 2023 Codership Oy - Copyright (c) 2020, 2022, MariaDB +/* Copyright (c) 2008, 2025, Codership Oy + Copyright (c) 2020, 2025, MariaDB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -834,7 +834,8 @@ wsrep_server_gtid_t new_gtid; new_gtid.domain_id= wsrep_gtid_domain_id; new_gtid.server_id= global_system_variables.server_id; - new_gtid.seqno= 0; + /* Use seqno which was recovered in wsrep_init_gtid() */ + new_gtid.seqno= wsrep_gtid_server.seqno(); /* Try to search for domain_id and server_id combination in binlog if found continue from last seqno */ wsrep_get_binlog_gtid_seqno(new_gtid); wsrep_gtid_server.gtid(new_gtid); @@ -867,12 
+868,13 @@ wsrep_init_position(); wsrep_sst_auth_init(); - if (strlen(wsrep_provider)== 0 || - !strcmp(wsrep_provider, WSREP_NONE)) + if (!*wsrep_provider || + !strcasecmp(wsrep_provider, WSREP_NONE)) { // enable normal operation in case no provider is specified global_system_variables.wsrep_on= 0; - int err= Wsrep_server_state::instance().load_provider(wsrep_provider, wsrep_provider_options ? wsrep_provider_options : ""); + int err= Wsrep_server_state::instance().load_provider( + wsrep_provider, wsrep_provider_options ? wsrep_provider_options : ""); if (err) { DBUG_PRINT("wsrep",("wsrep::init() failed: %d", err)); @@ -1603,7 +1605,12 @@ This allows autocommit SELECTs and a first SELECT after SET AUTOCOMMIT=0 TODO: modify to check if thd has locked any rows. */ - return thd->wsrep_cs().sync_wait(-1); + if (thd->wsrep_cs().sync_wait(-1)) + { + wsrep_override_error(thd, thd->wsrep_cs().current_error(), + thd->wsrep_cs().current_error_status()); + return true; + } } return false; @@ -2489,50 +2496,48 @@ /* Forward declarations. */ int wsrep_create_trigger_query(THD *thd, uchar** buf, size_t* buf_len); -bool wsrep_should_replicate_ddl_iterate(THD* thd, const TABLE_LIST* table_list) -{ - if (WSREP(thd)) - { - for (const TABLE_LIST* it= table_list; it; it= it->next_global) - { - if (it->table && - !wsrep_should_replicate_ddl(thd, it->table->s->db_type())) - return false; - } - } - return true; -} +/*! Should DDL be replicated by Galera + * + * @param thd thread handle + * @param hton real storage engine handlerton + * + * @retval true if we should replicate DDL, false if not */ bool wsrep_should_replicate_ddl(THD* thd, const handlerton *hton) { if (!wsrep_check_mode(WSREP_MODE_STRICT_REPLICATION)) return true; - if (!hton) - return true; + DBUG_ASSERT(hton != nullptr); switch (hton->db_type) { + case DB_TYPE_UNKNOWN: + /* Special pseudo-handlertons (such as 10.6+ JSON tables). 
*/ + return true; + break; case DB_TYPE_INNODB: return true; break; case DB_TYPE_MYISAM: if (wsrep_check_mode(WSREP_MODE_REPLICATE_MYISAM)) return true; - else - WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); break; case DB_TYPE_ARIA: if (wsrep_check_mode(WSREP_MODE_REPLICATE_ARIA)) - return true; - else - WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); + return true; + break; + case DB_TYPE_PARTITION_DB: + /* In most cases this means we could not find out + table->file->partition_ht() */ + return true; break; default: - WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); break; } + WSREP_DEBUG("wsrep OSU failed for %s", wsrep_thd_query(thd)); + /* wsrep_mode = STRICT_REPLICATION, treat as error */ my_error(ER_GALERA_REPLICATION_NOT_SUPPORTED, MYF(0)); push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, @@ -2542,6 +2547,26 @@ ha_resolve_storage_engine_name(hton)); return false; } + +bool wsrep_should_replicate_ddl_iterate(THD* thd, const TABLE_LIST* table_list) +{ + for (const TABLE_LIST* it= table_list; it; it= it->next_global) + { + const TABLE* table= it->table; + if (table && !it->table_function) + { + /* If this is partitioned table we need to find out + implementing storage engine handlerton. + */ + const handlerton *ht= table->file->partition_ht(); + if (!ht) ht= table->s->db_type(); + if (!wsrep_should_replicate_ddl(thd, ht)) + return false; + } + } + return true; +} + /* Decide if statement should run in TOI. 
@@ -2650,9 +2675,8 @@ if (create_info) { const handlerton *hton= create_info->db_type; - if (!hton) - hton= ha_default_handlerton(thd); + hton= ha_default_handlerton(thd); if (!wsrep_should_replicate_ddl(thd, hton)) return false; } @@ -2787,7 +2811,6 @@ unireg_abort(1); } - /* returns: 0: statement was replicated as TOI @@ -2803,6 +2826,7 @@ DBUG_ASSERT(wsrep_OSU_method_get(thd) == WSREP_OSU_TOI); WSREP_DEBUG("TOI Begin: %s", wsrep_thd_query(thd)); + DEBUG_SYNC(thd, "wsrep_before_toi_begin"); if (wsrep_can_run_in_toi(thd, db, table, table_list, create_info) == false) { @@ -3043,12 +3067,13 @@ const wsrep::key_array *fk_tables, const HA_CREATE_INFO *create_info) { + DEBUG_SYNC(thd, "wsrep_kill_thd_before_enter_toi"); mysql_mutex_lock(&thd->LOCK_thd_kill); const killed_state killed = thd->killed; mysql_mutex_unlock(&thd->LOCK_thd_kill); if (killed) { - DBUG_ASSERT(FALSE); + /* The thread may have been killed as a result of memory pressure. */ return -1; } @@ -3217,29 +3242,28 @@ @param requestor_ctx The MDL context of the requestor @param ticket MDL ticket for the requested lock + @param key The key of the object (data) being protected - @retval TRUE Lock request can be granted - @retval FALSE Lock request cannot be granted */ - void wsrep_handle_mdl_conflict(MDL_context *requestor_ctx, const MDL_ticket *ticket, const MDL_key *key) { THD *request_thd= requestor_ctx->get_thd(); - THD *granted_thd= ticket->get_ctx()->get_thd(); /* Fallback to the non-wsrep behaviour */ if (!WSREP(request_thd)) return; - const char* schema= key->db_name(); - int schema_len= key->db_name_length(); - mysql_mutex_lock(&request_thd->LOCK_thd_data); if (wsrep_thd_is_toi(request_thd) || wsrep_thd_is_applying(request_thd)) { + THD *granted_thd= ticket->get_ctx()->get_thd(); + + const char* schema= key->db_name(); + int schema_len= key->db_name_length(); + WSREP_DEBUG("wsrep_handle_mdl_conflict request TOI/APPLY for %s", wsrep_thd_query(request_thd)); THD_STAGE_INFO(request_thd, 
stage_waiting_isolation); @@ -3259,7 +3283,6 @@ /* Here we will call wsrep_abort_transaction so we should hold THD::LOCK_thd_data to protect victim from concurrent usage and THD::LOCK_thd_kill to protect from disconnect or delete. - */ mysql_mutex_lock(&granted_thd->LOCK_thd_kill); mysql_mutex_lock(&granted_thd->LOCK_thd_data); @@ -3303,16 +3326,21 @@ (granted_thd->system_thread != NON_SYSTEM_THREAD && granted_thd->mdl_context.has_explicit_locks())) { - WSREP_DEBUG("BF thread waiting for FLUSH for %s", - wsrep_thd_query(request_thd)); - THD_STAGE_INFO(request_thd, stage_waiting_ddl); + WSREP_DEBUG("BF thread waiting for %s", + granted_thd->lex->sql_command == SQLCOM_FLUSH ? "FLUSH" : "BACKUP"); ticket->wsrep_report(wsrep_debug); + if (granted_thd->current_backup_stage != BACKUP_FINISHED && wsrep_check_mode(WSREP_MODE_BF_MARIABACKUP)) { wsrep_abort_thd(request_thd, granted_thd, 1); } } + else if (granted_thd->lex->sql_command == SQLCOM_LOCK_TABLES) + { + WSREP_DEBUG("BF thread waiting for LOCK TABLES"); + ticket->wsrep_report(wsrep_debug); + } else if (request_thd->lex->sql_command == SQLCOM_DROP_TABLE) { WSREP_DEBUG("DROP caused BF abort, conf %s for %s", diff -Nru mariadb-10.11.11/sql/wsrep_mysqld.h mariadb-10.11.13/sql/wsrep_mysqld.h --- mariadb-10.11.11/sql/wsrep_mysqld.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_mysqld.h 2025-05-19 16:14:25.000000000 +0000 @@ -356,7 +356,7 @@ const wsrep::key_array *fk_tables= nullptr, const HA_CREATE_INFO* create_info= nullptr); -bool wsrep_should_replicate_ddl(THD* thd, const handlerton *db_type); +bool wsrep_should_replicate_ddl(THD* thd, const handlerton *hton); bool wsrep_should_replicate_ddl_iterate(THD* thd, const TABLE_LIST* table_list); void wsrep_to_isolation_end(THD *thd); @@ -615,7 +615,6 @@ #define wsrep_thr_deinit() do {} while(0) #define wsrep_init_globals() do {} while(0) #define wsrep_create_appliers(X) do {} while(0) -#define wsrep_should_replicate_ddl(X,Y) (1) #define 
wsrep_cluster_address_exists() (false) #define WSREP_MYSQL_DB (0) #define WSREP_TO_ISOLATION_BEGIN(db_, table_, table_list_) do { } while(0) diff -Nru mariadb-10.11.11/sql/wsrep_server_service.cc mariadb-10.11.13/sql/wsrep_server_service.cc --- mariadb-10.11.11/sql/wsrep_server_service.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_server_service.cc 2025-05-19 16:14:25.000000000 +0000 @@ -192,6 +192,7 @@ break; case wsrep::log::unknown: WSREP_UNKNOWN("%s", message); + assert(0); break; } } diff -Nru mariadb-10.11.11/sql/wsrep_sst.cc mariadb-10.11.13/sql/wsrep_sst.cc --- mariadb-10.11.11/sql/wsrep_sst.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_sst.cc 2025-05-19 16:14:25.000000000 +0000 @@ -464,7 +464,7 @@ if (WSREP_ON) { int const rcode(seqno < 0 ? seqno : 0); - error= wsrep_sst_complete(thd,rcode, sst_gtid); + error= wsrep_sst_complete(thd, rcode, sst_gtid); } return error; @@ -1977,6 +1977,15 @@ wsrep::seqno(err ? wsrep::seqno::undefined() : wsrep::seqno(ret_seqno))); +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("sync.wsrep_sst_donor_after_donation", { + const char act[]= "now " + "SIGNAL sync.wsrep_sst_donor_after_donation_reached " + "WAIT_FOR signal.wsrep_sst_donor_after_donation_continue"; + DBUG_ASSERT(!debug_sync_set_action(thd.ptr, STRING_WITH_LEN(act))); + }); +#endif /* ENABLED_DEBUG_SYNC */ + Wsrep_server_state::instance().sst_sent(gtid, err); proc.wait(); diff -Nru mariadb-10.11.11/sql/wsrep_thd.h mariadb-10.11.13/sql/wsrep_thd.h --- mariadb-10.11.11/sql/wsrep_thd.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_thd.h 2025-05-19 16:14:25.000000000 +0000 @@ -237,25 +237,13 @@ wsrep::client_error ce, enum wsrep::provider::status status) { - DBUG_ASSERT(ce != wsrep::e_success); - switch (ce) - { - case wsrep::e_error_during_commit: - if (status == wsrep::provider::error_size_exceeded) - wsrep_override_error(thd, ER_UNKNOWN_ERROR, "Maximum writeset size exceeded"); - else - 
wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, 0, status); - break; - case wsrep::e_deadlock_error: - wsrep_override_error(thd, ER_LOCK_DEADLOCK); - break; - case wsrep::e_interrupted_error: - wsrep_override_error(thd, ER_QUERY_INTERRUPTED); - break; - case wsrep::e_size_exceeded_error: + DBUG_ASSERT(ce != wsrep::e_success); + switch (ce) + { + case wsrep::e_error_during_commit: + if (status == wsrep::provider::error_size_exceeded) wsrep_override_error(thd, ER_UNKNOWN_ERROR, "Maximum writeset size exceeded"); - break; - case wsrep::e_append_fragment_error: + else /* TODO: Figure out better error number */ if (status) wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, @@ -265,17 +253,45 @@ else wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, "Error while appending streaming replication fragment"); - break; - case wsrep::e_not_supported_error: - wsrep_override_error(thd, ER_NOT_SUPPORTED_YET); - break; - case wsrep::e_timeout_error: - wsrep_override_error(thd, ER_LOCK_WAIT_TIMEOUT); + break; + case wsrep::e_deadlock_error: + switch (thd->lex->sql_command) + { + case SQLCOM_XA_END: + case SQLCOM_XA_PREPARE: + wsrep_override_error(thd, ER_XA_RBDEADLOCK); break; default: - wsrep_override_error(thd, ER_UNKNOWN_ERROR); + wsrep_override_error(thd, ER_LOCK_DEADLOCK); break; } + break; + case wsrep::e_interrupted_error: + wsrep_override_error(thd, ER_QUERY_INTERRUPTED); + break; + case wsrep::e_size_exceeded_error: + wsrep_override_error(thd, ER_UNKNOWN_ERROR, "Maximum writeset size exceeded"); + break; + case wsrep::e_append_fragment_error: + /* TODO: Figure out better error number */ + if (status) + wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, + "Error while appending streaming replication fragment" + "(provider status: %s)", + wsrep::provider::to_string(status).c_str()); + else + wsrep_override_error(thd, ER_ERROR_DURING_COMMIT, + "Error while appending streaming replication fragment"); + break; + case wsrep::e_not_supported_error: + wsrep_override_error(thd, 
ER_NOT_SUPPORTED_YET); + break; + case wsrep::e_timeout_error: + wsrep_override_error(thd, ER_LOCK_WAIT_TIMEOUT); + break; + default: + wsrep_override_error(thd, ER_UNKNOWN_ERROR); + } } /** diff -Nru mariadb-10.11.11/sql/wsrep_trans_observer.h mariadb-10.11.13/sql/wsrep_trans_observer.h --- mariadb-10.11.11/sql/wsrep_trans_observer.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_trans_observer.h 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright 2016-2023 Codership Oy +/* Copyright 2016-2025 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -265,12 +265,17 @@ { DBUG_RETURN(ret); } + if ((ret= thd->wsrep_cs().before_prepare()) == 0) { DBUG_ASSERT(!thd->wsrep_trx().ws_meta().gtid().is_undefined()); + /* Here we init xid with UUID and wsrep seqno. GTID is + set to undefined because commit order is decided later + in wsrep_before_commit(). wsrep_before_prepare() is + executed out of order. */ wsrep_xid_init(&thd->wsrep_xid, thd->wsrep_trx().ws_meta().gtid(), - wsrep_gtid_server.gtid()); + wsrep_gtid_server.undefined()); } mysql_mutex_lock(&thd->LOCK_thd_kill); @@ -472,12 +477,6 @@ int wsrep_after_statement(THD* thd) { DBUG_ENTER("wsrep_after_statement"); - WSREP_DEBUG("wsrep_after_statement for %lu client_state %s " - " client_mode %s trans_state %s", - thd_get_thread_id(thd), - wsrep::to_c_string(thd->wsrep_cs().state()), - wsrep::to_c_string(thd->wsrep_cs().mode()), - wsrep::to_c_string(thd->wsrep_cs().transaction().state())); int ret= ((thd->wsrep_cs().state() != wsrep::client_state::s_none && thd->wsrep_cs().mode() == Wsrep_client_state::m_local) && !thd->internal_transaction() ? 
diff -Nru mariadb-10.11.11/sql/wsrep_var.cc mariadb-10.11.13/sql/wsrep_var.cc --- mariadb-10.11.11/sql/wsrep_var.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_var.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright 2008-2022 Codership Oy +/* Copyright 2008-2023 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -353,14 +353,12 @@ var->save_result.string_value.length); start_pos_buf[var->save_result.string_value.length]= 0; - WSREP_DEBUG("SST wsrep_start_position check for new position %s old %s", - start_pos_buf, wsrep_start_position); + start_pos_buf, wsrep_start_position); // Verify the format. if (wsrep_start_position_verify(start_pos_buf)) return true; - // Give error if position is updated when wsrep is not enabled or // provider is not loaded. if ((!WSREP_ON || !Wsrep_server_state::instance().is_provider_loaded()) @@ -667,7 +665,7 @@ { wsrep_create_rollbacker(); WSREP_DEBUG("Cluster address update creating %ld applier threads running %lu", - wsrep_slave_threads, wsrep_running_applier_threads); + wsrep_slave_threads, wsrep_running_applier_threads); wsrep_create_appliers(wsrep_slave_threads); } mysql_mutex_unlock(&LOCK_wsrep_cluster_config); @@ -771,7 +769,7 @@ { wsrep_slave_count_change = (wsrep_slave_threads - wsrep_running_applier_threads); WSREP_DEBUG("Change on slave threads: New %ld old %lu difference %d", - wsrep_slave_threads, wsrep_running_applier_threads, wsrep_slave_count_change); + wsrep_slave_threads, wsrep_running_applier_threads, wsrep_slave_count_change); } bool wsrep_slave_threads_update (sys_var *self, THD* thd, enum_var_type type) @@ -796,9 +794,9 @@ // Thread creation and execution is asyncronous, therefore we need // wait them to be started or error produced while (wsrep_running_applier_threads != (ulong)wsrep_slave_threads && - !wsrep_thread_create_failed.load(std::memory_order_relaxed)) + 
!wsrep_thread_create_failed.load(std::memory_order_relaxed)) { - my_sleep(1000); + my_sleep(1000); } mysql_mutex_lock(&LOCK_global_system_variables); @@ -987,6 +985,22 @@ bool wsrep_mode_check(sys_var *self, THD* thd, set_var* var) { + ulonglong new_wsrep_mode= var->save_result.ulonglong_value; + ulonglong old_wsrep_mode= wsrep_mode; + wsrep_mode= new_wsrep_mode; + if (wsrep_check_mode(WSREP_MODE_REPLICATE_MYISAM) || + wsrep_check_mode(WSREP_MODE_REPLICATE_ARIA)) + { + if (!(wsrep_forced_binlog_format == BINLOG_FORMAT_UNSPEC || + wsrep_forced_binlog_format == BINLOG_FORMAT_ROW)) + { + my_message(ER_WRONG_ARGUMENTS, "wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA] " + "can't be enabled if wsrep_forced_binlog != [NONE|ROW]", MYF(0)); + wsrep_mode= old_wsrep_mode; + return true; + } + } + wsrep_mode= old_wsrep_mode; return false; } @@ -1130,3 +1144,28 @@ return false; } +bool wsrep_forced_binlog_format_check(sys_var *self, THD* thd, set_var* var) +{ + ulonglong new_forced_binlog_format= var->save_result.ulonglong_value; + + if (!(new_forced_binlog_format == BINLOG_FORMAT_UNSPEC || + new_forced_binlog_format == BINLOG_FORMAT_ROW)) + { + if (wsrep_check_mode(WSREP_MODE_BINLOG_ROW_FORMAT_ONLY)) + { + my_message(ER_WRONG_ARGUMENTS, "wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set " + "if wsrep_mode=BINLOG_ROW_FORMAT_ONLY", MYF(0)); + return true; + } + + if (wsrep_check_mode(WSREP_MODE_REPLICATE_MYISAM) || + wsrep_check_mode(WSREP_MODE_REPLICATE_ARIA)) + { + my_message(ER_WRONG_ARGUMENTS, "wsrep_forced_binlog_format=[MIXED|STATEMENT] can't be set " + "if wsrep_mode=[REPLICATE_MYISAM|REPLICATE_ARIA]", MYF(0)); + return true; + } + } + + return false; +} diff -Nru mariadb-10.11.11/sql/wsrep_var.h mariadb-10.11.13/sql/wsrep_var.h --- mariadb-10.11.11/sql/wsrep_var.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_var.h 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2021 Codership Oy +/* Copyright (C) 2013-2023 Codership 
Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -110,6 +110,7 @@ extern bool wsrep_gtid_domain_id_update UPDATE_ARGS; extern bool wsrep_mode_check CHECK_ARGS; +extern bool wsrep_forced_binlog_format_check CHECK_ARGS; #else /* WITH_WSREP */ #define wsrep_provider_init(X) diff -Nru mariadb-10.11.11/sql/wsrep_xid.cc mariadb-10.11.13/sql/wsrep_xid.cc --- mariadb-10.11.11/sql/wsrep_xid.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_xid.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright 2015 Codership Oy +/* Copyright 2015-2025 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -24,6 +24,8 @@ #include #include /* std::sort() */ +#include /* std::string */ +#include /* std::stringstream */ /* * WSREPXid */ @@ -119,11 +121,7 @@ if (hton->set_checkpoint) { - const unsigned char* uuid= wsrep_xid_uuid(xid); - char uuid_str[40]= {0, }; - wsrep_uuid_print((const wsrep_uuid_t*)uuid, uuid_str, sizeof(uuid_str)); - WSREP_DEBUG("Set WSREPXid for InnoDB: %s:%lld", - uuid_str, (long long)wsrep_xid_seqno(xid)); + WSREP_DEBUG("Set WSREPXid for InnoDB: %s", wsrep_xid_print(xid).c_str()); hton->set_checkpoint(hton, xid); } return FALSE; @@ -150,12 +148,7 @@ if (hton->get_checkpoint) { hton->get_checkpoint(hton, xid); - wsrep_uuid_t uuid; - memcpy(&uuid, wsrep_xid_uuid(xid), sizeof(uuid)); - char uuid_str[40]= {0, }; - wsrep_uuid_print(&uuid, uuid_str, sizeof(uuid_str)); - WSREP_DEBUG("Read WSREPXid from InnoDB: %s:%lld", - uuid_str, (long long)wsrep_xid_seqno(xid)); + WSREP_DEBUG("Read WSREPXid from InnoDB: %s", wsrep_xid_print(xid).c_str()); } return FALSE; } @@ -252,3 +245,29 @@ { std::sort(array, array + len, Wsrep_xid_cmp()); } + +std::string wsrep_xid_print(const XID *xid) +{ + std::stringstream ss; + const unsigned char* uuid= 
wsrep_xid_uuid(xid); + char uuid_str[40]= {0, }; + wsrep_uuid_print((const wsrep_uuid_t*)uuid, uuid_str, sizeof(uuid_str)); + wsrep_server_gtid_t gtid= {0,0,0}; + memcpy(>id, &xid->data[WSREP_XID_RPL_GTID_OFFSET], sizeof(wsrep_server_gtid_t)); + ss << uuid_str << ":" << wsrep_xid_seqno(xid) << " " << gtid.domain_id << "-" + << gtid.server_id << "-" << gtid.seqno; + return ss.str(); +} + +bool wsrep_is_xid_gtid_undefined(const XID *xid) +{ + wsrep_server_gtid_t gtid= {0,0,0}; + + if (wsrep_is_wsrep_xid(xid) && + xid->data[WSREP_XID_VERSION_OFFSET] == WSREP_XID_VERSION_3) + { + memcpy(>id, &xid->data[WSREP_XID_RPL_GTID_OFFSET], sizeof(wsrep_server_gtid_t)); + } + + return (gtid.seqno == 0 && gtid.server_id == 0 && gtid.domain_id == 0); +} diff -Nru mariadb-10.11.11/sql/wsrep_xid.h mariadb-10.11.13/sql/wsrep_xid.h --- mariadb-10.11.11/sql/wsrep_xid.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/sql/wsrep_xid.h 2025-05-19 16:14:25.000000000 +0000 @@ -1,4 +1,4 @@ -/* Copyright (C) 2015 Codership Oy +/* Copyright (C) 2015-2025 Codership Oy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +34,8 @@ //void wsrep_set_SE_checkpoint(XID&); /* uncomment if needed */ void wsrep_sort_xid_array(XID *array, int len); +std::string wsrep_xid_print(const XID *xid); +bool wsrep_is_xid_gtid_undefined(const XID *xid); #endif /* WITH_WSREP */ #endif /* WSREP_UTILS_H */ diff -Nru mariadb-10.11.11/sql/yy_mariadb.cc mariadb-10.11.13/sql/yy_mariadb.cc --- mariadb-10.11.11/sql/yy_mariadb.cc 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/sql/yy_mariadb.cc 2025-05-19 16:14:28.000000000 +0000 @@ -39851,7 +39851,7 @@ if ((yyvsp[0].lex_str).str) { if (unlikely(Lex->sql_command == SQLCOM_CREATE_VIEW && - check_column_name((yyvsp[0].lex_str).str))) + check_column_name((yyvsp[0].lex_str)))) my_yyabort_error((ER_WRONG_COLUMN_NAME, MYF(0), (yyvsp[0].lex_str).str)); 
(yyvsp[-2].item)->base_flags|= item_base_t::IS_EXPLICIT_NAME; (yyvsp[-2].item)->set_name(thd, (yyvsp[0].lex_str)); diff -Nru mariadb-10.11.11/sql/yy_oracle.cc mariadb-10.11.13/sql/yy_oracle.cc --- mariadb-10.11.11/sql/yy_oracle.cc 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/sql/yy_oracle.cc 2025-05-19 16:14:28.000000000 +0000 @@ -39022,7 +39022,7 @@ if ((yyvsp[0].lex_str).str) { if (unlikely(Lex->sql_command == SQLCOM_CREATE_VIEW && - check_column_name((yyvsp[0].lex_str).str))) + check_column_name((yyvsp[0].lex_str)))) my_yyabort_error((ER_WRONG_COLUMN_NAME, MYF(0), (yyvsp[0].lex_str).str)); (yyvsp[-2].item)->base_flags|= item_base_t::IS_EXPLICIT_NAME; (yyvsp[-2].item)->set_name(thd, (yyvsp[0].lex_str)); diff -Nru mariadb-10.11.11/storage/connect/CMakeLists.txt mariadb-10.11.13/storage/connect/CMakeLists.txt --- mariadb-10.11.11/storage/connect/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -413,14 +413,16 @@ RETURN() ENDIF() -IF(MSVC AND (CMAKE_CXX_FLAGS MATCHES "/MP")) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") # domdoc.cpp uses compiler directive #import which is not compatible # with the /MP option, resulting in compiler error C2813. # Remove /MP for this file. 
+ GET_TARGET_PROPERTY(CURRENT_COMPILE_OPTIONS connect COMPILE_OPTIONS) + LIST(REMOVE_ITEM CURRENT_COMPILE_OPTIONS "$<$:/MP>") + SET_TARGET_PROPERTIES(connect PROPERTIES COMPILE_OPTIONS "${CURRENT_COMPILE_OPTIONS}") SET(src_list ${CONNECT_SOURCES}) LIST(FIND src_list domdoc.cpp idx) IF(idx GREATER -1) - STRING(REPLACE "/MP" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") LIST(REMOVE_AT src_list ${idx}) SET_SOURCE_FILES_PROPERTIES(${src_list} PROPERTIES COMPILE_FLAGS "/MP") ENDIF() diff -Nru mariadb-10.11.11/storage/connect/connect.cc mariadb-10.11.13/storage/connect/connect.cc --- mariadb-10.11.11/storage/connect/connect.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/connect.cc 2025-05-19 16:14:25.000000000 +0000 @@ -92,11 +92,11 @@ free(dbuserp); - if (trace(1)) - htrc("CntEndDB: Freeing Dup\n"); + if (trace(1)) + htrc("CntEndDB: Freeing Dup\n"); - g->Activityp->Aptr = NULL; - } // endif dbuserp + g->Activityp->Aptr = NULL; // Free PlgGetUser() data + } // endif dbuserp } // end of CntEndDB diff -Nru mariadb-10.11.11/storage/connect/plgxml.h mariadb-10.11.13/storage/connect/plgxml.h --- mariadb-10.11.11/storage/connect/plgxml.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/plgxml.h 2025-05-19 16:14:25.000000000 +0000 @@ -5,7 +5,7 @@ /******************************************************************/ /* Dual XML implementation base classes defines. 
*/ /******************************************************************/ -#if !defined(BASE_BUFFER_SIZE) +#ifndef LIBXML2_SUPPORT enum ElementType { // libxml2 XML_ELEMENT_NODE = 1, XML_ATTRIBUTE_NODE = 2, @@ -28,7 +28,7 @@ XML_XINCLUDE_START = 19, XML_XINCLUDE_END = 20, XML_DOCB_DOCUMENT_NODE = 21}; -#endif // !BASE_BUFFER_SIZE +#endif //#if !defined(NODE_TYPE_LIST) #ifdef NOT_USED diff -Nru mariadb-10.11.11/storage/connect/tabxml.cpp mariadb-10.11.13/storage/connect/tabxml.cpp --- mariadb-10.11.11/storage/connect/tabxml.cpp 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/tabxml.cpp 2025-05-19 16:14:25.000000000 +0000 @@ -25,6 +25,9 @@ #include #include //#include +#ifdef LIBXML2_SUPPORT +#include +#endif #include "osutil.h" #define _O_RDONLY O_RDONLY #endif // !_WIN32 diff -Nru mariadb-10.11.11/storage/connect/user_connect.cc mariadb-10.11.13/storage/connect/user_connect.cc --- mariadb-10.11.11/storage/connect/user_connect.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/connect/user_connect.cc 2025-05-19 16:14:25.000000000 +0000 @@ -101,9 +101,6 @@ PACTIVITY ap= NULL; PDBUSER dup= NULL; - // Areasize= 64M because of VEC tables. 
Should be parameterisable -//g= PlugInit(NULL, 67108864); -//g= PlugInit(NULL, 134217728); // 128M was because of old embedded tests g= PlugInit(NULL, (size_t)worksize); // Check whether the initialization is complete @@ -113,12 +110,13 @@ printf("%s\n", g->Message); (void) PlugExit(g); + g= 0; - if (dup) - free(dup); + if (dup) + free(dup); return true; - } // endif g-> + } // endif g-> dup->Catalog= new MYCAT(NULL); @@ -128,17 +126,16 @@ g->Activityp= ap; g->Activityp->Aptr= dup; - pthread_mutex_lock(&usrmut); + pthread_mutex_lock(&usrmut); next= to_users; to_users= this; if (next) next->previous= this; - count = 1; - pthread_mutex_unlock(&usrmut); - - last_query_id= thdp->query_id; + count = 1; + pthread_mutex_unlock(&usrmut); + last_query_id= thdp->query_id; return false; } // end of user_init diff -Nru mariadb-10.11.11/storage/federatedx/federatedx_io.cc mariadb-10.11.13/storage/federatedx/federatedx_io.cc --- mariadb-10.11.11/storage/federatedx/federatedx_io.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/federatedx/federatedx_io.cc 2025-05-19 16:14:25.000000000 +0000 @@ -51,6 +51,7 @@ static const io_schemes_st federated_io_schemes[] = { { "mysql", &instantiate_io_mysql }, + { "mariadb", &instantiate_io_mysql }, { "null", instantiate_io_null } /* must be last element */ }; diff -Nru mariadb-10.11.11/storage/federatedx/ha_federatedx.cc mariadb-10.11.13/storage/federatedx/ha_federatedx.cc --- mariadb-10.11.11/storage/federatedx/ha_federatedx.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/federatedx/ha_federatedx.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1484,20 +1484,20 @@ sizeof(int) + 8); key.append(scheme); key.q_append('\0'); - server->hostname= (const char *) (intptr) key.length(); + size_t hostname_pos= key.length(); key.append(hostname); key.q_append('\0'); - server->database= (const char *) (intptr) key.length(); + size_t database_pos= key.length(); key.append(database); key.q_append('\0'); 
key.q_append((uint32) share->port); - server->socket= (const char *) (intptr) key.length(); + size_t socket_pos= key.length(); key.append(socket); key.q_append('\0'); - server->username= (const char *) (intptr) key.length(); + size_t username_pos= key.length(); key.append(username); key.q_append('\0'); - server->password= (const char *) (intptr) key.length(); + size_t password_pos= key.length(); key.append(password); key.c_ptr_safe(); // Ensure we have end \0 @@ -1505,13 +1505,12 @@ /* Copy and add end \0 */ server->key= (uchar *) strmake_root(mem_root, key.ptr(), key.length()); - /* pointer magic */ - server->scheme+= (intptr) server->key; - server->hostname+= (intptr) server->key; - server->database+= (intptr) server->key; - server->username+= (intptr) server->key; - server->password+= (intptr) server->key; - server->socket+= (intptr) server->key; + server->scheme= (const char *)server->key; + server->hostname= (const char *)server->key + hostname_pos; + server->database= (const char *)server->key + database_pos; + server->username= (const char *)server->key + username_pos; + server->password= (const char *)server->key + password_pos; + server->socket= (const char*)server->key + socket_pos; server->port= share->port; if (!share->socket) diff -Nru mariadb-10.11.11/storage/innobase/CMakeLists.txt mariadb-10.11.13/storage/innobase/CMakeLists.txt --- mariadb-10.11.11/storage/innobase/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -226,7 +226,6 @@ include/dict0pagecompress.h include/dict0pagecompress.inl include/dict0stats.h - include/dict0stats.inl include/dict0stats_bg.h include/dict0types.h include/dyn0buf.h diff -Nru mariadb-10.11.11/storage/innobase/btr/btr0sea.cc mariadb-10.11.13/storage/innobase/btr/btr0sea.cc --- mariadb-10.11.11/storage/innobase/btr/btr0sea.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/btr/btr0sea.cc 2025-05-19 
16:14:25.000000000 +0000 @@ -195,7 +195,7 @@ } /** Lazily free detached metadata when removing the last reference. */ -ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index) +ATTRIBUTE_COLD void btr_search_lazy_free(dict_index_t *index) { ut_ad(index->freed()); dict_table_t *table= index->table; @@ -219,8 +219,7 @@ table->autoinc_mutex.wr_unlock(); } -/** Disable the adaptive hash search system and empty the index. */ -void btr_search_disable() +ATTRIBUTE_COLD bool btr_search_disable() { dict_table_t* table; @@ -231,7 +230,7 @@ if (!btr_search_enabled) { dict_sys.unfreeze(); btr_search_x_unlock_all(); - return; + return false; } btr_search_enabled = false; @@ -259,23 +258,25 @@ btr_search_sys.clear(); btr_search_x_unlock_all(); + + return true; } /** Enable the adaptive hash search system. @param resize whether buf_pool_t::resize() is the caller */ -void btr_search_enable(bool resize) +ATTRIBUTE_COLD void btr_search_enable(bool resize) { if (!resize) { mysql_mutex_lock(&buf_pool.mutex); - bool changed = srv_buf_pool_old_size != srv_buf_pool_size; + const auto is_shrinking = buf_pool.is_shrinking(); mysql_mutex_unlock(&buf_pool.mutex); - if (changed) { + if (is_shrinking) { return; } } btr_search_x_lock_all(); - ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64; + ulint hash_size = buf_pool.curr_pool_size() / sizeof(void *) / 64; if (btr_search_sys.parts[0].heap) { ut_ad(btr_search_enabled); @@ -939,88 +940,6 @@ info->last_hash_succ = FALSE; } -/** Clear the adaptive hash index on all pages in the buffer pool. 
*/ -inline void buf_pool_t::clear_hash_index() noexcept -{ - ut_ad(!resizing); - ut_ad(!btr_search_enabled); - - std::set garbage; - - for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; ) - { - for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size; - block != end; block++) - { - dict_index_t *index= block->index; - assert_block_ahi_valid(block); - - /* We can clear block->index and block->n_pointers when - holding all AHI latches exclusively; see the comments in buf0buf.h */ - - if (!index) - { -# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - ut_a(!block->n_pointers); -# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - continue; - } - - ut_d(const auto s= block->page.state()); - /* Another thread may have set the state to - REMOVE_HASH in buf_LRU_block_remove_hashed(). - - The state change in buf_pool_t::realloc() is not observable - here, because in that case we would have !block->index. - - In the end, the entire adaptive hash index will be removed. */ - ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH); -# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG - block->n_pointers= 0; -# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ - if (index->freed()) - garbage.insert(index); - block->index= nullptr; - } - } - - for (dict_index_t *index : garbage) - btr_search_lazy_free(index); -} - -/** Get a buffer block from an adaptive hash index pointer. -This function does not return if the block is not identified. -@param ptr pointer to within a page frame -@return pointer to block, never NULL */ -inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const noexcept -{ - chunk_t::map *chunk_map = chunk_t::map_ref; - ut_ad(chunk_t::map_ref == chunk_t::map_reg); - ut_ad(!resizing); - - chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr); - ut_a(it != chunk_map->begin()); - - chunk_t *chunk= it == chunk_map->end() - ? 
chunk_map->rbegin()->second - : (--it)->second; - - const size_t offs= size_t(ptr - chunk->blocks->page.frame) >> - srv_page_size_shift; - ut_a(offs < chunk->size); - - buf_block_t *block= &chunk->blocks[offs]; - /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that - block[n].frame == block->page.frame + n * srv_page_size. Check it. */ - ut_ad(block->page.frame == page_align(ptr)); - /* Read the state of the block without holding hash_lock. - A state transition to REMOVE_HASH is possible during - this execution. */ - ut_ad(block->page.state() >= buf_page_t::REMOVE_HASH); - - return block; -} - /** Tries to guess the right search position based on the hash search info of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, and the function returns TRUE, then cursor->up_match and cursor->low_match @@ -1103,7 +1022,8 @@ return false; } - buf_block_t* block = buf_pool.block_from_ahi(rec); + buf_block_t* block = buf_pool.block_from(rec); + ut_ad(block->page.frame == page_align(rec)); buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get( block->page.id().fold()); @@ -2196,7 +2116,7 @@ for (; node != NULL; node = node->next) { const buf_block_t* block - = buf_pool.block_from_ahi((byte*) node->data); + = buf_pool.block_from(node->data); index_id_t page_index_id; if (UNIV_LIKELY(block->page.in_file())) { diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0buddy.cc mariadb-10.11.13/storage/innobase/buf/buf0buddy.cc --- mariadb-10.11.11/storage/innobase/buf/buf0buddy.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0buddy.cc 2025-05-19 16:14:25.000000000 +0000 @@ -162,6 +162,20 @@ } #ifdef UNIV_DEBUG +const buf_block_t *buf_pool_t::contains_zip(const void *data, size_t shift) + const noexcept +{ + const size_t d= size_t(data) >> shift; + + for (size_t i= 0; i < n_blocks; i++) + { + const buf_block_t *block= get_nth_page(i); + if (size_t(block->page.zip.data) >> shift == d) + return block; + } + return 
nullptr; +} + /** Validate a given zip_free list. */ struct CheckZipFree { CheckZipFree(ulint i) : m_i(i) {} @@ -257,13 +271,10 @@ /** Add a block to the head of the appropriate buddy free list. @param[in,out] buf block to be freed @param[in] i index of buf_pool.zip_free[] */ -UNIV_INLINE -void -buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i) +static void buf_buddy_add_to_free(buf_buddy_free_t *buf, ulint i) { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(buf_pool.zip_free[i].start != buf); - buf_buddy_stamp_free(buf, i); UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf); ut_d(buf_buddy_list_validate(i)); @@ -272,9 +283,7 @@ /** Remove a block from the appropriate buddy free list. @param[in,out] buf block to be freed @param[in] i index of buf_pool.zip_free[] */ -UNIV_INLINE -void -buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i) +static void buf_buddy_remove_from_free(buf_buddy_free_t *buf, ulint i) { mysql_mutex_assert_owner(&buf_pool.mutex); ut_ad(buf_buddy_check_free(buf, i)); @@ -298,13 +307,10 @@ buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]); - if (buf_pool.is_shrinking() - && UT_LIST_GET_LEN(buf_pool.withdraw) - < buf_pool.withdraw_target) { - + if (size_t size = buf_pool.shrinking_size()) { while (buf != NULL && buf_pool.will_be_withdrawn( - reinterpret_cast(buf))) { + reinterpret_cast(buf), size)) { /* This should be withdrawn, not to be allocated */ buf = UT_LIST_GET_NEXT(list, buf); } @@ -312,6 +318,7 @@ if (buf) { buf_buddy_remove_from_free(buf, i); + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + i)); } else if (i + 1 < BUF_BUDDY_SIZES) { /* Attempt to split. 
*/ buf = buf_buddy_alloc_zip(i + 1); @@ -321,7 +328,6 @@ reinterpret_cast( reinterpret_cast(buf) + (BUF_BUDDY_LOW << i)); - ut_ad(!buf_pool.contains_zip(buddy)); buf_buddy_add_to_free(buddy, i); } } @@ -340,74 +346,52 @@ return(buf); } +#ifdef UNIV_DEBUG +/** number of blocks allocated to the buddy system */ +static size_t buf_buddy_n_frames; +#endif + /** Deallocate a buffer frame of srv_page_size. @param buf buffer frame to deallocate */ static void buf_buddy_block_free(void *buf) noexcept { mysql_mutex_assert_owner(&buf_pool.mutex); - ut_a(!ut_align_offset(buf, srv_page_size)); - - const ulint fold= BUF_POOL_ZIP_FOLD_PTR(buf); - buf_page_t **prev= buf_pool.zip_hash.cell_get(fold)-> - search(&buf_page_t::hash, [buf](const buf_page_t *b) - { - ut_ad(b->in_zip_hash); - ut_ad(b->state() == buf_page_t::MEMORY); - return b->frame == buf; - }); - - buf_page_t *bpage= *prev; - ut_a(bpage); - ut_a(bpage->frame == buf); - ut_d(bpage->in_zip_hash= false); - *prev= bpage->hash; - bpage->hash= nullptr; - + buf_block_t *block= buf_pool.block_from(buf); + ut_ad(block->page.state() == buf_page_t::MEMORY); + ut_ad(block->page.frame == buf); + ut_ad(!buf_pool.contains_zip(buf, srv_page_size_shift)); ut_d(memset(buf, 0, srv_page_size)); MEM_UNDEFINED(buf, srv_page_size); - - buf_LRU_block_free_non_file_page(reinterpret_cast(bpage)); - ut_ad(buf_pool.buddy_n_frames > 0); - ut_d(buf_pool.buddy_n_frames--); + buf_LRU_block_free_non_file_page(block); + ut_ad(buf_buddy_n_frames > 0); + ut_d(buf_buddy_n_frames--); } /** Allocate a buffer block to the buddy allocator. 
@param block buffer block to register */ static void buf_buddy_block_register(buf_block_t *block) noexcept { - const ulint fold= BUF_POOL_ZIP_FOLD(block); + ut_ad(buf_pool.is_uncompressed_current(block)); ut_ad(block->page.state() == buf_page_t::MEMORY); - - ut_a(block->page.frame); - ut_a(!ut_align_offset(block->page.frame, srv_page_size)); - - ut_ad(!block->page.in_zip_hash); - ut_d(block->page.in_zip_hash= true); - buf_pool.zip_hash.cell_get(fold)->append(block->page, &buf_page_t::hash); - ut_d(buf_pool.buddy_n_frames++); + ut_d(buf_buddy_n_frames++); } /** Allocate a block from a bigger object. @param[in] buf a block that is free to use @param[in] i index of buf_pool.zip_free[] -@param[in] j size of buf as an index of buf_pool.zip_free[] @return allocated block */ -static -void* -buf_buddy_alloc_from(void* buf, ulint i, ulint j) +static void *buf_buddy_alloc_from(void *buf, ulint i) { - ulint offs = BUF_BUDDY_LOW << j; - ut_ad(j <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); - ut_ad(j >= i); - ut_ad(!ut_align_offset(buf, offs)); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(!ut_align_offset(buf, srv_page_size)); + ut_ad(!buf_pool.contains_zip(buf, srv_page_size_shift)); /* Add the unused parts of the block to the free lists. */ - while (j > i) { + for (ulint j = BUF_BUDDY_SIZES, offs = srv_page_size; j-- > i; ) { buf_buddy_free_t* zip_buf; offs >>= 1; - j--; zip_buf = reinterpret_cast( reinterpret_cast(buf) + offs); @@ -422,7 +406,7 @@ @param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES @param lru assigned to true if buf_pool.mutex was temporarily released @return allocated block, never NULL */ -byte *buf_buddy_alloc_low(ulint i, bool *lru) +byte *buf_buddy_alloc_low(ulint i, bool *lru) noexcept { buf_block_t* block; @@ -439,7 +423,7 @@ } /* Try allocating from the buf_pool.free list. 
*/ - block = buf_LRU_get_free_only(); + block = buf_pool.allocate(); if (block) { goto alloc_big; @@ -455,21 +439,21 @@ buf_buddy_block_register(block); block = reinterpret_cast( - buf_buddy_alloc_from(block->page.frame, i, BUF_BUDDY_SIZES)); + buf_buddy_alloc_from(block->page.frame, i)); func_exit: buf_pool.buddy_stat[i].used++; return reinterpret_cast(block); } -/** Try to relocate a block. The caller must hold zip_free_mutex, and this -function will release and lock it again. +/** Try to relocate a block. @param[in] src block to relocate @param[in] dst free block to relocated to @param[in] i index of buf_pool.zip_free[] @param[in] force true if we must relocated always @return true if relocated */ -static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) +static bool buf_buddy_relocate(void *src, void *dst, ulint i, bool force) + noexcept { buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; @@ -575,7 +559,7 @@ @param[in] buf block to be freed, must not be pointed to by the buffer pool @param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ -void buf_buddy_free_low(void* buf, ulint i) +void buf_buddy_free_low(void* buf, ulint i) noexcept { buf_buddy_free_t* buddy; @@ -595,13 +579,12 @@ ut_ad(i < BUF_BUDDY_SIZES); ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i)); - ut_ad(!buf_pool.contains_zip(buf)); + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + i)); /* Do not recombine blocks if there are few free blocks. 
We may waste up to 15360*max_len bytes to free blocks (1024 + 2048 + 4096 + 8192 = 15360) */ - if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16 - && !buf_pool.is_shrinking()) { + if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16) { goto func_exit; } @@ -615,10 +598,9 @@ /* The buddy is free: recombine */ buf_buddy_remove_from_free(buddy, i); buddy_is_free: - ut_ad(!buf_pool.contains_zip(buddy)); i++; buf = ut_align_down(buf, BUF_BUDDY_LOW << i); - + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + i)); goto recombine; case BUF_BUDDY_STATE_USED: @@ -655,107 +637,120 @@ buf_buddy_add_to_free(reinterpret_cast(buf), i); } -/** Try to reallocate a block. -@param[in] buf buf_pool block to be reallocated -@param[in] size block size, up to srv_page_size -@return whether the reallocation succeeded */ -bool -buf_buddy_realloc(void* buf, ulint size) -{ - buf_block_t* block = NULL; - ulint i = buf_buddy_get_slot(size); - - mysql_mutex_assert_owner(&buf_pool.mutex); - ut_ad(i <= BUF_BUDDY_SIZES); - ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); - - if (i < BUF_BUDDY_SIZES) { - /* Try to allocate from the buddy system. */ - block = reinterpret_cast(buf_buddy_alloc_zip(i)); - } - - if (block == NULL) { - /* Try allocating from the buf_pool.free list. */ - block = buf_LRU_get_free_only(); - - if (block == NULL) { - return(false); /* free_list was not enough */ - } +/** Reallocate a ROW_FORMAT=COMPRESSED page frame during buf_pool_t::shrink(). 
+@param bpage page descriptor covering a ROW_FORMAT=COMPRESSED page +@param block uncompressed block for storage +@return block +@retval nullptr if the block was consumed */ +ATTRIBUTE_COLD +buf_block_t *buf_buddy_shrink(buf_page_t *bpage, buf_block_t *block) noexcept +{ + ut_ad(bpage->zip.data); + + void *dst= nullptr; + ulint size= page_zip_get_size(&bpage->zip); + ulint i= buf_buddy_get_slot(size); + + ut_ad(buf_pool.will_be_withdrawn(bpage->zip.data, size)); + ut_ad(bpage->can_relocate()); + ut_ad(i <= BUF_BUDDY_SIZES); + ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + if (UNIV_LIKELY(i < BUF_BUDDY_SIZES)) + dst= buf_buddy_alloc_zip(i); + + if (!dst) + { + buf_buddy_block_register(block); + dst= buf_buddy_alloc_from(block->page.frame, i); + ut_ad(dst); + block= nullptr; + } + + void *src= bpage->zip.data; + memcpy_aligned(dst, src, size); + bpage->zip.data= static_cast(dst); + buf_pool.buddy_stat[i].relocated++; + + while (i < BUF_BUDDY_SIZES) + { + MEM_UNDEFINED(src, BUF_BUDDY_LOW << i); + /* Try to combine adjacent blocks. */ + buf_buddy_free_t *buddy= reinterpret_cast + (buf_buddy_get(static_cast(src), BUF_BUDDY_LOW << i)); - buf_buddy_block_register(block); - - block = reinterpret_cast( - buf_buddy_alloc_from( - block->page.frame, i, BUF_BUDDY_SIZES)); - } - - buf_pool.buddy_stat[i].used++; - - /* Try to relocate the buddy of buf to the free block. */ - if (buf_buddy_relocate(buf, block, i, true)) { - /* succeeded */ - buf_buddy_free_low(buf, i); - } else { - /* failed */ - buf_buddy_free_low(block, i); - } - - return(true); /* free_list was enough */ -} - -/** Combine all pairs of free buddies. 
*/ -void buf_buddy_condense_free() -{ - mysql_mutex_assert_owner(&buf_pool.mutex); - ut_ad(buf_pool.is_shrinking()); - - for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) { - buf_buddy_free_t* buf = - UT_LIST_GET_FIRST(buf_pool.zip_free[i]); - - /* seek to withdraw target */ - while (buf != NULL - && !buf_pool.will_be_withdrawn( - reinterpret_cast(buf))) { - buf = UT_LIST_GET_NEXT(list, buf); - } - - while (buf != NULL) { - buf_buddy_free_t* next = - UT_LIST_GET_NEXT(list, buf); - - buf_buddy_free_t* buddy = - reinterpret_cast( - buf_buddy_get( - reinterpret_cast(buf), - BUF_BUDDY_LOW << i)); - - /* seek to the next withdraw target */ - while (true) { - while (next != NULL - && !buf_pool.will_be_withdrawn( - reinterpret_cast(next))) { - next = UT_LIST_GET_NEXT(list, next); - } - - if (buddy != next) { - break; - } - - next = UT_LIST_GET_NEXT(list, next); - } - - if (buf_buddy_is_free(buddy, i) - == BUF_BUDDY_STATE_FREE) { - /* Both buf and buddy are free. - Try to combine them. */ - buf_buddy_remove_from_free(buf, i); - buf_pool.buddy_stat[i].used++; + if (buf_buddy_is_free(buddy, i) != BUF_BUDDY_STATE_FREE) + { + ut_ad(!buf_pool.contains_zip(src, BUF_BUDDY_LOW_SHIFT + i)); + buf_buddy_add_to_free(static_cast(src), i); + return block; + } + + /* The buddy is free: recombine */ + buf_buddy_remove_from_free(buddy, i); + i++; + src= ut_align_down(src, BUF_BUDDY_LOW << i); + } + + buf_buddy_block_free(src); + return block; +} + +/** Combine all pairs of free buddies. 
+@param size the target innodb_buffer_pool_size */ +ATTRIBUTE_COLD void buf_buddy_condense_free(size_t size) noexcept +{ + ut_ad(size); + ut_ad(size == buf_pool.shrinking_size()); + + for (ulint i= 0; i < array_elements(buf_pool.zip_free); i++) + { + buf_buddy_free_t *buf= UT_LIST_GET_FIRST(buf_pool.zip_free[i]); + + /* seek to withdraw target */ + while (buf && + !buf_pool.will_be_withdrawn(reinterpret_cast(buf), size)) + buf= UT_LIST_GET_NEXT(list, buf); - buf_buddy_free_low(buf, i); - } + for (buf_buddy_free_t *next= buf; buf; buf= next) + { + buf_buddy_free_t *buddy= reinterpret_cast + (buf_buddy_get(reinterpret_cast(buf), BUF_BUDDY_LOW << i)); - buf = next; - } - } + /* seek to the next withdraw target */ + do + { + while ((next= UT_LIST_GET_NEXT(list, next)) && + !buf_pool.will_be_withdrawn(reinterpret_cast(next), + size)) {} + } + while (buddy == next); + + if (buf_buddy_is_free(buddy, i) != BUF_BUDDY_STATE_FREE) + continue; + + buf_buddy_remove_from_free(buf, i); + ulint j= i; + recombine: + buf_buddy_remove_from_free(buddy, j); + j++; + buf= static_cast + (ut_align_down(buf, BUF_BUDDY_LOW << j)); + MEM_UNDEFINED(buf, BUF_BUDDY_LOW << j); + + if (j == BUF_BUDDY_SIZES) + { + buf_buddy_block_free(buf); + continue; + } + + buddy= reinterpret_cast + (buf_buddy_get(reinterpret_cast(buf), BUF_BUDDY_LOW << j)); + if (buf_buddy_is_free(buddy, j) == BUF_BUDDY_STATE_FREE) + goto recombine; + + ut_ad(!buf_pool.contains_zip(buf, BUF_BUDDY_LOW_SHIFT + j)); + buf_buddy_add_to_free(buf, j); + } + } } diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0buf.cc mariadb-10.11.13/storage/innobase/buf/buf0buf.cc --- mariadb-10.11.11/storage/innobase/buf/buf0buf.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0buf.cc 2025-05-19 16:14:25.000000000 +0000 @@ -47,8 +47,6 @@ #include "lock0lock.h" #include "btr0sea.h" #include "ibuf0ibuf.h" -#include "trx0undo.h" -#include "trx0purge.h" #include "log0log.h" #include "dict0stats_bg.h" #include 
"srv0srv.h" @@ -64,6 +62,7 @@ #include #include #include "log.h" +#include "my_virtual_mem.h" using st_::span; @@ -277,6 +276,56 @@ */ #ifndef UNIV_INNOCHECKSUM +/** Compute the number of page frames needed for buf_block_t, +per innodb_buffer_pool_extent_size. +@param ps innodb_page_size +@return number of buf_block_t frames per extent */ +static constexpr uint8_t first_page(size_t ps) +{ + return uint8_t(innodb_buffer_pool_extent_size / ps - + innodb_buffer_pool_extent_size / (ps + sizeof(buf_block_t))); +} + +/** Compute the number of bytes needed for buf_block_t, +per innodb_buffer_pool_extent_size. +@param ps innodb_page_size +@return number of buf_block_t frames per extent */ +static constexpr size_t first_frame(size_t ps) +{ + return first_page(ps) * ps; +} + +/** Compute the number of pages per innodb_buffer_pool_extent_size. +@param ps innodb_page_size +@return number of buf_block_t frames per extent */ +static constexpr uint16_t pages(size_t ps) +{ + return uint16_t(innodb_buffer_pool_extent_size / ps - first_page(ps)); +} + +/** The byte offset of the first page frame in a buffer pool extent +of innodb_buffer_pool_extent_size bytes */ +static constexpr size_t first_frame_in_extent[]= +{ + first_frame(4096), first_frame(8192), first_frame(16384), + first_frame(32768), first_frame(65536) +}; + +/** The position offset of the first page frame in a buffer pool extent +of innodb_buffer_pool_extent_size bytes */ +static constexpr uint8_t first_page_in_extent[]= +{ + first_page(4096), first_page(8192), first_page(16384), + first_page(32768), first_page(65536) +}; + +/** Number of pages per buffer pool extent +of innodb_buffer_pool_extent_size bytes */ +static constexpr size_t pages_in_extent[]= +{ + pages(4096), pages(8192), pages(16384), pages(32768), pages(65536) +}; + # ifdef SUX_LOCK_GENERIC void page_hash_latch::read_lock_wait() noexcept { @@ -326,8 +375,6 @@ /** The InnoDB buffer pool */ buf_pool_t buf_pool; -buf_pool_t::chunk_t::map 
*buf_pool_t::chunk_t::map_reg; -buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; #ifdef UNIV_DEBUG /** This is used to insert validation operations in execution @@ -511,16 +558,18 @@ } #ifndef UNIV_INNOCHECKSUM -/** Checks whether the lsn present in the page is lesser than the -peek current lsn. -@param check_lsn lsn to check +/** Check whether a page is newer than the durable LSN. +@param check_lsn whether to check the LSN @param read_buf page frame -@return whether the FIL_PAGE_LSN is invalid */ -static bool buf_page_check_lsn(bool check_lsn, const byte *read_buf) +@return whether the FIL_PAGE_LSN is invalid (ahead of the durable LSN) */ +static bool buf_page_check_lsn(bool check_lsn, const byte *read_buf) noexcept { if (!check_lsn) return false; - lsn_t current_lsn= log_sys.get_lsn(); + /* A page may not be read before it is written, and it may not be + written before the corresponding log has been durably written. + Hence, we refer to the current durable LSN here */ + lsn_t current_lsn= log_sys.get_flushed_lsn(std::memory_order_relaxed); if (UNIV_UNLIKELY(current_lsn == log_sys.FIRST_LSN) && srv_force_recovery == SRV_FORCE_NO_LOG_REDO) return false; @@ -797,6 +846,11 @@ bool setup() { + m_num_fds= 0; + + if (my_use_large_pages) + return false; + static_assert(array_elements(m_fds) == (array_elements(m_triggers) + 1), "insufficient fds"); std::string memcgroup{"/sys/fs/cgroup"}; @@ -809,7 +863,6 @@ cgroup.erase(0, 3); // Remove "0::" memcgroup+= cgroup + "/memory.pressure"; - m_num_fds= 0; for (auto trig= std::begin(m_triggers); trig!= std::end(m_triggers); ++trig) { if ((m_fds[m_num_fds].fd= @@ -958,29 +1011,121 @@ } /** Initialize mem pressure. 
*/ -ATTRIBUTE_COLD void buf_mem_pressure_detect_init() +ATTRIBUTE_COLD static void buf_mem_pressure_detect_init() noexcept { mem_pressure_obj.setup(); } -ATTRIBUTE_COLD void buf_mem_pressure_shutdown() +ATTRIBUTE_COLD void buf_mem_pressure_shutdown() noexcept { mem_pressure_obj.join(); } -#endif /* __linux__ */ +#endif + +#if defined __linux__ || !defined DBUG_OFF +inline void buf_pool_t::garbage_collect() noexcept +{ + mysql_mutex_lock(&mutex); + const size_t old_size{size_in_bytes}, min_size{size_in_bytes_auto_min}; + const size_t reduce_size= + std::max(innodb_buffer_pool_extent_size, + ut_calc_align((old_size - min_size) / 2, + innodb_buffer_pool_extent_size)); + if (old_size < min_size + reduce_size || + first_to_withdraw || old_size != size_in_bytes_requested) + { + mysql_mutex_unlock(&mutex); + sql_print_information("InnoDB: Memory pressure event disregarded;" + " innodb_buffer_pool_size=%zum," + " innodb_buffer_pool_size_min=%zum", + old_size >> 20, min_size >> 20); + return; + } + + size_t size= old_size - reduce_size; + size_t n_blocks_new= get_n_blocks(size); + + ut_ad(UT_LIST_GET_LEN(withdrawn) == 0); + ut_ad(n_blocks_to_withdraw == 0); + + n_blocks_to_withdraw= n_blocks - n_blocks_new; + first_to_withdraw= &get_nth_page(n_blocks_new)->page; + + size_in_bytes_requested= size; + mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&flush_list_mutex); + page_cleaner_wakeup(true); + my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + mysql_mutex_unlock(&flush_list_mutex); +# ifdef BTR_CUR_HASH_ADAPT + bool ahi_disabled= btr_search_disable(); +# endif /* BTR_CUR_HASH_ADAPT */ + time_t start= time(nullptr); + mysql_mutex_lock(&mutex); + + do + { + if (shrink(size)) + { + const size_t old_blocks{n_blocks}; + n_blocks= n_blocks_new; + + size_t s= n_blocks_new / BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(uint32(s)); + + os_total_large_mem_allocated-= reduce_size; + shrunk(size, reduce_size); + ibuf_max_size_update(srv_change_buffer_max_size); +# ifdef BTR_CUR_HASH_ADAPT + if (ahi_disabled) + btr_search_enable(true); +# endif + mysql_mutex_unlock(&mutex); + sql_print_information("InnoDB: Memory pressure event shrunk" + " innodb_buffer_pool_size=%zum (%zu pages)" + " from %zum (%zu pages)", + size >> 20, n_blocks_new, old_size >> 20, + old_blocks); + ut_d(validate()); + return; + } + } + while (time(nullptr) - start < 15); + + ut_ad(size_in_bytes > size_in_bytes_requested); + n_blocks_to_withdraw= 0; + first_to_withdraw= nullptr; + size_in_bytes_requested= size_in_bytes; + + while (buf_page_t *b= UT_LIST_GET_FIRST(withdrawn)) + { + UT_LIST_REMOVE(withdrawn, b); + UT_LIST_ADD_LAST(free, b); + ut_d(b->in_free_list= true); + ut_ad(b->state() == buf_page_t::NOT_USED); + b->lock.init(); + } + + mysql_mutex_unlock(&mutex); + sql_print_information("InnoDB: Memory pressure event failed to shrink" + " innodb_buffer_pool_size=%zum", old_size); + ut_d(validate()); +} +#endif #if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) -/** Enable buffers to be dumped to core files +/** Enable buffers to be dumped to core files. -A convience function, not called anyhwere directly however +A convenience function, not called anyhwere directly however it is left available for gdb or any debugger to call in the event that you want all of the memory to be dumped to a core file. -Returns number of errors found in madvise calls. 
*/ +@return number of errors found in madvise() calls */ MY_ATTRIBUTE((used)) -int -buf_madvise_do_dump() +int buf_pool_t::madvise_do_dump() noexcept { int ret= 0; @@ -991,20 +1136,13 @@ MADV_DODUMP); } - mysql_mutex_lock(&buf_pool.mutex); - auto chunk = buf_pool.chunks; - - for (ulint n = buf_pool.n_chunks; n--; chunk++) { - ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP); - } - - mysql_mutex_unlock(&buf_pool.mutex); + ret+= madvise(buf_pool.memory, buf_pool.size_in_bytes, MADV_DODUMP); return ret; } #endif #ifndef UNIV_DEBUG -static inline byte hex_to_ascii(byte hex_digit) +static inline byte hex_to_ascii(byte hex_digit) noexcept { const int offset= hex_digit <= 9 ? '0' : 'a' - 10; return byte(hex_digit + offset); @@ -1040,163 +1178,80 @@ #endif } -/** Initialize a buffer page descriptor. -@param[in,out] block buffer page descriptor -@param[in] frame buffer page frame */ -static -void -buf_block_init(buf_block_t* block, byte* frame) +IF_DBUG(,inline) byte *buf_block_t::frame_address() const noexcept { - /* This function should only be executed at database startup or by - buf_pool.resize(). Either way, adaptive hash index must not exist. 
*/ - assert_block_ahi_empty_on_init(block); - - block->page.frame = frame; + static_assert(ut_is_2pow(innodb_buffer_pool_extent_size), ""); - MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock); - ut_ad(!block->modify_clock); - MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock); - block->page.lock.init(); - block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL)); -#ifdef BTR_CUR_HASH_ADAPT - MEM_MAKE_DEFINED(&block->index, sizeof block->index); - ut_ad(!block->index); -#endif /* BTR_CUR_HASH_ADAPT */ - ut_d(block->in_unzip_LRU_list = false); - ut_d(block->in_withdraw_list = false); - - page_zip_des_init(&block->page.zip); - - MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash); - ut_ad(!block->page.hash); + byte *frame_= reinterpret_cast + ((reinterpret_cast(this) & ~(innodb_buffer_pool_extent_size - 1)) | + first_frame_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]); + ut_ad(reinterpret_cast(this) + sizeof(*this) <= frame_); + frame_+= + (((reinterpret_cast(this) & (innodb_buffer_pool_extent_size - 1)) / + sizeof(*this)) << srv_page_size_shift); + return frame_; } -/** Allocate a chunk of buffer frames. -@param bytes requested size -@return whether the allocation succeeded */ -inline bool buf_pool_t::chunk_t::create(size_t bytes) noexcept +buf_block_t *buf_pool_t::block_from(const void *ptr) noexcept { - DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;); - /* Round down to a multiple of page size, although it already should be. 
*/ - bytes= ut_2pow_round(bytes, srv_page_size); - - mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx); - - if (UNIV_UNLIKELY(!mem)) - return false; - - MEM_UNDEFINED(mem, mem_size()); - -#ifdef HAVE_LIBNUMA - if (srv_numa_interleave) - { - struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); - MEM_MAKE_DEFINED(numa_mems_allowed, sizeof *numa_mems_allowed); - if (mbind(mem, mem_size(), MPOL_INTERLEAVE, - numa_mems_allowed->maskp, numa_mems_allowed->size, - MPOL_MF_MOVE)) - { - ib::warn() << "Failed to set NUMA memory policy of" - " buffer pool page frames to MPOL_INTERLEAVE" - " (error: " << strerror(errno) << ")."; - } - numa_bitmask_free(numa_mems_allowed); - } -#endif /* HAVE_LIBNUMA */ - - - /* Allocate the block descriptors from - the start of the memory block. */ - blocks= reinterpret_cast(mem); - - /* Align a pointer to the first frame. Note that when - opt_large_page_size is smaller than srv_page_size, - (with max srv_page_size at 64k don't think any hardware - makes this true), - we may allocate one fewer block than requested. When - it is bigger, we may allocate more blocks than requested. */ - static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); - - byte *frame= reinterpret_cast((reinterpret_cast(mem) + - srv_page_size - 1) & - ~ulint{srv_page_size - 1}); - size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem); - - /* Subtract the space needed for block descriptors. */ - { - ulint s= size; - - while (frame < reinterpret_cast(blocks + s)) - { - frame+= srv_page_size; - s--; - } - - size= s; - } - - /* Init block structs and assign frames for them. Then we assign the - frames to the first blocks (we already mapped the memory above). 
*/ - - buf_block_t *block= blocks; + static_assert(ut_is_2pow(innodb_buffer_pool_extent_size), ""); + ut_ad(static_cast(ptr) >= buf_pool.memory); - for (auto i= size; i--; ) { - buf_block_init(block, frame); - MEM_UNDEFINED(block->page.frame, srv_page_size); - /* Add the block to the free list */ - UT_LIST_ADD_LAST(buf_pool.free, &block->page); + byte *first_block= reinterpret_cast + (reinterpret_cast(ptr) & ~(innodb_buffer_pool_extent_size - 1)); + const size_t first_frame= + first_frame_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; - ut_d(block->page.in_free_list = TRUE); - block++; - frame+= srv_page_size; - } - - reg(); - - return true; + ut_ad(static_cast(ptr) >= first_block + first_frame); + return reinterpret_cast(first_block) + + (((size_t(ptr) & (innodb_buffer_pool_extent_size - 1)) - first_frame) >> + srv_page_size_shift); } -#ifdef UNIV_DEBUG -/** Check that all file pages in the buffer chunk are in a replaceable state. -@return address of a non-free block -@retval nullptr if all freed */ -inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const noexcept +/** Determine the address of the first invalid block descriptor +@param n_blocks buf_pool.n_blocks +@return offset of the first invalid buf_block_t, relative to buf_pool.memory */ +static size_t block_descriptors_in_bytes(size_t n_blocks) noexcept { - buf_block_t *block= blocks; - for (auto i= size; i--; block++) - { - if (block->page.in_file()) - { - /* The uncompressed buffer pool should never - contain ROW_FORMAT=COMPRESSED block descriptors. */ - ut_ad(block->page.frame); - const lsn_t lsn= block->page.oldest_modification(); + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; + const size_t extent_size= pages_in_extent[ssize]; + return n_blocks / extent_size * innodb_buffer_pool_extent_size + + (n_blocks % extent_size) * sizeof(buf_block_t); +} - if (srv_read_only_mode) - { - /* The page cleaner is disabled in read-only mode. 
No pages - can be dirtied, so all of them must be clean. */ - ut_ad(lsn == 0 || lsn == recv_sys.lsn || - srv_force_recovery == SRV_FORCE_NO_LOG_REDO); - break; - } +buf_block_t *buf_pool_t::get_nth_page(size_t pos) const noexcept +{ + mysql_mutex_assert_owner(&mutex); + ut_ad(pos < n_blocks); + return reinterpret_cast + (memory + block_descriptors_in_bytes(pos)); +} - if (fsp_is_system_temporary(block->page.id().space())) - { - ut_ad(lsn == 0 || lsn == 2); - break; - } +buf_block_t *buf_pool_t::allocate() noexcept +{ + mysql_mutex_assert_owner(&mutex); - if (lsn > 1 || !block->page.can_relocate()) - return block; + while (buf_page_t *b= UT_LIST_GET_FIRST(free)) + { + ut_ad(b->in_free_list); + ut_d(b->in_free_list = FALSE); + ut_ad(!b->oldest_modification()); + ut_ad(!b->in_LRU_list); + ut_a(!b->in_file()); + UT_LIST_REMOVE(free, b); - break; + if (UNIV_LIKELY(!n_blocks_to_withdraw) || !withdraw(*b)) + { + /* No adaptive hash index entries may point to a free block. */ + assert_block_ahi_empty(reinterpret_cast(b)); + b->set_state(buf_page_t::MEMORY); + b->set_os_used(); + return reinterpret_cast(b); } } return nullptr; } -#endif /* UNIV_DEBUG */ /** Create the hash table. 
@param n the lower bound of n_cells */ @@ -1210,96 +1265,189 @@ array= static_cast(v); } +size_t buf_pool_t::get_n_blocks(size_t size_in_bytes) noexcept +{ + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; + size_t n_blocks_alloc= size_in_bytes / innodb_buffer_pool_extent_size * + pages_in_extent[ssize]; + + if (const size_t incomplete_extent_pages= + (size_in_bytes & (innodb_buffer_pool_extent_size - 1)) >> + srv_page_size_shift) + { + ssize_t d= incomplete_extent_pages - first_page_in_extent[ssize]; + ut_ad(d > 0); + n_blocks_alloc+= d; + } + + return n_blocks_alloc; +} + +size_t buf_pool_t::blocks_in_bytes(size_t n_blocks) noexcept +{ + const size_t shift{srv_page_size_shift}; + const size_t ssize{shift - UNIV_PAGE_SIZE_SHIFT_MIN}; + const size_t extent_size= pages_in_extent[ssize]; + size_t size_in_bytes= n_blocks / extent_size * + innodb_buffer_pool_extent_size; + if (size_t remainder= n_blocks % extent_size) + size_in_bytes+= (remainder + first_page_in_extent[ssize]) << shift; + ut_ad(get_n_blocks(size_in_bytes) == n_blocks); + return size_in_bytes; +} + /** Create the buffer pool. 
@return whether the creation failed */ -bool buf_pool_t::create() +bool buf_pool_t::create() noexcept { ut_ad(this == &buf_pool); - ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0); ut_ad(!is_initialised()); - ut_ad(srv_buf_pool_size > 0); - ut_ad(!resizing); - ut_ad(!chunks_old); + ut_ad(size_in_bytes_requested > 0); + ut_ad(!(size_in_bytes_max & (innodb_buffer_pool_extent_size - 1))); + ut_ad(!(size_in_bytes_requested & ((1U << 20) - 1))); + ut_ad(size_in_bytes_requested <= size_in_bytes_max); /* mariabackup loads tablespaces, and it requires field_ref_zero to be allocated before innodb initialization */ ut_ad(srv_operation >= SRV_OPERATION_RESTORE || !field_ref_zero); - NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; - - if (!field_ref_zero) { + if (!field_ref_zero) + { if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) + { field_ref_zero= static_cast (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); - else - return true; + goto init; + } + + oom: + ut_ad(!is_initialised()); + sql_print_error("InnoDB: Cannot map innodb_buffer_pool_size_max=%zum", + size_in_bytes_max >> 20); + return true; + } + + init: + DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", goto oom;); + size_t size= size_in_bytes_max; + sql_print_information("InnoDB: innodb_buffer_pool_size_max=%zum," + " innodb_buffer_pool_size=%zum", + size >> 20, size_in_bytes_requested >> 20); + + retry: + { + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; +#ifdef _WIN32 + memory_unaligned= my_virtual_mem_reserve(&size); +#else + memory_unaligned= my_large_virtual_alloc(&size); +#endif } - chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map()); + if (!memory_unaligned) + goto oom; - new(&allocator) ut_allocator(mem_key_buf_buf_pool); + const size_t alignment_waste= + ((~size_t(memory_unaligned) & (innodb_buffer_pool_extent_size - 1)) + 1) & + (innodb_buffer_pool_extent_size - 1); - n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; - const size_t chunk_size= srv_buf_pool_chunk_unit; + if (size < size_in_bytes_max + alignment_waste) + 
{ + my_virtual_mem_release(memory_unaligned, size); + size+= 1 + + (~size_t(memory_unaligned) & (innodb_buffer_pool_extent_size - 1)); + goto retry; + } - chunks= static_cast(ut_zalloc_nokey(n_chunks * sizeof *chunks)); - UT_LIST_INIT(free, &buf_page_t::list); - curr_size= 0; - auto chunk= chunks; + MEM_UNDEFINED(memory_unaligned, size); + ut_dontdump(memory_unaligned, size, true); + memory= memory_unaligned + alignment_waste; + size_unaligned= size; + size-= alignment_waste; + size&= ~(innodb_buffer_pool_extent_size - 1); - do + const size_t actual_size= size_in_bytes_requested; + ut_ad(actual_size <= size); + + size_in_bytes= actual_size; + os_total_large_mem_allocated+= actual_size; + +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_alloc)(mem_key_buf_buf_pool, actual_size, &owner); +#endif +#ifdef _WIN32 + if (!my_virtual_mem_commit(memory, actual_size)) { - if (!chunk->create(chunk_size)) - { - while (--chunk >= chunks) - { - buf_block_t* block= chunk->blocks; + my_virtual_mem_release(memory_unaligned, size_unaligned); + memory= nullptr; + memory_unaligned= nullptr; + goto oom; + } +#else + update_malloc_size(actual_size, 0); +#endif - for (auto i= chunk->size; i--; block++) - block->page.lock.free(); +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) + { + struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + MEM_MAKE_DEFINED(numa_mems_allowed, sizeof *numa_mems_allowed); + if (mbind(memory_unaligned, size_unaligned, MPOL_INTERLEAVE, + numa_mems_allowed->maskp, numa_mems_allowed->size, + MPOL_MF_MOVE)) + sql_print_warning("InnoDB: Failed to set NUMA memory policy of" + " buffer pool page frames to MPOL_INTERLEAVE" + " (error: %s).", strerror(errno)); + numa_bitmask_free(numa_mems_allowed); + } +#endif /* HAVE_LIBNUMA */ - allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); - } - ut_free(chunks); - chunks= nullptr; - UT_DELETE(chunk_t::map_reg); - chunk_t::map_reg= nullptr; - aligned_free(const_cast(field_ref_zero)); - field_ref_zero= nullptr; 
- ut_ad(!is_initialised()); - return true; - } + n_blocks= get_n_blocks(actual_size); + n_blocks_to_withdraw= 0; + UT_LIST_INIT(free, &buf_page_t::list); + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; - curr_size+= chunk->size; + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + { + buf_block_t *block= reinterpret_cast(extent); + const buf_block_t *extent_end= block + pages_in_extent[ssize]; + if (reinterpret_cast(extent_end) > end) + extent_end= reinterpret_cast(end); + MEM_MAKE_DEFINED(block, (extent_end - block) * sizeof *block); + for (byte *frame= reinterpret_cast(extent) + + first_frame_in_extent[ssize]; + block < extent_end; block++, frame+= srv_page_size) + { + ut_ad(!memcmp(block, field_ref_zero, sizeof *block)); + block->page.frame= frame; + block->page.lock.init(); + UT_LIST_ADD_LAST(free, &block->page); + ut_d(block->page.in_free_list= true); + } } - while (++chunk < chunks + n_chunks); - ut_ad(is_initialised()); #if defined(__aarch64__) mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); #else mysql_mutex_init(buf_pool_mutex_key, &mutex, nullptr); #endif + UT_LIST_INIT(withdrawn, &buf_page_t::list); UT_LIST_INIT(LRU, &buf_page_t::LRU); - UT_LIST_INIT(withdraw, &buf_page_t::list); - withdraw_target= 0; UT_LIST_INIT(flush_list, &buf_page_t::list); UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); - ulint s= curr_size; + ulint s= n_blocks; s/= BUF_READ_AHEAD_PORTION; read_ahead_area= s >= READ_AHEAD_PAGES ? 
READ_AHEAD_PAGES : my_round_up_to_next_power(static_cast(s)); - curr_pool_size= srv_buf_pool_size; - n_chunks_new= n_chunks; - - page_hash.create(2 * curr_size); - zip_hash.create(2 * curr_size); - last_printout_time= time(NULL); + page_hash.create(2 * n_blocks); + last_printout_time= time(nullptr); mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex, MY_MUTEX_INIT_FAST); @@ -1318,14 +1466,8 @@ io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * OS_AIO_N_PENDING_IOS_PER_THREAD); - /* FIXME: remove some of these variables */ - srv_buf_pool_curr_size= curr_pool_size; - srv_buf_pool_old_size= srv_buf_pool_size; - srv_buf_pool_base_size= srv_buf_pool_size; - last_activity_count= srv_get_activity_count(); - chunk_t::map_ref= chunk_t::map_reg; buf_LRU_old_ratio_update(100 * 3 / 8, false); btr_search_sys_create(); @@ -1334,6 +1476,7 @@ buf_mem_pressure_detect_init(); #endif ut_ad(is_initialised()); + sql_print_information("InnoDB: Completed initialization of buffer pool"); return false; } @@ -1368,14 +1511,31 @@ } } - for (auto chunk= chunks + n_chunks; --chunk >= chunks; ) { - buf_block_t *block= chunk->blocks; + const size_t size{size_in_bytes}; - for (auto i= chunk->size; i--; block++) - block->page.lock.free(); + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + for (buf_block_t *block= reinterpret_cast(extent), + *extent_end= block + + pages_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; + block < extent_end && reinterpret_cast(block) < end; block++) + { + MEM_MAKE_DEFINED(&block->page.lock, sizeof &block->page.lock); + block->page.lock.free(); + } - allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + ut_dodump(memory_unaligned, size_unaligned); +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_free)(mem_key_buf_buf_pool, size, owner); + owner= nullptr; +#endif + os_total_large_mem_allocated-= size; + my_virtual_mem_decommit(memory, 
size); + my_virtual_mem_release(memory_unaligned, size_unaligned); + memory= nullptr; + memory_unaligned= nullptr; } pthread_cond_destroy(&done_flush_LRU); @@ -1383,137 +1543,13 @@ pthread_cond_destroy(&do_flush_list); pthread_cond_destroy(&done_free); - ut_free(chunks); - chunks= nullptr; page_hash.free(); - zip_hash.free(); io_buf.close(); - UT_DELETE(chunk_t::map_reg); - chunk_t::map_reg= chunk_t::map_ref= nullptr; aligned_free(const_cast(field_ref_zero)); field_ref_zero= nullptr; } -/** Try to reallocate a control block. -@param block control block to reallocate -@return whether the reallocation succeeded */ -inline bool buf_pool_t::realloc(buf_block_t *block) noexcept -{ - buf_block_t* new_block; - - mysql_mutex_assert_owner(&mutex); - ut_ad(block->page.in_file()); - ut_ad(block->page.frame); - - new_block = buf_LRU_get_free_only(); - - if (new_block == NULL) { - mysql_mutex_lock(&buf_pool.flush_list_mutex); - page_cleaner_wakeup(); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - return(false); /* free list was not enough */ - } - - const page_id_t id{block->page.id()}; - hash_chain& chain = page_hash.cell_get(id.fold()); - page_hash_latch& hash_lock = page_hash.lock_get(chain); - /* It does not make sense to use transactional_lock_guard - here, because copying innodb_page_size (4096 to 65536) bytes - as well as other changes would likely make the memory - transaction too large. 
*/ - hash_lock.lock(); - - if (block->page.can_relocate()) { - memcpy_aligned( - new_block->page.frame, block->page.frame, - srv_page_size); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - const auto frame = new_block->page.frame; - new_block->page.lock.free(); - new (&new_block->page) buf_page_t(block->page); - new_block->page.frame = frame; - - /* relocate LRU list */ - if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) { - UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page); - } else { - UT_LIST_ADD_FIRST(LRU, &new_block->page); - } - - if (LRU_old == &block->page) { - LRU_old = &new_block->page; - } - - ut_ad(new_block->page.in_LRU_list); - - /* relocate unzip_LRU list */ - if (block->page.zip.data != NULL) { - ut_ad(block->in_unzip_LRU_list); - ut_d(new_block->in_unzip_LRU_list = true); - - buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); - UT_LIST_REMOVE(unzip_LRU, block); - - ut_d(block->in_unzip_LRU_list = false); - block->page.zip.data = NULL; - page_zip_set_size(&block->page.zip, 0); - - if (prev_block != NULL) { - UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block); - } else { - UT_LIST_ADD_FIRST(unzip_LRU, new_block); - } - } else { - ut_ad(!block->in_unzip_LRU_list); - ut_d(new_block->in_unzip_LRU_list = false); - } - - /* relocate page_hash */ - hash_chain& chain = page_hash.cell_get(id.fold()); - ut_ad(&block->page == page_hash.get(id, chain)); - buf_pool.page_hash.replace(chain, &block->page, - &new_block->page); - buf_block_modify_clock_inc(block); - static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); - memset_aligned<4>(block->page.frame - + FIL_PAGE_OFFSET, 0xff, 4); - static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, - "not perfect alignment"); - memset_aligned<2>(block->page.frame - + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); - MEM_UNDEFINED(block->page.frame, srv_page_size); - block->page.set_state(buf_page_t::REMOVE_HASH); - if (!fsp_is_system_temporary(id.space())) { - 
buf_flush_relocate_on_flush_list(&block->page, - &new_block->page); - } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - block->page.set_corrupt_id(); - - /* set other flags of buf_block_t */ - -#ifdef BTR_CUR_HASH_ADAPT - /* This code should only be executed by resize(), - while the adaptive hash index is disabled. */ - assert_block_ahi_empty(block); - assert_block_ahi_empty_on_init(new_block); - ut_ad(!block->index); - new_block->index = NULL; - new_block->n_hash_helps = 0; - new_block->n_fields = 1; - new_block->left_side = TRUE; -#endif /* BTR_CUR_HASH_ADAPT */ - ut_d(block->page.set_state(buf_page_t::MEMORY)); - /* free block */ - new_block = block; - } - - hash_lock.unlock(); - buf_LRU_block_free_non_file_page(new_block); - return(true); /* free_list was enough */ -} - void buf_pool_t::io_buf_t::create(ulint n_slots) noexcept { this->n_slots= n_slots; @@ -1552,720 +1588,528 @@ } } -/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status -to the specified string. The format and the following parameters are the -same as the ones used for printf(3). -@param[in] fmt format -@param[in] ... extra parameters according to fmt */ -static -void -buf_resize_status( - const char* fmt, - ...) +ATTRIBUTE_COLD bool buf_pool_t::withdraw(buf_page_t &bpage) noexcept { - va_list ap; - - va_start(ap, fmt); - - vsnprintf( - export_vars.innodb_buffer_pool_resize_status, - sizeof(export_vars.innodb_buffer_pool_resize_status), - fmt, ap); - - va_end(ap); - - ib::info() << export_vars.innodb_buffer_pool_resize_status; + mysql_mutex_assert_owner(&mutex); + ut_ad(n_blocks_to_withdraw); + ut_ad(first_to_withdraw); + ut_ad(!bpage.zip.data); + if (&bpage < first_to_withdraw) + return false; + n_blocks_to_withdraw--; + bpage.lock.free(); + UT_LIST_ADD_LAST(withdrawn, &bpage); + return true; } -/** Withdraw blocks from the buffer pool until meeting withdraw_target. 
-@return whether retry is needed */ -inline bool buf_pool_t::withdraw_blocks() noexcept +ATTRIBUTE_COLD buf_pool_t::shrink_status buf_pool_t::shrink(size_t size) + noexcept { - buf_block_t* block; - ulint loop_count = 0; - - ib::info() << "Start to withdraw the last " - << withdraw_target << " blocks."; - - while (UT_LIST_GET_LEN(withdraw) < withdraw_target) { - - /* try to withdraw from free_list */ - ulint count1 = 0; + mysql_mutex_assert_owner(&mutex); + buf_load_abort(); - mysql_mutex_lock(&mutex); - buf_buddy_condense_free(); - block = reinterpret_cast( - UT_LIST_GET_FIRST(free)); - while (block != NULL - && UT_LIST_GET_LEN(withdraw) < withdraw_target) { - ut_ad(block->page.in_free_list); - ut_ad(!block->page.oldest_modification()); - ut_ad(!block->page.in_LRU_list); - ut_a(!block->page.in_file()); - - buf_block_t* next_block; - next_block = reinterpret_cast( - UT_LIST_GET_NEXT( - list, &block->page)); - - if (will_be_withdrawn(block->page)) { - /* This should be withdrawn */ - UT_LIST_REMOVE(free, &block->page); - UT_LIST_ADD_LAST(withdraw, &block->page); - ut_d(block->in_withdraw_list = true); - count1++; - } - - block = next_block; - } - - /* reserve free_list length */ - if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { - try_LRU_scan = false; - mysql_mutex_unlock(&mutex); - mysql_mutex_lock(&flush_list_mutex); - page_cleaner_wakeup(true); - my_cond_wait(&done_flush_list, - &flush_list_mutex.m_mutex); - mysql_mutex_unlock(&flush_list_mutex); - mysql_mutex_lock(&mutex); - } - - /* relocate blocks/buddies in withdrawn area */ - ulint count2 = 0; - - buf_pool_mutex_exit_forbid(); - for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; - bpage; bpage = next_bpage) { - ut_ad(bpage->in_file()); - next_bpage = UT_LIST_GET_NEXT(LRU, bpage); - if (UNIV_LIKELY_NULL(bpage->zip.data) - && will_be_withdrawn(bpage->zip.data) - && bpage->can_relocate()) { - if (!buf_buddy_realloc( - bpage->zip.data, - page_zip_get_size(&bpage->zip))) { - /* failed to allocate 
block */ - break; - } - count2++; - if (bpage->frame) { - goto realloc_frame; - } - } - - if (bpage->frame && will_be_withdrawn(*bpage) - && bpage->can_relocate()) { -realloc_frame: - if (!realloc(reinterpret_cast( - bpage))) { - /* failed to allocate block */ - break; - } - count2++; - } - } - buf_pool_mutex_exit_allow(); - mysql_mutex_unlock(&mutex); - - buf_resize_status( - "Withdrawing blocks. (" ULINTPF "/" ULINTPF ").", - UT_LIST_GET_LEN(withdraw), - withdraw_target); - - ib::info() << "Withdrew " - << count1 << " blocks from free list." - << " Tried to relocate " << count2 << " blocks (" - << UT_LIST_GET_LEN(withdraw) << "/" - << withdraw_target << ")."; - - if (++loop_count >= 10) { - /* give up for now. - retried after user threads paused. */ - - ib::info() << "will retry to withdraw later"; - - /* need retry later */ - return(true); - } - } - - /* confirm withdrawn enough */ - for (const chunk_t* chunk = chunks + n_chunks_new, - * const echunk = chunks + n_chunks; chunk != echunk; chunk++) { - block = chunk->blocks; - for (ulint j = chunk->size; j--; block++) { - ut_a(block->page.state() == buf_page_t::NOT_USED); - ut_ad(block->in_withdraw_list); - } - } - - ib::info() << "Withdrawn target: " << UT_LIST_GET_LEN(withdraw) - << " blocks."; - - return(false); -} - - - -inline void buf_pool_t::page_hash_table::write_lock_all() noexcept -{ - for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + if (!n_blocks_to_withdraw) { - reinterpret_cast(array[n]).lock(); - if (!n) - break; + withdraw_done: + first_to_withdraw= nullptr; + while (buf_page_t *b= UT_LIST_GET_FIRST(withdrawn)) + { + UT_LIST_REMOVE(withdrawn, b); + /* satisfy the check in lazy_allocate() */ + ut_d(memset((void*) b, 0, sizeof(buf_block_t))); + } + return SHRINK_DONE; } -} + buf_buddy_condense_free(size); -inline void buf_pool_t::page_hash_table::write_unlock_all() noexcept -{ - for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + for 
(buf_page_t *b= UT_LIST_GET_FIRST(free), *next; b; b= next) { - reinterpret_cast(array[n]).unlock(); - if (!n) - break; - } -} + ut_ad(b->in_free_list); + ut_ad(!b->in_LRU_list); + ut_ad(!b->zip.data); + ut_ad(!b->oldest_modification()); + ut_a(b->state() == buf_page_t::NOT_USED); + next= UT_LIST_GET_NEXT(list, b); -namespace -{ - -struct find_interesting_trx -{ - void operator()(const trx_t &trx) - { - if (!trx.is_started()) - return; - if (trx.mysql_thd == nullptr) - return; - if (withdraw_started <= trx.start_time_micro) - return; - - if (!found) + if (b >= first_to_withdraw) { - sql_print_warning("InnoDB: The following trx might hold " - "the blocks in buffer pool to " - "be withdrawn. Buffer pool " - "resizing can complete only " - "after all the transactions " - "below release the blocks."); - found= true; + UT_LIST_REMOVE(free, b); + b->lock.free(); + UT_LIST_ADD_LAST(withdrawn, b); + if (!--n_blocks_to_withdraw) + goto withdraw_done; } - - lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time); } - bool &found; - /** microsecond_interval_timer() */ - const ulonglong withdraw_started; - const my_hrtime_t current_time; -}; - -} // namespace - -/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. 
*/ -inline void buf_pool_t::resize() -{ - ut_ad(this == &buf_pool); - ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); - - bool warning = false; - - NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; - - ut_ad(!resize_in_progress()); - ut_ad(srv_buf_pool_chunk_unit > 0); - - ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift; - std::ostringstream str_old_size, str_new_size, str_chunk_size; - str_old_size << ib::bytes_iec{srv_buf_pool_old_size}; - str_new_size << ib::bytes_iec{srv_buf_pool_size}; - str_chunk_size << ib::bytes_iec{srv_buf_pool_chunk_unit}; + buf_block_t *block= allocate(); + size_t scanned= 0; + for (buf_page_t *b= lru_scan_itr.start(), *prev; block && b; b= prev) + { + ut_ad(b->in_LRU_list); + ut_a(b->in_file()); - buf_resize_status("Resizing buffer pool from %s to %s (unit = %s).", - str_old_size.str().c_str(), - str_new_size.str().c_str(), - str_chunk_size.str().c_str()); + prev= UT_LIST_GET_PREV(LRU, b); -#ifdef BTR_CUR_HASH_ADAPT - /* disable AHI if needed */ - buf_resize_status("Disabling adaptive hash index."); + if (!b->can_relocate()) + { + next: + if (++scanned & 31) + continue; + /* Avoid starvation by periodically releasing buf_pool.mutex. 
*/ + lru_scan_itr.set(prev); + mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&mutex); + prev= lru_scan_itr.get(); + continue; + } - btr_search_s_lock_all(); - const bool btr_search_disabled = btr_search_enabled; - btr_search_s_unlock_all(); + const page_id_t id{b->id()}; + hash_chain &chain= page_hash.cell_get(id.fold()); + page_hash_latch &hash_lock= page_hash.lock_get(chain); + hash_lock.lock(); - btr_search_disable(); + { + /* relocate flush_list and b->page.zip */ + bool have_flush_list_mutex= false; - if (btr_search_disabled) { - ib::info() << "disabled adaptive hash index."; - } -#endif /* BTR_CUR_HASH_ADAPT */ + switch (b->oldest_modification()) { + case 2: + ut_ad(fsp_is_system_temporary(id.space())); + /* fall through */ + case 0: + break; + default: + mysql_mutex_lock(&flush_list_mutex); + switch (ut_d(lsn_t om=) b->oldest_modification()) { + case 1: + delete_from_flush_list(b); + /* fall through */ + case 0: + mysql_mutex_unlock(&flush_list_mutex); + break; + default: + ut_ad(om != 2); + have_flush_list_mutex= true; + } + } - mysql_mutex_lock(&mutex); - ut_ad(n_chunks_new == n_chunks); - ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + if (!b->can_relocate()) + { + next_quick: + if (have_flush_list_mutex) + mysql_mutex_unlock(&flush_list_mutex); + hash_lock.unlock(); + continue; + } - n_chunks_new = (new_instance_size << srv_page_size_shift) - / srv_buf_pool_chunk_unit; - curr_size = n_chunks_new * chunks->size; - mysql_mutex_unlock(&mutex); + if (UNIV_UNLIKELY(will_be_withdrawn(b->zip.data, size))) + { + block= buf_buddy_shrink(b, block); + ut_ad(mach_read_from_4(b->zip.data + FIL_PAGE_OFFSET) == id.page_no()); + if (UNIV_UNLIKELY(!n_blocks_to_withdraw)) + { + if (have_flush_list_mutex) + mysql_mutex_unlock(&flush_list_mutex); + hash_lock.unlock(); + if (block) + buf_LRU_block_free_non_file_page(block); + goto withdraw_done; + } + if (!block && !(block= allocate())) + goto next_quick; + } - if (is_shrinking()) { - /* set withdraw target */ - size_t w = 0; + if 
(!b->frame || b < first_to_withdraw) + goto next_quick; - for (const chunk_t* chunk = chunks + n_chunks_new, - * const echunk = chunks + n_chunks; - chunk != echunk; chunk++) - w += chunk->size; + ut_ad(is_uncompressed_current(b)); - ut_ad(withdraw_target == 0); - withdraw_target = w; - } + byte *const frame= block->page.frame; + memcpy_aligned<4096>(frame, b->frame, srv_page_size); + b->lock.free(); + block->page.lock.free(); + new(&block->page) buf_page_t(*b); + block->page.frame= frame; - buf_resize_status("Withdrawing blocks to be shrunken."); + if (have_flush_list_mutex) + { + buf_flush_relocate_on_flush_list(b, &block->page); + mysql_mutex_unlock(&flush_list_mutex); + } + } - ulonglong withdraw_started = microsecond_interval_timer(); - ulonglong message_interval = 60ULL * 1000 * 1000; - ulint retry_interval = 1; + /* relocate LRU list */ + if (buf_page_t *prev_b= LRU_remove(b)) + UT_LIST_INSERT_AFTER(LRU, prev_b, &block->page); + else + UT_LIST_ADD_FIRST(LRU, &block->page); -withdraw_retry: - /* wait for the number of blocks fit to the new size (if needed)*/ - bool should_retry_withdraw = is_shrinking() - && withdraw_blocks(); + if (LRU_old == b) + LRU_old= &block->page; - if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { - /* abort to resize for shutdown. 
*/ - return; - } + ut_ad(block->page.in_LRU_list); - /* abort buffer pool load */ - buf_load_abort(); + /* relocate page_hash */ + ut_ad(b == page_hash.get(id, chain)); + page_hash.replace(chain, b, &block->page); - const ulonglong current_time = microsecond_interval_timer(); + if (b->zip.data) + { + ut_ad(mach_read_from_4(b->zip.data + FIL_PAGE_OFFSET) == id.page_no()); + b->zip.data= nullptr; + /* relocate unzip_LRU list */ + buf_block_t *old_block= reinterpret_cast(b); + ut_ad(old_block->in_unzip_LRU_list); + ut_d(old_block->in_unzip_LRU_list= false); + ut_d(block->in_unzip_LRU_list= true); - if (should_retry_withdraw - && current_time - withdraw_started >= message_interval) { + buf_block_t *prev= UT_LIST_GET_PREV(unzip_LRU, old_block); + UT_LIST_REMOVE(unzip_LRU, old_block); - if (message_interval > 900000000) { - message_interval = 1800000000; - } else { - message_interval *= 2; - } + if (prev) + UT_LIST_INSERT_AFTER(unzip_LRU, prev, block); + else + UT_LIST_ADD_FIRST(unzip_LRU, block); + } - bool found= false; - find_interesting_trx f - {found, withdraw_started, my_hrtime_coarse()}; - withdraw_started = current_time; - - /* This is going to exceed the maximum size of a - memory transaction. 
*/ - LockMutexGuard g{SRW_LOCK_CALL}; - trx_sys.trx_list.for_each(f); - } - - if (should_retry_withdraw) { - ib::info() << "Will retry to withdraw " << retry_interval - << " seconds later."; - std::this_thread::sleep_for( - std::chrono::seconds(retry_interval)); + buf_block_modify_clock_inc(block); - if (retry_interval > 5) { - retry_interval = 10; - } else { - retry_interval *= 2; - } +#ifdef BTR_CUR_HASH_ADAPT + assert_block_ahi_empty_on_init(block); + block->index= nullptr; + block->n_hash_helps= 0; + block->n_fields= 1; + block->left_side= true; +#endif /* BTR_CUR_HASH_ADAPT */ + hash_lock.unlock(); - goto withdraw_retry; - } + ut_d(b->in_LRU_list= false); - buf_resize_status("Latching entire buffer pool."); + b->set_state(buf_page_t::NOT_USED); + UT_LIST_ADD_LAST(withdrawn, b); + if (!--n_blocks_to_withdraw) + goto withdraw_done; -#ifndef DBUG_OFF - { - bool should_wait = true; + block= allocate(); + goto next; + } - while (should_wait) { - should_wait = false; - DBUG_EXECUTE_IF( - "ib_buf_pool_resize_wait_before_resize", - should_wait = true; - std::this_thread::sleep_for( - std::chrono::milliseconds(10));); - } - } -#endif /* !DBUG_OFF */ + if (UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < usable_size() / 20) + return SHRINK_ABORT; - if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { - return; - } + mysql_mutex_lock(&flush_list_mutex); - /* Indicate critical path */ - resizing.store(true, std::memory_order_relaxed); + if (LRU_warned && !UT_LIST_GET_FIRST(free)) + { + LRU_warned_clear(); + mysql_mutex_unlock(&flush_list_mutex); + return SHRINK_ABORT; + } - mysql_mutex_lock(&mutex); - page_hash.write_lock_all(); + try_LRU_scan= false; + mysql_mutex_unlock(&mutex); + page_cleaner_wakeup(true); + my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + mysql_mutex_unlock(&flush_list_mutex); + mysql_mutex_lock(&mutex); - chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); + if (!n_blocks_to_withdraw) + goto withdraw_done; - /* add/delete chunks */ + return 
SHRINK_IN_PROGRESS; +} - buf_resize_status("Resizing buffer pool from " - ULINTPF " chunks to " ULINTPF " chunks.", - n_chunks, n_chunks_new); - - if (is_shrinking()) { - /* delete chunks */ - chunk_t* chunk = chunks + n_chunks_new; - const chunk_t* const echunk = chunks + n_chunks; - - ulint sum_freed = 0; - - while (chunk < echunk) { - /* buf_LRU_block_free_non_file_page() invokes - MEM_NOACCESS() on any buf_pool.free blocks. - We must cancel the effect of that. In - MemorySanitizer, MEM_NOACCESS() is no-op, so - we must not do anything special for it here. */ -#ifdef HAVE_valgrind -# if !__has_feature(memory_sanitizer) - MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size()); +inline void buf_pool_t::shrunk(size_t size, size_t reduced) noexcept +{ + ut_ad(size + reduced == size_in_bytes); + size_in_bytes_requested= size; + size_in_bytes= size; +# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* Only page_guess() may read this memory, which after + my_virtual_mem_decommit() may be zeroed out or preserve its original + contents. Try to catch any unintended reads outside page_guess(). */ + MEM_UNDEFINED(memory + size, size_in_bytes_max - size); +# else + for (size_t n= page_hash.pad(page_hash.n_cells), i= 0; i < n; + i+= page_hash.ELEMENTS_PER_LATCH + 1) + { + auto &latch= reinterpret_cast(page_hash.array[i]); + latch.lock(); + /* We already shrunk size_in_bytes. The exclusive lock here + ensures that any page_guess() will detect an out-of-bounds + guess before we invoke my_virtual_mem_decommit() below. 
*/ + latch.unlock(); + } # endif -#else - MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size); + my_virtual_mem_decommit(memory + size, reduced); +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_free)(mem_key_buf_buf_pool, reduced, owner); #endif +} - buf_block_t* block = chunk->blocks; - - for (ulint j = chunk->size; j--; block++) { - block->page.lock.free(); - } - - allocator.deallocate_large_dodump( - chunk->mem, &chunk->mem_pfx); - sum_freed += chunk->size; - ++chunk; - } - - /* discard withdraw list */ - UT_LIST_INIT(withdraw, &buf_page_t::list); - withdraw_target = 0; - - ib::info() << n_chunks - n_chunks_new - << " Chunks (" << sum_freed - << " blocks) were freed."; - - n_chunks = n_chunks_new; - } - - { - /* reallocate chunks */ - const size_t new_chunks_size - = n_chunks_new * sizeof(chunk_t); - - chunk_t* new_chunks = static_cast( - ut_zalloc_nokey_nofatal(new_chunks_size)); - - DBUG_EXECUTE_IF("buf_pool_resize_chunk_null", - ut_free(new_chunks); new_chunks= nullptr; ); - - if (!new_chunks) { - ib::error() << "failed to allocate" - " the chunk array."; - n_chunks_new = n_chunks; - warning = true; - chunks_old = NULL; - goto calc_buf_pool_size; - } - - ulint n_chunks_copy = ut_min(n_chunks_new, n_chunks); - - memcpy(new_chunks, chunks, - n_chunks_copy * sizeof *new_chunks); +ATTRIBUTE_COLD void buf_pool_t::resize(size_t size, THD *thd) noexcept +{ + ut_ad(this == &buf_pool); + mysql_mutex_assert_owner(&LOCK_global_system_variables); + ut_ad(size <= size_in_bytes_max); + if (my_use_large_pages) + { + my_error(ER_VARIABLE_IS_READONLY, MYF(0), "InnoDB", + "innodb_buffer_pool_size", "large_pages=0"); + return; + } - for (ulint j = 0; j < n_chunks_copy; j++) { - new_chunks[j].reg(); - } + size_t n_blocks_new= get_n_blocks(size); - chunks_old = chunks; - chunks = new_chunks; - } + mysql_mutex_lock(&mutex); - if (n_chunks_new > n_chunks) { - /* add chunks */ - ulint sum_added = 0; - ulint n = n_chunks; - const size_t unit = srv_buf_pool_chunk_unit; - - for (chunk_t* 
chunk = chunks + n_chunks, - * const echunk = chunks + n_chunks_new; - chunk != echunk; chunk++) { - if (!chunk->create(unit)) { - ib::error() << "failed to allocate" - " memory for buffer pool chunk"; + const size_t old_size= size_in_bytes; + if (first_to_withdraw || old_size != size_in_bytes_requested) + { + mysql_mutex_unlock(&mutex); + my_printf_error(ER_WRONG_USAGE, + "innodb_buffer_pool_size change is already in progress", + MYF(0)); + return; + } - warning = true; - n_chunks_new = n_chunks; - break; - } + ut_ad(UT_LIST_GET_LEN(withdrawn) == 0); + ut_ad(n_blocks_to_withdraw == 0); +#ifdef __linux__ + DBUG_EXECUTE_IF("trigger_garbage_collection", + mem_pressure_obj.trigger_collection();); +#endif - sum_added += chunk->size; - ++n; - } + if (size == old_size) + { + mysql_mutex_unlock(&mutex); + DBUG_EXECUTE_IF("trigger_garbage_collection", + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + garbage_collect();); + return; + } - ib::info() << n_chunks_new - n_chunks - << " chunks (" << sum_added - << " blocks) were added."; - - n_chunks = n; - } -calc_buf_pool_size: - /* recalc curr_size */ - ulint new_size = 0; +#ifdef BTR_CUR_HASH_ADAPT + bool ahi_disabled= false; +#endif - { - chunk_t* chunk = chunks; - const chunk_t* const echunk = chunk + n_chunks; - do { - new_size += chunk->size; - } while (++chunk != echunk); - } + const bool significant_change= + n_blocks_new > n_blocks * 2 || n_blocks > n_blocks_new * 2; + const ssize_t n_blocks_removed= n_blocks - n_blocks_new; - curr_size = new_size; - n_chunks_new = n_chunks; + if (n_blocks_removed <= 0) + { + if (!my_virtual_mem_commit(memory + old_size, size - old_size)) + { + mysql_mutex_unlock(&mutex); + sql_print_error("InnoDB: Cannot commit innodb_buffer_pool_size=%zum;" + " retaining innodb_buffer_pool_size=%zum", + size >> 20, old_size >> 20); + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + return; + } - if (chunks_old) { - ut_free(chunks_old); - chunks_old = NULL; - } + size_in_bytes_requested= size; 
+ size_in_bytes= size; - chunk_t::map* chunk_map_old = chunk_t::map_ref; - chunk_t::map_ref = chunk_t::map_reg; + { + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; + const size_t pages= pages_in_extent[ssize]; + const size_t first_extent= n_blocks / pages; - /* set size */ - ut_ad(UT_LIST_GET_LEN(withdraw) == 0); - ulint s= curr_size; - s/= BUF_READ_AHEAD_PORTION; - read_ahead_area= s >= READ_AHEAD_PAGES - ? READ_AHEAD_PAGES - : my_round_up_to_next_power(static_cast(s)); - curr_pool_size= n_chunks * srv_buf_pool_chunk_unit; - srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/ - extern ulonglong innobase_buffer_pool_size; - innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size); - - const bool new_size_too_diff - = srv_buf_pool_base_size > srv_buf_pool_size * 2 - || srv_buf_pool_base_size * 2 < srv_buf_pool_size; + char *extent= memory + first_extent * innodb_buffer_pool_extent_size; - mysql_mutex_unlock(&mutex); - page_hash.write_unlock_all(); + buf_block_t *block= reinterpret_cast(extent); + if (const size_t first_blocks= n_blocks % pages) + { + /* Extend the last (partial) extent until its end */ + const buf_block_t *extent_end= block + + (first_extent == (n_blocks_new / pages) + ? 
(n_blocks_new % pages) + : pages); + block+= first_blocks; + memset((void*) block, 0, (extent_end - block) * sizeof *block); + + for (byte *frame= reinterpret_cast(extent) + + first_frame_in_extent[ssize] + + (first_blocks << srv_page_size_shift); block < extent_end; + block++, frame+= srv_page_size) + { + block->page.frame= frame; + block->page.lock.init(); + UT_LIST_ADD_LAST(free, &block->page); + ut_d(block->page.in_free_list= true); + } + extent+= innodb_buffer_pool_extent_size; + } - UT_DELETE(chunk_map_old); + /* Fill in further extents; @see buf_pool_t::create() */ + for (const char *const end_new= memory + + block_descriptors_in_bytes(n_blocks_new); + extent < end_new; extent+= innodb_buffer_pool_extent_size) + { + block= reinterpret_cast(extent); + const buf_block_t *extent_end= block + pages; + if (reinterpret_cast(extent_end) > end_new) + extent_end= reinterpret_cast(end_new); + + memset((void*) block, 0, (extent_end - block) * sizeof *block); + for (byte *frame= reinterpret_cast(extent) + + first_frame_in_extent[ssize]; + block < extent_end; block++, frame+= srv_page_size) + { + block->page.frame= frame; + block->page.lock.init(); + UT_LIST_ADD_LAST(free, &block->page); + ut_d(block->page.in_free_list= true); + } + } + } - resizing.store(false, std::memory_order_relaxed); + mysql_mutex_unlock(&LOCK_global_system_variables); + resized: + ut_ad(UT_LIST_GET_LEN(withdrawn) == 0); + ut_ad(n_blocks_to_withdraw == 0); + ut_ad(!first_to_withdraw); + const size_t old_blocks{n_blocks}; + n_blocks= n_blocks_new; + + size_t s= n_blocks_new / BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(uint32(s)); - /* Normalize other components, if the new size is too different */ - if (!warning && new_size_too_diff) { - srv_buf_pool_base_size = srv_buf_pool_size; + if (ssize_t d= size - old_size) + { + os_total_large_mem_allocated+= d; + if (d > 0) + { + /* Already committed memory earlier */ + ut_ad(n_blocks_removed <= 0); +#ifdef UNIV_PFS_MEMORY + PSI_MEMORY_CALL(memory_alloc)(mem_key_buf_buf_pool, d, &owner); +#endif + } + else + shrunk(size, size_t(-d)); + } - buf_resize_status("Resizing other hash tables."); + mysql_mutex_unlock(&mutex); - srv_lock_table_size = 5 - * (srv_buf_pool_size >> srv_page_size_shift); - lock_sys.resize(srv_lock_table_size); - dict_sys.resize(); + if (significant_change) + { + sql_print_information("InnoDB: Resizing hash tables"); + srv_lock_table_size= 5 * n_blocks_new; + lock_sys.resize(srv_lock_table_size); + dict_sys.resize(); + } - ib::info() << "Resized hash tables: lock_sys," + ibuf_max_size_update(srv_change_buffer_max_size); #ifdef BTR_CUR_HASH_ADAPT - " adaptive hash index," -#endif /* BTR_CUR_HASH_ADAPT */ - " and dictionary."; - } - - /* normalize ibuf.max_size */ - ibuf_max_size_update(srv_change_buffer_max_size); - - if (srv_buf_pool_old_size != srv_buf_pool_size) { + if (ahi_disabled) + btr_search_enable(true); +#endif + mysql_mutex_lock(&LOCK_global_system_variables); + bool resized= n_blocks_removed < 0; + if (n_blocks_removed > 0) + { + mysql_mutex_lock(&mutex); + resized= size_in_bytes == old_size; + if (resized) + { + size_in_bytes_requested= size; + size_in_bytes= size; + } + mysql_mutex_unlock(&mutex); + } - buf_resize_status("Completed resizing buffer pool from %zu to %zu bytes." 
- ,srv_buf_pool_old_size, srv_buf_pool_size); - srv_buf_pool_old_size = srv_buf_pool_size; - } + if (resized) + sql_print_information("InnoDB: innodb_buffer_pool_size=%zum (%zu pages)" + " resized from %zum (%zu pages)", + size >> 20, n_blocks_new, old_size >> 20, + old_blocks); + } + else + { + size_t to_withdraw= size_t(n_blocks_removed); + n_blocks_to_withdraw= to_withdraw; + first_to_withdraw= &get_nth_page(n_blocks_new)->page; + size_in_bytes_requested= size; + mysql_mutex_unlock(&LOCK_global_system_variables); + mysql_mutex_unlock(&mutex); + DEBUG_SYNC_C("buf_pool_shrink_before_wakeup"); + mysql_mutex_lock(&flush_list_mutex); + page_cleaner_wakeup(true); + my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + mysql_mutex_unlock(&flush_list_mutex); #ifdef BTR_CUR_HASH_ADAPT - /* enable AHI if needed */ - if (btr_search_disabled) { - btr_search_enable(true); - ib::info() << "Re-enabled adaptive hash index."; - } + ahi_disabled= btr_search_disable(); #endif /* BTR_CUR_HASH_ADAPT */ + mysql_mutex_lock(&mutex); - if (warning) - buf_resize_status("Resizing buffer pool failed"); - - ut_d(validate()); - - return; -} + time_t last_message= 0; -#ifdef __linux__ -inline void buf_pool_t::garbage_collect() -{ - mysql_mutex_lock(&mutex); - size_t freed= 0; - -#ifdef BTR_CUR_HASH_ADAPT - /* buf_LRU_free_page() will temporarily release and reacquire - buf_pool.mutex for invoking btr_search_drop_page_hash_index(). Thus, - we must protect ourselves with the hazard pointer. */ -rescan: -#else - lru_hp.set(nullptr); -#endif - for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev; bpage; bpage= prev) - { - prev= UT_LIST_GET_PREV(LRU, bpage); -#ifdef BTR_CUR_HASH_ADAPT - lru_hp.set(prev); -#endif - auto state= bpage->state(); - ut_ad(state >= buf_page_t::FREED); - ut_ad(bpage->in_LRU_list); - - /* We try to free any pages that can be freed without writing out - anything. 
*/ - switch (bpage->oldest_modification()) { - case 0: - try_to_evict: - if (buf_LRU_free_page(bpage, true)) + do + { + time_t now= time(nullptr); + if (now - last_message > 15) { - evicted: - freed++; -#ifdef BTR_CUR_HASH_ADAPT - bpage= prev; - prev= lru_hp.get(); - if (!prev && bpage) - goto rescan; -#endif + if (last_message != 0 && to_withdraw == n_blocks_to_withdraw) + break; + to_withdraw= n_blocks_to_withdraw; + last_message= now; + sql_print_information("InnoDB: Trying to shrink" + " innodb_buffer_pool_size=%zum (%zu pages)" + " from %zum (%zu pages, to withdraw %zu)", + size >> 20, n_blocks_new, + old_size >> 20, n_blocks, to_withdraw); } - continue; - case 1: - break; - default: - if (state >= buf_page_t::UNFIXED) - continue; + shrink_status s{shrink(size)}; + if (s == SHRINK_DONE) + goto resized; + if (s != SHRINK_IN_PROGRESS) + break; } + while (!thd_kill_level(thd)); + + ut_ad(size_in_bytes > size_in_bytes_requested); + n_blocks_to_withdraw= 0; + first_to_withdraw= nullptr; + size_in_bytes_requested= size_in_bytes; - if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) + while (buf_page_t *b= UT_LIST_GET_FIRST(withdrawn)) { - ut_ad(!bpage->is_io_fixed()); - lsn_t oldest_modification= bpage->oldest_modification(); - switch (oldest_modification) { - case 1: - mysql_mutex_lock(&flush_list_mutex); - oldest_modification= bpage->oldest_modification(); - if (oldest_modification) - { - ut_ad(oldest_modification == 1); - delete_from_flush_list(bpage); - } - mysql_mutex_unlock(&flush_list_mutex); - /* fall through */ - case 0: - bpage->lock.u_unlock(true); - goto try_to_evict; - default: - if (bpage->state() < buf_page_t::UNFIXED && - oldest_modification <= log_sys.get_flushed_lsn()) - { - release_freed_page(bpage); - goto evicted; - } - else - bpage->lock.u_unlock(true); - } + UT_LIST_REMOVE(withdrawn, b); + UT_LIST_ADD_LAST(free, b); + ut_d(b->in_free_list= true); + ut_ad(b->state() == buf_page_t::NOT_USED); + b->lock.init(); } - } - -#if defined 
MADV_FREE - /* FIXME: Issue fewer calls for larger contiguous blocks of - memory. For now, we assume that this is acceptable, because this - code should be executed rarely. */ - for (buf_page_t *bpage= UT_LIST_GET_FIRST(free); bpage; - bpage= UT_LIST_GET_NEXT(list, bpage)) - madvise(bpage->frame, srv_page_size, MADV_FREE); -#endif - mysql_mutex_unlock(&mutex); - sql_print_information("InnoDB: Memory pressure event freed %zu pages", - freed); - return; -} -#endif /* __linux__ */ - -/** Thread pool task invoked by innodb_buffer_pool_size changes. */ -static void buf_resize_callback(void *) -{ - DBUG_ENTER("buf_resize_callback"); - ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP); - mysql_mutex_lock(&buf_pool.mutex); - const auto size= srv_buf_pool_size; - const bool work= srv_buf_pool_old_size != size; - mysql_mutex_unlock(&buf_pool.mutex); - - if (work) - buf_pool.resize(); - else - { - std::ostringstream sout; - sout << "Size did not change: old size = new size = " << size; - buf_resize_status(sout.str().c_str()); - } - DBUG_VOID_RETURN; -} -/* Ensure that task does not run in parallel, by setting max_concurrency to 1 for the thread group */ -static tpool::task_group single_threaded_group(1); -static tpool::waitable_task buf_resize_task(buf_resize_callback, - nullptr, &single_threaded_group); - -void buf_resize_start() -{ -#if !defined(DBUG_OFF) && defined(__linux__) - DBUG_EXECUTE_IF("trigger_garbage_collection", - { - mem_pressure_obj.trigger_collection(); + mysql_mutex_unlock(&mutex); + my_printf_error(ER_WRONG_USAGE, "innodb_buffer_pool_size change aborted", + MYF(ME_ERROR_LOG)); + mysql_mutex_lock(&LOCK_global_system_variables); } - ); -#endif - - srv_thread_pool->submit_task(&buf_resize_task); -} -void buf_resize_shutdown() -{ -#ifdef __linux__ - buf_mem_pressure_shutdown(); -#endif - buf_resize_task.wait(); + ut_d(validate()); } - /** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and buf_pool.page_hash. The caller must relocate bpage->list. 
@param bpage ROW_FORMAT=COMPRESSED only block @param dpage destination control block */ -static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) +static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) noexcept { const page_id_t id{bpage->id()}; buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); ut_ad(!bpage->frame); mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(mach_read_from_4(bpage->zip.data + FIL_PAGE_OFFSET) == id.page_no()); ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); ut_ad(bpage == buf_pool.page_hash.get(id, chain)); ut_ad(!buf_pool.watch_is_sentinel(*bpage)); @@ -2274,6 +2118,7 @@ ut_ad(state <= buf_page_t::READ_FIX); ut_ad(bpage->lock.is_write_locked()); const auto frame= dpage->frame; + ut_ad(frame == reinterpret_cast(dpage)->frame_address()); dpage->lock.free(); new (dpage) buf_page_t(*bpage); @@ -2345,7 +2190,6 @@ ut_ad(w->access_time == 0); ut_ad(!w->oldest_modification()); ut_ad(!w->zip.data); - ut_ad(!w->in_zip_hash); static_assert(buf_page_t::NOT_USED == 0, "efficiency"); if (ut_d(auto s=) w->state()) { @@ -2625,6 +2469,8 @@ ut_ad(block->zip_size()); ut_a(block->page.id().space() != 0); + ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) + == block->page.id().page_no()); if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { @@ -2863,7 +2709,6 @@ if (b && !watch_is_sentinel(*b)) { uint32_t state= b->fix() + 1; - ut_ad(!b->in_zip_hash); hash_lock.unlock_shared(); if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) @@ -2893,7 +2738,8 @@ return reinterpret_cast(-1); } - if (UNIV_LIKELY(b->frame != nullptr)); + if (UNIV_LIKELY(b->frame != nullptr)) + ut_ad(b->frame==reinterpret_cast(b)->frame_address()); else if (state < buf_page_t::READ_FIX) goto unzip; else @@ -2959,6 +2805,49 @@ } } +TRANSACTIONAL_TARGET +uint32_t buf_pool_t::page_guess(buf_block_t *b, page_hash_latch &latch, + const page_id_t id) noexcept +{ + transactional_shared_lock_guard g{latch}; +#ifndef 
HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* shrunk() and my_virtual_mem_decommit() could retain the original + contents of the virtual memory range or zero it out immediately or + with a delay. Any zeroing out may lead to a false positive for + b->page.id() == id but never for b->page.state(). At the time of + the shrunk() call, shrink() and buf_LRU_block_free_non_file_page() + should guarantee that b->page.state() is equal to + buf_page_t::NOT_USED (0) for all to-be-freed blocks. */ +#else + /* shrunk() made the memory inaccessible. */ + if (UNIV_UNLIKELY(reinterpret_cast(b) >= memory + size_in_bytes)) + return 0; +#endif + const page_id_t block_id{b->page.id()}; +#ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* shrunk() may have invoked MEM_UNDEFINED() on this memory to be able + to catch any unintended access elsewhere in our code. */ + MEM_MAKE_DEFINED(&block_id, sizeof block_id); +#endif + + if (id == block_id) + { + uint32_t state= b->page.state(); +#ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT + /* shrunk() may have invoked MEM_UNDEFINED() on this memory to be able + to catch any unintended access elsewhere in our code. */ + MEM_MAKE_DEFINED(&state, sizeof state); +#endif + /* Ignore guesses that point to read-fixed blocks. We can only + avoid a race condition by looking up the block via page_hash. */ + if ((state >= buf_page_t::FREED && state < buf_page_t::READ_FIX) || + state >= buf_page_t::WRITE_FIX) + return b->page.fix(); + ut_ad(b->page.frame); + } + return 0; +} + /** Low level function used to get access to a database page. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @@ -3023,22 +2912,9 @@ buf_block_t* block = guess; uint32_t state; - if (block) { - transactional_shared_lock_guard g{hash_lock}; - if (buf_pool.is_uncompressed(block) - && page_id == block->page.id()) { - ut_ad(!block->page.in_zip_hash); - state = block->page.state(); - /* Ignore guesses that point to read-fixed blocks. 
- We can only avoid a race condition by - looking up the block via buf_pool.page_hash. */ - if ((state >= buf_page_t::FREED - && state < buf_page_t::READ_FIX) - || state >= buf_page_t::WRITE_FIX) { - state = block->page.fix(); - goto got_block; - } - } + if (block + && (state = buf_pool.page_guess(block, hash_lock, page_id))) { + goto got_block; } guess = nullptr; @@ -3108,7 +2984,6 @@ goto loop; got_block: - ut_ad(!block->page.in_zip_hash); state++; got_block_fixed: ut_ad(state > buf_page_t::FREED); @@ -3313,6 +3188,7 @@ btr_search_drop_page_hash_index(block, true); #endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(block->page.frame == block->frame_address()); ut_ad(page_id_t(page_get_space_id(block->page.frame), page_get_page_no(block->page.frame)) == page_id); return block; @@ -3418,21 +3294,19 @@ return block; } -TRANSACTIONAL_TARGET buf_block_t *buf_page_optimistic_fix(buf_block_t *block, page_id_t id) noexcept { buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); - transactional_shared_lock_guard g - {buf_pool.page_hash.lock_get(chain)}; - if (UNIV_UNLIKELY(!buf_pool.is_uncompressed(block) || - id != block->page.id() || !block->page.frame)) - return nullptr; - const auto state= block->page.state(); - if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED || - state >= buf_page_t::READ_FIX)) - return nullptr; - block->page.fix(); - return block; + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + if (uint32_t state= buf_pool.page_guess(block, hash_lock, id)) + { + if (UNIV_LIKELY(state >= buf_page_t::UNFIXED)) + return block; + else + /* Refuse access to pages that are marked as freed in the data file. 
*/ + block->page.unfix(); + } + return nullptr; } buf_block_t *buf_page_optimistic_get(buf_block_t *block, @@ -3635,6 +3509,7 @@ { mysql_mutex_unlock(&buf_pool.mutex); buf_block_t *block= reinterpret_cast(bpage); + ut_ad(bpage->frame == block->frame_address()); mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX); #ifdef BTR_CUR_HASH_ADAPT drop_hash_entry= block->index; @@ -3670,7 +3545,8 @@ else { mysql_mutex_unlock(&buf_pool.mutex); - ut_ad(bpage->frame); + ut_ad(bpage->frame == + reinterpret_cast(bpage)->frame_address()); #ifdef BTR_CUR_HASH_ADAPT ut_ad(!reinterpret_cast(bpage)->index); #endif @@ -4064,10 +3940,9 @@ if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) { release_page: - if (node.space->full_crc32() && node.space->crypt_data && - recv_recovery_is_on() && - recv_sys.dblwr.find_encrypted_page(node, id().page_no(), - const_cast(read_frame))) + if (node.space->full_crc32() && recv_recovery_is_on() && + recv_sys.dblwr.find_deferred_page(node, id().page_no(), + const_cast(read_frame))) { /* Recover from doublewrite buffer */ err= DB_SUCCESS; @@ -4127,6 +4002,61 @@ return DB_SUCCESS; } +#ifdef BTR_CUR_HASH_ADAPT +/** Clear the adaptive hash index on all pages in the buffer pool. 
*/ +ATTRIBUTE_COLD void buf_pool_t::clear_hash_index() noexcept +{ + std::set garbage; + + mysql_mutex_lock(&mutex); + ut_ad(!btr_search_enabled); + + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + for (buf_block_t *block= reinterpret_cast(extent), + *extent_end= block + + pages_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; + block < extent_end && reinterpret_cast(block) < end; block++) + { + dict_index_t *index= block->index; + assert_block_ahi_valid(block); + + /* We can clear block->index and block->n_pointers when + holding all AHI latches exclusively; see the comments in buf0buf.h */ + + if (!index) + { +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + ut_a(!block->n_pointers); +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + continue; + } + + ut_d(const auto s= block->page.state()); + /* Another thread may have set the state to + REMOVE_HASH in buf_LRU_block_remove_hashed(). + + The state change in buf_pool_t::resize() is not observable + here, because in that case we would have !block->index. + + In the end, the entire adaptive hash index will be removed. */ + ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH); +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + block->n_pointers= 0; +# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + if (index->freed()) + garbage.insert(index); + block->index= nullptr; + } + + mysql_mutex_unlock(&mutex); + + for (dict_index_t *index : garbage) + btr_search_lazy_free(index); +} +#endif /* BTR_CUR_HASH_ADAPT */ + #ifdef UNIV_DEBUG /** Check that all blocks are in a replaceable state. 
@return address of a non-free block @@ -4134,10 +4064,44 @@ void buf_pool_t::assert_all_freed() noexcept { mysql_mutex_lock(&mutex); - const chunk_t *chunk= chunks; - for (auto i= n_chunks; i--; chunk++) - if (const buf_block_t* block= chunk->not_freed()) - ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + + for (char *extent= memory, + *end= memory + block_descriptors_in_bytes(n_blocks); + extent < end; extent+= innodb_buffer_pool_extent_size) + for (buf_block_t *block= reinterpret_cast(extent), + *extent_end= block + + pages_in_extent[srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN]; + block < extent_end && reinterpret_cast(block) < end; block++) + { + if (!block->page.in_file()) + continue; + switch (const lsn_t lsn= block->page.oldest_modification()) { + case 0: + case 1: + break; + + case 2: + ut_ad(fsp_is_system_temporary(block->page.id().space())); + break; + + default: + if (srv_read_only_mode) + { + /* The page cleaner is disabled in read-only mode. No pages + can be dirtied, so all of them must be clean. */ + ut_ad(lsn == recv_sys.lsn || + srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + break; + } + + goto fixed_or_dirty; + } + + if (!block->page.can_relocate()) + fixed_or_dirty: + ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + } + mysql_mutex_unlock(&mutex); } #endif /* UNIV_DEBUG */ @@ -4187,40 +4151,35 @@ mysql_mutex_lock(&mutex); - chunk_t* chunk = chunks; - /* Check the uncompressed blocks. 
*/ - for (auto i = n_chunks; i--; chunk++) { - buf_block_t* block = chunk->blocks; - - for (auto j = chunk->size; j--; block++) { - ut_ad(block->page.frame); - switch (const auto f = block->page.state()) { - case buf_page_t::NOT_USED: - n_free++; - break; + for (ulint i = 0; i < n_blocks; i++) { + const buf_block_t* block = get_nth_page(i); + ut_ad(block->page.frame == block->frame_address()); - case buf_page_t::MEMORY: - case buf_page_t::REMOVE_HASH: - /* do nothing */ + switch (const auto f = block->page.state()) { + case buf_page_t::NOT_USED: + ut_ad(!block->page.in_LRU_list); + n_free++; + break; + case buf_page_t::MEMORY: + case buf_page_t::REMOVE_HASH: + /* do nothing */ + break; + default: + if (f >= buf_page_t::READ_FIX + && f < buf_page_t::WRITE_FIX) { + /* A read-fixed block is not + necessarily in the page_hash yet. */ break; - - default: - if (f >= buf_page_t::READ_FIX - && f < buf_page_t::WRITE_FIX) { - /* A read-fixed block is not - necessarily in the page_hash yet. */ - break; - } - ut_ad(f >= buf_page_t::FREED); - const page_id_t id{block->page.id()}; - ut_ad(page_hash.get( - id, - page_hash.cell_get(id.fold())) - == &block->page); - n_lru++; } + ut_ad(f >= buf_page_t::FREED); + const page_id_t id{block->page.id()}; + ut_ad(page_hash.get( + id, + page_hash.cell_get(id.fold())) + == &block->page); + n_lru++; } } @@ -4245,24 +4204,11 @@ ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); mysql_mutex_unlock(&flush_list_mutex); - - if (n_chunks_new == n_chunks - && n_lru + n_free > curr_size + n_zip) { - - ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free - << ", pool " << curr_size - << " zip " << n_zip << ". Aborting..."; - } - + ut_ad(n_lru + n_free <= n_blocks + n_zip); ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru); - - if (n_chunks_new == n_chunks - && UT_LIST_GET_LEN(free) != n_free) { - - ib::fatal() << "Free list len " - << UT_LIST_GET_LEN(free) - << ", free blocks " << n_free << ". 
Aborting..."; - } + ut_ad(UT_LIST_GET_LEN(free) <= n_free); + ut_ad(size_in_bytes != size_in_bytes_requested + || UT_LIST_GET_LEN(free) == n_free); mysql_mutex_unlock(&mutex); @@ -4277,26 +4223,23 @@ { index_id_t* index_ids; ulint* counts; - ulint size; ulint i; - ulint j; index_id_t id; ulint n_found; - chunk_t* chunk; dict_index_t* index; - size = curr_size; + mysql_mutex_lock(&mutex); index_ids = static_cast( - ut_malloc_nokey(size * sizeof *index_ids)); + ut_malloc_nokey(n_blocks * sizeof *index_ids)); - counts = static_cast(ut_malloc_nokey(sizeof(ulint) * size)); + counts = static_cast( + ut_malloc_nokey(sizeof(ulint) * n_blocks)); - mysql_mutex_lock(&mutex); mysql_mutex_lock(&flush_list_mutex); ib::info() - << "[buffer pool: size=" << curr_size + << "[buffer pool: size=" << n_blocks << ", database pages=" << UT_LIST_GET_LEN(LRU) << ", free pages=" << UT_LIST_GET_LEN(free) << ", modified database pages=" @@ -4316,38 +4259,28 @@ n_found = 0; - chunk = chunks; - - for (i = n_chunks; i--; chunk++) { - buf_block_t* block = chunk->blocks; - ulint n_blocks = chunk->size; - - for (; n_blocks--; block++) { - const buf_frame_t* frame = block->page.frame; - - if (fil_page_index_page_check(frame)) { - - id = btr_page_get_index_id(frame); - - /* Look for the id in the index_ids array */ - j = 0; - - while (j < n_found) { - - if (index_ids[j] == id) { - counts[j]++; - - break; - } - j++; - } - - if (j == n_found) { - n_found++; - index_ids[j] = id; - counts[j] = 1; + for (size_t i = 0; i < n_blocks; i++) { + buf_block_t* block = get_nth_page(i); + const buf_frame_t* frame = block->page.frame; + ut_ad(frame == block->frame_address()); + + if (fil_page_index_page_check(frame)) { + + id = btr_page_get_index_id(frame); + + /* Look for the id in the index_ids array */ + for (ulint j = 0; j < n_found; j++) { + if (index_ids[j] == id) { + counts[j]++; + goto found; } } + + index_ids[n_found] = id; + counts[n_found] = 1; + n_found++; +found: + continue; } } @@ -4381,138 +4314,78 
@@ { ulint fixed_pages_number= 0; - mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_assert_owner(&buf_pool.mutex); for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b; b= UT_LIST_GET_NEXT(LRU, b)) if (b->state() > buf_page_t::UNFIXED) fixed_pages_number++; - mysql_mutex_unlock(&buf_pool.mutex); - return fixed_pages_number; } #endif /* UNIV_DEBUG */ -/** Collect buffer pool metadata. -@param[out] pool_info buffer pool metadata */ -void buf_stats_get_pool_info(buf_pool_info_t *pool_info) noexcept +void buf_pool_t::get_info(buf_pool_info_t *pool_info) noexcept { - time_t current_time; - double time_elapsed; - - mysql_mutex_lock(&buf_pool.mutex); - - pool_info->pool_size = buf_pool.curr_size; - - pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU); - - pool_info->old_lru_len = buf_pool.LRU_old_len; - - pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free); - - mysql_mutex_lock(&buf_pool.flush_list_mutex); - pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list); - - pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU); - - pool_info->n_pend_reads = os_aio_pending_reads_approx(); - - pool_info->n_pending_flush_lru = buf_pool.n_flush(); - - pool_info->n_pending_flush_list = os_aio_pending_writes(); - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - - current_time = time(NULL); - time_elapsed = 0.001 + difftime(current_time, - buf_pool.last_printout_time); - - pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young; - - pool_info->n_pages_not_made_young = - buf_pool.stat.n_pages_not_made_young; - - pool_info->n_pages_read = buf_pool.stat.n_pages_read; - - pool_info->n_pages_created = buf_pool.stat.n_pages_created; - - pool_info->n_pages_written = buf_pool.stat.n_pages_written; - - pool_info->n_page_gets = buf_pool.stat.n_page_gets; - - pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd; - pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read; - - pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted; - - 
pool_info->page_made_young_rate = - static_cast(buf_pool.stat.n_pages_made_young - - buf_pool.old_stat.n_pages_made_young) - / time_elapsed; - - pool_info->page_not_made_young_rate = - static_cast(buf_pool.stat.n_pages_not_made_young - - buf_pool.old_stat.n_pages_not_made_young) - / time_elapsed; - - pool_info->pages_read_rate = - static_cast(buf_pool.stat.n_pages_read - - buf_pool.old_stat.n_pages_read) - / time_elapsed; - - pool_info->pages_created_rate = - static_cast(buf_pool.stat.n_pages_created - - buf_pool.old_stat.n_pages_created) - / time_elapsed; - - pool_info->pages_written_rate = - static_cast(buf_pool.stat.n_pages_written - - buf_pool.old_stat.n_pages_written) - / time_elapsed; - - pool_info->n_page_get_delta = buf_pool.stat.n_page_gets - - buf_pool.old_stat.n_page_gets; - - if (pool_info->n_page_get_delta) { - pool_info->page_read_delta = buf_pool.stat.n_pages_read - - buf_pool.old_stat.n_pages_read; - - pool_info->young_making_delta = - buf_pool.stat.n_pages_made_young - - buf_pool.old_stat.n_pages_made_young; - - pool_info->not_young_making_delta = - buf_pool.stat.n_pages_not_made_young - - buf_pool.old_stat.n_pages_not_made_young; - } - pool_info->pages_readahead_rnd_rate = - static_cast(buf_pool.stat.n_ra_pages_read_rnd - - buf_pool.old_stat.n_ra_pages_read_rnd) - / time_elapsed; - - - pool_info->pages_readahead_rate = - static_cast(buf_pool.stat.n_ra_pages_read - - buf_pool.old_stat.n_ra_pages_read) - / time_elapsed; - - pool_info->pages_evicted_rate = - static_cast(buf_pool.stat.n_ra_pages_evicted - - buf_pool.old_stat.n_ra_pages_evicted) - / time_elapsed; - - pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU); - - pool_info->io_sum = buf_LRU_stat_sum.io; - - pool_info->io_cur = buf_LRU_stat_cur.io; + mysql_mutex_lock(&mutex); + pool_info->pool_size= curr_size(); + pool_info->lru_len= UT_LIST_GET_LEN(LRU); + pool_info->old_lru_len= LRU_old_len; + pool_info->free_list_len= UT_LIST_GET_LEN(free); - pool_info->unzip_sum = 
buf_LRU_stat_sum.unzip; + mysql_mutex_lock(&flush_list_mutex); + pool_info->flush_list_len= UT_LIST_GET_LEN(flush_list); + pool_info->n_pend_unzip= UT_LIST_GET_LEN(unzip_LRU); + pool_info->n_pend_reads= os_aio_pending_reads_approx(); + pool_info->n_pending_flush_lru= n_flush(); + pool_info->n_pending_flush_list= os_aio_pending_writes(); + mysql_mutex_unlock(&flush_list_mutex); - pool_info->unzip_cur = buf_LRU_stat_cur.unzip; + double elapsed= 0.001 + difftime(time(nullptr), last_printout_time); - buf_refresh_io_stats(); - mysql_mutex_unlock(&buf_pool.mutex); + pool_info->n_pages_made_young= stat.n_pages_made_young; + pool_info->page_made_young_rate= + double(stat.n_pages_made_young - old_stat.n_pages_made_young) / + elapsed; + pool_info->n_pages_not_made_young= stat.n_pages_not_made_young; + pool_info->page_not_made_young_rate= + double(stat.n_pages_not_made_young - old_stat.n_pages_not_made_young) / + elapsed; + pool_info->n_pages_read= stat.n_pages_read; + pool_info->pages_read_rate= + double(stat.n_pages_read - old_stat.n_pages_read) / elapsed; + pool_info->n_pages_created= stat.n_pages_created; + pool_info->pages_created_rate= + double(stat.n_pages_created - old_stat.n_pages_created) / elapsed; + pool_info->n_pages_written= stat.n_pages_written; + pool_info->pages_written_rate= + double(stat.n_pages_written - old_stat.n_pages_written) / elapsed; + pool_info->n_page_gets= stat.n_page_gets; + pool_info->n_page_get_delta= stat.n_page_gets - old_stat.n_page_gets; + if (pool_info->n_page_get_delta) + { + pool_info->page_read_delta= stat.n_pages_read - old_stat.n_pages_read; + pool_info->young_making_delta= + stat.n_pages_made_young - old_stat.n_pages_made_young; + pool_info->not_young_making_delta= + stat.n_pages_not_made_young - old_stat.n_pages_not_made_young; + } + pool_info->n_ra_pages_read_rnd= stat.n_ra_pages_read_rnd; + pool_info->pages_readahead_rnd_rate= + double(stat.n_ra_pages_read_rnd - old_stat.n_ra_pages_read_rnd) / elapsed; + 
pool_info->n_ra_pages_read= stat.n_ra_pages_read; + pool_info->pages_readahead_rate= + double(stat.n_ra_pages_read - old_stat.n_ra_pages_read) / elapsed; + pool_info->n_ra_pages_evicted= stat.n_ra_pages_evicted; + pool_info->pages_evicted_rate= + double(stat.n_ra_pages_evicted - old_stat.n_ra_pages_evicted) / elapsed; + pool_info->unzip_lru_len= UT_LIST_GET_LEN(unzip_LRU); + pool_info->io_sum= buf_LRU_stat_sum.io; + pool_info->io_cur= buf_LRU_stat_cur.io; + pool_info->unzip_sum= buf_LRU_stat_sum.unzip; + pool_info->unzip_cur= buf_LRU_stat_cur.unzip; + buf_refresh_io_stats(); + mysql_mutex_unlock(&mutex); } /*********************************************************************//** @@ -4620,7 +4493,7 @@ { buf_pool_info_t pool_info; - buf_stats_get_pool_info(&pool_info); + buf_pool.get_info(&pool_info); buf_print_io_instance(&pool_info, file); } diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0dblwr.cc mariadb-10.11.13/storage/innobase/buf/buf0dblwr.cc --- mariadb-10.11.11/storage/innobase/buf/buf0dblwr.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0dblwr.cc 2025-05-19 16:14:25.000000000 +0000 @@ -365,7 +365,7 @@ ut_ad(log_sys.last_checkpoint_lsn); if (!is_created()) return; - const lsn_t max_lsn{log_sys.get_lsn()}; + const lsn_t max_lsn{log_sys.get_flushed_lsn(std::memory_order_relaxed)}; ut_ad(recv_sys.scanned_lsn == max_lsn); ut_ad(recv_sys.scanned_lsn >= recv_sys.lsn); @@ -374,7 +374,7 @@ srv_page_size)); byte *const buf= read_buf + srv_page_size; - std::deque encrypted_pages; + std::deque deferred_pages; for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin(); i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr) { @@ -393,11 +393,12 @@ { /* These pages does not appear to belong to any tablespace. There is a possibility that this page could be - encrypted using full_crc32 format. If innodb encounters - any corrupted encrypted page during recovery then - InnoDB should use this page to find the valid page. 
- See find_encrypted_page() */ - encrypted_pages.push_back(*i); + encrypted/compressed using full_crc32 format. + If innodb encounters any corrupted encrypted/compressed + page during recovery then InnoDB should use this page to + find the valid page. + See find_encrypted_page()/find_page_compressed() */ + deferred_pages.push_back(*i); continue; } @@ -478,7 +479,7 @@ } recv_sys.dblwr.pages.clear(); - for (byte *page : encrypted_pages) + for (byte *page : deferred_pages) recv_sys.dblwr.pages.push_back(page); fil_flush_file_spaces(); aligned_free(read_buf); @@ -599,20 +600,67 @@ } #endif /* UNIV_DEBUG */ +ATTRIBUTE_COLD void buf_dblwr_t::print_info() const noexcept +{ + mysql_mutex_assert_owner(&mutex); + const slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + + sql_print_information("InnoDB: Double Write State\n" + "-------------------\n" + "Batch running : %s\n" + "Active Slot - first_free: %zu reserved: %zu\n" + "Flush Slot - first_free: %zu reserved: %zu\n" + "-------------------", + (batch_running ? 
"true" : "false"), + active_slot->first_free, active_slot->reserved, + flush_slot->first_free, flush_slot->reserved); +} + bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept { mysql_mutex_assert_owner(&mutex); ut_ad(size == block_size()); - for (;;) + const size_t max_count= 60 * 60; + const size_t first_log_count= 30; + const size_t fatal_threshold= + static_cast(srv_fatal_semaphore_wait_threshold); + size_t log_count= first_log_count; + + for (size_t count= 0;;) { if (!active_slot->first_free) return false; if (!batch_running) break; - my_cond_wait(&cond, &mutex.m_mutex); - } + timespec abstime; + set_timespec(abstime, 1); + my_cond_timedwait(&cond, &mutex.m_mutex, &abstime); + + if (count > fatal_threshold) + { + buf_pool.print_flush_info(); + print_info(); + ib::fatal() << "InnoDB: Long wait (" << count + << " seconds) for double-write buffer flush."; + } + else if (++count < first_log_count && !(count % 5)) + { + sql_print_information("InnoDB: Long wait (%zu seconds) for double-write" + " buffer flush.", count); + buf_pool.print_flush_info(); + print_info(); + } + else if (!(count % log_count)) + { + sql_print_warning("InnoDB: Long wait (%zu seconds) for double-write" + " buffer flush.", count); + buf_pool.print_flush_info(); + print_info(); + log_count= log_count >= max_count ? 
max_count : log_count * 2; + } + } ut_ad(active_slot->reserved == active_slot->first_free); ut_ad(!flushing_buffered_writes); @@ -732,6 +780,9 @@ ut_ad(lsn); ut_ad(lsn >= bpage->oldest_modification()); log_write_up_to(lsn, true); + ut_ad(!e.request.node->space->full_crc32() || + !buf_page_is_corrupted(true, static_cast(frame), + e.request.node->space->flags)); e.request.node->space->io(e.request, bpage->physical_offset(), e_size, frame, bpage); } diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0dump.cc mariadb-10.11.13/storage/innobase/buf/buf0dump.cc --- mariadb-10.11.11/storage/innobase/buf/buf0dump.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0dump.cc 2025-05-19 16:14:25.000000000 +0000 @@ -58,7 +58,7 @@ static volatile bool buf_dump_should_start; static volatile bool buf_load_should_start; -static bool buf_load_abort_flag; +static Atomic_relaxed buf_load_abort_flag; /** Start the buffer pool dump/load task and instructs it to start a dump. */ void buf_dump_start() @@ -295,7 +295,7 @@ /* limit the number of total pages dumped to X% of the total number of pages */ - t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100; + t_pages = buf_pool.curr_size() * srv_buf_pool_dump_pct / 100; if (n_pages > t_pages) { buf_dump_status(STATUS_INFO, "Restricted to " ULINTPF @@ -477,10 +477,10 @@ return; } - /* If dump is larger than the buffer pool(s), then we ignore the + /* If the dump is larger than the buffer pool, then we ignore the extra trailing. This could happen if a dump is made, then buffer pool is shrunk and then load is attempted. 
*/ - dump_n = std::min(dump_n, buf_pool.get_n_pages()); + dump_n = std::min(dump_n, buf_pool.curr_size()); if (dump_n != 0) { dump = static_cast(ut_malloc_nokey( diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0flu.cc mariadb-10.11.13/storage/innobase/buf/buf0flu.cc --- mariadb-10.11.11/storage/innobase/buf/buf0flu.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0flu.cc 2025-05-19 16:14:25.000000000 +0000 @@ -281,6 +281,8 @@ { ut_ad(!persistent == fsp_is_system_temporary(id().space())); ut_ad(state >= WRITE_FIX); + ut_ad(!frame || + frame == reinterpret_cast(this)->frame_address()); if (UNIV_LIKELY(!error)) { @@ -692,7 +694,6 @@ { static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); mach_write_to_4(tmp + len - 4, my_crc32c(0, tmp, len - 4)); - ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); } d= tmp; @@ -795,6 +796,7 @@ size_t orig_size; #endif buf_tmp_buffer_t *slot= nullptr; + byte *page= frame; if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ { @@ -810,7 +812,6 @@ } else { - byte *page= frame; size= block->physical_size(); #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 orig_size= size; @@ -852,6 +853,8 @@ if (!space->is_temporary() && !space->is_being_imported() && lsn > log_sys.get_flushed_lsn()) log_write_up_to(lsn, true); + ut_ad(space->is_temporary() || !space->full_crc32() || + !buf_page_is_corrupted(true, write_frame, space->flags)); space->io(IORequest{type, this, slot}, physical_offset(), size, write_frame, this); } @@ -891,7 +894,7 @@ : space.physical_size() == 1024 ? 3 : 0)); /* When flushed, dirty blocks are searched in neighborhoods of this size, and flushed along with the original page. */ - const ulint s= buf_pool.curr_size / 16; + const ulint s= buf_pool.curr_size() / 16; const uint32_t read_ahead= buf_pool.read_ahead_area; const uint32_t buf_flush_area= read_ahead > s ? 
static_cast(s) : read_ahead; @@ -1209,18 +1212,34 @@ buf_LRU_free_page(bpage, true); } +/** Adjust to_withdraw during buf_pool_t::shrink() */ +ATTRIBUTE_COLD static size_t buf_flush_LRU_to_withdraw(size_t to_withdraw, + const buf_page_t &bpage) + noexcept +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + if (!buf_pool.is_shrinking()) + return 0; + const size_t size{buf_pool.size_in_bytes_requested}; + if (buf_pool.will_be_withdrawn(bpage.frame, size) || + buf_pool.will_be_withdrawn(bpage.zip.data, size)) + to_withdraw--; + return to_withdraw; +} + /** Flush dirty blocks from the end buf_pool.LRU, and move clean blocks to buf_pool.free. -@param max maximum number of blocks to flush -@param n counts of flushed and evicted pages */ -static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) noexcept +@param max maximum number of blocks to flush +@param n counts of flushed and evicted pages +@param to_withdraw buf_pool.to_withdraw() */ +static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, + size_t to_withdraw) noexcept { - ulint scanned= 0; + size_t scanned= 0; mysql_mutex_assert_owner(&buf_pool.mutex); - ulint free_limit{buf_pool.LRU_scan_depth}; - if (buf_pool.withdraw_target && buf_pool.is_shrinking()) - free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw); - + size_t free_limit{buf_pool.LRU_scan_depth}; + if (UNIV_UNLIKELY(to_withdraw > free_limit)) + to_withdraw= free_limit; const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 0 : buf_pool.flush_neighbors; fil_space_t *space= nullptr; @@ -1230,20 +1249,21 @@ /* BUF_LRU_MIN_LEN (256) is too high value for low buffer pool(BP) size. For example, for BP size lower than 80M and 16 K page size, the limit is more than - 5% of total BP and for lowest BP 5M, it is 80% of the BP. Non-data objects + 5% of total BP and for lowest BP 6M, it is 80% of the BP. 
Non-data objects like explicit locks could occupy part of the BP pool reducing the pages available for LRU. If LRU reaches minimum limit and if no free pages are available, server would hang with page cleaner not able to free any more pages. To avoid such hang, we adjust the LRU limit lower than the limit for data objects as checked in buf_LRU_check_size_of_non_data_objects() i.e. one page less than 5% of BP. */ - size_t pool_limit= buf_pool.curr_size / 20 - 1; - auto buf_lru_min_len= std::min(pool_limit, BUF_LRU_MIN_LEN); + const size_t buf_lru_min_len= + std::min((buf_pool.usable_size()) / 20 - 1, size_t{BUF_LRU_MIN_LEN}); for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && ((UT_LIST_GET_LEN(buf_pool.LRU) > buf_lru_min_len && UT_LIST_GET_LEN(buf_pool.free) < free_limit) || + to_withdraw || recv_recovery_is_on()); ++scanned, bpage= buf_pool.lru_hp.get()) { @@ -1259,6 +1279,8 @@ if (state != buf_page_t::FREED && (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state))) continue; + if (UNIV_UNLIKELY(to_withdraw != 0)) + to_withdraw= buf_flush_LRU_to_withdraw(to_withdraw, *bpage); buf_LRU_free_page(bpage, true); ++n->evicted; if (UNIV_LIKELY(scanned & 31)) @@ -1330,20 +1352,32 @@ continue; } + if (state < buf_page_t::UNFIXED) + goto flush; + if (n->flushed >= max && !recv_recovery_is_on()) { bpage->lock.u_unlock(true); break; } - if (neighbors && space->is_rotational()) + if (neighbors && space->is_rotational() && UNIV_LIKELY(!to_withdraw) && + /* Skip neighbourhood flush from LRU list if we haven't yet reached + half of the free page target. 
*/ + UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit) n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, neighbors == 1, n->flushed, max); - else if (bpage->flush(space)) - ++n->flushed; else - continue; + { + flush: + if (UNIV_UNLIKELY(to_withdraw != 0)) + to_withdraw= buf_flush_LRU_to_withdraw(to_withdraw, *bpage); + if (bpage->flush(space)) + ++n->flushed; + else + continue; + } goto reacquire_mutex; } @@ -1372,11 +1406,12 @@ @param n counts of flushed and evicted pages */ static void buf_do_LRU_batch(ulint max, flush_counters_t *n) noexcept { - if (buf_LRU_evict_from_unzip_LRU()) + const size_t to_withdraw= buf_pool.to_withdraw(); + if (!to_withdraw && buf_LRU_evict_from_unzip_LRU()) buf_free_from_unzip_LRU_list_batch(); n->evicted= 0; n->flushed= 0; - buf_flush_LRU_list_batch(max, n); + buf_flush_LRU_list_batch(max, n, to_withdraw); mysql_mutex_assert_owner(&buf_pool.mutex); buf_lru_freed_page_count+= n->evicted; @@ -1725,14 +1760,22 @@ buf_do_LRU_batch(max_n, &n); ulint pages= n.flushed; + ulint evicted= n.evicted; + + /* If we have exhausted flush quota, it is likely we exited before + generating enough free pages. Call once more with 0 flush to generate + free pages immediately as required. */ + if (pages >= max_n) + buf_do_LRU_batch(0, &n); - if (n.evicted) + evicted+= n.evicted; + if (evicted) { buf_pool.try_LRU_scan= true; pthread_cond_broadcast(&buf_pool.done_free); } else if (!pages && !buf_pool.try_LRU_scan) - /* For example, with the minimum innodb_buffer_pool_size=5M and + /* For example, with the minimum innodb_buffer_pool_size=6M and the default innodb_page_size=16k there are only a little over 316 pages in the buffer pool. The buffer pool can easily be exhausted by a workload of some dozen concurrent connections. 
The system could @@ -1760,8 +1803,9 @@ { ut_ad(!srv_read_only_mode); ut_ad(end_lsn >= next_checkpoint_lsn); - ut_ad(end_lsn <= get_lsn()); - ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= get_lsn() || + ut_d(const lsn_t current_lsn{get_lsn()}); + ut_ad(end_lsn <= current_lsn); + ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= current_lsn || srv_shutdown_state > SRV_SHUTDOWN_INITIATED); DBUG_PRINT("ib_log", @@ -1890,7 +1934,8 @@ ut_ad(!is_opened()); my_munmap(buf, file_size); buf= resize_buf; - set_buf_free(START_OFFSET + (get_lsn() - resizing)); + buf_size= unsigned(std::min(resize_target - START_OFFSET, + buf_size_max)); } else #endif @@ -1912,7 +1957,8 @@ resize_flush_buf= nullptr; resize_target= 0; resize_lsn.store(0, std::memory_order_relaxed); - writer_update(); + resize_initiator= nullptr; + writer_update(false); } log_resize_release(); @@ -1999,6 +2045,14 @@ if (recv_recovery_is_on()) recv_sys.apply(true); +#if defined HAVE_valgrind && !__has_feature(memory_sanitizer) + /* The built-in scheduler in Valgrind may neglect some threads for a + long time. Under Valgrind, let us explicitly wait for page write + completion in order to avoid a result difference in the test + innodb.page_cleaner. */ + os_aio_wait_until_no_pending_writes(false); +#endif + switch (srv_file_flush_method) { case SRV_NOSYNC: case SRV_O_DIRECT_NO_FSYNC: @@ -2016,9 +2070,9 @@ } /** Make a checkpoint. */ -ATTRIBUTE_COLD void log_make_checkpoint() +ATTRIBUTE_COLD void log_make_checkpoint() noexcept { - buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire)); + buf_flush_wait_flushed(log_get_lsn()); while (!log_checkpoint()); } @@ -2026,8 +2080,6 @@ NOTE: The calling thread is not allowed to hold any buffer page latches! 
*/ static void buf_flush_wait(lsn_t lsn) noexcept { - ut_ad(lsn <= log_sys.get_lsn()); - lsn_t oldest_lsn; while ((oldest_lsn= buf_pool.get_oldest_modification(lsn)) < lsn) @@ -2192,6 +2244,8 @@ MONITOR_FLUSH_SYNC_PAGES, n_flushed); } + os_aio_wait_until_no_pending_writes(false); + switch (srv_file_flush_method) { case SRV_NOSYNC: case SRV_O_DIRECT_NO_FSYNC: @@ -2234,13 +2288,13 @@ mysql_mutex_unlock(&buf_pool.flush_list_mutex); } -/** Check if the adpative flushing threshold is recommended based on +/** Check if the adaptive flushing threshold is recommended based on redo log capacity filled threshold. @param oldest_lsn buf_pool.get_oldest_modification() @return true if adaptive flushing is recommended. */ static bool af_needed_for_redo(lsn_t oldest_lsn) noexcept { - lsn_t age= (log_sys.get_lsn() - oldest_lsn); + lsn_t age= log_sys.get_lsn_approx() - oldest_lsn; lsn_t af_lwm= static_cast(srv_adaptive_flushing_lwm * static_cast(log_sys.log_capacity) / 100); @@ -2300,7 +2354,7 @@ lsn_t lsn_rate; ulint n_pages = 0; - const lsn_t cur_lsn = log_sys.get_lsn(); + const lsn_t cur_lsn = log_sys.get_lsn_approx(); ut_ad(oldest_lsn <= cur_lsn); ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn); time_t curr_time = time(nullptr); @@ -2309,13 +2363,23 @@ if (!prev_lsn || !pct_for_lsn) { prev_time = curr_time; prev_lsn = cur_lsn; - if (max_pct > 0.0) { - dirty_pct /= max_pct; - } - n_pages = ulint(dirty_pct * double(srv_io_capacity)); - if (n_pages < dirty_blocks) { - n_pages= std::min(srv_io_capacity, dirty_blocks); + if (srv_io_capacity >= dirty_blocks) { + n_pages = dirty_blocks; + } else { + if (max_pct > 1.0) { + dirty_pct/= max_pct; + } + n_pages= ulint(dirty_pct * double(srv_io_capacity)); + + if (n_pages < dirty_blocks) { + n_pages= srv_io_capacity; + + } else { + /* Set maximum IO capacity upper bound. 
*/ + n_pages= std::min(srv_max_io_capacity, + dirty_blocks); + } } func_exit: @@ -2412,6 +2476,13 @@ } TPOOL_SUPPRESS_TSAN +bool buf_pool_t::running_out() const noexcept +{ + return !recv_recovery_is_on() && + UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < n_blocks / 4; +} + +TPOOL_SUPPRESS_TSAN bool buf_pool_t::need_LRU_eviction() const noexcept { /* try_LRU_scan==false means that buf_LRU_get_free_block() is waiting @@ -2448,6 +2519,11 @@ DBUG_EXECUTE_IF("ib_page_cleaner_sleep", { std::this_thread::sleep_for(std::chrono::seconds(1)); + /* Cover the logging code in debug mode. */ + buf_pool.print_flush_info(); + buf_dblwr.lock(); + buf_dblwr.print_info(); + buf_dblwr.unlock(); }); lsn_limit= buf_flush_sync_lsn; @@ -2470,7 +2546,7 @@ (!UT_LIST_GET_LEN(buf_pool.flush_list) || srv_max_dirty_pages_pct_lwm == 0.0)) { - buf_pool.LRU_warned.clear(std::memory_order_release); + buf_pool.LRU_warned_clear(); /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex); @@ -2545,6 +2621,7 @@ buf_pool.n_flush_inc(); mysql_mutex_unlock(&buf_pool.flush_list_mutex); n= srv_max_io_capacity; + os_aio_wait_until_no_pending_writes(false); mysql_mutex_lock(&buf_pool.mutex); LRU_flush: n= buf_flush_LRU(n); @@ -2648,10 +2725,17 @@ !buf_pool.need_LRU_eviction()) goto check_oldest_and_set_idle; else + { mysql_mutex_lock(&buf_pool.mutex); + os_aio_wait_until_no_pending_writes(false); + } n= srv_max_io_capacity; n= n >= n_flushed ? n - n_flushed : 0; + /* It is critical to generate free pages to keep the system alive. Make + sure we are not hindered by dirty pages in LRU tail. 
*/ + n= std::max(n, std::min(srv_max_io_capacity, + buf_pool.LRU_scan_depth)); goto LRU_flush; } @@ -2689,11 +2773,13 @@ { mysql_mutex_assert_owner(&mutex); try_LRU_scan= false; - if (!LRU_warned.test_and_set(std::memory_order_acquire)) + if (!LRU_warned) + { + LRU_warned= true; sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!" - " %zu blocks are in use and %zu free." - " Consider increasing innodb_buffer_pool_size.", - UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free)); + " Consider increasing innodb_buffer_pool_size."); + print_flush_info(); + } } /** Initialize page_cleaner. */ @@ -2740,7 +2826,7 @@ NOTE: The calling thread is not allowed to hold any buffer page latches! */ void buf_flush_sync_batch(lsn_t lsn) noexcept { - lsn= std::max(lsn, log_sys.get_lsn()); + lsn= std::max(lsn, log_get_lsn()); mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_wait(lsn); mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -2759,24 +2845,77 @@ thd_wait_begin(nullptr, THD_WAIT_DISKIO); tpool::tpool_wait_begin(); - mysql_mutex_lock(&buf_pool.flush_list_mutex); - for (;;) + log_sys.latch.wr_lock(SRW_LOCK_CALL); + + for (lsn_t lsn= log_sys.get_lsn();;) { - const lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_wait(lsn); /* Wait for the page cleaner to be idle (for log resizing at startup) */ while (buf_flush_sync_lsn) my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); - if (lsn == log_sys.get_lsn()) + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + lsn_t new_lsn= log_sys.get_lsn(); + if (lsn == new_lsn) break; + lsn= new_lsn; } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); + log_sys.latch.wr_unlock(); tpool::tpool_wait_end(); thd_wait_end(nullptr); } +ATTRIBUTE_COLD void buf_pool_t::print_flush_info() const noexcept +{ + /* We do dirty read of UT_LIST count variable. 
*/ + size_t lru_size= UT_LIST_GET_LEN(LRU); + size_t dirty_size= UT_LIST_GET_LEN(flush_list); + size_t free_size= UT_LIST_GET_LEN(free); + size_t dirty_pct= lru_size ? dirty_size * 100 / (lru_size + free_size) : 0; + sql_print_information("InnoDB: Buffer Pool pages\n" + "-------------------\n" + "LRU Pages : %zu\n" + "Free Pages : %zu\n" + "Dirty Pages: %zu : %zu%%\n" + "-------------------", + lru_size, free_size, dirty_size, dirty_pct); + + lsn_t lsn= log_get_lsn(); + lsn_t clsn= log_sys.last_checkpoint_lsn; + sql_print_information("InnoDB: LSN flush parameters\n" + "-------------------\n" + "System LSN : %" PRIu64 "\n" + "Checkpoint LSN: %" PRIu64 "\n" + "Flush ASync LSN: %" PRIu64 "\n" + "Flush Sync LSN: %" PRIu64 "\n" + "-------------------", + lsn, clsn, buf_flush_async_lsn.load(), buf_flush_sync_lsn.load()); + + lsn_t age= lsn - clsn; + lsn_t age_pct= log_sys.max_checkpoint_age + ? age * 100 / log_sys.max_checkpoint_age : 0; + sql_print_information("InnoDB: LSN age parameters\n" + "-------------------\n" + "Current Age : %" PRIu64 " : %" PRIu64 "%%\n" + "Max Age(Async): %" PRIu64 "\n" + "Max Age(Sync) : %" PRIu64 "\n" + "Capacity : %" PRIu64 "\n" + "-------------------", + age, age_pct, log_sys.max_modified_age_async, log_sys.max_checkpoint_age, + log_sys.log_capacity); + + sql_print_information("InnoDB: Pending IO count\n" + "-------------------\n" + "Pending Read : %zu\n" + "Pending Write: %zu\n" + "-------------------", + os_aio_pending_reads_approx(), os_aio_pending_writes_approx()); +} + #ifdef UNIV_DEBUG /** Functor to validate the flush list. 
*/ struct Check { diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0lru.cc mariadb-10.11.13/storage/innobase/buf/buf0lru.cc --- mariadb-10.11.11/storage/innobase/buf/buf0lru.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0lru.cc 2025-05-19 16:14:25.000000000 +0000 @@ -38,6 +38,7 @@ #include "srv0srv.h" #include "srv0mon.h" #include "my_cpu.h" +#include "log.h" /** The number of blocks from the LRU_old pointer onward, including the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV @@ -133,7 +134,7 @@ buf_pool.stat.LRU_bytes += bpage->physical_size(); - ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size); + ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size()); } /** @return whether the unzip_LRU list should be used for evicting a victim @@ -259,89 +260,55 @@ return(freed); } -/** @return a buffer block from the buf_pool.free list -@retval NULL if the free list is empty */ -buf_block_t* buf_LRU_get_free_only() -{ - buf_block_t* block; - - mysql_mutex_assert_owner(&buf_pool.mutex); - - block = reinterpret_cast( - UT_LIST_GET_FIRST(buf_pool.free)); - - while (block != NULL) { - ut_ad(block->page.in_free_list); - ut_d(block->page.in_free_list = FALSE); - ut_ad(!block->page.oldest_modification()); - ut_ad(!block->page.in_LRU_list); - ut_a(!block->page.in_file()); - UT_LIST_REMOVE(buf_pool.free, &block->page); - - if (!buf_pool.is_shrinking() - || UT_LIST_GET_LEN(buf_pool.withdraw) - >= buf_pool.withdraw_target - || !buf_pool.will_be_withdrawn(block->page)) { - /* No adaptive hash index entries may point to - a free block. 
*/ - assert_block_ahi_empty(block); - - block->page.set_state(buf_page_t::MEMORY); - block->page.set_os_used(); - break; - } - - /* This should be withdrawn */ - UT_LIST_ADD_LAST(buf_pool.withdraw, &block->page); - ut_d(block->in_withdraw_list = true); - - block = reinterpret_cast( - UT_LIST_GET_FIRST(buf_pool.free)); - } - - return(block); -} - /******************************************************************//** Checks how much of buf_pool is occupied by non-data objects like AHI, lock heaps etc. Depending on the size of non-data objects this function will either assert or issue a warning and switch on the status monitor. */ -static void buf_LRU_check_size_of_non_data_objects() +static void buf_LRU_check_size_of_non_data_objects() noexcept { mysql_mutex_assert_owner(&buf_pool.mutex); - if (recv_recovery_is_on() || buf_pool.n_chunks_new != buf_pool.n_chunks) + if (recv_recovery_is_on()) return; - const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU); + const size_t curr_size{buf_pool.usable_size()}; - if (s < buf_pool.curr_size / 20) - ib::fatal() << "Over 95 percent of the buffer pool is" - " occupied by lock heaps" + auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU); + + if (s >= curr_size / 20); + else if (buf_pool.is_shrinking()) + buf_pool.LRU_warn(); + else + { + sql_print_error("[FATAL] InnoDB: Over 95 percent of the buffer pool is" + " occupied by lock heaps" #ifdef BTR_CUR_HASH_ADAPT - " or the adaptive hash index" + " or the adaptive hash index" #endif /* BTR_CUR_HASH_ADAPT */ - "! Check that your transactions do not set too many" - " row locks, or review if innodb_buffer_pool_size=" - << (buf_pool.curr_size >> (20U - srv_page_size_shift)) - << "M could be bigger."; + "! 
Check that your transactions do not set too many" + " row locks, or review if innodb_buffer_pool_size=%zuM" + " could be bigger", + curr_size >> (20 - srv_page_size_shift)); + abort(); + } - if (s < buf_pool.curr_size / 3) + if (s < curr_size / 3) { if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer) { /* Over 67 % of the buffer pool is occupied by lock heaps or the adaptive hash index. This may be a memory leak! */ - ib::warn() << "Over 67 percent of the buffer pool is" - " occupied by lock heaps" + sql_print_warning("InnoDB: Over 67 percent of the buffer pool is" + " occupied by lock heaps" #ifdef BTR_CUR_HASH_ADAPT - " or the adaptive hash index" + " or the adaptive hash index" #endif /* BTR_CUR_HASH_ADAPT */ - "! Check that your transactions do not set too many row locks." - " innodb_buffer_pool_size=" - << (buf_pool.curr_size >> (20U - srv_page_size_shift)) - << "M. Starting the InnoDB Monitor to print diagnostics."; + "! Check that your transactions do not set too many" + " row locks. innodb_buffer_pool_size=%zuM." 
+ " Starting the InnoDB Monitor to print diagnostics.", + curr_size >> (20 - srv_page_size_shift)); + buf_lru_switched_on_innodb_mon= true; srv_print_innodb_monitor= TRUE; srv_monitor_timer_schedule_now(); @@ -389,15 +356,15 @@ retry: /* If there is a block in the free list, take it */ - block= buf_LRU_get_free_only(); + block= buf_pool.allocate(); if (block) { got_block: const ulint LRU_size= UT_LIST_GET_LEN(buf_pool.LRU); const ulint available= UT_LIST_GET_LEN(buf_pool.free); - const ulint scan_depth= buf_pool.LRU_scan_depth / 2; - ut_ad(LRU_size <= BUF_LRU_MIN_LEN || - available >= scan_depth || buf_pool.need_LRU_eviction()); + const size_t scan_depth{buf_pool.LRU_scan_depth / 2}; + ut_ad(LRU_size <= BUF_LRU_MIN_LEN || available >= scan_depth || + buf_pool.is_shrinking() || buf_pool.need_LRU_eviction()); ut_d(bool signalled = false); @@ -446,7 +413,7 @@ waited= true; - while (!(block= buf_LRU_get_free_only())) + while (!(block= buf_pool.allocate())) { buf_pool.stat.LRU_waits++; @@ -811,10 +778,10 @@ if (zip || !bpage->zip.data || !bpage->frame) { break; } + mysql_mutex_lock(&buf_pool.flush_list_mutex); relocate_compressed: b = static_cast(ut_zalloc_nokey(sizeof *b)); ut_a(b); - mysql_mutex_lock(&buf_pool.flush_list_mutex); new (b) buf_page_t(*bpage); b->frame = nullptr; { @@ -833,7 +800,12 @@ hash_lock.unlock(); return(false); } - goto relocate_compressed; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (bpage->can_relocate()) { + goto relocate_compressed; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + goto func_exit; } mysql_mutex_assert_owner(&buf_pool.mutex); @@ -872,7 +844,6 @@ /* The fields of bpage were copied to b before buf_LRU_block_remove_hashed() was invoked. 
*/ - ut_ad(!b->in_zip_hash); ut_ad(b->in_LRU_list); ut_ad(b->in_page_hash); ut_d(b->in_page_hash = false); @@ -988,24 +959,12 @@ if (data != NULL) { block->page.zip.data = NULL; - buf_pool_mutex_exit_forbid(); - ut_ad(block->zip_size()); - buf_buddy_free(data, block->zip_size()); - - buf_pool_mutex_exit_allow(); page_zip_set_size(&block->page.zip, 0); } - if (buf_pool.is_shrinking() - && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target - && buf_pool.will_be_withdrawn(block->page)) { - /* This should be withdrawn */ - UT_LIST_ADD_LAST( - buf_pool.withdraw, - &block->page); - ut_d(block->in_withdraw_list = true); + if (buf_pool.to_withdraw() && buf_pool.withdraw(block->page)) { } else { UT_LIST_ADD_FIRST(buf_pool.free, &block->page); ut_d(block->page.in_free_list = true); @@ -1106,7 +1065,6 @@ MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size()); } - ut_ad(!bpage->in_zip_hash); buf_pool.page_hash.remove(chain, bpage); page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); @@ -1118,11 +1076,7 @@ ut_ad(!bpage->oldest_modification()); hash_lock.unlock(); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(bpage->zip.data, bpage->zip_size()); - - buf_pool_mutex_exit_allow(); bpage->lock.free(); ut_free(bpage); return false; @@ -1151,12 +1105,7 @@ ut_ad(!bpage->in_free_list); ut_ad(!bpage->oldest_modification()); ut_ad(!bpage->in_LRU_list); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, bpage->zip_size()); - - buf_pool_mutex_exit_allow(); - page_zip_set_size(&bpage->zip, 0); } @@ -1327,7 +1276,7 @@ ut_ad(!bpage->frame || reinterpret_cast(bpage) ->in_unzip_LRU_list - == bpage->belongs_to_unzip_LRU()); + == !!bpage->zip.data); if (bpage->is_old()) { const buf_page_t* prev diff -Nru mariadb-10.11.11/storage/innobase/buf/buf0rea.cc mariadb-10.11.13/storage/innobase/buf/buf0rea.cc --- mariadb-10.11.11/storage/innobase/buf/buf0rea.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/buf/buf0rea.cc 2025-05-19 
16:14:25.000000000 +0000 @@ -44,7 +44,7 @@ #include "log.h" #include "mariadb_stats.h" -/** If there are buf_pool.curr_size per the number below pending reads, then +/** If there are buf_pool.curr_size() per the number below pending reads, then read-ahead is not done: this is to prevent flooding the buffer pool with i/o-fixed buffer blocks */ #define BUF_READ_AHEAD_PEND_LIMIT 2 @@ -63,7 +63,6 @@ ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked()); ut_ad(w >= &watch[0]); ut_ad(w < &watch[array_elements(watch)]); - ut_ad(!w->in_zip_hash); ut_ad(!w->zip.data); uint32_t s{w->state()}; @@ -372,7 +371,7 @@ return 0; if (os_aio_pending_reads_approx() > - buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + buf_pool.curr_size() / BUF_READ_AHEAD_PEND_LIMIT) return 0; fil_space_t* space= fil_space_t::get(page_id.space()); @@ -525,7 +524,7 @@ return 0; if (os_aio_pending_reads_approx() > - buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + buf_pool.curr_size() / BUF_READ_AHEAD_PEND_LIMIT) return 0; const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0defrag_bg.cc mariadb-10.11.13/storage/innobase/dict/dict0defrag_bg.cc --- mariadb-10.11.11/storage/innobase/dict/dict0defrag_bg.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0defrag_bg.cc 2025-05-19 16:14:25.000000000 +0000 @@ -196,7 +196,7 @@ ? 
dict_table_find_index_on_id(table, index_id) : nullptr) if (index->is_btree()) dict_stats_save_defrag_stats(index); - dict_table_close(table, false, thd, mdl); + dict_table_close(table, thd, mdl); } } @@ -217,47 +217,17 @@ if (index->is_ibuf()) return DB_SUCCESS; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; - dict_table_t *table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) - { -release_and_exit: - if (table_stats) - dict_table_close(table_stats, false, thd, mdl_table); + dict_stats stats; + if (stats.open(thd)) return DB_STATS_DO_NOT_EXIST; - } - - dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) - goto release_and_exit; - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) - { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - trx_t *trx= trx_create(); trx->mysql_thd= thd; trx_start_internal(trx); dberr_t ret= trx->read_only ? 
DB_READ_ONLY - : lock_table_for_trx(table_stats, trx, LOCK_X); + : lock_table_for_trx(stats.table(), trx, LOCK_X); if (ret == DB_SUCCESS) - ret= lock_table_for_trx(index_stats, trx, LOCK_X); + ret= lock_table_for_trx(stats.index(), trx, LOCK_X); row_mysql_lock_data_dictionary(trx); if (ret == DB_SUCCESS) ret= dict_stats_save_index_stat(index, time(nullptr), "n_pages_freed", @@ -271,31 +241,27 @@ else trx->rollback(); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); - row_mysql_unlock_data_dictionary(trx); trx->free(); + stats.close(); return ret; } /**************************************************************//** Gets the number of reserved and used pages in a B-tree. -@return number of pages reserved, or ULINT_UNDEFINED if the index -is unavailable */ +@return number of pages reserved +@retval 0 if the index is unavailable */ static -ulint +uint32_t btr_get_size_and_reserved( dict_index_t* index, /*!< in: index */ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ - ulint* used, /*!< out: number of pages used (<= reserved) */ + uint32_t* used, /*!< out: number of pages used (<= reserved) */ mtr_t* mtr) /*!< in/out: mini-transaction where index is s-latched */ { - ulint dummy; + uint32_t dummy; ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK)); ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); @@ -304,19 +270,19 @@ || dict_index_is_online_ddl(index) || !index->is_committed() || !index->table->space) { - return(ULINT_UNDEFINED); + return 0; } dberr_t err; buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); *used = 0; if (!root) { - return ULINT_UNDEFINED; + return 0; } mtr->x_lock_space(index->table->space); - ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + auto n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + root->page.frame, used, mtr); if (flag == BTR_TOTAL_SIZE) { n += 
fseg_n_reserved_pages(*root, @@ -343,59 +309,28 @@ const time_t now= time(nullptr); mtr_t mtr; - ulint n_leaf_pages; + uint32_t n_leaf_pages; mtr.start(); mtr_sx_lock_index(index, &mtr); - ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, - &n_leaf_pages, &mtr); + uint32_t n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); mtr.commit(); - if (n_leaf_reserved == ULINT_UNDEFINED) + if (!n_leaf_reserved) return DB_SUCCESS; - THD *thd= current_thd; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; - dict_table_t* table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) - { -release_and_exit: - if (table_stats) - dict_table_close(table_stats, false, thd, mdl_table); + THD *const thd= current_thd; + dict_stats stats; + if (stats.open(thd)) return DB_STATS_DO_NOT_EXIST; - } - - dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) - goto release_and_exit; - - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) - { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - trx_t *trx= trx_create(); trx->mysql_thd= thd; trx_start_internal(trx); dberr_t ret= trx->read_only ? 
DB_READ_ONLY - : lock_table_for_trx(table_stats, trx, LOCK_X); + : lock_table_for_trx(stats.table(), trx, LOCK_X); if (ret == DB_SUCCESS) - ret= lock_table_for_trx(index_stats, trx, LOCK_X); + ret= lock_table_for_trx(stats.index(), trx, LOCK_X); row_mysql_lock_data_dictionary(trx); @@ -423,12 +358,9 @@ else trx->rollback(); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); trx->free(); + stats.close(); return ret; } diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0dict.cc mariadb-10.11.13/storage/innobase/dict/dict0dict.cc --- mariadb-10.11.11/storage/innobase/dict/dict0dict.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0dict.cc 2025-05-19 16:14:25.000000000 +0000 @@ -44,6 +44,7 @@ #include "btr0cur.h" #include "btr0sea.h" #include "buf0buf.h" +#include "buf0flu.h" #include "data0type.h" #include "dict0boot.h" #include "dict0load.h" @@ -195,71 +196,6 @@ return(FALSE); } -/** Decrement the count of open handles */ -void dict_table_close(dict_table_t *table) -{ - if (table->get_ref_count() == 1 && - dict_stats_is_persistent_enabled(table) && - strchr(table->name.m_name, '/')) - { - /* It looks like we are closing the last handle. The user could - have executed FLUSH TABLES in order to have the statistics reloaded - from the InnoDB persistent statistics tables. We must acquire - exclusive dict_sys.latch to prevent a race condition with another - thread concurrently acquiring a handle on the table. */ - dict_sys.lock(SRW_LOCK_CALL); - if (table->release()) - { - table->stats_mutex_lock(); - if (table->get_ref_count() == 0) - dict_stats_deinit(table); - table->stats_mutex_unlock(); - } - dict_sys.unlock(); - } - else - table->release(); -} - -/** Decrements the count of open handles of a table. 
-@param[in,out] table table -@param[in] dict_locked whether dict_sys.latch is being held -@param[in] thd thread to release MDL -@param[in] mdl metadata lock or NULL if the thread - is a foreground one. */ -void -dict_table_close( - dict_table_t* table, - bool dict_locked, - THD* thd, - MDL_ticket* mdl) -{ - if (!dict_locked) - dict_table_close(table); - else - { - if (table->release() && dict_stats_is_persistent_enabled(table) && - strchr(table->name.m_name, '/')) - { - /* Force persistent stats re-read upon next open of the table so - that FLUSH TABLE can be used to forcibly fetch stats from disk if - they have been manually modified. */ - table->stats_mutex_lock(); - if (table->get_ref_count() == 0) - dict_stats_deinit(table); - table->stats_mutex_unlock(); - } - - ut_ad(dict_lru_validate()); - ut_ad(dict_sys.find(table)); - } - - if (!thd || !mdl); - else if (MDL_context *mdl_context= static_cast - (thd_mdl_context(thd))) - mdl_context->release_lock(mdl); -} - /** Check if the table has a given (non_virtual) column. @param[in] table table object @param[in] col_name column name @@ -586,6 +522,14 @@ return(ULINT_UNDEFINED); } +void mdl_release(THD *thd, MDL_ticket *mdl) noexcept +{ + if (!thd || !mdl); + else if (MDL_context *mdl_context= static_cast + (thd_mdl_context(thd))) + mdl_context->release_lock(mdl); +} + /** Parse the table file name into table name and database name. 
@tparam dict_frozen whether the caller holds dict_sys.latch @param[in,out] db_name database name buffer @@ -694,32 +638,28 @@ MDL_context *mdl_context, MDL_ticket **mdl, dict_table_op_t table_op) { - table_id_t table_id= table->id; char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1]; char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1]; size_t db_len, tbl_len; - bool unaccessible= false; if (!table->parse_name(db_buf, tbl_buf, &db_len, &tbl_len)) /* The name of an intermediate table starts with #sql */ return table; retry: - if (!unaccessible && (!table->is_readable() || table->corrupted)) + ut_ad(!trylock == dict_sys.frozen()); + + if (!table->is_readable() || table->corrupted) { if (*mdl) { mdl_context->release_lock(*mdl); *mdl= nullptr; } - unaccessible= true; + return nullptr; } - if (!trylock) - table->release(); - - if (unaccessible) - return nullptr; + const table_id_t table_id{table->id}; if (!trylock) dict_sys.unfreeze(); @@ -748,11 +688,38 @@ } } + size_t db1_len, tbl1_len; +lookup: dict_sys.freeze(SRW_LOCK_CALL); table= dict_sys.find_table(table_id); if (table) - table->acquire(); - if (!table && table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED) + { + if (!table->is_accessible()) + { + table= nullptr; + unlock_and_return_without_mdl: + if (trylock) + dict_sys.unfreeze(); + return_without_mdl: + if (*mdl) + { + mdl_context->release_lock(*mdl); + *mdl= nullptr; + } + return table; + } + + if (trylock) + table->acquire(); + + if (!table->parse_name(db_buf1, tbl_buf1, &db1_len, &tbl1_len)) + { + /* The table was renamed to #sql prefix. + Release MDL (if any) for the old name and return. */ + goto unlock_and_return_without_mdl; + } + } + else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED) { dict_sys.unfreeze(); dict_sys.lock(SRW_LOCK_CALL); @@ -760,33 +727,19 @@ table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK : DICT_ERR_IGNORE_FK_NOKEY); - if (table) - table->acquire(); dict_sys.unlock(); - dict_sys.freeze(SRW_LOCK_CALL); - } - - if (!table || !table->is_accessible()) - { -return_without_mdl: - if (trylock) - dict_sys.unfreeze(); - if (*mdl) - { - mdl_context->release_lock(*mdl); - *mdl= nullptr; - } - return nullptr; - } - - size_t db1_len, tbl1_len; - - if (!table->parse_name(db_buf1, tbl_buf1, &db1_len, &tbl1_len)) - { - /* The table was renamed to #sql prefix. - Release MDL (if any) for the old name and return. */ + /* At this point, the freshly loaded table may already have been evicted. + We must look it up again while holding a shared dict_sys.latch. We keep + trying this until the table is found in the cache or it cannot be found + in the dictionary (because the table has been dropped or rebuilt). */ + if (table) + goto lookup; + if (!trylock) + dict_sys.freeze(SRW_LOCK_CALL); goto return_without_mdl; } + else + goto return_without_mdl; if (*mdl) { @@ -873,6 +826,7 @@ dict_table_op_t table_op, THD *thd, MDL_ticket **mdl) { +retry: if (!dict_locked) dict_sys.freeze(SRW_LOCK_CALL); @@ -880,9 +834,21 @@ if (table) { - table->acquire(); - if (thd && !dict_locked) - table= dict_acquire_mdl_shared(table, thd, mdl, table_op); + if (!dict_locked) + { + if (thd) + { + table= dict_acquire_mdl_shared(table, thd, mdl, table_op); + if (table) + goto acquire; + } + else + acquire: + table->acquire(); + dict_sys.unfreeze(); + } + else + table->acquire(); } else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED) { @@ -895,24 +861,16 @@ table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK : DICT_ERR_IGNORE_FK_NOKEY); - if (table) - table->acquire(); if (!dict_locked) { dict_sys.unlock(); - if (table && thd) - { - dict_sys.freeze(SRW_LOCK_CALL); - table= dict_acquire_mdl_shared(table, thd, mdl, table_op); - dict_sys.unfreeze(); - } - return table; + if (table) + goto retry; } + else if (table) + table->acquire(); } - if (!dict_locked) - dict_sys.unfreeze(); - return table; } @@ -975,7 +933,7 @@ UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU); UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU); - const ulint hash_size = buf_pool_get_curr_size() + const ulint hash_size = buf_pool.curr_pool_size() / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); table_hash.create(hash_size); @@ -1012,7 +970,10 @@ const ulong threshold= srv_fatal_semaphore_wait_threshold; if (waited >= threshold) + { + buf_pool.print_flush_info(); ib::fatal() << fatal_msg; + } if (waited > threshold / 4) ib::warn() << "A long wait (" << waited @@ -1129,6 +1090,55 @@ DBUG_RETURN(table); } +bool dict_stats::open(THD *thd) noexcept +{ + ut_ad(!mdl_table); + ut_ad(!mdl_index); + ut_ad(!table_stats); + ut_ad(!index_stats); + ut_ad(!mdl_context); + + mdl_context= static_cast(thd_mdl_context(thd)); + if (!mdl_context) + return true; + /* FIXME: use compatible type, and maybe remove this parameter altogether! 
*/ + const double timeout= double(global_system_variables.lock_wait_timeout); + MDL_request request; + MDL_REQUEST_INIT(&request, MDL_key::TABLE, "mysql", "innodb_table_stats", + MDL_SHARED, MDL_EXPLICIT); + if (UNIV_UNLIKELY(mdl_context->acquire_lock(&request, timeout))) + return true; + mdl_table= request.ticket; + MDL_REQUEST_INIT(&request, MDL_key::TABLE, "mysql", "innodb_index_stats", + MDL_SHARED, MDL_EXPLICIT); + if (UNIV_UNLIKELY(mdl_context->acquire_lock(&request, timeout))) + goto release_mdl; + mdl_index= request.ticket; + table_stats= dict_table_open_on_name("mysql/innodb_table_stats", false, + DICT_ERR_IGNORE_NONE); + if (!table_stats) + goto release_mdl; + index_stats= dict_table_open_on_name("mysql/innodb_index_stats", false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + return false; + + table_stats->release(); +release_mdl: + if (mdl_index) + mdl_context->release_lock(mdl_index); + mdl_context->release_lock(mdl_table); + return true; +} + +void dict_stats::close() noexcept +{ + table_stats->release(); + index_stats->release(); + mdl_context->release_lock(mdl_table); + mdl_context->release_lock(mdl_index); +} + /**********************************************************************//** Adds system columns to a table object. */ void @@ -4389,7 +4399,7 @@ table_id_hash.free(); temp_id_hash.free(); - const ulint hash_size = buf_pool_get_curr_size() + const ulint hash_size = buf_pool.curr_pool_size() / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); table_hash.create(hash_size); table_id_hash.create(hash_size); diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0load.cc mariadb-10.11.13/storage/innobase/dict/dict0load.cc --- mariadb-10.11.11/storage/innobase/dict/dict0load.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0load.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2513,10 +2513,12 @@ if (!table->is_readable()) { /* Don't attempt to load the indexes from disk. 
*/ } else if (err == DB_SUCCESS) { + auto i = fk_tables.size(); err = dict_load_foreigns(table->name.m_name, nullptr, 0, true, ignore_err, fk_tables); if (err != DB_SUCCESS) { + fk_tables.erase(fk_tables.begin() + i, fk_tables.end()); ib::warn() << "Load table " << table->name << " failed, the table has missing" " foreign key indexes. Turn off" diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0stats.cc mariadb-10.11.13/storage/innobase/dict/dict0stats.cc --- mariadb-10.11.11/storage/innobase/dict/dict0stats.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0stats.cc 2025-05-19 16:14:25.000000000 +0000 @@ -359,7 +359,7 @@ if (!table) { if (opt_bootstrap) - return DB_TABLE_NOT_FOUND; + return DB_STATS_DO_NOT_EXIST; if (req_schema == &table_stats_schema) { if (innodb_table_stats_not_found_reported) { return DB_STATS_DO_NOT_EXIST; @@ -377,10 +377,10 @@ snprintf(errstr, errstr_sz, "Table %s not found.", req_schema->table_name_sql); - return DB_TABLE_NOT_FOUND; + return DB_STATS_DO_NOT_EXIST; } - if (!table->is_readable() && !table->space) { + if (!table->is_readable() || !table->space) { /* missing tablespace */ snprintf(errstr, errstr_sz, "Tablespace for table %s is missing.", @@ -491,11 +491,8 @@ return DB_SUCCESS; } -/*********************************************************************//** -Checks whether the persistent statistics storage exists and that all -tables have the proper structure. 
-@return true if exists and all tables are ok */ -static bool dict_stats_persistent_storage_check(bool dict_already_locked) +dict_stats_schema_check +dict_stats_persistent_storage_check(bool dict_already_locked) noexcept { char errstr[512]; dberr_t ret; @@ -521,14 +518,14 @@ switch (ret) { case DB_SUCCESS: - return true; + return SCHEMA_OK; + case DB_STATS_DO_NOT_EXIST: + return SCHEMA_NOT_EXIST; default: if (!opt_bootstrap) { - ib::error() << errstr; + sql_print_error("InnoDB: %s", errstr); } - /* fall through */ - case DB_STATS_DO_NOT_EXIST: - return false; + return SCHEMA_INVALID; } } @@ -544,13 +541,16 @@ { ut_ad(dict_sys.locked()); - if (!dict_stats_persistent_storage_check(true)) - { - pars_info_free(pinfo); - return DB_STATS_DO_NOT_EXIST; + switch (dict_stats_persistent_storage_check(true)) { + case SCHEMA_OK: + return que_eval_sql(pinfo, sql, trx); + case SCHEMA_INVALID: + case SCHEMA_NOT_EXIST: + break; } - return que_eval_sql(pinfo, sql, trx); + pars_info_free(pinfo); + return DB_STATS_DO_NOT_EXIST; } @@ -599,7 +599,7 @@ table->stat_clustered_index_size = 1; /* 1 page for each index, not counting the clustered */ table->stat_sum_of_other_index_sizes - = UT_LIST_GET_LEN(table->indexes) - 1; + = uint32_t(UT_LIST_GET_LEN(table->indexes) - 1); table->stat_modified_counter = 0; dict_index_t* index; @@ -617,7 +617,7 @@ dict_stats_empty_index(index, empty_defrag_stats); } - table->stat_initialized = TRUE; + table->stat = table->stat | dict_table_t::STATS_INITIALIZED; table->stats_mutex_unlock(); } @@ -658,16 +658,10 @@ /*==========================*/ const dict_table_t* table) /*!< in: table */ { - ut_a(table->stat_initialized); - MEM_CHECK_DEFINED(&table->stats_last_recalc, sizeof table->stats_last_recalc); - MEM_CHECK_DEFINED(&table->stat_persistent, - sizeof table->stat_persistent); - - MEM_CHECK_DEFINED(&table->stats_auto_recalc, - sizeof table->stats_auto_recalc); + MEM_CHECK_DEFINED(&table->stat, sizeof table->stat); 
MEM_CHECK_DEFINED(&table->stats_sample_pages, sizeof table->stats_sample_pages); @@ -844,8 +838,8 @@ ulint n_cols; ib_uint64_t* n_diff; ib_uint64_t* n_not_null; - ibool stats_null_not_equal; - uintmax_t n_sample_pages=1; /* number of pages to sample */ + bool stats_null_not_equal; + uint32_t n_sample_pages=1; /* number of pages to sample */ ulint not_empty_flag = 0; ulint total_external_size = 0; uintmax_t add_on; @@ -883,11 +877,11 @@ case SRV_STATS_NULLS_UNEQUAL: /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL case, we will treat NULLs as unequal value */ - stats_null_not_equal = TRUE; + stats_null_not_equal = true; break; case SRV_STATS_NULLS_EQUAL: - stats_null_not_equal = FALSE; + stats_null_not_equal = false; break; default: @@ -938,19 +932,21 @@ so taking all case2 paths is I, our expression is: n_pages = S < I? min(I,L) : I - */ - if (index->stat_index_size > 1) { - n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size) - ? ut_min(index->stat_index_size, - static_cast( - log2(double(index->stat_index_size)) - * double(srv_stats_transient_sample_pages))) - : index->stat_index_size; + */ + if (uint32_t I = index->stat_index_size) { + const uint32_t S{srv_stats_transient_sample_pages}; + n_sample_pages = S < I + ? std::min(I, + uint32_t(log2(double(I)) + * double(S))) + : I; } } /* Sanity check */ - ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size)); + ut_ad(n_sample_pages); + ut_ad(n_sample_pages <= (index->stat_index_size <= 1 + ? 
1 : index->stat_index_size)); /* We sample some pages in the index to get an estimate */ btr_cur_t cursor; @@ -1169,7 +1165,7 @@ mtr.x_lock_space(index->table->space); - ulint dummy, size; + uint32_t dummy, size; index->stat_index_size = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF @@ -1209,24 +1205,12 @@ return err; } -/*********************************************************************//** -Calculates new estimates for table and index statistics. This function -is relatively quick and is used to calculate transient statistics that -are not saved on disk. -This was the only way to calculate statistics before the -Persistent Statistics feature was introduced. -@return error code -@retval DB_SUCCESS_LOCKED REC if the table under bulk insert operation */ -static -dberr_t -dict_stats_update_transient( -/*========================*/ - dict_table_t* table) /*!< in/out: table */ +dberr_t dict_stats_update_transient(dict_table_t *table) noexcept { ut_ad(!table->stats_mutex_is_owner()); dict_index_t* index; - ulint sum_of_index_sizes = 0; + uint32_t sum_of_index_sizes = 0; dberr_t err = DB_SUCCESS; /* Find out the sizes of the indexes and how many different values @@ -1234,17 +1218,16 @@ index = dict_table_get_first_index(table); - if (!table->space) { - /* Nothing to do. */ -empty_table: + if (!index || !table->space) { dict_stats_empty_table(table, true); - return err; - } else if (index == NULL) { - /* Table definition is corrupt */ + return DB_SUCCESS; + } - ib::warn() << "Table " << table->name - << " has no indexes. 
Cannot calculate statistics."; - goto empty_table; + if (trx_id_t bulk_trx_id = table->bulk_trx_id) { + if (trx_sys.find(nullptr, bulk_trx_id, false)) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } } for (; index != NULL; index = dict_table_get_next_index(index)) { @@ -1285,7 +1268,7 @@ table->stat_modified_counter = 0; - table->stat_initialized = TRUE; + table->stat = table->stat | dict_table_t::STATS_INITIALIZED; table->stats_mutex_unlock(); @@ -2225,8 +2208,8 @@ struct index_stats_t { std::vector stats; - ulint index_size; - ulint n_leaf_pages; + uint32_t index_size; + uint32_t n_leaf_pages; index_stats_t(ulint n_uniq) : index_size(1), n_leaf_pages(1) { @@ -2365,7 +2348,7 @@ uint16_t root_level = btr_page_get_level(root->page.frame); mtr.x_lock_space(index->table->space); - ulint dummy, size; + uint32_t dummy, size; result.index_size = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + root->page.frame, &size, &mtr) @@ -2635,17 +2618,7 @@ DBUG_RETURN(result); } -/*********************************************************************//** -Calculates new estimates for table and index statistics. This function -is relatively slow and is used to calculate persistent statistics that -will be saved on disk. 
-@return DB_SUCCESS or error code -@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ -static -dberr_t -dict_stats_update_persistent( -/*=========================*/ - dict_table_t* table) /*!< in/out: table */ +dberr_t dict_stats_update_persistent(dict_table_t *table) noexcept { dict_index_t* index; @@ -2653,6 +2626,13 @@ DEBUG_SYNC_C("dict_stats_update_persistent"); + if (trx_id_t bulk_trx_id = table->bulk_trx_id) { + if (trx_sys.find(nullptr, bulk_trx_id, false)) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + } + /* analyze the clustered index first */ index = dict_table_get_first_index(table); @@ -2742,7 +2722,7 @@ table->stat_modified_counter = 0; - table->stat_initialized = TRUE; + table->stat = table->stat | dict_table_t::STATS_INITIALIZED; dict_stats_assert_initialized(table); @@ -2751,6 +2731,18 @@ return(DB_SUCCESS); } +dberr_t dict_stats_update_persistent_try(dict_table_t *table) +{ + if (table->stats_is_persistent() && + dict_stats_persistent_storage_check(false) == SCHEMA_OK) + { + if (dberr_t err= dict_stats_update_persistent(table)) + return err; + return dict_stats_save(table); + } + return DB_SUCCESS; +} + #include "mysql_com.h" /** Save an individual index's statistic into the persistent statistics storage. @@ -2829,14 +2821,14 @@ "END;", trx); if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { - if (innodb_index_stats_not_found == false && - index->stats_error_printed == false) { + if (innodb_index_stats_not_found == false + && !index->table->stats_error_printed) { + index->table->stats_error_printed = true; ib::error() << "Cannot save index statistics for table " << index->table->name << ", index " << index->name << ", stat name \"" << stat_name << "\": " << ret; - index->stats_error_printed = true; } } @@ -2878,27 +2870,29 @@ return err; } -/** Save the table's statistics into the persistent statistics storage. 
-@param[in] table table whose stats to save -@param[in] only_for_index if this is non-NULL, then stats for indexes -that are not equal to it will not be saved, if NULL, then all indexes' stats -are saved +/** Save the persistent statistics of a table or an index. +@param table table whose stats to save +@param only_for_index the index ID to save statistics for (0=all) @return DB_SUCCESS or error code */ -static -dberr_t -dict_stats_save( - dict_table_t* table, - const index_id_t* only_for_index) +dberr_t dict_stats_save(dict_table_t* table, index_id_t index_id) { pars_info_t* pinfo; char db_utf8[MAX_DB_UTF8_LEN]; char table_utf8[MAX_TABLE_UTF8_LEN]; + THD* const thd = current_thd; #ifdef ENABLED_DEBUG_SYNC DBUG_EXECUTE_IF("dict_stats_save_exit_notify", + SCOPE_EXIT([thd] { + debug_sync_set_action(thd, + STRING_WITH_LEN("now SIGNAL dict_stats_save_finished")); + }); + ); + DBUG_EXECUTE_IF("dict_stats_save_exit_notify_and_wait", SCOPE_EXIT([] { debug_sync_set_action(current_thd, - STRING_WITH_LEN("now SIGNAL dict_stats_save_finished")); + STRING_WITH_LEN("now SIGNAL dict_stats_save_finished" + " WAIT_FOR dict_stats_save_unblock")); }); ); #endif /* ENABLED_DEBUG_SYNC */ @@ -2911,41 +2905,10 @@ return (dict_stats_report_error(table)); } - THD* thd = current_thd; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; - dict_table_t* table_stats = dict_table_open_on_name( - TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared(table_stats, thd, - &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats - || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { -release_and_exit: - if (table_stats) { - dict_table_close(table_stats, false, thd, mdl_table); - } + dict_stats stats; + if (stats.open(thd)) { return DB_STATS_DO_NOT_EXIST; } - - dict_table_t* index_stats = dict_table_open_on_name( - INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (index_stats) { - 
dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared(index_stats, thd, - &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) { - goto release_and_exit; - } - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), table_utf8, sizeof(table_utf8)); const time_t now = time(NULL); @@ -2954,9 +2917,9 @@ trx_start_internal(trx); dberr_t ret = trx->read_only ? DB_READ_ONLY - : lock_table_for_trx(table_stats, trx, LOCK_X); + : lock_table_for_trx(stats.table(), trx, LOCK_X); if (ret == DB_SUCCESS) { - ret = lock_table_for_trx(index_stats, trx, LOCK_X); + ret = lock_table_for_trx(stats.index(), trx, LOCK_X); } if (ret != DB_SUCCESS) { if (trx->state != TRX_STATE_NOT_STARTED) { @@ -3002,8 +2965,14 @@ "END;", trx); if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { - ib::error() << "Cannot save table statistics for table " - << table->name << ": " << ret; + sql_print_error("InnoDB: Cannot save table statistics for" +#ifdef EMBEDDED_LIBRARY + " table %.*s.%s: %s", +#else + " table %`.*s.%`s: %s", +#endif + int(table->name.dblen()), table->name.m_name, + table->name.basename(), ut_strerr(ret)); rollback_and_exit: trx->rollback(); free_and_exit: @@ -3011,8 +2980,7 @@ dict_sys.unlock(); unlocked_free_and_exit: trx->free(); - dict_table_close(table_stats, false, thd, mdl_table); - dict_table_close(index_stats, false, thd, mdl_index); + stats.close(); return ret; } @@ -3046,7 +3014,7 @@ index = it->second; - if (only_for_index != NULL && index->id != *only_for_index) { + if (index_id != 0 && index->id != index_id) { continue; } @@ -3116,6 +3084,14 @@ goto free_and_exit; } +void dict_stats_empty_table_and_save(dict_table_t *table) +{ + dict_stats_empty_table(table, true); + if (table->stats_is_persistent() && + dict_stats_persistent_storage_check(false) == SCHEMA_OK) + dict_stats_save(table); +} + 
/*********************************************************************//** Called for the row that is selected by SELECT ... FROM mysql.innodb_table_stats WHERE table='...' @@ -3164,8 +3140,7 @@ ut_a(len == 8); table->stat_clustered_index_size - = std::max( - (ulint) mach_read_from_8(data), 1); + = std::max(mach_read_from_4(data + 4), 1U); break; } @@ -3174,18 +3149,9 @@ ut_a(dtype_get_mtype(type) == DATA_INT); ut_a(len == 8); - ulint stat_other_idx_size - = (ulint) mach_read_from_8(data); - if (!stat_other_idx_size - && UT_LIST_GET_LEN(table->indexes) > 1) { - stat_other_idx_size - = UT_LIST_GET_LEN(table->indexes) - 1; - } - table->stat_sum_of_other_index_sizes - = std::max( - (ulint) mach_read_from_8(data), - UT_LIST_GET_LEN(table->indexes) - 1); - + table->stat_sum_of_other_index_sizes = std::max( + mach_read_from_4(data + 4), + uint32_t(UT_LIST_GET_LEN(table->indexes) - 1)); break; } default: @@ -3370,14 +3336,12 @@ if (stat_name_len == 4 /* strlen("size") */ && strncasecmp("size", stat_name, stat_name_len) == 0) { - index->stat_index_size - = std::max((ulint) stat_value, 1); + index->stat_index_size = std::max(uint32_t(stat_value), 1U); arg->stats_were_modified = true; } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */ && strncasecmp("n_leaf_pages", stat_name, stat_name_len) == 0) { - index->stat_n_leaf_pages - = std::max((ulint) stat_value, 1); + index->stat_n_leaf_pages = std::max(uint32_t(stat_value), 1U); arg->stats_were_modified = true; } else if (stat_name_len == 12 /* strlen("n_page_split") */ && strncasecmp("n_page_split", stat_name, stat_name_len) @@ -3477,19 +3441,11 @@ return(TRUE); } -/*********************************************************************//** -Read table's statistics from the persistent statistics storage. -@return DB_SUCCESS or error code */ -static -dberr_t -dict_stats_fetch_from_ps( -/*=====================*/ - dict_table_t* table) /*!< in/out: table */ +/** Read the stored persistent statistics of a table. 
*/ +dberr_t dict_stats_fetch_from_ps(dict_table_t *table) { index_fetch_t index_fetch_arg; - trx_t* trx; pars_info_t* pinfo; - dberr_t ret; char db_utf8[MAX_DB_UTF8_LEN]; char table_utf8[MAX_TABLE_UTF8_LEN]; @@ -3499,49 +3455,16 @@ stats. */ dict_stats_empty_table(table, true); - THD* thd = current_thd; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; - dict_table_t* table_stats = dict_table_open_on_name( - TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared(table_stats, thd, - &mdl_table); - dict_sys.unfreeze(); - } - if (!table_stats - || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { -release_and_exit: - if (table_stats) { - dict_table_close(table_stats, false, thd, mdl_table); - } + THD* const thd = current_thd; + dict_stats stats; + if (stats.open(thd)) { return DB_STATS_DO_NOT_EXIST; } - dict_table_t* index_stats = dict_table_open_on_name( - INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (index_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared(index_stats, thd, - &mdl_index); - dict_sys.unfreeze(); - } - if (!index_stats) { - goto release_and_exit; - } - if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { - dict_table_close(index_stats, false, thd, mdl_index); - goto release_and_exit; - } - #ifdef ENABLED_DEBUG_SYNC DEBUG_SYNC(thd, "dict_stats_mdl_acquired"); #endif /* ENABLED_DEBUG_SYNC */ - trx = trx_create(); - - trx_start_internal_read_only(trx); - dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), table_utf8, sizeof(table_utf8)); @@ -3562,76 +3485,77 @@ "fetch_index_stats_step", dict_stats_fetch_index_stats_step, &index_fetch_arg); - dict_sys.lock(SRW_LOCK_CALL); /* FIXME: remove this */ - ret = que_eval_sql(pinfo, - "PROCEDURE FETCH_STATS () IS\n" - "found INT;\n" - "DECLARE FUNCTION fetch_table_stats_step;\n" - "DECLARE FUNCTION fetch_index_stats_step;\n" - "DECLARE CURSOR table_stats_cur IS\n" - 
" SELECT\n" - /* if you change the selected fields, be - sure to adjust - dict_stats_fetch_table_stats_step() */ - " n_rows,\n" - " clustered_index_size,\n" - " sum_of_other_index_sizes\n" - " FROM \"" TABLE_STATS_NAME "\"\n" - " WHERE\n" - " database_name = :database_name AND\n" - " table_name = :table_name;\n" - "DECLARE CURSOR index_stats_cur IS\n" - " SELECT\n" - /* if you change the selected fields, be - sure to adjust - dict_stats_fetch_index_stats_step() */ - " index_name,\n" - " stat_name,\n" - " stat_value,\n" - " sample_size\n" - " FROM \"" INDEX_STATS_NAME "\"\n" - " WHERE\n" - " database_name = :database_name AND\n" - " table_name = :table_name;\n" - - "BEGIN\n" - - "OPEN table_stats_cur;\n" - "FETCH table_stats_cur INTO\n" - " fetch_table_stats_step();\n" - "IF (SQL % NOTFOUND) THEN\n" - " CLOSE table_stats_cur;\n" - " RETURN;\n" - "END IF;\n" - "CLOSE table_stats_cur;\n" - - "OPEN index_stats_cur;\n" - "found := 1;\n" - "WHILE found = 1 LOOP\n" - " FETCH index_stats_cur INTO\n" - " fetch_index_stats_step();\n" - " IF (SQL % NOTFOUND) THEN\n" - " found := 0;\n" - " END IF;\n" - "END LOOP;\n" - "CLOSE index_stats_cur;\n" + dict_sys.lock(SRW_LOCK_CALL); + que_t* graph = pars_sql( + pinfo, + "PROCEDURE FETCH_STATS () IS\n" + "found INT;\n" + "DECLARE FUNCTION fetch_table_stats_step;\n" + "DECLARE FUNCTION fetch_index_stats_step;\n" + "DECLARE CURSOR table_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_table_stats_step() */ + " n_rows,\n" + " clustered_index_size,\n" + " sum_of_other_index_sizes\n" + " FROM \"" TABLE_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + "DECLARE CURSOR index_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_index_stats_step() */ + " index_name,\n" + " stat_name,\n" + " stat_value,\n" + " sample_size\n" + " FROM \"" INDEX_STATS_NAME "\"\n" + " WHERE\n" 
+ " database_name = :database_name AND\n" + " table_name = :table_name;\n" + + "BEGIN\n" + + "OPEN table_stats_cur;\n" + "FETCH table_stats_cur INTO\n" + " fetch_table_stats_step();\n" + "IF (SQL % NOTFOUND) THEN\n" + " CLOSE table_stats_cur;\n" + " RETURN;\n" + "END IF;\n" + "CLOSE table_stats_cur;\n" + + "OPEN index_stats_cur;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_stats_cur INTO\n" + " fetch_index_stats_step();\n" + " IF (SQL % NOTFOUND) THEN\n" + " found := 0;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_stats_cur;\n" - "END;", trx); - /* pinfo is freed by que_eval_sql() */ + "END;"); dict_sys.unlock(); - dict_table_close(table_stats, false, thd, mdl_table); - dict_table_close(index_stats, false, thd, mdl_index); + trx_t* trx = trx_create(); + trx->graph = nullptr; + graph->trx = trx; + trx_start_internal_read_only(trx); + que_run_threads(que_fork_start_command(graph)); + que_graph_free(graph); trx_commit_for_mysql(trx); - + dberr_t ret = index_fetch_arg.stats_were_modified + ? 
trx->error_state : DB_STATS_DO_NOT_EXIST; trx->free(); - - if (!index_fetch_arg.stats_were_modified) { - return(DB_STATS_DO_NOT_EXIST); - } - - return(ret); + stats.close(); + return ret; } /*********************************************************************//** @@ -3641,250 +3565,46 @@ /*========================*/ dict_index_t* index) /*!< in/out: index */ { - DBUG_ENTER("dict_stats_update_for_index"); - - if (dict_stats_is_persistent_enabled(index->table)) { - - if (dict_stats_persistent_storage_check(false)) { - index_stats_t stats = dict_stats_analyze_index(index); - index->table->stats_mutex_lock(); - index->stat_index_size = stats.index_size; - index->stat_n_leaf_pages = stats.n_leaf_pages; - for (size_t i = 0; i < stats.stats.size(); ++i) { - index->stat_n_diff_key_vals[i] - = stats.stats[i].n_diff_key_vals; - index->stat_n_sample_sizes[i] - = stats.stats[i].n_sample_sizes; - index->stat_n_non_null_key_vals[i] - = stats.stats[i].n_non_null_key_vals; - } - index->table->stat_sum_of_other_index_sizes - += index->stat_index_size; - index->table->stats_mutex_unlock(); - - dict_stats_save(index->table, &index->id); - DBUG_VOID_RETURN; - } - /* else */ - - if (innodb_index_stats_not_found == false && - index->stats_error_printed == false) { - /* Fall back to transient stats since the persistent - storage is not present or is corrupted */ - - ib::info() << "Recalculation of persistent statistics" - " requested for table " << index->table->name - << " index " << index->name - << " but the required" - " persistent statistics storage is not present or is" - " corrupted. Using transient stats instead."; - index->stats_error_printed = false; - } - } - - dict_stats_update_transient_for_index(index); - - DBUG_VOID_RETURN; -} - -/*********************************************************************//** -Calculates new estimates for table and index statistics. The statistics -are used in query optimization. 
-@return DB_SUCCESS or error code -@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ -dberr_t -dict_stats_update( -/*==============*/ - dict_table_t* table, /*!< in/out: table */ - dict_stats_upd_option_t stats_upd_option) - /*!< in: whether to (re) calc - the stats or to fetch them from - the persistent statistics - storage */ -{ - ut_ad(!table->stats_mutex_is_owner()); - - if (!table->is_readable()) { - return (dict_stats_report_error(table)); - } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { - /* If we have set a high innodb_force_recovery level, do - not calculate statistics, as a badly corrupted index can - cause a crash in it. */ - dict_stats_empty_table(table, false); - return(DB_SUCCESS); - } - - if (trx_id_t bulk_trx_id = table->bulk_trx_id) { - if (trx_sys.find(nullptr, bulk_trx_id, false)) { - dict_stats_empty_table(table, false); - return DB_SUCCESS_LOCKED_REC; - } - } - - switch (stats_upd_option) { - case DICT_STATS_RECALC_PERSISTENT: - - if (srv_read_only_mode) { - goto transient; - } - - /* Persistent recalculation requested, called from - 1) ANALYZE TABLE, or - 2) the auto recalculation background thread, or - 3) open table if stats do not exist on disk and auto recalc - is enabled */ - - /* InnoDB internal tables (e.g. 
SYS_TABLES) cannot have - persistent stats enabled */ - ut_a(strchr(table->name.m_name, '/') != NULL); - - /* check if the persistent statistics storage exists - before calling the potentially slow function - dict_stats_update_persistent(); that is a - prerequisite for dict_stats_save() succeeding */ - if (dict_stats_persistent_storage_check(false)) { - - dberr_t err; - - err = dict_stats_update_persistent(table); - - if (err != DB_SUCCESS) { - return(err); - } - - err = dict_stats_save(table, NULL); - - return(err); - } + dict_table_t *const table= index->table; + ut_ad(table->stat_initialized()); - /* Fall back to transient stats since the persistent - storage is not present or is corrupted */ - - if (innodb_table_stats_not_found == false && - table->stats_error_printed == false) { - ib::warn() << "Recalculation of persistent statistics" - " requested for table " - << table->name - << " but the required persistent" - " statistics storage is not present or is corrupted." - " Using transient stats instead."; - table->stats_error_printed = true; - } - - goto transient; - - case DICT_STATS_RECALC_TRANSIENT: - - goto transient; - - case DICT_STATS_EMPTY_TABLE: - - dict_stats_empty_table(table, true); - - /* If table is using persistent stats, - then save the stats on disk */ - - if (dict_stats_is_persistent_enabled(table)) { - - if (dict_stats_persistent_storage_check(false)) { - - return(dict_stats_save(table, NULL)); - } - - return(DB_STATS_DO_NOT_EXIST); - } - - return(DB_SUCCESS); - - case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: - - /* fetch requested, either fetch from persistent statistics - storage or use the old method */ - - if (table->stat_initialized) { - return(DB_SUCCESS); - } - - /* InnoDB internal tables (e.g. 
SYS_TABLES) cannot have - persistent stats enabled */ - ut_a(strchr(table->name.m_name, '/') != NULL); - - if (!dict_stats_persistent_storage_check(false)) { - /* persistent statistics storage does not exist - or is corrupted, calculate the transient stats */ - - if (innodb_table_stats_not_found == false && - table->stats_error_printed == false && - !opt_bootstrap) { - ib::error() << "Fetch of persistent statistics" - " requested for table " - << table->name - << " but the required system tables " - << TABLE_STATS_NAME_PRINT - << " and " << INDEX_STATS_NAME_PRINT - << " are not present or have unexpected" - " structure. Using transient stats instead."; - table->stats_error_printed = true; - } - - goto transient; - } - - dberr_t err = dict_stats_fetch_from_ps(table); - - switch (err) { - case DB_SUCCESS: - return(DB_SUCCESS); - case DB_STATS_DO_NOT_EXIST: - - if (srv_read_only_mode) { - goto transient; - } -#ifdef WITH_WSREP - if (wsrep_thd_skip_locking(current_thd)) { - goto transient; - } + if (table->stats_is_persistent()) + switch (dict_stats_persistent_storage_check(false)) { + case SCHEMA_NOT_EXIST: + break; + case SCHEMA_INVALID: + if (table->stats_error_printed) + break; + table->stats_error_printed= true; + sql_print_information("InnoDB: Recalculation of persistent statistics" +#ifdef EMBEDDED_LIBRARY + " requested for table %.*s.%s index %s but" +#else + " requested for table %`.*s.%`s index %`s but" #endif - if (dict_stats_auto_recalc_is_enabled(table)) { - return(dict_stats_update( - table, - DICT_STATS_RECALC_PERSISTENT)); - } - - ib::info() << "Trying to use table " << table->name - << " which has persistent statistics enabled," - " but auto recalculation turned off and the" - " statistics do not exist in " - TABLE_STATS_NAME_PRINT - " and " INDEX_STATS_NAME_PRINT - ". Please either run \"ANALYZE TABLE " - << table->name << ";\" manually or enable the" - " auto recalculation with \"ALTER TABLE " - << table->name << " STATS_AUTO_RECALC=1;\"." 
- " InnoDB will now use transient statistics for " - << table->name << "."; - - goto transient; - default: - - if (innodb_table_stats_not_found == false && - table->stats_error_printed == false) { - ib::error() << "Error fetching persistent statistics" - " for table " - << table->name - << " from " TABLE_STATS_NAME_PRINT " and " - INDEX_STATS_NAME_PRINT ": " << err - << ". Using transient stats method instead."; - } - - goto transient; - } - /* no "default:" in order to produce a compilation warning - about unhandled enumeration value */ - } + " the required persistent statistics storage" + " is corrupted. Using transient stats instead.", + int(table->name.dblen()), table->name.m_name, + table->name.basename(), index->name()); + break; + case SCHEMA_OK: + index_stats_t stats{dict_stats_analyze_index(index)}; + table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) + { + index->stat_n_diff_key_vals[i]= stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i]= stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i]= stats.stats[i].n_non_null_key_vals; + } + table->stat_sum_of_other_index_sizes+= index->stat_index_size; + table->stats_mutex_unlock(); + dict_stats_save(table, index->id); + return; + } -transient: - return dict_stats_update_transient(table); + dict_stats_update_transient_for_index(index); } /** Execute DELETE FROM mysql.innodb_table_stats @@ -4034,7 +3754,7 @@ const char *old_name, const char *new_name, trx_t *trx) { - if (!dict_stats_persistent_storage_check(true)) + if (dict_stats_persistent_storage_check(true) != SCHEMA_OK) return DB_STATS_DO_NOT_EXIST; pars_info_t *pinfo= pars_info_create(); @@ -4170,7 +3890,7 @@ index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; - ret = dict_stats_save(&table, NULL); + ret = dict_stats_save(&table); ut_a(ret 
== DB_SUCCESS); diff -Nru mariadb-10.11.11/storage/innobase/dict/dict0stats_bg.cc mariadb-10.11.13/storage/innobase/dict/dict0stats_bg.cc --- mariadb-10.11.11/storage/innobase/dict/dict0stats_bg.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/dict/dict0stats_bg.cc 2025-05-19 16:14:25.000000000 +0000 @@ -135,7 +135,9 @@ void dict_stats_update_if_needed_func(dict_table_t *table) #endif { - if (UNIV_UNLIKELY(!table->stat_initialized)) { + uint32_t stat{table->stat}; + + if (UNIV_UNLIKELY(!table->stat_initialized(stat))) { /* The table may have been evicted from dict_sys and reloaded internally by InnoDB for FOREIGN KEY processing, but not reloaded by the SQL layer. @@ -154,13 +156,9 @@ ulonglong counter = table->stat_modified_counter++; ulonglong n_rows = dict_table_get_n_rows(table); - if (dict_stats_is_persistent_enabled(table)) { - if (table->name.is_temporary()) { - return; - } - if (counter > n_rows / 10 /* 10% */ - && dict_stats_auto_recalc_is_enabled(table)) { - + if (table->stats_is_persistent(stat)) { + if (table->stats_is_auto_recalc(stat) + && counter > n_rows / 10 && !table->name.is_temporary()) { #ifdef WITH_WSREP /* Do not add table to background statistic calculation if this thread is not a @@ -203,7 +201,7 @@ if (counter > threshold) { /* this will reset table->stat_modified_counter to 0 */ - dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT); + dict_stats_update_transient(table); } } @@ -331,7 +329,7 @@ if (!mdl || !table->is_accessible()) { - dict_table_close(table, false, thd, mdl); + dict_table_close(table, thd, mdl); goto invalid_table_id; } @@ -345,10 +343,10 @@ difftime(time(nullptr), table->stats_last_recalc) >= MIN_RECALC_INTERVAL; const dberr_t err= update_now - ? dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT) + ? 
dict_stats_update_persistent_try(table) : DB_SUCCESS_LOCKED_REC; - dict_table_close(table, false, thd, mdl); + dict_table_close(table, thd, mdl); mysql_mutex_lock(&recalc_pool_mutex); auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(), diff -Nru mariadb-10.11.11/storage/innobase/fsp/fsp0fsp.cc mariadb-10.11.13/storage/innobase/fsp/fsp0fsp.cc --- mariadb-10.11.11/storage/innobase/fsp/fsp0fsp.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fsp/fsp0fsp.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1644,12 +1644,11 @@ /** Calculate reserved fragment page slots. @param inode file segment index @return number of fragment pages */ -static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode) +static uint32_t fseg_get_n_frag_pages(const fseg_inode_t *inode) noexcept { - ulint i; - ulint count = 0; + uint32_t count = 0; - for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + for (ulint i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) { count++; } @@ -1794,21 +1793,24 @@ currently used. 
@return number of reserved pages */ static -ulint +uint32_t fseg_n_reserved_pages_low( /*======================*/ const fseg_inode_t* inode, /*!< in: segment inode */ - ulint* used) /*!< out: number of pages used (not + uint32_t* used) /*!< out: number of pages used (not more than reserved) */ + noexcept { + const uint32_t extent_size = FSP_EXTENT_SIZE; + *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL) + + extent_size * flst_get_len(inode + FSEG_FULL) + fseg_get_n_frag_pages(inode); return fseg_get_n_frag_pages(inode) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL) - + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL); + + extent_size * flst_get_len(inode + FSEG_FREE) + + extent_size * flst_get_len(inode + FSEG_NOT_FULL) + + extent_size * flst_get_len(inode + FSEG_FULL); } /** Calculate the number of pages reserved by a segment, @@ -1818,9 +1820,9 @@ @param[out] used number of pages that are used (not more than reserved) @param[in,out] mtr mini-transaction @return number of reserved pages */ -ulint fseg_n_reserved_pages(const buf_block_t &block, - const fseg_header_t *header, ulint *used, - mtr_t *mtr) +uint32_t fseg_n_reserved_pages(const buf_block_t &block, + const fseg_header_t *header, uint32_t *used, + mtr_t *mtr) noexcept { ut_ad(page_align(header) == block.page.frame); buf_block_t *iblock; @@ -1845,7 +1847,7 @@ buf_block_t *iblock, fil_space_t *space, uint32_t hint, mtr_t *mtr) { - ulint used; + uint32_t used; ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); ut_d(space->modify_check(*mtr)); @@ -1996,8 +1998,7 @@ dberr_t* err) { ib_id_t seg_id; - ulint used; - ulint reserved; + uint32_t used, reserved; xdes_t* descr; /*!< extent of the hinted page */ uint32_t ret_page; /*!< the allocated page offset, FIL_NULL if could not be allocated */ diff -Nru mariadb-10.11.11/storage/innobase/fts/fts0config.cc 
mariadb-10.11.13/storage/innobase/fts/fts0config.cc --- mariadb-10.11.11/storage/innobase/fts/fts0config.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fts/fts0config.cc 2025-05-19 16:14:25.000000000 +0000 @@ -231,7 +231,7 @@ n_rows_updated = trx->undo_no - undo_no; /* Check if we need to do an insert. */ - if (n_rows_updated == 0) { + if (error == DB_SUCCESS && n_rows_updated == 0) { info = pars_info_create(); pars_info_bind_varchar_literal( diff -Nru mariadb-10.11.11/storage/innobase/fts/fts0fts.cc mariadb-10.11.13/storage/innobase/fts/fts0fts.cc --- mariadb-10.11.11/storage/innobase/fts/fts0fts.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fts/fts0fts.cc 2025-05-19 16:14:25.000000000 +0000 @@ -37,6 +37,7 @@ #include "fts0plugin.h" #include "dict0stats.h" #include "btr0pcur.h" +#include "log.h" static const ulint FTS_MAX_ID_LEN = 32; @@ -1870,8 +1871,10 @@ } } - ib::warn() << "Failed to create FTS common table " << fts_table_name; - trx->error_state = error; + ut_ad(trx->state == TRX_STATE_NOT_STARTED + || trx->error_state == error); + sql_print_warning("InnoDB: Failed to create FTS common table %s: %s", + fts_table_name, ut_strerr(error)); return NULL; } @@ -2055,8 +2058,10 @@ } } - ib::warn() << "Failed to create FTS index table " << table_name; - trx->error_state = error; + ut_ad(trx->state == TRX_STATE_NOT_STARTED + || trx->error_state == error); + sql_print_warning("InnoDB: Failed to create FTS index table %s: %s", + table_name, ut_strerr(error)); return NULL; } diff -Nru mariadb-10.11.11/storage/innobase/fts/fts0opt.cc mariadb-10.11.13/storage/innobase/fts/fts0opt.cc --- mariadb-10.11.11/storage/innobase/fts/fts0opt.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/fts/fts0opt.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2809,7 +2809,7 @@ std::this_thread::sleep_for(std::chrono::seconds(6));); if (mdl_ticket) - dict_table_close(sync_table, false, fts_opt_thd, mdl_ticket); 
+ dict_table_close(sync_table, fts_opt_thd, mdl_ticket); } /**********************************************************************//** diff -Nru mariadb-10.11.11/storage/innobase/gis/gis0sea.cc mariadb-10.11.13/storage/innobase/gis/gis0sea.cc --- mariadb-10.11.11/storage/innobase/gis/gis0sea.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/gis/gis0sea.cc 2025-05-19 16:14:25.000000000 +0000 @@ -504,10 +504,10 @@ rtr_rec_t rec; rec = rtr_info->matches->matched_recs->back(); rtr_info->matches->matched_recs->pop_back(); + cursor->btr_cur.page_cur.block = rtr_info->matches->block; mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); cursor->btr_cur.page_cur.rec = rec.r_rec; - cursor->btr_cur.page_cur.block = rtr_info->matches->block; DEBUG_SYNC_C("rtr_pcur_move_to_next_return"); return(true); @@ -1565,7 +1565,10 @@ if (auto matches = rtr_info->matches) { mysql_mutex_lock(&matches->rtr_match_mutex); - if (matches->block->page.id() == id) { + /* matches->block could be nullptr when cursor + encounters empty table */ + if (rtr_info->matches->block + && matches->block->page.id() == id) { matches->matched_recs->clear(); matches->valid = false; } @@ -2201,6 +2204,15 @@ ut_ad(orig_mode != PAGE_CUR_RTREE_LOCATE); + /* Collect matched records on page */ + offsets = rec_get_offsets( + rec, index, offsets, + index->n_fields, + ULINT_UNDEFINED, &heap); + + mysql_mutex_lock( + &rtr_info->matches->rtr_match_mutex); + if (!match_init) { rtr_init_match( rtr_info->matches, @@ -2208,14 +2220,12 @@ match_init = true; } - /* Collect matched records on page */ - offsets = rec_get_offsets( - rec, index, offsets, - index->n_fields, - ULINT_UNDEFINED, &heap); rtr_leaf_push_match_rec( rec, rtr_info, offsets, page_is_comp(page)); + + mysql_mutex_unlock( + &rtr_info->matches->rtr_match_mutex); } last_match_rec = rec; diff -Nru mariadb-10.11.11/storage/innobase/handler/ha_innodb.cc mariadb-10.11.13/storage/innobase/handler/ha_innodb.cc --- 
mariadb-10.11.11/storage/innobase/handler/ha_innodb.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/ha_innodb.cc 2025-05-19 16:14:25.000000000 +0000 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -154,11 +155,6 @@ #include "wsrep_sst.h" #endif /* WITH_WSREP */ -#ifdef HAVE_URING -/** The Linux kernel version if io_uring() is considered unsafe */ -const char *io_uring_may_be_unsafe; -#endif - #define INSIDE_HA_INNOBASE_CC #define EQ_CURRENT_THD(thd) ((thd) == current_thd) @@ -169,13 +165,9 @@ static const long AUTOINC_NEW_STYLE_LOCKING = 1; static const long AUTOINC_NO_LOCKING = 2; -static constexpr size_t buf_pool_chunk_min_size= 1U << 20; - static ulong innobase_open_files; static long innobase_autoinc_lock_mode; -ulonglong innobase_buffer_pool_size; - /** Percentage of the buffer pool to reserve for 'old' blocks. Connected to buf_LRU_old_ratio. */ static uint innobase_old_blocks_pct; @@ -246,11 +238,11 @@ if (thd_kill_level(thd)) break; /* Adjust for purge_coordinator_state::refresh() */ - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); const lsn_t last= log_sys.last_checkpoint_lsn, max_age= log_sys.max_checkpoint_age; - log_sys.latch.rd_unlock(); const lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); if ((lsn - last) / 4 >= max_age / 5) buf_flush_ahead(last + max_age / 5, false); purge_sys.wake_if_not_active(); @@ -1158,7 +1150,7 @@ be rolled back to savepoint */ /** Request notification of log writes */ -static void innodb_log_flush_request(void *cookie); +static void innodb_log_flush_request(void *cookie) noexcept; /** Requests for log flushes */ struct log_flush_request @@ -1330,38 +1322,17 @@ dict_sys.unlock(); - dict_table_t *table_stats, *index_stats; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; - table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - 
dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, - thd, &mdl_table); - dict_sys.unfreeze(); - } - index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, - thd, &mdl_index); - dict_sys.unfreeze(); - } - + dict_stats stats; + const bool stats_failed{stats.open(thd)}; trx_start_for_ddl(trx); uint errors= 0; char db[NAME_LEN + 1]; strconvert(&my_charset_filename, namebuf, len, system_charset_info, db, sizeof db, &errors); - if (!errors && table_stats && index_stats && - !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && - !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && - lock_table_for_trx(table_stats, trx, LOCK_X) == DB_SUCCESS && - lock_table_for_trx(index_stats, trx, LOCK_X) == DB_SUCCESS) + if (!errors && !stats_failed && + lock_table_for_trx(stats.table(), trx, LOCK_X) == DB_SUCCESS && + lock_table_for_trx(stats.index(), trx, LOCK_X) == DB_SUCCESS) { row_mysql_lock_data_dictionary(trx); if (dict_stats_delete(db, trx)) @@ -1457,19 +1428,16 @@ if (err != DB_SUCCESS) { trx->rollback(); - namebuf[len] = '\0'; - ib::error() << "DROP DATABASE " << namebuf << ": " << err; + sql_print_error("InnoDB: DROP DATABASE %.*s: %s", + int(len), namebuf, ut_strerr(err)); } else trx->commit(); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); - trx->free(); + if (!stats_failed) + stats.close(); if (err == DB_SUCCESS) { @@ -1620,9 +1588,9 @@ if (dict_table_t *table= m_prebuilt ? 
m_prebuilt->table : nullptr) { if (table->is_readable()) - dict_stats_init(table); + statistics_init(table, true); else - table->stat_initialized= 1; + table->stat.fetch_or(dict_table_t::STATS_INITIALIZED); } } @@ -1932,7 +1900,7 @@ { const trx_id_t trx_id= table->def_trx_id; DBUG_ASSERT(trx_id <= create_id); - dict_table_close(table); + table->release(); DBUG_PRINT("info", ("create_id: %llu trx_id: %" PRIu64, create_id, trx_id)); DBUG_RETURN(create_id != trx_id); } @@ -2978,6 +2946,45 @@ return XAER_NOTA; } +/** Initialize the InnoDB persistent statistics attributes. +@param table InnoDB table +@param table_options MariaDB table options +@param sar the value of STATS_AUTO_RECALC +@param initialized whether the InnoDB statistics were already initialized +@return whether table->stats_sample_pages needs to be initialized */ +static bool innodb_copy_stat_flags(dict_table_t *table, + ulong table_options, + enum_stats_auto_recalc sar, + bool initialized) noexcept +{ + if (table->is_temporary() || table->no_rollback()) + { + table->stat= dict_table_t::STATS_INITIALIZED | + dict_table_t::STATS_PERSISTENT_OFF | dict_table_t::STATS_AUTO_RECALC_OFF; + table->stats_sample_pages= 1; + return false; + } + + static_assert(HA_OPTION_STATS_PERSISTENT == + dict_table_t::STATS_PERSISTENT_ON << 11, ""); + static_assert(HA_OPTION_NO_STATS_PERSISTENT == + dict_table_t::STATS_PERSISTENT_OFF << 11, ""); + uint32_t stat= + uint32_t(table_options & + (HA_OPTION_STATS_PERSISTENT | + HA_OPTION_NO_STATS_PERSISTENT)) >> 11; + static_assert(uint32_t{HA_STATS_AUTO_RECALC_ON} << 3 == + dict_table_t::STATS_AUTO_RECALC_ON, ""); + static_assert(uint32_t{HA_STATS_AUTO_RECALC_OFF} << 3 == + dict_table_t::STATS_AUTO_RECALC_OFF, ""); + static_assert(true == dict_table_t::STATS_INITIALIZED, ""); + stat|= (sar & (HA_STATS_AUTO_RECALC_ON | HA_STATS_AUTO_RECALC_OFF)) << 3 | + uint32_t(initialized); + + table->stat= stat; + return true; +} + 
/*********************************************************************//** Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. Those flags are stored in .frm file and end up in the MySQL table object, @@ -2990,29 +2997,9 @@ dict_table_t* innodb_table, /*!< in/out: InnoDB table */ const HA_CREATE_INFO* create_info) /*!< in: create info */ { - ibool ps_on; - ibool ps_off; - - if (innodb_table->is_temporary() - || innodb_table->no_rollback()) { - /* Temp tables do not use persistent stats. */ - ps_on = FALSE; - ps_off = TRUE; - } else { - ps_on = create_info->table_options - & HA_OPTION_STATS_PERSISTENT; - ps_off = create_info->table_options - & HA_OPTION_NO_STATS_PERSISTENT; - } - - dict_stats_set_persistent(innodb_table, ps_on, ps_off); - - dict_stats_auto_recalc_set( - innodb_table, - create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, - create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); - - innodb_table->stats_sample_pages = create_info->stats_sample_pages; + if (innodb_copy_stat_flags(innodb_table, create_info->table_options, + create_info->stats_auto_recalc, false)) + innodb_table->stats_sample_pages= create_info->stats_sample_pages; } /*********************************************************************//** @@ -3026,28 +3013,10 @@ dict_table_t* innodb_table, /*!< in/out: InnoDB table */ const TABLE_SHARE* table_share) /*!< in: table share */ { - ibool ps_on; - ibool ps_off; - - if (innodb_table->is_temporary()) { - /* Temp tables do not use persistent stats */ - ps_on = FALSE; - ps_off = TRUE; - } else { - ps_on = table_share->db_create_options - & HA_OPTION_STATS_PERSISTENT; - ps_off = table_share->db_create_options - & HA_OPTION_NO_STATS_PERSISTENT; - } - - dict_stats_set_persistent(innodb_table, ps_on, ps_off); - - dict_stats_auto_recalc_set( - innodb_table, - table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, - table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); - - innodb_table->stats_sample_pages = 
table_share->stats_sample_pages; + if (innodb_copy_stat_flags(innodb_table, table_share->db_create_options, + table_share->stats_auto_recalc, + innodb_table->stat_initialized())) + innodb_table->stats_sample_pages= table_share->stats_sample_pages; } /*********************************************************************//** @@ -3288,7 +3257,7 @@ bool allow = innobase_query_caching_table_check_low(table, trx); - dict_table_close(table); + table->release(); if (allow) { /* If the isolation level is high, assign a read view for the @@ -3678,7 +3647,7 @@ m_prebuilt->used_in_HANDLER = TRUE; reset_template(); - m_prebuilt->trx->bulk_insert = false; + m_prebuilt->trx->bulk_insert &= TRX_DDL_BULK; } /*********************************************************************//** @@ -3701,53 +3670,44 @@ DBUG_RETURN(1); } -/** Return the minimum buffer pool size based on page size */ -static inline ulint min_buffer_pool_size() +static void innodb_buffer_pool_size_update(THD* thd,st_mysql_sys_var*,void*, + const void *save) noexcept { - ulint s= (BUF_LRU_MIN_LEN + BUF_LRU_MIN_LEN / 4) * srv_page_size; - /* buf_pool_chunk_size minimum is 1M, so round up to a multiple */ - ulint alignment= 1U << 20; - return UT_CALC_ALIGN(s, alignment); + buf_pool.resize(*static_cast(save), thd); } -/** Validate the requested buffer pool size. Also, reserve the necessary -memory needed for buffer pool resize. -@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] save immediate result for update function -@param[in] value incoming string -@return 0 on success, 1 on failure. -*/ -static -int -innodb_buffer_pool_size_validate( - THD* thd, - struct st_mysql_sys_var* var, - void* save, - struct st_mysql_value* value); - -/** Update the system variable innodb_buffer_pool_size using the "saved" -value. This function is registered as a callback with MySQL. 
-@param[in] thd thread handle -@param[in] var pointer to system variable -@param[out] var_ptr where the formal string goes -@param[in] save immediate result from check function */ -static -void -innodb_buffer_pool_size_update( - THD* thd, - struct st_mysql_sys_var* var, - void* var_ptr, - const void* save); +static MYSQL_SYSVAR_SIZE_T(buffer_pool_size, buf_pool.size_in_bytes_requested, + PLUGIN_VAR_RQCMDARG, + "The size of the memory buffer InnoDB uses to cache data" + " and indexes of its tables.", + nullptr, innodb_buffer_pool_size_update, 128U << 20, 2U << 20, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), 1U << 20); + +#if defined __linux__ || !defined DBUG_OFF +static void innodb_buffer_pool_size_auto_min_update(THD*,st_mysql_sys_var*, + void*, const void *save) + noexcept +{ + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.size_in_bytes_auto_min= *static_cast(save); + mysql_mutex_unlock(&buf_pool.mutex); +} -static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size, +static MYSQL_SYSVAR_SIZE_T(buffer_pool_size_auto_min, + buf_pool.size_in_bytes_auto_min, PLUGIN_VAR_RQCMDARG, - "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", - innodb_buffer_pool_size_validate, - innodb_buffer_pool_size_update, - 128ULL << 20, - 2ULL << 20, - LLONG_MAX, 1024*1024L); + "Minimum innodb_buffer_pool_size for dynamic shrinking on memory pressure", + nullptr, innodb_buffer_pool_size_auto_min_update, 0, 0, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), + innodb_buffer_pool_extent_size); +#endif + +static MYSQL_SYSVAR_SIZE_T(buffer_pool_size_max, buf_pool.size_in_bytes_max, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Maximum innodb_buffer_pool_size", + nullptr, nullptr, 0, 0, + size_t(-ssize_t(innodb_buffer_pool_extent_size)), + innodb_buffer_pool_extent_size); static MYSQL_SYSVAR_UINT(log_write_ahead_size, log_sys.write_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -3799,29 +3759,6 @@ return 0; } -/** Initialize 
and normalize innodb_buffer_pool_{chunk_,}size. */ -static void innodb_buffer_pool_size_init() -{ - if (srv_buf_pool_chunk_unit > srv_buf_pool_size) - { - /* Size unit of buffer pool is larger than srv_buf_pool_size. - adjust srv_buf_pool_chunk_unit for srv_buf_pool_size. */ - srv_buf_pool_chunk_unit = srv_buf_pool_size; - } - else if (srv_buf_pool_chunk_unit == 0) - { - srv_buf_pool_chunk_unit = srv_buf_pool_size / 64; - my_large_page_truncate(&srv_buf_pool_chunk_unit); - } - - if (srv_buf_pool_chunk_unit < buf_pool_chunk_min_size) - srv_buf_pool_chunk_unit = buf_pool_chunk_min_size; - - srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size); - innobase_buffer_pool_size = srv_buf_pool_size; -} - - static bool compression_algorithm_is_not_loaded(ulong compression_algorithm, myf flags) { @@ -3847,323 +3784,298 @@ @retval HA_ERR_INITIALIZATION when some parameters are out of range */ static int innodb_init_params() { - DBUG_ENTER("innodb_init_params"); + DBUG_ENTER("innodb_init_params"); - ulong num_pll_degree; + srv_page_size_shift= innodb_page_size_validate(srv_page_size); + if (!srv_page_size_shift) + { + sql_print_error("InnoDB: Invalid page size=%lu.\n", srv_page_size); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - /* Check that values don't overflow on 32-bit systems. */ - if (sizeof(ulint) == 4) { - if (innobase_buffer_pool_size > UINT_MAX32) { - sql_print_error( - "innodb_buffer_pool_size can't be over 4GB" - " on 32-bit systems"); - DBUG_RETURN(HA_ERR_OUT_OF_MEM); - } - } + size_t &min= MYSQL_SYSVAR_NAME(buffer_pool_size).min_val; + min= ut_calc_align + (buf_pool.blocks_in_bytes(BUF_LRU_MIN_LEN + BUF_LRU_MIN_LEN / 4), + 1U << 20); + size_t innodb_buffer_pool_size= buf_pool.size_in_bytes_requested; + + /* With large pages, buffer pool can't grow or shrink. 
*/ + if (!buf_pool.size_in_bytes_max || my_use_large_pages || + innodb_buffer_pool_size > buf_pool.size_in_bytes_max) + buf_pool.size_in_bytes_max= ut_calc_align(innodb_buffer_pool_size, + innodb_buffer_pool_extent_size); + + MYSQL_SYSVAR_NAME(buffer_pool_size).max_val= buf_pool.size_in_bytes_max; +#if defined __linux__ || !defined DBUG_OFF + if (!buf_pool.size_in_bytes_auto_min || + buf_pool.size_in_bytes_auto_min > buf_pool.size_in_bytes_max) + buf_pool.size_in_bytes_auto_min= buf_pool.size_in_bytes_max; + MYSQL_SYSVAR_NAME(buffer_pool_size_auto_min).max_val= + buf_pool.size_in_bytes_max; +#endif - /* The buffer pool needs to be able to accommodate enough many - pages, even for larger pages */ - MYSQL_SYSVAR_NAME(buffer_pool_size).min_val= min_buffer_pool_size(); - - if (innobase_buffer_pool_size < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) { - ib::error() << "innodb_page_size=" - << srv_page_size << " requires " - << "innodb_buffer_pool_size >= " - << (MYSQL_SYSVAR_NAME(buffer_pool_size).min_val >> 20) - << "MiB current " << (innobase_buffer_pool_size >> 20) - << "MiB"; - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - if (!ut_is_2pow(log_sys.write_size)) { - sql_print_error("InnoDB: innodb_log_write_ahead_size=%u" - " is not a power of two", - log_sys.write_size); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG)) - DBUG_RETURN(HA_ERR_INITIALIZATION); - - if ((srv_encrypt_tables || srv_encrypt_log - || innodb_encrypt_temporary_tables) - && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) { - sql_print_error("InnoDB: cannot enable encryption, " - "encryption plugin is not available"); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } + if (innodb_buffer_pool_size < min) + { + sql_print_error("InnoDB: innodb_page_size=%lu requires " + "innodb_buffer_pool_size >= %zu MiB current %zu MiB", + srv_page_size, min >> 20, innodb_buffer_pool_size >> 20); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } 
+ + if (!ut_is_2pow(log_sys.write_size)) + { + sql_print_error("InnoDB: innodb_log_write_ahead_size=%u" + " is not a power of two", + log_sys.write_size); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } + + if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG)) + DBUG_RETURN(HA_ERR_INITIALIZATION); + + if ((srv_encrypt_tables || srv_encrypt_log || + innodb_encrypt_temporary_tables) && + !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) + { + sql_print_error("InnoDB: cannot enable encryption, " + "encryption plugin is not available"); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } #ifdef _WIN32 - if (!is_filename_allowed(srv_buf_dump_filename, - strlen(srv_buf_dump_filename), FALSE)) { - sql_print_error("InnoDB: innodb_buffer_pool_filename" - " cannot have colon (:) in the file name."); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } + if (!is_filename_allowed(srv_buf_dump_filename, + strlen(srv_buf_dump_filename), false)) + { + sql_print_error("InnoDB: innodb_buffer_pool_filename" + " cannot have colon (:) in the file name."); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } #endif - /* First calculate the default path for innodb_data_home_dir etc., - in case the user has not given any value. + /* First calculate the default path for innodb_data_home_dir etc., + in case the user has not given any value. - Note that when using the embedded server, the datadirectory is not - necessarily the current directory of this program. */ + Note that when using the embedded server, the datadirectory is not + necessarily the current directory of this program. */ - fil_path_to_mysql_datadir = + fil_path_to_mysql_datadir = #ifndef HAVE_REPLICATION - mysqld_embedded ? mysql_real_data_home : + mysqld_embedded ? 
mysql_real_data_home : #endif - "./"; + "./"; - /* Set InnoDB initialization parameters according to the values - read from MySQL .cnf file */ + /* Set InnoDB initialization parameters according to the values + read from MySQL .cnf file */ - /* The default dir for data files is the datadir of MySQL */ + /* The default dir for data files is the datadir of MySQL */ - srv_data_home = innobase_data_home_dir - ? innobase_data_home_dir - : const_cast(fil_path_to_mysql_datadir); + srv_data_home= innobase_data_home_dir + ? innobase_data_home_dir + : const_cast(fil_path_to_mysql_datadir); #ifdef WITH_WSREP - /* If we use the wsrep API, then we need to tell the server - the path to the data files (for passing it to the SST scripts): */ - wsrep_set_data_home_dir(srv_data_home); + /* If we use the wsrep API, then we need to tell the server + the path to the data files (for passing it to the SST scripts): */ + wsrep_set_data_home_dir(srv_data_home); #endif /* WITH_WSREP */ - /*--------------- Shared tablespaces -------------------------*/ - - /* Check that the value of system variable innodb_page_size was - set correctly. Its value was put into srv_page_size. If valid, - return the associated srv_page_size_shift. 
*/ - srv_page_size_shift = innodb_page_size_validate(srv_page_size); - if (!srv_page_size_shift) { - sql_print_error("InnoDB: Invalid page size=%lu.\n", - srv_page_size); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - srv_sys_space.set_space_id(TRX_SYS_SPACE); - - switch (srv_checksum_algorithm) { - case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: - case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: - srv_sys_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER - | FSP_FLAGS_FCRC32_PAGE_SSIZE()); - break; - default: - srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); - } - - srv_sys_space.set_path(srv_data_home); - - /* Supports raw devices */ - if (!srv_sys_space.parse_params(innobase_data_file_path, true)) { - ib::error() << "Unable to parse innodb_data_file_path=" - << innobase_data_file_path; - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - srv_tmp_space.set_path(srv_data_home); - - /* Temporary tablespace is in full crc32 format. */ - srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER - | FSP_FLAGS_FCRC32_PAGE_SSIZE()); - - if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) { - ib::error() << "Unable to parse innodb_temp_data_file_path=" - << innobase_temp_data_file_path; - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - /* Perform all sanity check before we take action of deleting files*/ - if (srv_sys_space.intersection(&srv_tmp_space)) { - sql_print_error("innodb_temporary and innodb_system" - " file names seem to be the same."); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - srv_sys_space.normalize_size(); - srv_tmp_space.normalize_size(); - - /* ------------ UNDO tablespaces files ---------------------*/ - if (!srv_undo_dir) { - srv_undo_dir = const_cast(fil_path_to_mysql_datadir); - } - - if (strchr(srv_undo_dir, ';')) { - sql_print_error("syntax error in innodb_undo_directory"); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } + /*--------------- Shared tablespaces -------------------------*/ - /* -------------- All log files ---------------------------*/ - - /* The 
default dir for log files is the datadir of MySQL */ + /* Check that the value of system variable innodb_page_size was + set correctly. Its value was put into srv_page_size. If valid, + return the associated srv_page_size_shift. */ + + srv_sys_space.set_space_id(TRX_SYS_SPACE); + /* Temporary tablespace is in full crc32 format. */ + srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER | + FSP_FLAGS_FCRC32_PAGE_SSIZE()); + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + srv_sys_space.set_flags(srv_tmp_space.flags()); + break; + default: + srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE()); + } - if (!srv_log_group_home_dir) { - srv_log_group_home_dir - = const_cast(fil_path_to_mysql_datadir); - } + srv_sys_space.set_path(srv_data_home); - if (strchr(srv_log_group_home_dir, ';')) { - sql_print_error("syntax error in innodb_log_group_home_dir"); - DBUG_RETURN(HA_ERR_INITIALIZATION); - } - - DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL); + if (!srv_sys_space.parse_params(innobase_data_file_path, true)) + { + sql_print_error("InnoDB: Unable to parse innodb_data_file_path=%s", + innobase_data_file_path); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - /* Check that interdependent parameters have sane values. 
*/ - if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) { - sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" - " cannot be set higher than" - " innodb_max_dirty_pages_pct.\n" - "InnoDB: Setting" - " innodb_max_dirty_pages_pct_lwm to %lf\n", - srv_max_buf_pool_modified_pct); + srv_tmp_space.set_path(srv_data_home); - srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; - } + if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) + { + sql_print_error("InnoDB: Unable to parse innodb_temp_data_file_path=%s", + innobase_temp_data_file_path); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) { + /* Perform all sanity check before we take action of deleting files*/ + if (srv_sys_space.intersection(&srv_tmp_space)) + { + sql_print_error("innodb_temporary and innodb_system" + " file names seem to be the same."); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) { - /* Avoid overflow. */ - srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT; - } else { - /* The user has not set the value. We should - set it based on innodb_io_capacity. */ - srv_max_io_capacity = - ut_max(2 * srv_io_capacity, 2000UL); - } + srv_sys_space.normalize_size(); + srv_tmp_space.normalize_size(); - } else if (srv_max_io_capacity < srv_io_capacity) { - sql_print_warning("InnoDB: innodb_io_capacity" - " cannot be set higher than" - " innodb_io_capacity_max." 
- "Setting innodb_io_capacity=%lu", - srv_max_io_capacity); + /* ------------ UNDO tablespaces files ---------------------*/ + if (!srv_undo_dir) + srv_undo_dir= const_cast(fil_path_to_mysql_datadir); - srv_io_capacity = srv_max_io_capacity; - } + if (strchr(srv_undo_dir, ';')) + { + sql_print_error("syntax error in innodb_undo_directory"); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - if (UNIV_PAGE_SIZE_DEF != srv_page_size) { - ib::info() << "innodb_page_size=" << srv_page_size; + if (!srv_log_group_home_dir) + srv_log_group_home_dir= const_cast(fil_path_to_mysql_datadir); - srv_max_undo_log_size = std::max( - srv_max_undo_log_size, - ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) - << srv_page_size_shift); - } + if (strchr(srv_log_group_home_dir, ';')) + { + sql_print_error("syntax error in innodb_log_group_home_dir"); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } - srv_buf_pool_size = ulint(innobase_buffer_pool_size); + DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL); - if (innobase_open_files < 10) { - innobase_open_files = 300; - if (srv_file_per_table && tc_size > 300 && tc_size < open_files_limit) { - innobase_open_files = tc_size; - } - } + /* Check that interdependent parameters have sane values. 
*/ + if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) + { + sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct.\n" + "InnoDB: Setting" + " innodb_max_dirty_pages_pct_lwm to %lf\n", + srv_max_buf_pool_modified_pct); + srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; + } - if (innobase_open_files > open_files_limit) { - ib::warn() << "innodb_open_files " << innobase_open_files - << " should not be greater" - << " than the open_files_limit " << open_files_limit; - if (innobase_open_files > tc_size) { - innobase_open_files = tc_size; - } - } + if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) + { + if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) + /* Avoid overflow. */ + srv_max_io_capacity= SRV_MAX_IO_CAPACITY_LIMIT; + else + /* The user has not set the value. We should set it based on + innodb_io_capacity. */ + srv_max_io_capacity= std::max(2 * srv_io_capacity, 2000UL); + } + else if (srv_max_io_capacity < srv_io_capacity) + { + sql_print_warning("InnoDB: innodb_io_capacity cannot be set higher than" + " innodb_io_capacity_max." 
+ "Setting innodb_io_capacity=%lu", srv_max_io_capacity); + srv_io_capacity= srv_max_io_capacity; + } - ulint min_open_files_limit = srv_undo_tablespaces - + srv_sys_space.m_files.size() - + srv_tmp_space.m_files.size() + 1; - if (min_open_files_limit > innobase_open_files) { - sql_print_warning( - "InnoDB: innodb_open_files=%lu is not greater " - "than the number of system tablespace files, " - "temporary tablespace files, " - "innodb_undo_tablespaces=%u; adjusting " - "to innodb_open_files=%zu", - innobase_open_files, srv_undo_tablespaces, - min_open_files_limit); - innobase_open_files = (ulong) min_open_files_limit; - } + if (UNIV_PAGE_SIZE_DEF != srv_page_size) + { + sql_print_information("InnoDB: innodb_page_size=%lu", srv_page_size); + srv_max_undo_log_size= + std::max(srv_max_undo_log_size, + ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) << + srv_page_size_shift); + } - srv_max_n_open_files = innobase_open_files; - srv_innodb_status = (ibool) innobase_create_status_file; + if (innobase_open_files < 10) + innobase_open_files= (srv_file_per_table && tc_size > 300 && + tc_size < open_files_limit) + ? tc_size + : 300; - srv_print_verbose_log = mysqld_embedded ? 
0 : 1; + if (innobase_open_files > open_files_limit) + { + sql_print_warning("InnoDB: innodb_open_files %lu" + " should not be greater than the open_files_limit %lu", + innobase_open_files, open_files_limit); + if (innobase_open_files > tc_size) + innobase_open_files= tc_size; + } - /* Round up fts_sort_pll_degree to nearest power of 2 number */ - for (num_pll_degree = 1; - num_pll_degree < fts_sort_pll_degree; - num_pll_degree <<= 1) { + const size_t min_open_files_limit= srv_undo_tablespaces + + srv_sys_space.m_files.size() + srv_tmp_space.m_files.size() + 1; + if (min_open_files_limit > innobase_open_files) + { + sql_print_warning("InnoDB: innodb_open_files=%lu is not greater " + "than the number of system tablespace files, " + "temporary tablespace files, " + "innodb_undo_tablespaces=%lu; adjusting " + "to innodb_open_files=%zu", + innobase_open_files, srv_undo_tablespaces, + min_open_files_limit); + innobase_open_files= ulong(min_open_files_limit); + } - /* No op */ - } + srv_max_n_open_files= innobase_open_files; + srv_innodb_status = (ibool) innobase_create_status_file; - fts_sort_pll_degree = num_pll_degree; + srv_print_verbose_log= !mysqld_embedded; - /* Store the default charset-collation number of this MySQL - installation */ + if (!ut_is_2pow(fts_sort_pll_degree)) + { + ulong n; + for (n= 1; n < fts_sort_pll_degree; n<<= 1) {} + fts_sort_pll_degree= n; + } - data_mysql_default_charset_coll = (ulint) default_charset_info->number; + /* Store the default charset-collation number of this installation */ + data_mysql_default_charset_coll = (ulint) default_charset_info->number; #if !defined _WIN32 && defined O_DIRECT - if (srv_use_atomic_writes && my_may_have_atomic_write) { - /* - Force O_DIRECT on Unixes (on Windows writes are always - unbuffered) - */ - switch (srv_file_flush_method) { - case SRV_O_DIRECT: - case SRV_O_DIRECT_NO_FSYNC: - break; - default: - srv_file_flush_method = SRV_O_DIRECT; - fprintf(stderr, "InnoDB: using O_DIRECT due to atomic 
writes.\n"); - } - } + if (srv_use_atomic_writes && my_may_have_atomic_write) + { + /* Force O_DIRECT on Unixes (on Windows writes are always unbuffered) */ + switch (srv_file_flush_method) { + case SRV_O_DIRECT: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + srv_file_flush_method= SRV_O_DIRECT; + fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n"); + } + } #endif #if defined __linux__ || defined _WIN32 - if (srv_flush_log_at_trx_commit == 2) { - /* Do not disable the file system cache if - innodb_flush_log_at_trx_commit=2. */ - log_sys.log_buffered = true; - } + if (srv_flush_log_at_trx_commit == 2) + /* Do not disable the file system cache if + innodb_flush_log_at_trx_commit=2. */ + log_sys.log_buffered= true; #endif #if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32 - /* Currently native AIO is supported only on windows and linux - and that also when the support is compiled in. In all other - cases, we ignore the setting of innodb_use_native_aio. */ - srv_use_native_aio = FALSE; -#endif -#ifdef HAVE_URING - if (srv_use_native_aio && io_uring_may_be_unsafe) { - sql_print_warning("innodb_use_native_aio may cause " - "hangs with this kernel %s; see " - "https://jira.mariadb.org/browse/MDEV-26674", - io_uring_may_be_unsafe); - } + /* Currently native AIO is supported only on windows and linux + and that also when the support is compiled in. In all other + cases, we ignore the setting of innodb_use_native_aio. 
*/ + srv_use_native_aio= FALSE; #endif #ifdef _WIN32 - switch (srv_file_flush_method) { - case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */: - srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC; - break; - case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */: - srv_file_flush_method = SRV_FSYNC; - break; - default: - ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC); - } + switch (srv_file_flush_method) { + case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */: + srv_file_flush_method= SRV_ALL_O_DIRECT_FSYNC; + break; + case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */: + srv_file_flush_method= SRV_FSYNC; + break; + default: + ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC); + } #else - ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC); + ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC); #endif - innodb_buffer_pool_size_init(); - - srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift); - DBUG_RETURN(0); + DBUG_RETURN(0); } /** Initialize the InnoDB storage engine plugin. @@ -4576,7 +4488,7 @@ undo_no_t savept= 0; trx->rollback(&savept); /* MariaDB will roll back the entire transaction. */ - trx->bulk_insert= false; + trx->bulk_insert&= TRX_DDL_BULK; trx->last_stmt_start= 0; return true; } @@ -4620,10 +4532,9 @@ ut_ad("invalid state" == 0); /* fall through */ case TRX_STATE_PREPARED: - ut_ad(commit_trx || trx->is_wsrep()); - ut_ad(thd_test_options(thd, OPTION_NOT_AUTOCOMMIT - | OPTION_BEGIN) - || trx->is_wsrep()); + ut_ad(commit_trx || + !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT + | OPTION_BEGIN)); /* fall through */ case TRX_STATE_ACTIVE: /* Transaction is deregistered only in a commit or a @@ -4825,11 +4736,13 @@ We put the request in a queue, so that we can notify upper layer about checkpoint complete when we have flushed the redo log. 
If we have already flushed all relevant redo log, we notify immediately.*/ -static void innodb_log_flush_request(void *cookie) +static void innodb_log_flush_request(void *cookie) noexcept { + log_sys.latch.wr_lock(SRW_LOCK_CALL); lsn_t flush_lsn= log_sys.get_flushed_lsn(); /* Load lsn relaxed after flush_lsn was loaded from the same cache line */ const lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); if (flush_lsn >= lsn) /* All log is already persistent. */; @@ -5837,6 +5750,70 @@ table->autoinc_mutex.wr_unlock(); } +dberr_t ha_innobase::statistics_init(dict_table_t *table, bool recalc) +{ + ut_ad(table->is_readable()); + ut_ad(!table->stats_mutex_is_owner()); + + uint32_t stat= table->stat; + dberr_t err= DB_SUCCESS; + + if (!recalc && dict_table_t::stat_initialized(stat)); + else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) + dict_stats_empty_table(table, false); + else + { + if (dict_table_t::stats_is_persistent(stat) && !srv_read_only_mode +#ifdef WITH_WSREP + && !wsrep_thd_skip_locking(m_user_thd) +#endif + ) + { + switch (dict_stats_persistent_storage_check(false)) { + case SCHEMA_OK: + if (recalc) + { + recalc: + err= dict_stats_update_persistent(table); + if (err == DB_SUCCESS) + err= dict_stats_save(table); + } + else + { + err= dict_stats_fetch_from_ps(table); + if (err == DB_STATS_DO_NOT_EXIST && table->stats_is_auto_recalc()) + goto recalc; + } + if (err == DB_SUCCESS || err == DB_READ_ONLY) + return err; + if (!recalc) + break; + /* fall through */ + case SCHEMA_INVALID: + if (table->stats_error_printed) + break; + table->stats_error_printed = true; + if (opt_bootstrap) + break; + sql_print_warning("InnoDB: %s of persistent statistics requested" + " for table %`.*s.%`s" + " but the required persistent statistics storage" + " is corrupted.", + recalc ? 
"Recalculation" : "Fetch", + int(table->name.dblen()), table->name.m_name, + table->name.basename()); + /* fall through */ + case SCHEMA_NOT_EXIST: + err= DB_STATS_DO_NOT_EXIST; + } + } + + dict_stats_update_transient(table); + } + + return err; +} + /** Open an InnoDB table @param[in] name table name @return error code @@ -7958,6 +7935,17 @@ error, m_prebuilt->table->flags, m_user_thd); #ifdef WITH_WSREP +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("sync.wsrep_after_write_row", + { + const char act[]= + "now " + "SIGNAL sync.wsrep_after_write_row_reached " + "WAIT_FOR signal.wsrep_after_write_row"; + DBUG_ASSERT(!debug_sync_set_action(m_user_thd, STRING_WITH_LEN(act))); + };); +#endif /* ENABLED_DEBUG_SYNC */ + if (!error_result && trx->is_wsrep() && !trx->is_bulk_insert() && wsrep_thd_is_local(m_user_thd) @@ -13338,7 +13326,7 @@ if (!error) { - dict_stats_update(info.table(), DICT_STATS_EMPTY_TABLE); + dict_stats_empty_table_and_save(info.table()); if (!info.table()->is_temporary()) log_write_up_to(trx->commit_lsn, true); info.table()->release(); @@ -13387,6 +13375,8 @@ DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); } + ut_ad(m_prebuilt->table->stat_initialized()); + if (m_prebuilt->table->space == fil_system.sys_space) { ib_senderrf( m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, @@ -13460,23 +13450,17 @@ err, m_prebuilt->table->flags, NULL)); } - if (dict_stats_is_persistent_enabled(m_prebuilt->table)) { - dberr_t ret; - - /* Adjust the persistent statistics. 
*/ - ret = dict_stats_update(m_prebuilt->table, - DICT_STATS_RECALC_PERSISTENT); + dict_table_t* t = m_prebuilt->table; - if (ret != DB_SUCCESS) { - push_warning_printf( - ha_thd(), - Sql_condition::WARN_LEVEL_WARN, - ER_ALTER_INFO, - "Error updating stats for table '%s'" - " after table rebuild: %s", - m_prebuilt->table->name.m_name, - ut_strerr(ret)); - } + if (dberr_t ret = dict_stats_update_persistent_try(t)) { + push_warning_printf( + ha_thd(), + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Error updating stats after" + " ALTER TABLE %`.*s.%`s IMPORT TABLESPACE: %s", + int(t->name.dblen()), t->name.m_name, + t->name.basename(), ut_strerr(ret)); } DBUG_RETURN(0); @@ -13619,8 +13603,6 @@ err= lock_table_children(table, trx); } - dict_table_t *table_stats= nullptr, *index_stats= nullptr; - MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; if (err == DB_SUCCESS) err= lock_table_for_trx(table, trx, LOCK_X); @@ -13645,7 +13627,7 @@ /* This looks like the rollback of ALTER TABLE...ADD PARTITION that was caused by MDL timeout. We could have written undo log for inserting the data into the new partitions. */ - if (table->stat_persistent != DICT_STATS_PERSISTENT_OFF) + if (!(table->stat & dict_table_t::STATS_PERSISTENT_OFF)) { /* We do not really know if we are holding MDL_EXCLUSIVE. 
Even though this code is handling the case that we are not holding @@ -13659,37 +13641,18 @@ #endif DEBUG_SYNC(thd, "before_delete_table_stats"); + dict_stats stats; + bool stats_failed= true; - if (err == DB_SUCCESS && dict_stats_is_persistent_enabled(table) && + if (err == DB_SUCCESS && table->stats_is_persistent() && !table->is_stats_table()) { - table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, - thd, &mdl_table); - dict_sys.unfreeze(); - } - - index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, - thd, &mdl_index); - dict_sys.unfreeze(); - } - + stats_failed= stats.open(thd); const bool skip_wait{table->name.is_temporary()}; - if (table_stats && index_stats && - !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && - !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && - !(err= lock_table_for_trx(table_stats, trx, LOCK_X, skip_wait))) - err= lock_table_for_trx(index_stats, trx, LOCK_X, skip_wait); + if (!stats_failed && + !(err= lock_table_for_trx(stats.table(), trx, LOCK_X, skip_wait))) + err= lock_table_for_trx(stats.index(), trx, LOCK_X, skip_wait); if (err != DB_SUCCESS && skip_wait) { @@ -13698,10 +13661,8 @@ ut_ad(err == DB_LOCK_WAIT); ut_ad(trx->error_state == DB_SUCCESS); err= DB_SUCCESS; - dict_table_close(table_stats, false, thd, mdl_table); - dict_table_close(index_stats, false, thd, mdl_index); - table_stats= nullptr; - index_stats= nullptr; + stats.close(); + stats_failed= true; } } @@ -13772,13 +13733,11 @@ else if (rollback_add_partition) purge_sys.resume_FTS(); #endif - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); if 
(trx != parent_trx) trx->free(); + if (!stats_failed) + stats.close(); DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); } @@ -13793,7 +13752,7 @@ err= trx->drop_table_foreign(table->name); } - if (err == DB_SUCCESS && table_stats && index_stats) + if (err == DB_SUCCESS && !stats_failed) err= trx->drop_table_statistics(table->name); if (err != DB_SUCCESS) goto err_exit; @@ -13804,11 +13763,9 @@ std::vector deleted; trx->commit(deleted); - if (table_stats) - dict_table_close(table_stats, true, thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, true, thd, mdl_index); row_mysql_unlock_data_dictionary(trx); + if (!stats_failed) + stats.close(); for (pfs_os_file_t d : deleted) os_file_close(d); log_write_up_to(trx->commit_lsn, true); @@ -14004,9 +13961,6 @@ ib_table->name.m_name, ib_table->id); const char *name= mem_heap_strdup(heap, ib_table->name.m_name); - dict_table_t *table_stats = nullptr, *index_stats = nullptr; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; - dberr_t error= lock_table_children(ib_table, trx); if (error == DB_SUCCESS) @@ -14014,6 +13968,7 @@ const bool fts= error == DB_SUCCESS && ib_table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + const bool pause_purge= error == DB_SUCCESS && ib_table->get_ref_count() > 1; if (fts) { @@ -14021,45 +13976,33 @@ purge_sys.stop_FTS(*ib_table); error= fts_lock_tables(trx, *ib_table); } + else if (pause_purge) + purge_sys.stop_FTS(); - /* Wait for purge threads to stop using the table. */ - for (uint n = 15; ib_table->get_ref_count() > 1; ) + if (error == DB_SUCCESS) { - if (!--n) + /* Wait for purge threads to stop using the table. 
*/ + for (uint n = 15; ib_table->get_ref_count() > 1; ) { - error= DB_LOCK_WAIT_TIMEOUT; - break; + if (!--n) + { + error= DB_LOCK_WAIT_TIMEOUT; + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); } - std::this_thread::sleep_for(std::chrono::milliseconds(50)); } - if (error == DB_SUCCESS && dict_stats_is_persistent_enabled(ib_table) && + dict_stats stats; + bool stats_failed= true; + + if (error == DB_SUCCESS && ib_table->stats_is_persistent() && !ib_table->is_stats_table()) { - table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats= dict_acquire_mdl_shared(table_stats, m_user_thd, - &mdl_table); - dict_sys.unfreeze(); - } - index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) - { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats= dict_acquire_mdl_shared(index_stats, m_user_thd, - &mdl_index); - dict_sys.unfreeze(); - } - - if (table_stats && index_stats && - !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && - !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && - !(error= lock_table_for_trx(table_stats, trx, LOCK_X))) - error= lock_table_for_trx(index_stats, trx, LOCK_X); + stats_failed= stats.open(m_user_thd); + if (!stats_failed && + !(error= lock_table_for_trx(stats.table(), trx, LOCK_X))) + error= lock_table_for_trx(stats.index(), trx, LOCK_X); } if (error == DB_SUCCESS) @@ -14123,7 +14066,7 @@ if (!err) { - dict_stats_update(m_prebuilt->table, DICT_STATS_EMPTY_TABLE); + dict_stats_empty_table_and_save(m_prebuilt->table); log_write_up_to(trx->commit_lsn, true); row_prebuilt_t *prebuilt= m_prebuilt; uchar *upd_buf= m_upd_buf; @@ -14151,15 +14094,46 @@ } trx->free(); - + if (!stats_failed) + stats.close(); mem_heap_free(heap); + DBUG_RETURN(err); +} - if (table_stats) - dict_table_close(table_stats, false, m_user_thd, mdl_table); - if (index_stats) - dict_table_close(index_stats, 
false, m_user_thd, mdl_index); +/** Deinitialize InnoDB persistent statistics, forcing them +to be reloaded on subsequent ha_innobase::open(). +@param t table for which the cached STATS_PERSISTENT are to be evicted */ +static void stats_deinit(dict_table_t *t) noexcept +{ + ut_ad(dict_sys.frozen()); + ut_ad(t->get_ref_count() == 0); - DBUG_RETURN(err); + if (t->is_temporary() || t->no_rollback()) + return; + + t->stats_mutex_lock(); + t->stat= t->stat & ~dict_table_t::STATS_INITIALIZED; + MEM_UNDEFINED(&t->stat_n_rows, sizeof t->stat_n_rows); + MEM_UNDEFINED(&t->stat_clustered_index_size, + sizeof t->stat_clustered_index_size); + MEM_UNDEFINED(&t->stat_sum_of_other_index_sizes, + sizeof t->stat_sum_of_other_index_sizes); + MEM_UNDEFINED(&t->stat_modified_counter, sizeof t->stat_modified_counter); +#ifdef HAVE_valgrind + for (dict_index_t *i= dict_table_get_first_index(t); i; + i= dict_table_get_next_index(i)) + { + MEM_UNDEFINED(i->stat_n_diff_key_vals, + i->n_uniq * sizeof *i->stat_n_diff_key_vals); + MEM_UNDEFINED(i->stat_n_sample_sizes, + i->n_uniq * sizeof *i->stat_n_sample_sizes); + MEM_UNDEFINED(i->stat_n_non_null_key_vals, + i->n_uniq * sizeof *i->stat_n_non_null_key_vals); + MEM_UNDEFINED(&i->stat_index_size, sizeof i->stat_index_size); + MEM_UNDEFINED(&i->stat_n_leaf_pages, sizeof i->stat_n_leaf_pages); + } +#endif /* HAVE_valgrind */ + t->stats_mutex_unlock(); } /*********************************************************************//** @@ -14184,8 +14158,6 @@ trx_t* trx = innobase_trx_allocate(thd); trx_start_for_ddl(trx); - dict_table_t *table_stats = nullptr, *index_stats = nullptr; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; char norm_from[MAX_FULL_NAME_LEN]; char norm_to[MAX_FULL_NAME_LEN]; @@ -14195,45 +14167,49 @@ dberr_t error = DB_SUCCESS; const bool from_temp = dict_table_t::is_temporary_name(norm_from); + dict_table_t* t; + bool pause_purge = false, fts_exist = false; + if (from_temp) { /* There is no need to lock any FOREIGN KEY 
child tables. */ - } else if (dict_table_t *table = dict_table_open_on_name( - norm_from, false, DICT_ERR_IGNORE_FK_NOKEY)) { - error = lock_table_children(table, trx); - if (error == DB_SUCCESS) { - error = lock_table_for_trx(table, trx, LOCK_X); + t = nullptr; + } else { + t = dict_table_open_on_name( + norm_from, false, DICT_ERR_IGNORE_FK_NOKEY); + if (t) { + error = lock_table_children(t, trx); + if (error == DB_SUCCESS) { + error = lock_table_for_trx(t, trx, LOCK_X); + } + fts_exist = error == DB_SUCCESS && t->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + pause_purge = error == DB_SUCCESS + && t->get_ref_count() > 1; + if (fts_exist) { + fts_optimize_remove_table(t); + purge_sys.stop_FTS(*t); + if (error == DB_SUCCESS) { + error = fts_lock_tables(trx, *t); + } + } else if (pause_purge) { + purge_sys.stop_FTS(); + } } - table->release(); } + dict_stats stats; + bool stats_fail = true; + if (strcmp(norm_from, TABLE_STATS_NAME) && strcmp(norm_from, INDEX_STATS_NAME) && strcmp(norm_to, TABLE_STATS_NAME) && strcmp(norm_to, INDEX_STATS_NAME)) { - table_stats = dict_table_open_on_name(TABLE_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared( - table_stats, thd, &mdl_table); - dict_sys.unfreeze(); - } - index_stats = dict_table_open_on_name(INDEX_STATS_NAME, false, - DICT_ERR_IGNORE_NONE); - if (index_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared( - index_stats, thd, &mdl_index); - dict_sys.unfreeze(); - } - - if (error == DB_SUCCESS && table_stats && index_stats - && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) - && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { - error = lock_table_for_trx(table_stats, trx, LOCK_X, - from_temp); + stats_fail = stats.open(thd); + if (!stats_fail && error == DB_SUCCESS) { + error = lock_table_for_trx(stats.table(), trx, + LOCK_X, from_temp); if (error == DB_SUCCESS) { - error = 
lock_table_for_trx(index_stats, trx, + error = lock_table_for_trx(stats.index(), trx, LOCK_X, from_temp); } if (error != DB_SUCCESS && from_temp) { @@ -14244,12 +14220,8 @@ we cannot lock the tables, when the table is being renamed from from a temporary name. */ - dict_table_close(table_stats, false, thd, - mdl_table); - dict_table_close(index_stats, false, thd, - mdl_index); - table_stats = nullptr; - index_stats = nullptr; + stats.close(); + stats_fail = true; } } } @@ -14276,7 +14248,7 @@ DEBUG_SYNC(thd, "after_innobase_rename_table"); - if (error == DB_SUCCESS && table_stats && index_stats) { + if (error == DB_SUCCESS && !stats_fail) { error = dict_stats_rename_table(norm_from, norm_to, trx); if (error == DB_DUPLICATE_KEY) { /* The duplicate may also occur in @@ -14289,33 +14261,52 @@ if (error == DB_SUCCESS) { trx->flush_log_later = true; + if (t) { + ut_ad(dict_sys.locked()); + if (fts_exist) { + fts_optimize_add_table(t); + } + if (UNIV_LIKELY(t->release())) { + stats_deinit(t); + } else { + ut_ad("unexpected references" == 0); + } + } innobase_commit_low(trx); } else { + if (t) { + if (fts_exist) { + fts_optimize_add_table(t); + } + t->release(); + } trx->rollback(); } - if (table_stats) { - dict_table_close(table_stats, true, thd, mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, true, thd, mdl_index); - } row_mysql_unlock_data_dictionary(trx); + + if (fts_exist || pause_purge) { + purge_sys.resume_FTS(); + } + if (error == DB_SUCCESS) { log_write_up_to(trx->commit_lsn, true); } trx->flush_log_later = false; trx->free(); + if (!stats_fail) { + stats.close(); + } if (error == DB_DUPLICATE_KEY) { /* We are not able to deal with handler::get_dup_key() during DDL operations, because the duplicate key would exist in metadata tables, not in the user table. 
*/ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to); - error = DB_ERROR; + DBUG_RETURN(HA_ERR_GENERIC); } else if (error == DB_LOCK_WAIT_TIMEOUT) { my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0)); - error = DB_LOCK_WAIT; + DBUG_RETURN(HA_ERR_GENERIC); } DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); @@ -14529,7 +14520,7 @@ ulint stat_clustered_index_size; - ut_a(m_prebuilt->table->stat_initialized); + ut_ad(m_prebuilt->table->stat_initialized()); stat_clustered_index_size = m_prebuilt->table->stat_clustered_index_size; @@ -14656,7 +14647,7 @@ rec_per_key_t rec_per_key; ib_uint64_t n_diff; - ut_a(index->table->stat_initialized); + ut_ad(index->table->stat_initialized()); ut_ad(i < dict_index_get_n_unique(index)); ut_ad(!dict_index_is_spatial(index)); @@ -14794,63 +14785,82 @@ ib_table = m_prebuilt->table; DBUG_ASSERT(ib_table->get_ref_count() > 0); - if (!ib_table->is_readable()) { + if (!ib_table->is_readable() + || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { dict_stats_empty_table(ib_table, true); - } - - if (flag & HA_STATUS_TIME) { - if (is_analyze || innobase_stats_on_metadata) { + } else if (flag & HA_STATUS_TIME) { + stats.update_time = ib_table->update_time; + if (!is_analyze && !innobase_stats_on_metadata) { + goto stats_fetch; + } - dict_stats_upd_option_t opt; - dberr_t ret; + dberr_t ret; + m_prebuilt->trx->op_info = "updating table statistics"; - m_prebuilt->trx->op_info = "updating table statistics"; + if (ib_table->stats_is_persistent() + && !srv_read_only_mode + && dict_stats_persistent_storage_check(false) + == SCHEMA_OK) { + if (is_analyze) { + dict_stats_recalc_pool_del(ib_table->id, + false); +recalc: + ret = statistics_init(ib_table, is_analyze); + } else { + /* This is e.g. 
'SHOW INDEXES' */ + ret = statistics_init(ib_table, is_analyze); + switch (ret) { + case DB_SUCCESS: + case DB_READ_ONLY: + break; + default: + goto error; + case DB_STATS_DO_NOT_EXIST: + if (!ib_table + ->stats_is_auto_recalc()) { + break; + } - if (dict_stats_is_persistent_enabled(ib_table)) { - if (is_analyze) { - if (!srv_read_only_mode) { - dict_stats_recalc_pool_del( - ib_table->id, false); + if (opt_bootstrap) { + break; } - opt = DICT_STATS_RECALC_PERSISTENT; - } else { - /* This is e.g. 'SHOW INDEXES', fetch - the persistent stats from disk. */ - opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; +#ifdef WITH_WSREP + if (wsrep_thd_skip_locking( + m_user_thd)) { + break; + } +#endif + is_analyze = true; + goto recalc; } - } else { - opt = DICT_STATS_RECALC_TRANSIENT; } - - ret = dict_stats_update(ib_table, opt); - + } else { + ret = dict_stats_update_transient(ib_table); if (ret != DB_SUCCESS) { +error: m_prebuilt->trx->op_info = ""; DBUG_RETURN(HA_ERR_GENERIC); } - - m_prebuilt->trx->op_info = - "returning various info to MariaDB"; } - - stats.update_time = (ulong) ib_table->update_time; + m_prebuilt->trx->op_info = "returning various info to MariaDB"; + } else { +stats_fetch: + statistics_init(ib_table, false); } - dict_stats_init(ib_table); - if (flag & HA_STATUS_VARIABLE) { ulint stat_clustered_index_size; ulint stat_sum_of_other_index_sizes; - ut_a(ib_table->stat_initialized); - #if !defined NO_ELISION && !defined SUX_LOCK_GENERIC if (xbegin()) { if (ib_table->stats_mutex_is_locked()) xabort(); + ut_ad(ib_table->stat_initialized()); + n_rows = ib_table->stat_n_rows; stat_clustered_index_size @@ -14865,6 +14875,8 @@ { ib_table->stats_shared_lock(); + ut_ad(ib_table->stat_initialized()); + n_rows = ib_table->stat_n_rows; stat_clustered_index_size @@ -14998,7 +15010,7 @@ auto _ = make_scope_exit([ib_table]() { ib_table->stats_shared_unlock(); }); - ut_a(ib_table->stat_initialized); + ut_ad(ib_table->stat_initialized()); for (uint i = 0; i < table->s->keys; 
i++) { ulong j; @@ -15694,7 +15706,7 @@ << foreign->foreign_table_name; } } else { - dict_table_close(ref_table, true); + ref_table->release(); } } @@ -15852,7 +15864,7 @@ stmt_boundary: trx->bulk_insert_apply(); trx->end_bulk_insert(*m_prebuilt->table); - trx->bulk_insert = false; + trx->bulk_insert &= TRX_DDL_BULK; break; case HA_EXTRA_NO_KEYREAD: (void)check_trx_exists(ha_thd()); @@ -15911,32 +15923,47 @@ break; case HA_EXTRA_END_ALTER_COPY: trx = check_trx_exists(ha_thd()); - if (m_prebuilt->table->skip_alter_undo) { - if (dberr_t err= trx->bulk_insert_apply()) { - m_prebuilt->table->skip_alter_undo = 0; - return convert_error_code_to_mysql( - err, - m_prebuilt->table->flags, - trx->mysql_thd); - } - - trx->end_bulk_insert(*m_prebuilt->table); - trx->bulk_insert = false; - /* During copy alter operation, InnoDB - updates the stats only for non-persistent - tables. */ - if (!dict_stats_is_persistent_enabled( - m_prebuilt->table)) { - dict_stats_update_if_needed( - m_prebuilt->table, *trx); - } + if (!m_prebuilt->table->skip_alter_undo) { + /* This could be invoked inside INSERT...SELECT. + We do not want any extra log writes, because + they could cause a severe performance regression. */ + break; } m_prebuilt->table->skip_alter_undo = 0; + if (dberr_t err= trx->bulk_insert_apply()) { + m_prebuilt->table->skip_alter_undo = 0; + return convert_error_code_to_mysql( + err, m_prebuilt->table->flags, + trx->mysql_thd); + } + + trx->end_bulk_insert(*m_prebuilt->table); + trx->bulk_insert &= TRX_DML_BULK; if (!m_prebuilt->table->is_temporary() && !high_level_read_only) { + /* During copy_data_between_tables(), InnoDB only + updates transient statistics. */ + if (!m_prebuilt->table->stats_is_persistent()) { + dict_stats_update_if_needed(m_prebuilt->table, + *trx); + } + /* The extra log write is necessary for + ALTER TABLE...ALGORITHM=COPY, because + a normal transaction commit would be a no-op + because no undo log records were generated. 
+ This log write will also be unnecessarily executed + during CREATE...SELECT, which is the other caller of + handler::extra(HA_EXTRA_BEGIN_ALTER_COPY). */ log_buffer_flush_to_disk(); } break; + case HA_EXTRA_ABORT_ALTER_COPY: + if (m_prebuilt->table->skip_alter_undo) { + trx = check_trx_exists(ha_thd()); + m_prebuilt->table->skip_alter_undo = 0; + trx->rollback(); + } + break; default:/* Do nothing */ ; } @@ -16031,7 +16058,8 @@ break; } - trx->bulk_insert = false; + ut_ad(trx->bulk_insert != TRX_DDL_BULK); + trx->bulk_insert = TRX_NO_BULK; trx->last_stmt_start = trx->undo_no; } @@ -16239,7 +16267,7 @@ if (!trx->bulk_insert) { break; } - trx->bulk_insert = false; + trx->bulk_insert &= TRX_DDL_BULK; trx->last_stmt_start = trx->undo_no; } @@ -17294,7 +17322,12 @@ param_new = info->option_struct; param_old = table->s->option_struct; - innobase_copy_frm_flags_from_create_info(m_prebuilt->table, info); + m_prebuilt->table->stats_mutex_lock(); + if (!m_prebuilt->table->stat_initialized()) { + innobase_copy_frm_flags_from_create_info( + m_prebuilt->table, info); + } + m_prebuilt->table->stats_mutex_unlock(); if (table_changes != IS_EQUAL_YES) { @@ -17383,7 +17416,8 @@ " higher than innodb_io_capacity_max %lu", in_val, srv_max_io_capacity); - srv_max_io_capacity = (in_val & ~(~0UL >> 1)) + /* Avoid overflow. */ + srv_max_io_capacity = (in_val >= SRV_MAX_IO_CAPACITY_LIMIT / 2) ? in_val : in_val * 2; push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, @@ -17546,22 +17580,6 @@ return(ret); } -extern void buf_resize_start(); - -/** Update the system variable innodb_buffer_pool_size using the "saved" -value. This function is registered as a callback with MySQL. 
-@param[in] save immediate result from check function */ -static -void -innodb_buffer_pool_size_update(THD*,st_mysql_sys_var*,void*, const void* save) -{ - snprintf(export_vars.innodb_buffer_pool_resize_status, - sizeof(export_vars.innodb_buffer_pool_resize_status), - "Buffer pool resize requested"); - - buf_resize_start(); -} - /** The latest assigned innodb_ft_aux_table name */ static char* innodb_ft_aux_table; @@ -17576,11 +17594,16 @@ int len = sizeof buf; if (const char* table_name = value->val_str(value, buf, &len)) { + /* Because we are not acquiring MDL on the table name, + we must contiguously hold dict_sys.latch while we are + examining the table, to protect us against concurrent DDL. */ + dict_sys.lock(SRW_LOCK_CALL); if (dict_table_t* table = dict_table_open_on_name( - table_name, false, DICT_ERR_IGNORE_NONE)) { + table_name, true, DICT_ERR_IGNORE_NONE)) { + table->release(); const table_id_t id = dict_table_has_fts_index(table) ? table->id : 0; - dict_table_close(table); + dict_sys.unlock(); if (id) { innodb_ft_aux_table_id = id; if (table_name == buf) { @@ -17591,12 +17614,12 @@ len); } - *static_cast(save) = table_name; return 0; } + } else { + dict_sys.unlock(); } - return 1; } else { *static_cast(save) = NULL; @@ -18385,14 +18408,14 @@ mysql_mutex_unlock(&buf_pool.mutex); } +static my_bool innodb_log_checkpoint_now; #ifdef UNIV_DEBUG -static my_bool innodb_log_checkpoint_now = TRUE; static my_bool innodb_buf_flush_list_now = TRUE; static uint innodb_merge_threshold_set_all_debug = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; +#endif /** Force an InnoDB log checkpoint. */ -/** Force an InnoDB log checkpoint. */ static void checkpoint_now_set(THD* thd, st_mysql_sys_var*, void*, const void *save) @@ -18416,14 +18439,21 @@ const auto size= log_sys.is_encrypted() ? 
SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT; mysql_mutex_unlock(&LOCK_global_system_variables); - lsn_t lsn; - while (log_sys.last_checkpoint_lsn.load(std::memory_order_acquire) + size < - (lsn= log_sys.get_lsn(std::memory_order_acquire))) + while (!thd_kill_level(thd)) + { + log_sys.latch.wr_lock(SRW_LOCK_CALL); + lsn_t cp= log_sys.last_checkpoint_lsn.load(std::memory_order_relaxed), + lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); + if (cp + size >= lsn) + break; log_make_checkpoint(); + } mysql_mutex_lock(&LOCK_global_system_variables); } +#ifdef UNIV_DEBUG /****************************************************************//** Force a dirty pages flush now. */ static @@ -18605,7 +18635,7 @@ " innodb_log_buffer_size=%u", MYF(0), log_sys.buf_size); else { - switch (log_sys.resize_start(*static_cast(save))) { + switch (log_sys.resize_start(*static_cast(save), thd)) { case log_t::RESIZE_NO_CHANGE: break; case log_t::RESIZE_IN_PROGRESS: @@ -18617,12 +18647,11 @@ ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_CANT_CREATE_HANDLER_FILE); break; case log_t::RESIZE_STARTED: - const lsn_t start{log_sys.resize_in_progress()}; for (timespec abstime;;) { if (thd_kill_level(thd)) { - log_sys.resize_abort(); + log_sys.resize_abort(thd); break; } @@ -18637,37 +18666,25 @@ resizing= log_sys.resize_in_progress(); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); - if (start > log_sys.get_lsn()) + if (!resizing || !log_sys.resize_running(thd)) + break; + log_sys.latch.wr_lock(SRW_LOCK_CALL); + while (resizing > log_sys.get_lsn()) { ut_ad(!log_sys.is_mmap()); /* The server is almost idle. Write dummy FILE_CHECKPOINT records to ensure that the log resizing will complete. 
*/ - log_sys.latch.wr_lock(SRW_LOCK_CALL); - while (start > log_sys.get_lsn()) - { - mtr_t mtr; - mtr.start(); - mtr.commit_files(log_sys.last_checkpoint_lsn); - } - log_sys.latch.wr_unlock(); + mtr_t mtr; + mtr.start(); + mtr.commit_files(log_sys.last_checkpoint_lsn); } - if (!resizing || resizing > start /* only wait for our resize */) - break; + log_sys.latch.wr_unlock(); } } } mysql_mutex_lock(&LOCK_global_system_variables); } -static void innodb_log_spin_wait_delay_update(THD *, st_mysql_sys_var*, - void *, const void *save) -{ - log_sys.latch.wr_lock(SRW_LOCK_CALL); - mtr_t::spin_wait_delay= *static_cast(save); - mtr_t::finisher_update(); - log_sys.latch.wr_unlock(); -} - /** Update innodb_status_output or innodb_status_output_locks, which control InnoDB "status monitor" output to the error log. @param[out] var current value @@ -18987,7 +19004,7 @@ static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, PLUGIN_VAR_RQCMDARG, "Number of IOPs the server can do. Tunes the background IO rate", - NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0); + NULL, innodb_io_capacity_update, 200, 100, SRV_MAX_IO_CAPACITY_LIMIT, 0); static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity, PLUGIN_VAR_RQCMDARG, @@ -18996,12 +19013,12 @@ SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100, SRV_MAX_IO_CAPACITY_LIMIT, 0); -#ifdef UNIV_DEBUG static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now, PLUGIN_VAR_OPCMDARG, - "Force checkpoint now", + "Write back dirty pages from the buffer pool and update the log checkpoint", NULL, checkpoint_now_set, FALSE); +#ifdef UNIV_DEBUG static MYSQL_SYSVAR_BOOL(buf_flush_list_now, innodb_buf_flush_list_now, PLUGIN_VAR_OPCMDARG, "Force dirty page flush now", @@ -19157,12 +19174,12 @@ " SHOW TABLE STATUS for tables that use transient statistics (off by default)", NULL, NULL, FALSE); -static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages, +static MYSQL_SYSVAR_UINT(stats_transient_sample_pages, srv_stats_transient_sample_pages, 
PLUGIN_VAR_RQCMDARG, "The number of leaf index pages to sample when calculating transient" " statistics (if persistent statistics are not used, default 8)", - NULL, NULL, 8, 1, ~0ULL, 0); + NULL, NULL, 8, 1, ~0U, 0); static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent, PLUGIN_VAR_OPCMDARG, @@ -19178,12 +19195,12 @@ " new statistics)", NULL, NULL, TRUE); -static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages, +static MYSQL_SYSVAR_UINT(stats_persistent_sample_pages, srv_stats_persistent_sample_pages, PLUGIN_VAR_RQCMDARG, "The number of leaf index pages to sample when calculating persistent" " statistics (by ANALYZE, default 20)", - NULL, NULL, 20, 1, ~0ULL, 0); + NULL, NULL, 20, 1, ~0U, 0); static MYSQL_SYSVAR_ULONGLONG(stats_modified_counter, srv_stats_modified_counter, PLUGIN_VAR_RQCMDARG, @@ -19222,11 +19239,12 @@ "Data file autoextend increment in megabytes", NULL, NULL, 64, 1, 1000, 0); -static MYSQL_SYSVAR_SIZE_T(buffer_pool_chunk_size, srv_buf_pool_chunk_unit, - PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Size of a single memory chunk" - " for resizing buffer pool. Online buffer pool resizing happens at this" - " granularity. 
0 means autosize this variable based on buffer pool size.", +static size_t innodb_buffer_pool_chunk_size; + +static MYSQL_SYSVAR_SIZE_T(buffer_pool_chunk_size, + innodb_buffer_pool_chunk_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", NULL, NULL, 0, 0, SIZE_T_MAX, 1024 * 1024); @@ -19525,11 +19543,12 @@ nullptr, innodb_log_file_size_update, 96 << 20, 4 << 20, std::numeric_limits::max(), 4096); -static MYSQL_SYSVAR_UINT(log_spin_wait_delay, mtr_t::spin_wait_delay, - PLUGIN_VAR_OPCMDARG, - "Delay between log buffer spin lock polls (0 to use a blocking latch)", - nullptr, innodb_log_spin_wait_delay_update, - 0, 0, 6000, 0); +static uint innodb_log_spin_wait_delay; + +static MYSQL_SYSVAR_UINT(log_spin_wait_delay, innodb_log_spin_wait_delay, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", + nullptr, nullptr, 0, 0, 6000, 0); static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct, PLUGIN_VAR_RQCMDARG, @@ -19634,37 +19653,10 @@ AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */ AUTOINC_NO_LOCKING, 0); /* Maximum value */ -#ifdef HAVE_URING -# include -static utsname uname_for_io_uring; -#else -static -#endif -bool innodb_use_native_aio_default() -{ -#ifdef HAVE_URING - utsname &u= uname_for_io_uring; - if (!uname(&u) && u.release[0] == '5' && u.release[1] == '.' 
&& - u.release[2] == '1' && u.release[3] >= '1' && u.release[3] <= '5' && - u.release[4] == '.') - { - if (u.release[3] == '5') { - const char *s= strstr(u.version, "5.15."); - if (s || (s= strstr(u.release, "5.15."))) - if ((s[5] >= '3' || s[6] >= '0')) - return true; /* 5.15.3 and later should be fine */ - } - io_uring_may_be_unsafe= u.release; - return false; /* working around io_uring hangs (MDEV-26674) */ - } -#endif - return true; -} - static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Use native AIO if supported on this platform.", - NULL, NULL, innodb_use_native_aio_default()); + NULL, NULL, TRUE); #ifdef HAVE_LIBNUMA static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave, @@ -19953,6 +19945,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(autoextend_increment), MYSQL_SYSVAR(buffer_pool_size), +#if defined __linux__ || !defined DBUG_OFF + MYSQL_SYSVAR(buffer_pool_size_auto_min), +#endif + MYSQL_SYSVAR(buffer_pool_size_max), MYSQL_SYSVAR(buffer_pool_chunk_size), MYSQL_SYSVAR(buffer_pool_filename), MYSQL_SYSVAR(buffer_pool_dump_now), @@ -20079,8 +20075,8 @@ MYSQL_SYSVAR(monitor_reset_all), MYSQL_SYSVAR(purge_threads), MYSQL_SYSVAR(purge_batch_size), -#ifdef UNIV_DEBUG MYSQL_SYSVAR(log_checkpoint_now), +#ifdef UNIV_DEBUG MYSQL_SYSVAR(buf_flush_list_now), MYSQL_SYSVAR(merge_threshold_set_all_debug), #endif /* UNIV_DEBUG */ @@ -21057,90 +21053,6 @@ cs2, to, static_cast(len), errors))); } -/** Validate the requested buffer pool size. Also, reserve the necessary -memory needed for buffer pool resize. -@param[in] thd thread handle -@param[out] save immediate result for update function -@param[in] value incoming string -@return 0 on success, 1 on failure. 
-*/ -static -int -innodb_buffer_pool_size_validate( - THD* thd, - st_mysql_sys_var*, - void* save, - struct st_mysql_value* value) -{ - longlong intbuf; - - value->val_int(value, &intbuf); - - if (static_cast(intbuf) < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) { - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "innodb_buffer_pool_size must be at least" - " %lld for innodb_page_size=%lu", - MYSQL_SYSVAR_NAME(buffer_pool_size).min_val, - srv_page_size); - return(1); - } - - if (!srv_was_started) { - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "Cannot update innodb_buffer_pool_size," - " because InnoDB is not started."); - return(1); - } - - mysql_mutex_lock(&buf_pool.mutex); - - if (srv_buf_pool_old_size != srv_buf_pool_size) { - mysql_mutex_unlock(&buf_pool.mutex); - my_printf_error(ER_WRONG_ARGUMENTS, - "Another buffer pool resize is already in progress.", MYF(0)); - return(1); - } - - ulint requested_buf_pool_size = buf_pool_size_align(ulint(intbuf)); - - *static_cast(save) = requested_buf_pool_size; - - if (srv_buf_pool_size == ulint(intbuf)) { - mysql_mutex_unlock(&buf_pool.mutex); - /* nothing to do */ - return(0); - } - - if (srv_buf_pool_size == requested_buf_pool_size) { - mysql_mutex_unlock(&buf_pool.mutex); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_WRONG_ARGUMENTS, - "innodb_buffer_pool_size must be at least" - " innodb_buffer_pool_chunk_size=%zu", - srv_buf_pool_chunk_unit); - /* nothing to do */ - return(0); - } - - srv_buf_pool_size = requested_buf_pool_size; - mysql_mutex_unlock(&buf_pool.mutex); - - if (intbuf != static_cast(requested_buf_pool_size)) { - char buf[64]; - int len = 64; - value->val_str(value, buf, &len); - push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, - ER_TRUNCATED_WRONG_VALUE, - "Truncated incorrect %-.32s value: '%-.128s'", - mysql_sysvar_buffer_pool_size.name, - value->val_str(value, buf, &len)); - } - - return(0); -} - 
/*************************************************************//** Check for a valid value of innobase_compression_algorithm. @return 0 for valid innodb_compression_algorithm. */ @@ -21436,19 +21348,3 @@ if (UNIV_LIKELY_NULL(local_heap)) mem_heap_free(local_heap); } - -/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, -if needed. -@param[in] size size in bytes -@return aligned size */ -ulint buf_pool_size_align(ulint size) noexcept -{ - const size_t m = srv_buf_pool_chunk_unit; - size = ut_max(size, (size_t) MYSQL_SYSVAR_NAME(buffer_pool_size).min_val); - - if (size % m == 0) { - return(size); - } else { - return (size / m + 1) * m; - } -} diff -Nru mariadb-10.11.11/storage/innobase/handler/ha_innodb.h mariadb-10.11.13/storage/innobase/handler/ha_innodb.h --- mariadb-10.11.11/storage/innobase/handler/ha_innodb.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/ha_innodb.h 2025-05-19 16:14:25.000000000 +0000 @@ -101,6 +101,9 @@ int open(const char *name, int mode, uint test_if_locked) override; + /** Fetch or recalculate InnoDB table statistics */ + dberr_t statistics_init(dict_table_t *table, bool recalc); + handler* clone(const char *name, MEM_ROOT *mem_root) override; int close(void) override; diff -Nru mariadb-10.11.11/storage/innobase/handler/handler0alter.cc mariadb-10.11.13/storage/innobase/handler/handler0alter.cc --- mariadb-10.11.11/storage/innobase/handler/handler0alter.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/handler0alter.cc 2025-05-19 16:14:25.000000000 +0000 @@ -621,6 +621,16 @@ } dict_index_t* index = dict_table_get_first_index(this); + if (instant) { + instant->field_map= static_cast( + mem_heap_dup(heap, instant->field_map, + (index->n_fields - + index->first_user_field()) * + sizeof *instant->field_map)); + instant= static_cast( + mem_heap_dup(heap, instant, sizeof *instant)); + } + bool metadata_changed; { const dict_index_t& i = 
*dict_table_get_first_index(&table); @@ -2241,6 +2251,12 @@ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); } + if (ha_alter_info->create_info->used_fields + & HA_CREATE_USED_SEQUENCE) { + ha_alter_info->unsupported_reason = "SEQUENCE"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + update_thd(); if (!m_prebuilt->table->space) { @@ -5525,6 +5541,12 @@ return false; } + DBUG_EXECUTE_IF("instant_insert_fail", + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Insert into SYS_COLUMNS failed"); + mem_heap_free(info->heap); + return true;); + if (DB_SUCCESS != que_eval_sql( info, "PROCEDURE ADD_COL () IS\n" @@ -6512,6 +6534,8 @@ DBUG_ASSERT(!ctx->add_index); DBUG_ASSERT(!ctx->add_key_numbers); DBUG_ASSERT(!ctx->num_to_add_index); + DBUG_ASSERT(!(ha_alter_info->create_info->used_fields + & HA_CREATE_USED_SEQUENCE)); user_table = ctx->new_table; @@ -6611,8 +6635,9 @@ mem_heap_alloc(ctx->heap, ctx->num_to_add_index * sizeof *ctx->add_key_numbers)); - const bool fts_exist = ctx->new_table->flags2 + const bool have_fts = user_table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + const bool pause_purge = have_fts || user_table->get_ref_count() > 1; /* Acquire a lock on the table before creating any indexes. */ bool table_lock_failed = false; @@ -6639,13 +6664,18 @@ user_table->lock_shared_unlock(); } - if (fts_exist) { - purge_sys.stop_FTS(*ctx->new_table); + if (pause_purge) { + purge_sys.stop_FTS(); + if (have_fts) { + purge_sys.stop_FTS(*user_table, true); + } if (error == DB_SUCCESS) { - error = fts_lock_tables(ctx->trx, *ctx->new_table); + error = fts_lock_tables(ctx->trx, *user_table); } } + ut_ad(user_table->get_ref_count() == 1); + if (error == DB_SUCCESS) { error = lock_sys_tables(ctx->trx); } @@ -7478,7 +7508,7 @@ /* fts_create_common_tables() may drop old common tables, whose files would be deleted here. 
*/ commit_unlock_and_unlink(ctx->trx); - if (fts_exist) { + if (pause_purge) { purge_sys.resume_FTS(); } @@ -7542,10 +7572,11 @@ } } - /* n_ref_count must be 1, because background threads cannot + /* n_ref_count must be 1 (+ InnoDB_share), + because background threads cannot be executing on this very table as we are holding MDL_EXCLUSIVE. */ - ut_ad(ctx->online || user_table->get_ref_count() == 1); + ut_ad(ctx->online || ((user_table->get_ref_count() - 1) <= 1)); if (new_clustered) { online_retry_drop_indexes_low(user_table, ctx->trx); @@ -7574,7 +7605,7 @@ ctx->trx->free(); } trx_commit_for_mysql(ctx->prebuilt->trx); - if (fts_exist) { + if (pause_purge) { purge_sys.resume_FTS(); } @@ -11180,7 +11211,10 @@ DBUG_ENTER("alter_stats_norebuild"); DBUG_ASSERT(!ctx->need_rebuild()); - if (!dict_stats_is_persistent_enabled(ctx->new_table)) { + auto stat = ctx->new_table->stat; + + if (!dict_table_t::stat_initialized(stat) + || !dict_table_t::stats_is_persistent(stat)) { DBUG_VOID_RETURN; } @@ -11189,7 +11223,6 @@ DBUG_ASSERT(index->table == ctx->new_table); if (!(index->type & DICT_FTS)) { - dict_stats_init(ctx->new_table); dict_stats_update_for_index(index); } } @@ -11214,12 +11247,15 @@ { DBUG_ENTER("alter_stats_rebuild"); - if (!table->space - || !dict_stats_is_persistent_enabled(table)) { + if (!table->space || !table->stats_is_persistent() + || dict_stats_persistent_storage_check(false) != SCHEMA_OK) { DBUG_VOID_RETURN; } - dberr_t ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); + dberr_t ret = dict_stats_update_persistent(table); + if (ret == DB_SUCCESS) { + ret = dict_stats_save(table); + } if (ret != DB_SUCCESS) { push_warning_printf( @@ -11332,6 +11368,13 @@ /* A rollback is being requested. So far we may at most have created stubs for ADD INDEX or a copy of the table for rebuild. */ +#if 0 /* FIXME: is there a better way for innodb.innodb-index-online? 
*/ + lock_shared_ha_data(); + auto share = static_cast(get_ha_share_ptr()); + set_ha_share_ptr(nullptr); + unlock_shared_ha_data(); + delete share; +#endif DBUG_RETURN(rollback_inplace_alter_table( ha_alter_info, table, m_prebuilt)); } @@ -11559,34 +11602,16 @@ } } - dict_table_t *table_stats = nullptr, *index_stats = nullptr; - MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dict_stats stats; + bool stats_failed = true; dberr_t error = DB_SUCCESS; if (!ctx0->old_table->is_stats_table() && !ctx0->new_table->is_stats_table()) { - table_stats = dict_table_open_on_name( - TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (table_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - table_stats = dict_acquire_mdl_shared( - table_stats, m_user_thd, &mdl_table); - dict_sys.unfreeze(); - } - index_stats = dict_table_open_on_name( - INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); - if (index_stats) { - dict_sys.freeze(SRW_LOCK_CALL); - index_stats = dict_acquire_mdl_shared( - index_stats, m_user_thd, &mdl_index); - dict_sys.unfreeze(); - } - - if (table_stats && index_stats - && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) - && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) - && !(error = lock_table_for_trx(table_stats, + stats_failed = stats.open(m_user_thd); + if (!stats_failed + && !(error = lock_table_for_trx(stats.table(), trx, LOCK_X))) { - error = lock_table_for_trx(index_stats, trx, LOCK_X); + error = lock_table_for_trx(stats.index(), trx, LOCK_X); } } @@ -11600,15 +11625,9 @@ error = lock_sys_tables(trx); } if (error != DB_SUCCESS) { - if (table_stats) { - dict_table_close(table_stats, false, m_user_thd, - mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, false, m_user_thd, - mdl_index); + if (!stats_failed) { + stats.close(); } - my_error_innodb(error, table_share->table_name.str, 0); if (fts_exist) { purge_sys.resume_FTS(); } @@ -11624,6 +11643,7 @@ trx_start_for_ddl(trx); } + my_error_innodb(error, table_share->table_name.str, 
0); DBUG_RETURN(true); } @@ -11641,15 +11661,10 @@ fail: trx->rollback(); ut_ad(!trx->fts_trx); - if (table_stats) { - dict_table_close(table_stats, true, m_user_thd, - mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, true, m_user_thd, - mdl_index); - } row_mysql_unlock_data_dictionary(trx); + if (!stats_failed) { + stats.close(); + } if (fts_exist) { purge_sys.resume_FTS(); } @@ -11669,14 +11684,14 @@ if (commit_try_rebuild(ha_alter_info, ctx, altered_table, table, - table_stats && index_stats, + !stats_failed, trx, table_share->table_name.str)) { goto fail; } } else if (commit_try_norebuild(ha_alter_info, ctx, altered_table, table, - table_stats && index_stats, + !stats_failed, trx, table_share->table_name.str)) { goto fail; @@ -11699,13 +11714,6 @@ #endif } - if (table_stats) { - dict_table_close(table_stats, true, m_user_thd, mdl_table); - } - if (index_stats) { - dict_table_close(index_stats, true, m_user_thd, mdl_index); - } - /* Commit or roll back the changes to the data dictionary. 
*/ DEBUG_SYNC(m_user_thd, "innodb_alter_inplace_before_commit"); @@ -11854,6 +11862,9 @@ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", DBUG_SUICIDE();); trx->free(); + if (!stats_failed) { + stats.close(); + } if (fts_exist) { purge_sys.resume_FTS(); } @@ -11910,6 +11921,9 @@ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", DBUG_SUICIDE();); trx->free(); + if (!stats_failed) { + stats.close(); + } if (fts_exist) { purge_sys.resume_FTS(); } diff -Nru mariadb-10.11.11/storage/innobase/handler/i_s.cc mariadb-10.11.13/storage/innobase/handler/i_s.cc --- mariadb-10.11.11/storage/innobase/handler/i_s.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/handler/i_s.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2230,7 +2230,7 @@ DBUG_RETURN(0); } else if (!dict_table_has_fts_index(user_table) || !user_table->is_readable()) { - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(0); } @@ -2245,7 +2245,7 @@ fts_table_fetch_doc_ids(trx, &fts_table, deleted); - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); trx->free(); @@ -2578,7 +2578,7 @@ } if (!user_table->fts || !user_table->fts->cache) { - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(0); } @@ -2603,7 +2603,7 @@ } mysql_mutex_unlock(&cache->lock); - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(ret); } @@ -3020,7 +3020,7 @@ } } - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); ut_free(conv_str.f_str); @@ -3145,7 +3145,7 @@ } if (!dict_table_has_fts_index(user_table)) { - dict_table_close(user_table, false, thd, mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); DBUG_RETURN(0); } @@ -3202,7 +3202,7 @@ fts_sql_commit(trx); - dict_table_close(user_table, false, thd, 
mdl_ticket); + dict_table_close(user_table, thd, mdl_ticket); trx->free(); @@ -3388,7 +3388,7 @@ DBUG_RETURN(0); } - buf_stats_get_pool_info(&info); + buf_pool.get_info(&info); table = tables->table; @@ -3937,87 +3937,37 @@ @return 0 on success, 1 on failure */ static int i_s_innodb_buffer_page_fill(THD *thd, TABLE_LIST *tables, Item *) { - int status = 0; - mem_heap_t* heap; - - DBUG_ENTER("i_s_innodb_buffer_page_fill"); - - RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); - - /* deny access to user without PROCESS privilege */ - if (check_global_access(thd, PROCESS_ACL)) { - DBUG_RETURN(0); - } - - heap = mem_heap_create(10000); - - for (ulint n = 0; - n < ut_min(buf_pool.n_chunks, buf_pool.n_chunks_new); n++) { - const buf_block_t* block; - ulint n_blocks; - buf_page_info_t* info_buffer; - ulint num_page; - ulint mem_size; - ulint chunk_size; - ulint num_to_process = 0; - ulint block_id = 0; - - /* Get buffer block of the nth chunk */ - block = buf_pool.chunks[n].blocks; - chunk_size = buf_pool.chunks[n].size; - num_page = 0; - - while (chunk_size > 0) { - /* we cache maximum MAX_BUF_INFO_CACHED number of - buffer page info */ - num_to_process = ut_min(chunk_size, - (ulint)MAX_BUF_INFO_CACHED); - - mem_size = num_to_process * sizeof(buf_page_info_t); - - /* For each chunk, we'll pre-allocate information - structures to cache the page information read from - the buffer pool. Doing so before obtain any mutex */ - info_buffer = (buf_page_info_t*) mem_heap_zalloc( - heap, mem_size); - - /* Obtain appropriate mutexes. 
Since this is diagnostic - buffer pool info printout, we are not required to - preserve the overall consistency, so we can - release mutex periodically */ - mysql_mutex_lock(&buf_pool.mutex); - - /* GO through each block in the chunk */ - for (n_blocks = num_to_process; n_blocks--; block++) { - i_s_innodb_buffer_page_get_info( - &block->page, block_id, - info_buffer + num_page); - block_id++; - num_page++; - } - - mysql_mutex_unlock(&buf_pool.mutex); + DBUG_ENTER("i_s_innodb_buffer_page_fill"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); - /* Fill in information schema table with information - just collected from the buffer chunk scan */ - status = i_s_innodb_buffer_page_fill( - thd, tables, info_buffer, - num_page); - - /* If something goes wrong, break and return */ - if (status) { - break; - } - - mem_heap_empty(heap); - chunk_size -= num_to_process; - num_page = 0; - } - } - - mem_heap_free(heap); - - DBUG_RETURN(status); + /* deny access to user without PROCESS privilege */ + if (check_global_access(thd, PROCESS_ACL)) + DBUG_RETURN(0); + + int status; + buf_page_info_t *b= + static_cast(my_malloc(PSI_INSTRUMENT_ME, + MAX_BUF_INFO_CACHED * sizeof *b, + MYF(MY_WME))); + if (!b) + DBUG_RETURN(1); + for (size_t j= 0;;) + { + memset((void*) b, 0, MAX_BUF_INFO_CACHED * sizeof *b); + mysql_mutex_lock(&buf_pool.mutex); + const size_t N= buf_pool.curr_size(); + const size_t n= std::min(N, MAX_BUF_INFO_CACHED); + for (size_t i= 0; i < n && j < N; i++, j++) + i_s_innodb_buffer_page_get_info(&buf_pool.get_nth_page(j)->page, j, + &b[i]); + + mysql_mutex_unlock(&buf_pool.mutex); + status= i_s_innodb_buffer_page_fill(thd, tables, b, n); + if (status || j >= N) + break; + } + my_free(b); + DBUG_RETURN(status); } /*******************************************************************//** @@ -4777,9 +4727,9 @@ OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name.m_name)); - OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized, true)); + 
OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized(), true)); - if (table->stat_initialized) + if (table->stat_initialized()) { OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, true)); diff -Nru mariadb-10.11.11/storage/innobase/ibuf/ibuf0ibuf.cc mariadb-10.11.13/storage/innobase/ibuf/ibuf0ibuf.cc --- mariadb-10.11.11/storage/innobase/ibuf/ibuf0ibuf.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/ibuf/ibuf0ibuf.cc 2025-05-19 16:14:25.000000000 +0000 @@ -375,7 +375,7 @@ ibuf.free_list_len = flst_get_len(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST); - ibuf.height = 1 + btr_page_get_level(root); + ibuf.height = uint8_t(1 + btr_page_get_level(root)); /* the '1 +' is the ibuf header page */ ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len); @@ -443,18 +443,11 @@ goto err_exit; } - /* At startup we intialize ibuf to have a maximum of - CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the - buffer pool size. Once ibuf struct is initialized this - value is updated with the user supplied size by calling - ibuf_max_size_update(). 
*/ - ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) - * CHANGE_BUFFER_DEFAULT_SIZE) / 100; - mysql_mutex_init(ibuf_mutex_key, &ibuf_mutex, nullptr); mysql_mutex_init(ibuf_pessimistic_insert_mutex_key, &ibuf_pessimistic_insert_mutex, nullptr); + ibuf_max_size_update(CHANGE_BUFFER_DEFAULT_SIZE); mysql_mutex_lock(&ibuf_mutex); ibuf_size_update(root); mysql_mutex_unlock(&ibuf_mutex); @@ -506,10 +499,10 @@ percentage of the buffer pool size */ { if (UNIV_UNLIKELY(!ibuf.index)) return; - ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) - * new_val) / 100; + ulint new_size = std::min( + buf_pool.curr_size() * new_val / 100, uint32_t(~0U)); mysql_mutex_lock(&ibuf_mutex); - ibuf.max_size = new_size; + ibuf.max_size = uint32_t(new_size); mysql_mutex_unlock(&ibuf_mutex); } @@ -2061,8 +2054,7 @@ } } - limit = ut_min(IBUF_MAX_N_PAGES_MERGED, - buf_pool_get_curr_size() / 4); + limit = std::min(IBUF_MAX_N_PAGES_MERGED, buf_pool.curr_size() / 4); first_page_no = ibuf_rec_get_page_no(mtr, rec); first_space_id = ibuf_rec_get_space(mtr, rec); @@ -4483,17 +4475,17 @@ return; } - const ulint size= ibuf.size; - const ulint free_list_len= ibuf.free_list_len; - const ulint seg_size= ibuf.seg_size; + const uint32_t size= ibuf.size; + const uint32_t free_list_len= ibuf.free_list_len; + const uint32_t seg_size= ibuf.seg_size; mysql_mutex_unlock(&ibuf_mutex); fprintf(file, "-------------\n" "INSERT BUFFER\n" "-------------\n" - "size " ULINTPF ", free list len " ULINTPF "," - " seg size " ULINTPF ", " ULINTPF " merges\n", + "size %" PRIu32 ", free list len %" PRIu32 "," + " seg size %" PRIu32 ", " ULINTPF " merges\n", size, free_list_len, seg_size, ulint{ibuf.n_merges}); ibuf_print_ops("merged operations:\n", ibuf.n_merged_ops, file); ibuf_print_ops("discarded operations:\n", ibuf.n_discarded_ops, file); diff -Nru mariadb-10.11.11/storage/innobase/include/btr0sea.h mariadb-10.11.13/storage/innobase/include/btr0sea.h --- 
mariadb-10.11.11/storage/innobase/include/btr0sea.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/btr0sea.h 2025-05-19 16:14:25.000000000 +0000 @@ -39,12 +39,16 @@ #define btr_search_sys_create() btr_search_sys.create() #define btr_search_sys_free() btr_search_sys.free() -/** Disable the adaptive hash search system and empty the index. */ -void btr_search_disable(); +/** Lazily free detached metadata when removing the last reference. */ +ATTRIBUTE_COLD void btr_search_lazy_free(dict_index_t *index); + +/** Disable the adaptive hash search system and empty the index. +@return whether the adaptive hash index was enabled */ +ATTRIBUTE_COLD bool btr_search_disable(); /** Enable the adaptive hash search system. @param resize whether buf_pool_t::resize() is the caller */ -void btr_search_enable(bool resize= false); +ATTRIBUTE_COLD void btr_search_enable(bool resize= false); /*********************************************************************//** Updates the search info. 
*/ diff -Nru mariadb-10.11.11/storage/innobase/include/buf0buddy.h mariadb-10.11.13/storage/innobase/include/buf0buddy.h --- mariadb-10.11.11/storage/innobase/include/buf0buddy.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0buddy.h 2025-05-19 16:14:25.000000000 +0000 @@ -24,17 +24,13 @@ Created December 2006 by Marko Makela *******************************************************/ -#ifndef buf0buddy_h -#define buf0buddy_h - +#pragma once #include "buf0types.h" /** @param[in] block size in bytes @return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ -inline -ulint -buf_buddy_get_slot(ulint size) +inline ulint buf_buddy_get_slot(ulint size) noexcept { ulint i; ulint s; @@ -53,13 +49,13 @@ @param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES @param lru assigned to true if buf_pool.mutex was temporarily released @return allocated block, never NULL */ -byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc)); +byte *buf_buddy_alloc_low(ulint i, bool *lru) noexcept MY_ATTRIBUTE((malloc)); /** Allocate a ROW_FORMAT=COMPRESSED block. @param size compressed page size in bytes @param lru assigned to true if buf_pool.mutex was temporarily released @return allocated block, never NULL */ -inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr) +inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr) noexcept { return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru); } @@ -68,24 +64,26 @@ @param[in] buf block to be freed, must not be pointed to by the buffer pool @param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */ -void buf_buddy_free_low(void* buf, ulint i); +void buf_buddy_free_low(void* buf, ulint i) noexcept; /** Deallocate a block. 
@param[in] buf block to be freed, must not be pointed to by the buffer pool @param[in] size block size in bytes */ -inline void buf_buddy_free(void* buf, ulint size) +inline void buf_buddy_free(void* buf, ulint size) noexcept { - buf_buddy_free_low(buf, buf_buddy_get_slot(size)); + buf_buddy_free_low(buf, buf_buddy_get_slot(size)); } -/** Try to reallocate a block. -@param[in] buf block to be reallocated, must be pointed -to by the buffer pool -@param[in] size block size, up to srv_page_size -@retval false if failed because of no free blocks. */ -bool buf_buddy_realloc(void* buf, ulint size); - -/** Combine all pairs of free buddies. */ -void buf_buddy_condense_free(); -#endif /* buf0buddy_h */ +ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Reallocate a ROW_FORMAT=COMPRESSED page frame during buf_pool_t::shrink(). +@param bpage page descriptor covering a ROW_FORMAT=COMPRESSED page +@param block uncompressed block for storage +@return block +@retval nullptr if the block was consumed */ +ATTRIBUTE_COLD +buf_block_t *buf_buddy_shrink(buf_page_t *bpage, buf_block_t *block) noexcept; + +/** Combine all pairs of free buddies. +@param size the target innodb_buffer_pool_size */ +ATTRIBUTE_COLD void buf_buddy_condense_free(size_t size) noexcept; diff -Nru mariadb-10.11.11/storage/innobase/include/buf0buf.h mariadb-10.11.13/storage/innobase/include/buf0buf.h --- mariadb-10.11.11/storage/innobase/include/buf0buf.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0buf.h 2025-05-19 16:14:25.000000000 +0000 @@ -35,13 +35,16 @@ #include "assume_aligned.h" #include "buf0types.h" #ifndef UNIV_INNOCHECKSUM -#include "ut0byte.h" #include "page0types.h" #include "log0log.h" #include "srv0srv.h" #include "transactional_lock_guard.h" #include +/** The allocation granularity of innodb_buffer_pool_size */ +constexpr size_t innodb_buffer_pool_extent_size= + sizeof(size_t) < 8 ? 
2 << 20 : 8 << 20; + /** @name Modes for buf_page_get_gen */ /* @{ */ #define BUF_GET 10 /*!< get always */ @@ -71,7 +74,7 @@ ulint pool_size; /*!< Buffer Pool size in pages */ ulint lru_len; /*!< Length of buf_pool.LRU */ ulint old_lru_len; /*!< buf_pool.LRU_old_len */ - ulint free_list_len; /*!< Length of buf_pool.free list */ + ulint free_list_len; /*!< free + lazy_allocate_size() */ ulint flush_list_len; /*!< Length of buf_pool.flush_list */ ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages pending decompress */ @@ -142,10 +145,8 @@ const page_id_t page_id); #ifndef UNIV_INNOCHECKSUM -# define buf_pool_get_curr_size() srv_buf_pool_curr_size # define buf_block_free(block) buf_pool.free_block(block) - -#define buf_page_get(ID, SIZE, LA, MTR) \ +# define buf_page_get(ID, SIZE, LA, MTR) \ buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR) /** Try to buffer-fix a page. @@ -395,9 +396,6 @@ buf_print_io( /*=========*/ FILE* file); /*!< in: file where to print */ -/** Collect buffer pool metadata. -@param[out] pool_info buffer pool metadata */ -void buf_stats_get_pool_info(buf_pool_info_t *pool_info) noexcept; /** Refresh the statistics used to print per-second averages. */ void buf_refresh_io_stats() noexcept; @@ -427,12 +425,6 @@ ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read) noexcept; -/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, -if needed. -@param[in] size size in bytes -@return aligned size */ -ulint buf_pool_size_align(ulint size) noexcept; - /** Verify that post encryption checksum match with the calculated checksum. This function should be called only if tablespace contains crypt data metadata. 
@param page page frame @@ -549,7 +541,7 @@ /** buf_pool.LRU status mask in state() */ static constexpr uint32_t LRU_MASK= 7U << 29; - /** lock covering the contents of frame */ + /** lock covering the contents of frame() */ block_lock lock; /** pointer to aligned, uncompressed page frame of innodb_page_size */ byte *frame; @@ -559,8 +551,6 @@ !frame && !zip.data means an active buf_pool.watch */ page_zip_des_t zip; #ifdef UNIV_DEBUG - /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */ - bool in_zip_hash; /** whether this->LRU is in buf_pool.LRU (in_file()); protected by buf_pool.mutex */ bool in_LRU_list; @@ -574,7 +564,7 @@ /** list member in one of the lists of buf_pool; protected by buf_pool.mutex or buf_pool.flush_list_mutex - state() == NOT_USED: buf_pool.free or buf_pool.withdraw + state() == NOT_USED: buf_pool.free in_file() && oldest_modification(): buf_pool.flush_list (protected by buf_pool.flush_list_mutex) @@ -615,7 +605,7 @@ lock() /* not copied */, frame(b.frame), zip(b.zip), #ifdef UNIV_DEBUG - in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list), + in_LRU_list(b.in_LRU_list), in_page_hash(b.in_page_hash), in_free_list(b.in_free_list), #endif /* UNIV_DEBUG */ list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock), @@ -632,7 +622,6 @@ id_= id; zip.fix= state; oldest_modification_= 0; - ut_d(in_zip_hash= false); ut_d(in_free_list= false); ut_d(in_LRU_list= false); ut_d(in_page_hash= false); @@ -891,10 +880,6 @@ buf_pool.page_hash can point to buf_page_t or buf_block_t */ #ifdef UNIV_DEBUG - /** whether page.list is in buf_pool.withdraw - ((state() == NOT_USED)) and the buffer pool is being shrunk; - protected by buf_pool.mutex */ - bool in_withdraw_list; /** whether unzip_LRU is in buf_pool.unzip_LRU (in_file() && frame && zip.data); protected by buf_pool.mutex */ @@ -1022,15 +1007,10 @@ @param state initial state() */ void initialise(const page_id_t page_id, ulint zip_size, uint32_t state) noexcept; -}; 
-/**********************************************************************//** -Compute the hash fold value for blocks in buf_pool.zip_hash. */ -/* @{ */ -#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift) -#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame) -#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) -/* @} */ + /** Calculate the page frame address */ + IF_DBUG(,inline) byte *frame_address() const noexcept; +}; /** A "Hazard Pointer" class used to iterate over buf_pool.LRU or buf_pool.flush_list. A hazard pointer is a buf_page_t pointer @@ -1198,59 +1178,66 @@ /** The buffer pool */ class buf_pool_t { - /** A chunk of buffers */ - struct chunk_t - { - /** number of elements in blocks[] */ - size_t size; - /** memory allocated for the page frames */ - unsigned char *mem; - /** descriptor of mem */ - ut_new_pfx_t mem_pfx; - /** array of buffer control blocks */ - buf_block_t *blocks; - - /** Map of first page frame address to chunks[] */ - using map= std::map, - ut_allocator>>; - /** Chunk map that may be under construction by buf_resize_thread() */ - static map *map_reg; - /** Current chunk map for lookup only */ - static map *map_ref; - - /** @return the memory size bytes. */ - size_t mem_size() const noexcept { return mem_pfx.m_size; } - - /** Register the chunk */ - void reg() noexcept - { map_reg->emplace(map::value_type(blocks->page.frame, this)); } - - /** Allocate a chunk of buffer frames. 
- @param bytes requested size - @return whether the allocation succeeded */ - inline bool create(size_t bytes) noexcept; + /** arrays of buf_block_t followed by page frames; + aliged to and repeating every innodb_buffer_pool_extent_size; + each extent comprises pages_in_extent[] blocks */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) char *memory; + /** the allocation of the above memory, possibly including some + alignment loss at the beginning */ + char *memory_unaligned; + /** the virtual address range size of memory_unaligned */ + size_t size_unaligned; +#ifdef UNIV_PFS_MEMORY + /** the "owner thread" of the buffer pool allocation */ + PSI_thread *owner; +#endif + /** initialized number of block descriptors */ + size_t n_blocks; + /** number of blocks that need to be freed in shrink() */ + size_t n_blocks_to_withdraw; + /** first block to withdraw in shrink() */ + const buf_page_t *first_to_withdraw; -#ifdef UNIV_DEBUG - /** Find a block that points to a ROW_FORMAT=COMPRESSED page - @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame - @return the block - @retval nullptr if not found */ - const buf_block_t *contains_zip(const void *data) const noexcept - { - const buf_block_t *block= blocks; - for (auto i= size; i--; block++) - if (block->page.zip.data == data) - return block; - return nullptr; - } + /** amount of memory allocated to the buffer pool and descriptors; + protected by mutex */ + Atomic_relaxed size_in_bytes; - /** Check that all blocks are in a replaceable state. 
- @return address of a non-free block - @retval nullptr if all freed */ - inline const buf_block_t *not_freed() const noexcept; -#endif /* UNIV_DEBUG */ - }; public: + /** The requested innodb_buffer_pool_size */ + size_t size_in_bytes_requested; +#if defined __linux__ || !defined DBUG_OFF + /** The minimum allowed innodb_buffer_pool_size in garbage_collect() */ + size_t size_in_bytes_auto_min; +#endif + /** The maximum allowed innodb_buffer_pool_size */ + size_t size_in_bytes_max; + + /** @return the current size of the buffer pool, in bytes */ + size_t curr_pool_size() const noexcept { return size_in_bytes; } + + /** @return the current size of the buffer pool, in pages */ + TPOOL_SUPPRESS_TSAN size_t curr_size() const noexcept { return n_blocks; } + /** @return the maximum usable size of the buffer pool, in pages */ + TPOOL_SUPPRESS_TSAN size_t usable_size() const noexcept + { return n_blocks - n_blocks_to_withdraw - UT_LIST_GET_LEN(withdrawn); } + + /** Determine the used size of the buffer pool in bytes. + @param n_blocks size of the buffer pool in blocks + @return the size needed for n_blocks in bytes, for innodb_page_size */ + static size_t blocks_in_bytes(size_t n_blocks) noexcept; + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) + /** Enable buffers to be dumped to core files. + + A convenience function, not called anyhwere directly however + it is left available for gdb or any debugger to call + in the event that you want all of the memory to be dumped + to a core file. + + @return number of errors found in madvise() calls */ + static int madvise_do_dump() noexcept; +#endif + /** Hash cell chain in page_hash_table */ struct hash_chain { @@ -1258,106 +1245,58 @@ buf_page_t *first; }; private: - /** Withdraw blocks from the buffer pool until meeting withdraw_target. - @return whether retry is needed */ - inline bool withdraw_blocks() noexcept; - - /** Determine if a pointer belongs to a buf_block_t. 
It can be a pointer to - the buf_block_t itself or a member of it. - @param ptr a pointer that will not be dereferenced - @return whether the ptr belongs to a buf_block_t struct */ - bool is_block_field(const void *ptr) const noexcept - { - const chunk_t *chunk= chunks; - const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new); - - /* TODO: protect chunks with a mutex (the older pointer will - currently remain during resize()) */ - for (; chunk < echunk; chunk++) - if (ptr >= reinterpret_cast(chunk->blocks) && - ptr < reinterpret_cast(chunk->blocks + chunk->size)) - return true; - return false; - } - - /** Try to reallocate a control block. - @param block control block to reallocate - @return whether the reallocation succeeded */ - inline bool realloc(buf_block_t *block) noexcept; + /** Determine the number of blocks in a buffer pool of a particular size. + @param size_in_bytes innodb_buffer_pool_size in bytes + @return number of buffer pool pages */ + static size_t get_n_blocks(size_t size_in_bytes) noexcept; + + /** The outcome of shrink() */ + enum shrink_status{SHRINK_DONE= -1, SHRINK_IN_PROGRESS= 0, SHRINK_ABORT}; + + /** Attempt to shrink the buffer pool. + @param size requested innodb_buffer_pool_size in bytes + @retval whether the shrinking was completed */ + ATTRIBUTE_COLD shrink_status shrink(size_t size) noexcept; + + /** Finish shrinking the buffer pool. + @param size the new innodb_buffer_pool_size in bytes + @param reduced how much the innodb_buffer_pool_size was reduced */ + inline void shrunk(size_t size, size_t reduced) noexcept; public: - bool is_initialised() const noexcept { return chunks != nullptr; } + bool is_initialised() const noexcept { return memory != nullptr; } /** Create the buffer pool. @return whether the creation failed */ - bool create(); + bool create() noexcept; /** Clean up after successful create() */ void close() noexcept; - /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. 
*/ - inline void resize(); + /** Resize the buffer pool. + @param size requested innodb_buffer_pool_size in bytes + @param thd current connnection */ + ATTRIBUTE_COLD void resize(size_t size, THD *thd) noexcept; -#ifdef __linux__ /** Collect garbage (release pages from the LRU list) */ - inline void garbage_collect(); -#endif - - /** @return whether resize() is in progress */ - bool resize_in_progress() const noexcept - { - return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed)); - } - - /** @return the current size in blocks */ - size_t get_n_pages() const noexcept - { - ut_ad(is_initialised()); - size_t size= 0; - for (auto j= ut_min(n_chunks_new, n_chunks); j--; ) - size+= chunks[j].size; - return size; - } + inline void garbage_collect() noexcept; - /** Determine whether a frame is intended to be withdrawn during resize(). + /** Determine whether a frame needs to be withdrawn during resize(). @param ptr pointer within a buf_page_t::frame + @param size size_in_bytes_requested @return whether the frame will be withdrawn */ - bool will_be_withdrawn(const byte *ptr) const noexcept + bool will_be_withdrawn(const byte *ptr, size_t size) const noexcept { - ut_ad(n_chunks_new < n_chunks); -#ifdef SAFE_MUTEX - if (resize_in_progress()) - mysql_mutex_assert_owner(&mutex); -#endif /* SAFE_MUTEX */ - - for (const chunk_t *chunk= chunks + n_chunks_new, - * const echunk= chunks + n_chunks; - chunk != echunk; chunk++) - if (ptr >= chunk->blocks->page.frame && - ptr < (chunk->blocks + chunk->size - 1)->page.frame + srv_page_size) - return true; - return false; + const char *p= reinterpret_cast(ptr); + ut_ad(!p || p >= memory); + ut_ad(p < memory + size_in_bytes_max); + return p >= memory + size; } - /** Determine whether a block is intended to be withdrawn during resize(). + /** Withdraw a block if needed in case resize() is shrinking. 
@param bpage buffer pool block - @return whether the frame will be withdrawn */ - bool will_be_withdrawn(const buf_page_t &bpage) const noexcept - { - ut_ad(n_chunks_new < n_chunks); -#ifdef SAFE_MUTEX - if (resize_in_progress()) - mysql_mutex_assert_owner(&mutex); -#endif /* SAFE_MUTEX */ - - for (const chunk_t *chunk= chunks + n_chunks_new, - * const echunk= chunks + n_chunks; - chunk != echunk; chunk++) - if (&bpage >= &chunk->blocks->page && - &bpage < &chunk->blocks[chunk->size].page) - return true; - return false; - } + @return whether the block was withdrawn */ + ATTRIBUTE_COLD bool withdraw(buf_page_t &bpage) noexcept; /** Release and evict a corrupted page. @param bpage x-latched page that was found corrupted @@ -1371,31 +1310,18 @@ #ifdef UNIV_DEBUG /** Find a block that points to a ROW_FORMAT=COMPRESSED page @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame + @param shift number of least significant address bits to ignore @return the block @retval nullptr if not found */ - const buf_block_t *contains_zip(const void *data) const noexcept - { - mysql_mutex_assert_owner(&mutex); - for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks; - chunk != end; chunk++) - if (const buf_block_t *block= chunk->contains_zip(data)) - return block; - return nullptr; - } - + const buf_block_t *contains_zip(const void *data, size_t shift= 0) + const noexcept; /** Assert that all buffer pool pages are in a replaceable state */ void assert_all_freed() noexcept; #endif /* UNIV_DEBUG */ #ifdef BTR_CUR_HASH_ADAPT /** Clear the adaptive hash index on all pages in the buffer pool. */ - inline void clear_hash_index() noexcept; - - /** Get a buffer block from an adaptive hash index pointer. - This function does not return if the block is not identified. 
- @param ptr pointer to within a page frame - @return pointer to block, never NULL */ - inline buf_block_t *block_from_ahi(const byte *ptr) const noexcept; + void clear_hash_index() noexcept; #endif /* BTR_CUR_HASH_ADAPT */ /** @@ -1418,13 +1344,27 @@ return empty_lsn; } - /** Determine if a buffer block was created by chunk_t::create(). - @param block block descriptor (not dereferenced) - @return whether block has been created by chunk_t::create() */ - bool is_uncompressed(const buf_block_t *block) const noexcept + /** Look up the block descriptor for a page frame address. + @param ptr address within a valid page frame + @return the corresponding block descriptor */ + static buf_block_t *block_from(const void *ptr) noexcept; + + /** Access a block while holding the buffer pool mutex. + @param pos position between 0 and get_n_pages() + @return the block descriptor */ + buf_block_t *get_nth_page(size_t pos) const noexcept; + +#ifdef UNIV_DEBUG + /** Determine if an object is within the curr_pool_size() + and associated with an uncompressed page. + @param ptr memory object (not dereferenced) + @return whether the object is valid in the current buffer pool */ + bool is_uncompressed_current(const void *ptr) const noexcept { - return is_block_field(reinterpret_cast(block)); + const ptrdiff_t d= static_cast(ptr) - memory; + return d >= 0 && size_t(d) < curr_pool_size(); } +#endif public: /** page_fix() mode of operation */ @@ -1456,6 +1396,16 @@ buf_block_t *page_fix(const page_id_t id) noexcept { return page_fix(id, nullptr, FIX_WAIT_READ); } + /** Validate a block descriptor. 
+ @param b block descriptor that may be invalid after shrink() + @param latch page_hash latch for id + @param id page identifier + @return b->page.fix() if b->page.id() == id + @retval 0 if b is invalid */ + TRANSACTIONAL_TARGET + uint32_t page_guess(buf_block_t *b, page_hash_latch &latch, + const page_id_t id) noexcept; + /** Decompress a page and relocate the block descriptor @param b buffer-fixed compressed-only ROW_FORMAT=COMPRESSED page @param chain hash table chain for b->id().fold() @@ -1477,7 +1427,6 @@ buf_page_t *bpage= page_hash.get(page_id, chain); if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)]) { - ut_ad(!bpage->in_zip_hash); ut_ad(!bpage->zip.data); if (!allow_watch) bpage= nullptr; @@ -1498,7 +1447,6 @@ ut_ad(bpage.in_file()); if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)]) return false; - ut_ad(!bpage.in_zip_hash); ut_ad(!bpage.zip.data); return true; } @@ -1539,23 +1487,30 @@ inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain) noexcept; /** @return whether less than 1/4 of the buffer pool is available */ - TPOOL_SUPPRESS_TSAN - bool running_out() const noexcept - { - return !recv_recovery_is_on() && - UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < - (n_chunks_new * chunks->size) / 4; - } + bool running_out() const noexcept; /** @return whether the buffer pool is running low */ bool need_LRU_eviction() const noexcept; - /** @return whether the buffer pool is shrinking */ - inline bool is_shrinking() const noexcept + /** @return number of blocks resize() needs to evict from the buffer pool */ + size_t is_shrinking() const noexcept + { + mysql_mutex_assert_owner(&mutex); + return n_blocks_to_withdraw + UT_LIST_GET_LEN(withdrawn); + } + + /** @return number of blocks in resize() waiting to be withdrawn */ + size_t to_withdraw() const noexcept { - return n_chunks_new < n_chunks; + mysql_mutex_assert_owner(&mutex); + return n_blocks_to_withdraw; } + /** @return the shrinking size of the buffer pool, in bytes + 
@retval 0 if resize() is not shrinking the buffer pool */ + size_t shrinking_size() const noexcept + { return is_shrinking() ? size_in_bytes_requested : 0; } + #ifdef UNIV_DEBUG /** Validate the buffer pool. */ void validate() noexcept; @@ -1572,7 +1527,6 @@ mysql_mutex_assert_owner(&mutex); ut_ad(bpage->in_LRU_list); ut_ad(bpage->in_page_hash); - ut_ad(!bpage->in_zip_hash); ut_ad(bpage->in_file()); lru_hp.adjust(bpage); lru_scan_itr.adjust(bpage); @@ -1592,26 +1546,8 @@ /** @name General fields */ /* @{ */ - ulint curr_pool_size; /*!< Current pool size in bytes */ ulint LRU_old_ratio; /*!< Reserve this much of the buffer pool for "old" blocks */ -#ifdef UNIV_DEBUG - ulint buddy_n_frames; /*!< Number of frames allocated from - the buffer pool to the buddy system */ - ulint mutex_exit_forbidden; /*!< Forbid release mutex */ -#endif - ut_allocator allocator; /*!< Allocator used for - allocating memory for the the "chunks" - member. */ - ulint n_chunks; /*!< number of buffer pool chunks */ - ulint n_chunks_new; /*!< new number of buffer pool chunks. - both n_chunks{,new} are protected under - mutex */ - chunk_t* chunks; /*!< buffer pool chunks */ - chunk_t* chunks_old; /*!< old buffer pool chunks to be freed - after resizing buffer pool */ - /** current pool size in pages */ - Atomic_counter curr_size; /** read-ahead request size in pages */ Atomic_counter read_ahead_area; @@ -1723,12 +1659,6 @@ /** Look up a page in a hash bucket chain. */ inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const noexcept; - - /** Exclusively aqcuire all latches */ - inline void write_lock_all() noexcept; - - /** Release all latches */ - inline void write_unlock_all() noexcept; }; /** Buffer pool mutex */ @@ -1745,9 +1675,6 @@ indexed by page_id_t. Protected by both mutex and page_hash.lock_get(). 
*/ page_hash_table page_hash; - /** map of block->frame to buf_block_t blocks that belong - to buf_buddy_alloc(); protected by buf_pool.mutex */ - hash_table_t zip_hash; /** number of pending unzip() */ Atomic_counter n_pend_unzip; @@ -1878,30 +1805,29 @@ Set whenever the free list grows, along with a broadcast of done_free. Protected by buf_pool.mutex. */ Atomic_relaxed try_LRU_scan; - /** Whether we have warned to be running out of buffer pool */ - std::atomic_flag LRU_warned; /* @} */ /** @name LRU replacement algorithm fields */ /* @{ */ - UT_LIST_BASE_NODE_T(buf_page_t) free; - /*!< base node of the free - block list */ +private: + /** Whether we have warned to be running out of buffer pool; + only modified by buf_flush_page_cleaner(): + set while holding mutex, cleared while holding flush_list_mutex */ + Atomic_relaxed LRU_warned; + + /** withdrawn blocks during resize() */ + UT_LIST_BASE_NODE_T(buf_page_t) withdrawn; + +public: + /** list of blocks available for allocate() */ + UT_LIST_BASE_NODE_T(buf_page_t) free; + /** broadcast each time when the free list grows or try_LRU_scan is set; protected by mutex */ pthread_cond_t done_free; - UT_LIST_BASE_NODE_T(buf_page_t) withdraw; - /*!< base node of the withdraw - block list. It is only used during - shrinking buffer pool size, not to - reuse the blocks will be removed */ - - ulint withdraw_target;/*!< target length of withdraw - block list, when withdrawing */ - /** "hazard pointer" used during scan of LRU while doing LRU list batch. Protected by buf_pool_t::mutex. */ LRUHp lru_hp; @@ -1942,10 +1868,22 @@ /** Sentinels to detect if pages are read into the buffer pool while a delete-buffering operation is pending. Protected by mutex. */ buf_page_t watch[innodb_purge_threads_MAX + 1]; + + /** Clear LRU_warned */ + void LRU_warned_clear() noexcept + { + mysql_mutex_assert_owner(&flush_list_mutex); + LRU_warned= false; + } + /** Reserve a buffer. 
*/ buf_tmp_buffer_t *io_buf_reserve(bool wait_for_reads) noexcept { return io_buf.reserve(wait_for_reads); } + /** Try to allocate a block. + @return a buffer block + @retval nullptr if no blocks are available */ + buf_block_t *allocate() noexcept; /** Remove a block from flush_list. @param bpage buffer pool page */ void delete_from_flush_list(buf_page_t *bpage) noexcept; @@ -1968,6 +1906,13 @@ /** Issue a warning that we could not free up buffer pool pages. */ ATTRIBUTE_COLD void LRU_warn() noexcept; + /** Print buffer pool flush state information. */ + ATTRIBUTE_COLD void print_flush_info() const noexcept; + + /** Collect buffer pool metadata. + @param pool_info buffer pool metadata */ + void get_info(buf_pool_info_t *pool_info) noexcept; + private: /** Temporary memory for page_compressed and encrypted I/O */ struct io_buf_t @@ -1984,9 +1929,6 @@ /** Reserve a buffer */ buf_tmp_buffer_t *reserve(bool wait_for_reads) noexcept; } io_buf; - - /** whether resize() is in the critical path */ - std::atomic resizing; }; /** The InnoDB buffer pool */ @@ -2135,24 +2077,6 @@ this->old= old; } -#ifdef UNIV_DEBUG -/** Forbid the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_forbid() do { \ - mysql_mutex_assert_owner(&buf_pool.mutex); \ - buf_pool.mutex_exit_forbidden++; \ -} while (0) -/** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow() do { \ - mysql_mutex_assert_owner(&buf_pool.mutex); \ - ut_ad(buf_pool.mutex_exit_forbidden--); \ -} while (0) -#else -/** Forbid the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_forbid() ((void) 0) -/** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow() ((void) 0) -#endif - /********************************************************************** Let us list the consistency conditions for different control block states. 
diff -Nru mariadb-10.11.11/storage/innobase/include/buf0buf.inl mariadb-10.11.13/storage/innobase/include/buf0buf.inl --- mariadb-10.11.11/storage/innobase/include/buf0buf.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0buf.inl 2025-05-19 16:14:25.000000000 +0000 @@ -37,7 +37,7 @@ /* FIXME: bpage->freed_page_clock is 31 bits */ return((buf_pool.freed_page_clock & ((1UL << 31) - 1)) < (bpage->freed_page_clock - + (buf_pool.curr_size + + (buf_pool.curr_size() * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio) / (BUF_LRU_OLD_RATIO_DIV * 4)))); } diff -Nru mariadb-10.11.11/storage/innobase/include/buf0dblwr.h mariadb-10.11.13/storage/innobase/include/buf0dblwr.h --- mariadb-10.11.11/storage/innobase/include/buf0dblwr.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0dblwr.h 2025-05-19 16:14:25.000000000 +0000 @@ -159,6 +159,9 @@ my_cond_wait(&cond, &mutex.m_mutex); mysql_mutex_unlock(&mutex); } + + /** Print double write state information. */ + ATTRIBUTE_COLD void print_info() const noexcept; }; /** The doublewrite buffer */ diff -Nru mariadb-10.11.11/storage/innobase/include/buf0lru.h mariadb-10.11.13/storage/innobase/include/buf0lru.h --- mariadb-10.11.11/storage/innobase/include/buf0lru.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/buf0lru.h 2025-05-19 16:14:25.000000000 +0000 @@ -55,10 +55,6 @@ @return true if found and freed */ bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED); -/** @return a buffer block from the buf_pool.free list -@retval NULL if the free list is empty */ -buf_block_t* buf_LRU_get_free_only(); - /** Get a block from the buf_pool.free list. If the list is empty, blocks will be moved from the end of buf_pool.LRU to buf_pool.free. 
diff -Nru mariadb-10.11.11/storage/innobase/include/dict0dict.h mariadb-10.11.13/storage/innobase/include/dict0dict.h --- mariadb-10.11.11/storage/innobase/include/dict0dict.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0dict.h 2025-05-19 16:14:25.000000000 +0000 @@ -146,21 +146,21 @@ MDL_ticket **mdl= nullptr) MY_ATTRIBUTE((warn_unused_result)); -/** Decrement the count of open handles */ -void dict_table_close(dict_table_t *table); - -/** Decrements the count of open handles of a table. -@param[in,out] table table -@param[in] dict_locked whether dict_sys.latch is being held -@param[in] thd thread to release MDL -@param[in] mdl metadata lock or NULL if the thread is a - foreground one. */ -void -dict_table_close( - dict_table_t* table, - bool dict_locked, - THD* thd = NULL, - MDL_ticket* mdl = NULL); +/** Release a metadata lock. +@param thd connection that holds mdl +@param mdl metadata lock, or nullptr */ +void mdl_release(THD *thd, MDL_ticket *mdl) noexcept; + +/** Release a table reference and a metadata lock. +@param table referenced table +@param thd connection that holds mdl +@param mdl metadata lock, or nullptr */ +inline void dict_table_close(dict_table_t* table, THD *thd, MDL_ticket *mdl) + noexcept +{ + table->release(); + mdl_release(thd, mdl); +} /*********************************************************************//** Gets the minimum number of bytes per character. 
@@ -674,7 +674,7 @@ @return estimated number of rows */ inline uint64_t dict_table_get_n_rows(const dict_table_t *table) { - ut_ad(table->stat_initialized); + ut_ad(table->stat_initialized()); return table->stat_n_rows; } @@ -1657,6 +1657,27 @@ dict_table_have_virtual_index( dict_table_t* table); +/** Helper for opening the InnoDB persistent statistics tables */ +class dict_stats final +{ + MDL_context *mdl_context= nullptr; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t *table_stats= nullptr, *index_stats= nullptr; + +public: + dict_stats()= default; + + /** Open the statistics tables. + @return whether the operation failed */ + bool open(THD *thd) noexcept; + + /** Close the statistics tables after !open_tables(thd). */ + void close() noexcept; + + dict_table_t *table() const noexcept { return table_stats; } + dict_table_t *index() const noexcept { return index_stats; } +}; + #include "dict0dict.inl" #endif diff -Nru mariadb-10.11.11/storage/innobase/include/dict0dict.inl mariadb-10.11.13/storage/innobase/include/dict0dict.inl --- mariadb-10.11.11/storage/innobase/include/dict0dict.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0dict.inl 2025-05-19 16:14:25.000000000 +0000 @@ -1076,8 +1076,8 @@ /** Acquire the table handle. */ inline void dict_table_t::acquire() { - ut_ad(dict_sys.frozen()); - n_ref_count++; + ut_d(const auto old=) n_ref_count++; + ut_ad(old || dict_sys.frozen()); } /** Release the table handle. diff -Nru mariadb-10.11.11/storage/innobase/include/dict0mem.h mariadb-10.11.13/storage/innobase/include/dict0mem.h --- mariadb-10.11.11/storage/innobase/include/dict0mem.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0mem.h 2025-05-19 16:14:25.000000000 +0000 @@ -1106,15 +1106,12 @@ is indexed from 0 to n_uniq-1); This is used when innodb_stats_method is "nulls_ignored". 
*/ - ulint stat_index_size; + uint32_t stat_index_size; /*!< approximate index size in database pages */ - ulint stat_n_leaf_pages; + uint32_t stat_n_leaf_pages; /*!< approximate number of leaf pages in the index tree */ - bool stats_error_printed; - /*!< has persistent statistics error printed - for this index ? */ /* @} */ /** Statistics for defragmentation, these numbers are estimations and could be very inaccurate at certain times, e.g. right after restart, @@ -2358,63 +2355,32 @@ /** Statistics for query optimization. Mostly protected by dict_sys.latch and stats_mutex_lock(). @{ */ - /** TRUE if statistics have been calculated the first time after - database startup or table creation. */ - unsigned stat_initialized:1; - /** Timestamp of last recalc of the stats. */ time_t stats_last_recalc; - /** The two bits below are set in the 'stat_persistent' member. They - have the following meaning: - 1. _ON=0, _OFF=0, no explicit persistent stats setting for this table, - the value of the global srv_stats_persistent is used to determine - whether the table has persistent stats enabled or not - 2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this - table, regardless of the value of the global srv_stats_persistent - 3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this - table, regardless of the value of the global srv_stats_persistent - 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */ - #define DICT_STATS_PERSISTENT_ON (1 << 1) - #define DICT_STATS_PERSISTENT_OFF (1 << 2) - - /** Indicates whether the table uses persistent stats or not. See - DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */ - ib_uint32_t stat_persistent; - - /** The two bits below are set in the 'stats_auto_recalc' member. They - have the following meaning: - 1. 
_ON=0, _OFF=0, no explicit auto recalc setting for this table, the - value of the global srv_stats_persistent_auto_recalc is used to - determine whether the table has auto recalc enabled or not - 2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table, - regardless of the value of the global srv_stats_persistent_auto_recalc - 3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table, - regardless of the value of the global srv_stats_persistent_auto_recalc - 4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */ - #define DICT_STATS_AUTO_RECALC_ON (1 << 1) - #define DICT_STATS_AUTO_RECALC_OFF (1 << 2) - - /** Indicates whether the table uses automatic recalc for persistent - stats or not. See DICT_STATS_AUTO_RECALC_ON and - DICT_STATS_AUTO_RECALC_OFF. */ - ib_uint32_t stats_auto_recalc; - - /** The number of pages to sample for this table during persistent - stats estimation. If this is 0, then the value of the global - srv_stats_persistent_sample_pages will be used instead. */ - ulint stats_sample_pages; + static constexpr uint32_t STATS_INITIALIZED= 1U; + static constexpr uint32_t STATS_PERSISTENT_ON= 1U << 1; + static constexpr uint32_t STATS_PERSISTENT_OFF= 1U << 2; + static constexpr uint32_t STATS_AUTO_RECALC_ON= 1U << 3; + static constexpr uint32_t STATS_AUTO_RECALC_OFF= 1U << 4; + + /** flags for index cardinality statistics */ + Atomic_relaxed stat; + /** Approximate clustered index size in database pages. */ + uint32_t stat_clustered_index_size; + /** Approximate size of other indexes in database pages. */ + uint32_t stat_sum_of_other_index_sizes; + + + /** The number of pages to sample for this table during persistent + stats estimation. If this is 0, then the value of the global + srv_stats_persistent_sample_pages will be used instead. */ + uint32_t stats_sample_pages; /** Approximate number of rows in the table. We periodically calculate new estimates. 
*/ ib_uint64_t stat_n_rows; - /** Approximate clustered index size in database pages. */ - ulint stat_clustered_index_size; - - /** Approximate size of other indexes in database pages. */ - ulint stat_sum_of_other_index_sizes; - /** How many rows are modified since last stats recalc. When a row is inserted, updated, or deleted, we add 1 to this number; we calculate new estimates for the table and the indexes if the table has changed @@ -2424,7 +2390,7 @@ ib_uint64_t stat_modified_counter; bool stats_error_printed; - /*!< Has persistent stats error beein + /*!< Has persistent stats error been already printed for this table ? */ /* @} */ @@ -2551,6 +2517,35 @@ /** @return the index for that starts with a specific column */ dict_index_t *get_index(const dict_col_t &col) const; + /** @return whether the statistics are initialized */ + static bool stat_initialized(uint32_t stat) noexcept + { return stat & STATS_INITIALIZED; } + + /** @return whether STATS_PERSISTENT is enabled */ + static bool stats_is_persistent(uint32_t stat) noexcept + { + ut_ad(~(stat & (STATS_PERSISTENT_ON | STATS_PERSISTENT_OFF))); + if (stat & STATS_PERSISTENT_ON) return true; + return !(stat & STATS_PERSISTENT_OFF) && srv_stats_persistent; + } + /** @return whether STATS_AUTO_RECALC is enabled */ + static bool stats_is_auto_recalc(uint32_t stat) noexcept + { + ut_ad(stat_initialized(stat)); + ut_ad(~(stat & (STATS_AUTO_RECALC_ON | STATS_AUTO_RECALC_OFF))); + if (stat & STATS_AUTO_RECALC_ON) return true; + return !(stat & STATS_AUTO_RECALC_OFF) && srv_stats_auto_recalc; + } + + /** @return whether the statistics are initialized */ + bool stat_initialized() const noexcept { return stat_initialized(stat); } + /** @return whether STATS_PERSISTENT is enabled */ + bool stats_is_persistent() const noexcept + { return stats_is_persistent(stat); } + /** @return whether STATS_AUTO_RECALC is enabled */ + bool stats_is_auto_recalc() const noexcept + { return stats_is_auto_recalc(stat); } + /** Create 
metadata. @param name table name @param space tablespace diff -Nru mariadb-10.11.11/storage/innobase/include/dict0stats.h mariadb-10.11.13/storage/innobase/include/dict0stats.h --- mariadb-10.11.11/storage/innobase/include/dict0stats.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0stats.h 2025-05-19 16:14:25.000000000 +0000 @@ -30,84 +30,6 @@ #include "dict0types.h" #include "trx0types.h" -enum dict_stats_upd_option_t { - DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the - statistics using a precise and slow - algo and save them to the persistent - storage, if the persistent storage is - not present then emit a warning and - fall back to transient stats */ - DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics - using an imprecise quick algo - without saving the results - persistently */ - DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense) - into a table and its indexes' statistics - members. The resulting stats correspond to an - empty table. If the table is using persistent - statistics, then they are saved on disk. */ - DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats - from the persistent storage if the in-memory - structures have not been initialized yet, - otherwise do nothing */ -}; - -/*********************************************************************//** -Set the persistent statistics flag for a given table. This is set only -in the in-memory table object and is not saved on disk. It will be read -from the .frm file upon first open from MySQL after a server restart. 
*/ -UNIV_INLINE -void -dict_stats_set_persistent( -/*======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool ps_on, /*!< in: persistent stats explicitly enabled */ - ibool ps_off) /*!< in: persistent stats explicitly disabled */ - MY_ATTRIBUTE((nonnull)); - -/** @return whether persistent statistics is enabled for a given table */ -UNIV_INLINE -bool -dict_stats_is_persistent_enabled(const dict_table_t* table) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - -/*********************************************************************//** -Set the auto recalc flag for a given table (only honored for a persistent -stats enabled table). The flag is set only in the in-memory table object -and is not saved in InnoDB files. It will be read from the .frm file upon -first open from MySQL after a server restart. */ -UNIV_INLINE -void -dict_stats_auto_recalc_set( -/*=======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool auto_recalc_on, /*!< in: explicitly enabled */ - ibool auto_recalc_off); /*!< in: explicitly disabled */ - -/** @return whether auto recalc is enabled for a given table*/ -UNIV_INLINE -bool -dict_stats_auto_recalc_is_enabled(const dict_table_t* table) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - -/*********************************************************************//** -Initialize table's stats for the first time when opening a table. */ -UNIV_INLINE -void -dict_stats_init( -/*============*/ - dict_table_t* table); /*!< in/out: table */ - -/*********************************************************************//** -Deinitialize table's stats after the last close of the table. This is -used to detect "FLUSH TABLE" and refresh the stats upon next open. 
*/ -UNIV_INLINE -void -dict_stats_deinit( -/*==============*/ - dict_table_t* table) /*!< in/out: table */ - MY_ATTRIBUTE((nonnull)); - #ifdef WITH_WSREP /** Update the table modification counter and if necessary, schedule new estimates for table and index statistics to be calculated. @@ -124,19 +46,6 @@ # define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t) #endif -/*********************************************************************//** -Calculates new estimates for table and index statistics. The statistics -are used in query optimization. -@return DB_* error code or DB_SUCCESS */ -dberr_t -dict_stats_update( -/*==============*/ - dict_table_t* table, /*!< in/out: table */ - dict_stats_upd_option_t stats_upd_option); - /*!< in: whether to (re) calc - the stats or to fetch them from - the persistent storage */ - /** Execute DELETE FROM mysql.innodb_table_stats @param database_name database name @param table_name table name @@ -173,6 +82,50 @@ dict_index_t* index) /*!< in/out: index */ MY_ATTRIBUTE((nonnull)); +enum dict_stats_schema_check { + /** The InnoDB persistent statistics tables do not exist. */ + SCHEMA_NOT_EXIST= -1, + /** The schema of the InnoDB persistent statistics tables is valid. */ + SCHEMA_OK= 0, + /** The schema is invalid. */ + SCHEMA_INVALID +}; + +/** @return whether the persistent statistics storage is usable */ +dict_stats_schema_check +dict_stats_persistent_storage_check(bool dict_already_locked= false) noexcept; + +/** Save the persistent statistics of a table or an index. +@param table table whose stats to save +@param only_for_index the index ID to save statistics for (0=all) +@return DB_SUCCESS or error code */ +dberr_t dict_stats_save(dict_table_t* table, index_id_t index_id= 0); + +/** Read the stored persistent statistics of a table. */ +dberr_t dict_stats_fetch_from_ps(dict_table_t *table); + +/** +Calculate new estimates for table and index statistics. 
This function +is relatively quick and is used to calculate non-persistent statistics. +@param table table for which the non-persistent statistics are being updated +@return error code +@retval DB_SUCCESS_LOCKED REC if the table under bulk insert operation */ +dberr_t dict_stats_update_transient(dict_table_t *table) noexcept; + +/** +Calculate new estimates for table and index statistics. This function +is slower than dict_stats_update_transient(). +@param table table for which the persistent statistics are being updated +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */ +dberr_t dict_stats_update_persistent(dict_table_t *table) noexcept; + +/** +Try to calculate and save new estimates for persistent statistics. +If persistent statistics are not enabled for the table or not available, +this does nothing. */ +dberr_t dict_stats_update_persistent_try(dict_table_t *table); + /** Rename a table in InnoDB persistent stats storage. @param old_name old table name @param new_name new table name @@ -229,8 +182,6 @@ dict_stats_report_error(dict_table_t* table, bool defragment = false) MY_ATTRIBUTE((nonnull, warn_unused_result)); -#include "dict0stats.inl" - #ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS void test_dict_stats_all(); #endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ @@ -244,4 +195,8 @@ dict_stats_empty_table( dict_table_t* table, bool empty_defrag_stats); + +/** Clear the statistics for a table and save them if +persistent statistics are enabled. 
*/ +void dict_stats_empty_table_and_save(dict_table_t *table); #endif /* dict0stats_h */ diff -Nru mariadb-10.11.11/storage/innobase/include/dict0stats.inl mariadb-10.11.13/storage/innobase/include/dict0stats.inl --- mariadb-10.11.11/storage/innobase/include/dict0stats.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/dict0stats.inl 1970-01-01 00:00:00.000000000 +0000 @@ -1,219 +0,0 @@ -/***************************************************************************** - -Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. -Copyright (c) 2017, 2021, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -*****************************************************************************/ - -/**************************************************//** -@file include/dict0stats.ic -Code used for calculating and manipulating table statistics. - -Created Jan 23, 2012 Vasil Dimov -*******************************************************/ - -#include "dict0dict.h" -#include "srv0srv.h" - -/*********************************************************************//** -Set the persistent statistics flag for a given table. This is set only -in the in-memory table object and is not saved on disk. It will be read -from the .frm file upon first open from MySQL after a server restart. 
*/ -UNIV_INLINE -void -dict_stats_set_persistent( -/*======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool ps_on, /*!< in: persistent stats explicitly enabled */ - ibool ps_off) /*!< in: persistent stats explicitly disabled */ -{ - /* Not allowed to have both flags set, but a CREATE or ALTER - statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would - end up having both set. In this case we clear the OFF flag. */ - if (ps_on && ps_off) { - ps_off = FALSE; - } - - ib_uint32_t stat_persistent = 0; - - if (ps_on) { - stat_persistent |= DICT_STATS_PERSISTENT_ON; - } - - if (ps_off) { - stat_persistent |= DICT_STATS_PERSISTENT_OFF; - } - - /* we rely on this assignment to be atomic */ - table->stat_persistent = stat_persistent; -} - -/** @return whether persistent statistics is enabled for a given table */ -UNIV_INLINE -bool -dict_stats_is_persistent_enabled(const dict_table_t* table) -{ - /* Because of the nature of this check (non-locking) it is possible - that a table becomes: - * PS-disabled immediately after this function has returned TRUE or - * PS-enabled immediately after this function has returned FALSE. - This means that it is possible that we do: - + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has - just been PS-disabled or - + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has - just been PS-enabled. - This is acceptable. Avoiding this would mean that we would have to - hold dict_sys.latch or stats_mutex_lock() like for accessing the - other ::stat_ members which would be too big performance penalty, - especially when this function is called from - dict_stats_update_if_needed(). 
*/ - - /* we rely on this read to be atomic */ - ib_uint32_t stat_persistent = table->stat_persistent; - - if (stat_persistent & DICT_STATS_PERSISTENT_ON) { - ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); - return(true); - } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { - return(false); - } else { - return(srv_stats_persistent); - } -} - -/*********************************************************************//** -Set the auto recalc flag for a given table (only honored for a persistent -stats enabled table). The flag is set only in the in-memory table object -and is not saved in InnoDB files. It will be read from the .frm file upon -first open from MySQL after a server restart. */ -UNIV_INLINE -void -dict_stats_auto_recalc_set( -/*=======================*/ - dict_table_t* table, /*!< in/out: table */ - ibool auto_recalc_on, /*!< in: explicitly enabled */ - ibool auto_recalc_off) /*!< in: explicitly disabled */ -{ - ut_ad(!auto_recalc_on || !auto_recalc_off); - - ib_uint32_t stats_auto_recalc = 0; - - if (auto_recalc_on) { - stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; - } - - if (auto_recalc_off) { - stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; - } - - /* we rely on this assignment to be atomic */ - table->stats_auto_recalc = stats_auto_recalc; -} - -/** @return whether auto recalc is enabled for a given table*/ -UNIV_INLINE -bool -dict_stats_auto_recalc_is_enabled(const dict_table_t* table) -{ - /* we rely on this read to be atomic */ - ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; - - if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { - ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); - return(true); - } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { - return(false); - } else { - return(srv_stats_auto_recalc); - } -} - -/*********************************************************************//** -Initialize table's stats for the first time when opening a table. 
*/ -UNIV_INLINE -void -dict_stats_init( -/*============*/ - dict_table_t* table) /*!< in/out: table */ -{ - ut_ad(!table->stats_mutex_is_owner()); - - if (table->stat_initialized) { - return; - } - - dict_stats_upd_option_t opt; - - if (dict_stats_is_persistent_enabled(table)) { - opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; - } else { - opt = DICT_STATS_RECALC_TRANSIENT; - } - - dict_stats_update(table, opt); -} - -/*********************************************************************//** -Deinitialize table's stats after the last close of the table. This is -used to detect "FLUSH TABLE" and refresh the stats upon next open. */ -UNIV_INLINE -void -dict_stats_deinit( -/*==============*/ - dict_table_t* table) /*!< in/out: table */ -{ - ut_ad(table->stats_mutex_is_owner()); - ut_ad(table->get_ref_count() == 0); - -#ifdef HAVE_valgrind - if (!table->stat_initialized) { - return; - } - - MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows); - MEM_UNDEFINED(&table->stat_clustered_index_size, - sizeof table->stat_clustered_index_size); - MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes, - sizeof table->stat_sum_of_other_index_sizes); - MEM_UNDEFINED(&table->stat_modified_counter, - sizeof table->stat_modified_counter); - - dict_index_t* index; - - for (index = dict_table_get_first_index(table); - index != NULL; - index = dict_table_get_next_index(index)) { - MEM_UNDEFINED( - index->stat_n_diff_key_vals, - index->n_uniq - * sizeof index->stat_n_diff_key_vals[0]); - MEM_UNDEFINED( - index->stat_n_sample_sizes, - index->n_uniq - * sizeof index->stat_n_sample_sizes[0]); - MEM_UNDEFINED( - index->stat_n_non_null_key_vals, - index->n_uniq - * sizeof index->stat_n_non_null_key_vals[0]); - MEM_UNDEFINED( - &index->stat_index_size, - sizeof(index->stat_index_size)); - MEM_UNDEFINED( - &index->stat_n_leaf_pages, - sizeof(index->stat_n_leaf_pages)); - } -#endif /* HAVE_valgrind */ - table->stat_initialized = FALSE; -} diff -Nru 
mariadb-10.11.11/storage/innobase/include/fil0fil.h mariadb-10.11.13/storage/innobase/include/fil0fil.h --- mariadb-10.11.11/storage/innobase/include/fil0fil.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/fil0fil.h 2025-05-19 16:14:25.000000000 +0000 @@ -351,7 +351,7 @@ /** fil_system.spaces chain node */ fil_space_t *hash= nullptr; /** log_sys.get_lsn() of the most recent fil_names_write_if_was_clean(). - Reset to 0 by fil_names_clear(). Protected by log_sys.mutex. + Reset to 0 by fil_names_clear(). Protected by log_sys.latch_have_wr(). If and only if this is nonzero, the tablespace will be in named_spaces. */ lsn_t max_lsn= 0; /** base node for the chain of data files; multiple entries are @@ -422,7 +422,7 @@ bool being_imported= false; /** Whether any corrupton of this tablespace has been reported */ - mutable std::atomic_flag is_corrupted{false}; + mutable std::atomic_flag is_corrupted= ATOMIC_FLAG_INIT; public: /** mutex to protect freed_ranges and last_freed_lsn */ @@ -1527,7 +1527,10 @@ inline void fil_space_t::reacquire() noexcept { - ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed); +#ifdef SAFE_MUTEX + uint32_t n= +#endif + n_pending.fetch_add(1, std::memory_order_relaxed); #ifdef SAFE_MUTEX if (mysql_mutex_is_owner(&fil_system.mutex)) return; ut_ad(n & PENDING); diff -Nru mariadb-10.11.11/storage/innobase/include/fsp0fsp.h mariadb-10.11.13/storage/innobase/include/fsp0fsp.h --- mariadb-10.11.11/storage/innobase/include/fsp0fsp.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/fsp0fsp.h 2025-05-19 16:14:25.000000000 +0000 @@ -355,9 +355,9 @@ @param[out] used number of pages that are used (not more than reserved) @param[in,out] mtr mini-transaction @return number of reserved pages */ -ulint fseg_n_reserved_pages(const buf_block_t &block, - const fseg_header_t *header, ulint *used, - mtr_t *mtr) +uint32_t fseg_n_reserved_pages(const buf_block_t &block, + const 
fseg_header_t *header, uint32_t *used, + mtr_t *mtr) noexcept MY_ATTRIBUTE((nonnull)); /**********************************************************************//** Allocates a single free page from a segment. This function implements diff -Nru mariadb-10.11.11/storage/innobase/include/ibuf0ibuf.h mariadb-10.11.13/storage/innobase/include/ibuf0ibuf.h --- mariadb-10.11.11/storage/innobase/include/ibuf0ibuf.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/ibuf0ibuf.h 2025-05-19 16:14:25.000000000 +0000 @@ -62,11 +62,11 @@ /** Insert buffer struct */ struct ibuf_t{ - Atomic_relaxed size; /*!< current size of the ibuf index + Atomic_relaxed size; /*!< current size of the ibuf index tree, in pages */ - Atomic_relaxed max_size; /*!< recommended maximum size of the + Atomic_relaxed max_size;/*!< recommended maximum size of the ibuf index tree, in pages */ - ulint seg_size; /*!< allocated pages of the file + uint32_t seg_size; /*!< allocated pages of the file segment containing ibuf header and tree */ bool empty; /*!< Protected by the page @@ -75,8 +75,8 @@ (FSP_IBUF_TREE_ROOT_PAGE_NO). true if and only if the insert buffer tree is empty. */ - ulint free_list_len; /*!< length of the free list */ - ulint height; /*!< tree height */ + uint8_t height; /*!< tree height */ + uint32_t free_list_len; /*!< length of the free list */ dict_index_t* index; /*!< insert buffer index */ /** number of pages merged */ diff -Nru mariadb-10.11.11/storage/innobase/include/log0log.h mariadb-10.11.13/storage/innobase/include/log0log.h --- mariadb-10.11.11/storage/innobase/include/log0log.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/log0log.h 2025-05-19 16:14:25.000000000 +0000 @@ -64,20 +64,19 @@ /** Write to the log file up to the last log entry. 
@param durable whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool durable= true); - +void log_buffer_flush_to_disk(bool durable= true) noexcept; /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */ -ATTRIBUTE_COLD void log_write_and_flush_prepare(); +ATTRIBUTE_COLD void log_write_and_flush_prepare() noexcept; /** Durably write the log up to log_sys.get_lsn(). */ -ATTRIBUTE_COLD void log_write_and_flush(); +ATTRIBUTE_COLD void log_write_and_flush() noexcept; /** Make a checkpoint */ -ATTRIBUTE_COLD void log_make_checkpoint(); +ATTRIBUTE_COLD void log_make_checkpoint() noexcept; /** Make a checkpoint at the latest lsn on shutdown. */ -ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown(); +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() noexcept; /******************************************************//** Prints info of the log. */ @@ -167,40 +166,35 @@ static constexpr lsn_t FIRST_LSN= START_OFFSET; private: - /** the lock bit in buf_free */ - static constexpr size_t buf_free_LOCK= ~(~size_t{0} >> 1); + /** the least significant bit of the write_to_buf buffer */ + static constexpr size_t WRITE_TO_BUF_SHIFT{34}; + /** write_lsn_offset component for incrementing write_to_buf */ + static constexpr uint64_t WRITE_TO_BUF{1ULL << WRITE_TO_BUF_SHIFT}; + /** write_lsn_offset flag to indicate that append_prepare_wait() is active */ + static constexpr uint64_t WRITE_BACKOFF{1ULL << 33}; + + /** The current log sequence number, relative to base_lsn, and flags; + may be modified while latch_have_any() */ alignas(CPU_LEVEL1_DCACHE_LINESIZE) - /** first free offset within buf used; - the most significant bit is set by lock_lsn() to protect this field - as well as write_to_buf, waits */ - std::atomic buf_free; -public: - /** number of write requests (to buf); protected by lock_lsn() or lsn_lock */ - size_t write_to_buf; - /** log record buffer, written to by mtr_t::commit() */ - byte *buf; -private: - 
/** The log sequence number of the last change of durable InnoDB files; - protected by lock_lsn() or lsn_lock or latch.wr_lock() */ - std::atomic lsn; + Atomic_relaxed write_lsn_offset; + /** the LSN of the last write_buf() or persist(); protected by latch */ + std::atomic base_lsn; /** the first guaranteed-durable log sequence number */ std::atomic flushed_to_disk_lsn; public: - /** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */ - size_t waits; - /** innodb_log_buffer_size (size of buf,flush_buf if !is_mmap(), in bytes) */ + /** innodb_log_buffer_size (usable append_prepare() size in bytes) */ unsigned buf_size; /** log file size in bytes, including the header */ lsn_t file_size; #ifdef LOG_LATCH_DEBUG typedef srw_lock_debug log_rwlock; - typedef srw_mutex log_lsn_lock; bool latch_have_wr() const { return latch.have_wr(); } bool latch_have_rd() const { return latch.have_rd(); } bool latch_have_any() const { return latch.have_any(); } #else + typedef srw_lock log_rwlock; # ifndef UNIV_DEBUG # elif defined SUX_LOCK_GENERIC bool latch_have_wr() const { return true; } @@ -211,23 +205,23 @@ bool latch_have_rd() const { return latch.is_locked(); } bool latch_have_any() const { return latch.is_locked(); } # endif -# ifdef __aarch64__ - /* On ARM, we spin more */ - typedef srw_spin_lock log_rwlock; - typedef pthread_mutex_wrapper log_lsn_lock; -# else - typedef srw_lock log_rwlock; - typedef srw_mutex log_lsn_lock; -# endif #endif - /** exclusive latch for checkpoint, shared for mtr_t::commit() to buf */ - alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock latch; + /** latch_have_wr() for checkpoint, latch_have_any() for append_prepare() */ + log_rwlock latch; + + /** log record buffer, written to by mtr_t::commit() */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) byte *buf; + + /** number of write requests to buf, + excluding (write_lsn_offset & WRITE_TO_BUF); + protected by latch.wr_lock() */ + size_t write_to_buf; /** number of writes from buf or flush_buf 
to log; protected by latch.wr_lock() */ - ulint write_to_log; + size_t write_to_log; - /** Last written LSN */ + /** Last written LSN; protected by latch */ lsn_t write_lsn; /** Buffer for writing data to ib_logfile0, or nullptr if is_mmap(). @@ -241,8 +235,6 @@ Atomic_relaxed checkpoint_pending; /** next checkpoint number (protected by latch.wr_lock()) */ byte next_checkpoint_no; - /** recommended maximum buf_free size, after which the buffer is flushed */ - unsigned max_buf_free; /** Log sequence number when a log file overwrite (broken crash recovery) was noticed. Protected by latch.wr_lock(). */ lsn_t overwrite_warned; @@ -266,12 +258,6 @@ /** Buffer for writing to resize_log; @see flush_buf */ byte *resize_flush_buf; - /** Special implementation of lock_lsn() for IA-32 and AMD64 */ - void lsn_lock_bts() noexcept; - /** Acquire a lock for updating buf_free and related fields. - @return the value of buf_free */ - size_t lock_lsn() noexcept; - /** log sequence number when log resizing was initiated; 0 if the log is not being resized, 1 if resize_start() is in progress */ std::atomic resize_lsn; @@ -303,7 +289,6 @@ bool log_maybe_unbuffered; # endif #endif - /** Fields involved in checkpoints @{ */ lsn_t log_capacity; /*!< capacity of the log; if the checkpoint age exceeds this, it is @@ -326,34 +311,26 @@ /* @} */ private: - /** A lock when the spin-only lock_lsn() is not being used */ - log_lsn_lock lsn_lock; + /** the thread that initiated resize_lsn() */ + Atomic_relaxed resize_initiator; +#ifdef HAVE_PMEM + /** mutex protecting wrap-around in resize_write() */ + srw_mutex resize_wrap_mutex; +#endif public: + /** number of long append_prepare_wait(); protected by latch_have_wr() */ + size_t waits; - bool is_initialised() const noexcept { return max_buf_free != 0; } - - /** whether there is capacity in the log buffer */ - bool buf_free_ok() const noexcept - { - ut_ad(!is_mmap()); - return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) < - 
max_buf_free; - } - + bool is_initialised() const noexcept + { return base_lsn.load(std::memory_order_relaxed) != 0; } inline void set_recovered() noexcept; - void set_buf_free(size_t f) noexcept - { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); } - bool is_mmap() const noexcept { return !flush_buf; } /** @return whether a handle to the log is open; is_mmap() && !is_opened() holds for PMEM */ bool is_opened() const noexcept { return log.is_opened(); } - /** @return target write LSN to react on !buf_free_ok() */ - inline lsn_t get_write_target() const; - /** @return LSN at which log resizing was started and is still in progress @retval 0 if no log resizing is in progress @retval 1 if resize_start() is in progress */ @@ -367,11 +344,17 @@ /** Start resizing the log and release the exclusive latch. @param size requested new file_size + @param thd the current thread identifier @return whether the resizing was started successfully */ - resize_start_status resize_start(os_offset_t size) noexcept; + resize_start_status resize_start(os_offset_t size, void *thd) noexcept; - /** Abort any resize_start(). */ - void resize_abort() noexcept; + /** Abort a resize_start() that we started. + @param thd thread identifier that had been passed to resize_start() */ + void resize_abort(void *thd) noexcept; + + /** @return whether a particular resize_start() is in progress */ + bool resize_running(void *thd) const noexcept + { return thd == resize_initiator; } /** Replicate a write to the log. @param lsn start LSN @@ -400,53 +383,64 @@ { return resize_buf + resize_target; } /** Initialise the redo log subsystem. */ - void create(); + void create() noexcept; /** Attach a log file. 
@return whether the memory allocation succeeded */ - bool attach(log_file_t file, os_offset_t size); + bool attach(log_file_t file, os_offset_t size) noexcept; /** Disable memory-mapped access (update log_mmap) */ - void clear_mmap(); - void close_file(bool really_close= true); + void clear_mmap() noexcept; + void close_file(bool really_close= true) noexcept; #if defined __linux__ || defined _WIN32 /** Try to enable or disable file system caching (update log_buffered) */ - void set_buffered(bool buffered); + void set_buffered(bool buffered) noexcept; #endif /** Calculate the checkpoint safety margins. */ - static void set_capacity(); + static void set_capacity() noexcept; /** Write a log file header. @param buf log header buffer @param lsn log sequence number corresponding to log_sys.START_OFFSET @param encrypted whether the log is encrypted */ - static void header_write(byte *buf, lsn_t lsn, bool encrypted); + static void header_write(byte *buf, lsn_t lsn, bool encrypted) noexcept; - lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const - { return lsn.load(order); } + /** @return a lower bound estimate of get_lsn(), + using acquire-release ordering with write_buf() or persist(); + this is exact unless append_prepare_wait() is pending */ + lsn_t get_lsn_approx() const noexcept + { + /* acquire-release ordering with write_buf() and persist() */ + lsn_t lsn= base_lsn.load(std::memory_order_acquire); + lsn += write_lsn_offset.load(std::memory_order_relaxed) & + (WRITE_BACKOFF - 1); + return lsn; + } + + /** @return the current log sequence number (logical time stamp) */ + lsn_t get_lsn() const noexcept + { + ut_ad(latch_have_wr()); + return base_lsn.load(std::memory_order_relaxed) + + (write_lsn_offset & (WRITE_BACKOFF - 1)); + } lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) const noexcept { return flushed_to_disk_lsn.load(order); } /** Initialize the LSN on initial log file creation. 
*/ - lsn_t init_lsn() noexcept - { - latch.wr_lock(SRW_LOCK_CALL); - const lsn_t lsn{get_lsn()}; - flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); - write_lsn= lsn; - latch.wr_unlock(); - return lsn; - } + inline lsn_t init_lsn() noexcept; void set_recovered_lsn(lsn_t lsn) noexcept { ut_ad(latch_have_wr()); - write_lsn= lsn; - this->lsn.store(lsn, std::memory_order_relaxed); + uint64_t lsn_offset= ((write_size - 1) & (lsn - first_lsn)); + write_lsn_offset= lsn_offset; + base_lsn.store(lsn - lsn_offset, std::memory_order_relaxed); flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; } #ifdef HAVE_PMEM @@ -481,25 +475,19 @@ private: /** Update writer and mtr_t::finisher */ - void writer_update() noexcept; + void writer_update(bool resizing) noexcept; /** Wait in append_prepare() for buffer to become available - @tparam spin whether to use the spin-only lock_lsn() - @param b the value of buf_free - @param ex whether log_sys.latch is exclusively locked - @param lsn log sequence number to write up to - @return the new value of buf_free */ - template - ATTRIBUTE_COLD size_t append_prepare_wait(size_t b, bool ex, lsn_t lsn) - noexcept; + @param late whether the WRITE_BACKOFF flag had already been set + @param ex whether log_sys.latch is exclusively locked */ + ATTRIBUTE_COLD void append_prepare_wait(bool late, bool ex) noexcept; public: /** Reserve space in the log buffer for appending data. - @tparam spin whether to use the spin-only lock_lsn() @tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ - template + template std::pair append_prepare(size_t size, bool ex) noexcept; /** Append a string of bytes to the redo log. @@ -570,7 +558,10 @@ /** Wait for a log checkpoint if needed. 
NOTE that this function may only be called while not holding any synchronization objects except dict_sys.latch. */ -void log_free_check(); +void log_free_check() noexcept; + +/** @return the current log sequence number (may be stale) */ +lsn_t log_get_lsn() noexcept; /** Release the latches that protect log resizing. */ -void log_resize_release(); +void log_resize_release() noexcept; diff -Nru mariadb-10.11.11/storage/innobase/include/log0recv.h mariadb-10.11.13/storage/innobase/include/log0recv.h --- mariadb-10.11.11/storage/innobase/include/log0recv.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/log0recv.h 2025-05-19 16:14:25.000000000 +0000 @@ -118,15 +118,17 @@ const fil_space_t *space= nullptr, byte *tmp_buf= nullptr) const noexcept; - /** Find the doublewrite copy of an encrypted page with the - smallest FIL_PAGE_LSN that is large enough for recovery. + /** Find the doublewrite copy of an encrypted/page_compressed + page with the smallest FIL_PAGE_LSN that is large enough for + recovery. @param space tablespace object @param page_no page number to find - @param buf buffer for unencrypted page + @param buf buffer for unencrypted/uncompressed page @return buf @retval nullptr if the page was not found in doublewrite buffer */ - byte *find_encrypted_page(const fil_node_t &space, uint32_t page_no, - byte *buf) noexcept; + ATTRIBUTE_COLD byte *find_deferred_page(const fil_node_t &space, + uint32_t page_no, + byte *buf) noexcept; /** Restore the first page of the given tablespace from doublewrite buffer. 
diff -Nru mariadb-10.11.11/storage/innobase/include/mtr0mtr.h mariadb-10.11.13/storage/innobase/include/mtr0mtr.h --- mariadb-10.11.11/storage/innobase/include/mtr0mtr.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/mtr0mtr.h 2025-05-19 16:14:25.000000000 +0000 @@ -700,19 +700,19 @@ @param mtr mini-transaction @param lsns {start_lsn,flush_ahead} */ template - static void commit_log(mtr_t *mtr, std::pair lsns); + static void commit_log(mtr_t *mtr, std::pair lsns) + noexcept; /** Append the redo log records to the redo log buffer. @return {start_lsn,flush_ahead} */ std::pair do_write(); /** Append the redo log records to the redo log buffer. - @tparam spin whether to use the spin-only log_sys.lock_lsn() @tparam mmap log_sys.is_mmap() @param mtr mini-transaction @param len number of bytes to write @return {start_lsn,flush_ahead} */ - template static + template static std::pair finish_writer(mtr_t *mtr, size_t len); /** The applicable variant of commit_log() */ @@ -723,9 +723,6 @@ std::pair finish_write(size_t len) { return finisher(this, len); } public: - /** Poll interval in log_sys.lock_lsn(); 0 to use log_sys.lsn_lock. - Protected by LOCK_global_system_variables and log_sys.latch. */ - static unsigned spin_wait_delay; /** Update finisher when spin_wait_delay is changing to or from 0. */ static void finisher_update(); private: diff -Nru mariadb-10.11.11/storage/innobase/include/os0file.h mariadb-10.11.13/storage/innobase/include/os0file.h --- mariadb-10.11.11/storage/innobase/include/os0file.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/os0file.h 2025-05-19 16:14:25.000000000 +0000 @@ -1003,6 +1003,8 @@ size_t os_aio_pending_reads_approx() noexcept; /** @return number of pending writes */ size_t os_aio_pending_writes() noexcept; +/** @return approximate number of pending writes */ +size_t os_aio_pending_writes_approx() noexcept; /** Wait until there are no pending asynchronous writes. 
@param declare whether the wait will be declared in tpool */ diff -Nru mariadb-10.11.11/storage/innobase/include/row0row.h mariadb-10.11.13/storage/innobase/include/row0row.h --- mariadb-10.11.11/storage/innobase/include/row0row.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/row0row.h 2025-05-19 16:14:25.000000000 +0000 @@ -328,22 +328,6 @@ mtr_t* mtr) /*!< in: mtr */ MY_ATTRIBUTE((nonnull, warn_unused_result)); -/** Parse the integer data from specified data, which could be -DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0 -and the type is not unsigned then we reset the value to 0 -@param[in] data data to read -@param[in] len length of data -@param[in] mtype mtype of data -@param[in] unsigned_type if the data is unsigned -@return the integer value from the data */ -inline -ib_uint64_t -row_parse_int( - const byte* data, - ulint len, - ulint mtype, - bool unsigned_type); - /** Result of row_search_index_entry */ enum row_search_result { ROW_FOUND = 0, /*!< the record was found */ diff -Nru mariadb-10.11.11/storage/innobase/include/row0row.inl mariadb-10.11.13/storage/innobase/include/row0row.inl --- mariadb-10.11.11/storage/innobase/include/row0row.inl 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/row0row.inl 2025-05-19 16:14:25.000000000 +0000 @@ -170,52 +170,3 @@ } } } - -/** Parse the integer data from specified data, which could be -DATA_INT, DATA_FLOAT or DATA_DOUBLE. 
If the value is less than 0 -and the type is not unsigned then we reset the value to 0 -@param[in] data data to read -@param[in] len length of data -@param[in] mtype mtype of data -@param[in] unsigned_type if the data is unsigned -@return the integer value from the data */ -ib_uint64_t -row_parse_int( - const byte* data, - ulint len, - ulint mtype, - bool unsigned_type) -{ - ib_uint64_t value = 0; - - switch (mtype) { - case DATA_INT: - - ut_a(len <= sizeof value); - value = mach_read_int_type(data, len, unsigned_type); - break; - - case DATA_FLOAT: - - ut_a(len == sizeof(float)); - value = static_cast(mach_float_read(data)); - break; - - case DATA_DOUBLE: - - ut_a(len == sizeof(double)); - value = static_cast(mach_double_read(data)); - break; - - default: - ut_error; - - } - - if (!unsigned_type && static_cast(value) < 0) { - value = 0; - } - - return(value); -} - diff -Nru mariadb-10.11.11/storage/innobase/include/row0sel.h mariadb-10.11.13/storage/innobase/include/row0sel.h --- mariadb-10.11.11/storage/innobase/include/row0sel.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/row0sel.h 2025-05-19 16:14:25.000000000 +0000 @@ -182,9 +182,8 @@ @param[in] index index starting with an AUTO_INCREMENT column @return the largest AUTO_INCREMENT value @retval 0 if no records were found */ -ib_uint64_t -row_search_max_autoinc(dict_index_t* index) - MY_ATTRIBUTE((nonnull, warn_unused_result)); +uint64_t row_search_max_autoinc(dict_index_t *index) noexcept + MY_ATTRIBUTE((nonnull, warn_unused_result)); /** A structure for caching column values for prefetched rows */ struct sel_buf_t{ diff -Nru mariadb-10.11.11/storage/innobase/include/srv0srv.h mariadb-10.11.13/storage/innobase/include/srv0srv.h --- mariadb-10.11.11/storage/innobase/include/srv0srv.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/srv0srv.h 2025-05-19 16:14:25.000000000 +0000 @@ -223,17 +223,6 @@ extern my_bool srv_adaptive_flushing; extern 
my_bool srv_flush_sync; -/** Requested size in bytes */ -extern ulint srv_buf_pool_size; -/** Requested buffer pool chunk size */ -extern size_t srv_buf_pool_chunk_unit; -/** Scan depth for LRU flush batch i.e.: number of blocks scanned*/ -/** Previously requested size */ -extern ulint srv_buf_pool_old_size; -/** Current size as scaling factor for the other components */ -extern ulint srv_buf_pool_base_size; -/** Current size in bytes */ -extern ulint srv_buf_pool_curr_size; /** Dump this % of each buffer pool during BP dump */ extern ulong srv_buf_pool_dump_pct; #ifdef UNIV_DEBUG @@ -267,8 +256,8 @@ /* We use this dummy default value at startup for max_io_capacity. The real value is set based on the value of io_capacity. */ -#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL) -#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL) +#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (UINT32_MAX) +#define SRV_MAX_IO_CAPACITY_LIMIT (UINT32_MAX) extern ulong srv_max_io_capacity; /* The "innodb_stats_method" setting, decides how InnoDB is going @@ -294,9 +283,9 @@ extern ibool srv_innodb_status; -extern unsigned long long srv_stats_transient_sample_pages; +extern uint32_t srv_stats_transient_sample_pages; extern my_bool srv_stats_persistent; -extern unsigned long long srv_stats_persistent_sample_pages; +extern uint32_t srv_stats_persistent_sample_pages; extern my_bool srv_stats_auto_recalc; extern my_bool srv_stats_include_delete_marked; extern unsigned long long srv_stats_modified_counter; @@ -596,7 +585,7 @@ #endif /* BTR_CUR_HASH_ADAPT */ char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */ char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */ - char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */ + char innodb_buffer_pool_resize_status[65];/*!< Buf pool resize status */ my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool 
size */ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */ diff -Nru mariadb-10.11.11/storage/innobase/include/trx0trx.h mariadb-10.11.13/storage/innobase/include/trx0trx.h --- mariadb-10.11.11/storage/innobase/include/trx0trx.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/trx0trx.h 2025-05-19 16:14:25.000000000 +0000 @@ -809,8 +809,13 @@ /** normally set; "SET unique_checks=0, foreign_key_checks=0" enables bulk insert into an empty table */ unsigned check_unique_secondary:1; - /** whether an insert into an empty table is active */ - unsigned bulk_insert:1; + /** whether an insert into an empty table is active + Possible states are + TRX_NO_BULK + TRX_DML_BULK + TRX_DDL_BULK + @see trx_bulk_insert in trx0types.h */ + unsigned bulk_insert:2; /*------------------------------*/ /* MySQL has a transaction coordinator to coordinate two phase commit between multiple storage engines and the binary log. When @@ -1117,6 +1122,7 @@ ut_ad(!is_not_inheriting_locks()); ut_ad(check_foreigns); ut_ad(check_unique_secondary); + ut_ad(bulk_insert == TRX_NO_BULK); } /** This has to be invoked on SAVEPOINT or at the end of a statement. @@ -1142,6 +1148,8 @@ rollback to the start of a statement will work. */ void end_bulk_insert() { + if (bulk_insert == TRX_DDL_BULK) + return; for (auto& t : mod_tables) t.second.end_bulk_insert(); } @@ -1149,7 +1157,15 @@ /** @return whether a bulk insert into empty table is in progress */ bool is_bulk_insert() const { - if (!bulk_insert || check_unique_secondary || check_foreigns) + switch (bulk_insert) { + case TRX_NO_BULK: + return false; + case TRX_DDL_BULK: + return true; + default: + ut_ad(bulk_insert == TRX_DML_BULK); + } + if (check_unique_secondary || check_foreigns) return false; for (const auto& t : mod_tables) if (t.second.is_bulk_insert()) @@ -1179,9 +1195,11 @@ /** Do the bulk insert for the buffered insert operation for the transaction. 
@return DB_SUCCESS or error code */ + template dberr_t bulk_insert_apply() { - return UNIV_UNLIKELY(bulk_insert) ? bulk_insert_apply_low(): DB_SUCCESS; + static_assert(type != TRX_NO_BULK, ""); + return bulk_insert == type ? bulk_insert_apply_low(): DB_SUCCESS; } private: diff -Nru mariadb-10.11.11/storage/innobase/include/trx0types.h mariadb-10.11.13/storage/innobase/include/trx0types.h --- mariadb-10.11.11/storage/innobase/include/trx0types.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/trx0types.h 2025-05-19 16:14:25.000000000 +0000 @@ -65,6 +65,15 @@ TRX_STATE_COMMITTED_IN_MEMORY }; +/** Transaction bulk insert operation @see trx_t::bulk_insert */ +enum trx_bulk_insert { + TRX_NO_BULK, + /** bulk insert is being executed during DML */ + TRX_DML_BULK, + /** bulk insert is being executed in copy_data_between_tables() */ + TRX_DDL_BULK +}; + /** Memory objects */ /* @{ */ /** Transaction */ diff -Nru mariadb-10.11.11/storage/innobase/include/ut0new.h mariadb-10.11.13/storage/innobase/include/ut0new.h --- mariadb-10.11.11/storage/innobase/include/ut0new.h 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/include/ut0new.h 2025-05-19 16:14:25.000000000 +0000 @@ -277,7 +277,6 @@ #ifdef UNIV_PFS_MEMORY /** Default constructor. */ - explicit ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED) : m_key(key) { diff -Nru mariadb-10.11.11/storage/innobase/lock/lock0lock.cc mariadb-10.11.13/storage/innobase/lock/lock0lock.cc --- mariadb-10.11.11/storage/innobase/lock/lock0lock.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/lock/lock0lock.cc 2025-05-19 16:14:25.000000000 +0000 @@ -4140,13 +4140,12 @@ children.end()) continue; /* We already acquired MDL on this child table. 
*/ MDL_ticket *mdl= nullptr; - child->acquire(); child= dict_acquire_mdl_shared(child, mdl_context, &mdl, DICT_TABLE_OP_NORMAL); if (child) { - if (!mdl) - child->release(); + if (mdl) + child->acquire(); children.emplace_back(table_mdl{child, mdl}); goto rescan; } @@ -6053,17 +6052,10 @@ for it */ trx_t *trx = thr_get_trx(thr); - if (const trx_t *owner = - lock_rec_convert_impl_to_expl(trx, *block, - rec, index, offsets)) { - if (owner == trx) { - /* We already hold an exclusive lock. */ - return DB_SUCCESS; - } - - if (trx->snapshot_isolation && trx->read_view.is_open()) { - return DB_RECORD_CHANGED; - } + if (lock_rec_convert_impl_to_expl(trx, *block, + rec, index, offsets) == trx) { + /* We already hold an exclusive lock. */ + return DB_SUCCESS; } err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP, @@ -6225,19 +6217,11 @@ return DB_SUCCESS; } - if (page_rec_is_supremum(rec)) { - } else if (const trx_t *owner = - lock_rec_convert_impl_to_expl(trx, *block, - rec, index, offsets)) { - if (owner == trx) { - if (gap_mode == LOCK_REC_NOT_GAP) { - /* We already hold an exclusive lock. */ - return DB_SUCCESS; - } - } else if (trx->snapshot_isolation - && trx->read_view.is_open()) { - return DB_RECORD_CHANGED; - } + if (!page_rec_is_supremum(rec) + && lock_rec_convert_impl_to_expl(trx, *block, rec, index, + offsets) == trx + && gap_mode == LOCK_REC_NOT_GAP) { + return DB_SUCCESS; } #ifdef WITH_WSREP @@ -6317,28 +6301,24 @@ trx_t *trx = thr_get_trx(thr); if (lock_table_has(trx, index->table, LOCK_X) || heap_no == PAGE_HEAP_NO_SUPREMUM) { - } else if (const trx_t *owner = - lock_rec_convert_impl_to_expl(trx, *block, - rec, index, offsets)) { - if (owner == trx) { - if (gap_mode == LOCK_REC_NOT_GAP) { - /* We already hold an exclusive lock. 
*/ - return DB_SUCCESS; - } - } else if (trx->snapshot_isolation - && trx->read_view.is_open()) { - return DB_RECORD_CHANGED; - } + } else if (lock_rec_convert_impl_to_expl(trx, *block, rec, index, + offsets) == trx + && gap_mode == LOCK_REC_NOT_GAP) { + /* We already hold an exclusive lock. */ + return DB_SUCCESS; } if (heap_no > PAGE_HEAP_NO_SUPREMUM && gap_mode != LOCK_GAP && trx->snapshot_isolation - && trx->read_view.is_open() - && !trx->read_view.changes_visible( - trx_read_trx_id(rec + row_trx_id_offset(rec, index))) - && IF_WSREP(!(trx->is_wsrep() + && trx->read_view.is_open()) { + trx_id_t trx_id= trx_read_trx_id(rec + + row_trx_id_offset(rec, index)); + if (!trx_sys.is_registered(trx, trx_id) + && !trx->read_view.changes_visible(trx_id) + && IF_WSREP(!(trx->is_wsrep() && wsrep_thd_skip_locking(trx->mysql_thd)), true)) { - return DB_RECORD_CHANGED; + return DB_RECORD_CHANGED; + } } dberr_t err = lock_rec_lock(false, gap_mode | mode, @@ -7109,10 +7089,6 @@ victim->lock.was_chosen_as_deadlock_victim= true; DEBUG_SYNC_C("deadlock_report_before_lock_releasing"); lock_cancel_waiting_and_release(victim->lock.wait_lock); -#ifdef WITH_WSREP - if (victim->is_wsrep() && wsrep_thd_is_SR(victim->mysql_thd)) - wsrep_handle_SR_rollback(trx->mysql_thd, victim->mysql_thd); -#endif } func_exit: diff -Nru mariadb-10.11.11/storage/innobase/log/log0crypt.cc mariadb-10.11.13/storage/innobase/log/log0crypt.cc --- mariadb-10.11.11/storage/innobase/log/log0crypt.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/log/log0crypt.cc 2025-05-19 16:14:25.000000000 +0000 @@ -566,7 +566,7 @@ alignas(8) byte iv[MY_AES_BLOCK_SIZE]; - m_commit_lsn= log_sys.get_lsn(); + m_commit_lsn= log_sys.get_flushed_lsn(); ut_ad(m_commit_lsn); byte *tmp= static_cast(alloca(srv_page_size)), *t= tmp; byte *dst= static_cast(alloca(srv_page_size)); diff -Nru mariadb-10.11.11/storage/innobase/log/log0log.cc mariadb-10.11.13/storage/innobase/log/log0log.cc --- 
mariadb-10.11.11/storage/innobase/log/log0log.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/log/log0log.cc 2025-05-19 16:14:25.000000000 +0000 @@ -68,7 +68,7 @@ #define LOG_BUF_FLUSH_MARGIN ((4 * 4096) /* cf. log_t::append_prepare() */ \ + (4U << srv_page_size_shift)) -void log_t::set_capacity() +void log_t::set_capacity() noexcept { ut_ad(log_sys.latch_have_wr()); /* Margin for the free space in the smallest log, before a new query @@ -87,13 +87,15 @@ log_sys.max_checkpoint_age = margin; } -void log_t::create() +void log_t::create() noexcept { ut_ad(this == &log_sys); ut_ad(!is_initialised()); + latch.SRW_LOCK_INIT(log_latch_key); + write_lsn_offset= 0; /* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */ - lsn.store(FIRST_LSN, std::memory_order_relaxed); + base_lsn.store(FIRST_LSN, std::memory_order_relaxed); flushed_to_disk_lsn.store(FIRST_LSN, std::memory_order_relaxed); need_checkpoint.store(true, std::memory_order_relaxed); write_lsn= FIRST_LSN; @@ -102,10 +104,10 @@ ut_ad(!buf); ut_ad(!flush_buf); ut_ad(!writer); - max_buf_free= 1; - latch.SRW_LOCK_INIT(log_latch_key); - lsn_lock.init(); +#ifdef HAVE_PMEM + resize_wrap_mutex.init(); +#endif last_checkpoint_lsn= FIRST_LSN; log_capacity= 0; @@ -114,8 +116,6 @@ next_checkpoint_lsn= 0; checkpoint_pending= false; - set_buf_free(0); - ut_ad(is_initialised()); } @@ -306,7 +306,7 @@ #if defined __linux__ || defined _WIN32 /** Display a message about opening the log */ -ATTRIBUTE_COLD static void log_file_message() +ATTRIBUTE_COLD static void log_file_message() noexcept { sql_print_information("InnoDB: %s (block size=%u bytes)", log_sys.log_mmap @@ -320,10 +320,10 @@ log_sys.write_size); } #else -static inline void log_file_message() {} +static inline void log_file_message() noexcept {} #endif -bool log_t::attach(log_file_t file, os_offset_t size) +bool log_t::attach(log_file_t file, os_offset_t size) noexcept { log= file; ut_ad(!size || size >= START_OFFSET + 
SIZE_OF_FILE_CHECKPOINT); @@ -352,8 +352,7 @@ } # endif buf= static_cast(ptr); - max_buf_free= 1; - writer_update(); + writer_update(false); # ifdef HAVE_PMEM if (is_pmem) return true; @@ -366,7 +365,7 @@ if (!buf) { alloc_fail: - max_buf_free= 0; + base_lsn.store(0, std::memory_order_relaxed); sql_print_error("InnoDB: Cannot allocate memory;" " too large innodb_log_buffer_size?"); return false; @@ -394,8 +393,7 @@ TRASH_ALLOC(buf, buf_size); TRASH_ALLOC(flush_buf, buf_size); - max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; - writer_update(); + writer_update(false); memset_aligned<512>(checkpoint_buf, 0, write_size); func_exit: @@ -407,7 +405,7 @@ @param buf log header buffer @param lsn log sequence number corresponding to log_sys.START_OFFSET @param encrypted whether the log is encrypted */ -void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted) +void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted) noexcept { mach_write_to_4(my_assume_aligned<4>(buf) + LOG_HEADER_FORMAT, log_sys.FORMAT_10_8); @@ -436,8 +434,9 @@ ut_ad(is_latest()); ut_ad(this == &log_sys); - this->lsn.store(lsn, std::memory_order_relaxed); - this->flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn_offset= 0; + base_lsn.store(lsn, std::memory_order_relaxed); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); first_lsn= lsn; write_lsn= lsn; @@ -452,14 +451,13 @@ mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); memset_aligned<4096>(buf, 0, 4096); log_sys.header_write(buf, lsn, is_encrypted()); - set_buf_free(START_OFFSET); pmem_persist(buf, 512); + buf_size= unsigned(std::min(capacity(), buf_size_max)); } else #endif { ut_ad(!is_mmap()); - set_buf_free(0); memset_aligned<4096>(flush_buf, 0, buf_size); memset_aligned<4096>(buf, 0, buf_size); log_sys.header_write(buf, lsn, is_encrypted()); @@ -468,12 +466,12 @@ } } -ATTRIBUTE_COLD static void log_close_failed(dberr_t err) +ATTRIBUTE_COLD static void log_close_failed(dberr_t 
err) noexcept { ib::fatal() << "closing ib_logfile0 failed: " << err; } -void log_t::close_file(bool really_close) +void log_t::close_file(bool really_close) noexcept { if (is_mmap()) { @@ -508,16 +506,25 @@ log_close_failed(err); } +/** @return the current log sequence number (may be stale) */ +lsn_t log_get_lsn() noexcept +{ + log_sys.latch.wr_lock(SRW_LOCK_CALL); + lsn_t lsn= log_sys.get_lsn(); + log_sys.latch.wr_unlock(); + return lsn; +} + /** Acquire all latches that protect the log. */ -static void log_resize_acquire() +static void log_resize_acquire() noexcept { #ifdef HAVE_PMEM if (!log_sys.is_mmap()) #endif { - while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (flush_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); - while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (write_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); } @@ -525,7 +532,7 @@ } /** Release the latches that protect the log. */ -void log_resize_release() +void log_resize_release() noexcept { log_sys.latch.wr_unlock(); @@ -542,7 +549,7 @@ #if defined __linux__ || defined _WIN32 /** Try to enable or disable file system caching (update log_buffered) */ -void log_t::set_buffered(bool buffered) +void log_t::set_buffered(bool buffered) noexcept { if (!log_maybe_unbuffered || #ifdef HAVE_PMEM @@ -570,31 +577,35 @@ /** Start resizing the log and release the exclusive latch. 
@param size requested new file_size +@param thd the current thread identifier @return whether the resizing was started successfully */ -log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept +log_t::resize_start_status log_t::resize_start(os_offset_t size, void *thd) + noexcept { ut_ad(size >= 4U << 20); ut_ad(!(size & 4095)); ut_ad(!srv_read_only_mode); + ut_ad(thd); log_resize_acquire(); - resize_start_status status= RESIZE_NO_CHANGE; - lsn_t start_lsn{0}; -#ifdef HAVE_PMEM - bool is_pmem{false}; -#endif + resize_start_status status; - if (resize_in_progress()) + if (size == file_size) + status= RESIZE_NO_CHANGE; + else if (resize_in_progress()) status= RESIZE_IN_PROGRESS; - else if (size != file_size) + else { + lsn_t start_lsn; ut_ad(!resize_in_progress()); ut_ad(!resize_log.is_opened()); ut_ad(!resize_buf); ut_ad(!resize_flush_buf); + ut_ad(!resize_initiator); std::string path{get_log_file_path("ib_logfile101")}; bool success; + resize_initiator= thd; resize_lsn.store(1, std::memory_order_relaxed); resize_target= 0; resize_log.m_file= @@ -612,6 +623,7 @@ #ifdef HAVE_PMEM else if (is_mmap()) { + bool is_pmem{false}; ptr= ::log_mmap(resize_log.m_file, is_pmem, size); if (ptr == MAP_FAILED) @@ -661,34 +673,33 @@ else if (!is_opened()) resize_log.close(); - writer_update(); + resize_lsn.store(start_lsn, std::memory_order_relaxed); + writer_update(true); + log_resize_release(); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t target_lsn= buf_pool.get_oldest_modification(0); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_ahead(start_lsn < target_lsn ? target_lsn + 1 : start_lsn, + false); + return RESIZE_STARTED; } - status= success ? 
RESIZE_STARTED : RESIZE_FAILED; } - resize_lsn.store(start_lsn, std::memory_order_relaxed); + resize_initiator= nullptr; + resize_lsn.store(0, std::memory_order_relaxed); + status= RESIZE_FAILED; } log_resize_release(); - - if (start_lsn) - { - mysql_mutex_lock(&buf_pool.flush_list_mutex); - lsn_t target_lsn= buf_pool.get_oldest_modification(0); - if (start_lsn < target_lsn) - start_lsn= target_lsn + 1; - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - buf_flush_ahead(start_lsn, false); - } - return status; } -/** Abort log resizing. */ -void log_t::resize_abort() noexcept +/** Abort a resize_start() that we started. */ +void log_t::resize_abort(void *thd) noexcept { log_resize_acquire(); - if (resize_in_progress() > 1) + if (resize_running(thd)) { #ifdef HAVE_PMEM const bool is_mmap{this->is_mmap()}; @@ -715,11 +726,12 @@ resize_buf= nullptr; resize_target= 0; resize_lsn.store(0, std::memory_order_relaxed); + resize_initiator= nullptr; std::string path{get_log_file_path("ib_logfile101")}; IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + writer_update(false); } - writer_update(); log_resize_release(); } @@ -882,9 +894,7 @@ ut_ad(!is_opened()); ut_ad(!write_lock.is_owner()); ut_ad(!flush_lock.is_owner()); -#ifdef LOG_LATCH_DEBUG - ut_ad(latch_have_any()); -#endif + ut_ad(latch_have_wr()); lsn_t old= flushed_to_disk_lsn.load(std::memory_order_relaxed); @@ -902,26 +912,26 @@ else pmem_persist(buf + start, end - start); - old= flushed_to_disk_lsn.load(std::memory_order_relaxed); - - if (old < lsn) - { - while (!flushed_to_disk_lsn.compare_exchange_weak - (old, lsn, std::memory_order_release, std::memory_order_relaxed)) - if (old >= lsn) - break; - - log_flush_notify(lsn); - DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); - } + uint64_t offset{write_lsn_offset}; + const lsn_t new_base_lsn= base_lsn.load(std::memory_order_relaxed) + + (offset & (WRITE_BACKOFF - 1)); + ut_ad(new_base_lsn >= lsn); + write_to_buf+= size_t(offset >> 
WRITE_TO_BUF_SHIFT); + /* This synchronizes with get_lsn_approx(); + we must store write_lsn_offset before base_lsn. */ + write_lsn_offset.store(0, std::memory_order_relaxed); + base_lsn.store(new_base_lsn, std::memory_order_release); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + log_flush_notify(lsn); + DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); } ATTRIBUTE_NOINLINE static void log_write_persist(lsn_t lsn) noexcept { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.persist(lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); } #endif @@ -972,7 +982,7 @@ ut_ad(resizing == RETAIN_LATCH || (resizing == RESIZING) == (resize_in_progress() > 1)); - const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; + const lsn_t lsn{get_lsn()}; if (write_lsn >= lsn) { @@ -988,7 +998,8 @@ ut_ad(write_lsn >= get_flushed_lsn()); const size_t write_size_1{write_size - 1}; ut_ad(ut_is_2pow(write_size)); - size_t length{buf_free.load(std::memory_order_relaxed)}; + lsn_t base= base_lsn.load(std::memory_order_relaxed); + size_t length{size_t(lsn - base)}; lsn_t offset{calc_lsn_offset(write_lsn)}; ut_ad(length >= (offset & write_size_1)); ut_ad(write_size_1 >= 511); @@ -1010,14 +1021,8 @@ { ut_ad(!((length ^ (size_t(lsn) - size_t(first_lsn))) & write_size_1)); /* Keep filling the same buffer until we have more than one block. */ -#if 0 /* TODO: Pad the last log block with dummy records. */ - buf_free= log_pad(lsn, (write_size_1 + 1) - length, - buf + length, flush_buf); - ... /* TODO: Update the LSN and adjust other code. 
*/ -#else MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - length); buf[length]= 0; /* ensure that recovery catches EOF */ -#endif if (UNIV_LIKELY_NULL(re_write_buf)) { MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - length); @@ -1028,8 +1033,13 @@ else { const size_t new_buf_free{length & write_size_1}; + base+= length & ~write_size_1; ut_ad(new_buf_free == ((lsn - first_lsn) & write_size_1)); - buf_free.store(new_buf_free, std::memory_order_relaxed); + write_to_buf+= size_t(write_lsn_offset >> WRITE_TO_BUF_SHIFT); + /* This synchronizes with get_lsn_approx(); + we must store write_lsn_offset before base_lsn. */ + write_lsn_offset.store(new_buf_free, std::memory_order_relaxed); + base_lsn.store(base, std::memory_order_release); if (new_buf_free) { @@ -1039,12 +1049,13 @@ the current LSN are generated. */ MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - new_buf_free); buf[length]= 0; /* allow recovery to catch EOF faster */ + if (UNIV_LIKELY_NULL(re_write_buf)) + MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - + new_buf_free); length&= ~write_size_1; memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15); if (UNIV_LIKELY_NULL(re_write_buf)) { - MEM_MAKE_DEFINED(re_write_buf + length, (write_size_1 + 1) - - new_buf_free); memcpy_aligned<16>(resize_flush_buf, re_write_buf + length, (new_buf_free + 15) & ~15); re_write_buf[length + new_buf_free]= 0; @@ -1057,7 +1068,9 @@ std::swap(resize_buf, resize_flush_buf); } + ut_ad(base + (write_lsn_offset & (WRITE_TO_BUF - 1)) == lsn); write_to_log++; + if (resizing != RETAIN_LATCH) latch.wr_unlock(); @@ -1101,7 +1114,7 @@ @retval 0 if there are no pending callbacks on flush_lock or there is another group commit lead. 
*/ -static lsn_t log_flush(lsn_t lsn) +static lsn_t log_flush(lsn_t lsn) noexcept { ut_ad(!log_sys.is_mmap()); ut_a(log_sys.flush(lsn)); @@ -1120,7 +1133,7 @@ void log_write_up_to(lsn_t lsn, bool durable, const completion_callback *callback) noexcept { - ut_ad(!srv_read_only_mode || log_sys.buf_free_ok()); + ut_ad(!srv_read_only_mode); ut_ad(lsn != LSN_MAX); ut_ad(lsn != 0); ut_ad(!log_sys.is_mmap() || !callback || durable); @@ -1133,8 +1146,6 @@ return; } - ut_ad(lsn <= log_sys.get_lsn()); - #ifdef HAVE_PMEM if (log_sys.is_mmap()) { @@ -1151,10 +1162,10 @@ if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) return; /* Promise to other concurrent flush_lock.acquire() that we - will durable at least up to the current LSN. The LSN may still - advance until we acquire log_sys.latch below. */ - lsn= log_sys.get_lsn(); - flush_lock.set_pending(lsn); + will be durable at least up to the current LSN. The LSN may still + advance when we acquire log_sys.latch below. */ + if (lsn > log_sys.get_flushed_lsn()) + flush_lock.set_pending(lsn); } lsn_t pending_write_lsn= 0, pending_flush_lsn= 0; @@ -1190,42 +1201,50 @@ return log_sys.write_buf(); } -void log_t::writer_update() noexcept +void log_t::writer_update(bool resizing) noexcept { ut_ad(latch_have_wr()); - writer= resize_in_progress() ? log_writer_resizing : log_writer; + ut_ad(resizing == (resize_in_progress() > 1)); + writer= resizing ? log_writer_resizing : log_writer; mtr_t::finisher_update(); } /** Write to the log file up to the last log entry. @param durable whether to wait for a durable write to complete */ -void log_buffer_flush_to_disk(bool durable) +void log_buffer_flush_to_disk(bool durable) noexcept { - log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable); + log_write_up_to(log_get_lsn(), durable); } /** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. 
*/ -ATTRIBUTE_COLD void log_write_and_flush_prepare() +ATTRIBUTE_COLD void log_write_and_flush_prepare() noexcept { #ifdef HAVE_PMEM if (log_sys.is_mmap()) return; #endif - while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (flush_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); - while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) != + while (write_lock.acquire(log_get_lsn() + 1, nullptr) != group_commit_lock::ACQUIRED); } -void log_t::clear_mmap() +void log_t::clear_mmap() noexcept { - if (!is_mmap() || + if (!is_mmap() || high_level_read_only) + return; #ifdef HAVE_PMEM - !is_opened() || -#endif - high_level_read_only) + if (!is_opened()) + { + ut_d(latch.wr_lock(SRW_LOCK_CALL)); + ut_ad(!resize_in_progress()); + ut_ad(get_lsn() == get_flushed_lsn(std::memory_order_relaxed)); + ut_d(latch.wr_unlock()); return; + } +#endif + log_resize_acquire(); ut_ad(!resize_in_progress()); ut_ad(write_lsn == get_lsn()); @@ -1235,10 +1254,10 @@ { alignas(16) byte log_block[4096]; const size_t bs{write_size}; - const size_t bf{buf_free.load(std::memory_order_relaxed)}; { - byte *const b= buf; - memcpy_aligned<16>(log_block, b + (bf & ~(bs - 1)), bs); + const size_t bf= + size_t(write_lsn - base_lsn.load(std::memory_order_relaxed)); + memcpy_aligned<16>(log_block, buf + (bf & ~(bs - 1)), bs); } close_file(false); @@ -1246,14 +1265,13 @@ ut_a(attach(log, file_size)); ut_ad(!is_mmap()); - set_buf_free(bf & (bs - 1)); - memcpy_aligned<16>(log_sys.buf, log_block, bs); + memcpy_aligned<16>(buf, log_block, bs); } log_resize_release(); } /** Durably write the log up to log_sys.get_lsn(). */ -ATTRIBUTE_COLD void log_write_and_flush() +ATTRIBUTE_COLD void log_write_and_flush() noexcept { ut_ad(!srv_read_only_mode); #ifdef HAVE_PMEM @@ -1273,17 +1291,17 @@ that a new log entry can be catenated without an immediate need for a checkpoint. NOTE: this function may only be called if the calling thread owns no synchronization objects! 
*/ -ATTRIBUTE_COLD static void log_checkpoint_margin() +ATTRIBUTE_COLD static void log_checkpoint_margin() noexcept { while (log_sys.check_for_checkpoint()) { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); ut_ad(!recv_no_log_write); if (!log_sys.check_for_checkpoint()) { func_exit: - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); return; } @@ -1301,7 +1319,7 @@ } DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto skip_checkpoint;); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); /* We must wait to prevent the tail of the log overwriting the head. */ buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20))); @@ -1313,7 +1331,7 @@ /** Wait for a log checkpoint if needed. NOTE that this function may only be called while not holding any synchronization objects except dict_sys.latch. */ -void log_free_check() +void log_free_check() noexcept { ut_ad(!lock_sys.is_holder()); if (log_sys.check_for_checkpoint()) @@ -1323,10 +1341,14 @@ } } -extern void buf_resize_shutdown(); +#ifdef __linux__ +extern void buf_mem_pressure_shutdown() noexcept; +#else +inline void buf_mem_pressure_shutdown() noexcept {} +#endif /** Make a checkpoint at the latest lsn on shutdown. */ -ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() noexcept { lsn_t lsn; ulint count = 0; @@ -1341,8 +1363,7 @@ srv_master_timer.reset(); } - /* Wait for the end of the buffer resize task.*/ - buf_resize_shutdown(); + buf_mem_pressure_shutdown(); dict_stats_shutdown(); btr_defragment_shutdown(); @@ -1464,7 +1485,7 @@ ? 
SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT; - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); lsn = log_sys.get_lsn(); @@ -1472,7 +1493,7 @@ && lsn != log_sys.last_checkpoint_lsn + sizeof_cp; ut_ad(lsn >= log_sys.last_checkpoint_lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); if (lsn_changed) { goto loop; @@ -1490,7 +1511,7 @@ "Free innodb buffer pool"); ut_d(buf_pool.assert_all_freed()); - ut_a(lsn == log_sys.get_lsn() + ut_a(lsn == log_get_lsn() || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); if (UNIV_UNLIKELY(lsn < recv_sys.lsn)) { @@ -1504,7 +1525,7 @@ /* Make some checks that the server really is quiet */ ut_ad(!srv_any_background_activity()); - ut_a(lsn == log_sys.get_lsn() + ut_a(lsn == log_get_lsn() || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); } @@ -1515,44 +1536,42 @@ /*======*/ FILE* file) /*!< in: file where to print */ { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); const lsn_t lsn= log_sys.get_lsn(); mysql_mutex_lock(&buf_pool.flush_list_mutex); const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn); mysql_mutex_unlock(&buf_pool.flush_list_mutex); + const lsn_t flushed_lsn{log_sys.get_flushed_lsn()}; + const lsn_t checkpoint_lsn{log_sys.last_checkpoint_lsn}; + log_sys.latch.wr_unlock(); fprintf(file, "Log sequence number " LSN_PF "\n" "Log flushed up to " LSN_PF "\n" "Pages flushed up to " LSN_PF "\n" "Last checkpoint at " LSN_PF "\n", - lsn, - log_sys.get_flushed_lsn(), - pages_flushed, - lsn_t{log_sys.last_checkpoint_lsn}); - - log_sys.latch.rd_unlock(); + lsn, flushed_lsn, pages_flushed, checkpoint_lsn); } /** Shut down the redo log subsystem. 
*/ void log_t::close() { ut_ad(this == &log_sys); - ut_ad(!(buf_free & buf_free_LOCK)); if (!is_initialised()) return; close_file(); ut_ad(!checkpoint_buf); ut_ad(!buf); ut_ad(!flush_buf); + base_lsn.store(0, std::memory_order_relaxed); latch.destroy(); - lsn_lock.destroy(); +#ifdef HAVE_PMEM + resize_wrap_mutex.destroy(); +#endif recv_sys.close(); - - max_buf_free= 0; } std::string get_log_file_path(const char *filename) diff -Nru mariadb-10.11.11/storage/innobase/log/log0recv.cc mariadb-10.11.13/storage/innobase/log/log0recv.cc --- mariadb-10.11.11/storage/innobase/log/log0recv.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/log/log0recv.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1266,6 +1266,13 @@ } else if (p.second // the first FILE_MODIFY or FILE_RENAME || f.name != fname.name) { reload: + if (f.name.size() == 0) { + /* Augment the recv_spaces.emplace_hint() for the + FILE_MODIFY record that had been added by + recv_sys_t::parse() */ + f.name = fname.name; + } + fil_space_t* space; /* Check if the tablespace file exists and contains @@ -1466,6 +1473,7 @@ mysql_mutex_lock(&mutex); recovery_on= false; + recv_needed_recovery= false; pages.clear(); pages_it= pages.end(); @@ -1473,7 +1481,6 @@ log_sys.clear_mmap(); } - /** Free a redo log snippet. @param data buffer allocated in add() */ inline void recv_sys_t::free(const void *data) @@ -1481,34 +1488,18 @@ ut_ad(!ut_align_offset(data, ALIGNMENT)); mysql_mutex_assert_owner(&mutex); - /* MDEV-14481 FIXME: To prevent race condition with buf_pool.resize(), - we must acquire and hold the buffer pool mutex here. 
*/ - ut_ad(!buf_pool.resize_in_progress()); - - auto *chunk= buf_pool.chunks; - for (auto i= buf_pool.n_chunks; i--; chunk++) + buf_block_t *block= buf_pool.block_from(data); + ut_ad(block->page.frame == page_align(data)); + ut_ad(block->page.state() == buf_page_t::MEMORY); + ut_ad(uint16_t(block->page.free_offset - 1) < srv_page_size); + ut_ad(block->page.used_records); + if (!--block->page.used_records) { - if (data < chunk->blocks->page.frame) - continue; - const size_t offs= (reinterpret_cast(data) - - chunk->blocks->page.frame) >> srv_page_size_shift; - if (offs >= chunk->size) - continue; - buf_block_t *block= &chunk->blocks[offs]; - ut_ad(block->page.frame == page_align(data)); - ut_ad(block->page.state() == buf_page_t::MEMORY); - ut_ad(uint16_t(block->page.free_offset - 1) < srv_page_size); - ut_ad(block->page.used_records); - if (!--block->page.used_records) - { - block->page.hash= nullptr; - UT_LIST_REMOVE(blocks, block); - MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); - buf_block_free(block); - } - return; + block->page.hash= nullptr; + UT_LIST_REMOVE(blocks, block); + MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); + buf_block_free(block); } - ut_ad(0); } @@ -2057,12 +2048,13 @@ { mysql_mutex_unlock(&mutex); os_aio_wait_until_no_pending_reads(false); + os_aio_wait_until_no_pending_writes(false); mysql_mutex_lock(&mutex); garbage_collect(); mysql_mutex_lock(&buf_pool.mutex); - bool need_more= UT_LIST_GET_LEN(buf_pool.free) < pages; + const size_t available= UT_LIST_GET_LEN(buf_pool.free); mysql_mutex_unlock(&buf_pool.mutex); - if (need_more) + if (available < pages) buf_flush_sync_batch(lsn); } @@ -2507,9 +2499,11 @@ ut_ad(log_sys.is_latest()); alignas(8) byte iv[MY_AES_BLOCK_SIZE]; - byte *decrypt_buf= storing != BACKUP - ? static_cast(alloca(srv_page_size)) : nullptr; - + byte *decrypt_buf= + static_cast(alloca(storing == BACKUP + ? 
1/*type,length*/ + 5/*space_id*/ + + 5/*page_no*/ + 1/*rlen*/ + : srv_page_size)); const lsn_t start_lsn{lsn}; /* Check that the entire mini-transaction is included within the buffer */ @@ -2599,7 +2593,10 @@ ut_d(std::set modified); #endif - uint32_t space_id= 0, page_no= 0, last_offset= 0; + uint32_t space_id= 0, page_no= 0; + /* The end offset the last write (always 0 in storing==BACKUP). + The value 1 means that no "same page" record is allowed. */ + uint last_offset= 0; bool got_page_op= false; for (l= begin;; l+= rlen) @@ -2712,8 +2709,7 @@ { mach_write_to_4(iv + 8, space_id); mach_write_to_4(iv + 12, page_no); - byte eb[1/*type,length*/ + 5/*space_id*/ + 5/*page_no*/ + 1/*rlen*/]; - if (*l.copy_if_needed(iv, eb, recs, 1) == TRIM_PAGES) + if (*l.copy_if_needed(iv, decrypt_buf, recs, 1) == TRIM_PAGES) undo_space_trunc(space_id); } continue; @@ -2726,8 +2722,8 @@ if (i != recv_spaces.end() && i->first == space_id); else if (lsn < file_checkpoint) /* We have not seen all records between the checkpoint and - FILE_CHECKPOINT. There should be a FILE_DELETE for this - tablespace later. */ + FILE_CHECKPOINT. There should be a FILE_DELETE or FILE_MODIFY + for this tablespace later, to be handled in fil_name_process(). 
*/ recv_spaces.emplace_hint(i, space_id, file_name_t("", false)); else { @@ -2762,10 +2758,10 @@ case FREE_PAGE: ut_ad(freed.emplace(id).second); /* the next record must not be same_page */ - last_offset= 1; + if (storing != BACKUP) last_offset= 1; goto free_or_init_page; case INIT_PAGE: - last_offset= FIL_PAGE_TYPE; + if (storing != BACKUP) last_offset= FIL_PAGE_TYPE; free_or_init_page: if (UNIV_UNLIKELY(rlen != 0)) goto record_corrupted; @@ -2797,7 +2793,8 @@ erase(r); continue; } - cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + if (storing == YES) + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); break; case EXTENDED: if (storing == NO) @@ -2811,7 +2808,8 @@ continue; if (UNIV_UNLIKELY(!rlen)) goto record_corrupted; - cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + if (storing == YES || rlen == 1) + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); if (rlen == 1 && *cl == TRIM_PAGES) { if (!srv_is_undo_tablespace(space_id) || @@ -2825,7 +2823,7 @@ truncated_undo_spaces[space_id - srv_undo_space_id_start]= { start_lsn, page_no }; /* the next record must not be same_page */ - last_offset= 1; + if (storing != BACKUP) last_offset= 1; if (undo_space_trunc) undo_space_trunc(space_id); continue; @@ -2833,7 +2831,7 @@ /* This record applies to an undo log or index page, and it may be followed by subsequent WRITE or similar records for the same page in the same mini-transaction. */ - last_offset= FIL_PAGE_TYPE; + if (storing != BACKUP) last_offset= FIL_PAGE_TYPE; break; case OPTION: /* OPTION records can be safely ignored in recovery */ @@ -2850,6 +2848,8 @@ case WRITE: case MEMMOVE: case MEMSET: + if (storing == BACKUP) + continue; if (storing == NO && UNIV_LIKELY(page_no != 0)) /* fil_space_set_recv_size_and_flags() is mandatory for storing==NO. It is only applicable to page_no == 0. 
Other than that, we can just @@ -2979,7 +2979,7 @@ l - recs + rlen))) { lsn= start_lsn; - if (lsn > log_sys.get_lsn()) + if (lsn > log_sys.get_flushed_lsn(std::memory_order_relaxed)) log_sys.set_recovered_lsn(start_lsn); l+= rlen; offset= begin.ptr - log_sys.buf; @@ -3566,13 +3566,14 @@ } else { + const lsn_t end{std::max(recv_sys.scanned_lsn, recv_sys.file_checkpoint)}; sql_print_information("InnoDB: To recover: LSN " LSN_PF "/" LSN_PF "; %zu pages", - recv_sys.lsn, recv_sys.scanned_lsn, n); + recv_sys.lsn, end, n); service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: LSN " LSN_PF "/" LSN_PF "; %zu pages", - recv_sys.lsn, recv_sys.scanned_lsn, n); + recv_sys.lsn, end, n); } } @@ -4113,8 +4114,8 @@ {log_sys.buf + recv_sys.len, size})) { mysql_mutex_unlock(&recv_sys.mutex); - ib::error() << "Failed to read log at " << source_offset - << ": " << err; + sql_print_error("InnoDB: Failed to read log at %" PRIu64 ": %s", + source_offset, ut_strerr(err)); recv_sys.set_corrupt_log(); mysql_mutex_lock(&recv_sys.mutex); } @@ -4294,7 +4295,7 @@ break; case SRV_OPERATION_RESTORE: case SRV_OPERATION_RESTORE_EXPORT: - if (i->second.name.find("/#sql") != std::string::npos) { + if (i->second.name.find("/#sql") == std::string::npos) { sql_print_warning("InnoDB: Tablespace " UINT32PF " was not found at %.*s when" " restoring a (partial?) backup." 
@@ -4588,19 +4589,19 @@ inline void log_t::set_recovered() noexcept { ut_ad(get_flushed_lsn() == get_lsn()); - ut_ad(recv_sys.lsn == get_lsn()); - size_t offset{recv_sys.offset}; + ut_ad(recv_sys.lsn == get_flushed_lsn()); if (!is_mmap()) { const size_t bs{log_sys.write_size}, bs_1{bs - 1}; - memmove_aligned<512>(buf, buf + (offset & ~bs_1), bs); - offset&= bs_1; + memmove_aligned<512>(buf, buf + (recv_sys.offset & ~bs_1), bs); } -#ifndef _WIN32 +#ifdef HAVE_PMEM else + { + buf_size= unsigned(std::min(capacity(), buf_size_max)); mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); + } #endif - set_buf_free(offset); } inline bool recv_sys_t::validate_checkpoint() const noexcept @@ -4674,7 +4675,7 @@ goto err_exit; } ut_ad(recv_sys.file_checkpoint); - ut_ad(log_sys.get_lsn() >= recv_sys.scanned_lsn); + ut_ad(log_sys.get_flushed_lsn() >= recv_sys.scanned_lsn); if (rewind) { recv_sys.lsn = log_sys.next_checkpoint_lsn; recv_sys.offset = 0; @@ -4736,7 +4737,7 @@ tablespaces (not individual pages), while retaining the initial recv_sys.pages. */ mysql_mutex_lock(&recv_sys.mutex); - ut_ad(log_sys.get_lsn() >= recv_sys.lsn); + ut_ad(log_sys.get_flushed_lsn() >= recv_sys.lsn); recv_sys.clear(); recv_sys.lsn = log_sys.next_checkpoint_lsn; mysql_mutex_unlock(&recv_sys.mutex); @@ -4744,7 +4745,8 @@ if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { mysql_mutex_lock(&recv_sys.mutex); - deferred_spaces.deferred_dblwr(log_sys.get_lsn()); + deferred_spaces.deferred_dblwr( + log_sys.get_flushed_lsn()); buf_dblwr.recover(); mysql_mutex_unlock(&recv_sys.mutex); } @@ -4777,16 +4779,6 @@ if (!srv_read_only_mode && log_sys.is_latest()) { log_sys.set_recovered(); - if (recv_needed_recovery - && srv_operation <= SRV_OPERATION_EXPORT_RESTORED - && recv_sys.lsn - log_sys.next_checkpoint_lsn - < log_sys.log_capacity) { - /* Write a FILE_CHECKPOINT marker as the first thing, - before generating any other redo log. 
This ensures - that subsequent crash recovery will be possible even - if the server were killed soon after this. */ - fil_names_clear(log_sys.next_checkpoint_lsn); - } } DBUG_EXECUTE_IF("before_final_redo_apply", goto err_exit;); @@ -4892,28 +4884,43 @@ goto check_if_corrupted; } -byte *recv_dblwr_t::find_encrypted_page(const fil_node_t &node, - uint32_t page_no, - byte *buf) noexcept +ATTRIBUTE_COLD +byte *recv_dblwr_t::find_deferred_page(const fil_node_t &node, + uint32_t page_no, + byte *buf) noexcept { - ut_ad(node.space->crypt_data); ut_ad(node.space->full_crc32()); mysql_mutex_lock(&recv_sys.mutex); byte *result_page= nullptr; + bool is_encrypted= node.space->crypt_data && + node.space->crypt_data->is_encrypted(); for (list::iterator page_it= pages.begin(); page_it != pages.end(); page_it++) { if (page_get_page_no(*page_it) != page_no || buf_page_is_corrupted(true, *page_it, node.space->flags)) continue; + + if (is_encrypted && + !mach_read_from_4(*page_it + FIL_PAGE_FCRC32_KEY_VERSION)) + continue; + memcpy(buf, *page_it, node.space->physical_size()); buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve(false); ut_a(slot); slot->allocate(); - bool invalidate= - !fil_space_decrypt(node.space, slot->crypt_buf, buf) || - (node.space->is_compressed() && - !fil_page_decompress(slot->crypt_buf, buf, node.space->flags)); + + bool invalidate= false; + if (is_encrypted) + { + invalidate= !fil_space_decrypt(node.space, slot->crypt_buf, buf); + if (!invalidate && node.space->is_compressed()) + goto decompress; + } + else +decompress: + invalidate= !fil_page_decompress(slot->crypt_buf, buf, + node.space->flags); slot->release(); if (invalidate || diff -Nru mariadb-10.11.11/storage/innobase/mtr/mtr0mtr.cc mariadb-10.11.13/storage/innobase/mtr/mtr0mtr.cc --- mariadb-10.11.11/storage/innobase/mtr/mtr0mtr.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/mtr/mtr0mtr.cc 2025-05-19 16:14:25.000000000 +0000 @@ -44,7 +44,6 @@ #endif std::pair 
(*mtr_t::finisher)(mtr_t *, size_t); -unsigned mtr_t::spin_wait_delay; void mtr_t::finisher_update() { @@ -53,15 +52,12 @@ if (log_sys.is_mmap()) { commit_logger= mtr_t::commit_log; - finisher= spin_wait_delay - ? mtr_t::finish_writer : mtr_t::finish_writer; + finisher= mtr_t::finish_writer; return; } commit_logger= mtr_t::commit_log; #endif - finisher= - (spin_wait_delay - ? mtr_t::finish_writer : mtr_t::finish_writer); + finisher= mtr_t::finish_writer; } void mtr_memo_slot_t::release() const @@ -169,7 +165,7 @@ else flush_list_bytes+= block->physical_size(); - ut_ad(flush_list_bytes <= curr_pool_size); + ut_ad(flush_list_bytes <= size_in_bytes); if (prev) UT_LIST_INSERT_AFTER(flush_list, prev, &block->page); @@ -257,7 +253,7 @@ { if (block->page.oldest_modification() <= 1) { - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); /* For unlogged mtrs (MTR_LOG_NO_REDO), we use the current system LSN. The mtr that generated the LSN is either already committed or in mtr_t::commit. Shared latch and relaxed atomics should be fine here as it is guaranteed @@ -269,7 +265,7 @@ mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_pool.insert_into_flush_list (buf_pool.prepare_insert_into_flush_list(lsn), block, lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); mysql_mutex_unlock(&buf_pool.flush_list_mutex); } } @@ -339,24 +335,11 @@ m_memo.clear(); } -inline lsn_t log_t::get_write_target() const -{ - ut_ad(latch_have_any()); - if (UNIV_LIKELY(buf_free_ok())) - return 0; - /* The LSN corresponding to the end of buf is - write_lsn - (first_lsn & 4095) + buf_free, - but we use simpler arithmetics to return a smaller write target in - order to minimize waiting in log_write_up_to(). */ - ut_ad(max_buf_free >= 4096 * 4); - return write_lsn + max_buf_free / 2; -} - template void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) + noexcept { size_t modified= 0; - const lsn_t write_lsn= mmap ? 
0 : log_sys.get_write_target(); if (mtr->m_made_dirty) { @@ -475,9 +458,6 @@ if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); - - if (!mmap && UNIV_UNLIKELY(write_lsn != 0)) - log_write_up_to(write_lsn, false); } /** Commit a mini-transaction. */ @@ -690,7 +670,7 @@ /* We will not encrypt any FILE_ records, but we will reserve a nonce at the end. */ size+= 8; - m_commit_lsn= log_sys.get_lsn(); + m_commit_lsn= log_sys.get_flushed_lsn(); } else m_commit_lsn= 0; @@ -775,7 +755,7 @@ /* We will not encrypt any FILE_ records, but we will reserve a nonce at the end. */ size+= 8; - m_commit_lsn= log_sys.get_lsn(); + m_commit_lsn= log_sys.get_flushed_lsn(); } else m_commit_lsn= 0; @@ -897,181 +877,109 @@ ? ". Shutdown is in progress" : ""); } -static ATTRIBUTE_NOINLINE void lsn_delay(size_t delay, size_t mult) noexcept +ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept { - delay*= mult * 2; // GCC 13.2.0 -O2 targeting AMD64 wants to unroll twice - HMT_low(); - do - MY_RELAX_CPU(); - while (--delay); - HMT_medium(); -} - -#if defined __clang_major__ && __clang_major__ < 10 -/* Only clang-10 introduced support for asm goto */ -#elif defined __APPLE__ -/* At least some versions of Apple Xcode do not support asm goto */ -#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) -# if SIZEOF_SIZE_T == 8 -# define LOCK_TSET \ - __asm__ goto("lock btsq $63, %0\n\t" "jnc %l1" \ - : : "m"(buf_free) : "cc", "memory" : got) -# else -# define LOCK_TSET \ - __asm__ goto("lock btsl $31, %0\n\t" "jnc %l1" \ - : : "m"(buf_free) : "cc", "memory" : got) -# endif -#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) -# if SIZEOF_SIZE_T == 8 -# define LOCK_TSET \ - if (!_interlockedbittestandset64 \ - (reinterpret_cast(&buf_free), 63)) return -# else -# define LOCK_TSET \ - if (!_interlockedbittestandset \ - (reinterpret_cast(&buf_free), 31)) return -# endif -#endif - -#ifdef 
LOCK_TSET -ATTRIBUTE_NOINLINE -void log_t::lsn_lock_bts() noexcept -{ - LOCK_TSET; - { - const size_t m= mtr_t::spin_wait_delay; - constexpr size_t DELAY= 10, MAX_ITERATIONS= 10; - for (size_t delay_count= DELAY, delay_iterations= 1;; - lsn_delay(delay_iterations, m)) + if (UNIV_LIKELY(!ex)) + { + latch.rd_unlock(); + if (!late) { - if (!(buf_free.load(std::memory_order_relaxed) & buf_free_LOCK)) - LOCK_TSET; - if (!delay_count); - else if (delay_iterations < MAX_ITERATIONS) - delay_count= DELAY, delay_iterations++; - else - delay_count--; + /* Wait for all threads to back off. */ + latch.wr_lock(SRW_LOCK_CALL); + goto got_ex; } - } -# ifdef __GNUC__ - got: - return; -# endif -} + const auto delay= my_cpu_relax_multiplier / 4 * srv_spin_wait_delay; + const auto rounds= srv_n_spin_wait_rounds; -inline -#else -ATTRIBUTE_NOINLINE -#endif -size_t log_t::lock_lsn() noexcept -{ -#ifdef LOCK_TSET - lsn_lock_bts(); - return ~buf_free_LOCK & buf_free.load(std::memory_order_relaxed); -# undef LOCK_TSET -#else - size_t b= buf_free.fetch_or(buf_free_LOCK, std::memory_order_acquire); - if (b & buf_free_LOCK) - { - const size_t m= mtr_t::spin_wait_delay; - constexpr size_t DELAY= 10, MAX_ITERATIONS= 10; - for (size_t delay_count= DELAY, delay_iterations= 1; - ((b= buf_free.load(std::memory_order_relaxed)) & buf_free_LOCK) || - (buf_free_LOCK & (b= buf_free.fetch_or(buf_free_LOCK, - std::memory_order_acquire))); - lsn_delay(delay_iterations, m)) - if (!delay_count); - else if (delay_iterations < MAX_ITERATIONS) - delay_count= DELAY, delay_iterations++; - else - delay_count--; + for (;;) + { + HMT_low(); + for (auto r= rounds + 1; r--; ) + { + if (write_lsn_offset.load(std::memory_order_relaxed) & WRITE_BACKOFF) + { + for (auto d= delay; d--; ) + MY_RELAX_CPU(); + } + else + { + HMT_medium(); + goto done; + } + } + HMT_medium(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } } - return b; -#endif -} - -template -ATTRIBUTE_COLD size_t 
log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn) - noexcept -{ - waits++; - ut_ad(buf_free.load(std::memory_order_relaxed) == - (spin ? (b | buf_free_LOCK) : b)); - if (spin) - buf_free.store(b, std::memory_order_release); else - lsn_lock.wr_unlock(); - - if (ex) + { + got_ex: + const uint64_t l= write_lsn_offset.load(std::memory_order_relaxed); + const lsn_t lsn= base_lsn.load(std::memory_order_relaxed) + + (l & (WRITE_BACKOFF - 1)); + waits++; +#ifdef HAVE_PMEM + const bool is_pmem{is_mmap()}; + if (is_pmem) + { + ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity()); + persist(lsn); + } +#endif latch.wr_unlock(); - else - latch.rd_unlock(); - - log_write_up_to(lsn, is_mmap()); - - if (ex) - latch.wr_lock(SRW_LOCK_CALL); - else - latch.rd_lock(SRW_LOCK_CALL); - - if (spin) - return lock_lsn(); + /* write_buf() or persist() will clear the WRITE_BACKOFF flag, + which our caller will recheck. */ +#ifdef HAVE_PMEM + if (!is_pmem) +#endif + log_write_up_to(lsn, false); + if (ex) + { + latch.wr_lock(SRW_LOCK_CALL); + return; + } + } - lsn_lock.wr_lock(); - return buf_free.load(std::memory_order_relaxed); +done: + latch.rd_lock(SRW_LOCK_CALL); } /** Reserve space in the log buffer for appending data. -@tparam spin whether to use the spin-only lock_lsn() @tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ -template +template inline std::pair log_t::append_prepare(size_t size, bool ex) noexcept { ut_ad(ex ? latch_have_wr() : latch_have_rd()); ut_ad(mmap == is_mmap()); - if (!spin) - lsn_lock.wr_lock(); - size_t b{spin ? lock_lsn() : buf_free.load(std::memory_order_relaxed)}; - write_to_buf++; - - lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size}; - - if (UNIV_UNLIKELY(mmap - ? 
(end_lsn - - get_flushed_lsn(std::memory_order_relaxed)) > capacity() - : b + size >= buf_size)) - { - b= append_prepare_wait(b, ex, l); - /* While flushing log, we had released the lsn lock and LSN could have - progressed in the meantime. */ - l= lsn.load(std::memory_order_relaxed); - end_lsn= l + size; - } - - size_t new_buf_free= b + size; - if (mmap && new_buf_free >= file_size) - new_buf_free-= size_t(capacity()); + ut_ad(!mmap || buf_size == std::min(capacity(), buf_size_max)); + const size_t buf_size{this->buf_size - size}; + uint64_t l; + static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, ""); + while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) & + (WRITE_TO_BUF - 1)) >= buf_size)) + { + /* The following is inlined here instead of being part of + append_prepare_wait(), in order to increase the locality of reference + and to set the WRITE_BACKOFF flag as soon as possible. */ + bool late(write_lsn_offset.fetch_or(WRITE_BACKOFF) & WRITE_BACKOFF); + /* Subtract our LSN overshoot. */ + write_lsn_offset.fetch_sub(size); + append_prepare_wait(late, ex); + } - lsn.store(end_lsn, std::memory_order_relaxed); + const lsn_t lsn{l + base_lsn.load(std::memory_order_relaxed)}, + end_lsn{lsn + size}; if (UNIV_UNLIKELY(end_lsn >= last_checkpoint_lsn + log_capacity)) set_check_for_checkpoint(true); - byte *our_buf= buf; - if (spin) - buf_free.store(new_buf_free, std::memory_order_release); - else - { - buf_free.store(new_buf_free, std::memory_order_relaxed); - lsn_lock.wr_unlock(); - } - - return {l, our_buf + b}; + return {lsn, + buf + size_t(mmap ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)}; } /** Finish appending data to the log. 
@@ -1216,7 +1124,7 @@ if (!resize_flush_buf) { ut_ad(is_mmap()); - lsn_lock.wr_lock(); + resize_wrap_mutex.wr_lock(); const size_t resize_capacity{resize_target - START_OFFSET}; { const lsn_t resizing{resize_in_progress()}; @@ -1227,7 +1135,7 @@ if (UNIV_UNLIKELY(lsn < resizing)) { /* This function may execute in multiple concurrent threads - that hold a shared log_sys.latch. Before we got lsn_lock, + that hold a shared log_sys.latch. Before we got resize_wrap_mutex, another thread could have executed resize_lsn.store(lsn) below with a larger lsn than ours. @@ -1277,7 +1185,7 @@ ut_ad(resize_buf[s] <= 1); resize_buf[s]= 1; mmap_done: - lsn_lock.wr_unlock(); + resize_wrap_mutex.wr_unlock(); } else #endif @@ -1304,7 +1212,7 @@ d+= size; } -template +template std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len) { @@ -1315,7 +1223,7 @@ const size_t size{mtr->m_commit_lsn ? 5U + 8U : 5U}; std::pair start= - log_sys.append_prepare(len, mtr->m_latch_ex); + log_sys.append_prepare(len, mtr->m_latch_ex); if (!mmap) { diff -Nru mariadb-10.11.11/storage/innobase/os/os0file.cc mariadb-10.11.13/storage/innobase/os/os0file.cc --- mariadb-10.11.11/storage/innobase/os/os0file.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/os/os0file.cc 2025-05-19 16:14:25.000000000 +0000 @@ -2314,8 +2314,20 @@ ut_ad(exists); #endif /* UNIV_DEBUG */ - if (MoveFileEx(oldpath, newpath, MOVEFILE_REPLACE_EXISTING)) { - return(true); + for (int retry= 50;; retry--){ + if (MoveFileEx(oldpath, newpath, MOVEFILE_REPLACE_EXISTING)) + return true; + + if (!retry) + break; + + if (GetLastError() != ERROR_SHARING_VIOLATION) + break; + + // oldpath was opened by someone else (antivirus?) + //without FILE_SHARE_DELETE flag. 
Retry operation + + Sleep(10); } os_file_handle_rename_error(oldpath, newpath); @@ -3357,6 +3369,12 @@ return pending; } +/** @return approximate number of pending writes */ +size_t os_aio_pending_writes_approx() noexcept +{ + return write_slots->pending_io_count(); +} + /** Wait until all pending asynchronous reads have completed. @param declare whether the wait will be declared in tpool */ void os_aio_wait_until_no_pending_reads(bool declare) noexcept diff -Nru mariadb-10.11.11/storage/innobase/pars/pars0pars.cc mariadb-10.11.13/storage/innobase/pars/pars0pars.cc --- mariadb-10.11.11/storage/innobase/pars/pars0pars.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/pars/pars0pars.cc 2025-05-19 16:14:25.000000000 +0000 @@ -783,11 +783,6 @@ { ulint count = 0; - if (sym_node == NULL) { - - return(count); - } - while (sym_node) { pars_retrieve_table_def(sym_node); diff -Nru mariadb-10.11.11/storage/innobase/row/row0ins.cc mariadb-10.11.13/storage/innobase/row/row0ins.cc --- mariadb-10.11.11/storage/innobase/row/row0ins.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0ins.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1955,7 +1955,7 @@ TRUE, foreign, table, ref_tuple, thr); if (ref_table) { - dict_table_close(ref_table); + ref_table->release(); } } } @@ -2580,12 +2580,44 @@ } } -#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ -/* Avoid GCC 4.8.5 internal compiler error due to srw_mutex::wr_unlock(). -We would only need this for row_ins_clust_index_entry_low(), -but GCC 4.8.5 does not support pop_options. */ -# pragma GCC optimize ("O0") -#endif +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. 
If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param data data to read +@param len length of data +@param mtype main type of the column +@param prtype precise type of the column +@return the integer value from the data +@retval 0 if the value is negative or the type or length invalid */ +static uint64_t row_parse_int(const byte *data, size_t len, + ulint mtype, ulint prtype) noexcept +{ + switch (mtype) { + case DATA_FLOAT: + if (len != sizeof(float)) + return 0; + { + float f= mach_float_read(data); + return f <= 0.0 ? 0 : uint64_t(f); + } + case DATA_DOUBLE: + if (len != sizeof(double)) + return 0; + { + double d= mach_double_read(data); + return d <= 0.0 ? 0 : uint64_t(d); + } + case DATA_INT: + if (len == 0 || len > 8) + return 0; + const ibool unsigned_type{prtype & DATA_UNSIGNED}; + uint64_t value= mach_read_int_type(data, len, unsigned_type); + return !unsigned_type && int64_t(value) < 0 ? 0 : value; + } + + ut_ad("invalid type" == 0); + return 0; +} /***************************************************************//** Tries to insert an entry into a clustered index, ignoring foreign key @@ -2672,8 +2704,7 @@ dfield->data), dfield->len, dfield->type.mtype, - dfield->type.prtype - & DATA_UNSIGNED); + dfield->type.prtype); if (auto_inc && mode != BTR_MODIFY_TREE) { mode = btr_latch_mode( @@ -2722,6 +2753,12 @@ DBUG_EXECUTE_IF("row_ins_row_level", goto row_level_insert;); +#ifdef WITH_WSREP + /* Appliers never execute bulk insert statements directly. 
*/ + if (trx->is_wsrep() && !wsrep_thd_is_local_transaction(trx->mysql_thd)) + goto row_level_insert; +#endif /* WITH_WSREP */ + if (!(flags & BTR_NO_UNDO_LOG_FLAG) && page_is_empty(block->page.frame) && !entry->is_metadata() && !trx->duplicates @@ -2738,28 +2775,24 @@ && !index->table->has_spatial_index()) { ut_ad(!index->table->skip_alter_undo); - trx->bulk_insert = true; + trx->bulk_insert = TRX_DML_BULK; err = lock_table(index->table, NULL, LOCK_X, thr); if (err != DB_SUCCESS) { trx->error_state = err; - trx->bulk_insert = false; + trx->bulk_insert = TRX_NO_BULK; goto err_exit; } if (index->table->n_rec_locks) { avoid_bulk: - trx->bulk_insert = false; + trx->bulk_insert = TRX_NO_BULK; goto row_level_insert; } #ifdef WITH_WSREP - if (trx->is_wsrep()) + if (trx->is_wsrep() && + wsrep_append_table_key(trx->mysql_thd, *index->table)) { - if (!wsrep_thd_is_local_transaction(trx->mysql_thd)) - goto row_level_insert; - if (wsrep_append_table_key(trx->mysql_thd, *index->table)) - { - trx->error_state = DB_ROLLBACK; - goto err_exit; - } + trx->error_state = DB_ROLLBACK; + goto err_exit; } #endif /* WITH_WSREP */ @@ -2811,7 +2844,7 @@ bulk buffer and doesn't check for constraint validity of foreign key relationship. 
*/ trx_start_if_not_started(trx, true); - trx->bulk_insert = true; + trx->bulk_insert = TRX_DDL_BULK; auto m = trx->mod_tables.emplace(index->table, 0); m.first->second.start_bulk_insert(index->table, true); err = m.first->second.bulk_insert_buffered( @@ -3891,3 +3924,79 @@ return(thr); } + +/** Read the AUTOINC column from an index record +@param index index of the record +@param rec the record +@return value read from the first column +@retval 0 if the value would be NULL or negative */ +static uint64_t row_read_autoinc(const dict_index_t &index, const rec_t *rec) + noexcept +{ + const dict_field_t &field= index.fields[0]; + ut_ad(!DATA_BIG_COL(field.col)); + ut_ad(!(rec_get_info_bits(rec, index.table->not_redundant()) & + (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))); + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_HEADER_SIZE + 2]; + rec_offs_init(offsets_); + rec_offs *offsets= rec_get_offsets(rec, &index, offsets_, + index.n_core_fields, 1, &heap); + ut_ad(!heap); + + size_t len; + ut_d(size_t first_offset=) rec_get_nth_field_offs(offsets, 0, &len); + ut_ad(!first_offset); + return row_parse_int(rec, len, field.col->mtype, field.col->prtype); +} + +/** Get the maximum and non-delete-marked record in an index. 
+@param index index B-tree +@param mtr mini-transaction (may be committed and restarted) +@return maximum record, page s-latched in mtr +@retval nullptr if there are no records, or if all of them are delete-marked */ +static +const rec_t *row_search_get_max_rec(dict_index_t *index, mtr_t *mtr) noexcept +{ + btr_pcur_t pcur; + const bool desc= index->fields[0].descending; + + /* Open at the high/right end (false), and init cursor */ + if (pcur.open_leaf(desc, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) + return nullptr; + + if (desc) + { + const bool comp= index->table->not_redundant(); + while (btr_pcur_move_to_next_user_rec(&pcur, mtr)) + { + const rec_t *rec= btr_pcur_get_rec(&pcur); + if (!rec_is_metadata(rec, comp) && !rec_get_deleted_flag(rec, comp)) + return rec; + } + return nullptr; + } + + do + { + const page_t *page= btr_pcur_get_page(&pcur); + const rec_t *rec= page_find_rec_last_not_deleted(page); + if (page_rec_is_user_rec_low(rec - page)) + return rec; + btr_pcur_move_before_first_on_page(&pcur); + } + while (btr_pcur_move_to_prev(&pcur, mtr)); + + return nullptr; +} + +uint64_t row_search_max_autoinc(dict_index_t *index) noexcept +{ + uint64_t value= 0; + mtr_t mtr; + mtr.start(); + if (const rec_t *rec= row_search_get_max_rec(index, &mtr)) + value= row_read_autoinc(*index, rec); + mtr.commit(); + return value; +} diff -Nru mariadb-10.11.11/storage/innobase/row/row0log.cc mariadb-10.11.13/storage/innobase/row/row0log.cc --- mariadb-10.11.11/storage/innobase/row/row0log.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0log.cc 2025-05-19 16:14:25.000000000 +0000 @@ -4065,21 +4065,20 @@ if (!(this->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { for (ulint i = 0; i < dict_table_get_n_v_cols(table); i++) - dfield_get_type( - dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING; + dfield_get_type(dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING; } + if (table->n_v_cols) + row_upd_replace_vcol(row, table, update, false, 
nullptr, + (cmpl_info & UPD_NODE_NO_ORD_CHANGE) + ? nullptr : undo_rec); + if (is_update) { old_row= dtuple_copy(row, heap); row_upd_replace(old_row, &old_ext, clust_index, update, heap); } - if (table->n_v_cols) - row_upd_replace_vcol(row, table, update, false, nullptr, - (cmpl_info & UPD_NODE_NO_ORD_CHANGE) - ? nullptr : undo_rec); - bool success= true; dict_index_t *index= dict_table_get_next_index(clust_index); while (index) diff -Nru mariadb-10.11.11/storage/innobase/row/row0mysql.cc mariadb-10.11.13/storage/innobase/row/row0mysql.cc --- mariadb-10.11.11/storage/innobase/row/row0mysql.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0mysql.cc 2025-05-19 16:14:25.000000000 +0000 @@ -69,7 +69,7 @@ /** Delay an INSERT, DELETE or UPDATE operation if the purge is lagging. */ -static void row_mysql_delay_if_needed() +static void row_mysql_delay_if_needed() noexcept { const auto delay= srv_dml_needed_delay; if (UNIV_UNLIKELY(delay != 0)) @@ -78,8 +78,8 @@ log_sys.latch.rd_lock(SRW_LOCK_CALL); const lsn_t last= log_sys.last_checkpoint_lsn, max_age= log_sys.max_checkpoint_age; + const lsn_t lsn= log_sys.get_flushed_lsn(); log_sys.latch.rd_unlock(); - const lsn_t lsn= log_sys.get_lsn(); if ((lsn - last) / 4 >= max_age / 5) buf_flush_ahead(last + max_age / 5, false); purge_sys.wake_if_not_active(); @@ -687,8 +687,12 @@ /* MariaDB will roll back the latest SQL statement */ break; } - /* MariaDB will roll back the entire transaction. */ - trx->bulk_insert = false; + /* For DML, InnoDB does partial rollback and clear + bulk buffer in row_mysql_handle_errors(). 
+ For ALTER TABLE ALGORITHM=COPY & CREATE TABLE...SELECT, + the bulk insert transaction will be rolled back inside + ha_innobase::extra(HA_EXTRA_ABORT_ALTER_COPY) */ + trx->bulk_insert &= TRX_DDL_BULK; trx->last_stmt_start = 0; break; case DB_LOCK_WAIT: @@ -981,7 +985,7 @@ rtr_clean_rtr_info(prebuilt->rtr_info, true); } if (prebuilt->table) { - dict_table_close(prebuilt->table); + prebuilt->table->release(); } mem_heap_free(prebuilt->heap); @@ -1599,7 +1603,7 @@ ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED); ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); - ut_ad(table->stat_initialized); + ut_ad(table->stat_initialized()); if (!table->is_readable()) { return row_mysql_get_table_error(trx, table); @@ -2159,11 +2163,9 @@ index = node->index; - ut_ad(!index == (err != DB_SUCCESS)); - que_graph_free((que_t*) que_node_get_parent(thr)); - if (index && (index->type & DICT_FTS)) { + if (err == DB_SUCCESS && (index->type & DICT_FTS)) { err = fts_create_index_tables(trx, index, table->id); } diff -Nru mariadb-10.11.11/storage/innobase/row/row0purge.cc mariadb-10.11.13/storage/innobase/row/row0purge.cc --- mariadb-10.11.11/storage/innobase/row/row0purge.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0purge.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1564,7 +1564,7 @@ case TRX_UNDO_DEL_MARK_REC: purged = row_purge_del_mark(node); if (purged) { - if (node->table->stat_initialized + if (node->table->stat_initialized() && srv_stats_include_delete_marked) { dict_stats_update_if_needed( node->table, *thr->graph->trx); diff -Nru mariadb-10.11.11/storage/innobase/row/row0sel.cc mariadb-10.11.13/storage/innobase/row/row0sel.cc --- mariadb-10.11.11/storage/innobase/row/row0sel.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0sel.cc 2025-05-19 16:14:25.000000000 +0000 @@ -6852,123 +6852,3 @@ goto rec_loop; } - 
-/*******************************************************************//** -Read the AUTOINC column from the current row. If the value is less than -0 and the type is not unsigned then we reset the value to 0. -@return value read from the column */ -static -ib_uint64_t -row_search_autoinc_read_column( -/*===========================*/ - dict_index_t* index, /*!< in: index to read from */ - const rec_t* rec, /*!< in: current rec */ - ulint col_no, /*!< in: column number */ - ulint mtype, /*!< in: column main type */ - ibool unsigned_type) /*!< in: signed or unsigned flag */ -{ - ulint len; - const byte* data; - ib_uint64_t value; - mem_heap_t* heap = NULL; - rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; - rec_offs* offsets = offsets_; - - rec_offs_init(offsets_); - ut_ad(page_rec_is_leaf(rec)); - - offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields, - col_no + 1, &heap); - - if (rec_offs_nth_sql_null(offsets, col_no)) { - /* There is no non-NULL value in the auto-increment column. */ - value = 0; - goto func_exit; - } - - data = rec_get_nth_field(rec, offsets, col_no, &len); - - value = row_parse_int(data, len, mtype, unsigned_type); - -func_exit: - if (UNIV_LIKELY_NULL(heap)) { - mem_heap_free(heap); - } - - return(value); -} - -/** Get the maximum and non-delete-marked record in an index. 
-@param[in] index index tree -@param[in,out] mtr mini-transaction (may be committed and restarted) -@return maximum record, page s-latched in mtr -@retval NULL if there are no records, or if all of them are delete-marked */ -static -const rec_t* -row_search_get_max_rec( - dict_index_t* index, - mtr_t* mtr) -{ - btr_pcur_t pcur; - const rec_t* rec; - const bool desc = index->fields[0].descending; - - if (pcur.open_leaf(desc, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) { - return nullptr; - } - - if (desc) { - const bool comp = index->table->not_redundant(); - while (btr_pcur_move_to_next_user_rec(&pcur, mtr)) { - rec = btr_pcur_get_rec(&pcur); - if (rec_is_metadata(rec, *index)) { - continue; - } - if (!rec_get_deleted_flag(rec, comp)) { - goto found; - } - } - } else { - do { - rec = page_find_rec_last_not_deleted( - btr_pcur_get_page(&pcur)); - if (page_rec_is_user_rec(rec)) { - goto found; - } - btr_pcur_move_before_first_on_page(&pcur); - } while (btr_pcur_move_to_prev(&pcur, mtr)); - } - - rec = nullptr; - -found: - ut_ad(!rec - || !(rec_get_info_bits(rec, dict_table_is_comp(index->table)) - & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG))); - return(rec); -} - -/** Read the max AUTOINC value from an index. 
-@param[in] index index starting with an AUTO_INCREMENT column -@return the largest AUTO_INCREMENT value -@retval 0 if no records were found */ -ib_uint64_t -row_search_max_autoinc(dict_index_t* index) -{ - const dict_field_t* dfield = dict_index_get_nth_field(index, 0); - - ib_uint64_t value = 0; - - mtr_t mtr; - mtr.start(); - - if (const rec_t* rec = row_search_get_max_rec(index, &mtr)) { - value = row_search_autoinc_read_column( - index, rec, 0, - dfield->col->mtype, - dfield->col->prtype & DATA_UNSIGNED); - } - - mtr.commit(); - return(value); -} diff -Nru mariadb-10.11.11/storage/innobase/row/row0uins.cc mariadb-10.11.13/storage/innobase/row/row0uins.cc --- mariadb-10.11.11/storage/innobase/row/row0uins.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0uins.cc 2025-05-19 16:14:25.000000000 +0000 @@ -244,8 +244,7 @@ btr_pcur_commit_specify_mtr(&node->pcur, &mtr); if (UNIV_LIKELY_NULL(table)) { - dict_table_close(table, dict_locked, - node->trx->mysql_thd, mdl_ticket); + dict_table_close(table, node->trx->mysql_thd, mdl_ticket); } return(err); @@ -452,7 +451,7 @@ would probably be better to just drop all temporary tables (and temporary undo log records) of the current connection, instead of doing this rollback. 
*/ - dict_table_close(node->table, dict_locked); + node->table->release(); node->table = NULL; return false; } else { @@ -614,7 +613,7 @@ err = row_undo_ins_remove_clust_rec(node); } - if (err == DB_SUCCESS && node->table->stat_initialized) { + if (err == DB_SUCCESS && node->table->stat_initialized()) { /* Not protected by dict_sys.latch or table->stats_mutex_lock() for performance reasons, we would rather get garbage @@ -644,8 +643,7 @@ break; } - dict_table_close(node->table, dict_locked); - + node->table->release(); node->table = NULL; return(err); diff -Nru mariadb-10.11.11/storage/innobase/row/row0umod.cc mariadb-10.11.13/storage/innobase/row/row0umod.cc --- mariadb-10.11.11/storage/innobase/row/row0umod.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0umod.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1259,7 +1259,7 @@ would probably be better to just drop all temporary tables (and temporary undo log records) of the current connection, instead of doing this rollback. 
*/ - dict_table_close(node->table, dict_locked); + node->table->release(); node->table = NULL; return false; } @@ -1388,7 +1388,7 @@ bool update_statistics = !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE); - if (err == DB_SUCCESS && node->table->stat_initialized) { + if (err == DB_SUCCESS && node->table->stat_initialized()) { switch (node->rec_type) { case TRX_UNDO_UPD_EXIST_REC: break; @@ -1418,8 +1418,7 @@ } } - dict_table_close(node->table, dict_locked); - + node->table->release(); node->table = NULL; return(err); diff -Nru mariadb-10.11.11/storage/innobase/row/row0upd.cc mariadb-10.11.13/storage/innobase/row/row0upd.cc --- mariadb-10.11.11/storage/innobase/row/row0upd.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/row/row0upd.cc 2025-05-19 16:14:25.000000000 +0000 @@ -253,7 +253,7 @@ FALSE, foreign, table, entry, thr); if (ref_table) { - dict_table_close(ref_table); + ref_table->release(); } if (err != DB_SUCCESS) { @@ -338,7 +338,7 @@ TRUE, foreign, table, entry, thr); if (opened) { - dict_table_close(opened); + opened->release(); } if (err != DB_SUCCESS) { diff -Nru mariadb-10.11.11/storage/innobase/srv/srv0mon.cc mariadb-10.11.13/storage/innobase/srv/srv0mon.cc --- mariadb-10.11.11/storage/innobase/srv/srv0mon.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/srv/srv0mon.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1366,12 +1366,13 @@ /* innodb_buffer_pool_pages_total */ case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL: - value = buf_pool.get_n_pages(); + case MONITOR_OVLD_BUFFER_POOL_SIZE: + value = buf_pool.curr_size(); break; /* innodb_buffer_pool_pages_misc */ case MONITOR_OVLD_BUF_POOL_PAGE_MISC: - value = buf_pool.get_n_pages() + value = buf_pool.curr_size() - UT_LIST_GET_LEN(buf_pool.LRU) - UT_LIST_GET_LEN(buf_pool.free); break; @@ -1453,7 +1454,7 @@ /* innodb_os_log_written */ case MONITOR_OVLD_OS_LOG_WRITTEN: - value = log_sys.get_lsn() - recv_sys.lsn; + value = log_get_lsn() - recv_sys.lsn; break; /* 
innodb_log_waits */ @@ -1490,10 +1491,6 @@ value = srv_page_size; break; - case MONITOR_OVLD_BUFFER_POOL_SIZE: - value = srv_buf_pool_size; - break; - /* innodb_row_lock_current_waits */ case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT: // dirty read without lock_sys.wait_mutex @@ -1590,7 +1587,7 @@ break; case MONITOR_OVLD_LSN_CURRENT: - value = log_sys.get_lsn(); + value = log_get_lsn(); break; case MONITOR_OVLD_CHECKPOINTS: @@ -1598,10 +1595,10 @@ break; case MONITOR_LSN_CHECKPOINT_AGE: - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); value = static_cast(log_sys.get_lsn() - log_sys.last_checkpoint_lsn); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); break; case MONITOR_OVLD_BUF_OLDEST_LSN: diff -Nru mariadb-10.11.11/storage/innobase/srv/srv0srv.cc mariadb-10.11.13/storage/innobase/srv/srv0srv.cc --- mariadb-10.11.11/storage/innobase/srv/srv0srv.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/srv/srv0srv.cc 2025-05-19 16:14:25.000000000 +0000 @@ -178,16 +178,6 @@ with mysql_mutex_lock(), which will wait until it gets the mutex. */ #define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT) -/** copy of innodb_buffer_pool_size */ -ulint srv_buf_pool_size; -/** Requested buffer pool chunk size */ -size_t srv_buf_pool_chunk_unit; -/** Previously requested size */ -ulint srv_buf_pool_old_size; -/** Current size as scaling factor for the other components */ -ulint srv_buf_pool_base_size; -/** Current size in bytes */ -ulint srv_buf_pool_curr_size; /** Dump this % of each buffer pool during BP dump */ ulong srv_buf_pool_dump_pct; /** Abort load after this amount of pages */ @@ -291,13 +281,13 @@ in the innodb database. 
* quick transient stats, that are used if persistent stats for the given table/index are not found in the innodb database */ -unsigned long long srv_stats_transient_sample_pages; +uint32_t srv_stats_transient_sample_pages; /** innodb_stats_persistent */ my_bool srv_stats_persistent; /** innodb_stats_include_delete_marked */ my_bool srv_stats_include_delete_marked; /** innodb_stats_persistent_sample_pages */ -unsigned long long srv_stats_persistent_sample_pages; +uint32_t srv_stats_persistent_sample_pages; /** innodb_stats_auto_recalc */ my_bool srv_stats_auto_recalc; @@ -901,6 +891,7 @@ export_vars.innodb_buffer_pool_read_requests = buf_pool.stat.n_page_gets; + mysql_mutex_lock(&buf_pool.mutex); export_vars.innodb_buffer_pool_bytes_data = buf_pool.stat.LRU_bytes + (UT_LIST_GET_LEN(buf_pool.unzip_LRU) @@ -910,12 +901,21 @@ export_vars.innodb_buffer_pool_pages_latched = buf_get_latched_pages_number(); #endif /* UNIV_DEBUG */ - export_vars.innodb_buffer_pool_pages_total = buf_pool.get_n_pages(); + export_vars.innodb_buffer_pool_pages_total = buf_pool.curr_size(); export_vars.innodb_buffer_pool_pages_misc = - buf_pool.get_n_pages() + export_vars.innodb_buffer_pool_pages_total - UT_LIST_GET_LEN(buf_pool.LRU) - UT_LIST_GET_LEN(buf_pool.free); + if (size_t shrinking = buf_pool.is_shrinking()) { + snprintf(export_vars.innodb_buffer_pool_resize_status, + sizeof export_vars.innodb_buffer_pool_resize_status, + "Withdrawing blocks. 
(%zu/%zu).", + buf_pool.to_withdraw(), shrinking); + } else { + export_vars.innodb_buffer_pool_resize_status[0] = '\0'; + } + mysql_mutex_unlock(&buf_pool.mutex); export_vars.innodb_max_trx_id = trx_sys.get_max_trx_id(); export_vars.innodb_history_list_length = trx_sys.history_size_approx(); @@ -979,13 +979,13 @@ mysql_mutex_unlock(&srv_innodb_monitor_mutex); - log_sys.latch.rd_lock(SRW_LOCK_CALL); + log_sys.latch.wr_lock(SRW_LOCK_CALL); export_vars.innodb_lsn_current = log_sys.get_lsn(); export_vars.innodb_lsn_flushed = log_sys.get_flushed_lsn(); export_vars.innodb_lsn_last_checkpoint = log_sys.last_checkpoint_lsn; export_vars.innodb_checkpoint_max_age = static_cast( log_sys.max_checkpoint_age); - log_sys.latch.rd_unlock(); + log_sys.latch.wr_unlock(); export_vars.innodb_os_log_written = export_vars.innodb_lsn_current - recv_sys.lsn; @@ -1072,7 +1072,7 @@ /* Try to track a strange bug reported by Harald Fuchs and others, where the lsn seems to decrease at times */ - lsn_t new_lsn = log_sys.get_lsn(); + lsn_t new_lsn = log_get_lsn(); ut_a(new_lsn >= old_lsn); old_lsn = new_lsn; @@ -1088,6 +1088,7 @@ now -= start; ulong waited = static_cast(now / 1000000); if (waited >= threshold) { + buf_pool.print_flush_info(); ib::fatal() << dict_sys.fatal_msg; } diff -Nru mariadb-10.11.11/storage/innobase/srv/srv0start.cc mariadb-10.11.13/storage/innobase/srv/srv0start.cc --- mariadb-10.11.11/storage/innobase/srv/srv0start.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/srv/srv0start.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1057,7 +1057,7 @@ /** Prepare to delete the redo log file. Flush the dirty pages from all the buffer pools. Flush the redo log buffer to the redo log file. @return lsn upto which data pages have been flushed. 
*/ -static lsn_t srv_prepare_to_delete_redo_log_file() +static lsn_t srv_prepare_to_delete_redo_log_file() noexcept { DBUG_ENTER("srv_prepare_to_delete_redo_log_file"); @@ -1071,7 +1071,7 @@ log_sys.latch.wr_lock(SRW_LOCK_CALL); const bool latest_format{log_sys.is_latest()}; - lsn_t flushed_lsn{log_sys.get_lsn()}; + lsn_t flushed_lsn{log_sys.get_flushed_lsn(std::memory_order_relaxed)}; if (latest_format && !(log_sys.file_size & 4095) && flushed_lsn != log_sys.next_checkpoint_lsn + @@ -1079,6 +1079,11 @@ ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT)) { +#ifdef HAVE_PMEM + if (!log_sys.is_opened()) + log_sys.buf_size= unsigned(std::min(log_sys.capacity(), + log_sys.buf_size_max)); +#endif fil_names_clear(flushed_lsn); flushed_lsn= log_sys.get_lsn(); } @@ -1119,7 +1124,7 @@ if (latest_format) log_write_up_to(flushed_lsn, false); - ut_ad(flushed_lsn == log_sys.get_lsn()); + ut_ad(flushed_lsn == log_get_lsn()); ut_ad(!os_aio_pending_reads()); ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex)); ut_ad(!buf_pool.get_oldest_modification(0)); @@ -1134,6 +1139,18 @@ nullptr, &rollback_all_recovered_group); +inline lsn_t log_t::init_lsn() noexcept +{ + latch.wr_lock(SRW_LOCK_CALL); + ut_ad(!write_lsn_offset); + write_lsn_offset= 0; + const lsn_t lsn{base_lsn.load(std::memory_order_relaxed)}; + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; + latch.wr_unlock(); + return lsn; +} + /** Start InnoDB. @param[in] create_new_db whether to create a new database @return DB_SUCCESS or error code */ @@ -1288,34 +1305,13 @@ fil_system.create(srv_file_per_table ? 
50000 : 5000); - ib::info() << "Initializing buffer pool, total size = " - << ib::bytes_iec{srv_buf_pool_size} - << ", chunk size = " << ib::bytes_iec{srv_buf_pool_chunk_unit}; - if (buf_pool.create()) { - ib::error() << "Cannot allocate memory for the buffer pool"; - return(srv_init_abort(DB_ERROR)); } - ib::info() << "Completed initialization of buffer pool"; - -#ifdef UNIV_DEBUG - /* We have observed deadlocks with a 5MB buffer pool but - the actual lower limit could very well be a little higher. */ - - if (srv_buf_pool_size <= 5 * 1024 * 1024) { - - ib::info() << "Small buffer pool size (" - << ib::bytes_iec{srv_buf_pool_size} - << "), the flst_validate() debug function can cause a" - << " deadlock if the buffer pool fills up."; - } -#endif /* UNIV_DEBUG */ - log_sys.create(); recv_sys.create(); - lock_sys.create(srv_lock_table_size); + lock_sys.create(srv_lock_table_size = 5 * buf_pool.curr_size()); srv_startup_is_before_trx_rollback_phase = true; diff -Nru mariadb-10.11.11/storage/innobase/trx/trx0purge.cc mariadb-10.11.13/storage/innobase/trx/trx0purge.cc --- mariadb-10.11.11/storage/innobase/trx/trx0purge.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/trx/trx0purge.cc 2025-05-19 16:14:25.000000000 +0000 @@ -1052,16 +1052,25 @@ /** Close all tables that were opened in a purge batch for a worker. 
@param node purge task context @param thd purge coordinator thread handle */ -static void trx_purge_close_tables(purge_node_t *node, THD *thd) +static void trx_purge_close_tables(purge_node_t *node, THD *thd) noexcept { for (auto &t : node->tables) { - if (!t.second.first); - else if (t.second.first == reinterpret_cast(-1)); - else + dict_table_t *table= t.second.first; + if (table != nullptr && table != reinterpret_cast(-1)) + table->release(); + } + + MDL_context *mdl_context= static_cast(thd_mdl_context(thd)); + + for (auto &t : node->tables) + { + dict_table_t *table= t.second.first; + if (table != nullptr && table != reinterpret_cast(-1)) { - dict_table_close(t.second.first, false, thd, t.second.second); t.second.first= reinterpret_cast(-1); + if (mdl_context != nullptr && t.second.second != nullptr) + mdl_context->release_lock(t.second.second); } } } @@ -1073,36 +1082,35 @@ } __attribute__((nonnull)) -/** Aqcuire a metadata lock on a table. +/** Acquire a metadata lock on a table. 
@param table table handle @param mdl_context metadata lock acquisition context -@param mdl metadata lcok +@param mdl metadata lock @return table handle @retval nullptr if the table is not found or accessible @retval -1 if the purge of history must be suspended due to DDL */ static dict_table_t *trx_purge_table_acquire(dict_table_t *table, MDL_context *mdl_context, - MDL_ticket **mdl) + MDL_ticket **mdl) noexcept { ut_ad(dict_sys.frozen_not_locked()); *mdl= nullptr; if (!table->is_readable() || table->corrupted) - { - table->release(); return nullptr; - } size_t db_len= dict_get_db_name_len(table->name.m_name); if (db_len == 0) - return table; /* InnoDB system tables are not covered by MDL */ + { + /* InnoDB system tables are not covered by MDL */ + got_table: + table->acquire(); + return table; + } if (purge_sys.must_wait_FTS()) - { must_wait: - table->release(); return reinterpret_cast(-1); - } char db_buf[NAME_LEN + 1]; char tbl_buf[NAME_LEN + 1]; @@ -1110,7 +1118,7 @@ if (!table->parse_name(db_buf, tbl_buf, &db_len, &tbl_len)) /* The name of an intermediate table starts with #sql */ - return table; + goto got_table; { MDL_request request; @@ -1123,37 +1131,38 @@ goto must_wait; } - return table; + goto got_table; } /** Open a table handle for the purge of committed transaction history @param table_id InnoDB table identifier @param mdl_context metadata lock acquisition context -@param mdl metadata lcok +@param mdl metadata lock @return table handle @retval nullptr if the table is not found or accessible @retval -1 if the purge of history must be suspended due to DDL */ static dict_table_t *trx_purge_table_open(table_id_t table_id, MDL_context *mdl_context, - MDL_ticket **mdl) + MDL_ticket **mdl) noexcept { - dict_sys.freeze(SRW_LOCK_CALL); - - dict_table_t *table= dict_sys.find_table(table_id); + dict_table_t *table; - if (table) - table->acquire(); - else + for (;;) { + dict_sys.freeze(SRW_LOCK_CALL); + table= dict_sys.find_table(table_id); + if (table) + break; 
dict_sys.unfreeze(); dict_sys.lock(SRW_LOCK_CALL); table= dict_load_table_on_id(table_id, DICT_ERR_IGNORE_FK_NOKEY); - if (table) - table->acquire(); dict_sys.unlock(); if (!table) return nullptr; - dict_sys.freeze(SRW_LOCK_CALL); + /* At this point, the freshly loaded table may already have been evicted. + We must look it up again while holding a shared dict_sys.latch. We keep + trying this until the table is found in the cache or it cannot be found + in the dictionary (because the table has been dropped or rebuilt). */ } table= trx_purge_table_acquire(table, mdl_context, mdl); @@ -1172,10 +1181,7 @@ for (que_thr_t *thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); thr; thr= UT_LIST_GET_NEXT(thrs, thr)) - { - purge_node_t *node= static_cast(thr->child); - trx_purge_close_tables(node, thd); - } + trx_purge_close_tables(static_cast(thr->child), thd); m_active= false; wait_FTS(false); @@ -1198,7 +1204,7 @@ if (t.second.first == reinterpret_cast(-1)) { if (table) - dict_table_close(table, false, thd, *mdl); + dict_table_close(table, thd, *mdl); goto retry; } } @@ -1231,9 +1237,6 @@ static_cast(thd_mdl_context(thd)); ut_ad(mdl_context); - const size_t max_pages= - std::min(buf_pool.curr_size * 3 / 4, size_t{srv_purge_batch_size}); - while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) { /* Track the max {trx_id, undo_no} for truncating the @@ -1283,12 +1286,12 @@ ut_ad(!table_node->in_progress); } - if (purge_sys.n_pages_handled() >= max_pages) + const size_t size{purge_sys.n_pages_handled()}; + if (size >= size_t{srv_purge_batch_size} || + size >= buf_pool.usable_size() * 3 / 4) break; } - purge_sys.m_active= false; - #ifdef UNIV_DEBUG thr= UT_LIST_GET_FIRST(purge_sys.query->thrs); for (ulint i= 0; thr && i < *n_work_items; @@ -1337,6 +1340,8 @@ TRANSACTIONAL_INLINE void purge_sys_t::batch_cleanup(const purge_sys_t::iterator &head) { + m_active= false; + /* Release the undo pages. 
*/ for (auto p : pages) p.second->unfix(); diff -Nru mariadb-10.11.11/storage/innobase/trx/trx0rec.cc mariadb-10.11.13/storage/innobase/trx/trx0rec.cc --- mariadb-10.11.11/storage/innobase/trx/trx0rec.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/trx/trx0rec.cc 2025-05-19 16:14:25.000000000 +0000 @@ -152,7 +152,9 @@ ulint n_idx = 0; for (const auto& v_index : vcol->v_indexes) { n_idx++; - /* FIXME: index->id is 64 bits! */ + if (uint32_t hi= uint32_t(v_index.index->id >> 32)) { + size += 1 + mach_get_compressed_size(hi); + } size += mach_get_compressed_size(uint32_t(v_index.index->id)); size += mach_get_compressed_size(v_index.nth_field); } @@ -179,10 +181,14 @@ ptr += mach_write_compressed(ptr, n_idx); for (const auto& v_index : vcol->v_indexes) { - ptr += mach_write_compressed( - /* FIXME: index->id is 64 bits! */ - ptr, uint32_t(v_index.index->id)); - + /* This is compatible with + ptr += mach_u64_write_much_compressed(ptr, v_index.index-id) + (the added "if" statement is fixing an old regression). */ + if (uint32_t hi= uint32_t(v_index.index->id >> 32)) { + *ptr++ = 0xff; + ptr += mach_write_compressed(ptr, hi); + } + ptr += mach_write_compressed(ptr, uint32_t(v_index.index->id)); ptr += mach_write_compressed(ptr, v_index.nth_field); } @@ -221,7 +227,15 @@ dict_index_t* clust_index = dict_table_get_first_index(table); for (ulint i = 0; i < num_idx; i++) { - index_id_t id = mach_read_next_compressed(&ptr); + index_id_t id = 0; + /* This is like mach_u64_read_much_compressed(), + but advancing ptr to the next field. 
*/ + if (*ptr == 0xff) { + ptr++; + id = mach_read_next_compressed(&ptr); + id <<= 32; + } + id |= mach_read_next_compressed(&ptr); ulint pos = mach_read_next_compressed(&ptr); dict_index_t* index = dict_table_get_next_index(clust_index); diff -Nru mariadb-10.11.11/storage/innobase/trx/trx0trx.cc mariadb-10.11.13/storage/innobase/trx/trx0trx.cc --- mariadb-10.11.11/storage/innobase/trx/trx0trx.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/trx/trx0trx.cc 2025-05-19 16:14:25.000000000 +0000 @@ -134,8 +134,6 @@ trx->will_lock = false; - trx->bulk_insert = false; - trx->apply_online_log = false; ut_d(trx->start_file = 0); @@ -452,7 +450,7 @@ /** Transition to committed state, to release implicit locks. */ TRANSACTIONAL_INLINE inline void trx_t::commit_state() { - ut_d(auto trx_state{state}); + ut_d(auto trx_state= state); ut_ad(trx_state == TRX_STATE_PREPARED || trx_state == TRX_STATE_PREPARED_RECOVERED || trx_state == TRX_STATE_ACTIVE); @@ -1513,6 +1511,7 @@ *detailed_error= '\0'; mod_tables.clear(); + bulk_insert= TRX_NO_BULK; check_foreigns= true; check_unique_secondary= true; assert_freed(); diff -Nru mariadb-10.11.11/storage/innobase/ut/ut0rnd.cc mariadb-10.11.13/storage/innobase/ut/ut0rnd.cc --- mariadb-10.11.11/storage/innobase/ut/ut0rnd.cc 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/innobase/ut/ut0rnd.cc 2025-05-19 16:14:25.000000000 +0000 @@ -48,6 +48,8 @@ ulint pow2; ulint i; + ut_ad(n); + n += 100; pow2 = 1; diff -Nru mariadb-10.11.11/storage/maria/ma_control_file.c mariadb-10.11.13/storage/maria/ma_control_file.c --- mariadb-10.11.11/storage/maria/ma_control_file.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/maria/ma_control_file.c 2025-05-19 16:14:25.000000000 +0000 @@ -276,7 +276,7 @@ int open_flags) { uchar buffer[CF_MAX_SIZE]; - char name[FN_REFLEN], errmsg_buff[256]; + char name[FN_REFLEN], errmsg_buff[512]; const char *errmsg, *lock_failed_errmsg= "Could not get an exclusive 
lock;" " file is probably in use by another process"; uint new_cf_create_time_size, new_cf_changeable_size, new_block_size; @@ -399,10 +399,14 @@ if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE || new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE || - new_cf_create_time_size + new_cf_changeable_size != file_size) + new_cf_create_time_size + new_cf_changeable_size > file_size) { error= CONTROL_FILE_INCONSISTENT_INFORMATION; - errmsg= "Sizes stored in control file are inconsistent"; + sprintf(errmsg_buff, + "Sizes stored in control file are inconsistent. " + "create_time_size: %u changeable_size: %u file_size: %llu", + new_cf_create_time_size, new_cf_changeable_size, (ulonglong) file_size); + errmsg= errmsg_buff; goto err; } @@ -622,6 +626,20 @@ return (control_file_fd >= 0); } + + +static int check_zerofill(uchar *buffer, ulonglong offset, ulonglong length) +{ + uchar *pos= buffer + offset, *end= buffer+length; + while (pos < end) + { + if (*pos++) + return 1; + } + return 0; +} + + /** Print content of aria_log_control file */ @@ -629,6 +647,7 @@ my_bool print_aria_log_control() { uchar buffer[CF_MAX_SIZE]; + char errmsg_buff[512]; char name[FN_REFLEN], uuid_str[MY_UUID_STRING_LENGTH+1]; const char *errmsg; uint new_cf_create_time_size, new_cf_changeable_size; @@ -705,10 +724,14 @@ if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE || new_cf_changeable_size < CF_MIN_CHANGEABLE_TOTAL_SIZE || - new_cf_create_time_size + new_cf_changeable_size != file_size) + new_cf_create_time_size + new_cf_changeable_size > file_size) { error= CONTROL_FILE_INCONSISTENT_INFORMATION; - errmsg= "Sizes stored in control file are inconsistent"; + sprintf(errmsg_buff, + "Sizes stored in control file are inconsistent. 
" + "create_time_size: %u changeable_size: %u file_size: %llu", + new_cf_create_time_size, new_cf_changeable_size, (ulonglong) file_size); + errmsg= errmsg_buff; goto err; } checkpoint_lsn= lsn_korr(buffer + new_cf_create_time_size + @@ -732,6 +755,18 @@ (buffer + new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0]; printf("recovery_failures: %u\n", recovery_fails); } + if (check_zerofill(buffer, new_cf_create_time_size + new_cf_changeable_size, file_size)) + { + printf("Warning: %s file_size is %llu (should be %llu) and contains unknown data.\n" + "It will still work but should be examined.\n", + name, (ulonglong) file_size, + (ulonglong) (new_cf_create_time_size + new_cf_changeable_size)); + } + else if (new_cf_create_time_size + new_cf_changeable_size < file_size) + printf("Note: file_size (%llu) is bigger than the expected file size %llu.\n" + "This is unexpected but will not cause any issues.\n", + (ulonglong) file_size, + (ulonglong) (new_cf_create_time_size + new_cf_changeable_size)); mysql_file_close(file, MYF(0)); DBUG_RETURN(0); diff -Nru mariadb-10.11.11/storage/maria/ma_pagecache.c mariadb-10.11.13/storage/maria/ma_pagecache.c --- mariadb-10.11.11/storage/maria/ma_pagecache.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/maria/ma_pagecache.c 2025-05-19 16:14:25.000000000 +0000 @@ -4726,10 +4726,10 @@ static int cmp_sec_link(const void *a_, const void *b_) { - PAGECACHE_BLOCK_LINK *const *a= a_; - PAGECACHE_BLOCK_LINK *const *b= b_; - return (((*a)->hash_link->pageno < (*b)->hash_link->pageno) ? -1 : - ((*a)->hash_link->pageno > (*b)->hash_link->pageno) ? 1 : 0); + const PAGECACHE_BLOCK_LINK *a= *(const PAGECACHE_BLOCK_LINK **) a_; + const PAGECACHE_BLOCK_LINK *b= *(const PAGECACHE_BLOCK_LINK **) b_; + return ((a->hash_link->pageno < b->hash_link->pageno) ? -1 : + (a->hash_link->pageno > b->hash_link->pageno) ? 
1 : 0); } diff -Nru mariadb-10.11.11/storage/maria/ma_unique.c mariadb-10.11.13/storage/maria/ma_unique.c --- mariadb-10.11.11/storage/maria/ma_unique.c 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/maria/ma_unique.c 2025-05-19 16:14:25.000000000 +0000 @@ -139,6 +139,8 @@ { uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos); memcpy((void*) &pos,pos+keyseg->bit_start,sizeof(char*)); + if (!pos) + pos= (const uchar*) ""; /* hash_sort does not support NULL ptr */ if (!length || length > tmp_length) length=tmp_length; /* The whole blob */ } @@ -236,6 +238,10 @@ } memcpy((void*) &pos_a, pos_a+keyseg->bit_start, sizeof(char*)); memcpy((void*) &pos_b, pos_b+keyseg->bit_start, sizeof(char*)); + if (pos_a == 0) + pos_a= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ + if (pos_b == 0) + pos_b= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ } if (type == HA_KEYTYPE_TEXT/* the CHAR data type*/) { diff -Nru mariadb-10.11.11/storage/mroonga/CMakeLists.txt mariadb-10.11.13/storage/mroonga/CMakeLists.txt --- mariadb-10.11.11/storage/mroonga/CMakeLists.txt 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/CMakeLists.txt 2025-05-19 16:14:25.000000000 +0000 @@ -17,7 +17,7 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 2.8...3.12) project(mroonga) if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") diff -Nru mariadb-10.11.11/storage/mroonga/ha_mroonga.cpp mariadb-10.11.13/storage/mroonga/ha_mroonga.cpp --- mariadb-10.11.11/storage/mroonga/ha_mroonga.cpp 2025-01-30 11:01:24.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/ha_mroonga.cpp 2025-05-19 16:14:25.000000000 +0000 @@ -558,6 +558,9 @@ case HA_EXTRA_END_ALTER_COPY: inspected = "HA_EXTRA_END_ALTER_COPY"; break; + case HA_EXTRA_ABORT_ALTER_COPY: + inspected = 
"HA_EXTRA_ABORT_ALTER_COPY"; + break; #ifdef MRN_HAVE_HA_EXTRA_EXPORT case HA_EXTRA_EXPORT: inspected = "HA_EXTRA_EXPORT"; @@ -593,6 +596,11 @@ inspected = "HA_EXTRA_END_ALTER_COPY"; break; #endif +#ifdef MRN_HAVE_HA_EXTRA_ABORT_ALTER_COPY + case HA_EXTRA_ABORT_ALTER_COPY: + inspected = "HA_EXTRA_ABORT_ALTER_COPY"; + break; +#endif #ifdef MRN_HAVE_HA_EXTRA_NO_AUTOINC_LOCKING case HA_EXTRA_NO_AUTOINC_LOCKING: inspected = "HA_EXTRA_NO_AUTOINC_LOCKING"; diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/CMakeLists.txt mariadb-10.11.13/storage/mroonga/vendor/groonga/CMakeLists.txt --- mariadb-10.11.11/storage/mroonga/vendor/groonga/CMakeLists.txt 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/CMakeLists.txt 2025-05-19 16:14:26.000000000 +0000 @@ -15,7 +15,7 @@ # https://buildbot.askmonty.org/buildbot/builders/work-amd64-valgrind/builds/5263/steps/compile/logs/stdio # says CMake 2.6.2... We want to drop old software support... -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 2.8...3.12) # cmake_minimum_required(VERSION 2.6.4) # CentOS 5 set(GRN_PROJECT_NAME "groonga") set(GRN_PROJECT_LABEL "Groonga") diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/db.c mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/db.c --- mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/db.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/db.c 2025-05-19 16:14:26.000000000 +0000 @@ -969,8 +969,8 @@ *subrec_size = range_size + sizeof(uint32_t) + sizeof(uint32_t); break; } - *value_size = (uintptr_t)GRN_RSET_SUBRECS_NTH((((grn_rset_recinfo *)0)->subrecs), - *subrec_size, max_n_subrecs); + *value_size = (uintptr_t) GRN_RSET_SUBRECS_NTH(offsetof(grn_rset_recinfo, subrecs), + *subrec_size, max_n_subrecs); } else { *value_size = range_size; } diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/hash.c mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/hash.c --- 
mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/hash.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/hash.c 2025-05-19 16:14:26.000000000 +0000 @@ -1727,15 +1727,15 @@ { if (flags & GRN_OBJ_KEY_VAR_SIZE) { if (flags & GRN_OBJ_KEY_LARGE) { - return (uintptr_t)((grn_io_hash_entry_large *)0)->value + value_size; + return offsetof(grn_io_hash_entry_large, value) + value_size; } else { - return (uintptr_t)((grn_io_hash_entry_normal *)0)->value + value_size; + return offsetof(grn_io_hash_entry_normal, value) + value_size; } } else { if (key_size == sizeof(uint32_t)) { - return (uintptr_t)((grn_plain_hash_entry *)0)->value + value_size; + return offsetof(grn_plain_hash_entry, value) + value_size; } else { - return (uintptr_t)((grn_rich_hash_entry *)0)->key_and_value + return offsetof(grn_rich_hash_entry, key_and_value) + key_size + value_size; } } @@ -1865,12 +1865,12 @@ { uint32_t entry_size; if (flags & GRN_OBJ_KEY_VAR_SIZE) { - entry_size = (uintptr_t)((grn_tiny_hash_entry *)0)->value + value_size; + entry_size = offsetof(grn_tiny_hash_entry, value) + value_size; } else { if (key_size == sizeof(uint32_t)) { - entry_size = (uintptr_t)((grn_plain_hash_entry *)0)->value + value_size; + entry_size = offsetof(grn_plain_hash_entry, value) + value_size; } else { - entry_size = (uintptr_t)((grn_rich_hash_entry *)0)->key_and_value + entry_size = offsetof(grn_rich_hash_entry, key_and_value) + key_size + value_size; } } diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/ii.c mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/ii.c --- mariadb-10.11.11/storage/mroonga/vendor/groonga/lib/ii.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/lib/ii.c 2025-05-19 16:14:26.000000000 +0000 @@ -2049,7 +2049,7 @@ if ((df & 1)) { df >>= 1; size = nreq == dvlen ? 
data_size : df * nreq; - if (dv[dvlen].data < dv[0].data + size) { + if (!dv[0].data || dv[dvlen].data < dv[0].data + size) { if (dv[0].data) { GRN_FREE(dv[0].data); } if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } dv[dvlen].data = rp + size; @@ -10653,7 +10653,7 @@ } #define GRN_II_BUILDER_TERM_INPLACE_SIZE\ - (sizeof(grn_ii_builder_term) - (uintptr_t)&((grn_ii_builder_term *)0)->dummy) + (sizeof(grn_ii_builder_term) - offsetof(grn_ii_builder_term, dummy)) typedef struct { grn_id rid; /* Last record ID */ diff -Nru mariadb-10.11.11/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt mariadb-10.11.13/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt --- mariadb-10.11.11/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/mroonga/vendor/groonga/vendor/plugins/groonga-normalizer-mysql/CMakeLists.txt 2025-05-19 16:14:26.000000000 +0000 @@ -15,7 +15,7 @@ # Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1335 USA -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 2.8...3.12) if(NOT DEFINED GROONGA_NORMALIZER_MYSQL_PROJECT_NAME) set(GROONGA_NORMALIZER_MYSQL_PROJECT_NAME "groonga-normalizer-mysql") endif() diff -Nru mariadb-10.11.11/storage/myisam/mi_unique.c mariadb-10.11.13/storage/myisam/mi_unique.c --- mariadb-10.11.11/storage/myisam/mi_unique.c 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/myisam/mi_unique.c 2025-05-19 16:14:26.000000000 +0000 @@ -115,6 +115,8 @@ { uint tmp_length=_mi_calc_blob_length(keyseg->bit_start,pos); memcpy((char**) &pos, pos+keyseg->bit_start, sizeof(char*)); + if (!pos) + pos= (const uchar*) ""; /* hash_sort does not support NULL ptr */ if (!length || length > tmp_length) length=tmp_length; /* The whole blob */ } @@ -211,6 +213,10 @@ } memcpy((char**) &pos_a, pos_a+keyseg->bit_start, 
sizeof(char*)); memcpy((char**) &pos_b, pos_b+keyseg->bit_start, sizeof(char*)); + if (pos_a == 0) + pos_a= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ + if (pos_b == 0) + pos_b= (const uchar *) ""; /* Avoid UBSAN nullptr-with-offset */ } if (type == HA_KEYTYPE_TEXT/*The CHAR data type*/) { diff -Nru mariadb-10.11.11/storage/rocksdb/build_rocksdb.cmake mariadb-10.11.13/storage/rocksdb/build_rocksdb.cmake --- mariadb-10.11.11/storage/rocksdb/build_rocksdb.cmake 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/build_rocksdb.cmake 2025-05-19 16:14:26.000000000 +0000 @@ -176,35 +176,53 @@ # - *_test.cc # - *_bench.cc set(ROCKSDB_SOURCES + cache/cache.cc + cache/cache_entry_roles.cc + cache/cache_key.cc + cache/cache_reservation_manager.cc cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc db/arena_wrapped_db_iter.cc + db/blob/blob_fetcher.cc + db/blob/blob_file_addition.cc + db/blob/blob_file_builder.cc + db/blob/blob_file_builder.cc + db/blob/blob_file_cache.cc + db/blob/blob_file_garbage.cc + db/blob/blob_file_meta.cc + db/blob/blob_file_reader.cc + db/blob/blob_garbage_meter.cc + db/blob/blob_log_format.cc + db/blob/blob_log_sequential_reader.cc + db/blob/blob_log_writer.cc + db/blob/prefetch_buffer_collection.cc db/builder.cc db/c.cc db/column_family.cc - db/compacted_db_impl.cc db/compaction/compaction.cc db/compaction/compaction_iterator.cc - db/compaction/compaction_picker.cc db/compaction/compaction_job.cc + db/compaction/compaction_picker.cc db/compaction/compaction_picker_fifo.cc db/compaction/compaction_picker_level.cc db/compaction/compaction_picker_universal.cc + db/compaction/sst_partitioner.cc db/convenience.cc db/db_filesnapshot.cc + db/dbformat.cc + db/db_impl/compacted_db_impl.cc db/db_impl/db_impl.cc - db/db_impl/db_impl_write.cc db/db_impl/db_impl_compaction_flush.cc - db/db_impl/db_impl_files.cc - db/db_impl/db_impl_open.cc db/db_impl/db_impl_debug.cc db/db_impl/db_impl_experimental.cc + 
db/db_impl/db_impl_files.cc + db/db_impl/db_impl_open.cc db/db_impl/db_impl_readonly.cc db/db_impl/db_impl_secondary.cc + db/db_impl/db_impl_write.cc db/db_info_dumper.cc db/db_iter.cc - db/dbformat.cc db/error_handler.cc db/event_helpers.cc db/experimental.cc @@ -215,14 +233,16 @@ db/forward_iterator.cc db/import_column_family_job.cc db/internal_stats.cc - db/logs_with_prep_tracker.cc db/log_reader.cc + db/logs_with_prep_tracker.cc db/log_writer.cc db/malloc_stats.cc db/memtable.cc db/memtable_list.cc db/merge_helper.cc db/merge_operator.cc + db/output_validator.cc + db/periodic_work_scheduler.cc db/range_del_aggregator.cc db/range_tombstone_fragmenter.cc db/repair.cc @@ -233,25 +253,32 @@ db/trim_history_scheduler.cc db/version_builder.cc db/version_edit.cc + db/version_edit_handler.cc db/version_set.cc + db/wal_edit.cc db/wal_manager.cc - db/write_batch.cc db/write_batch_base.cc + db/write_batch.cc db/write_controller.cc db/write_thread.cc + env/composite_env.cc env/env.cc env/env_chroot.cc env/env_encryption.cc env/env_hdfs.cc env/file_system.cc + env/file_system_tracer.cc + env/fs_remap.cc env/mock_env.cc + env/unique_id_gen.cc file/delete_scheduler.cc + file/filename.cc file/file_prefetch_buffer.cc file/file_util.cc - file/filename.cc + file/line_file_reader.cc file/random_access_file_reader.cc - file/read_write_util.cc file/readahead_raf.cc + file/read_write_util.cc file/sequence_file_reader.cc file/sst_file_manager_impl.cc file/writable_file_writer.cc @@ -281,29 +308,38 @@ monitoring/thread_status_util.cc monitoring/thread_status_util_debug.cc options/cf_options.cc + options/configurable.cc + options/customizable.cc options/db_options.cc options/options.cc options/options_helper.cc options/options_parser.cc - options/options_sanity_check.cc port/stack_trace.cc table/adaptive/adaptive_table_factory.cc - table/block_based/block.cc + table/block_based/binary_search_index_reader.cc table/block_based/block_based_filter_block.cc 
table/block_based/block_based_table_builder.cc table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_iterator.cc table/block_based/block_based_table_reader.cc table/block_based/block_builder.cc + table/block_based/block.cc + table/block_based/block_prefetcher.cc table/block_based/block_prefix_index.cc - table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc + table/block_based/data_block_hash_index.cc table/block_based/filter_block_reader_common.cc table/block_based/filter_policy.cc table/block_based/flush_block_policy.cc table/block_based/full_filter_block.cc + table/block_based/hash_index_reader.cc table/block_based/index_builder.cc + table/block_based/index_reader_common.cc table/block_based/parsed_full_filter_block.cc table/block_based/partitioned_filter_block.cc + table/block_based/partitioned_index_iterator.cc + table/block_based/partitioned_index_reader.cc + table/block_based/reader_common.cc table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc table/cuckoo/cuckoo_table_builder.cc @@ -321,10 +357,13 @@ table/plain/plain_table_index.cc table/plain/plain_table_key_coding.cc table/plain/plain_table_reader.cc + table/sst_file_dumper.cc table/sst_file_reader.cc table/sst_file_writer.cc + table/table_factory.cc table/table_properties.cc table/two_level_iterator.cc + table/unique_id.cc test_util/sync_point.cc test_util/sync_point_impl.cc test_util/testutil.cc @@ -335,8 +374,12 @@ tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - trace_replay/trace_replay.cc trace_replay/block_cache_tracer.cc + trace_replay/io_tracer.cc + trace_replay/trace_record.cc + trace_replay/trace_record_handler.cc + trace_replay/trace_record_result.cc + trace_replay/trace_replay.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc @@ -344,17 +387,8 @@ util/concurrent_task_limiter_impl.cc util/crc32c.cc util/dynamic_bloom.cc - util/hash.cc - util/murmurhash.cc - util/random.cc - 
util/rate_limiter.cc - util/slice.cc util/file_checksum_helper.cc - util/status.cc - util/string_util.cc - util/thread_local.cc - util/threadpool_imp.cc - util/xxhash.cc + util/hash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_compaction_filter.cc utilities/blob_db/blob_db.cc @@ -362,10 +396,8 @@ utilities/blob_db/blob_db_impl_filesnapshot.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc - utilities/blob_db/blob_log_reader.cc - utilities/blob_db/blob_log_writer.cc - utilities/blob_db/blob_log_format.cc utilities/checkpoint/checkpoint_impl.cc + utilities/compaction_filters.cc utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc utilities/debug.cc utilities/env_mirror.cc @@ -373,11 +405,12 @@ utilities/leveldb_options/leveldb_options.cc utilities/memory/memory_util.cc utilities/merge_operators/bytesxor.cc + utilities/merge_operators.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc utilities/merge_operators/sortlist.cc - utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc + utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/uint64add.cc utilities/object_registry.cc utilities/option_change_migration/option_change_migration.cc @@ -391,22 +424,37 @@ utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc - utilities/transactions/optimistic_transaction_db_impl.cc + utilities/trace/replayer_impl.cc + utilities/transactions/lock/lock_manager.cc + utilities/transactions/lock/point/point_lock_manager.cc + utilities/transactions/lock/point/point_lock_tracker.cc utilities/transactions/optimistic_transaction.cc + utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/pessimistic_transaction.cc utilities/transactions/pessimistic_transaction_db.cc utilities/transactions/snapshot_checker.cc 
utilities/transactions/transaction_base.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc utilities/transactions/write_prepared_txn_db.cc utilities/transactions/write_unprepared_txn.cc utilities/transactions/write_unprepared_txn_db.cc utilities/ttl/db_ttl_impl.cc + utilities/wal_filter.cc utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc + util/murmurhash.cc + util/random.cc + util/rate_limiter.cc + util/regex.cc + util/ribbon_config.cc + util/slice.cc + util/status.cc + util/string_util.cc + util/thread_local.cc + util/threadpool_imp.cc + util/xxhash.cc ) @@ -484,8 +532,10 @@ STRING(TIMESTAMP GIT_DATE_TIME "%Y-%m-%d %H:%M:%S") ENDIF() +# psergey-added: +SET(GIT_MOD 0) CONFIGURE_FILE(${ROCKSDB_SOURCE_DIR}/util/build_version.cc.in build_version.cc @ONLY) -INCLUDE_DIRECTORIES(${ROCKSDB_SOURCE_DIR}/util) + list(APPEND SOURCES ${CMAKE_CURRENT_BINARY_DIR}/build_version.cc) ADD_CONVENIENCE_LIBRARY(rocksdblib ${SOURCES}) diff -Nru mariadb-10.11.11/storage/rocksdb/ha_rocksdb.cc mariadb-10.11.13/storage/rocksdb/ha_rocksdb.cc --- mariadb-10.11.11/storage/rocksdb/ha_rocksdb.cc 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/ha_rocksdb.cc 2025-05-19 16:14:26.000000000 +0000 @@ -1250,7 +1250,7 @@ "Statistics Level for RocksDB. 
Default is 0 (kExceptHistogramOrTimers)", nullptr, rocksdb_set_rocksdb_stats_level, /* default */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers, - /* min */ (uint)rocksdb::StatsLevel::kExceptHistogramOrTimers, + /* min */ (uint)rocksdb::StatsLevel::kDisableAll, /* max */ (uint)rocksdb::StatsLevel::kAll, 0); static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size, @@ -1596,7 +1596,7 @@ "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr, rocksdb_tbl_options->no_block_cache); -static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size, +static MYSQL_SYSVAR_UINT64_T(block_size, rocksdb_tbl_options->block_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "BlockBasedTableOptions::block_size for RocksDB", nullptr, nullptr, rocksdb_tbl_options->block_size, @@ -3992,7 +3992,7 @@ DBUG_ASSERT(xid != nullptr); DBUG_ASSERT(commit_latency_stats != nullptr); - rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); + rocksdb::StopWatchNano timer(rocksdb::SystemClock::Default().get(), true); const auto name = rdb_xid_to_string(*xid); DBUG_ASSERT(!name.empty()); @@ -4187,7 +4187,7 @@ DBUG_ASSERT(thd != nullptr); DBUG_ASSERT(commit_latency_stats != nullptr); - rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); + rocksdb::StopWatchNano timer(rocksdb::SystemClock::Default().get(), true); /* note: h->external_lock(F_UNLCK) is called after this function is called) */ Rdb_transaction *tx = get_tx_from_thd(thd); @@ -4732,8 +4732,7 @@ if (tf_name.find("BlockBasedTable") != std::string::npos) { const rocksdb::BlockBasedTableOptions *const bbt_opt = - reinterpret_cast( - table_factory->GetOptions()); + table_factory->GetOptions(); if (bbt_opt != nullptr) { if (bbt_opt->block_cache.get() != nullptr) { diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result --- 
mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/corrupted_data_reads_debug.result 2025-05-19 16:14:26.000000000 +0000 @@ -20,7 +20,7 @@ set rocksdb_verify_row_debug_checksums=1; set session debug_dbug= "+d,myrocks_simulate_bad_row_read1"; select * from t1 where pk=1; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_row_read1"; set rocksdb_verify_row_debug_checksums=@tmp1; select * from t1 where pk=1; @@ -28,11 +28,11 @@ 1 1 set session debug_dbug= "+d,myrocks_simulate_bad_row_read2"; select * from t1 where pk=1; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_row_read2"; set session debug_dbug= "+d,myrocks_simulate_bad_row_read3"; select * from t1 where pk=1; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_row_read3"; insert into t1 values(4,'0123456789'); select * from t1; @@ -56,7 +56,7 @@ ABCD 1 set session debug_dbug= "+d,myrocks_simulate_bad_pk_read1"; select * from t2; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_pk_read1"; drop table t2; create table t2 ( @@ -69,6 +69,6 @@ ABCD 1 set session debug_dbug= "+d,myrocks_simulate_bad_pk_read1"; select * from t2; -ERROR HY000: Got error 205 'Found data corruption.' from ROCKSDB +ERROR HY000: Got error 206 'Found data corruption.' 
from ROCKSDB set session debug_dbug= "-d,myrocks_simulate_bad_pk_read1"; drop table t2; diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/drop_table3.result 2025-05-19 16:14:26.000000000 +0000 @@ -1,12 +1,6 @@ -call mtr.add_suppression("Column family 'cf1' not found"); -call mtr.add_suppression("Column family 'rev:cf2' not found"); DROP TABLE IF EXISTS t1; call mtr.add_suppression("Column family 'cf1' not found"); call mtr.add_suppression("Column family 'rev:cf2' not found"); -set global rocksdb_compact_cf = 'cf1'; -set global rocksdb_compact_cf = 'rev:cf2'; -set global rocksdb_signal_drop_index_thread = 1; -# restart CREATE TABLE t1 ( a int not null, b int not null, @@ -15,6 +9,10 @@ key (b) comment 'rev:cf2' ) ENGINE=RocksDB; DELETE FROM t1; +set global rocksdb_compact_cf = 'cf1'; +set global rocksdb_compact_cf = 'rev:cf2'; +set global rocksdb_signal_drop_index_thread = 1; +# restart select variable_value into @a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; drop table t1; select case when variable_value-@a < 500000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result 2025-05-19 16:14:26.000000000 +0000 @@ -982,7 +982,7 @@ rocksdb_skip_unique_check_tables .* rocksdb_sst_mgr_rate_bytes_per_sec 0 rocksdb_stats_dump_period_sec 600 -rocksdb_stats_level 0 
+rocksdb_stats_level 1 rocksdb_stats_recalc_rate 0 rocksdb_store_row_debug_checksums OFF rocksdb_strict_collation_check OFF diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/rocksdb_datadir.result 2025-05-19 16:14:26.000000000 +0000 @@ -1,2 +1,2 @@ Check for MANIFEST files -MANIFEST-000006 +MANIFEST-000004 diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/r/truncate_table3.result 2025-05-19 16:14:26.000000000 +0000 @@ -1,12 +1,6 @@ -call mtr.add_suppression("Column family 'cf1' not found"); -call mtr.add_suppression("Column family 'rev:cf2' not found"); DROP TABLE IF EXISTS t1; call mtr.add_suppression("Column family 'cf1' not found"); call mtr.add_suppression("Column family 'rev:cf2' not found"); -set global rocksdb_compact_cf = 'cf1'; -set global rocksdb_compact_cf = 'rev:cf2'; -set global rocksdb_signal_drop_index_thread = 1; -# restart CREATE TABLE t1 ( a int not null, b int not null, @@ -15,6 +9,10 @@ key (b) comment 'rev:cf2' ) ENGINE=RocksDB; DELETE FROM t1; +set global rocksdb_compact_cf = 'cf1'; +set global rocksdb_compact_cf = 'rev:cf2'; +set global rocksdb_signal_drop_index_thread = 1; +# restart select variable_value into @a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; truncate table t1; select case when variable_value-@a < 500000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; 
diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc --- mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb/t/drop_table3.inc 2025-05-19 16:14:26.000000000 +0000 @@ -1,8 +1,5 @@ --source include/have_rocksdb.inc -call mtr.add_suppression("Column family 'cf1' not found"); -call mtr.add_suppression("Column family 'rev:cf2' not found"); - --disable_warnings DROP TABLE IF EXISTS t1; --enable_warnings @@ -10,11 +7,6 @@ call mtr.add_suppression("Column family 'cf1' not found"); call mtr.add_suppression("Column family 'rev:cf2' not found"); -# Start from clean slate -set global rocksdb_compact_cf = 'cf1'; -set global rocksdb_compact_cf = 'rev:cf2'; -set global rocksdb_signal_drop_index_thread = 1; ---source include/restart_mysqld.inc CREATE TABLE t1 ( a int not null, @@ -29,6 +21,12 @@ let $table = t1; --source drop_table3_repopulate_table.inc +# Start from clean slate +set global rocksdb_compact_cf = 'cf1'; +set global rocksdb_compact_cf = 'rev:cf2'; +set global rocksdb_signal_drop_index_thread = 1; +--source include/restart_mysqld.inc + --disable_cursor_protocol select variable_value into @a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; --enable_cursor_protocol @@ -49,6 +47,7 @@ --source include/wait_condition.inc select case when variable_value-@a < 500000 then 'true' else 'false' end from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; +#select variable_value-@a from information_schema.global_status where variable_name='rocksdb_compact_read_bytes'; # Cleanup DROP TABLE IF EXISTS t1; diff -Nru mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result --- 
mariadb-10.11.11/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_stats_level_basic.result 2025-05-19 16:14:26.000000000 +0000 @@ -11,7 +11,7 @@ SET @start_global_value = @@global.ROCKSDB_STATS_LEVEL; SELECT @start_global_value; @start_global_value -0 +1 '# Setting to valid values in global scope#' "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 0" SET @@global.ROCKSDB_STATS_LEVEL = 0; @@ -22,7 +22,7 @@ SET @@global.ROCKSDB_STATS_LEVEL = DEFAULT; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 4" SET @@global.ROCKSDB_STATS_LEVEL = 4; SELECT @@global.ROCKSDB_STATS_LEVEL; @@ -32,7 +32,7 @@ SET @@global.ROCKSDB_STATS_LEVEL = DEFAULT; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 2" SET @@global.ROCKSDB_STATS_LEVEL = 2; SELECT @@global.ROCKSDB_STATS_LEVEL; @@ -42,7 +42,7 @@ SET @@global.ROCKSDB_STATS_LEVEL = DEFAULT; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@session.ROCKSDB_STATS_LEVEL to 444. It should fail because it is not session." 
SET @@session.ROCKSDB_STATS_LEVEL = 444; ERROR HY000: Variable 'rocksdb_stats_level' is a GLOBAL variable and should be set with SET GLOBAL @@ -52,34 +52,34 @@ Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to 'bbb'" SET @@global.ROCKSDB_STATS_LEVEL = 'bbb'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to '-1'" SET @@global.ROCKSDB_STATS_LEVEL = '-1'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to '101'" SET @@global.ROCKSDB_STATS_LEVEL = '101'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 "Trying to set variable @@global.ROCKSDB_STATS_LEVEL to '484436'" SET @@global.ROCKSDB_STATS_LEVEL = '484436'; Got one of the listed errors SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 SET @@global.ROCKSDB_STATS_LEVEL = @start_global_value; SELECT @@global.ROCKSDB_STATS_LEVEL; @@global.ROCKSDB_STATS_LEVEL -0 +1 DROP TABLE valid_values; DROP TABLE invalid_values; diff -Nru mariadb-10.11.11/storage/rocksdb/rdb_i_s.cc mariadb-10.11.13/storage/rocksdb/rdb_i_s.cc --- mariadb-10.11.11/storage/rocksdb/rdb_i_s.cc 2025-01-30 11:01:25.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rdb_i_s.cc 2025-05-19 16:14:26.000000000 +0000 @@ -587,8 +587,7 @@ cf_option_types.push_back( {"PREFIX_EXTRACTOR", opts.prefix_extractor == nullptr ? 
"NULL" - : std::string(opts.prefix_extractor->Name())}); - + : std::string(opts.prefix_extractor->AsString())}); // get COMPACTION_STYLE option switch (opts.compaction_style) { case rocksdb::kCompactionStyleLevel: @@ -646,7 +645,7 @@ // get table related options std::vector table_options = - split_into_vector(opts.table_factory->GetPrintableTableOptions(), '\n'); + split_into_vector(opts.table_factory->GetPrintableOptions(), '\n'); for (auto option : table_options) { option.erase(std::remove(option.begin(), option.end(), ' '), diff -Nru mariadb-10.11.11/storage/rocksdb/rdb_source_revision.h mariadb-10.11.13/storage/rocksdb/rdb_source_revision.h --- mariadb-10.11.11/storage/rocksdb/rdb_source_revision.h 2025-01-30 11:01:27.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rdb_source_revision.h 2025-05-19 16:14:28.000000000 +0000 @@ -1 +1 @@ -#define ROCKSDB_GIT_HASH "bba5e7bc21093d7cfa765e1280a7c4fdcd284288" +#define ROCKSDB_GIT_HASH "79f08d7ffa6d34d9ca3357777bcb335884a56cfb" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/cat_ignore_eagain 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,54 @@ +#! /bin/bash + +# Work around issue with parallel make output causing random error, as in +# make[1]: write error: stdout +# Probably due to a kernel bug: +# https://bugs.launchpad.net/ubuntu/+source/linux-signed/+bug/1814393 +# Seems to affect image ubuntu-1604:201903-01 and ubuntu-1604:202004-01 + +cd "$(dirname $0)" + +if [ ! 
-x cat_ignore_eagain.out ]; then + cc -x c -o cat_ignore_eagain.out - << EOF +#include +#include +#include +int main() { + int n, m, p; + char buf[1024]; + for (;;) { + n = read(STDIN_FILENO, buf, 1024); + if (n > 0 && n <= 1024) { + for (m = 0; m < n;) { + p = write(STDOUT_FILENO, buf + m, n - m); + if (p < 0) { + if (errno == EAGAIN) { + // ignore but pause a bit + usleep(100); + } else { + perror("write failed"); + return 42; + } + } else { + m += p; + } + } + } else if (n < 0) { + if (errno == EAGAIN) { + // ignore but pause a bit + usleep(100); + } else { + // Some non-ignorable error + perror("read failed"); + return 43; + } + } else { + // EOF + return 0; + } + } +} +EOF +fi + +exec ./cat_ignore_eagain.out diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/config.yml mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/config.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/config.yml 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/config.yml 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,872 @@ +version: 2.1 + +orbs: + win: circleci/windows@2.4.0 + slack: circleci/slack@3.4.2 + +aliases: + - ¬ify-on-main-failure + fail_only: true + only_for_branches: main + +commands: + install-cmake-on-macos: + steps: + - run: + name: Install cmake on macos + command: | + HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake + + install-jdk8-on-macos: + steps: + - run: + name: Install JDK 8 on macos + command: | + brew install --cask adoptopenjdk/openjdk/adoptopenjdk8 + + increase-max-open-files-on-macos: + steps: + - run: + name: Increase max open files + command: | + sudo sysctl -w kern.maxfiles=1048576 + sudo sysctl -w kern.maxfilesperproc=1048576 + sudo launchctl limit maxfiles 1048576 + + pre-steps: + steps: + - checkout + - run: + name: Setup Environment Variables + command: | + echo "export GTEST_THROW_ON_FAILURE=0" >> $BASH_ENV + echo "export GTEST_OUTPUT=\"xml:/tmp/test-results/\"" >> $BASH_ENV + echo "export 
SKIP_FORMAT_BUCK_CHECKS=1" >> $BASH_ENV + echo "export GTEST_COLOR=1" >> $BASH_ENV + echo "export CTEST_OUTPUT_ON_FAILURE=1" >> $BASH_ENV + echo "export CTEST_TEST_TIMEOUT=300" >> $BASH_ENV + echo "export ZLIB_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zlib" >> $BASH_ENV + echo "export BZIP2_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/bzip2" >> $BASH_ENV + echo "export SNAPPY_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/snappy" >> $BASH_ENV + echo "export LZ4_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/lz4" >> $BASH_ENV + echo "export ZSTD_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zstd" >> $BASH_ENV + + pre-steps-macos: + steps: + - pre-steps + + post-steps: + steps: + - slack/status: *notify-on-main-failure + - store_test_results: # store test result if there's any + path: /tmp/test-results + - store_artifacts: # store LOG for debugging if there's any + path: LOG + - run: # on fail, compress Test Logs for diagnosing the issue + name: Compress Test Logs + command: tar -cvzf t.tar.gz t + when: on_fail + - store_artifacts: # on fail, store Test Logs for diagnosing the issue + path: t.tar.gz + destination: test_logs + when: on_fail + + install-clang-10: + steps: + - run: + name: Install Clang 10 + command: | + echo "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main" | sudo tee -a /etc/apt/sources.list + echo "deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main" | sudo tee -a /etc/apt/sources.list + echo "APT::Acquire::Retries \"10\";" | sudo tee -a /etc/apt/apt.conf.d/80-retries # llvm.org unreliable + sudo apt-get update -y && sudo apt-get install -y clang-10 + + install-clang-13: + steps: + - run: + name: Install Clang 13 + command: | + echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-13 main" | sudo tee -a /etc/apt/sources.list + echo "deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-13 main" | sudo tee -a 
/etc/apt/sources.list + echo "APT::Acquire::Retries \"10\";" | sudo tee -a /etc/apt/apt.conf.d/80-retries # llvm.org unreliable + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - + sudo apt-get update -y && sudo apt-get install -y clang-13 + + install-gflags: + steps: + - run: + name: Install gflags + command: | + sudo apt-get update -y && sudo apt-get install -y libgflags-dev + + install-benchmark: + steps: + - run: # currently doesn't support ubuntu-1604 which doesn't have libbenchmark package, user can still install by building it youself + name: Install benchmark + command: | + sudo apt-get update -y && sudo apt-get install -y libbenchmark-dev + + install-librados: + steps: + - run: + name: Install librados + command: | + sudo apt-get update -y && sudo apt-get install -y librados-dev + + upgrade-cmake: + steps: + - run: + name: Upgrade cmake + command: | + sudo apt remove --purge cmake + sudo snap install cmake --classic + + install-gflags-on-macos: + steps: + - run: + name: Install gflags on macos + command: | + HOMEBREW_NO_AUTO_UPDATE=1 brew install gflags + + install-gtest-parallel: + steps: + - run: + name: Install gtest-parallel + command: | + git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel + echo 'export PATH=$HOME/gtest-parallel:$PATH' >> $BASH_ENV + + install-compression-libs: + steps: + - run: + name: Install compression libs + command: | + sudo apt-get update -y && sudo apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev + +executors: + windows-2xlarge: + machine: + image: 'windows-server-2019-vs2019:stable' + resource_class: windows.2xlarge + shell: bash.exe + +jobs: + build-macos: + macos: + xcode: 12.5.1 + resource_class: large + environment: + ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc cause env_test hang, disable it for now + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - pre-steps-macos + - run: ulimit -S -n 
1048576 && OPT=-DCIRCLECI make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-cmake: + macos: + xcode: 12.5.1 + resource_class: large + steps: + - increase-max-open-files-on-macos + - install-cmake-on-macos + - install-gflags-on-macos + - pre-steps-macos + - run: ulimit -S -n 1048576 && (mkdir build && cd build && cmake -DWITH_GFLAGS=1 .. && make V=1 -j32 && ctest -j10) 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-mem-env-librados: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-librados + - run: MEM_ENV=1 ROCKSDB_USE_LIBRADOS=1 make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-encrypted-env: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: ENCRYPTED_ENV=1 make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-shared_lib-alt_namespace-status_checked: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-release: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: if ./db_stress --version; then false; else true; fi # ensure without gflags + - install-gflags + - run: make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: ./db_stress 
--version # ensure with gflags + - post-steps + + build-linux-release-rtti: + machine: + image: ubuntu-1604:201903-01 + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: make clean + - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j8 static_lib tools db_bench 2>&1 | .circleci/cat_ignore_eagain + - run: if ./db_stress --version; then false; else true; fi # ensure without gflags + - run: sudo apt-get update -y && sudo apt-get install -y libgflags-dev + - run: make clean + - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j8 static_lib tools db_bench 2>&1 | .circleci/cat_ignore_eagain + - run: ./db_stress --version # ensure with gflags + + build-linux-lite: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: LITE=1 make V=1 J=32 -j32 check 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-lite-release: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: LITE=1 make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: if ./db_stress --version; then false; else true; fi # ensure without gflags + - install-gflags + - run: LITE=1 make V=1 -j8 release 2>&1 | .circleci/cat_ignore_eagain + - run: ./db_stress --version # ensure with gflags + - post-steps + + build-linux-clang-no_test_run: + machine: + image: ubuntu-1604:202104-01 + resource_class: xlarge + steps: + - checkout # check out the code in the project directory + - run: sudo apt-get update -y && sudo apt-get install -y clang libgflags-dev libtbb-dev + - run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-clang10-asan: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 
ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out + - post-steps + + build-linux-clang10-mini-tsan: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: COMPILE_WITH_TSAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out. + - post-steps + + build-linux-clang10-ubsan: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: COMPILE_WITH_UBSAN=1 OPT="-fsanitize-blacklist=.circleci/ubsan_suppression_list.txt" CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out + - post-steps + + build-linux-clang10-clang-analyze: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-clang-10 + - run: sudo apt-get update -y && sudo apt-get install -y clang-tools-10 + - run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze 2>&1 | .circleci/cat_ignore_eagain # aligned new doesn't work for reason we haven't figured out. For unknown, reason passing "clang++-10" as CLANG_ANALYZER doesn't work, and we need a full path. + - post-steps + + build-linux-cmake: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - upgrade-cmake + - run: (mkdir build && cd build && cmake -DWITH_GFLAGS=1 .. 
&& make V=1 -j20 && ctest -j20) 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-cmake-ubuntu-20: + machine: + image: ubuntu-2004:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-benchmark + - run: (mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20 && make microbench) 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-unity-and-headers: + docker: # executor type + - image: gcc:latest + resource_class: large + steps: + - checkout # check out the code in the project directory + - run: apt-get update -y && apt-get install -y libgflags-dev + - run: TEST_TMPDIR=/dev/shm && make V=1 -j8 unity_test 2>&1 | .circleci/cat_ignore_eagain + - run: make V=1 -j8 -k check-headers 2>&1 | .circleci/cat_ignore_eagain # could be moved to a different build + - post-steps + + build-linux-gcc-4_8-no_test_run: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-4.8 g++-4.8 libgflags-dev + - run: CC=gcc-4.8 CXX=g++-4.8 V=1 SKIP_LINK=1 make -j8 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-gcc-8-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: large + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-8 g++-8 libgflags-dev + - run: CC=gcc-8 CXX=g++-8 V=1 SKIP_LINK=1 make -j8 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-gcc-9-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: large + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-9 g++-9 libgflags-dev + - run: CC=gcc-9 CXX=g++-9 V=1 SKIP_LINK=1 make -j8 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + 
- post-steps + + build-linux-gcc-10-cxx20-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - run: sudo apt-get update -y && sudo apt-get install gcc-10 g++-10 libgflags-dev + - run: CC=gcc-10 CXX=g++-10 V=1 SKIP_LINK=1 ROCKSDB_CXX_STANDARD=c++20 make -j16 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-gcc-11-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - run: sudo apt-get update -y && sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get install gcc-11 g++-11 libgflags-dev + - run: CC=gcc-11 CXX=g++-11 V=1 SKIP_LINK=1 make -j16 all 2>&1 | .circleci/cat_ignore_eagain # Linking broken because libgflags compiled with newer ABI + - post-steps + + build-linux-clang-13-no_test_run: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - install-clang-13 + - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j16 all 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + # This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing. 
+ build-linux-microbench: + machine: + image: ubuntu-2004:202010-01 + resource_class: xlarge + steps: + - pre-steps + - install-benchmark + - run: DEBUG_LEVEL=0 make microbench 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-windows: + executor: windows-2xlarge + parameters: + extra_cmake_opt: + default: "" + type: string + vs_year: + default: "2019" + type: string + cmake_generator: + default: "Visual Studio 16 2019" + type: string + environment: + THIRDPARTY_HOME: C:/Users/circleci/thirdparty + CMAKE_HOME: C:/Users/circleci/thirdparty/cmake-3.16.4-win64-x64 + CMAKE_BIN: C:/Users/circleci/thirdparty/cmake-3.16.4-win64-x64/bin/cmake.exe + SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.7 + SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.7;C:/Users/circleci/thirdparty/snappy-1.1.7/build + SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.7/build/Debug/snappy.lib + VS_YEAR: <> + CMAKE_GENERATOR: <> + steps: + - checkout + - run: + name: "Setup VS" + command: | + if [[ "${VS_YEAR}" == "2019" ]]; then + echo "VS2019 already present." + elif [[ "${VS_YEAR}" == "2017" ]]; then + echo "Installing VS2017..." + powershell .circleci/vs2017_install.ps1 + elif [[ "${VS_YEAR}" == "2015" ]]; then + echo "Installing VS2015..." + powershell .circleci/vs2015_install.ps1 + fi + - store_artifacts: + path: \Users\circleci\AppData\Local\Temp\vslogs.zip + - run: + name: "Install thirdparty dependencies" + command: | + mkdir ${THIRDPARTY_HOME} + cd ${THIRDPARTY_HOME} + echo "Installing CMake..." + curl --fail --silent --show-error --output cmake-3.16.4-win64-x64.zip --location https://github.com/Kitware/CMake/releases/download/v3.16.4/cmake-3.16.4-win64-x64.zip + unzip -q cmake-3.16.4-win64-x64.zip + echo "Building Snappy dependency..." 
+ curl --fail --silent --show-error --output snappy-1.1.7.zip --location https://github.com/google/snappy/archive/1.1.7.zip + unzip -q snappy-1.1.7.zip + cd snappy-1.1.7 + mkdir build + cd build + ${CMAKE_BIN} -G "${CMAKE_GENERATOR}" .. + msbuild.exe Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + - run: + name: "Build RocksDB" + command: | + mkdir build + cd build + ${CMAKE_BIN} -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DJNI=1 << parameters.extra_cmake_opt >> .. + cd .. + echo "Building with VS version: ${CMAKE_GENERATOR}" + msbuild.exe build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + - run: + name: "Test RocksDB" + shell: powershell.exe + command: | + build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 + + build-linux-java: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + environment: + JAVA_HOME: /usr/lib/jvm/java-1.8.0-openjdk-amd64 + steps: + - pre-steps + - install-gflags + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava Shared Library" + command: make V=1 J=8 -j8 rocksdbjava 2>&1 | .circleci/cat_ignore_eagain + - run: + name: "Test RocksDBJava" + command: make V=1 J=8 -j8 jtest 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-java-static: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + environment: + JAVA_HOME: /usr/lib/jvm/java-1.8.0-openjdk-amd64 + steps: + - pre-steps + - install-gflags + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java 
-version + which javac && javac -version + - run: + name: "Build RocksDBJava Static Library" + command: make V=1 J=8 -j8 rocksdbjavastatic 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-java: + macos: + xcode: 12.5.1 + resource_class: medium + environment: + JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc causes java 8 crash + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - install-jdk8-on-macos + - pre-steps-macos + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava Shared Library" + command: make V=1 J=8 -j8 rocksdbjava 2>&1 | .circleci/cat_ignore_eagain + - run: + name: "Test RocksDBJava" + command: make V=1 J=8 -j8 jtest 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-java-static: + macos: + xcode: 12.5.1 + resource_class: medium + environment: + JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - install-cmake-on-macos + - install-jdk8-on-macos + - pre-steps-macos + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava x86 and ARM Static Libraries" + command: make V=1 J=8 -j8 rocksdbjavastaticosx 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-macos-java-static-universal: + macos: + xcode: 12.5.1 + resource_class: medium + environment: + JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + steps: + - increase-max-open-files-on-macos + - install-gflags-on-macos + - install-cmake-on-macos + - install-jdk8-on-macos + - 
pre-steps-macos + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build RocksDBJava Universal Binary Static Library" + command: make V=1 J=8 -j8 rocksdbjavastaticosx_ub 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-examples: + machine: + image: ubuntu-1604:202104-01 + resource_class: large + steps: + - pre-steps + - install-gflags + - run: + name: "Build examples" + command: | + OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 | ../.circleci/cat_ignore_eagain + - post-steps + + build-cmake-mingw: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - run: sudo apt-get update -y && sudo apt-get install -y mingw-w64 + - run: sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix + - run: + name: "Build cmake-mingw" + command: | + sudo apt-get install snapd && sudo snap install cmake --beta --classic + export PATH=/snap/bin:$PATH + sudo apt-get install -y openjdk-8-jdk + export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 + export PATH=$JAVA_HOME/bin:$PATH + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version + mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. 
-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni + - post-steps + + build-linux-non-shm: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + parameters: + start_test: + default: "" + type: string + end_test: + default: "" + type: string + steps: + - pre-steps + - install-gflags + - install-gtest-parallel + - run: + name: "Build unit tests" + command: | + echo "env: $(env)" + echo "** done env" + ROCKSDBTESTS_START=<> ROCKSDBTESTS_END=<> ROCKSDBTESTS_SUBSET_TESTS_TO_FILE=/tmp/test_list make V=1 -j32 --output-sync=target build_subset_tests + - run: + name: "Run unit tests in parallel" + command: | + sed -i 's/[[:space:]]*$//; s/ / \.\//g; s/.*/.\/&/' /tmp/test_list + cat /tmp/test_list + export TEST_TMPDIR=/tmp/rocksdb_test_tmp + gtest-parallel $(&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-arm: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.large + steps: + - pre-steps + - install-gflags + - run: ROCKSDBTESTS_PLATFORM_DEPENDENT=only make V=1 J=4 -j4 all_but_some_tests check_some 2>&1 | .circleci/cat_ignore_eagain + - post-steps + + build-linux-arm-cmake-no_test_run: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.large + environment: + JAVA_HOME: /usr/lib/jvm/java-8-openjdk-arm64 + steps: + - pre-steps + - install-gflags + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && java -version + which javac && javac -version + - run: + name: "Build with cmake" + command: | + mkdir build + cd build + cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=1 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 .. + make -j4 + - run: + name: "Build Java with cmake" + command: | + rm -rf build + mkdir build + cd build + cmake -DJNI=1 -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=1 .. 
+ make -j4 rocksdb rocksdbjni + - post-steps + + build-format-compatible: + machine: + image: ubuntu-1604:202104-01 + resource_class: 2xlarge + steps: + - pre-steps + - install-gflags + - install-compression-libs + - run: + name: "test" + command: | + export TEST_TMPDIR=/dev/shm/rocksdb + rm -rf /dev/shm/rocksdb + mkdir /dev/shm/rocksdb + tools/check_format_compatible.sh + - post-steps + +workflows: + version: 2 + build-linux: + jobs: + - build-linux + build-linux-cmake: + jobs: + - build-linux-cmake + - build-linux-cmake-ubuntu-20 + build-linux-mem-env-librados: + jobs: + - build-linux-mem-env-librados + build-linux-encrypted-env: + jobs: + - build-linux-encrypted-env + build-linux-shared_lib-alt_namespace-status_checked: + jobs: + - build-linux-shared_lib-alt_namespace-status_checked + build-linux-lite: + jobs: + - build-linux-lite + build-linux-release: + jobs: + - build-linux-release + build-linux-release-rtti: + jobs: + - build-linux-release-rtti + build-linux-lite-release: + jobs: + - build-linux-lite-release + build-linux-clang10-asan: + jobs: + - build-linux-clang10-asan + build-linux-clang10-mini-tsan: + jobs: + - build-linux-clang10-mini-tsan + build-linux-clang10-ubsan: + jobs: + - build-linux-clang10-ubsan + build-linux-clang10-clang-analyze: + jobs: + - build-linux-clang10-clang-analyze + build-linux-unity-and-headers: + jobs: + - build-linux-unity-and-headers + build-windows-vs2019: + jobs: + - build-windows: + name: "build-windows-vs2019" + build-windows-vs2019-cxx20: + jobs: + - build-windows: + name: "build-windows-vs2019-cxx20" + extra_cmake_opt: -DCMAKE_CXX_STANDARD=20 + build-windows-vs2017: + jobs: + - build-windows: + name: "build-windows-vs2017" + vs_year: "2017" + cmake_generator: "Visual Studio 15 Win64" + build-java: + jobs: + - build-linux-java + - build-linux-java-static + - build-macos-java + - build-macos-java-static + - build-macos-java-static-universal + build-examples: + jobs: + - build-examples + build-linux-non-shm: + jobs: + - 
build-linux-non-shm: + start_test: "" + end_test: "db_options_test" # make sure unique in src.mk + - build-linux-non-shm: + start_test: "db_options_test" # make sure unique in src.mk + end_test: "filename_test" # make sure unique in src.mk + - build-linux-non-shm: + start_test: "filename_test" # make sure unique in src.mk + end_test: "statistics_test" # make sure unique in src.mk + - build-linux-non-shm: + start_test: "statistics_test" # make sure unique in src.mk + end_test: "" + build-linux-compilers-no_test_run: + jobs: + - build-linux-clang-no_test_run + - build-linux-clang-13-no_test_run + - build-linux-gcc-4_8-no_test_run + - build-linux-gcc-8-no_test_run + - build-linux-gcc-9-no_test_run + - build-linux-gcc-10-cxx20-no_test_run + - build-linux-gcc-11-no_test_run + - build-linux-arm-cmake-no_test_run + build-macos: + jobs: + - build-macos + build-macos-cmake: + jobs: + - build-macos-cmake + build-cmake-mingw: + jobs: + - build-cmake-mingw + build-linux-arm: + jobs: + - build-linux-arm + build-microbench: + jobs: + - build-linux-microbench + nightly: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - main + jobs: + - build-format-compatible + - build-linux-arm-test-full diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/ubsan_suppression_list.txt 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,6 @@ +# Supress UBSAN warnings related to stl_tree.h, e.g. 
+# UndefinedBehaviorSanitizer: undefined-behavior /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43 in +# /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43: +# runtime error: upcast of address 0x000001fa8820 with insufficient space for an object of type +# 'std::_Rb_tree_node, rocksdb::(anonymous namespace)::LockHoldingInfo> >' +src:*bits/stl_tree.h diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2015_install.ps1 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,24 @@ +$VS_DOWNLOAD_LINK = "https://go.microsoft.com/fwlink/?LinkId=691126" +$COLLECT_DOWNLOAD_LINK = "https://aka.ms/vscollect.exe" +curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe +if ($LASTEXITCODE -ne 0) { + echo "Download of the VS 2015 installer failed" + exit 1 +} +$VS_INSTALL_ARGS = @("/Quiet", "/NoRestart") +$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru +Remove-Item -Path vs_installer.exe -Force +$exitCode = $process.ExitCode +if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { + echo "VS 2015 installer exited with code $exitCode, which should be one of [0, 3010]." + curl.exe --retry 3 -kL $COLLECT_DOWNLOAD_LINK --output Collect.exe + if ($LASTEXITCODE -ne 0) { + echo "Download of the VS Collect tool failed." + exit 1 + } + Start-Process "${PWD}\Collect.exe" -NoNewWindow -Wait -PassThru + New-Item -Path "C:\w\build-results" -ItemType "directory" -Force + Copy-Item -Path "C:\Users\circleci\AppData\Local\Temp\vslogs.zip" -Destination "C:\w\build-results\" + exit 1 +} +echo "VS 2015 installed." 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 --- mariadb-10.11.11/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.circleci/vs2017_install.ps1 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,35 @@ +$VS_DOWNLOAD_LINK = "https://aka.ms/vs/15/release/vs_buildtools.exe" +$COLLECT_DOWNLOAD_LINK = "https://aka.ms/vscollect.exe" +$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", + "--add Microsoft.VisualStudio.Component.VC.Tools.14.13", + "--add Microsoft.Component.MSBuild", + "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", + "--add Microsoft.VisualStudio.Component.TextTemplating", + "--add Microsoft.VisualStudio.Component.VC.CoreIde", + "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", + "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", + "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81") + +curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe +if ($LASTEXITCODE -ne 0) { + echo "Download of the VS 2017 installer failed" + exit 1 +} + +$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru +Remove-Item -Path vs_installer.exe -Force +$exitCode = $process.ExitCode +if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { + echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." + curl.exe --retry 3 -kL $COLLECT_DOWNLOAD_LINK --output Collect.exe + if ($LASTEXITCODE -ne 0) { + echo "Download of the VS Collect tool failed." 
+ exit 1 + } + Start-Process "${PWD}\Collect.exe" -NoNewWindow -Wait -PassThru + New-Item -Path "C:\w\build-results" -ItemType "directory" -Force + Copy-Item -Path "C:\Users\circleci\AppData\Local\Temp\vslogs.zip" -Destination "C:\w\build-results\" + exit 1 +} +echo "VS 2017 installed." diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml mariadb-10.11.13/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.github/workflows/sanity_check.yml 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +name: Check buck targets and code format +on: [push, pull_request] +jobs: + check: + name: Check TARGETS file and code format + runs-on: ubuntu-latest + steps: + - name: Checkout feature branch + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Fetch from upstream + run: | + git remote add upstream https://github.com/facebook/rocksdb.git && git fetch upstream + + - name: Where am I + run: | + echo git status && git status + echo "git remote -v" && git remote -v + echo git branch && git branch + + - name: Setup Python + uses: actions/setup-python@v1 + + - name: Install Dependencies + run: python -m pip install --upgrade pip + + - name: Install argparse + run: pip install argparse + + - name: Download clang-format-diff.py + uses: wei/wget@v1 + with: + args: https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py + + - name: Check format + run: VERBOSE_CHECK=1 make check-format + + - name: Compare buckify output + run: make check-buck-targets + + - name: Simple source code checks + run: make check-sources diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.gitignore mariadb-10.11.13/storage/rocksdb/rocksdb/.gitignore --- mariadb-10.11.11/storage/rocksdb/rocksdb/.gitignore 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/.gitignore 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,5 @@ make_config.mk +rocksdb.pc *.a *.arc @@ -7,6 +8,7 @@ *.gcda *.gcno *.o +*.o.tmp *.so *.so.* *_test @@ -34,6 +36,7 @@ sst_dump blob_dump block_cache_trace_analyzer +db_with_timestamp_basic_test tools/block_cache_analyzer/*.pyc column_aware_encoding_exp util/build_version.cc @@ -51,6 +54,7 @@ trace_analyzer trace_analyzer_test block_cache_trace_analyzer +io_tracer_parser .DS_Store .vs .vscode @@ -82,3 +86,12 @@ fbcode buckifier/*.pyc buckifier/__pycache__ + +compile_commands.json +clang-format-diff.py +.py3/ + +fuzz/proto/gen/ +fuzz/crash-* + +cmake-build-* diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/.travis.yml mariadb-10.11.13/storage/rocksdb/rocksdb/.travis.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/.travis.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/.travis.yml 2025-05-19 16:14:27.000000000 +0000 @@ -2,18 +2,19 @@ language: cpp os: - linux - - osx +arch: + - arm64 + - ppc64le + - s390x compiler: - clang - gcc -osx_image: xcode9.4 -jdk: - - openjdk7 cache: - ccache addons: apt: + update: true sources: - ubuntu-toolchain-r-test packages: @@ -24,15 +25,6 @@ - liblzma-dev # xv - libzstd-dev - zlib1g-dev - homebrew: - update: true - packages: - - ccache - - gflags - - lz4 - - snappy - - xz - - zstd env: - TEST_GROUP=platform_dependent # 16-18 minutes @@ -48,43 +40,209 @@ - JOB_NAME=examples # 5-7 minutes - JOB_NAME=cmake # 3-5 minutes - JOB_NAME=cmake-gcc8 # 3-5 minutes + - JOB_NAME=cmake-gcc9 # 3-5 minutes + - JOB_NAME=cmake-gcc9-c++20 # 3-5 minutes - JOB_NAME=cmake-mingw # 3 minutes + - JOB_NAME=make-gcc4.8 + - JOB_NAME=status_checked matrix: exclude: - - os: osx + - os : linux + arch: arm64 + env: JOB_NAME=cmake-mingw + - os : linux + arch: arm64 + env: JOB_NAME=make-gcc4.8 + - os: linux + arch: ppc64le + env: JOB_NAME=cmake-mingw + - os: linux + arch: ppc64le + env: JOB_NAME=make-gcc4.8 + - os: linux + arch: 
s390x + env: JOB_NAME=cmake-mingw + - os: linux + arch: s390x + env: JOB_NAME=make-gcc4.8 + - os: linux + compiler: clang + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: arm64 + env: TEST_GROUP=platform_dependent + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: TEST_GROUP=1 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: TEST_GROUP=1 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x env: TEST_GROUP=1 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 env: TEST_GROUP=2 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: TEST_GROUP=2 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: TEST_GROUP=2 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: TEST_GROUP=3 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le env: TEST_GROUP=3 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: TEST_GROUP=3 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: TEST_GROUP=4 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le env: TEST_GROUP=4 - - os: osx + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: TEST_GROUP=4 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=cmake + - if: type = pull_request AND commit_message !~ /FULL_CI/ AND commit_message !~ /java/ + os : linux + arch: arm64 + env: JOB_NAME=java_test + - if: type = pull_request AND commit_message !~ /FULL_CI/ AND commit_message !~ /java/ + os: linux + arch: ppc64le + env: JOB_NAME=java_test + - if: type = pull_request AND 
commit_message !~ /FULL_CI/ AND commit_message !~ /java/ + os: linux + arch: s390x + env: JOB_NAME=java_test + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=lite_build + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=lite_build + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=lite_build + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=examples + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=examples + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=examples + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 env: JOB_NAME=cmake-gcc8 - - os : osx - env: JOB_NAME=cmake-mingw - - os : linux - compiler: clang - - os : osx - compiler: gcc + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=cmake-gcc8 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=cmake-gcc8 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=cmake-gcc9 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=cmake-gcc9 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=cmake-gcc9 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : linux + arch: arm64 + env: JOB_NAME=cmake-gcc9-c++20 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=cmake-gcc9-c++20 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=cmake-gcc9-c++20 + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os : 
linux + arch: arm64 + env: JOB_NAME=status_checked + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: ppc64le + env: JOB_NAME=status_checked + - if: type = pull_request AND commit_message !~ /FULL_CI/ + os: linux + arch: s390x + env: JOB_NAME=status_checked install: - - if [ "${TRAVIS_OS_NAME}" == osx ]; then - PATH=$PATH:/usr/local/opt/ccache/libexec; - fi - if [ "${JOB_NAME}" == cmake-gcc8 ]; then - sudo apt-get install -y g++-8; + sudo apt-get install -y g++-8 || exit $?; CC=gcc-8 && CXX=g++-8; fi + - if [ "${JOB_NAME}" == cmake-gcc9 ] || [ "${JOB_NAME}" == cmake-gcc9-c++20 ]; then + sudo apt-get install -y g++-9 || exit $?; + CC=gcc-9 && CXX=g++-9; + fi - if [ "${JOB_NAME}" == cmake-mingw ]; then - sudo apt-get install -y mingw-w64 ; + sudo apt-get install -y mingw-w64 || exit $?; + fi + - if [ "${JOB_NAME}" == make-gcc4.8 ]; then + sudo apt-get install -y g++-4.8 || exit $?; + CC=gcc-4.8 && CXX=g++-4.8; fi - - if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then - mkdir cmake-dist && curl --silent --fail --show-error --location https://github.com/Kitware/CMake/releases/download/v3.14.5/cmake-3.14.5-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + - | + if [[ "${JOB_NAME}" == cmake* ]]; then + sudo apt-get remove -y cmake cmake-data + export CMAKE_DEB="cmake-3.14.5-Linux-$(uname -m).deb" + export CMAKE_DEB_URL="https://rocksdb-deps.s3-us-west-2.amazonaws.com/cmake/${CMAKE_DEB}" + curl --silent --fail --show-error --location --output "${CMAKE_DEB}" "${CMAKE_DEB_URL}" || exit $? + sudo dpkg -i "${CMAKE_DEB}" || exit $? + which cmake && cmake --version fi - - if [[ "${JOB_NAME}" == java_test ]]; then - java -version && echo "JAVA_HOME=${JAVA_HOME}"; + - | + if [[ "${JOB_NAME}" == java_test || "${JOB_NAME}" == cmake* ]]; then + # Ensure JDK 8 + sudo apt-get install -y openjdk-8-jdk || exit $? 
+ export PATH=/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture)/bin:$PATH + export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-$(dpkg --print-architecture) + echo "JAVA_HOME=${JAVA_HOME}" + which java && java -version + which javac && javac -version fi before_script: @@ -93,41 +251,53 @@ - ulimit -n 8192 script: - - ${CXX} --version + - date; ${CXX} --version - if [ `command -v ccache` ]; then ccache -C; fi - case $TEST_GROUP in platform_dependent) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=only make -j4 all_but_some_tests check_some ;; 1) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=db_iter_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_END=backupable_db_test make -j4 check_some ;; 2) - OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" V=1 make -j4 tools && OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" V=1 ROCKSDBTESTS_START=db_iter_test ROCKSDBTESTS_END=options_file_test make -j4 check_some + OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 make -j4 tools && OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=backupable_db_test ROCKSDBTESTS_END=db_universal_compaction_test make -j4 check_some ;; 3) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=options_file_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=db_universal_compaction_test ROCKSDBTESTS_END=table_properties_collector_test make -j4 check_some ;; 4) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_prepared_transaction_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude 
ROCKSDBTESTS_START=table_properties_collector_test make -j4 check_some ;; esac - case $JOB_NAME in java_test) - OPT=-DTRAVIS V=1 make rocksdbjava jtest + OPT=-DTRAVIS LIB_MODE=shared V=1 make rocksdbjava jtest ;; lite_build) - OPT='-DTRAVIS -DROCKSDB_LITE' V=1 make -j4 static_lib tools + OPT='-DTRAVIS -DROCKSDB_LITE' LIB_MODE=shared V=1 make -j4 all ;; examples) - OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 + OPT=-DTRAVIS LIB_MODE=shared V=1 make -j4 static_lib && cd examples && make -j4 ;; cmake-mingw) sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix; mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni ;; cmake*) - mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_BUILD_TYPE=Release && make -j4 rocksdb rocksdbjni + case $JOB_NAME in + *-c++20) + OPT=-DCMAKE_CXX_STANDARD=20 + ;; + esac + + mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=0 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 .. && make -j4 && cd .. && rm -rf build && mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_BUILD_TYPE=Release $OPT && make -j4 rocksdb rocksdbjni + ;; + make-gcc4.8) + OPT=-DTRAVIS LIB_MODE=shared V=1 SKIP_LINK=1 make -j4 all && [ "Linking broken because libgflags compiled with newer ABI" ] + ;; + status_checked) + OPT=-DTRAVIS LIB_MODE=shared V=1 ASSERT_STATUS_CHECKED=1 make -j4 check_some ;; esac notifications: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -32,10 +32,11 @@ # 3. cmake .. # 4. 
make -j -cmake_minimum_required(VERSION 3.5.1) +cmake_minimum_required(VERSION 3.10) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(ReadVersion) +include(GoogleTest) get_rocksdb_version(rocksdb_VERSION) project(rocksdb VERSION ${rocksdb_VERSION} @@ -62,6 +63,7 @@ endif(CCACHE_FOUND) option(WITH_JEMALLOC "build with JeMalloc" OFF) +option(WITH_LIBURING "build with liburing" ON) option(WITH_SNAPPY "build with SNAPPY" OFF) option(WITH_LZ4 "build with lz4" OFF) option(WITH_ZLIB "build with zlib" OFF) @@ -70,6 +72,12 @@ if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() + +if ($ENV{CIRCLECI}) + message(STATUS "Build for CircieCI env, a few tests may be disabled") + add_definitions(-DCIRCLECI) +endif() + # third-party/folly is only validated to work on Linux and Windows for now. # So only turn it on there by default. if(CMAKE_SYSTEM_NAME MATCHES "Linux|Windows") @@ -83,15 +91,18 @@ option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) endif() +if( NOT DEFINED CMAKE_CXX_STANDARD ) + set(CMAKE_CXX_STANDARD 11) +endif() + include(CMakeDependentOption) -CMAKE_DEPENDENT_OPTION(WITH_GFLAGS "build with GFlags" ON - "NOT MSVC;NOT MINGW" OFF) if(MSVC) + option(WITH_GFLAGS "build with GFlags" OFF) option(WITH_XPRESS "build with windows built in compression" OFF) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) else() - if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD" AND NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") # FreeBSD has jemalloc as default malloc # but it does not have all the jemalloc files in include/... 
set(WITH_JEMALLOC ON) @@ -103,18 +114,40 @@ endif() endif() - # No config file for this + if(MINGW) + option(WITH_GFLAGS "build with GFlags" OFF) + else() + option(WITH_GFLAGS "build with GFlags" ON) + endif() + set(GFLAGS_LIB) if(WITH_GFLAGS) - find_package(gflags REQUIRED) + # Config with namespace available since gflags 2.2.2 + option(GFLAGS_USE_TARGET_NAMESPACE "Use gflags import target with namespace." ON) + find_package(gflags CONFIG) + if(gflags_FOUND) + if(TARGET ${GFLAGS_TARGET}) + # Config with GFLAGS_TARGET available since gflags 2.2.0 + set(GFLAGS_LIB ${GFLAGS_TARGET}) + else() + # Config with GFLAGS_LIBRARIES available since gflags 2.1.0 + set(GFLAGS_LIB ${gflags_LIBRARIES}) + endif() + else() + find_package(gflags REQUIRED) + set(GFLAGS_LIB gflags::gflags) + endif() + include_directories(${GFLAGS_INCLUDE_DIR}) + list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB}) add_definitions(-DGFLAGS=1) - include_directories(${gflags_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS gflags::gflags) endif() if(WITH_SNAPPY) - find_package(snappy REQUIRED) + find_package(Snappy CONFIG) + if(NOT Snappy_FOUND) + find_package(Snappy REQUIRED) + endif() add_definitions(-DSNAPPY) - list(APPEND THIRDPARTY_LIBS snappy::snappy) + list(APPEND THIRDPARTY_LIBS Snappy::snappy) endif() if(WITH_ZLIB) @@ -149,23 +182,25 @@ endif() endif() -string(TIMESTAMP TS "%Y/%m/%d %H:%M:%S" UTC) -set(GIT_DATE_TIME "${TS}" CACHE STRING "the time we first built rocksdb") +string(TIMESTAMP TS "%Y-%m-%d %H:%M:%S" UTC) +set(BUILD_DATE "${TS}" CACHE STRING "the time we first built rocksdb") find_package(Git) if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") - if(WIN32) - execute_process(COMMAND $ENV{COMSPEC} /C ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) - else() - execute_process(COMMAND ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE 
GIT_SHA COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD ) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" RESULT_VARIABLE GIT_MOD COMMAND "${GIT_EXECUTABLE}" diff-index HEAD --quiet) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DATE COMMAND "${GIT_EXECUTABLE}" log -1 --date=format:"%Y-%m-%d %T" --format="%ad") + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG RESULT_VARIABLE rv COMMAND "${GIT_EXECUTABLE}" symbolic-ref -q --short HEAD OUTPUT_STRIP_TRAILING_WHITESPACE) + if (rv AND NOT rv EQUAL 0) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match OUTPUT_STRIP_TRAILING_WHITESPACE) endif() else() set(GIT_SHA 0) + set(GIT_MOD 1) endif() - -string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") - +string(REGEX REPLACE "[^0-9a-fA-F]+" "" GIT_SHA "${GIT_SHA}") +string(REGEX REPLACE "[^0-9: /-]+" "" GIT_DATE "${GIT_DATE}") option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) @@ -178,20 +213,20 @@ set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) -add_library(build_version OBJECT ${BUILD_VERSION_CC}) -target_include_directories(build_version PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/util) + if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") + if(CMAKE_SYSTEM_PROCESSOR 
MATCHES "x86_64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wstrict-prototypes") + endif() if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format -fno-asynchronous-unwind-tables") add_definitions(-D_POSIX_C_SOURCE=1) endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") include(CheckCXXCompilerFlag) @@ -203,49 +238,91 @@ endif() include(CheckCCompilerFlag) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") + CHECK_C_COMPILER_FLAG("-mcpu=power9" HAS_POWER9) + if(HAS_POWER9) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power9 -mtune=power9") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power9 -mtune=power9") + else() + CHECK_C_COMPILER_FLAG("-mcpu=power8" HAS_POWER8) + if(HAS_POWER8) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power8 -mtune=power8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power8 -mtune=power8") + endif(HAS_POWER8) + endif(HAS_POWER9) CHECK_C_COMPILER_FLAG("-maltivec" HAS_ALTIVEC) if(HAS_ALTIVEC) message(STATUS " HAS_ALTIVEC yes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maltivec") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power8") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power8") endif(HAS_ALTIVEC) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64") CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC) if(HAS_ARMV8_CRC) message(STATUS " HAS_ARMV8_CRC yes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") endif(HAS_ARMV8_CRC) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +endif(CMAKE_SYSTEM_PROCESSOR MATCHES 
"arm64|aarch64|AARCH64") + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") + CHECK_C_COMPILER_FLAG("-march=native" HAS_S390X_MARCH_NATIVE) + if(HAS_S390X_MARCH_NATIVE) + message(STATUS " HAS_S390X_MARCH_NATIVE yes") + endif(HAS_S390X_MARCH_NATIVE) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) +option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF) +option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF) if(PORTABLE) # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h # is available, it is available by default. if(FORCE_SSE42 AND NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul") endif() + if(MSVC) + if(FORCE_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") + endif() + # MSVC automatically enables BMI / lzcnt with AVX2. + if(FORCE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + endif() + else() + if(FORCE_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") + endif() + if(FORCE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt") + endif() + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") + endif() + endif() else() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") else() - if(NOT HAVE_POWER8 AND NOT HAS_ARMV8_CRC) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") + elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() endif() endif() include(CheckCXXSourceCompiles) +set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() -CHECK_CXX_SOURCE_COMPILES(" + +if (NOT PORTABLE OR FORCE_SSE42) + CHECK_CXX_SOURCE_COMPILES(" #include 
#include #include @@ -257,26 +334,66 @@ auto d = _mm_cvtsi128_si64(c); } " HAVE_SSE42) -unset(CMAKE_REQUIRED_FLAGS) -if(HAVE_SSE42) - add_definitions(-DHAVE_SSE42) - add_definitions(-DHAVE_PCLMUL) -elseif(FORCE_SSE42) - message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") + if(HAVE_SSE42) + add_definitions(-DHAVE_SSE42) + add_definitions(-DHAVE_PCLMUL) + elseif(FORCE_SSE42) + message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") + endif() +endif() + +# Check if -latomic is required or not +if (NOT MSVC) + set(CMAKE_REQUIRED_FLAGS "--std=c++11") + CHECK_CXX_SOURCE_COMPILES(" +#include +std::atomic x(0); +int main() { + uint64_t i = x.load(std::memory_order_relaxed); + bool b = x.is_lock_free(); + return 0; +} +" BUILTIN_ATOMIC) + if (NOT BUILTIN_ATOMIC) + #TODO: Check if -latomic exists + list(APPEND THIRDPARTY_LIBS atomic) + endif() endif() +if (WITH_LIBURING) + find_package(uring) + if (uring_FOUND) + add_definitions(-DROCKSDB_IOURING_PRESENT) + list(APPEND THIRDPARTY_LIBS uring::uring) + endif() +endif() + +# Reset the required flags +set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + CHECK_CXX_SOURCE_COMPILES(" #if defined(_MSC_VER) && !defined(__thread) #define __thread __declspec(thread) #endif int main() { static __thread int tls; + (void)tls; } " HAVE_THREAD_LOCAL) if(HAVE_THREAD_LOCAL) add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL) endif() +option(WITH_IOSTATS_CONTEXT "Enable IO stats context" ON) +if (NOT WITH_IOSTATS_CONTEXT) + add_definitions(-DNIOSTATS_CONTEXT) +endif() + +option(WITH_PERF_CONTEXT "Enable perf context" ON) +if (NOT WITH_PERF_CONTEXT) + add_definitions(-DNPERF_CONTEXT) +endif() + option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON) if(FAIL_ON_WARNINGS) if(MSVC) @@ -343,6 +460,12 @@ add_definitions(-DROCKSDB_NO_DYNAMIC_EXTENSION) endif() +option(ASSERT_STATUS_CHECKED "build with assert status checked" OFF) +if (ASSERT_STATUS_CHECKED) + message(STATUS "Build with assert 
status checked") + add_definitions(-DROCKSDB_ASSERT_STATUS_CHECKED) +endif() + if(DEFINED USE_RTTI) if(USE_RTTI) message(STATUS "Enabling RTTI") @@ -377,7 +500,15 @@ message(STATUS "Debug optimization is enabled") set(CMAKE_CXX_FLAGS_DEBUG "/Oxt") else() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1") + + # Minimal Build is deprecated after MSVC 2015 + if( MSVC_VERSION GREATER 1900 ) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-") + else() + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm") + endif() + endif() if(WITH_RUNTIME_DEBUG) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d") @@ -404,15 +535,12 @@ add_definitions(-fno-builtin-memcmp -DCYGWIN) elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") add_definitions(-DOS_MACOSX) - if(CMAKE_SYSTEM_PROCESSOR MATCHES arm) - add_definitions(-DIOS_CROSS_COMPILE -DROCKSDB_LITE) - # no debug info for IOS, that will make our library big - add_definitions(-DNDEBUG) - endif() elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") add_definitions(-DOS_LINUX) elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") add_definitions(-DOS_SOLARIS) +elseif(CMAKE_SYSTEM_NAME MATCHES "kFreeBSD") + add_definitions(-DOS_GNU_KFREEBSD) elseif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") add_definitions(-DOS_FREEBSD) elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD") @@ -471,7 +599,11 @@ endif() include(CheckCXXSymbolExists) -check_cxx_symbol_exists(malloc_usable_size malloc.h HAVE_MALLOC_USABLE_SIZE) +if(CMAKE_SYSTEM_NAME MATCHES "^FreeBSD") + check_cxx_symbol_exists(malloc_usable_size malloc_np.h HAVE_MALLOC_USABLE_SIZE) +else() + check_cxx_symbol_exists(malloc_usable_size malloc.h HAVE_MALLOC_USABLE_SIZE) +endif() if(HAVE_MALLOC_USABLE_SIZE) add_definitions(-DROCKSDB_MALLOC_USABLE_SIZE) endif() @@ -481,9 +613,18 @@ add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT) endif() +check_cxx_symbol_exists(getauxval auvx.h HAVE_AUXV_GETAUXVAL) +if(HAVE_AUXV_GETAUXVAL) 
+ add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT) +endif() + +check_cxx_symbol_exists(F_FULLFSYNC "fcntl.h" HAVE_FULLFSYNC) +if(HAVE_FULLFSYNC) + add_definitions(-DHAVE_FULLFSYNC) +endif() + include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) -include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.8.1/fused-src) if(WITH_FOLLY_DISTRIBUTED_MUTEX) include_directories(${PROJECT_SOURCE_DIR}/third-party/folly) endif() @@ -492,14 +633,29 @@ # Main library source code set(SOURCES + cache/cache.cc + cache/cache_entry_roles.cc + cache/cache_key.cc + cache/cache_reservation_manager.cc cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc db/arena_wrapped_db_iter.cc + db/blob/blob_fetcher.cc + db/blob/blob_file_addition.cc + db/blob/blob_file_builder.cc + db/blob/blob_file_cache.cc + db/blob/blob_file_garbage.cc + db/blob/blob_file_meta.cc + db/blob/blob_file_reader.cc + db/blob/blob_garbage_meter.cc + db/blob/blob_log_format.cc + db/blob/blob_log_sequential_reader.cc + db/blob/blob_log_writer.cc + db/blob/prefetch_buffer_collection.cc db/builder.cc db/c.cc db/column_family.cc - db/compacted_db_impl.cc db/compaction/compaction.cc db/compaction/compaction_iterator.cc db/compaction/compaction_picker.cc @@ -507,8 +663,10 @@ db/compaction/compaction_picker_fifo.cc db/compaction/compaction_picker_level.cc db/compaction/compaction_picker_universal.cc + db/compaction/sst_partitioner.cc db/convenience.cc db/db_filesnapshot.cc + db/db_impl/compacted_db_impl.cc db/db_impl/db_impl.cc db/db_impl/db_impl_write.cc db/db_impl/db_impl_compaction_flush.cc @@ -539,6 +697,8 @@ db/memtable_list.cc db/merge_helper.cc db/merge_operator.cc + db/output_validator.cc + db/periodic_work_scheduler.cc db/range_del_aggregator.cc db/range_tombstone_fragmenter.cc db/repair.cc @@ -549,22 +709,29 @@ db/trim_history_scheduler.cc db/version_builder.cc db/version_edit.cc + db/version_edit_handler.cc db/version_set.cc + db/wal_edit.cc 
db/wal_manager.cc db/write_batch.cc db/write_batch_base.cc db/write_controller.cc db/write_thread.cc + env/composite_env.cc env/env.cc env/env_chroot.cc env/env_encryption.cc env/env_hdfs.cc env/file_system.cc + env/file_system_tracer.cc + env/fs_remap.cc env/mock_env.cc + env/unique_id_gen.cc file/delete_scheduler.cc file/file_prefetch_buffer.cc file/file_util.cc file/filename.cc + file/line_file_reader.cc file/random_access_file_reader.cc file/read_write_util.cc file/readahead_raf.cc @@ -577,6 +744,8 @@ memory/arena.cc memory/concurrent_arena.cc memory/jemalloc_nodump_allocator.cc + memory/memkind_kmem_allocator.cc + memory/memory_allocator.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -597,19 +766,23 @@ monitoring/thread_status_util.cc monitoring/thread_status_util_debug.cc options/cf_options.cc + options/configurable.cc + options/customizable.cc options/db_options.cc options/options.cc options/options_helper.cc options/options_parser.cc - options/options_sanity_check.cc port/stack_trace.cc table/adaptive/adaptive_table_factory.cc + table/block_based/binary_search_index_reader.cc table/block_based/block.cc table/block_based/block_based_filter_block.cc table/block_based/block_based_table_builder.cc table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_iterator.cc table/block_based/block_based_table_reader.cc table/block_based/block_builder.cc + table/block_based/block_prefetcher.cc table/block_based/block_prefix_index.cc table/block_based/data_block_hash_index.cc table/block_based/data_block_footer.cc @@ -617,9 +790,14 @@ table/block_based/filter_policy.cc table/block_based/flush_block_policy.cc table/block_based/full_filter_block.cc + table/block_based/hash_index_reader.cc table/block_based/index_builder.cc + table/block_based/index_reader_common.cc table/block_based/parsed_full_filter_block.cc table/block_based/partitioned_filter_block.cc + 
table/block_based/partitioned_index_iterator.cc + table/block_based/partitioned_index_reader.cc + table/block_based/reader_common.cc table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc table/cuckoo/cuckoo_table_builder.cc @@ -637,22 +815,30 @@ table/plain/plain_table_index.cc table/plain/plain_table_key_coding.cc table/plain/plain_table_reader.cc + table/sst_file_dumper.cc table/sst_file_reader.cc table/sst_file_writer.cc + table/table_factory.cc table/table_properties.cc table/two_level_iterator.cc + table/unique_id.cc test_util/sync_point.cc test_util/sync_point_impl.cc test_util/testutil.cc test_util/transaction_test_util.cc tools/block_cache_analyzer/block_cache_trace_analyzer.cc tools/dump/db_dump_tool.cc + tools/io_tracer_parser_tool.cc tools/ldb_cmd.cc tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - trace_replay/trace_replay.cc trace_replay/block_cache_tracer.cc + trace_replay/io_tracer.cc + trace_replay/trace_record_handler.cc + trace_replay/trace_record_result.cc + trace_replay/trace_record.cc + trace_replay/trace_replay.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc @@ -664,6 +850,8 @@ util/murmurhash.cc util/random.cc util/rate_limiter.cc + util/ribbon_config.cc + util/regex.cc util/slice.cc util/file_checksum_helper.cc util/status.cc @@ -678,19 +866,23 @@ utilities/blob_db/blob_db_impl_filesnapshot.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc - utilities/blob_db/blob_log_reader.cc - utilities/blob_db/blob_log_writer.cc - utilities/blob_db/blob_log_format.cc + utilities/cache_dump_load.cc + utilities/cache_dump_load_impl.cc utilities/cassandra/cassandra_compaction_filter.cc utilities/cassandra/format.cc utilities/cassandra/merge_operator.cc utilities/checkpoint/checkpoint_impl.cc + utilities/compaction_filters.cc utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc utilities/debug.cc utilities/env_mirror.cc utilities/env_timed.cc + 
utilities/fault_injection_env.cc + utilities/fault_injection_fs.cc + utilities/fault_injection_secondary_cache.cc utilities/leveldb_options/leveldb_options.cc utilities/memory/memory_util.cc + utilities/merge_operators.cc utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc @@ -710,6 +902,12 @@ utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc + utilities/trace/replayer_impl.cc + utilities/transactions/lock/lock_manager.cc + utilities/transactions/lock/point/point_lock_tracker.cc + utilities/transactions/lock/point/point_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction.cc utilities/transactions/pessimistic_transaction.cc @@ -717,16 +915,54 @@ utilities/transactions/snapshot_checker.cc utilities/transactions/transaction_base.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc utilities/transactions/write_prepared_txn_db.cc utilities/transactions/write_unprepared_txn.cc utilities/transactions/write_unprepared_txn_db.cc utilities/ttl/db_ttl_impl.cc + utilities/wal_filter.cc utilities/write_batch_with_index/write_batch_with_index.cc - utilities/write_batch_with_index/write_batch_with_index_internal.cc - $) + utilities/write_batch_with_index/write_batch_with_index_internal.cc) + +list(APPEND SOURCES + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc + 
utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc + utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc + utilities/transactions/lock/range/range_tree/lib/util/dbt.cc + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc) + +message(STATUS "ROCKSDB_PLUGINS: ${ROCKSDB_PLUGINS}") +if ( ROCKSDB_PLUGINS ) + string(REPLACE " " ";" PLUGINS ${ROCKSDB_PLUGINS}) + foreach (plugin ${PLUGINS}) + add_subdirectory("plugin/${plugin}") + foreach (src ${${plugin}_SOURCES}) + list(APPEND SOURCES plugin/${plugin}/${src}) + set_source_files_properties( + plugin/${plugin}/${src} + PROPERTIES COMPILE_FLAGS "${${plugin}_COMPILE_FLAGS}") + endforeach() + foreach (path ${${plugin}_INCLUDE_PATHS}) + include_directories(${path}) + endforeach() + foreach (lib ${${plugin}_LIBS}) + list(APPEND THIRDPARTY_LIBS ${lib}) + endforeach() + foreach (link_path ${${plugin}_LINK_PATHS}) + link_directories(AFTER ${link_path}) + endforeach() + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${${plugin}_CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${${plugin}_CMAKE_EXE_LINKER_FLAGS}") + endforeach() +endif() if(HAVE_SSE42 AND NOT MSVC) set_source_files_properties( @@ -734,11 +970,11 @@ PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") endif() -if(HAVE_POWER8) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") list(APPEND SOURCES util/crc32c_ppc.c util/crc32c_ppc_asm.S) -endif(HAVE_POWER8) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") if(HAS_ARMV8_CRC) list(APPEND SOURCES @@ -753,7 +989,6 @@ port/win/port_win.cc port/win/win_logger.cc 
port/win/win_thread.cc) - if(WITH_XPRESS) list(APPEND SOURCES port/win/xpress_win.cc) @@ -799,13 +1034,13 @@ set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) endif() -add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES}) -target_link_libraries(${ROCKSDB_STATIC_LIB} +add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES} ${BUILD_VERSION_CC}) +target_link_libraries(${ROCKSDB_STATIC_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(ROCKSDB_BUILD_SHARED) - add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) - target_link_libraries(${ROCKSDB_SHARED_LIB} + add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES} ${BUILD_VERSION_CC}) + target_link_libraries(${ROCKSDB_SHARED_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(WIN32) @@ -822,8 +1057,7 @@ LINKER_LANGUAGE CXX VERSION ${rocksdb_VERSION} SOVERSION ${rocksdb_VERSION_MAJOR} - CXX_STANDARD 11 - OUTPUT_NAME "rocksdb") + OUTPUT_NAME "rocksdb${ARTIFACT_SUFFIX}") endif() endif() @@ -834,6 +1068,16 @@ endif() option(WITH_JNI "build with JNI" OFF) +# Tests are excluded from Release builds +CMAKE_DEPENDENT_OPTION(WITH_TESTS "build with tests" ON + "CMAKE_BUILD_TYPE STREQUAL Debug" OFF) +option(WITH_BENCHMARK_TOOLS "build with benchmarks" ON) +option(WITH_CORE_TOOLS "build with ldb and sst_dump" ON) +option(WITH_TOOLS "build with tools" ON) + +if(WITH_TESTS OR WITH_BENCHMARK_TOOLS OR WITH_TOOLS OR WITH_JNI OR JNI) + include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.8.1/fused-src) +endif() if(WITH_JNI OR JNI) message(STATUS "JNI library is enabled") add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/java) @@ -871,6 +1115,8 @@ install(DIRECTORY include/rocksdb COMPONENT devel DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" COMPONENT devel DESTINATION ${package_config_destination}) + install( TARGETS ${ROCKSDB_STATIC_LIB} EXPORT RocksDBTargets @@ -907,29 +1153,49 @@ ) endif() -# Tests are excluded from Release builds -CMAKE_DEPENDENT_OPTION(WITH_TESTS "build with tests" ON 
- "CMAKE_BUILD_TYPE STREQUAL Debug" OFF) -if(WITH_TESTS) +option(WITH_ALL_TESTS "Build all test, rather than a small subset" ON) + +if(WITH_TESTS OR WITH_BENCHMARK_TOOLS) add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest) add_library(testharness STATIC + test_util/mock_time_env.cc test_util/testharness.cc) target_link_libraries(testharness gtest) +endif() +if(WITH_TESTS) set(TESTS + db/db_basic_test.cc + env/env_basic_test.cc + ) + if(WITH_ALL_TESTS) + list(APPEND TESTS + cache/cache_reservation_manager_test.cc cache/cache_test.cc cache/lru_cache_test.cc + db/blob/blob_counting_iterator_test.cc + db/blob/blob_file_addition_test.cc + db/blob/blob_file_builder_test.cc + db/blob/blob_file_cache_test.cc + db/blob/blob_file_garbage_test.cc + db/blob/blob_file_reader_test.cc + db/blob/blob_garbage_meter_test.cc + db/blob/db_blob_basic_test.cc + db/blob/db_blob_compaction_test.cc + db/blob/db_blob_corruption_test.cc + db/blob/db_blob_index_test.cc db/column_family_test.cc db/compact_files_test.cc + db/compaction/clipping_iterator_test.cc db/compaction/compaction_job_stats_test.cc db/compaction/compaction_job_test.cc db/compaction/compaction_iterator_test.cc db/compaction/compaction_picker_test.cc + db/compaction/compaction_service_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc - db/db_basic_test.cc - db/db_blob_index_test.cc + db/db_with_timestamp_basic_test.cc db/db_block_cache_test.cc db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc @@ -941,6 +1207,7 @@ db/db_iter_test.cc db/db_iter_stress_test.cc db/db_iterator_test.cc + db/db_kv_checksum_test.cc db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc @@ -948,19 +1215,21 @@ db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_impl/db_secondary_test.cc + db/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc db/db_tailing_iter_test.cc db/db_test.cc db/db_test2.cc + 
db/db_logical_block_size_cache_test.cc db/db_universal_compaction_test.cc db/db_wal_test.cc + db/db_with_timestamp_compaction_test.cc db/db_write_test.cc db/dbformat_test.cc db/deletefile_test.cc - db/error_handler_test.cc + db/error_handler_fs_test.cc db/obsolete_files_test.cc db/external_sst_file_basic_test.cc db/external_sst_file_test.cc @@ -976,6 +1245,7 @@ db/merge_test.cc db/options_file_test.cc db/perf_context_test.cc + db/periodic_work_scheduler_test.cc db/plain_table_db_test.cc db/prefix_test.cc db/range_del_aggregator_test.cc @@ -986,17 +1256,21 @@ db/version_edit_test.cc db/version_set_test.cc db/wal_manager_test.cc + db/wal_edit_test.cc db/write_batch_test.cc db/write_callback_test.cc db/write_controller_test.cc - env/env_basic_test.cc env/env_test.cc + env/io_posix_test.cc env/mock_env_test.cc file/delete_scheduler_test.cc + file/prefetch_test.cc + file/random_access_file_reader_test.cc logging/auto_roll_logger_test.cc logging/env_logger_test.cc logging/event_logger_test.cc memory/arena_test.cc + memory/memory_allocator_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc @@ -1004,9 +1278,12 @@ monitoring/iostats_context_test.cc monitoring/statistics_test.cc monitoring/stats_history_test.cc + options/configurable_test.cc + options/customizable_test.cc options/options_settable_test.cc options/options_test.cc table/block_based/block_based_filter_block_test.cc + table/block_based/block_based_table_reader_test.cc table/block_based/block_test.cc table/block_based/data_block_hash_index_test.cc table/block_based/full_filter_block_test.cc @@ -1017,7 +1294,12 @@ table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc + table/block_fetcher_test.cc + test_util/testutil_test.cc + trace_replay/block_cache_tracer_test.cc + trace_replay/io_tracer_test.cc tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc + tools/io_tracer_parser_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc 
tools/sst_dump_test.cc @@ -1035,11 +1317,14 @@ util/random_test.cc util/rate_limiter_test.cc util/repeatable_thread_test.cc + util/ribbon_test.cc util/slice_test.cc util/slice_transform_test.cc util/timer_queue_test.cc + util/timer_test.cc util/thread_list_test.cc util/thread_local_test.cc + util/work_queue_test.cc utilities/backupable/backupable_db_test.cc utilities/blob_db/blob_db_test.cc utilities/cassandra/cassandra_functional_test.cc @@ -1059,11 +1344,14 @@ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc utilities/transactions/transaction_test.cc + utilities/transactions/lock/point/point_lock_manager_test.cc utilities/transactions/write_prepared_transaction_test.cc utilities/transactions/write_unprepared_transaction_test.cc + utilities/transactions/lock/range/range_locking_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc - ) + ) + endif() if(WITH_LIBRADOS) list(APPEND TESTS utilities/env_librados_test.cc) endif() @@ -1076,7 +1364,6 @@ db/db_test_util.cc monitoring/thread_status_updater_debug.cc table/mock_table.cc - test_util/fault_injection_test_env.cc utilities/cassandra/test_utils.cc ) enable_testing() @@ -1091,21 +1378,25 @@ PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 - ) + ) foreach(sourcefile ${TESTS}) get_filename_component(exename ${sourcefile} NAME_WE) - add_executable(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} ${sourcefile}) - set_target_properties(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) + set_target_properties(${exename}${ARTIFACT_SUFFIX} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 OUTPUT_NAME ${exename}${ARTIFACT_SUFFIX} - ) - 
target_link_libraries(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${ROCKSDB_LIB}) + ) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) if(NOT "${exename}" MATCHES "db_sanity_test") - add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) - add_dependencies(check ${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX}) + gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120) + add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) + endif() + if("${exename}" MATCHES "env_librados_test") + # env_librados_test.cc uses librados directly + target_link_libraries(${exename}${ARTIFACT_SUFFIX} rados) endif() endforeach(sourcefile ${TESTS}) @@ -1122,57 +1413,71 @@ if(ROCKSDB_LIB_FOR_C) set(C_TESTS db/c_test.c) - # C executables must link to a shared object add_executable(c_test db/c_test.c) - target_link_libraries(c_test ${ROCKSDB_SHARED_LIB} testharness) + target_link_libraries(c_test ${ROCKSDB_LIB_FOR_C} testharness) add_test(NAME c_test COMMAND c_test${ARTIFACT_SUFFIX}) add_dependencies(check c_test) endif() endif() -option(WITH_BENCHMARK_TOOLS "build with benchmarks" ON) if(WITH_BENCHMARK_TOOLS) - add_executable(db_bench + add_executable(db_bench${ARTIFACT_SUFFIX} + tools/simulated_hybrid_file_system.cc tools/db_bench.cc tools/db_bench_tool.cc) - target_link_libraries(db_bench - ${ROCKSDB_LIB}) + target_link_libraries(db_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) - add_executable(cache_bench - cache/cache_bench.cc) - target_link_libraries(cache_bench - ${ROCKSDB_LIB}) + add_executable(cache_bench${ARTIFACT_SUFFIX} + cache/cache_bench.cc + cache/cache_bench_tool.cc) + target_link_libraries(cache_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(memtablerep_bench + add_executable(memtablerep_bench${ARTIFACT_SUFFIX} memtable/memtablerep_bench.cc) - target_link_libraries(memtablerep_bench - 
${ROCKSDB_LIB}) + target_link_libraries(memtablerep_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(range_del_aggregator_bench + add_executable(range_del_aggregator_bench${ARTIFACT_SUFFIX} db/range_del_aggregator_bench.cc) - target_link_libraries(range_del_aggregator_bench - ${ROCKSDB_LIB}) + target_link_libraries(range_del_aggregator_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(table_reader_bench + add_executable(table_reader_bench${ARTIFACT_SUFFIX} table/table_reader_bench.cc) - target_link_libraries(table_reader_bench - ${ROCKSDB_LIB} testharness) + target_link_libraries(table_reader_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} testharness ${GFLAGS_LIB}) - add_executable(filter_bench + add_executable(filter_bench${ARTIFACT_SUFFIX} util/filter_bench.cc) - target_link_libraries(filter_bench - ${ROCKSDB_LIB}) + target_link_libraries(filter_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(hash_table_bench + add_executable(hash_table_bench${ARTIFACT_SUFFIX} utilities/persistent_cache/hash_table_bench.cc) - target_link_libraries(hash_table_bench - ${ROCKSDB_LIB}) + target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX} + ${ROCKSDB_LIB} ${GFLAGS_LIB}) endif() -option(WITH_TOOLS "build with tools" ON) -if(WITH_TOOLS) +if(WITH_CORE_TOOLS OR WITH_TOOLS) add_subdirectory(tools) + add_custom_target(core_tools + DEPENDS ${core_tool_deps}) +endif() + +if(WITH_TOOLS) add_subdirectory(db_stress_tool) add_custom_target(tools DEPENDS ${tool_deps}) endif() + +option(WITH_EXAMPLES "build with examples" OFF) +if(WITH_EXAMPLES) + add_subdirectory(examples) +endif() + +option(WITH_BENCHMARK "build benchmark tests" OFF) +if(WITH_BENCHMARK) + add_subdirectory(${PROJECT_SOURCE_DIR}/microbench/) +endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md mariadb-10.11.13/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/DEFAULT_OPTIONS_HISTORY.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,4 @@ -# RocksDB default options change log +# RocksDB default options change log (NO LONGER MAINTAINED) ## Unreleased * delayed_write_rate takes the rate given by rate_limiter if not specified. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/HISTORY.md mariadb-10.11.13/storage/rocksdb/rocksdb/HISTORY.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/HISTORY.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/HISTORY.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,9 +1,707 @@ # Rocksdb Change Log -## Unreleased +## 6.29.5 (03/29/2022) ### Bug Fixes +* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. +* Fixed a race condition when mmaping a WritableFile on POSIX. +* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail. +* Fixed a heap use-after-free race with DropColumnFamily. +* Fixed a bug that `rocksdb.read.block.compaction.micros` cannot track compaction stats (#9722). + +## 6.29.4 (03/22/2022) +### Bug Fixes +* Fixed a bug caused by race among flush, incoming writes and taking snapshots. Queries to snapshots created with these race condition can return incorrect result, e.g. resurfacing deleted data. +* Fixed a bug that DisableManualCompaction may assert when disable an unscheduled manual compaction. +* Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() performed. 
+* Fixed a race condition when disable and re-enable manual compaction. +* Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread. +* Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496) +* Fixed a read-after-free bug in `DB::GetMergeOperands()`. +* Fixed NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, NUM_DATA_BLOCKS_READ_PER_LEVEL, and NUM_SST_READ_PER_LEVEL stats to be reported once per MultiGet batch per level. + +## 6.29.3 (02/17/2022) +### Bug Fixes +* Fix a data loss bug for 2PC write-committed transaction caused by concurrent transaction commit and memtable switch (#9571). + +## 6.29.2 (02/15/2022) +### Performance Improvements +* DisableManualCompaction() doesn't have to wait scheduled manual compaction to be executed in thread-pool to cancel the job. + +## 6.29.1 (01/31/2022) +### Bug Fixes +* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.) + +## 6.29.0 (01/21/2022) +Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info. +### Public API change +* Added values to `TraceFilterType`: `kTraceFilterIteratorSeek`, `kTraceFilterIteratorSeekForPrev`, and `kTraceFilterMultiGet`. They can be set in `TraceOptions` to filter out the operation types after which they are named. +* Added `TraceOptions::preserve_write_order`. When enabled it guarantees write records are traced in the same order they are logged to WAL and applied to the DB. By default it is disabled (false) to match the legacy behavior and prevent regression. +* Made the Env class extend the Customizable class. 
Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. +* `Options::OldDefaults` is marked deprecated, as it is no longer maintained. +* Add ObjectLibrary::AddFactory and ObjectLibrary::PatternEntry classes. This method and associated class are the preferred mechanism for registering factories with the ObjectLibrary going forward. The ObjectLibrary::Register method, which uses regular expressions and may be problematic, is deprecated and will be in a future release. +* Changed `BlockBasedTableOptions::block_size` from `size_t` to `uint64_t`. +* Added API warning against using `Iterator::Refresh()` together with `DB::DeleteRange()`, which are incompatible and have always risked causing the refreshed iterator to return incorrect results. + +### Behavior Changes +* `DB::DestroyColumnFamilyHandle()` will return Status::InvalidArgument() if called with `DB::DefaultColumnFamily()`. +* On 32-bit platforms, mmap reads are no longer quietly disabled, just discouraged. + +### New Features +* Added `Options::DisableExtraChecks()` that can be used to improve peak write performance by disabling checks that should not be necessary in the absence of software logic errors or CPU+memory hardware errors. (Default options are slowly moving toward some performance overheads for extra correctness checking.) + +### Performance Improvements +* Improved read performance when a prefix extractor is used (Seek, Get, MultiGet), even compared to version 6.25 baseline (see bug fix below), by optimizing the common case of prefix extractor compatible with table file and unchanging. + +### Bug Fixes +* Fix a bug that FlushMemTable may return ok even flush not succeed. +* Fixed a bug of Sync() and Fsync() not using `fcntl(F_FULLFSYNC)` on OS X and iOS. +* Fixed a significant performance regression in version 6.26 when a prefix extractor is used on the read path (Seek, Get, MultiGet). 
(Excessive time was spent in SliceTransform::AsString().) + +### New Features +* Added RocksJava support for MacOS universal binary (ARM+x86) + +## 6.28.0 (2021-12-17) +### New Features +* Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-commited transactions with user-defined timestamps. +* Introduce SimulatedHybridFileSystem which can help simulating HDD latency in db_bench. Tiered Storage latency simulation can be enabled using -simulate_hybrid_fs_file (note that it doesn't work if db_bench is interrupted in the middle). -simulate_hdd can also be used to simulate all files on HDD. + +### Bug Fixes +* Fixed a bug in rocksdb automatic implicit prefetching which got broken because of new feature adaptive_readahead and internal prefetching got disabled when iterator moves from one file to next. +* Fixed a bug in TableOptions.prepopulate_block_cache which causes segmentation fault when used with TableOptions.partition_filters = true and TableOptions.cache_index_and_filter_blocks = true. +* Fixed a bug affecting custom memtable factories which are not registered with the `ObjectRegistry`. The bug could result in failure to save the OPTIONS file. +* Fixed a bug causing two duplicate entries to be appended to a file opened in non-direct mode and tracked by `FaultInjectionTestFS`. +* Fixed a bug in TableOptions.prepopulate_block_cache to support block-based filters also. +* Block cache keys no longer use `FSRandomAccessFile::GetUniqueId()` (previously used when available), so a filesystem recycling unique ids can no longer lead to incorrect result or crash (#7405). For files generated by RocksDB >= 6.24, the cache keys are stable across DB::Open and DB directory move / copy / import / export / migration, etc. 
Although collisions are still theoretically possible, they are (a) impossible in many common cases, (b) not dependent on environmental factors, and (c) much less likely than a CPU miscalculation while executing RocksDB. +* Fixed a bug in C bindings causing iterator to return incorrect result (#9343). + +### Behavior Changes +* MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TransactionDB, introduced in PR#5022) Fix #8371. + +### Public API change +* Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes. +* Introduce a new EventListener callback that will be called upon the end of automatic error recovery. +* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low separately. +* Add GetFullHistoryTsLow API so users can query current full_history_low value of specified column family. + +### Performance Improvements +* Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory. +* Block cache accesses are faster by RocksDB using cache keys of fixed size (16 bytes). + +### Java API Changes +* Removed Java API `TableProperties.getPropertiesOffsets()` as it exposed internal details to external users. + +## 6.27.0 (2021-11-19) +### New Features +* Added new ChecksumType kXXH3 which is faster than kCRC32c on almost all x86\_64 hardware. +* Added a new online consistency check for BlobDB which validates that the number/total size of garbage blobs does not exceed the number/total size of all blobs in any given blob file. +* Provided support for tracking per-sst user-defined timestamp information in MANIFEST. +* Added new option "adaptive_readahead" in ReadOptions. 
For iterators, RocksDB does auto-readahead on noticing sequential reads and by enabling this option, readahead_size of current file (if reads are sequential) will be carried forward to next file instead of starting from the scratch at each level (except L0 level files). If reads are not sequential it will fall back to 8KB. This option is applicable only for RocksDB internal prefetch buffer and isn't supported with underlying file system prefetching. +* Added the read count and read bytes related stats to Statistics for tiered storage hot, warm, and cold file reads. +* Added an option to dynamically charge an updating estimated memory usage of block-based table building to block cache if block cache available. It currently only includes charging memory usage of constructing (new) Bloom Filter and Ribbon Filter to block cache. To enable this feature, set `BlockBasedTableOptions::reserve_table_builder_memory = true`. +* Add a new API OnIOError in listener.h that notifies listeners when an IO error occurs during FileSystem operation along with filename, status etc. +* Added compaction readahead support for blob files to the integrated BlobDB implementation, which can improve compaction performance when the database resides on higher-latency storage like HDDs or remote filesystems. Readahead can be configured using the column family option `blob_compaction_readahead_size`. + +### Bug Fixes +* Prevent a `CompactRange()` with `CompactRangeOptions::change_level == true` from possibly causing corruption to the LSM state (overlapping files within a level) when run in parallel with another manual compaction. Note that setting `force_consistency_checks == true` (the default) would cause the DB to enter read-only mode in this scenario and return `Status::Corruption`, rather than committing any corruption. +* Fixed a bug in CompactionIterator when write-prepared transaction is used. 
A released earliest write conflict snapshot may cause assertion failure in dbg mode and unexpected key in opt mode. +* Fix ticker WRITE_WITH_WAL("rocksdb.write.wal"), this bug is caused by a bad extra `RecordTick(stats_, WRITE_WITH_WAL)` (in 2 places), this fix removes the extra `RecordTick`s and fixes the corresponding test case. +* EventListener::OnTableFileCreated was previously called with OK status and file_size==0 in cases of no SST file contents written (because there was no content to add) and the empty file deleted before calling the listener. Now the status is Aborted. +* Fixed a bug in CompactionIterator when write-prepared transaction is used. Releasing earliest_snapshot during compaction may cause a SingleDelete to be output after a PUT of the same user key whose seq has been zeroed. +* Added input sanitization on negative bytes passed into `GenericRateLimiter::Request`. +* Fixed an assertion failure in CompactionIterator when write-prepared transaction is used. We prove that certain operations can lead to a Delete being followed by a SingleDelete (same user key). We can drop the SingleDelete. +* Fixed a bug of timestamp-based GC which can cause all versions of a key under full_history_ts_low to be dropped. This bug will be triggered when some of the ikeys' timestamps are lower than full_history_ts_low, while others are newer. +* In some cases outside of the DB read and compaction paths, SST block checksums are now checked where they were not before. +* Explicitly check for and disallow the `BlockBasedTableOptions` if insertion into one of {`block_cache`, `block_cache_compressed`, `persistent_cache`} can show up in another of these. (RocksDB expects to be able to use the same key for different physical data among tiers.) 
+* Users who configured a dedicated thread pool for bottommost compactions by explicitly adding threads to the `Env::Priority::BOTTOM` pool will no longer see RocksDB schedule automatic compactions exceeding the DB's compaction concurrency limit. For details on per-DB compaction concurrency limit, see API docs of `max_background_compactions` and `max_background_jobs`. +* Fixed a bug of background flush thread picking more memtables to flush and prematurely advancing column family's log_number. +* Fixed an assertion failure in ManifestTailer. +* Fixed a bug that could, with WAL enabled, cause backups, checkpoints, and `GetSortedWalFiles()` to fail randomly with an error like `IO error: 001234.log: No such file or directory` + +### Behavior Changes +* `NUM_FILES_IN_SINGLE_COMPACTION` was only counting the first input level files, now it's including all input files. +* `TransactionUtil::CheckKeyForConflicts` can also perform conflict-checking based on user-defined timestamps in addition to sequence numbers. +* Removed `GenericRateLimiter`'s minimum refill bytes per period previously enforced. + +### Public API change +* When options.ttl is used with leveled compaction with compaction priority kMinOverlappingRatio, files exceeding half of TTL value will be prioritized more, so that by the time TTL is reached, fewer extra compactions will be scheduled to clear them up. At the same time, when compacting files with data older than half of TTL, output files may be cut off based on those files' boundaries, in order for the early TTL compaction to work properly. +* Made FileSystem and RateLimiter extend the Customizable class and added a CreateFromString method. Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. +* Clarified in API comments that RocksDB is not exception safe for callbacks and custom extensions. 
An exception propagating into RocksDB can lead to undefined behavior, including data loss, unreported corruption, deadlocks, and more. +* Marked `WriteBufferManager` as `final` because it is not intended for extension. +* Removed unimportant implementation details from table_properties.h +* Add API `FSDirectory::FsyncWithDirOptions()`, which provides extra information like directory fsync reason in `DirFsyncOptions`. File system like btrfs is using that to skip directory fsync for creating a new file, or when renaming a file, fsync the target file instead of the directory, which improves the `DB::Open()` speed by ~20%. +* `DB::Open()` is not going be blocked by obsolete file purge if `DBOptions::avoid_unnecessary_blocking_io` is set to true. +* In builds where glibc provides `gettid()`, info log ("LOG" file) lines now print a system-wide thread ID from `gettid()` instead of the process-local `pthread_self()`. For all users, the thread ID format is changed from hexadecimal to decimal integer. +* In builds where glibc provides `pthread_setname_np()`, the background thread names no longer contain an ID suffix. For example, "rocksdb:bottom7" (and all other threads in the `Env::Priority::BOTTOM` pool) are now named "rocksdb:bottom". Previously large thread pools could breach the name size limit (e.g., naming "rocksdb:bottom10" would fail). +* Deprecating `ReadOptions::iter_start_seqnum` and `DBOptions::preserve_deletes`, please try using user defined timestamp feature instead. The options will be removed in a future release, currently it logs a warning message when using. + +### Performance Improvements +* Released some memory related to filter construction earlier in `BlockBasedTableBuilder` for `FullFilter` and `PartitionedFilter` case (#9070) + +### Behavior Changes +* `NUM_FILES_IN_SINGLE_COMPACTION` was only counting the first input level files, now it's including all input files. 
+ +## 6.26.0 (2021-10-20) +### Bug Fixes +* Fixes a bug in directed IO mode when calling MultiGet() for blobs in the same blob file. The bug is caused by not sorting the blob read requests by file offsets. +* Fix the incorrect disabling of SST rate limited deletion when the WAL and DB are in different directories. Only WAL rate limited deletion should be disabled if its in a different directory. +* Fix `DisableManualCompaction()` to cancel compactions even when they are waiting on automatic compactions to drain due to `CompactRangeOptions::exclusive_manual_compactions == true`. +* Fix contract of `Env::ReopenWritableFile()` and `FileSystem::ReopenWritableFile()` to specify any existing file must not be deleted or truncated. +* Fixed bug in calls to `IngestExternalFiles()` with files for multiple column families. The bug could have introduced a delay in ingested file keys becoming visible after `IngestExternalFiles()` returned. Furthermore, mutations to ingested file keys while they were invisible could have been dropped (not necessarily immediately). +* Fixed a possible race condition impacting users of `WriteBufferManager` who constructed it with `allow_stall == true`. The race condition led to undefined behavior (in our experience, typically a process crash). +* Fixed a bug where stalled writes would remain stalled forever after the user calls `WriteBufferManager::SetBufferSize()` with `new_size == 0` to dynamically disable memory limiting. +* Make `DB::close()` thread-safe. +* Fix a bug in atomic flush where one bg flush thread will wait forever for a preceding bg flush thread to commit its result to MANIFEST but encounters an error which is mapped to a soft error (DB not stopped). +* Fix a bug in `BackupEngine` where some internal callers of `GenericRateLimiter::Request()` do not honor `bytes <= GetSingleBurstBytes()`. 
+ +### New Features +* Print information about blob files when using "ldb list_live_files_metadata" +* Provided support for SingleDelete with user defined timestamp. +* Experimental new function DB::GetLiveFilesStorageInfo offers essentially a unified version of other functions like GetLiveFiles, GetLiveFilesChecksumInfo, and GetSortedWalFiles. Checkpoints and backups could show small behavioral changes and/or improved performance as they now use this new API. +* Add remote compaction read/write bytes statistics: `REMOTE_COMPACT_READ_BYTES`, `REMOTE_COMPACT_WRITE_BYTES`. +* Introduce an experimental feature to dump out the blocks from block cache and insert them to the secondary cache to reduce the cache warmup time (e.g., used while migrating DB instance). More information are in `class CacheDumper` and `CacheDumpedLoader` at `rocksdb/utilities/cache_dump_load.h` Note that, this feature is subject to the potential change in the future, it is still experimental. +* Introduced a new BlobDB configuration option `blob_garbage_collection_force_threshold`, which can be used to trigger compactions targeting the SST files which reference the oldest blob files when the ratio of garbage in those blob files meets or exceeds the specified threshold. This can reduce space amplification with skewed workloads where the affected SST files might not otherwise get picked up for compaction. +* Added EXPERIMENTAL support for table file (SST) unique identifiers that are stable and universally unique, available with new function `GetUniqueIdFromTableProperties`. Only SST files from RocksDB >= 6.24 support unique IDs. +* Added `GetMapProperty()` support for "rocksdb.dbstats" (`DB::Properties::kDBStats`). As a map property, it includes DB-level internal stats accumulated over the DB's lifetime, such as user write related stats and uptime. + +### Public API change +* Made SystemClock extend the Customizable class and added a CreateFromString method. 
Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. +* Made SliceTransform extend the Customizable class and added a CreateFromString method. Implementations need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. The Capped and Prefixed transform classes return a short name (no length); use GetId for the fully qualified name. +* Made FileChecksumGenFactory, SstPartitionerFactory, TablePropertiesCollectorFactory, and WalFilter extend the Customizable class and added a CreateFromString method. +* Some fields of SstFileMetaData are deprecated for compatibility with new base class FileStorageInfo. +* Add `file_temperature` to `IngestExternalFileArg` such that when ingesting SST files, we are able to indicate the temperature of the this batch of files. +* If `DB::Close()` failed with a non aborted status, calling `DB::Close()` again will return the original status instead of Status::OK. +* Add CacheTier to advanced_options.h to describe the cache tier we used. Add a `lowest_used_cache_tier` option to `DBOptions` (immutable) and pass it to BlockBasedTableReader. By default it is `CacheTier::kNonVolatileBlockTier`, which means, we always use both block cache (kVolatileTier) and secondary cache (kNonVolatileBlockTier). By set it to `CacheTier::kVolatileTier`, the DB will not use the secondary cache. +* Even when options.max_compaction_bytes is hit, compaction output files are only cut when it aligns with grandparent files' boundaries. options.max_compaction_bytes could be slightly violated with the change, but the violation is no more than one target SST file size, which is usually much smaller. + +### Performance Improvements +* Improved CPU efficiency of building block-based table (SST) files (#9039 and #9040). 
+ +### Java API Changes +* Add Java API bindings for new integrated BlobDB options +* `keyMayExist()` supports ByteBuffer. +* Fix multiget throwing Null Pointer Exception for num of keys > 70k (https://github.com/facebook/rocksdb/issues/8039). + +## 6.25.0 (2021-09-20) +### Bug Fixes +* Allow secondary instance to refresh iterator. Assign read seq after referencing SuperVersion. +* Fixed a bug of secondary instance's last_sequence going backward, and reads on the secondary fail to see recent updates from the primary. +* Fixed a bug that could lead to duplicate DB ID or DB session ID in POSIX environments without /proc/sys/kernel/random/uuid. +* Fix a race in DumpStats() with column family destruction due to not taking a Ref on each entry while iterating the ColumnFamilySet. +* Fix a race in item ref counting in LRUCache when promoting an item from the SecondaryCache. +* Fix a race in BackupEngine if RateLimiter is reconfigured during concurrent Restore operations. +* Fix a bug on POSIX in which failure to create a lock file (e.g. out of space) can prevent future LockFile attempts in the same process on the same file from succeeding. +* Fix a bug that backup_rate_limiter and restore_rate_limiter in BackupEngine could not limit read rates. +* Fix the implementation of `prepopulate_block_cache = kFlushOnly` to only apply to flushes rather than to all generated files. +* Fix WAL log data corruption when using DBOptions.manual_wal_flush(true) and WriteOptions.sync(true) together. The sync WAL should work with locked log_write_mutex_. 
+* Add checks for validity of the IO uring completion queue entries, and fail the BlockBasedTableReader MultiGet sub-batch if there's an invalid completion +* Add an interface RocksDbIOUringEnable() that, if defined by the user, will allow them to enable/disable the use of IO uring by RocksDB +* Fix the bug that when direct I/O is used and MultiRead() returns a short result, RandomAccessFileReader::MultiRead() still returns full size buffer, with returned short value together with some data in original buffer. This bug is unlikely to cause incorrect results, because (1) since FileSystem layer is expected to retry on short result, returning short results is only possible when asking more bytes in the end of the file, which RocksDB doesn't do when using MultiRead(); (2) checksum is unlikely to match. + +### New Features +* RemoteCompaction's interface now includes `db_name`, `db_id`, `session_id`, which could help the user uniquely identify compaction job between db instances and sessions. +* Added a ticker statistic, "rocksdb.verify_checksum.read.bytes", reporting how many bytes were read from file to serve `VerifyChecksum()` and `VerifyFileChecksums()` queries. +* Added ticker statistics, "rocksdb.backup.read.bytes" and "rocksdb.backup.write.bytes", reporting how many bytes were read and written during backup. +* Added properties for BlobDB: `rocksdb.num-blob-files`, `rocksdb.blob-stats`, `rocksdb.total-blob-file-size`, and `rocksdb.live-blob-file-size`. The existing property `rocksdb.estimate-live-data-size` was also extended to include live bytes residing in blob files. +* Added two new RateLimiter IOPriorities: `Env::IO_USER`,`Env::IO_MID`. `Env::IO_USER` will have superior priority over all other RateLimiter IOPriorities without being subject to fair scheduling constraint. +* `SstFileWriter` now supports `Put`s and `Delete`s with user-defined timestamps. Note that the ingestion logic itself is not timestamp-aware yet. 
+* Allow a single write batch to include keys from multiple column families whose timestamps' formats can differ. For example, some column families may disable timestamp, while others enable timestamp. +* Add compaction priority information in RemoteCompaction, which can be used to schedule high priority job first. +* Added new callback APIs `OnBlobFileCreationStarted`,`OnBlobFileCreated`and `OnBlobFileDeleted` in `EventListener` class of listener.h. It notifies listeners during creation/deletion of individual blob files in Integrated BlobDB. It also log blob file creation finished event and deletion event in LOG file. +* Batch blob read requests for `DB::MultiGet` using `MultiRead`. +* Add support for fallback to local compaction, the user can return `CompactionServiceJobStatus::kUseLocal` to instruct RocksDB to run the compaction locally instead of waiting for the remote compaction result. +* Add built-in rate limiter's implementation of `RateLimiter::GetTotalPendingRequest(int64_t* total_pending_requests, const Env::IOPriority pri)` for the total number of requests that are pending for bytes in the rate limiter. +* Charge memory usage during data buffering, from which training samples are gathered for dictionary compression, to block cache. Unbuffering data can now be triggered if the block cache becomes full and `strict_capacity_limit=true` for the block cache, in addition to existing conditions that can trigger unbuffering. + +### Public API change +* Remove obsolete implementation details FullKey and ParseFullKey from public API +* Change `SstFileMetaData::size` from `size_t` to `uint64_t`. +* Made Statistics extend the Customizable class and added a CreateFromString method. Implementations of Statistics need to be registered with the ObjectRegistry and to implement a Name() method in order to be created via this method. 
+* Extended `FlushJobInfo` and `CompactionJobInfo` in listener.h to provide information about the blob files generated by a flush/compaction and garbage collected during compaction in Integrated BlobDB. Added struct members `blob_file_addition_infos` and `blob_file_garbage_infos` that contain this information. +* Extended parameter `output_file_names` of `CompactFiles` API to also include paths of the blob files generated by the compaction in Integrated BlobDB. +* Most `BackupEngine` functions now return `IOStatus` instead of `Status`. Most existing code should be compatible with this change but some calls might need to be updated. +* Add a new field `level_at_creation` in `TablePropertiesCollectorFactory::Context` to capture the level at creating the SST file (i.e, table), of which the properties are being collected. + +### Miscellaneous +* Add a paranoid check where in case FileSystem layer doesn't fill the buffer but returns succeed, checksum is unlikely to match even if buffer contains a previous block. The byte modified is not useful anyway, so it isn't expected to change any behavior when FileSystem is satisfying its contract. + +## 6.24.0 (2021-08-20) +### Bug Fixes +* If the primary's CURRENT file is missing or inaccessible, the secondary instance should not hang repeatedly trying to switch to a new MANIFEST. It should instead return the error code encountered while accessing the file. +* Restoring backups with BackupEngine is now a logically atomic operation, so that if a restore operation is interrupted, DB::Open on it will fail. Using BackupEngineOptions::sync (default) ensures atomicity even in case of power loss or OS crash. +* Fixed a race related to the destruction of `ColumnFamilyData` objects. The earlier logic unlocked the DB mutex before destroying the thread-local `SuperVersion` pointers, which could result in a process crash if another thread managed to get a reference to the `ColumnFamilyData` object. 
+* Removed a call to `RenameFile()` on a non-existent info log file ("LOG") when opening a new DB. Such a call was guaranteed to fail though did not impact applications since we swallowed the error. Now we also stopped swallowing errors in renaming "LOG" file. +* Fixed an issue where `OnFlushCompleted` was not called for atomic flush. +* Fixed a bug affecting the batched `MultiGet` API when used with keys spanning multiple column families and `sorted_input == false`. +* Fixed a potential incorrect result in opt mode and assertion failures caused by releasing snapshot(s) during compaction. +* Fixed passing of BlobFileCompletionCallback to Compaction job and Atomic flush job which was default parameter (nullptr). BlobFileCompletionCallback is internal callback that manages addition of blob files to SSTFileManager. +* Fixed MultiGet not updating the block_read_count and block_read_byte PerfContext counters. + +### New Features +* Made the EventListener extend the Customizable class. +* EventListeners that have a non-empty Name() and that are registered with the ObjectRegistry can now be serialized to/from the OPTIONS file. +* Insert warm blocks (data blocks, uncompressed dict blocks, index and filter blocks) in Block cache during flush under option BlockBasedTableOptions.prepopulate_block_cache. Previously it was enabled for only data blocks. +* BlockBasedTableOptions.prepopulate_block_cache can be dynamically configured using DB::SetOptions. +* Add CompactionOptionsFIFO.age_for_warm, which allows RocksDB to move old files to warm tier in FIFO compactions. Note that file temperature is still an experimental feature. +* Add a comment to suggest btrfs user to disable file preallocation by setting `options.allow_fallocate=false`. +* Fast forward option in Trace replay changed to double type to allow replaying at a lower speed, by setting the value between 0 and 1. 
This option can be set via `ReplayOptions` in `Replayer::Replay()`, or via `--trace_replay_fast_forward` in db_bench. +* Add property `LiveSstFilesSizeAtTemperature` to retrieve sst file size at different temperature. +* Added a stat rocksdb.secondary.cache.hits. +* Added a PerfContext counter secondary_cache_hit_count. +* The integrated BlobDB implementation now supports the tickers `BLOB_DB_BLOB_FILE_BYTES_READ`, `BLOB_DB_GC_NUM_KEYS_RELOCATED`, and `BLOB_DB_GC_BYTES_RELOCATED`, as well as the histograms `BLOB_DB_COMPRESSION_MICROS` and `BLOB_DB_DECOMPRESSION_MICROS`. +* Added hybrid configuration of Ribbon filter and Bloom filter where some LSM levels use Ribbon for memory space efficiency and some use Bloom for speed. See NewRibbonFilterPolicy. This also changes the default behavior of NewRibbonFilterPolicy to use Bloom for flushes under Leveled and Universal compaction and Ribbon otherwise. The C API function `rocksdb_filterpolicy_create_ribbon` is unchanged but adds new `rocksdb_filterpolicy_create_ribbon_hybrid`. + +### Public API change +* Added APIs to decode and replay trace file via Replayer class. Added `DB::NewDefaultReplayer()` to create a default Replayer instance. Added `TraceReader::Reset()` to restart reading a trace file. Created trace_record.h, trace_record_result.h and utilities/replayer.h files to access the decoded Trace records, replay them, and query the actual operation results. +* Added Configurable::GetOptionsMap to the public API for use in creating new Customizable classes. +* Generalized bits_per_key parameters in C API from int to double for greater configurability. Although this is a compatible change for existing C source code, anything depending on C API signatures, such as foreign function interfaces, will need to be updated. + +### Performance Improvements +* Try to avoid updating DBOptions if `SetDBOptions()` does not change any option value. 
+ +### Behavior Changes +* `StringAppendOperator` additionally accepts a string as the delimiter. +* BackupEngineOptions::sync (default true) now applies to restoring backups in addition to creating backups. This could slow down restores, but ensures they are fully persisted before returning OK. (Consider increasing max_background_operations to improve performance.) + +## 6.23.0 (2021-07-16) +### Behavior Changes +* Obsolete keys in the bottommost level that were preserved for a snapshot will now be cleaned upon snapshot release in all cases. This form of compaction (snapshot release triggered compaction) previously had an artificial limitation that multiple tombstones needed to be present. +### Bug Fixes +* Blob file checksums are now printed in hexadecimal format when using the `manifest_dump` `ldb` command. +* `GetLiveFilesMetaData()` now populates the `temperature`, `oldest_ancester_time`, and `file_creation_time` fields of its `LiveFileMetaData` results when the information is available. Previously these fields always contained zero indicating unknown. +* Fix mismatches of OnCompaction{Begin,Completed} in case of DisableManualCompaction(). +* Fix continuous logging of an existing background error on every user write +* Fix a bug that `Get()` return Status::OK() and an empty value for non-existent key when `read_options.read_tier = kBlockCacheTier`. +* Fix a bug that stat in `get_context` didn't accumulate to statistics when query is failed. +* Fixed handling of DBOptions::wal_dir with LoadLatestOptions() or ldb --try_load_options on a copied or moved DB. Previously, when the WAL directory is same as DB directory (default), a copied or moved DB would reference the old path of the DB as the WAL directory, potentially corrupting both copies. Under this change, the wal_dir from DB::GetOptions() or LoadLatestOptions() may now be empty, indicating that the current DB directory is used for WALs. This is also a subtle API change. 
+ +### New Features +* ldb has a new feature, `list_live_files_metadata`, that shows the live SST files, as well as their LSM storage level and the column family they belong to. +* The new BlobDB implementation now tracks the amount of garbage in each blob file in the MANIFEST. +* Integrated BlobDB now supports Merge with base values (Put/Delete etc.). +* RemoteCompaction supports sub-compaction, the job_id in the user interface is changed from `int` to `uint64_t` to support sub-compaction id. +* Expose statistics option in RemoteCompaction worker. + +### Public API change +* Added APIs to the Customizable class to allow developers to create their own Customizable classes. Created the utilities/customizable_util.h file to contain helper methods for developing new Customizable classes. +* Change signature of SecondaryCache::Name(). Make SecondaryCache customizable and add SecondaryCache::CreateFromString method. + +## 6.22.0 (2021-06-18) +### Behavior Changes +* Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time. +* Added API comments clarifying safe usage of Disable/EnableManualCompaction and EventListener callbacks for compaction. +### Bug Fixes +* fs_posix.cc GetFreeSpace() always report disk space available to root even when running as non-root. Linux defaults often have disk mounts with 5 to 10 percent of total space reserved only for root. Out of space could result for non-root users. +* Subcompactions are now disabled when user-defined timestamps are used, since the subcompaction boundary picking logic is currently not timestamp-aware, which could lead to incorrect results when different subcompactions process keys that only differ by timestamp. +* Fix an issue that `DeleteFilesInRange()` may cause ongoing compaction reports corruption exception, or ASSERT for debug build. 
There's no actual data loss or corruption that we find. +* Fixed confusingly duplicated output in LOG for periodic stats ("DUMPING STATS"), including "Compaction Stats" and "File Read Latency Histogram By Level". +* Fixed performance bugs in background gathering of block cache entry statistics, that could consume a lot of CPU when there are many column families with a shared block cache. + +### New Features +* Marked the Ribbon filter and optimize_filters_for_memory features as production-ready, each enabling memory savings for Bloom-like filters. Use `NewRibbonFilterPolicy` in place of `NewBloomFilterPolicy` to use Ribbon filters instead of Bloom, or `ribbonfilter` in place of `bloomfilter` in configuration string. +* Allow `DBWithTTL` to use `DeleteRange` api just like other DBs. `DeleteRangeCF()` which executes `WriteBatchInternal::DeleteRange()` has been added to the handler in `DBWithTTLImpl::Write()` to implement it. +* Add BlockBasedTableOptions.prepopulate_block_cache. If enabled, it prepopulate warm/hot data blocks which are already in memory into block cache at the time of flush. On a flush, the data block that is in memory (in memtables) get flushed to the device. If using Direct IO, additional IO is incurred to read this data back into memory again, which is avoided by enabling this option and it also helps with Distributed FileSystem. More details in include/rocksdb/table.h. +* Added a `cancel` field to `CompactRangeOptions`, allowing individual in-process manual range compactions to be cancelled. + +### New Features +* Added BlobMetaData to the ColumnFamilyMetaData to return information about blob files + +### Public API change +* Added GetAllColumnFamilyMetaData API to retrieve the ColumnFamilyMetaData about all column families. + +## 6.21.0 (2021-05-21) +### Bug Fixes +* Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. 
The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened. +* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results. +* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`. +* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. +* Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed. +* Handle return code by io_uring_submit_and_wait() and io_uring_wait_cqe(). +* In the IngestExternalFile() API, only try to sync the ingested file if the file is linked and the FileSystem/Env supports reopening a writable file. +* Fixed a bug that `AdvancedColumnFamilyOptions.max_compaction_bytes` is under-calculated for manual compaction (`CompactRange()`). Manual compaction is split to multiple compactions if the compaction size exceeds the `max_compaction_bytes`. The bug creates much larger compaction whose size exceeds the user setting. On the other hand, larger manual compaction size can increase the subcompaction parallelism, you can tune that by setting `max_compaction_bytes`. + +### Behavior Changes +* Due to the fix of false-positive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. 
We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. + +### New Features +* Add new option allow_stall passed during instance creation of WriteBufferManager. When allow_stall is set, WriteBufferManager will stall all writers shared across multiple DBs and columns if memory usage goes beyond specified WriteBufferManager::buffer_size (soft limit). Stall will be cleared when memory is freed after flush and memory usage goes down below buffer_size. +* Allow `CompactionFilter`s to apply in more table file creation scenarios such as flush and recovery. For compatibility, `CompactionFilter`s by default apply during compaction. Users can customize this behavior by overriding `CompactionFilterFactory::ShouldFilterTableFileCreation()`. +* Added more fields to FilterBuildingContext with LSM details, for custom filter policies that vary behavior based on where they are in the LSM-tree. +* Added DB::Properties::kBlockCacheEntryStats for querying statistics on what percentage of block cache is used by various kinds of blocks, etc. using DB::GetProperty and DB::GetMapProperty. The same information is now dumped to info LOG periodically according to `stats_dump_period_sec`. +* Add an experimental Remote Compaction feature, which allows the user to run Compaction on a different host or process. 
The feature is still under development, currently only works on some basic use cases. The interface will be changed without backward/forward compatibility support. +* RocksDB would validate total entries read in flush, and compare with counter inserted into it. If flush_verify_memtable_count = true (default), flush will fail. Otherwise, only log to info logs. +* Add `TableProperties::num_filter_entries`, which can be used with `TableProperties::filter_size` to calculate the effective bits per filter entry (unique user key or prefix) for a table file. + +### Performance Improvements +* BlockPrefetcher is used by iterators to prefetch data if they anticipate more data to be used in future. It is enabled implicitly by rocksdb. Added change to take into account read pattern if reads are sequential. This would disable prefetching for random reads in MultiGet and iterators as readahead_size is increased exponentially doing large prefetches. + +### Public API change +* Removed a parameter from TableFactory::NewTableBuilder, which should not be called by user code because TableBuilder is not a public API. +* Removed unused structure `CompactionFilterContext`. +* The `skip_filters` parameter to SstFileWriter is now considered deprecated. Use `BlockBasedTableOptions::filter_policy` to control generation of filters. +* ClockCache is known to have bugs that could lead to crash or corruption, so should not be used until fixed. Use NewLRUCache instead. +* Added a new pure virtual function `ApplyToAllEntries` to `Cache`, to replace `ApplyToAllCacheEntries`. Custom `Cache` implementations must add an implementation. Because this function is for gathering statistics, an empty implementation could be acceptable for some applications. +* Added the ObjectRegistry to the ConfigOptions class. This registry instance will be used to find any customizable loadable objects during initialization. +* Expanded the ObjectRegistry functionality to allow nested ObjectRegistry instances. 
Added methods to register a set of functions with the registry/library as a group. +* Deprecated backupable_db.h and BackupableDBOptions in favor of new versions with appropriate names: backup_engine.h and BackupEngineOptions. Old API compatibility is preserved. + +### Default Option Change +* When options.arena_block_size <= 0 (default value 0), still use writer_buffer_size / 8 but cap to 1MB. Too large an allocation size might not be friendly to allocator and might cause performance issues in extreme cases. + +### Build +* By default, try to build with liburing. For make, if ROCKSDB_USE_IO_URING is not set, treat as enable, which means RocksDB will try to build with liburing. Users can disable it with ROCKSDB_USE_IO_URING=0. For cmake, add WITH_LIBURING to control it, with default on. + +## 6.20.0 (2021-04-16) +### Behavior Changes +* `ColumnFamilyOptions::sample_for_compression` now takes effect for creation of all block-based tables. Previously it only took effect for block-based tables created by flush. +* `CompactFiles()` can no longer compact files from lower level to up level, which has the risk to corrupt DB (details: #8063). The validation is also added to all compactions. +* Fixed some cases in which DB::OpenForReadOnly() could write to the filesystem. If you want a Logger with a read-only DB, you must now set DBOptions::info_log yourself, such as using CreateLoggerFromOptions(). +* get_iostats_context() will never return nullptr. If thread-local support is not available, and user does not opt-out iostats context, then compilation will fail. The same applies to perf context as well. +* Added support for WriteBatchWithIndex::NewIteratorWithBase when overwrite_key=false. Previously, this combination was not supported and would assert or return nullptr. +* Improve the behavior of WriteBatchWithIndex for Merge operations. Now more operations may be stored in order to return the correct merged result. 
+ +### Bug Fixes +* Use thread-safe `strerror_r()` to get error messages. +* Fixed a potential hang in shutdown for a DB whose `Env` has high-pri thread pool disabled (`Env::GetBackgroundThreads(Env::Priority::HIGH) == 0`) +* Made BackupEngine thread-safe and added documentation comments to clarify what is safe for multiple BackupEngine objects accessing the same backup directory. +* Fixed crash (divide by zero) when compression dictionary is applied to a file containing only range tombstones. +* Fixed a backward iteration bug with partitioned filter enabled: not including the prefix of the last key of the previous filter partition in current filter partition can cause wrong iteration result. +* Fixed a bug that allowed `DBOptions::max_open_files` to be set with a non-negative integer with `ColumnFamilyOptions::compaction_style = kCompactionStyleFIFO`. + +### Performance Improvements +* On ARM platform, use `yield` instead of `wfe` to relax cpu to gain better performance. + +### Public API change +* Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead. +* Update DB::StartIOTrace and remove Env object from the arguments as its redundant and DB already has Env object that is passed down to IOTracer::StartIOTrace +* Added `FlushReason::kWalFull`, which is reported when a memtable is flushed due to the WAL reaching its size limit; those flushes were previously reported as `FlushReason::kWriteBufferManager`. Also, changed the reason for flushes triggered by the write buffer manager to `FlushReason::kWriteBufferManager`; they were previously reported as `FlushReason::kWriteBufferFull`. 
+* Extend file_checksum_dump ldb command and DB::GetLiveFilesChecksumInfo API for IntegratedBlobDB and get checksum of blob files along with SST files. + +### New Features +* Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true. +* Added BackupEngine support for integrated BlobDB, with blob files shared between backups when table files are shared. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up. +* Added an optional output parameter to BackupEngine::CreateNewBackup(WithMetadata) to return the BackupID of the new backup. +* Added BackupEngine::GetBackupInfo / GetLatestBackupInfo for querying individual backups. +* Made the Ribbon filter a long-term supported feature in terms of the SST schema(compatible with version >= 6.15.0) though the API for enabling it is expected to change. + +## 6.19.0 (2021-03-21) +### Bug Fixes +* Fixed the truncation error found in APIs/tools when dumping block-based SST files in a human-readable format. After fix, the block-based table can be fully dumped as a readable file. +* When hitting a write slowdown condition, no write delay (previously 1 millisecond) is imposed until `delayed_write_rate` is actually exceeded, with an initial burst allowance of 1 millisecond worth of bytes. Also, beyond the initial burst allowance, `delayed_write_rate` is now more strictly enforced, especially with multiple column families. + +### Public API change +* Changed default `BackupableDBOptions::share_files_with_checksum` to `true` and deprecated `false` because of potential for data loss. Note that accepting this change in behavior can temporarily increase backup data usage because files are not shared between backups using the two different settings. 
Also removed obsolete option kFlagMatchInterimNaming. +* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read up to max_auto_readahead_size and now max_auto_readahead_size can be configured dynamically as well. Found that 256 KB readahead size provides the best performance, based on experiments, for auto readahead. Experiment data is in PR #3282. If the value is set to 0 then no automatic prefetching will be done by rocksdb. Also changing the value will only affect files opened after the change. +* Add support to extend DB::VerifyFileChecksums API to also verify blob files checksum. +* When using the new BlobDB, the amount of data written by flushes/compactions is now broken down into table files and blob files in the compaction statistics; namely, Write(GB) denotes the amount of data written to table files, while Wblob(GB) means the amount of data written to blob files. +* New default BlockBasedTableOptions::format_version=5 to enable new Bloom filter implementation by default, compatible with RocksDB versions >= 6.6.0. +* Add new SetBufferSize API to WriteBufferManager to allow dynamic management of memory allotted to all write buffers. This allows user code to adjust memory monitoring provided by WriteBufferManager as process memory needs change and datasets grow and shrink. +* Clarified the required semantics of Read() functions in FileSystem and Env APIs. Please ensure any custom implementations are compliant. +* For the new integrated BlobDB implementation, compaction statistics now include the amount of data read from blob files during compaction (due to garbage collection or compaction filters). Write amplification metrics have also been extended to account for data read from blob files. +* Add EqualWithoutTimestamp() to Comparator. 
+ +* Extend support to track blob files in SSTFileManager whenever a blob file is created/deleted. Blob files will be scheduled to delete via SSTFileManager and SSTFileManager will now take blob files into account while calculating size and space limits along with SST files. +* Add new Append and PositionedAppend API with checksum handoff to legacy Env. + +### New Features +* Support compaction filters for the new implementation of BlobDB. Add `FilterBlobByKey()` to `CompactionFilter`. Subclasses can override this method so that compaction filters can determine whether the actual blob value has to be read during compaction. Use a new `kUndetermined` in `CompactionFilter::Decision` to indicate that further action is necessary for compaction filter to make a decision. +* Add support to extend retrieval of checksums for blob files from the MANIFEST when checkpointing. During backup, rocksdb can detect corruption in blob files during file copies. +* Add new options for db_bench --benchmarks: flush, waitforcompaction, compact0, compact1. +* Add an option to BackupEngine::GetBackupInfo to include the name and size of each backed-up file. Especially in the presence of file sharing among backups, this offers detailed insight into backup space usage. +* Enable backward iteration on keys with user-defined timestamps. +* Add statistics and info log for error handler: counters for bg error, bg io error, bg retryable io error, auto resume count, auto resume total retry number, and auto resume success; Histogram for auto resume retry count in each recovery call. Note that each auto resume attempt will have one or multiple retries. + +### Behavior Changes +* During flush, only WAL sync retryable IO error is mapped to hard error, which will stall the writes. When WAL is used but only SST file write has retryable IO error, it will be mapped to soft error and write will not be affected. 
+ +## 6.18.0 (2021-02-19) +### Behavior Changes +* When retryable IO error occurs during compaction, it is mapped to soft error and set the BG error. However, auto resume is not called to clean the soft error since compaction will reschedule by itself. In this change, when retryable IO error occurs during compaction, BG error is not set. User will be informed of the error via EventHelper. +* Introduce a new trace file format for query tracing and replay and trace file version is bumped up to 0.2. A payload map is added as the first portion of the payload. We will not have backward compatible issues when adding new entries to trace records. Added the iterator_upper_bound and iterator_lower_bound in Seek and SeekForPrev tracing function. Added them as the new payload member for iterator tracing. + +### New Features +* Add support for key-value integrity protection in live updates from the user buffers provided to `WriteBatch` through the write to RocksDB's in-memory update buffer (memtable). This is intended to detect some cases of in-memory data corruption, due to either software or hardware errors. Users can enable protection by constructing their `WriteBatch` with `protection_bytes_per_key == 8`. +* Add support for updating `full_history_ts_low` option in manual compaction, which is for old timestamp data GC. +* Add a mechanism for using Makefile to build external plugin code into the RocksDB libraries/binaries. This intends to simplify compatibility and distribution for plugins (e.g., special-purpose `FileSystem`s) whose source code resides outside the RocksDB repo. See "plugin/README.md" for developer details, and "PLUGINS.md" for a listing of available plugins. +* Added memory pre-fetching for experimental Ribbon filter, which especially optimizes performance with batched MultiGet. +* A new, experimental version of BlobDB (key-value separation) is now available. The new implementation is integrated into the RocksDB core, i.e. 
it is accessible via the usual `rocksdb::DB` API, as opposed to the separate `rocksdb::blob_db::BlobDB` interface used by the earlier version, and can be configured on a per-column family basis using the configuration options `enable_blob_files`, `min_blob_size`, `blob_file_size`, `blob_compression_type`, `enable_blob_garbage_collection`, and `blob_garbage_collection_age_cutoff`. It extends RocksDB's consistency guarantees to blobs, and offers more features and better performance. Note that some features, most notably `Merge`, compaction filters, and backup/restore are not yet supported, and there is no support for migrating a database created by the old implementation. + +### Bug Fixes +* Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. +* `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. +* Fix `WRITE_PREPARED`, `WRITE_UNPREPARED` TransactionDB `MultiGet()` may return uncommitted data with snapshot. +* In DB::OpenForReadOnly, if any error happens while checking Manifest file path, it was overridden by Status::NotFound. It has been fixed and now actual error is returned. + +### Public API Change +* Added a "only_mutable_options" flag to the ConfigOptions. When this flag is "true", the Configurable functions and convenience methods (such as GetDBOptionsFromString) will only deal with options that are marked as mutable. 
When this flag is true, only options marked as mutable can be configured (a Status::InvalidArgument will be returned) and options not marked as mutable will not be returned or compared. The default is "false", meaning to compare all options. +* Add new Append and PositionedAppend APIs to FileSystem to bring the data verification information (data checksum information) from upper layer (e.g., WritableFileWriter) to the storage layer. In this way, the customized FileSystem is able to verify the correctness of data being written to the storage on time. Add checksum_handoff_file_types to DBOptions. User can use this option to control which file types (Currently supported file types: kWALFile, kTableFile, kDescriptorFile.) should use the new Append and PositionedAppend APIs to handoff the verification information. Currently, RocksDB only uses crc32c to calculate the checksum for write handoff. +* Add an option, `CompressionOptions::max_dict_buffer_bytes`, to limit the in-memory buffering for selecting samples for generating/training a dictionary. The limit is currently loosely adhered to. + + +## 6.17.0 (2021-01-15) +### Behavior Changes +* When verifying full file checksum with `DB::VerifyFileChecksums()`, we now fail with `Status::InvalidArgument` if the name of the checksum generator used for verification does not match the name of the checksum generator used for protecting the file when it was created. +* Since RocksDB does not continue write the same file if a file write fails for any reason, the file scope write IO error is treated the same as retryable IO error. More information about error handling of file scope IO error is included in `ErrorHandler::SetBGError`. + +### Bug Fixes +* Version older than 6.15 cannot decode VersionEdits `WalAddition` and `WalDeletion`, fixed this by changing the encoded format of them to be ignorable by older versions. +* Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. 
One effect of this race condition could be the process being terminated. + +### Public API Change +* Add a public API WriteBufferManager::dummy_entries_in_cache_usage() which reports the size of dummy entries stored in cache (passed to WriteBufferManager). Dummy entries are used to account for DataBlocks. +* Add a SystemClock class that contains the time-related methods from Env. The original methods in Env may be deprecated in a future release. This class will allow easier testing, development, and expansion of time-related features. +* Add a public API GetRocksBuildProperties and GetRocksBuildInfoAsString to get properties about the current build. These properties may include settings related to the GIT settings (branch, timestamp). This change also sets the "build date" based on the GIT properties, rather than the actual build time, thereby enabling more reproducible builds. + +## 6.16.0 (2020-12-18) +### Behavior Changes +* Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation. + +### Bug Fixes +* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`. +* Fix a bug where compressed blocks read by MultiGet are not inserted into the compressed block cache when use_direct_reads = true. +* Fixed the issue of full scanning on obsolete files when there are too many outstanding compactions with ConcurrentTaskLimiter enabled. +* Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. 
Without this fix, original code introduced in PR7659, when running on a big-endian machine, can mistakenly store read_amp_bytes_per_bit (a uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. +* Fixed prefix extractor with timestamp issues. +* Fixed a bug in atomic flush: in two-phase commit mode, the minimum WAL log number to keep is incorrect. +* Fixed a bug related to checkpoint in PR7789: if there are multiple column families, and the checkpoint is not opened as read only, then in rare cases, data loss may happen in the checkpoint. Since backup engine relies on checkpoint, it may also be affected. +* When ldb --try_load_options is used with the --column_family option, the ColumnFamilyOptions for the specified column family was not loaded from the OPTIONS file. Fix it so it's loaded from OPTIONS and then overridden with command line overrides. + +### New Features +* User defined timestamp feature supports `CompactRange` and `GetApproximateSizes`. +* Support getting aggregated table properties (kAggregatedTableProperties and kAggregatedTablePropertiesAtLevel) with DB::GetMapProperty, for easier access to the data in a structured format. +* Experimental option BlockBasedTableOptions::optimize_filters_for_memory now works with experimental Ribbon filter (as well as Bloom filter). + +### Public API Change +* Deprecated public but rarely-used FilterBitsBuilder::CalculateNumEntry, which is replaced with ApproximateNumEntries taking a size_t parameter and returning size_t. +* To improve portability the functions `Env::GetChildren` and `Env::GetChildrenFileAttributes` will no longer return entries for the special directories `.` or `..`. +* Added a new option `track_and_verify_wals_in_manifest`. 
If `true`, the log numbers and sizes of the synced WALs are tracked in MANIFEST, then during DB recovery, if a synced WAL is missing from disk, or the WAL's size does not match the recorded size in MANIFEST, an error will be reported and the recovery will be aborted. Note that this option does not work with secondary instance. +* `rocksdb_approximate_sizes` and `rocksdb_approximate_sizes_cf` in the C API now requires an error pointer (`char** errptr`) for receiving any error. +* All overloads of DB::GetApproximateSizes now return Status, so that any failure to obtain the sizes is indicated to the caller. + +## 6.15.0 (2020-11-13) +### Bug Fixes +* Fixed a bug in the following combination of features: indexes with user keys (`format_version >= 3`), indexes are partitioned (`index_type == kTwoLevelIndexSearch`), and some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`). The bug could cause keys to be truncated when read from the index leading to wrong read results or other unexpected behavior. +* Fixed a bug when indexes are partitioned (`index_type == kTwoLevelIndexSearch`), some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`), and partitions reads could be mixed between block cache and directly from the file (e.g., with `enable_index_compression == 1` and `mmap_read == 1`, partitions that were stored uncompressed due to poor compression ratio would be read directly from the file via mmap, while partitions that were stored compressed would be read from block cache). The bug could cause index partitions to be mistakenly considered empty during reads leading to wrong read results. +* Since 6.12, memtable lookup should report unrecognized value_type as corruption (#7121). +* Since 6.14, fix false positive flush/compaction `Status::Corruption` failure when `paranoid_file_checks == true` and range tombstones were written to the compaction output files. 
+* Since 6.14, fix a bug that could cause a stalled write to crash with mixed of slowdown and no_slowdown writes (`WriteOptions.no_slowdown=true`). +* Fixed a bug which causes hang in closing DB when refit level is set in opt build. It was because ContinueBackgroundWork() was called in assert statement which is a no op. It was introduced in 6.14. +* Fixed a bug which causes Get() to return incorrect result when a key's merge operand is applied twice. This can occur if the thread performing Get() runs concurrently with a background flush thread and another thread writing to the MANIFEST file (PR6069). +* Reverted a behavior change silently introduced in 6.14.2, in which the effects of the `ignore_unknown_options` flag (used in option parsing/loading functions) changed. +* Reverted a behavior change silently introduced in 6.14, in which options parsing/loading functions began returning `NotFound` instead of `InvalidArgument` for option names not available in the present version. +* Fixed MultiGet bugs it doesn't return valid data with user defined timestamp. +* Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before `TableBuilder::Finish()` in compaction job. For example, the `NeedCompact()` method of `CompactOnDeletionCollector` returned by built-in `CompactOnDeletionCollectorFactory` requires `BlockBasedTable::Finish()` to return the correct result. The bug can cause a compaction-generated file not to be marked for future compaction based on deletion ratio. +* Fixed a seek issue with prefix extractor and timestamp. +* Fixed a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer. +* Fixed a bug of a recovery corner case, details in PR7621. + +### Public API Change +* Deprecate `BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache` and `BlockBasedTableOptions::pin_top_level_index_and_filter`. 
These options still take effect until users migrate to the replacement APIs in `BlockBasedTableOptions::metadata_cache_options`. Migration guidance can be found in the API comments on the deprecated options. +* Add new API `DB::VerifyFileChecksums` to verify SST file checksum with corresponding entries in the MANIFEST if present. Current implementation requires scanning and recomputing file checksums. + +### Behavior Changes +* The dictionary compression settings specified in `ColumnFamilyOptions::compression_opts` now additionally affect files generated by flush and compaction to non-bottommost level. Previously those settings at most affected files generated by compaction to bottommost level, depending on whether `ColumnFamilyOptions::bottommost_compression_opts` overrode them. Users who relied on dictionary compression settings in `ColumnFamilyOptions::compression_opts` affecting only the bottommost level can keep the behavior by moving their dictionary settings to `ColumnFamilyOptions::bottommost_compression_opts` and setting its `enabled` flag. +* When the `enabled` flag is set in `ColumnFamilyOptions::bottommost_compression_opts`, those compression options now take effect regardless of the value in `ColumnFamilyOptions::bottommost_compression`. Previously, those compression options only took effect when `ColumnFamilyOptions::bottommost_compression != kDisableCompressionOption`. Now, they additionally take effect when `ColumnFamilyOptions::bottommost_compression == kDisableCompressionOption` (such a setting causes bottommost compression type to fall back to `ColumnFamilyOptions::compression_per_level` if configured, and otherwise fall back to `ColumnFamilyOptions::compression`). + +### New Features +* An EXPERIMENTAL new Bloom alternative that saves about 30% space compared to Bloom filters, with about 3-4x construction time and similar query times is available using NewExperimentalRibbonFilterPolicy. 
+ +## 6.14 (2020-10-09) +### Bug fixes +* Fixed a bug after a `CompactRange()` with `CompactRangeOptions::change_level` set fails due to a conflict in the level change step, which caused all subsequent calls to `CompactRange()` with `CompactRangeOptions::change_level` set to incorrectly fail with a `Status::NotSupported("another thread is refitting")` error. +* Fixed a bug that the bottom most level compaction could still be a trivial move even if `BottommostLevelCompaction.kForce` or `kForceOptimized` is set. + +### Public API Change +* The methods to create and manage EncryptedEnv have been changed. The EncryptionProvider is now passed to NewEncryptedEnv as a shared pointer, rather than a raw pointer. Comparably, the CTREncryptionProvider now takes a shared pointer, rather than a reference, to a BlockCipher. CreateFromString methods have been added to BlockCipher and EncryptionProvider to provide a single API by which different ciphers and providers can be created, respectively. +* The internal classes (CTREncryptionProvider, ROT13BlockCipher, CTRCipherStream) associated with the EncryptedEnv have been moved out of the public API. To create a CTREncryptionProvider, one can either use EncryptionProvider::NewCTRProvider, or EncryptionProvider::CreateFromString("CTR"). To create a new ROT13BlockCipher, one can either use BlockCipher::NewROT13Cipher or BlockCipher::CreateFromString("ROT13"). +* The EncryptionProvider::AddCipher method has been added to allow keys to be added to an EncryptionProvider. This API will allow future providers to support multiple cipher keys. +* Add a new option "allow_data_in_errors". When this new option is set by users, it allows users to opt-in to get error messages containing corrupted keys/values. Corrupt keys, values will be logged in the messages, logs, status etc. that will help users with the useful information regarding affected data. 
By default value of this option is set false to prevent users data to be exposed in the messages so currently, data will be redacted from logs, messages, status by default. +* AdvancedColumnFamilyOptions::force_consistency_checks is now true by default, for more proactive DB corruption detection at virtually no cost (estimated two extra CPU cycles per million on a major production workload). Corruptions reported by these checks now mention "force_consistency_checks" in case a false positive corruption report is suspected and the option needs to be disabled (unlikely). Since existing column families have a saved setting for force_consistency_checks, only new column families will pick up the new default. + +### General Improvements +* The settings of the DBOptions and ColumnFamilyOptions are now managed by Configurable objects (see New Features). The same convenience methods to configure these options still exist but the backend implementation has been unified under a common implementation. + +### New Features + +* Methods to configure serialize, and compare -- such as TableFactory -- are exposed directly through the Configurable base class (from which these objects inherit). This change will allow for better and more thorough configuration management and retrieval in the future. The options for a Configurable object can be set via the ConfigureFromMap, ConfigureFromString, or ConfigureOption method. The serialized version of the options of an object can be retrieved via the GetOptionString, ToString, or GetOption methods. The list of options supported by an object can be obtained via the GetOptionNames method. The "raw" object (such as the BlockBasedTableOption) for an option may be retrieved via the GetOptions method. Configurable options can be compared via the AreEquivalent method. The settings within a Configurable object may be validated via the ValidateOptions method. 
The object may be initialized (at which point only mutable options may be updated) via the PrepareOptions method. +* Introduce options.check_flush_compaction_key_order with default value to be true. With this option, during flush and compaction, key order will be checked when writing to each SST file. If the order is violated, the flush or compaction will fail. +* Added is_full_compaction to CompactionJobStats, so that the information is available through the EventListener interface. +* Add more stats for MultiGet in Histogram to get number of data blocks, index blocks, filter blocks and sst files read from file system per level. +* SST files have a new table property called db_host_id, which is set to the hostname by default. A new option in DBOptions, db_host_id, allows the property value to be overridden with a user specified string, or disable it completely by making the option string empty. +* Methods to create customizable extensions -- such as TableFactory -- are exposed directly through the Customizable base class (from which these objects inherit). This change will allow these Customizable classes to be loaded and configured in a standard way (via CreateFromString). More information on how to write and use Customizable classes is in the customizable.h header file. + +## 6.13 (2020-09-12) +### Bug fixes +* Fix a performance regression introduced in 6.4 that makes an upper bound check for every Next() even if keys are within a data block that is within the upper bound. +* Fix a possible corruption to the LSM state (overlapping files within a level) when a `CompactRange()` for refitting levels (`CompactRangeOptions::change_level == true`) and another manual compaction are executed in parallel. +* Sanitize `recycle_log_file_num` to zero when the user attempts to enable it in combination with `WALRecoveryMode::kTolerateCorruptedTailRecords`. Previously the two features were allowed together, which compromised the user's configured crash-recovery guarantees. 
+* Fix a bug where a level refitting in CompactRange() might race with an automatic compaction that puts the data to the target level of the refitting. The bug has been there for years. +* Fixed a bug in version 6.12 in which BackupEngine::CreateNewBackup could fail intermittently with non-OK status when backing up a read-write DB configured with a DBOptions::file_checksum_gen_factory. +* Fix useless no-op compactions scheduled upon snapshot release when options.disable-auto-compactions = true. +* Fix a bug when max_write_buffer_size_to_maintain is set, immutable flushed memtable destruction is delayed until the next super version is installed. A memtable is not added to delete list because of its reference held by super version and super version doesn't switch because of empty delete list. So memory usage keeps on increasing beyond write_buffer_size + max_write_buffer_size_to_maintain. +* Avoid converting MERGES to PUTS when allow_ingest_behind is true. +* Fix compression dictionary sampling together with `SstFileWriter`. Previously, the dictionary would be trained/finalized immediately with zero samples. Now, the whole `SstFileWriter` file is buffered in memory and then sampled. +* Fix a bug with `avoid_unnecessary_blocking_io=1` and creating backups (BackupEngine::CreateNewBackup) or checkpoints (Checkpoint::Create). With this setting and WAL enabled, these operations could randomly fail with non-OK status. +* Fix a bug in which bottommost compaction continues to advance the underlying InternalIterator to skip tombstones even after shutdown. + +### New Features +* A new field `std::string requested_checksum_func_name` is added to `FileChecksumGenContext`, which enables the checksum factory to create generators for a suite of different functions. +* Added a new subcommand, `ldb unsafe_remove_sst_file`, which removes a lost or corrupt SST file from a DB's metadata. This command involves data loss and must not be used on a live DB. 
+ +### Performance Improvements +* Reduce thread number for multiple DB instances by re-using one global thread for statistics dumping and persisting. +* Reduce write-amp in heavy write bursts in `kCompactionStyleLevel` compaction style with `level_compaction_dynamic_level_bytes` set. +* BackupEngine incremental backups no longer read DB table files that are already saved to a shared part of the backup directory, unless `share_files_with_checksum` is used with `kLegacyCrc32cAndFileSize` naming (discouraged). + * For `share_files_with_checksum`, we are confident there is no regression (vs. pre-6.12) in detecting DB or backup corruption at backup creation time, mostly because the old design did not leverage this extra checksum computation for detecting inconsistencies at backup creation time. + * For `share_table_files` without "checksum" (not recommended), there is a regression in detecting fundamentally unsafe use of the option, greatly mitigated by file size checking (under "Behavior Changes"). Almost no reason to use `share_files_with_checksum=false` should remain. + * `DB::VerifyChecksum` and `BackupEngine::VerifyBackup` with checksum checking are still able to catch corruptions that `CreateNewBackup` does not. + +### Public API Change +* Expose kTypeDeleteWithTimestamp in EntryType and update GetEntryType() accordingly. +* Added file_checksum and file_checksum_func_name to TableFileCreationInfo, which can pass the table file checksum information through the OnTableFileCreated callback during flush and compaction. +* A warning is added to `DB::DeleteFile()` API describing its known problems and deprecation plan. +* Add a new stats level, i.e. StatsLevel::kExceptTickers (PR7329) to exclude tickers even if application passes a non-null Statistics object. +* Added a new status code IOStatus::IOFenced() for the Env/FileSystem to indicate that writes from this instance are fenced off. 
Like any other background error, this error is returned to the user in Put/Merge/Delete/Flush calls and can be checked using Status::IsIOFenced(). + +### Behavior Changes +* File abstraction `FSRandomAccessFile.Prefetch()` default return status is changed from `OK` to `NotSupported`. If the user inherited file doesn't implement prefetch, RocksDB will create internal prefetch buffer to improve read performance. +* When retryable IO error happens during Flush (manifest write error is excluded) and WAL is disabled, originally it is mapped to kHardError. Now, it is mapped to soft error. So DB will not stall the writes unless the memtable is full. At the same time, when auto resume is triggered to recover the retryable IO error during Flush, SwitchMemtable is not called to avoid generating too many small immutable memtables. If WAL is enabled, no behavior changes. +* When considering whether a table file is already backed up in a shared part of backup directory, BackupEngine would already query the sizes of source (DB) and pre-existing destination (backup) files. BackupEngine now uses these file sizes to detect corruption, as at least one of (a) old backup, (b) backup in progress, or (c) current DB is corrupt if there's a size mismatch. + +### Others +* Error in prefetching partitioned index blocks will not be swallowed. It will fail the query and return the IOError to users. + +## 6.12 (2020-07-28) +### Public API Change +* Encryption file classes now exposed for inheritance in env_encryption.h +* File I/O listener is extended to cover more I/O operations. Now class `EventListener` in listener.h contains new callback functions: `OnFileFlushFinish()`, `OnFileSyncFinish()`, `OnFileRangeSyncFinish()`, `OnFileTruncateFinish()`, and ``OnFileCloseFinish()``. +* `FileOperationInfo` now reports `duration` measured by `std::chrono::steady_clock` and `start_ts` measured by `std::chrono::system_clock` instead of start and finish timestamps measured by `system_clock`. 
Note that `system_clock` is called before `steady_clock` in program order at operation starts. +* `DB::GetDbSessionId(std::string& session_id)` is added. `session_id` stores a unique identifier that gets reset every time the DB is opened. This DB session ID should be unique among all open DB instances on all hosts, and should be unique among re-openings of the same or other DBs. This identifier is recorded in the LOG file on the line starting with "DB Session ID:". +* `DB::OpenForReadOnly()` now returns `Status::NotFound` when the specified DB directory does not exist. Previously the error returned depended on the underlying `Env`. This change is available in all 6.11 releases as well. +* A parameter `verify_with_checksum` is added to `BackupEngine::VerifyBackup`, which is false by default. If it is true, `BackupEngine::VerifyBackup` verifies checksums and file sizes of backup files. Pass `false` for `verify_with_checksum` to maintain the previous behavior and performance of `BackupEngine::VerifyBackup`, by only verifying sizes of backup files. + +### Behavior Changes +* Best-efforts recovery ignores CURRENT file completely. If CURRENT file is missing during recovery, best-efforts recovery still proceeds with MANIFEST file(s). +* In best-efforts recovery, an error that is not Corruption or IOError::kNotFound or IOError::kPathNotFound will be overwritten silently. Fix this by checking all non-ok cases and return early. +* When `file_checksum_gen_factory` is set to `GetFileChecksumGenCrc32cFactory()`, BackupEngine will compare the crc32c checksums of table files computed when creating a backup to the expected checksums stored in the DB manifest, and will fail `CreateNewBackup()` on mismatch (corruption). If the `file_checksum_gen_factory` is not set or set to any other customized factory, there is no checksum verification to detect if SST files in a DB are corrupt when read, copied, and independently checksummed by BackupEngine. 
+* When a DB sets `stats_dump_period_sec > 0`, either as the initial value for DB open or as a dynamic option change, the first stats dump is staggered in the following X seconds, where X is an integer in `[0, stats_dump_period_sec)`. Subsequent stats dumps are still spaced `stats_dump_period_sec` seconds apart. +* When the paranoid_file_checks option is true, a hash of all keys and values is generated when the SST file is written, and then the values are read back in to validate the file. A corruption is signaled if the two hashes do not match. + +### Bug fixes +* Compressed block cache was automatically disabled with read-only DBs by mistake. Now it is fixed: compressed block cache will be in effect with read-only DB too. +* Fix a bug of wrong iterator result if another thread finishes an update and a DB flush between two statements. +* Disable file deletion after MANIFEST write/sync failure until db re-open or Resume() so that subsequent re-open will not see MANIFEST referencing deleted SSTs. +* Fix a bug when index_type == kTwoLevelIndexSearch in PartitionedIndexBuilder to update FlushPolicy to point to internal key partitioner when it changes from user-key mode to internal-key mode in index partition. +* Make compaction report InternalKey corruption while iterating over the input. +* Fix a bug which may cause MultiGet to be slow because it may read more data than requested, but this won't affect correctness. The bug was introduced in 6.10 release. +* Fail recovery and report once hitting a physical log record checksum mismatch, while reading MANIFEST. RocksDB should not continue processing the MANIFEST any further. +* Fixed a bug in size-amp-triggered and periodic-triggered universal compaction, where the compression settings for the first input level were used rather than the compression settings for the output (bottom) level. 
+ +### New Features +* DB identity (`db_id`) and DB session identity (`db_session_id`) are added to table properties and stored in SST files. SST files generated from SstFileWriter and Repairer have DB identity "SST Writer" and "DB Repairer", respectively. Their DB session IDs are generated in the same way as `DB::GetDbSessionId`. The session ID for SstFileWriter (resp., Repairer) resets every time `SstFileWriter::Open` (resp., `Repairer::Run`) is called. +* Added experimental option BlockBasedTableOptions::optimize_filters_for_memory for reducing allocated memory size of Bloom filters (~10% savings with Jemalloc) while preserving the same general accuracy. To have an effect, the option requires format_version=5 and malloc_usable_size. Enabling this option is forward and backward compatible with existing format_version=5. +* `BackupableDBOptions::share_files_with_checksum_naming` is added with new default behavior for naming backup files with `share_files_with_checksum`, to address performance and backup integrity issues. See API comments for details. +* Added auto resume function to automatically recover the DB from background Retryable IO Error. When retryable IOError happens during flush and WAL write, the error is mapped to Hard Error and DB will be in read mode. When retryable IO Error happens during compaction, the error will be mapped to Soft Error. DB is still in write/read mode. Autoresume function will create a thread for a DB to call DB->ResumeImpl() to try the recover for Retryable IO Error during flush and WAL write. Compaction will be rescheduled by itself if retryable IO Error happens. Auto resume may also cause other Retryable IO Error during the recovery, so the recovery will fail. Retry the auto resume may solve the issue, so we use max_bgerror_resume_count to decide how many resume cycles will be tried in total. If it is <=0, auto resume retryable IO Error is disabled. Default is INT_MAX, which will lead to an infinite auto resume. 
bgerror_resume_retry_interval decides the time interval between two auto resumes. +* Option `max_subcompactions` can be set dynamically using DB::SetDBOptions(). +* Added experimental ColumnFamilyOptions::sst_partitioner_factory to determine the partitioning of sst files. This helps compaction to split the files on interesting boundaries (key prefixes) to make propagation of sst files less write amplifying (covering the whole key space). + +### Performance Improvements +* Eliminate key copies for internal comparisons while accessing ingested block-based tables. +* Reduce key comparisons during random access in all block-based tables. +* BackupEngine avoids unnecessary repeated checksum computation for backing up a table file to the `shared_checksum` directory when using `share_files_with_checksum_naming = kUseDbSessionId` (new default), except on SST files generated before this version of RocksDB, which fall back on using `kLegacyCrc32cAndFileSize`. + +## 6.11 (2020-06-12) +### Bug Fixes +* Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true. +* Fix possible false NotFound status from batched MultiGet using index type kHashSearch. +* Fix corruption caused by enabling delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode, along with parallel compactions. The bug can result in two parallel compactions picking the same input files, resulting in the DB resurrecting older and deleted versions of some keys. +* Fix a use-after-free bug in best-efforts recovery. column_family_memtables_ needs to point to valid ColumnFamilySet. +* Let best-efforts recovery ignore corrupted files during table loading. +* Fix corrupt key read from ingested file when iterator direction switches from reverse to forward at a key that is a prefix of another key in the same file. It is only possible in files with a non-zero global seqno. 
+* Fix abnormally large estimate from GetApproximateSizes when a range starts near the end of one SST file and near the beginning of another. Now GetApproximateSizes consistently and fairly includes the size of SST metadata in addition to data blocks, attributing metadata proportionally among the data blocks based on their size. +* Fix potential file descriptor leakage in PosixEnv's IsDirectory() and NewRandomAccessFile(). +* Fix false negative from the VerifyChecksum() API when there is a checksum mismatch in an index partition block in a BlockBasedTable format table file (index_type is kTwoLevelIndexSearch). +* Fix sst_dump to return non-zero exit code if the specified file is not a recognized SST file or fails requested checks. +* Fix incorrect results from batched MultiGet for duplicate keys, when the duplicate key matches the largest key of an SST file and the value type for the key in the file is a merge value. + +### Public API Change +* Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request. +* BlobDB now explicitly disallows using the default column family's storage directories as blob directory. +* DeleteRange now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. +* ldb now uses options.force_consistency_checks = true by default and "--disable_consistency_checks" is added to disable it. +* DB::OpenForReadOnly no longer creates files or directories if the named DB does not exist, unless create_if_missing is set to true. +* The consistency checks that validate LSM state changes (table file additions/deletions during flushes and compactions) are now stricter, more efficient, and no longer optional, i.e. they are performed even if `force_consistency_checks` is `false`. 
+* Disable delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode and num_levels = 1 in order to avoid a corruption bug. +* `pin_l0_filter_and_index_blocks_in_cache` no longer applies to L0 files larger than `1.5 * write_buffer_size` to give more predictable memory usage. Such L0 files may exist due to intra-L0 compaction, external file ingestion, or user dynamically changing `write_buffer_size` (note, however, that files that are already pinned will continue being pinned, even after such a dynamic change). +* In point-in-time wal recovery mode, fail database recovery in case of IOError while reading the WAL to avoid data loss. +* A new method `Env::LowerThreadPoolCPUPriority(Priority, CpuPriority)` is added to `Env` to be able to lower to a specific priority such as `CpuPriority::kIdle`. + +### New Features +* sst_dump to add a new --readahead_size argument. Users can specify read size when scanning the data. Sst_dump also tries to prefetch tail part of the SST files so usually some number of I/Os are saved there too. +* Generate file checksum in SstFileWriter if Options.file_checksum_gen_factory is set. The checksum and checksum function name are stored in ExternalSstFileInfo after the sst file write is finished. +* Add a value_size_soft_limit in read options which limits the cumulative value size of keys read in batches in MultiGet. Once the cumulative value size of found keys exceeds read_options.value_size_soft_limit, all the remaining keys are returned with status Abort without further finding their values. By default the value_size_soft_limit is std::numeric_limits::max(). +* Enable SST file ingestion with file checksum information when calling IngestExternalFiles(const std::vector& args). Added files_checksums and files_checksum_func_names to IngestExternalFileArg such that user can ingest the sst files with their file checksum information. Added verify_file_checksum to IngestExternalFileOptions (default is True). 
To be backward compatible, if DB does not enable file checksum or user does not provide checksum information (vectors of files_checksums and files_checksum_func_names are both empty), verification of file checksum is always successful. If DB enables file checksum, DB will always generate the checksum for each ingested SST file during Prepare stage of ingestion and store the checksum in Manifest, unless verify_file_checksum is False and checksum information is provided by the application. In this case, we only verify the checksum function name and directly store the ingested checksum in Manifest. If verify_file_checksum is set to True, DB will verify the ingested checksum and function name with the generated ones. Any mismatch will fail the ingestion. Note that, if IngestExternalFileOptions::write_global_seqno is True, the seqno will be changed in the ingested file. Therefore, the checksum of the file will be changed. In this case, a new checksum will be generated after the seqno is updated and be stored in the Manifest. + +### Performance Improvements +* Eliminate redundant key comparisons during random access in block-based tables. + +## 6.10 (2020-05-02) +### Bug Fixes +* Fix wrong result being read from ingested file. May happen when a key in the file happens to be a prefix of another key also in the file. The issue can further cause more data corruption. The issue exists with rocksdb >= 5.0.0 since DB::IngestExternalFile() was introduced. +* Finish implementation of BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It's now ready for use. Significantly reduces read amplification in some setups, especially for iterator seeks. +* Fix a bug by updating CURRENT file so that it points to the correct MANIFEST file after best-efforts recovery. +* Fixed a bug where ColumnFamilyHandle objects were not cleaned up in case an error happened during BlobDB's open after the base DB had been opened. 
+* Fix a potential undefined behavior caused by trying to dereference nullable pointer (timestamp argument) in DB::MultiGet. +* Fix a bug caused by not including user timestamp in MultiGet LookupKey construction. This can lead to wrong query result since the trailing bytes of a user key, if not shorter than timestamp, will be mistaken for user timestamp. +* Fix a bug caused by using wrong compare function when sorting the input keys of MultiGet with timestamps. +* Upgraded version of bzip library (1.0.6 -> 1.0.8) used with RocksJava to address potential vulnerabilities if an attacker can manipulate compressed data saved and loaded by RocksDB (not normal). See issue #6703. + +### Public API Change +* Add a ConfigOptions argument to the APIs dealing with converting options to and from strings and files. The ConfigOptions is meant to replace some of the options (such as input_strings_escaped and ignore_unknown_options) and allow for more parameters to be passed in the future without changing the function signature. +* Add NewFileChecksumGenCrc32cFactory to the file checksum public API, such that the builtin Crc32c based file checksum generator factory can be used by applications. +* Add IsDirectory to Env and FS to indicate if a path is a directory. + +### New Features +* Added support for pipelined & parallel compression optimization for `BlockBasedTableBuilder`. This optimization makes block building, block compression and block appending a pipeline, and uses multiple threads to accelerate block compression. Users can set `CompressionOptions::parallel_threads` greater than 1 to enable compression parallelism. This feature is experimental for now. +* Provide an allocator for memkind to be used with block cache. This is to work with memory technologies (Intel DCPMM is one such technology currently available) that require different libraries for allocation and management (such as PMDK and memkind). 
The high capacities available make it possible to provision large caches (up to several TBs in size) beyond what is achievable with DRAM. +* Option `max_background_flushes` can be set dynamically using DB::SetDBOptions(). +* Added functionality in sst_dump tool to check the compressed file size for different compression levels and print the time spent on compressing files with each compression type. Added arguments `--compression_level_from` and `--compression_level_to` to report size of all compression levels and one compression_type must be specified with it so that it will report compressed sizes of one compression type with different levels. +* Added statistics for redundant insertions into block cache: rocksdb.block.cache.*add.redundant. (There is currently no coordination to ensure that only one thread loads a table block when many threads are trying to access that same table block.) + +### Bug Fixes +* Fix a bug when making options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts dynamically changeable: the modified values are not written to option files or returned back to users when being queried. +* Fix a bug where index key comparisons were unaccounted in `PerfContext::user_key_comparison_count` for lookups in files written with `format_version >= 3`. +* Fix many bloom.filter statistics not being updated in batch MultiGet. + +### Performance Improvements +* Improve performance of batch MultiGet with partitioned filters, by sharing block cache lookups to applicable filter blocks. +* Reduced memory copies when fetching and uncompressing compressed blocks from sst files. + +## 6.9.0 (2020-03-29) +### Behavior changes +* Since RocksDB 6.8, ttl-based FIFO compaction can drop a file whose oldest key becomes older than options.ttl while others have not. This fix reverts this and makes ttl-based FIFO compaction use the file's flush time as the criterion. 
This fix also requires that max_open_files = -1 and compaction_options_fifo.allow_compaction = false to function properly. + +### Public API Change +* Fix spelling so that API now has correctly spelled transaction state name `COMMITTED`, while the old misspelled `COMMITED` is still available as an alias. +* Updated default format_version in BlockBasedTableOptions from 2 to 4. SST files generated with the new default can be read by RocksDB versions 5.16 and newer, and use more efficient encoding of keys in index blocks. +* A new parameter `CreateBackupOptions` is added to both `BackupEngine::CreateNewBackup` and `BackupEngine::CreateNewBackupWithMetadata`, you can decrease CPU priority of `BackupEngine`'s background threads by setting `decrease_background_thread_cpu_priority` and `background_thread_cpu_priority` in `CreateBackupOptions`. +* Updated the public API of SST file checksum. Introduce the FileChecksumGenFactory to create the FileChecksumGenerator for each SST file, such that the FileChecksumGenerator is not shared and it can be more general for checksum implementations. Changed the FileChecksumGenerator interface from Value, Extend, and GetChecksum to Update, Finalize, and GetChecksum. Finalize should be only called once after all data is processed to generate the final checksum. Temporal data should be maintained by the FileChecksumGenerator object itself and finally it can return the checksum string. + +### Bug Fixes +* Fix a bug where range tombstone blocks in ingested files were cached incorrectly during ingestion. If range tombstones were read from those incorrectly cached blocks, the keys they covered would be exposed. * Fix a data race that might cause crash when calling DB::GetCreationTimeOfOldestFile() by a small chance. The bug was introduced in 6.6 Release. +* Fix a bug where a boolean value optimize_filters_for_hits was for max threads when calling load table handles after a flush or compaction. The value is correct to 1. 
The bug should not cause user visible problems. +* Fix a bug which might crash the service when write buffer manager fails to insert the dummy handle to the block cache. + +### Performance Improvements +* In CompactRange, for levels starting from 0, if the level does not have any file with any key falling in the specified range, the level is skipped. So instead of always compacting from level 0, the compaction starts from the first level with keys in the specified range until the last such level. +* Reduced memory copy when reading sst footer and blobdb in direct IO mode. +* When restarting a database with large numbers of sst files, large amount of CPU time is spent on getting logical block size of the sst files, which slows down the starting progress, this inefficiency is optimized away with an internal cache for the logical block sizes. + +### New Features +* Basic support for user timestamp in iterator. Seek/SeekToFirst/Next and lower/upper bounds are supported. Reverse iteration is not supported. Merge is not considered. +* When file lock failure when the lock is held by the current process, return acquiring time and thread ID in the error message. +* Added a new option, best_efforts_recovery (default: false), to allow database to open in a db dir with missing table files. During best efforts recovery, missing table files are ignored, and database recovers to the most recent state without missing table file. Cross-column-family consistency is not guaranteed even if WAL is enabled. +* options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts are now dynamically changeable. -## 6.8.0 (02/24/2020) +## 6.8.0 (2020-02-24) ### Java API Changes * Major breaking changes to Java comparators, toward standardizing on ByteBuffer for performant, locale-neutral operations on keys (#6252). * Added overloads of common API methods using direct ByteBuffers for keys and values (#2283). 
@@ -30,7 +728,7 @@ * `db_bench` now supports `value_size_distribution_type`, `value_size_min`, `value_size_max` options for generating random variable sized value. Added `blob_db_compression_type` option for BlobDB to enable blob compression. * Replace RocksDB namespace "rocksdb" with flag "ROCKSDB_NAMESPACE" which if is not defined, defined as "rocksdb" in header file rocksdb_namespace.h. -## 6.7.0 (01/21/2020) +## 6.7.0 (2020-01-21) ### Public API Change * Added a rocksdb::FileSystem class in include/rocksdb/file_system.h to encapsulate file creation/read/write operations, and an option DBOptions::file_system to allow a user to pass in an instance of rocksdb::FileSystem. If its a non-null value, this will take precendence over DBOptions::env for file operations. A new API rocksdb::FileSystem::Default() returns a platform default object. The DBOptions::env option and Env::Default() API will continue to be used for threading and other OS related functions, and where DBOptions::file_system is not specified, for file operations. For storage developers who are accustomed to rocksdb::Env, the interface in rocksdb::FileSystem is new and will probably undergo some changes as more storage systems are ported to it from rocksdb::Env. As of now, no env other than Posix has been ported to the new interface. * A new rocksdb::NewSstFileManager() API that allows the caller to pass in separate Env and FileSystem objects. @@ -55,11 +753,11 @@ * Introduce ReadOptions.auto_prefix_mode. When set to true, iterator will return the same result as total order seek, but may choose to use prefix seek internally based on seek key and iterator upper bound. * MultiGet() can use IO Uring to parallelize read from the same SST file. This featuer is by default disabled. It can be enabled with environment variable ROCKSDB_USE_IO_URING. 
-## 6.6.2 (01/13/2020) +## 6.6.2 (2020-01-13) ### Bug Fixes * Fixed a bug where non-L0 compaction input files were not considered to compute the `creation_time` of new compaction outputs. -## 6.6.1 (01/02/2020) +## 6.6.1 (2020-01-02) ### Bug Fixes * Fix a bug in WriteBatchWithIndex::MultiGetFromBatchAndDB, which is called by Transaction::MultiGet, that causes due to stale pointer access when the number of keys is > 32 * Fixed two performance issues related to memtable history trimming. First, a new SuperVersion is now created only if some memtables were actually trimmed. Second, trimming is only scheduled if there is at least one flushed memtable that is kept in memory for the purposes of transaction conflict checking. @@ -69,7 +767,7 @@ * Delete superversions in BackgroundCallPurge. * Fix use-after-free and double-deleting files in BackgroundCallPurge(). -## 6.6.0 (11/25/2019) +## 6.6.0 (2019-11-25) ### Bug Fixes * Fix data corruption caused by output of intra-L0 compaction on ingested file not being placed in correct order in L0. * Fix a data race between Version::GetColumnFamilyMetaData() and Compaction::MarkFilesBeingCompacted() for access to being_compacted (#6056). The current fix acquires the db mutex during Version::GetColumnFamilyMetaData(), which may cause regression. @@ -122,19 +820,19 @@ * For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement. * Level iterator to invlidate the iterator more often in prefix seek and the level is filtered out by prefix bloom. 
-## 6.5.2 (11/15/2019) +## 6.5.2 (2019-11-15) ### Bug Fixes * Fix a assertion failure in MultiGet() when BlockBasedTableOptions::no_block_cache is true and there is no compressed block cache * Fix a buffer overrun problem in BlockBasedTable::MultiGet() when compression is enabled and no compressed block cache is configured. * If a call to BackupEngine::PurgeOldBackups or BackupEngine::DeleteBackup suffered a crash, power failure, or I/O error, files could be left over from old backups that could only be purged with a call to GarbageCollect. Any call to PurgeOldBackups, DeleteBackup, or GarbageCollect should now suffice to purge such files. -## 6.5.1 (10/16/2019) +## 6.5.1 (2019-10-16) ### Bug Fixes * Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. * Fix a bug in BlockBasedTableIterator that might return incorrect results when reseek happens with a different iterator upper bound. * Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. -## 6.5.0 (9/13/2019) +## 6.5.0 (2019-09-13) ### Bug Fixes * Fixed a number of data races in BlobDB. * Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0.. @@ -155,7 +853,7 @@ ### Performance Improvements * Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance. -## 6.4.0 (7/30/2019) +## 6.4.0 (2019-07-30) ### Default Option Change * LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. 
The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explicitly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. @@ -191,7 +889,7 @@ * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes/filters as well. -## 6.3.2 (8/15/2019) +## 6.3.2 (2019-08-15) ### Public API Change * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. @@ -199,11 +897,11 @@ * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes as well. -## 6.3.1 (7/24/2019) +## 6.3.1 (2019-07-24) ### Bug Fixes * Fix auto rolling bug introduced in 6.3.0, which causes segfault if log file creation fails. -## 6.3.0 (6/18/2019) +## 6.3.0 (2019-06-18) ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. @@ -240,7 +938,7 @@ * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. 
* On DB open, delete WAL trash files left behind in wal_dir -## 6.2.0 (4/30/2019) +## 6.2.0 (2019-04-30) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. @@ -262,7 +960,7 @@ * Close a WAL file before another thread deletes it. * Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. -## 6.1.1 (4/9/2019) +## 6.1.1 (2019-04-09) ### New Features * When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry. @@ -272,7 +970,7 @@ * Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction. * Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries. -## 6.1.0 (3/27/2019) +## 6.1.0 (2019-03-27) ### New Features * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. * Added a feature to perform data-block sampling for compressibility, and report stats to user. @@ -290,7 +988,7 @@ * Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms. * Fix SstFileReader not able to open file ingested with write_glbal_seqno=true. -## 6.0.0 (2/19/2019) +## 6.0.0 (2019-02-19) ### New Features * Enabled checkpoint on readonly db (DBImplReadOnly). * Make DB ignore dropped column families while committing results of atomic flush. @@ -332,7 +1030,7 @@ ### Change Default Options * Change options.compaction_pri's default to kMinOverlappingRatio -## 5.18.0 (11/30/2018) +## 5.18.0 (2018-11-30) ### New Features * Introduced `JemallocNodumpAllocator` memory allocator. 
When being use, block cache will be excluded from core dump. * Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. Added per-level perf context for bloom filter and `Get` query. @@ -360,7 +1058,7 @@ * Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously. * The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files. -## 5.17.0 (10/05/2018) +## 5.17.0 (2018-10-05) ### Public API Change * `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero. * Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause stall or not. @@ -374,21 +1072,21 @@ * Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. * Sync CURRENT file contents during checkpoint. -## 5.16.3 (10/1/2018) +## 5.16.3 (2018-10-01) ### Bug Fixes * Fix crash caused when `CompactFiles` run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options. -## 5.16.2 (9/21/2018) +## 5.16.2 (2018-09-21) ### Bug Fixes * Fix bug in partition filters with format_version=4. 
-## 5.16.1 (9/17/2018) +## 5.16.1 (2018-09-17) ### Bug Fixes * Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file. * Fix RocksDB Java build and tests. * Remove sync point in Block destructor. -## 5.16.0 (8/21/2018) +## 5.16.0 (2018-08-21) ### Public API Change * The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons * GetAllKeyVersions() to take an extra argument of `max_num_ikeys`. @@ -402,7 +1100,7 @@ ### Bug Fixes * Fix a bug in misreporting the estimated partition index size in properties block. -## 5.15.0 (7/17/2018) +## 5.15.0 (2018-07-17) ### Public API Change * Remove managed iterator. ReadOptions.managed is not effective anymore. * For bottommost_compression, a compatible CompressionOptions is added via `bottommost_compression_opts`. To keep backward compatible, a new boolean `enabled` is added to CompressionOptions. For compression_opts, it will be always used no matter what value of `enabled` is. For bottommost_compression_opts, it will only be used when user set `enabled=true`, otherwise, compression_opts will be used for bottommost_compression as default. @@ -428,7 +1126,7 @@ * Fix a bug caused by not copying the block trailer with compressed SST file, direct IO, prefetcher and no compressed block cache. * Fix write can stuck indefinitely if enable_pipelined_write=true. The issue exists since pipelined write was introduced in 5.5.0. -## 5.14.0 (5/16/2018) +## 5.14.0 (2018-05-16) ### Public API Change * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. * The background thread naming convention changed (on supporting platforms) to "rocksdb:", e.g., "rocksdb:low0". @@ -461,7 +1159,7 @@ * Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. 
* Added SstFileManager to the Java API to allow managing SST files across DB instances. -## 5.13.0 (3/20/2018) +## 5.13.0 (2018-03-20) ### Public API Change * RocksDBOptionsParser::Parse()'s `ignore_unknown_options` argument will only be effective if the option file shows it is generated using a higher version of RocksDB than the current version. * Remove CompactionEventListener. @@ -477,7 +1175,7 @@ * Fix a leak in prepared_section_completed_ where the zeroed entries would not removed from the map. * Fix WAL corruption caused by race condition between user write thread and backup/checkpoint thread. -## 5.12.0 (2/14/2018) +## 5.12.0 (2018-02-14) ### Public API Change * Iterator::SeekForPrev is now a pure virtual method. This is to prevent user who implement the Iterator interface fail to implement SeekForPrev by mistake. * Add `include_end` option to make the range end exclusive when `include_end == false` in `DeleteFilesInRange()`. @@ -499,7 +1197,7 @@ * Fix advance reservation of arena block addresses. * Fix handling of empty string as checkpoint directory. -## 5.11.0 (01/08/2018) +## 5.11.0 (2018-01-08) ### Public API Change * Add `autoTune` and `getBytesPerSecond()` to RocksJava RateLimiter @@ -516,7 +1214,7 @@ * Fix a mislabel bug for bottom-pri compaction threads. * Fix DB::Flush() keep waiting after flush finish under certain condition. -## 5.10.0 (12/11/2017) +## 5.10.0 (2017-12-11) ### Public API Change * When running `make` with environment variable `USE_SSE` set and `PORTABLE` unset, will use all machine features available locally. Previously this combination only compiled SSE-related features. @@ -531,7 +1229,7 @@ * Fix performance issue in `IngestExternalFile()` affecting databases with large number of SST files. * Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker. 
-## 5.9.0 (11/1/2017) +## 5.9.0 (2017-11-01) ### Public API Change * `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened. * `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default. @@ -558,7 +1256,7 @@ * Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery. * Fix possible metadata corruption in databases using `DeleteRange()`. -## 5.8.0 (08/30/2017) +## 5.8.0 (2017-08-30) ### Public API Change * Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints. * `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr. @@ -578,7 +1276,7 @@ * Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled. * Fix potentially wrong file smallest key when range deletions separated by snapshot are written together. -## 5.7.0 (07/13/2017) +## 5.7.0 (2017-07-13) ### Public API Change * DB property "rocksdb.sstables" now prints keys in hex form. @@ -593,7 +1291,7 @@ ### Bug Fixes * Fix discarding empty compaction output files when `DeleteRange()` is used together with subcompactions. -## 5.6.0 (06/06/2017) +## 5.6.0 (2017-06-06) ### Public API Change * Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads. 
* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction. @@ -610,7 +1308,7 @@ ### Bug Fixes * Shouldn't ignore return value of fsync() in flush. -## 5.5.0 (05/17/2017) +## 5.5.0 (2017-05-17) ### New Features * FIFO compaction to support Intra L0 compaction too with CompactionOptionsFIFO.allow_compaction=true. * DB::ResetStats() to reset internal stats. @@ -627,7 +1325,7 @@ ### Bug Fixes * Fix the bug that Direct I/O uses direct reads for non-SST file -## 5.4.0 (04/11/2017) +## 5.4.0 (2017-04-11) ### Public API Change * random_access_max_buffer_size no longer has any effect * Removed Env::EnableReadAhead(), Env::ShouldForwardRawRequest() @@ -644,7 +1342,7 @@ * Introduce level-based L0->L0 compactions to reduce file count, so write delays are incurred less often. * (Experimental) Partitioning filters which creates an index on the partitions. The feature can be enabled by setting partition_filters when using kFullFilter. Currently the feature also requires two-level indexing to be enabled. Number of partitions is the same as the number of partitions for indexes, which is controlled by metadata_block_size. -## 5.3.0 (03/08/2017) +## 5.3.0 (2017-03-08) ### Public API Change * Remove disableDataSync option. * Remove timeout_hint_us option from WriteOptions. The option has been deprecated and has no effect since 3.13.0. @@ -654,7 +1352,7 @@ ### Bug Fixes * Fix the bug that iterator may skip keys -## 5.2.0 (02/08/2017) +## 5.2.0 (2017-02-08) ### Public API Change * NewLRUCache() will determine number of shard bits automatically based on capacity, if the user doesn't pass one. This also impacts the default block cache when the user doesn't explicit provide one. * Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files. 
@@ -672,7 +1370,7 @@ * Some fixes related to 2PC. * Fix bugs of data corruption in direct I/O -## 5.1.0 (01/13/2017) +## 5.1.0 (2017-01-13) * Support dynamically change `delete_obsolete_files_period_micros` option via SetDBOptions(). * Added EventListener::OnExternalFileIngested which will be called when IngestExternalFile() add a file successfully. * BackupEngine::Open and BackupEngineReadOnly::Open now always return error statuses matching those of the backup Env. @@ -681,7 +1379,7 @@ * Fix the bug that if 2PC is enabled, checkpoints may loss some recent transactions. * When file copying is needed when creating checkpoints or bulk loading files, fsync the file after the file copying. -## 5.0.0 (11/17/2016) +## 5.0.0 (2016-11-17) ### Public API Change * Options::max_bytes_for_level_multiplier is now a double along with all getters and setters. * Support dynamically change `delayed_write_rate` and `max_total_wal_size` options via SetDBOptions(). @@ -700,7 +1398,7 @@ * Add LuaCompactionFilter in utilities. This allows developers to write compaction filters in Lua. To use this feature, LUA_PATH needs to be set to the root directory of Lua. * No longer populate "LATEST_BACKUP" file in backup directory, which formerly contained the number of the latest backup. The latest backup can be determined by finding the highest numbered file in the "meta/" subdirectory. -## 4.13.0 (10/18/2016) +## 4.13.0 (2016-10-18) ### Public API Change * DB::GetOptions() reflect dynamic changed options (i.e. through DB::SetOptions()) and return copy of options instead of reference. * Added Statistics::getAndResetTickerCount(). @@ -709,7 +1407,7 @@ * Add DB::SetDBOptions() to dynamic change base_background_compactions and max_background_compactions. * Added Iterator::SeekForPrev(). This new API will seek to the last key that less than or equal to the target key. 
-## 4.12.0 (9/12/2016) +## 4.12.0 (2016-09-12) ### Public API Change * CancelAllBackgroundWork() flushes all memtables for databases containing writes that have bypassed the WAL (writes issued with WriteOptions::disableWAL=true) before shutting down background threads. * Merge options source_compaction_factor, max_grandparent_overlap_bytes and expanded_compaction_factor into max_compaction_bytes. @@ -721,7 +1419,7 @@ * Change ticker/histogram statistics implementations to accumulate data in thread-local storage, which improves CPU performance by reducing cache coherency costs. Callers of CreateDBStatistics do not need to change anything to use this feature. * Block cache mid-point insertion, where index and filter block are inserted into LRU block cache with higher priority. The feature can be enabled by setting BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority to true and high_pri_pool_ratio > 0 when creating NewLRUCache. -## 4.11.0 (8/1/2016) +## 4.11.0 (2016-08-01) ### Public API Change * options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. When it is set, RocksDB will try to allocate memory from huge page for memtable too, rather than just memtable bloom filter. @@ -729,7 +1427,7 @@ * A tool to migrate DB after options change. See include/rocksdb/utilities/option_change_migration.h. * Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destroying iterators. -## 4.10.0 (7/5/2016) +## 4.10.0 (2016-07-05) ### Public API Change * options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio and deprecate options.memtable_prefix_bloom_probes * enum type CompressionType and PerfLevel changes from char to unsigned char. Value of all PerfLevel shift by one. @@ -741,7 +1439,7 @@ * RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). 
For data written by 4.6 or earlier, RepairDB associates it with the default column family. * Add options.write_buffer_manager which allows users to control total memtable sizes across multiple DB instances. -## 4.9.0 (6/9/2016) +## 4.9.0 (2016-06-09) ### Public API changes * Add bottommost_compression option, This option can be used to set a specific compression algorithm for the bottommost level (Last level containing files in the DB). * Introduce CompactionJobInfo::compression, This field state the compression algorithm used to generate the output files of the compaction. @@ -751,7 +1449,7 @@ ### New Features * Introduce NewSimCache() in rocksdb/utilities/sim_cache.h. This function creates a block cache that is able to give simulation results (mainly hit rate) of simulating block behavior with a configurable cache size. -## 4.8.0 (5/2/2016) +## 4.8.0 (2016-05-02) ### Public API Change * Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes. * Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F @@ -761,12 +1459,12 @@ ### New Features * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size. -## 4.7.0 (4/8/2016) +## 4.7.0 (2016-04-08) ### Public API Change * rename options compaction_measure_io_stats to report_bg_io_stats and include flush too. * Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. 
These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options. -## 4.6.0 (3/10/2016) +## 4.6.0 (2016-03-10) ### Public API Changes * Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier. * Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly. @@ -777,7 +1475,7 @@ * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. * Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" -## 4.5.0 (2/5/2016) +## 4.5.0 (2016-02-05) ### Public API Changes * Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes. * Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll. @@ -788,7 +1486,7 @@ * Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true. * Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate. -## 4.4.0 (1/14/2016) +## 4.4.0 (2016-01-14) ### Public API Changes * Change names in CompactionPri and add a new one. 
* Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit. @@ -798,7 +1496,7 @@ * Increase default options.delayed_write_rate to 2MB/s. * Added a new parameter --path to ldb tool. --path accepts the name of either MANIFEST, SST or a WAL file. Either --db or --path can be used when calling ldb. -## 4.3.0 (12/8/2015) +## 4.3.0 (2015-12-08) ### New Features * CompactionFilter has new member function called IgnoreSnapshots which allows CompactionFilter to be called even if there are snapshots later than the key. * RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions. @@ -808,7 +1506,7 @@ ### Public API Changes * When options.db_write_buffer_size triggers, only the column family with the largest column family size will be flushed, not all the column families. -## 4.2.0 (11/9/2015) +## 4.2.0 (2015-11-09) ### New Features * Introduce CreateLoggerFromOptions(), this function create a Logger for provided DBOptions. * Add GetAggregatedIntProperty(), which returns the sum of the GetIntProperty of all the column families. @@ -821,7 +1519,7 @@ * Remove DefaultCompactionFilterFactory. -## 4.1.0 (10/8/2015) +## 4.1.0 (2015-10-08) ### New Features * Added single delete operation as a more efficient way to delete keys that have not been overwritten. * Added experimental AddFile() to DB interface that allow users to add files created by SstFileWriter into an empty Database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info. @@ -835,7 +1533,7 @@ * CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand. * We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13. 
-## 4.0.0 (9/9/2015) +## 4.0.0 (2015-09-09) ### New Features * Added support for transactions. See include/rocksdb/utilities/transaction.h for more info. * DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used. @@ -848,7 +1546,7 @@ * Added Equal() method to the Comparator interface that can optionally be overwritten in cases where equality comparisons can be done more efficiently than three-way comparisons. * Previous 'experimental' OptimisticTransaction class has been replaced by Transaction class. -## 3.13.0 (8/6/2015) +## 3.13.0 (2015-08-06) ### New Features * RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex * Add NewCompactOnDeletionCollectorFactory() in utilities/table_properties_collectors, which allows rocksdb to mark a SST file as need-compaction when it observes at least D deletion entries in any N consecutive entries in that SST file. Note that this feature depends on an experimental NeedCompact() API --- the result of this API will not persist after DB restart. @@ -863,7 +1561,7 @@ * Add statistics::getHistogramString() to print detailed distribution of a histogram metric. * Add DBOptions::skip_stats_update_on_db_open. When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction. -## 3.12.0 (7/2/2015) +## 3.12.0 (2015-07-02) ### New Features * Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info. * Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds) @@ -893,7 +1591,7 @@ * Add BackupEngineImpl.options_.max_background_operations to specify the maximum number of operations that may be performed in parallel. 
Add support for parallelized backup and restore. * Add DB::SyncWAL() that does a WAL sync without blocking writers. -## 3.11.0 (5/19/2015) +## 3.11.0 (2015-05-19) ### New Features * Added a new API Cache::SetCapacity(size_t capacity) to dynamically change the maximum configured capacity of the cache. If the new capacity is less than the existing cache usage, the implementation will try to lower the usage by evicting the necessary number of elements following a strict LRU policy. * Added an experimental API for handling flashcache devices (blacklists background threads from caching their reads) -- NewFlashcacheAwareEnv @@ -904,7 +1602,7 @@ * TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users. * DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to use this option to sync WAL in the background, please use wal_bytes_per_sync -## 3.10.0 (3/24/2015) +## 3.10.0 (2015-03-24) ### New Features * GetThreadStatus() is now able to report detailed thread status, including: - Thread Operation including flush and compaction. @@ -939,7 +1637,7 @@ * lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`. * Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly. -## 3.9.0 (12/8/2014) +## 3.9.0 (2014-12-08) ### New Features * Add rocksdb::GetThreadList(), which in the future will return the current status of all @@ -958,7 +1656,7 @@ ### Improvements * RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. -## 3.8.0 (11/14/2014) +## 3.8.0 (2014-11-14) ### Public API changes * BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on. 
@@ -972,14 +1670,14 @@ * CompactFiles and EventListener, although they are still in experimental state * Full ColumnFamily support in RocksJava. -## 3.7.0 (11/6/2014) +## 3.7.0 (2014-11-06) ### Public API changes * Introduce SetOptions() API to allow adjusting a subset of options dynamically online * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() * Remove WriteBatchWithIndex.Delete() overloads using SliceParts * When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it. -## 3.6.0 (10/7/2014) +## 3.6.0 (2014-10-07) ### Disk format changes * If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy @@ -992,7 +1690,7 @@ * Change target_file_size_base type to uint64_t from int. * Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. -## 3.5.0 (9/3/2014) +## 3.5.0 (2014-09-03) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: @@ -1003,7 +1701,7 @@ ### Public API changes * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. 
-## 3.4.0 (8/18/2014) +## 3.4.0 (2014-08-18) ### New Features * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. @@ -1019,7 +1717,7 @@ * Add DB::GetIntProperty(), which returns DB properties that are integer as uint64_t. * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. -## 3.3.0 (7/10/2014) +## 3.3.0 (2014-07-10) ### New Features * Added JSON API prototype. * HashLinklist reduces performance outlier caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory(). @@ -1030,7 +1728,7 @@ ### Public API changes * Removed NewTotalOrderPlainTableFactory because it is not used and implemented semantically incorrect. -## 3.2.0 (06/20/2014) +## 3.2.0 (2014-06-20) ### Public API changes * We removed seek compaction as a concept from RocksDB because: @@ -1048,7 +1746,7 @@ ### Performance Improvements * Tailing Iterator re-implemeted with ForwardIterator + Cascading Search Hint , see ~20% throughput improvement. -## 3.1.0 (05/21/2014) +## 3.1.0 (2014-05-21) ### Public API changes * Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories @@ -1057,7 +1755,7 @@ * Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open. * FIFO compaction style -## 3.0.0 (05/05/2014) +## 3.0.0 (2014-05-05) ### Public API changes * Added _LEVEL to all InfoLogLevel enums @@ -1069,7 +1767,7 @@ * Added an option to use different checksum functions in BlockBasedTableOptions * Added ApplyToAllCacheEntries() function to Cache -## 2.8.0 (04/04/2014) +## 2.8.0 (2014-04-04) * Removed arena.h from public header files. 
* By default, checksums are verified on every read from database @@ -1098,7 +1796,7 @@ * Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Geo-spatial support for locations and radial-search. -## 2.7.0 (01/28/2014) +## 2.7.0 (2014-01-28) ### Public API changes diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/INSTALL.md mariadb-10.11.13/storage/rocksdb/rocksdb/INSTALL.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/INSTALL.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/INSTALL.md 2025-05-19 16:14:27.000000000 +0000 @@ -43,6 +43,8 @@ command line flags processing. You can compile rocksdb library even if you don't have gflags installed. +* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html) + * If you wish to build the RocksJava static target, then cmake is required for building Snappy. 
## Supported platforms @@ -94,12 +96,21 @@ sudo yum install libasan * Install zstandard: + * With [EPEL](https://fedoraproject.org/wiki/EPEL): + + sudo yum install libzstd-devel + + * With CentOS 8: + + sudo dnf install libzstd-devel + + * From source: - wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz - mv v1.1.3.tar.gz zstd-1.1.3.tar.gz - tar zxvf zstd-1.1.3.tar.gz - cd zstd-1.1.3 - make && sudo make install + wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz + mv v1.1.3.tar.gz zstd-1.1.3.tar.gz + tar zxvf zstd-1.1.3.tar.gz + cd zstd-1.1.3 + make && sudo make install * **OS X**: * Install latest C++ compiler that supports C++ 11: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md mariadb-10.11.13/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/LANGUAGE-BINDINGS.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,6 +1,6 @@ This is the list of all known third-party language bindings for RocksDB. If something is missing, please open a pull request to add it. 
-* Java - https://github.com/facebook/rocksdb/tree/master/java +* Java - https://github.com/facebook/rocksdb/tree/main/java * Python * http://python-rocksdb.readthedocs.io/en/latest/ * http://pyrocksdb.readthedocs.org/en/latest/ (unmaintained) @@ -10,7 +10,9 @@ * Ruby - http://rubygems.org/gems/rocksdb-ruby * Haskell - https://hackage.haskell.org/package/rocksdb-haskell * PHP - https://github.com/Photonios/rocksdb-php -* C# - https://github.com/warrenfalk/rocksdb-sharp +* C# + * https://github.com/warrenfalk/rocksdb-sharp + * https://github.com/curiosity-ai/rocksdb-sharp * Rust * https://github.com/pingcap/rust-rocksdb (used in production fork of https://github.com/spacejam/rust-rocksdb) * https://github.com/spacejam/rust-rocksdb diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/Makefile 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -8,6 +8,11 @@ BASH_EXISTS := $(shell which bash) SHELL := $(shell which bash) +# Default to python3. Some distros like CentOS 8 do not have `python`. +ifeq ($(origin PYTHON), undefined) + PYTHON := $(shell which python3 || which python || echo python3) +endif +export PYTHON CLEAN_FILES = # deliberately empty, so we can append below. CFLAGS += ${EXTRA_CFLAGS} @@ -43,60 +48,43 @@ # Set the default DEBUG_LEVEL to 1 DEBUG_LEVEL?=1 -ifeq ($(MAKECMDGOALS),dbg) - DEBUG_LEVEL=2 -endif - -ifeq ($(MAKECMDGOALS),clean) - DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),release) - DEBUG_LEVEL=0 -endif +# LIB_MODE says whether or not to use/build "shared" or "static" libraries. 
+# Mode "static" means to link against static libraries (.a) +# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc) +# +# Set the default LIB_MODE to static +LIB_MODE?=static -ifeq ($(MAKECMDGOALS),shared_lib) - DEBUG_LEVEL=0 -endif +# OBJ_DIR is where the object files reside. Default to the current directory +OBJ_DIR?=. -ifeq ($(MAKECMDGOALS),install-shared) - DEBUG_LEVEL=0 -endif +# Check the MAKECMDGOALS to set the DEBUG_LEVEL and LIB_MODE appropriately -ifeq ($(MAKECMDGOALS),static_lib) +ifneq ($(filter clean release install, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 endif - -ifeq ($(MAKECMDGOALS),install-static) +ifneq ($(filter dbg, $(MAKECMDGOALS)),) + DEBUG_LEVEL=2 +else ifneq ($(filter shared_lib install-shared, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),install) + LIB_MODE=shared +else ifneq ($(filter static_lib install-static, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),rocksdbjavastatic) - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 - endif -endif - -ifeq ($(MAKECMDGOALS),rocksdbjavastaticrelease) - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 + LIB_MODE=static +else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),) + OBJ_DIR=jl + LIB_MODE=shared + ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) + OBJ_DIR=jls + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif + ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) + DEBUG_LEVEL=0 + endif endif endif -ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker) - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 - endif -endif - -ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) - DEBUG_LEVEL=0 -endif - $(info $$DEBUG_LEVEL is ${DEBUG_LEVEL}) # Lite build flag. @@ -116,11 +104,14 @@ # Figure out optimize level. ifneq ($(DEBUG_LEVEL), 2) ifeq ($(LITE), 0) - OPT += -O2 + OPTIMIZE_LEVEL ?= -O2 else - OPT += -Os + OPTIMIZE_LEVEL ?= -Os endif endif +# `OPTIMIZE_LEVEL` is empty when the user does not set it and `DEBUG_LEVEL=2`. 
+# In that case, the compiler default (`-O0` for gcc and clang) will be used. +OPT += $(OPTIMIZE_LEVEL) # compile with -O2 if debug level is not 2 ifneq ($(DEBUG_LEVEL), 2) @@ -143,10 +134,10 @@ HAVE_POWER8=1 endif -ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc+crypto -xc /dev/null 2>&1)) -CXXFLAGS += -march=armv8-a+crc+crypto -CFLAGS += -march=armv8-a+crc+crypto -ARMCRC_SOURCE=1 +# if we're compiling for shared libraries, add the shared flags +ifeq ($(LIB_MODE),shared) +CXXFLAGS += $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL +CFLAGS += $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL endif # if we're compiling for release, compile without debug code (-DNDEBUG) @@ -165,13 +156,35 @@ CXXFLAGS += -fno-rtti endif +ifdef ASSERT_STATUS_CHECKED +# For ASC, turn off constructor elision, preventing the case where a constructor returned +# by a method may pass the ASC check if the status is checked in the inner method. Forcing +# the copy constructor to be invoked disables the optimization and will cause the calling method +# to check the status in order to prevent an error from being raised. +PLATFORM_CXXFLAGS += -fno-elide-constructors +ifeq ($(filter -DROCKSDB_ASSERT_STATUS_CHECKED,$(OPT)),) + OPT += -DROCKSDB_ASSERT_STATUS_CHECKED +endif +endif + $(warning Warning: Compiling in debug mode. Don't use the resulting binary in production) endif +# `USE_LTO=1` enables link-time optimizations. Among other things, this enables +# more devirtualization opportunities and inlining across translation units. +# This can save significant overhead introduced by RocksDB's pluggable +# interfaces/internal abstractions, like in the iterator hierarchy. It works +# better when combined with profile-guided optimizations (not currently +# supported natively in Makefile). 
+ifeq ($(USE_LTO), 1) + CXXFLAGS += -flto + LDFLAGS += -flto -fuse-linker-plugin +endif + #----------------------------------------------- include src.mk -AM_DEFAULT_VERBOSITY = 0 +AM_DEFAULT_VERBOSITY ?= 0 AM_V_GEN = $(am__v_GEN_$(V)) am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) @@ -186,12 +199,16 @@ am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) am__v_CC_0 = @echo " CC " $@; am__v_CC_1 = -CCLD = $(CC) -LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ + AM_V_CCLD = $(am__v_CCLD_$(V)) am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +ifneq ($(SKIP_LINK), 1) am__v_CCLD_0 = @echo " CCLD " $@; am__v_CCLD_1 = +else +am__v_CCLD_0 = @echo " !CCLD " $@; true skip +am__v_CCLD_1 = true skip +endif AM_V_AR = $(am__v_AR_$(V)) am__v_AR_ = $(am__v_AR_$(AM_DEFAULT_VERBOSITY)) am__v_AR_0 = @echo " AR " $@; @@ -199,15 +216,66 @@ ifdef ROCKSDB_USE_LIBRADOS LIB_SOURCES += utilities/env_librados.cc +TEST_MAIN_SOURCES += utilities/env_librados_test.cc LDFLAGS += -lrados endif -AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -# detect what platform we're building on -dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; export PORTABLE="$(PORTABLE)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) +AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(LDFLAGS) -o $@ + +# Detect what platform we're building on. +# Export some common variables that might have been passed as Make variables +# instead of environment variables. 
+dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ + export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ + export LDFLAGS="$(EXTRA_LDFLAGS)"; \ + export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \ + export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \ + export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \ + export PORTABLE="$(PORTABLE)"; \ + export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \ + export USE_CLANG="$(USE_CLANG)"; \ + "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) # this file is generated by the previous line to set build flags and sources include make_config.mk -CLEAN_FILES += make_config.mk + +ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk) +include $(ROCKSDB_PLUGIN_MKS) +ROCKSDB_PLUGIN_SOURCES = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach source, $($(plugin)_SOURCES), plugin/$(plugin)/$(source))) +ROCKSDB_PLUGIN_HEADERS = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach header, $($(plugin)_HEADERS), plugin/$(plugin)/$(header))) +ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_PKGCONFIG_REQUIRES)) +PLATFORM_LDFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_LDFLAGS)) +CXXFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_CXXFLAGS)) + +ifneq ($(strip $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)),) +LDFLAGS := $(LDFLAGS) $(shell pkg-config --libs $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)) +ifneq ($(.SHELLSTATUS),0) +$(error pkg-config failed) +endif +CXXFLAGS := $(CXXFLAGS) $(shell pkg-config --cflags $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)) +ifneq ($(.SHELLSTATUS),0) +$(error pkg-config failed) +endif +endif + +CXXFLAGS += $(ARCHFLAG) + +ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc+crypto -xc /dev/null 2>&1)) +ifneq ($(PLATFORM),OS_MACOSX) +CXXFLAGS += -march=armv8-a+crc+crypto +CFLAGS += -march=armv8-a+crc+crypto +ARMCRC_SOURCE=1 +endif +endif + +export JAVAC_ARGS +CLEAN_FILES += make_config.mk rocksdb.pc + +ifeq ($(V), 1) +$(info $(shell uname -a)) 
+$(info $(shell $(CC) --version)) +$(info $(shell $(CXX) --version)) +endif missing_make_config_paths := $(shell \ grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | \ @@ -216,7 +284,7 @@ done | sort | uniq) $(foreach path, $(missing_make_config_paths), \ - $(warning Warning: $(path) dont exist)) + $(warning Warning: $(path) does not exist)) ifeq ($(PLATFORM), OS_AIX) # no debug info @@ -244,12 +312,37 @@ LUA_PATH = endif +ifeq ($(LIB_MODE),shared) +# So that binaries are executable from build location, in addition to install location +EXEC_LDFLAGS += -Wl,-rpath -Wl,'$$ORIGIN' +endif + +ifeq ($(PLATFORM), OS_MACOSX) +ifeq ($(ARCHFLAG), -arch arm64) +ifneq ($(MACHINE), arm64) +# If we're building on a non-arm64 machine but targeting arm64 Mac, we need to disable +# linking with jemalloc (as it won't be arm64-compatible) and remove some other options +# set during platform detection +DISABLE_JEMALLOC=1 +PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS)) +PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS)) +endif +endif +endif + # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. ifdef COMPILE_WITH_ASAN DISABLE_JEMALLOC=1 EXEC_LDFLAGS += -fsanitize=address PLATFORM_CCFLAGS += -fsanitize=address PLATFORM_CXXFLAGS += -fsanitize=address +ifeq ($(LIB_MODE),shared) +ifdef USE_CLANG +# Fix false ODR violation; see https://github.com/google/sanitizers/issues/1017 + EXEC_LDFLAGS += -mllvm -asan-use-private-alias=1 + PLATFORM_CXXFLAGS += -mllvm -asan-use-private-alias=1 +endif +endif endif # TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc. 
@@ -289,6 +382,12 @@ PLATFORM_CCFLAGS += -DROCKSDB_VALGRIND_RUN PLATFORM_CXXFLAGS += -DROCKSDB_VALGRIND_RUN endif +ifdef ROCKSDB_FULL_VALGRIND_RUN + # Some tests are slow when run under valgrind and are only run when + # explicitly requested via the ROCKSDB_FULL_VALGRIND_RUN compiler flag. + PLATFORM_CCFLAGS += -DROCKSDB_VALGRIND_RUN -DROCKSDB_FULL_VALGRIND_RUN + PLATFORM_CXXFLAGS += -DROCKSDB_VALGRIND_RUN -DROCKSDB_FULL_VALGRIND_RUN +endif ifndef DISABLE_JEMALLOC ifdef JEMALLOC @@ -308,9 +407,14 @@ USE_FOLLY_DISTRIBUTED_MUTEX=0 endif -export GTEST_THROW_ON_FAILURE=1 -export GTEST_HAS_EXCEPTIONS=1 -GTEST_DIR = ./third-party/gtest-1.8.1/fused-src +ifndef GTEST_THROW_ON_FAILURE + export GTEST_THROW_ON_FAILURE=1 +endif +ifndef GTEST_HAS_EXCEPTIONS + export GTEST_HAS_EXCEPTIONS=1 +endif + +GTEST_DIR = third-party/gtest-1.8.1/fused-src # AIX: pre-defined system headers are surrounded by an extern "C" block ifeq ($(PLATFORM), OS_AIX) PLATFORM_CCFLAGS += -I$(GTEST_DIR) @@ -336,6 +440,14 @@ PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) PLATFORM_CXXFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) endif +ifdef TEST_UINT128_COMPAT + PLATFORM_CCFLAGS += -DTEST_UINT128_COMPAT=1 + PLATFORM_CXXFLAGS += -DTEST_UINT128_COMPAT=1 +endif +ifdef ROCKSDB_MODIFY_NPHASH + PLATFORM_CCFLAGS += -DROCKSDB_MODIFY_NPHASH=1 + PLATFORM_CXXFLAGS += -DROCKSDB_MODIFY_NPHASH=1 +endif # This (the first rule) must depend on "all". default: all @@ -343,6 +455,15 @@ WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \ -Wunused-parameter +ifeq (,$(filter amd64, $(MACHINE))) + C_WARNING_FLAGS = -Wstrict-prototypes +endif + +ifdef USE_CLANG + # Used by some teams in Facebook + WARNING_FLAGS += -Wshift-sign-overflow +endif + ifeq ($(PLATFORM), OS_OPENBSD) WARNING_FLAGS += -Wno-unused-lambda-capture endif @@ -382,69 +503,113 @@ CXXFLAGS += -DNO_THREEWAY_CRC32C endif -CFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CCFLAGS) $(OPT) +CFLAGS += $(C_WARNING_FLAGS) $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers LDFLAGS += $(PLATFORM_LDFLAGS) -# If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but -# the file needs to already exist or else the build will fail -ifndef NO_UPDATE_BUILD_VERSION -date := $(shell date +%F) -ifdef FORCE_GIT_SHA - git_sha := $(FORCE_GIT_SHA) -else - git_sha := $(shell git rev-parse HEAD 2>/dev/null) -endif -gen_build_version = sed -e s/@@GIT_SHA@@/$(git_sha)/ -e s/@@GIT_DATE_TIME@@/$(date)/ util/build_version.cc.in - -# Record the version of the source that we are compiling. -# We keep a record of the git revision in this file. It is then built -# as a regular source file as part of the compilation process. -# One can run "strings executable_filename | grep _build_" to find -# the version of the source that we used to build the executable file. 
-FORCE: -util/build_version.cc: FORCE - $(AM_V_GEN)rm -f $@-t - $(AM_V_at)$(gen_build_version) > $@-t - $(AM_V_at)if test -f $@; then \ - cmp -s $@-t $@ && rm -f $@-t || mv -f $@-t $@; \ - else mv -f $@-t $@; fi -endif - -LIBOBJECTS = $(LIB_SOURCES:.cc=.o) +LIB_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES)) +LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ROCKSDB_PLUGIN_SOURCES)) ifeq ($(HAVE_POWER8),1) -LIB_CC_OBJECTS = $(LIB_SOURCES:.cc=.o) -LIBOBJECTS += $(LIB_SOURCES_C:.c=.o) -LIBOBJECTS += $(LIB_SOURCES_ASM:.S=.o) -else -LIB_CC_OBJECTS = $(LIB_SOURCES:.cc=.o) +LIB_OBJECTS += $(patsubst %.c, $(OBJ_DIR)/%.o, $(LIB_SOURCES_C)) +LIB_OBJECTS += $(patsubst %.S, $(OBJ_DIR)/%.o, $(LIB_SOURCES_ASM)) endif -LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) -MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - FOLLYOBJECTS = $(FOLLY_SOURCES:.cpp=.o) + LIB_OBJECTS += $(patsubst %.cpp, $(OBJ_DIR)/%.o, $(FOLLY_SOURCES)) +endif + +# range_tree is not compatible with non GNU libc on ppc64 +# see https://jira.percona.com/browse/PS-7559 +ifneq ($(PPC_LIBC_IS_GNU),0) + LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES)) endif -GTEST = $(GTEST_DIR)/gtest/gtest-all.o -TESTUTIL = ./test_util/testutil.o -TESTHARNESS = ./test_util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) +GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o +TESTUTIL = $(OBJ_DIR)/test_util/testutil.o +TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST) VALGRIND_ERROR = 2 VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full +# Not yet supported: --show-leak-kinds=definite,possible,reachable --errors-for-leak-kinds=definite,possible,reachable -BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) +TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST) +BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, 
$(BENCH_LIB_SOURCES)) +CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES)) +TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES)) +ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES)) +STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES)) + +# Exclude build_version.cc -- a generated source file -- from all sources. Not needed for dependencies +ALL_SOURCES = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc +ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) +ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES) +ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) -ANALYZETOOLOBJECTS = $(ANALYZER_LIB_SOURCES:.cc=.o) +TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES))) +TESTS += $(patsubst %.c, %, $(notdir $(TEST_MAIN_SOURCES_C))) -STRESSTOOLOBJECTS = $(STRESS_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + TESTS += folly_synchronization_distributed_mutex_test + ALL_SOURCES += third-party/folly/folly/synchronization/test/DistributedMutexTest.cc +endif + +# `make check-headers` to very that each header file includes its own +# dependencies +ifneq ($(filter check-headers, $(MAKECMDGOALS)),) +# TODO: add/support JNI headers + DEV_HEADER_DIRS := $(sort include/ hdfs/ $(dir $(ALL_SOURCES))) +# Some headers like in port/ are platform-specific + DEV_HEADERS := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | egrep -v 'port/|plugin/|lua/|range_tree/|tools/rdb/db_wrapper.h|include/rocksdb/utilities/env_librados.h') +else + DEV_HEADERS := +endif +HEADER_OK_FILES = $(patsubst %.h, %.h.ok, $(DEV_HEADERS)) + +AM_V_CCH = $(am__v_CCH_$(V)) +am__v_CCH_ = $(am__v_CCH_$(AM_DEFAULT_VERBOSITY)) +am__v_CCH_0 = @echo " CC.h " $<; +am__v_CCH_1 = + +%.h.ok: %.h # .h.ok not actually 
created, so re-checked on each invocation +# -DROCKSDB_NAMESPACE=42 ensures the namespace header is included + $(AM_V_CCH) echo '#include "$<"' | $(CXX) $(CXXFLAGS) -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null + +check-headers: $(HEADER_OK_FILES) + +# options_settable_test doesn't pass with UBSAN as we use hack in the test +ifdef COMPILE_WITH_UBSAN + TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g') +endif +ifdef ASSERT_STATUS_CHECKED + # TODO: finish fixing all tests to pass this check + TESTS_FAILING_ASC = \ + c_test \ + env_test \ + range_locking_test \ + testutil_test \ + + # Since we have very few ASC exclusions left, excluding them from + # the build is the most convenient way to exclude them from testing + TESTS := $(filter-out $(TESTS_FAILING_ASC),$(TESTS)) +endif + +ROCKSDBTESTS_SUBSET ?= $(TESTS) + +# env_test - suspicious use of test::TmpDir +# deletefile_test - serial because it generates giant temporary files in +# its various tests. Parallel can fill up your /dev/shm +NON_PARALLEL_TEST = \ + env_test \ + deletefile_test \ -EXPOBJECTS = $(LIBOBJECTS) $(TESTUTIL) +PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS)) -TESTS = \ +# Not necessarily well thought out or up-to-date, but matches old list +TESTS_PLATFORM_DEPENDENT := \ db_basic_test \ + db_blob_basic_test \ db_encryption_test \ db_test2 \ external_sst_file_basic_test \ @@ -459,220 +624,115 @@ env_basic_test \ env_test \ env_logger_test \ + io_posix_test \ hash_test \ random_test \ + ribbon_test \ thread_local_test \ + work_queue_test \ rate_limiter_test \ perf_context_test \ iostats_context_test \ db_wal_test \ - db_block_cache_test \ - db_test \ - db_blob_index_test \ - db_iter_test \ - db_iter_stress_test \ - db_log_iter_test \ - db_bloom_filter_test \ - db_compaction_filter_test \ - db_compaction_test \ - db_dynamic_level_test \ - db_flush_test \ - db_inplace_update_test \ - db_iterator_test \ - db_memtable_test \ - db_merge_operator_test \ - 
db_merge_operand_test \ - db_options_test \ - db_range_del_test \ - db_secondary_test \ - db_sst_test \ - db_tailing_iter_test \ - db_io_failure_test \ - db_properties_test \ - db_table_properties_test \ - db_statistics_test \ - db_write_test \ - error_handler_test \ - autovector_test \ - blob_db_test \ - cleanable_test \ - column_family_test \ - table_properties_collector_test \ - arena_test \ - block_test \ - data_block_hash_index_test \ - cache_test \ - corruption_test \ - slice_test \ - slice_transform_test \ - dbformat_test \ - fault_injection_test \ - filelock_test \ - filename_test \ - file_reader_writer_test \ - block_based_filter_block_test \ - full_filter_block_test \ - partitioned_filter_block_test \ - hash_table_test \ - histogram_test \ - log_test \ - manual_compaction_test \ - mock_env_test \ - memtable_list_test \ - merge_helper_test \ - memory_test \ - merge_test \ - merger_test \ - util_merge_operators_test \ - options_file_test \ - reduce_levels_test \ - plain_table_db_test \ - comparator_db_test \ - external_sst_file_test \ - import_column_family_test \ - prefix_test \ - skiplist_test \ - write_buffer_manager_test \ - stringappend_test \ - cassandra_format_test \ - cassandra_functional_test \ - cassandra_row_merge_test \ - cassandra_serialize_test \ - ttl_test \ - backupable_db_test \ - cache_simulator_test \ - sim_cache_test \ - version_edit_test \ - version_set_test \ - compaction_picker_test \ - version_builder_test \ - file_indexer_test \ - write_batch_test \ - write_batch_with_index_test \ - write_controller_test\ - deletefile_test \ - obsolete_files_test \ - table_test \ - delete_scheduler_test \ - options_test \ - options_settable_test \ - options_util_test \ - event_logger_test \ - timer_queue_test \ - cuckoo_table_builder_test \ - cuckoo_table_reader_test \ - cuckoo_table_db_test \ - flush_job_test \ - wal_manager_test \ - listener_test \ - compaction_iterator_test \ - compaction_job_test \ - thread_list_test \ - sst_dump_test \ - 
compact_files_test \ - optimistic_transaction_test \ - write_callback_test \ - heap_test \ - compact_on_deletion_collector_test \ - compaction_job_stats_test \ - option_change_migration_test \ - transaction_test \ - ldb_cmd_test \ - persistent_cache_test \ - statistics_test \ - stats_history_test \ - lru_cache_test \ - object_registry_test \ - repair_test \ - env_timed_test \ - write_prepared_transaction_test \ - write_unprepared_transaction_test \ - db_universal_compaction_test \ - trace_analyzer_test \ - repeatable_thread_test \ - range_tombstone_fragmenter_test \ - range_del_aggregator_test \ - sst_file_reader_test \ - db_secondary_test \ - block_cache_tracer_test \ - block_cache_trace_analyzer_test \ - defer_test \ - -ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - TESTS += folly_synchronization_distributed_mutex_test -endif -PARALLEL_TEST = \ - backupable_db_test \ - db_bloom_filter_test \ - db_compaction_filter_test \ - db_compaction_test \ - db_merge_operator_test \ - db_sst_test \ - db_test \ - db_universal_compaction_test \ - db_wal_test \ - external_sst_file_test \ - import_column_family_test \ - fault_injection_test \ - file_reader_writer_test \ - inlineskiplist_test \ - manual_compaction_test \ - persistent_cache_test \ - table_test \ - transaction_test \ - write_prepared_transaction_test \ - write_unprepared_transaction_test \ +# Sort ROCKSDBTESTS_SUBSET for filtering, except db_test is special (expensive) +# so is placed first (out-of-order) +ROCKSDBTESTS_SUBSET := $(filter db_test, $(ROCKSDBTESTS_SUBSET)) $(sort $(filter-out db_test, $(ROCKSDBTESTS_SUBSET))) -# options_settable_test doesn't pass with UBSAN as we use hack in the test -ifdef COMPILE_WITH_UBSAN - TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g') -endif -SUBSET := $(TESTS) ifdef ROCKSDBTESTS_START - SUBSET := $(shell echo $(SUBSET) | sed 's/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/') + ROCKSDBTESTS_SUBSET := $(shell echo $(ROCKSDBTESTS_SUBSET) | sed 
's/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/') endif ifdef ROCKSDBTESTS_END - SUBSET := $(shell echo $(SUBSET) | sed 's/$(ROCKSDBTESTS_END).*//') + ROCKSDBTESTS_SUBSET := $(shell echo $(ROCKSDBTESTS_SUBSET) | sed 's/$(ROCKSDBTESTS_END).*//') endif -TOOLS = \ - sst_dump \ - db_sanity_test \ - db_stress \ - write_stress \ - ldb \ - db_repl_stress \ - rocksdb_dump \ - rocksdb_undump \ - blob_dump \ - trace_analyzer \ - block_cache_trace_analyzer \ +ifeq ($(ROCKSDBTESTS_PLATFORM_DEPENDENT), only) + ROCKSDBTESTS_SUBSET := $(filter $(TESTS_PLATFORM_DEPENDENT), $(ROCKSDBTESTS_SUBSET)) +else ifeq ($(ROCKSDBTESTS_PLATFORM_DEPENDENT), exclude) + ROCKSDBTESTS_SUBSET := $(filter-out $(TESTS_PLATFORM_DEPENDENT), $(ROCKSDBTESTS_SUBSET)) +endif + +# bench_tool_analyer main is in bench_tool_analyzer_tool, or this would be simpler... +TOOLS = $(patsubst %.cc, %, $(notdir $(patsubst %_tool.cc, %.cc, $(TOOLS_MAIN_SOURCES)))) TEST_LIBS = \ librocksdb_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. 
-BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench filter_bench persistent_cache_bench range_del_aggregator_bench +BENCHMARKS = $(patsubst %.cc, %, $(notdir $(BENCH_MAIN_SOURCES))) + +MICROBENCHS = $(patsubst %.cc, %, $(notdir $(MICROBENCH_SOURCES))) # if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) + LIBNAME=librocksdb # we should only run rocksdb in production with DEBUG_LEVEL 0 -ifeq ($(DEBUG_LEVEL),0) - LIBNAME=librocksdb -else - LIBNAME=librocksdb_debug +ifneq ($(DEBUG_LEVEL),0) + LIBDEBUG=_debug endif endif -LIBRARY = ${LIBNAME}.a -TOOLS_LIBRARY = ${LIBNAME}_tools.a -STRESS_LIBRARY = ${LIBNAME}_stress.a +STATIC_LIBRARY = ${LIBNAME}$(LIBDEBUG).a +STATIC_TEST_LIBRARY = ${LIBNAME}_test$(LIBDEBUG).a +STATIC_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).a +STATIC_STRESS_LIBRARY = ${LIBNAME}_stress$(LIBDEBUG).a + +ALL_STATIC_LIBS = $(STATIC_LIBRARY) $(STATIC_TEST_LIBRARY) $(STATIC_TOOLS_LIBRARY) $(STATIC_STRESS_LIBRARY) + +SHARED_TEST_LIBRARY = ${LIBNAME}_test$(LIBDEBUG).$(PLATFORM_SHARED_EXT) +SHARED_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).$(PLATFORM_SHARED_EXT) +SHARED_STRESS_LIBRARY = ${LIBNAME}_stress$(LIBDEBUG).$(PLATFORM_SHARED_EXT) + +ALL_SHARED_LIBS = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) $(SHARED_TEST_LIBRARY) $(SHARED_TOOLS_LIBRARY) $(SHARED_STRESS_LIBRARY) + +ifeq ($(LIB_MODE),shared) +LIBRARY=$(SHARED1) +TEST_LIBRARY=$(SHARED_TEST_LIBRARY) +TOOLS_LIBRARY=$(SHARED_TOOLS_LIBRARY) +STRESS_LIBRARY=$(SHARED_STRESS_LIBRARY) +CLOUD_LIBRARY=$(SHARED_CLOUD_LIBRARY) +else +LIBRARY=$(STATIC_LIBRARY) +TEST_LIBRARY=$(STATIC_TEST_LIBRARY) +TOOLS_LIBRARY=$(STATIC_TOOLS_LIBRARY) +endif +STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY) ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +# If 
NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but +# the file needs to already exist or else the build will fail +ifndef NO_UPDATE_BUILD_VERSION + +# By default, use the current date-time as the date. If there are no changes, +# we will use the last commit date instead. +build_date := $(shell date "+%Y-%m-%d %T") + +ifdef FORCE_GIT_SHA + git_sha := $(FORCE_GIT_SHA) + git_mod := 1 + git_date := $(build_date) +else + git_sha := $(shell git rev-parse HEAD 2>/dev/null) + git_tag := $(shell git symbolic-ref -q --short HEAD 2> /dev/null || git describe --tags --exact-match 2>/dev/null) + git_mod := $(shell git diff-index HEAD --quiet 2>/dev/null; echo $$?) + git_date := $(shell git log -1 --date=format:"%Y-%m-%d %T" --format="%ad" 2>/dev/null) +endif +gen_build_version = sed -e s/@GIT_SHA@/$(git_sha)/ -e s:@GIT_TAG@:"$(git_tag)": -e s/@GIT_MOD@/"$(git_mod)"/ -e s/@BUILD_DATE@/"$(build_date)"/ -e s/@GIT_DATE@/"$(git_date)"/ util/build_version.cc.in + +# Record the version of the source that we are compiling. +# We keep a record of the git revision in this file. It is then built +# as a regular source file as part of the compilation process. +# One can run "strings executable_filename | grep _build_" to find +# the version of the source that we used to build the executable file. 
+util/build_version.cc: $(filter-out $(OBJ_DIR)/util/build_version.o, $(LIB_OBJECTS)) util/build_version.cc.in + $(AM_V_GEN)rm -f $@-t + $(AM_V_at)$(gen_build_version) > $@ +endif +CLEAN_FILES += util/build_version.cc + default: all #----------------------------------------------- @@ -681,7 +741,7 @@ ifneq ($(PLATFORM_SHARED_EXT),) ifneq ($(PLATFORM_SHARED_VERSIONED),true) -SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) +SHARED1 = ${LIBNAME}$(LIBDEBUG).$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1) SHARED3 = $(SHARED1) SHARED4 = $(SHARED1) @@ -692,7 +752,7 @@ SHARED_PATCH = $(ROCKSDB_PATCH) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) ifeq ($(PLATFORM), OS_MACOSX) -SHARED_OSX = $(LIBNAME).$(SHARED_MAJOR) +SHARED_OSX = $(LIBNAME)$(LIBDEBUG).$(SHARED_MAJOR) SHARED2 = $(SHARED_OSX).$(PLATFORM_SHARED_EXT) SHARED3 = $(SHARED_OSX).$(SHARED_MINOR).$(PLATFORM_SHARED_EXT) SHARED4 = $(SHARED_OSX).$(SHARED_MINOR).$(SHARED_PATCH).$(PLATFORM_SHARED_EXT) @@ -700,61 +760,35 @@ SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH) -endif +endif # MACOSX SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) -$(SHARED1): $(SHARED4) +$(SHARED1): $(SHARED4) $(SHARED2) ln -fs $(SHARED4) $(SHARED1) -$(SHARED2): $(SHARED4) +$(SHARED2): $(SHARED4) $(SHARED3) ln -fs $(SHARED4) $(SHARED2) $(SHARED3): $(SHARED4) ln -fs $(SHARED4) $(SHARED3) -endif -ifeq ($(HAVE_POWER8),1) -SHARED_C_OBJECTS = $(LIB_SOURCES_C:.c=.o) -SHARED_ASM_OBJECTS = $(LIB_SOURCES_ASM:.S=.o) -SHARED_C_LIBOBJECTS = $(patsubst %.o,shared-objects/%.o,$(SHARED_C_OBJECTS)) -SHARED_ASM_LIBOBJECTS = $(patsubst %.o,shared-objects/%.o,$(SHARED_ASM_OBJECTS)) -shared_libobjects = $(patsubst %,shared-objects/%,$(LIB_CC_OBJECTS)) -else -shared_libobjects = $(patsubst %,shared-objects/%,$(LIBOBJECTS)) -endif - -CLEAN_FILES += shared-objects -shared_all_libobjects = $(shared_libobjects) - -ifeq ($(HAVE_POWER8),1) -shared-ppc-objects = 
$(SHARED_C_LIBOBJECTS) $(SHARED_ASM_LIBOBJECTS) - -shared-objects/util/crc32c_ppc.o: util/crc32c_ppc.c - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ - -shared-objects/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -endif -$(shared_libobjects): shared-objects/%.o: %.cc - $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ - -ifeq ($(HAVE_POWER8),1) -shared_all_libobjects = $(shared_libobjects) $(shared-ppc-objects) -endif -$(SHARED4): $(shared_all_libobjects) - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(shared_all_libobjects) $(LDFLAGS) -o $@ +endif # PLATFORM_SHARED_VERSIONED +$(SHARED4): $(LIB_OBJECTS) + $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(LIB_OBJECTS) $(LDFLAGS) -o $@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ release tags tags0 valgrind_check whitebox_crash_test format static_lib shared_lib all \ - dbg rocksdbjavastatic rocksdbjava install install-static install-shared uninstall \ - analyze tools tools_lib \ + dbg rocksdbjavastatic rocksdbjava gen-pc install install-static install-shared uninstall \ + analyze tools tools_lib check-headers \ blackbox_crash_test_with_atomic_flush whitebox_crash_test_with_atomic_flush \ - blackbox_crash_test_with_txn whitebox_crash_test_with_txn + blackbox_crash_test_with_txn whitebox_crash_test_with_txn \ + blackbox_crash_test_with_best_efforts_recovery \ + blackbox_crash_test_with_ts whitebox_crash_test_with_ts all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS) -all_but_some_tests: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(SUBSET) +all_but_some_tests: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(ROCKSDBTESTS_SUBSET) -static_lib: $(LIBRARY) +static_lib: $(STATIC_LIBRARY) shared_lib: $(SHARED) @@ -766,19 +800,22 @@ test_libs: $(TEST_LIBS) +benchmarks: $(BENCHMARKS) + +microbench: $(MICROBENCHS) + for t in 
$(MICROBENCHS); do echo "===== Running benchmark $$t (`date`)"; ./$$t || exit 1; done; + dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS) -# creates static library and programs -release: - $(MAKE) clean - DEBUG_LEVEL=0 $(MAKE) static_lib tools db_bench +# creates library and programs +release: clean + LIB_MODE=$(LIB_MODE) DEBUG_LEVEL=0 $(MAKE) $(LIBRARY) tools db_bench -coverage: - $(MAKE) clean +coverage: clean COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) J=1 all check cd coverage && ./coverage_test.sh # Delete intermediate files - $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm -f {} \; ifneq (,$(filter check parallel_check,$(MAKECMDGOALS)),) # Use /dev/shm if it has the sticky bit set (otherwise, /tmp), @@ -824,14 +861,11 @@ $(parallel_tests): $(PARALLEL_TEST) $(AM_V_at)TEST_BINARY=$(patsubst parallel_%,%,$@); \ TEST_NAMES=` \ - ./$$TEST_BINARY --gtest_list_tests \ - | perl -n \ - -e 's/ *\#.*//;' \ - -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};' \ - -e 'print qq! 
$$p$$2!'`; \ + (./$$TEST_BINARY --gtest_list_tests || echo " $${TEST_BINARY}__list_tests_failure") \ + | awk '/^[^ ]/ { prefix = $$1 } /^[ ]/ { print prefix $$1 }'`; \ + echo " Generating parallel test scripts for $$TEST_BINARY"; \ for TEST_NAME in $$TEST_NAMES; do \ TEST_SCRIPT=t/run-$$TEST_BINARY-$${TEST_NAME//\//-}; \ - echo " GEN " $$TEST_SCRIPT; \ printf '%s\n' \ '#!/bin/sh' \ "d=\$(TMPD)$$TEST_SCRIPT" \ @@ -843,7 +877,7 @@ gen_parallel_tests: $(AM_V_at)mkdir -p t - $(AM_V_at)rm -f t/run-* + $(AM_V_at)$(FIND) t -type f -name 'run-*' -exec rm -f {} \; $(MAKE) $(parallel_tests) # Reorder input lines (which are one per test) so that the @@ -863,7 +897,7 @@ # 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest # slow_test_regexp = \ - ^.*SnapshotConcurrentAccessTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ + ^.*SnapshotConcurrentAccessTest.*$$|^.*SeqAdvanceConcurrentTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ prioritize_long_running_tests = \ perl -pe 's,($(slow_test_regexp)),100 $$1,' \ | sort -k1,1gr \ @@ -878,6 +912,19 @@ # Use this regexp to select the subset of tests whose names match. tests-regexp = . +EXCLUDE_TESTS_REGEX ?= "^$$" + +ifeq ($(PRINT_PARALLEL_OUTPUTS), 1) + parallel_redir = +else ifeq ($(QUIET_PARALLEL_TESTS), 1) + parallel_redir = >& t/$(test_log_prefix)log-{/} +else +# Default: print failure output only, as it happens +# Note: gnu_parallel --eta is now always used, but has been modified to provide +# only infrequent updates when not connected to a terminal. (CircleCI will +# kill a job if no output for 10min.) + parallel_redir = >& t/$(test_log_prefix)log-{/} || bash -c "cat t/$(test_log_prefix)log-{/}; exit $$?" 
+endif .PHONY: check_0 check_0: @@ -885,34 +932,38 @@ printf '%s\n' '' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ - test -t 1 && eta=--eta || eta=; \ { \ printf './%s\n' $(filter-out $(PARALLEL_TEST),$(TESTS)); \ find t -name 'run-*' -print; \ } \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ - | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG $$eta --gnu '{} >& t/log-{/}' + | grep -E -v '$(EXCLUDE_TESTS_REGEX)' \ + | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu '{} $(parallel_redir)' ; \ + parallel_retcode=$$? ; \ + awk '{ if ($$7 != 0 || $$8 != 0) { if ($$7 == "Exitval") { h = $$0; } else { if (!f) print h; print; f = 1 } } } END { if(f) exit 1; }' < LOG ; \ + awk_retcode=$$?; \ + if [ $$parallel_retcode -ne 0 ] || [ $$awk_retcode -ne 0 ] ; then exit 1 ; fi -valgrind-blacklist-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest +valgrind-exclude-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest .PHONY: valgrind_check_0 +valgrind_check_0: test_log_prefix := valgrind_ valgrind_check_0: $(AM_V_GEN)export TEST_TMPDIR=$(TMPD); 
\ printf '%s\n' '' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ - test -t 1 && eta=--eta || eta=; \ { \ printf './%s\n' $(filter-out $(PARALLEL_TEST) %skiplist_test options_settable_test, $(TESTS)); \ find t -name 'run-*' -print; \ } \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ - | grep -E -v '$(valgrind-blacklist-regexp)' \ - | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG $$eta --gnu \ - '(if [[ "{}" == "./"* ]] ; then $(DRIVER) {}; else {}; fi) ' \ - '>& t/valgrind_log-{/}' + | grep -E -v '$(valgrind-exclude-regexp)' \ + | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG --eta --gnu \ + '(if [[ "{}" == "./"* ]] ; then $(DRIVER) {}; else {}; fi) \ + $(parallel_redir)' \ CLEAN_FILES += t LOG $(TMPD) @@ -926,6 +977,9 @@ watch-log: $(WATCH) --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)' +dump-log: + bash -c '$(quoted_perl_command)' < LOG + # If J != 1 and GNU parallel is installed, run the tests in parallel, # via the check_0 rule above. Otherwise, run them sequentially. 
check: all @@ -937,102 +991,160 @@ $(MAKE) T="$$t" TMPD=$(TMPD) check_0; \ else \ for t in $(TESTS); do \ - echo "===== Running $$t"; ./$$t || exit 1; done; \ + echo "===== Running $$t (`date`)"; ./$$t || exit 1; done; \ fi rm -rf $(TMPD) ifneq ($(PLATFORM), OS_AIX) - python tools/check_all_python.py + $(PYTHON) tools/check_all_python.py ifeq ($(filter -DROCKSDB_LITE,$(OPT)),) - python tools/ldb_test.py +ifndef ASSERT_STATUS_CHECKED # not yet working with these tests + $(PYTHON) tools/ldb_test.py sh tools/rocksdb_dump_test.sh endif endif +endif +ifndef SKIP_FORMAT_BUCK_CHECKS + $(MAKE) check-format + $(MAKE) check-buck-targets + $(MAKE) check-sources +endif # TODO add ldb_tests -check_some: $(SUBSET) - for t in $(SUBSET); do echo "===== Running $$t"; ./$$t || exit 1; done +check_some: $(ROCKSDBTESTS_SUBSET) + for t in $(ROCKSDBTESTS_SUBSET); do echo "===== Running $$t (`date`)"; ./$$t || exit 1; done .PHONY: ldb_tests ldb_tests: ldb - python tools/ldb_test.py - -crash_test: whitebox_crash_test blackbox_crash_test + $(PYTHON) tools/ldb_test.py -crash_test_with_atomic_flush: whitebox_crash_test_with_atomic_flush blackbox_crash_test_with_atomic_flush - -crash_test_with_txn: whitebox_crash_test_with_txn blackbox_crash_test_with_txn +crash_test: +# Do not parallelize + $(MAKE) whitebox_crash_test + $(MAKE) blackbox_crash_test + +crash_test_with_atomic_flush: +# Do not parallelize + $(MAKE) whitebox_crash_test_with_atomic_flush + $(MAKE) blackbox_crash_test_with_atomic_flush + +crash_test_with_txn: +# Do not parallelize + $(MAKE) whitebox_crash_test_with_txn + $(MAKE) blackbox_crash_test_with_txn + +crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery + +crash_test_with_ts: +# Do not parallelize + $(MAKE) whitebox_crash_test_with_ts + $(MAKE) blackbox_crash_test_with_ts blackbox_crash_test: db_stress - python -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) - python -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) 
+ $(PYTHON) -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) + $(PYTHON) -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS) + $(PYTHON) -u tools/db_crashtest.py --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_txn: db_stress - python -u tools/db_crashtest.py --txn blackbox $(CRASH_TEST_EXT_ARGS) + $(PYTHON) -u tools/db_crashtest.py --txn blackbox $(CRASH_TEST_EXT_ARGS) + +blackbox_crash_test_with_best_efforts_recovery: db_stress + $(PYTHON) -u tools/db_crashtest.py --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS) + +blackbox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts blackbox $(CRASH_TEST_EXT_ARGS) ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 endif whitebox_crash_test: db_stress - python -u tools/db_crashtest.py --simple whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py --simple whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) - python -u tools/db_crashtest.py whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) whitebox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --cf_consistency whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py --cf_consistency whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) whitebox_crash_test_with_txn: db_stress - python -u tools/db_crashtest.py --txn whitebox --random_kill_odd \ + $(PYTHON) -u tools/db_crashtest.py --txn whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) -asan_check: - $(MAKE) clean +whitebox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + +asan_check: clean 
COMPILE_WITH_ASAN=1 $(MAKE) check -j32 $(MAKE) clean -asan_crash_test: - $(MAKE) clean +asan_crash_test: clean COMPILE_WITH_ASAN=1 $(MAKE) crash_test $(MAKE) clean -asan_crash_test_with_atomic_flush: +whitebox_asan_crash_test: clean + COMPILE_WITH_ASAN=1 $(MAKE) whitebox_crash_test $(MAKE) clean - COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_atomic_flush + +blackbox_asan_crash_test: clean + COMPILE_WITH_ASAN=1 $(MAKE) blackbox_crash_test $(MAKE) clean -asan_crash_test_with_txn: +asan_crash_test_with_atomic_flush: clean + COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_atomic_flush $(MAKE) clean + +asan_crash_test_with_txn: clean COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_txn $(MAKE) clean -ubsan_check: +asan_crash_test_with_best_efforts_recovery: clean + COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_best_efforts_recovery $(MAKE) clean + +ubsan_check: clean COMPILE_WITH_UBSAN=1 $(MAKE) check -j32 $(MAKE) clean -ubsan_crash_test: - $(MAKE) clean +ubsan_crash_test: clean COMPILE_WITH_UBSAN=1 $(MAKE) crash_test $(MAKE) clean -ubsan_crash_test_with_atomic_flush: +whitebox_ubsan_crash_test: clean + COMPILE_WITH_UBSAN=1 $(MAKE) whitebox_crash_test $(MAKE) clean - COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_atomic_flush + +blackbox_ubsan_crash_test: clean + COMPILE_WITH_UBSAN=1 $(MAKE) blackbox_crash_test $(MAKE) clean -ubsan_crash_test_with_txn: +ubsan_crash_test_with_atomic_flush: clean + COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_atomic_flush $(MAKE) clean + +ubsan_crash_test_with_txn: clean COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_txn $(MAKE) clean +ubsan_crash_test_with_best_efforts_recovery: clean + COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_best_efforts_recovery + $(MAKE) clean + +full_valgrind_test: + ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check + +full_valgrind_test_some: + ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some + valgrind_test: ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check 
+valgrind_test_some: + ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some + valgrind_check: $(TESTS) $(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests $(AM_V_GEN)if test "$(J)" != 1 \ @@ -1051,12 +1163,20 @@ done; \ fi +valgrind_check_some: $(ROCKSDBTESTS_SUBSET) + for t in $(ROCKSDBTESTS_SUBSET); do \ + $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ + ret_code=$$?; \ + if [ $$ret_code -ne 0 ]; then \ + exit $$ret_code; \ + fi; \ + done ifneq ($(PAR_TEST),) parloop: ret_bad=0; \ for t in $(PAR_TEST); do \ - echo "===== Running $$t in parallel $(NUM_PAR)";\ + echo "===== Running $$t in parallel $(NUM_PAR) (`date`)";\ if [ $(db_test) -eq 1 ]; then \ seq $(J) | v="$$t" build_tools/gnu_parallel --gnu --plain 's=$(TMPD)/rdb-{}; export TEST_TMPDIR=$$s;' \ 'timeout 2m ./db_test --gtest_filter=$$v >> $$s/log-{} 2>1'; \ @@ -1108,22 +1228,22 @@ $(MAKE) dbg CLEAN_FILES += unity.cc -unity.cc: Makefile +unity.cc: Makefile util/build_version.cc.in rm -f $@ $@-t + $(AM_V_at)$(gen_build_version) > util/build_version.cc for source_file in $(LIB_SOURCES); do \ echo "#include \"$$source_file\"" >> $@-t; \ done chmod a=r $@-t mv $@-t $@ -unity.a: unity.o +unity.a: $(OBJ_DIR)/unity.o $(AM_V_AR)rm -f $@ - $(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o + $(AM_V_at)$(AR) $(ARFLAGS) $@ $(OBJ_DIR)/unity.o -TOOLLIBOBJECTS = $(TOOL_LIB_SOURCES:.cc=.o) # try compiling db_test with unity -unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) $(TOOLLIBOBJECTS) unity.a +unity_test: $(OBJ_DIR)/db/db_basic_test.o $(OBJ_DIR)/db/db_test_util.o $(TEST_OBJECTS) $(TOOL_OBJECTS) unity.a $(AM_LINK) ./unity_test @@ -1135,12 +1255,15 @@ clean-not-downloaded: clean-ext-libraries-bin clean-rocks clean-not-downloaded-rocksjava clean-rocks: - rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(LIBRARY) $(SHARED) + echo shared=$(ALL_SHARED_LIBS) + echo static=$(ALL_STATIC_LIBS) + rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(ALL_STATIC_LIBS) $(ALL_SHARED_LIBS) 
$(MICROBENCHS) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report $(FIND) . -name "*.[oda]" -exec rm -f {} \; - $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm -f {} \; clean-rocksjava: + rm -rf jl jls cd java && $(MAKE) clean clean-not-downloaded-rocksjava: @@ -1167,603 +1290,769 @@ format: build_tools/format-diff.sh +check-format: + build_tools/format-diff.sh -c + +check-buck-targets: + buckifier/check_buck_targets.sh + +check-sources: + build_tools/check-sources.sh + package: bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- -$(LIBRARY): $(LIBOBJECTS) - $(AM_V_AR)rm -f $@ - $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) +$(STATIC_LIBRARY): $(LIB_OBJECTS) + $(AM_V_AR)rm -f $@ $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) + $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIB_OBJECTS) -$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) - $(AM_V_AR)rm -f $@ +$(STATIC_TEST_LIBRARY): $(TEST_OBJECTS) + $(AM_V_AR)rm -f $@ $(SHARED_TEST_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -$(STRESS_LIBRARY): $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) $(STRESS_LIB_SOURCES:.cc=.o) - $(AM_V_AR)rm -f $@ +$(STATIC_TOOLS_LIBRARY): $(TOOL_OBJECTS) + $(AM_V_AR)rm -f $@ $(SHARED_TOOLS_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -librocksdb_env_basic_test.a: env/env_basic_test.o $(LIBOBJECTS) $(TESTHARNESS) +$(STATIC_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) + $(AM_V_AR)rm -f $@ $(SHARED_STRESS_LIBRARY) + $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ + +$(SHARED_TEST_LIBRARY): $(TEST_OBJECTS) $(SHARED1) + $(AM_V_AR)rm -f $@ $(STATIC_TEST_LIBRARY) + $(AM_SHARE) + +$(SHARED_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(SHARED1) + 
$(AM_V_AR)rm -f $@ $(STATIC_TOOLS_LIBRARY) + $(AM_SHARE) + +$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(SHARED_TOOLS_LIBRARY) $(SHARED1) + $(AM_V_AR)rm -f $@ $(STATIC_STRESS_LIBRARY) + $(AM_SHARE) + +librocksdb_env_basic_test.a: $(OBJ_DIR)/env/env_basic_test.o $(LIB_OBJECTS) $(TESTHARNESS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) +db_bench: $(OBJ_DIR)/tools/db_bench.o $(BENCH_OBJECTS) $(TESTUTIL) $(LIBRARY) $(AM_LINK) -trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) +trace_analyzer: $(OBJ_DIR)/tools/trace_analyzer.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) +block_cache_trace_analyzer: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) -folly_synchronization_distributed_mutex_test: $(LIBOBJECTS) $(TESTHARNESS) $(FOLLYOBJECTS) third-party/folly/folly/synchronization/test/DistributedMutexTest.o +folly_synchronization_distributed_mutex_test: $(OBJ_DIR)/third-party/folly/folly/synchronization/test/DistributedMutexTest.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) endif -cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) +cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY) + $(AM_LINK) + +persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY) + $(AM_LINK) + +memtablerep_bench: $(OBJ_DIR)/memtable/memtablerep_bench.o $(LIBRARY) + $(AM_LINK) + +filter_bench: $(OBJ_DIR)/util/filter_bench.o $(LIBRARY) $(AM_LINK) -persistent_cache_bench: utilities/persistent_cache/persistent_cache_bench.o $(LIBOBJECTS) $(TESTUTIL) +db_stress: $(OBJ_DIR)/db_stress_tool/db_stress.o $(STRESS_LIBRARY) $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) 
-memtablerep_bench: memtable/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) +write_stress: $(OBJ_DIR)/tools/write_stress.o $(LIBRARY) $(AM_LINK) -filter_bench: util/filter_bench.o $(LIBOBJECTS) $(TESTUTIL) +db_sanity_test: $(OBJ_DIR)/tools/db_sanity_test.o $(LIBRARY) $(AM_LINK) -db_stress: db_stress_tool/db_stress.o $(STRESSTOOLOBJECTS) +db_repl_stress: $(OBJ_DIR)/tools/db_repl_stress.o $(LIBRARY) $(AM_LINK) -write_stress: tools/write_stress.o $(LIBOBJECTS) $(TESTUTIL) +arena_test: $(OBJ_DIR)/memory/arena_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) +memory_allocator_test: memory/memory_allocator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) +autovector_test: $(OBJ_DIR)/util/autovector_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -arena_test: memory/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) +column_family_test: $(OBJ_DIR)/db/column_family_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) +table_properties_collector_test: $(OBJ_DIR)/db/table_properties_collector_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -column_family_test: db/column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +bloom_test: $(OBJ_DIR)/util/bloom_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) +dynamic_bloom_test: $(OBJ_DIR)/util/dynamic_bloom_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) +c_test: $(OBJ_DIR)/db/c_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) +cache_test: $(OBJ_DIR)/cache/cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) +coding_test: $(OBJ_DIR)/util/coding_test.o $(TEST_LIBRARY) $(LIBRARY) 
$(AM_LINK) -cache_test: cache/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) +hash_test: $(OBJ_DIR)/util/hash_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) +random_test: $(OBJ_DIR)/util/random_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -hash_test: util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS) +ribbon_test: $(OBJ_DIR)/util/ribbon_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -random_test: util/random_test.o $(LIBOBJECTS) $(TESTHARNESS) +option_change_migration_test: $(OBJ_DIR)/utilities/option_change_migration/option_change_migration_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -option_change_migration_test: utilities/option_change_migration/option_change_migration_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +stringappend_test: $(OBJ_DIR)/utilities/merge_operators/string_append/stringappend_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_format_test: utilities/cassandra/cassandra_format_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_functional_test: utilities/cassandra/cassandra_functional_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_row_merge_test: utilities/cassandra/cassandra_row_merge_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_serialize_test: 
$(OBJ_DIR)/utilities/cassandra/cassandra_serialize_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cassandra_serialize_test: utilities/cassandra/cassandra_serialize_test.o $(LIBOBJECTS) $(TESTHARNESS) +hash_table_test: $(OBJ_DIR)/utilities/persistent_cache/hash_table_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -hash_table_test: utilities/persistent_cache/hash_table_test.o $(LIBOBJECTS) $(TESTHARNESS) +histogram_test: $(OBJ_DIR)/monitoring/histogram_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -histogram_test: monitoring/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) +thread_local_test: $(OBJ_DIR)/util/thread_local_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) +work_queue_test: $(OBJ_DIR)/util/work_queue_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -corruption_test: db/corruption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +corruption_test: $(OBJ_DIR)/db/corruption_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) +crc32c_test: $(OBJ_DIR)/util/crc32c_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -slice_test: util/slice_test.o $(LIBOBJECTS) $(TESTHARNESS) +slice_test: $(OBJ_DIR)/util/slice_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) +slice_transform_test: $(OBJ_DIR)/util/slice_transform_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_basic_test: db/db_basic_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_basic_test: $(OBJ_DIR)/db/db_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_encryption_test: db/db_encryption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_blob_basic_test: $(OBJ_DIR)/db/blob/db_blob_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_blob_compaction_test: $(OBJ_DIR)/db/blob/db_blob_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) 
$(AM_LINK) -db_test2: db/db_test2.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_blob_index_test: db/db_blob_index_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_with_timestamp_compaction_test: db/db_with_timestamp_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_block_cache_test: db/db_block_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_encryption_test: $(OBJ_DIR)/db/db_encryption_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_bloom_filter_test: db/db_bloom_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_log_iter_test: db/db_log_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_compaction_filter_test: db/db_compaction_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_logical_block_size_cache_test: $(OBJ_DIR)/db/db_logical_block_size_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_compaction_test: db/db_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_blob_index_test: $(OBJ_DIR)/db/blob/db_blob_index_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_dynamic_level_test: db/db_dynamic_level_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_block_cache_test: $(OBJ_DIR)/db/db_block_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_flush_test: db/db_flush_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_bloom_filter_test: $(OBJ_DIR)/db/db_bloom_filter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_inplace_update_test: db/db_inplace_update_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_log_iter_test: $(OBJ_DIR)/db/db_log_iter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_iterator_test: db/db_iterator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) 
+db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_memtable_test: db/db_memtable_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_merge_operator_test: db/db_merge_operator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_merge_operand_test: db/db_merge_operand_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_flush_test: $(OBJ_DIR)/db/db_flush_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_inplace_update_test: $(OBJ_DIR)/db/db_inplace_update_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_range_del_test: db/db_range_del_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_iterator_test: $(OBJ_DIR)/db/db_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_sst_test: db/db_sst_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_kv_checksum_test: $(OBJ_DIR)/db/db_kv_checksum_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_statistics_test: db/db_statistics_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_memtable_test: $(OBJ_DIR)/db/db_memtable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_write_test: db/db_write_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_merge_operator_test: $(OBJ_DIR)/db/db_merge_operator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -error_handler_test: db/error_handler_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_merge_operand_test: $(OBJ_DIR)/db/db_merge_operand_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -external_sst_file_basic_test: db/external_sst_file_basic_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_options_test: $(OBJ_DIR)/db/db_options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -external_sst_file_test: 
db/external_sst_file_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_range_del_test: $(OBJ_DIR)/db/db_range_del_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -import_column_family_test: db/import_column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_sst_test: $(OBJ_DIR)/db/db_sst_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_statistics_test: $(OBJ_DIR)/db/db_statistics_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_iter_test: db/db_iter_test.o $(LIBOBJECTS) $(TESTHARNESS) +db_write_test: $(OBJ_DIR)/db/db_write_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_iter_stress_test: db/db_iter_stress_test.o $(LIBOBJECTS) $(TESTHARNESS) +error_handler_fs_test: $(OBJ_DIR)/db/error_handler_fs_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +external_sst_file_basic_test: $(OBJ_DIR)/db/external_sst_file_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_wal_test: db/db_wal_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +external_sst_file_test: $(OBJ_DIR)/db/external_sst_file_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_io_failure_test: db/db_io_failure_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +import_column_family_test: $(OBJ_DIR)/db/import_column_family_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_properties_test: db/db_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_tailing_iter_test: $(OBJ_DIR)/db/db_tailing_iter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_table_properties_test: db/db_table_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_iter_test: $(OBJ_DIR)/db/db_iter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) +db_iter_stress_test: $(OBJ_DIR)/db/db_iter_stress_test.o $(TEST_LIBRARY) 
$(LIBRARY) + $(AM_LINK) + +db_universal_compaction_test: $(OBJ_DIR)/db/db_universal_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_wal_test: $(OBJ_DIR)/db/db_wal_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_io_failure_test: $(OBJ_DIR)/db/db_io_failure_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_properties_test: $(OBJ_DIR)/db/db_properties_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_table_properties_test: $(OBJ_DIR)/db/db_table_properties_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +log_write_bench: $(OBJ_DIR)/util/log_write_bench.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) $(PROFILING_FLAGS) -plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +plain_table_db_test: $(OBJ_DIR)/db/plain_table_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +comparator_db_test: $(OBJ_DIR)/db/comparator_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) +table_reader_bench: $(OBJ_DIR)/table/table_reader_bench.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) $(PROFILING_FLAGS) -perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) +perf_context_test: $(OBJ_DIR)/db/perf_context_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) -prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) +prefix_test: $(OBJ_DIR)/db/prefix_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) -backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +backupable_db_test: $(OBJ_DIR)/utilities/backupable/backupable_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS) +checkpoint_test: $(OBJ_DIR)/utilities/checkpoint/checkpoint_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) 
-cache_simulator_test: utilities/simulator_cache/cache_simulator_test.o $(LIBOBJECTS) $(TESTHARNESS) +cache_simulator_test: $(OBJ_DIR)/utilities/simulator_cache/cache_simulator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +sim_cache_test: $(OBJ_DIR)/utilities/simulator_cache/sim_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_mirror_test: utilities/env_mirror_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_mirror_test: $(OBJ_DIR)/utilities/env_mirror_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_timed_test: utilities/env_timed_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_timed_test: $(OBJ_DIR)/utilities/env_timed_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) ifdef ROCKSDB_USE_LIBRADOS -env_librados_test: utilities/env_librados_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +env_librados_test: $(OBJ_DIR)/utilities/env_librados_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) endif -object_registry_test: utilities/object_registry_test.o $(LIBOBJECTS) $(TESTHARNESS) +object_registry_test: $(OBJ_DIR)/utilities/object_registry_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +ttl_test: $(OBJ_DIR)/utilities/ttl/ttl_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +write_batch_with_index_test: $(OBJ_DIR)/utilities/write_batch_with_index/write_batch_with_index_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +flush_job_test: $(OBJ_DIR)/db/flush_job_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_iterator_test: $(OBJ_DIR)/db/compaction/compaction_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_job_test: $(OBJ_DIR)/db/compaction/compaction_job_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_job_stats_test: $(OBJ_DIR)/db/compaction/compaction_job_stats_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compaction_service_test: 
$(OBJ_DIR)/db/compaction/compaction_service_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +compact_on_deletion_collector_test: $(OBJ_DIR)/utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +wal_manager_test: $(OBJ_DIR)/db/wal_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) +wal_edit_test: $(OBJ_DIR)/db/wal_edit_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +dbformat_test: $(OBJ_DIR)/db/dbformat_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_basic_test: $(OBJ_DIR)/env/env_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_iterator_test: db/compaction/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_test: $(OBJ_DIR)/env/env_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_job_test: db/compaction/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +io_posix_test: $(OBJ_DIR)/env/io_posix_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_job_stats_test: db/compaction/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) +fault_injection_test: $(OBJ_DIR)/db/fault_injection_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compact_on_deletion_collector_test: utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) +rate_limiter_test: $(OBJ_DIR)/util/rate_limiter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) +delete_scheduler_test: $(OBJ_DIR)/file/delete_scheduler_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) +filename_test: $(OBJ_DIR)/db/filename_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_basic_test: env/env_basic_test.o 
$(LIBOBJECTS) $(TESTHARNESS) +random_access_file_reader_test: $(OBJ_DIR)/file/random_access_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_test: env/env_test.o $(LIBOBJECTS) $(TESTHARNESS) +file_reader_writer_test: $(OBJ_DIR)/util/file_reader_writer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_filter_block_test: $(OBJ_DIR)/table/block_based/block_based_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -rate_limiter_test: util/rate_limiter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_table_reader_test: table/block_based/block_based_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -delete_scheduler_test: file/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) +full_filter_block_test: $(OBJ_DIR)/table/block_based/full_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) +partitioned_filter_block_test: $(OBJ_DIR)/table/block_based/partitioned_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS) +log_test: $(OBJ_DIR)/db/log_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -block_based_filter_block_test: table/block_based/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +cleanable_test: $(OBJ_DIR)/table/cleanable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -full_filter_block_test: table/block_based/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +table_test: $(OBJ_DIR)/table/table_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -partitioned_filter_block_test: table/block_based/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_fetcher_test: table/block_fetcher_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_test: $(OBJ_DIR)/table/block_based/block_test.o $(TEST_LIBRARY) $(LIBRARY) 
$(AM_LINK) -cleanable_test: table/cleanable_test.o $(LIBOBJECTS) $(TESTHARNESS) +data_block_hash_index_test: $(OBJ_DIR)/table/block_based/data_block_hash_index_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) +inlineskiplist_test: $(OBJ_DIR)/memtable/inlineskiplist_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -block_test: table/block_based/block_test.o $(LIBOBJECTS) $(TESTHARNESS) +skiplist_test: $(OBJ_DIR)/memtable/skiplist_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -data_block_hash_index_test: table/block_based/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_buffer_manager_test: $(OBJ_DIR)/memtable/write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) +version_edit_test: $(OBJ_DIR)/db/version_edit_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -skiplist_test: memtable/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) +version_set_test: $(OBJ_DIR)/db/version_set_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_buffer_manager_test: memtable/write_buffer_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_picker_test: $(OBJ_DIR)/db/compaction/compaction_picker_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) +version_builder_test: $(OBJ_DIR)/db/version_builder_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) +file_indexer_test: $(OBJ_DIR)/db/file_indexer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compaction_picker_test: db/compaction/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) +reduce_levels_test: $(OBJ_DIR)/tools/reduce_levels_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_batch_test: $(OBJ_DIR)/db/write_batch_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) 
-file_indexer_test: db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_controller_test: $(OBJ_DIR)/db/write_controller_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) +merge_helper_test: $(OBJ_DIR)/db/merge_helper_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) +memory_test: $(OBJ_DIR)/utilities/memory/memory_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) +merge_test: $(OBJ_DIR)/db/merge_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -merge_helper_test: db/merge_helper_test.o $(LIBOBJECTS) $(TESTHARNESS) +merger_test: $(OBJ_DIR)/table/merger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -memory_test: utilities/memory/memory_test.o $(LIBOBJECTS) $(TESTHARNESS) +util_merge_operators_test: $(OBJ_DIR)/utilities/util_merge_operators_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_file_test: $(OBJ_DIR)/db/options_file_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) +deletefile_test: $(OBJ_DIR)/db/deletefile_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -util_merge_operators_test: utilities/util_merge_operators_test.o $(LIBOBJECTS) $(TESTHARNESS) +obsolete_files_test: $(OBJ_DIR)/db/obsolete_files_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_file_test: db/options_file_test.o $(LIBOBJECTS) $(TESTHARNESS) +rocksdb_dump: $(OBJ_DIR)/tools/dump/rocksdb_dump.o $(LIBRARY) $(AM_LINK) -deletefile_test: db/deletefile_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +rocksdb_undump: $(OBJ_DIR)/tools/dump/rocksdb_undump.o $(LIBRARY) $(AM_LINK) -obsolete_files_test: db/obsolete_files_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_builder_test: $(OBJ_DIR)/table/cuckoo/cuckoo_table_builder_test.o $(TEST_LIBRARY) 
$(LIBRARY) $(AM_LINK) -rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) +cuckoo_table_reader_test: $(OBJ_DIR)/table/cuckoo/cuckoo_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -rocksdb_undump: tools/dump/rocksdb_undump.o $(LIBOBJECTS) +cuckoo_table_db_test: $(OBJ_DIR)/db/cuckoo_table_db_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cuckoo_table_builder_test: table/cuckoo/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +listener_test: $(OBJ_DIR)/db/listener_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cuckoo_table_reader_test: table/cuckoo/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +thread_list_test: $(OBJ_DIR)/util/thread_list_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +compact_files_test: $(OBJ_DIR)/db/compact_files_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -listener_test: db/listener_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +configurable_test: options/configurable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) +customizable_test: options/customizable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -compact_files_test: db/compact_files_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_test: $(OBJ_DIR)/options/options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_test: options/options_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_settable_test: $(OBJ_DIR)/options/options_settable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_settable_test: options/options_settable_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_util_test: $(OBJ_DIR)/utilities/options/options_util_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -options_util_test: utilities/options/options_util_test.o $(LIBOBJECTS) $(TESTHARNESS) +db_bench_tool_test: $(OBJ_DIR)/tools/db_bench_tool_test.o $(BENCH_OBJECTS) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_bench_tool_test: tools/db_bench_tool_test.o 
$(BENCHTOOLOBJECTS) $(TESTHARNESS) +trace_analyzer_test: $(OBJ_DIR)/tools/trace_analyzer_test.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) +event_logger_test: $(OBJ_DIR)/logging/event_logger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -event_logger_test: logging/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +timer_queue_test: $(OBJ_DIR)/util/timer_queue_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) +sst_dump_test: $(OBJ_DIR)/tools/sst_dump_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sst_dump_test: tools/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) +optimistic_transaction_test: $(OBJ_DIR)/utilities/transactions/optimistic_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -optimistic_transaction_test: utilities/transactions/optimistic_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +mock_env_test : $(OBJ_DIR)/env/mock_env_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -mock_env_test : env/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) +manual_compaction_test: $(OBJ_DIR)/db/manual_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +filelock_test: $(OBJ_DIR)/util/filelock_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: $(OBJ_DIR)/logging/auto_roll_logger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_logger_test: $(OBJ_DIR)/logging/env_logger_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -env_logger_test: logging/env_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +memtable_list_test: $(OBJ_DIR)/db/memtable_list_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -memtable_list_test: 
db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_callback_test: $(OBJ_DIR)/db/write_callback_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_callback_test: db/write_callback_test.o $(LIBOBJECTS) $(TESTHARNESS) +heap_test: $(OBJ_DIR)/util/heap_test.o $(GTEST) $(AM_LINK) -heap_test: util/heap_test.o $(GTEST) +point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +transaction_test: $(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_prepared_transaction_test: utilities/transactions/write_prepared_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_prepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -write_unprepared_transaction_test: utilities/transactions/write_unprepared_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) +write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unprepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sst_dump: tools/sst_dump.o $(LIBOBJECTS) +sst_dump: $(OBJ_DIR)/tools/sst_dump.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -blob_dump: tools/blob_dump.o $(LIBOBJECTS) +blob_dump: $(OBJ_DIR)/tools/blob_dump.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -repair_test: db/repair_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +repair_test: $(OBJ_DIR)/db/repair_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -ldb_cmd_test: tools/ldb_cmd_test.o $(LIBOBJECTS) $(TESTHARNESS) +ldb_cmd_test: $(OBJ_DIR)/tools/ldb_cmd_test.o $(TOOLS_LIBRARY) $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -ldb: tools/ldb.o $(LIBOBJECTS) +ldb: $(OBJ_DIR)/tools/ldb.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) -iostats_context_test: monitoring/iostats_context_test.o $(LIBOBJECTS) $(TESTHARNESS) +iostats_context_test: $(OBJ_DIR)/monitoring/iostats_context_test.o 
$(TEST_LIBRARY) $(LIBRARY) $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -persistent_cache_test: utilities/persistent_cache/persistent_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +persistent_cache_test: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +statistics_test: $(OBJ_DIR)/monitoring/statistics_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +stats_history_test: $(OBJ_DIR)/monitoring/stats_history_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +lru_cache_test: $(OBJ_DIR)/cache/lru_cache_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_del_aggregator_test: $(OBJ_DIR)/db/range_del_aggregator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_del_aggregator_bench: $(OBJ_DIR)/db/range_del_aggregator_bench.o $(LIBRARY) + $(AM_LINK) + +blob_db_test: $(OBJ_DIR)/utilities/blob_db/blob_db_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_locking_test: utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +sst_file_reader_test: $(OBJ_DIR)/table/sst_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_secondary_test: $(OBJ_DIR)/db/db_secondary_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +block_cache_tracer_test: $(OBJ_DIR)/trace_replay/block_cache_tracer_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +block_cache_trace_analyzer_test: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_test.o $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -statistics_test: monitoring/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) +defer_test: $(OBJ_DIR)/util/defer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -stats_history_test: 
monitoring/stats_history_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +blob_counting_iterator_test: $(OBJ_DIR)/db/blob/blob_counting_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -lru_cache_test: cache/lru_cache_test.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_addition_test: $(OBJ_DIR)/db/blob/blob_file_addition_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_builder_test: $(OBJ_DIR)/db/blob/blob_file_builder_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -range_del_aggregator_bench: db/range_del_aggregator_bench.o $(LIBOBJECTS) $(TESTUTIL) +blob_file_cache_test: $(OBJ_DIR)/db/blob/blob_file_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_garbage_test: $(OBJ_DIR)/db/blob/blob_file_garbage_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -repeatable_thread_test: util/repeatable_thread_test.o $(LIBOBJECTS) $(TESTHARNESS) +blob_file_reader_test: $(OBJ_DIR)/db/blob/blob_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +blob_garbage_meter_test: $(OBJ_DIR)/db/blob/blob_garbage_meter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +timer_test: $(OBJ_DIR)/util/timer_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +periodic_work_scheduler_test: $(OBJ_DIR)/db/periodic_work_scheduler_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) +testutil_test: $(OBJ_DIR)/test_util/testutil_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) 
-block_cache_trace_analyzer_test: tools/block_cache_analyzer/block_cache_trace_analyzer_test.o tools/block_cache_analyzer/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) +io_tracer_test: $(OBJ_DIR)/trace_replay/io_tracer_test.o $(OBJ_DIR)/trace_replay/io_tracer.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -defer_test: util/defer_test.o $(LIBOBJECTS) $(TESTHARNESS) +prefetch_test: $(OBJ_DIR)/file/prefetch_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +io_tracer_parser_test: $(OBJ_DIR)/tools/io_tracer_parser_test.o $(OBJ_DIR)/tools/io_tracer_parser_tool.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +io_tracer_parser: $(OBJ_DIR)/tools/io_tracer_parser.o $(TOOLS_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_blob_corruption_test: $(OBJ_DIR)/db/blob/db_blob_corruption_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_write_buffer_manager_test: $(OBJ_DIR)/db/db_write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +clipping_iterator_test: $(OBJ_DIR)/db/compaction/clipping_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +ribbon_bench: $(OBJ_DIR)/microbench/ribbon_bench.o $(LIBRARY) + $(AM_LINK) + +db_basic_bench: $(OBJ_DIR)/microbench/db_basic_bench.o $(LIBRARY) + $(AM_LINK) + +cache_reservation_manager_test: $(OBJ_DIR)/cache/cache_reservation_manager_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) #------------------------------------------------- # make install related stuff -INSTALL_PATH ?= /usr/local +PREFIX ?= /usr/local +LIBDIR ?= $(PREFIX)/lib +INSTALL_LIBDIR = $(DESTDIR)$(LIBDIR) uninstall: - rm -rf $(INSTALL_PATH)/include/rocksdb \ - $(INSTALL_PATH)/lib/$(LIBRARY) \ - $(INSTALL_PATH)/lib/$(SHARED4) \ - $(INSTALL_PATH)/lib/$(SHARED3) \ - $(INSTALL_PATH)/lib/$(SHARED2) \ - $(INSTALL_PATH)/lib/$(SHARED1) - -install-headers: - install -d $(INSTALL_PATH)/lib + rm -rf $(DESTDIR)$(PREFIX)/include/rocksdb \ + $(INSTALL_LIBDIR)/$(LIBRARY) \ + $(INSTALL_LIBDIR)/$(SHARED4) \ + $(INSTALL_LIBDIR)/$(SHARED3) \ + $(INSTALL_LIBDIR)/$(SHARED2) \ + 
$(INSTALL_LIBDIR)/$(SHARED1) \ + $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc + +install-headers: gen-pc + install -d $(INSTALL_LIBDIR) + install -d $(INSTALL_LIBDIR)/pkgconfig for header_dir in `$(FIND) "include/rocksdb" -type d`; do \ - install -d $(INSTALL_PATH)/$$header_dir; \ + install -d $(DESTDIR)/$(PREFIX)/$$header_dir; \ done for header in `$(FIND) "include/rocksdb" -type f -name *.h`; do \ - install -C -m 644 $$header $(INSTALL_PATH)/$$header; \ + install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/$$header; \ + done + for header in $(ROCKSDB_PLUGIN_HEADERS); do \ + install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \ + install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \ done + install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc install-static: install-headers $(LIBRARY) - install -C -m 755 $(LIBRARY) $(INSTALL_PATH)/lib + install -d $(INSTALL_LIBDIR) + install -C -m 755 $(LIBRARY) $(INSTALL_LIBDIR) install-shared: install-headers $(SHARED4) - install -C -m 755 $(SHARED4) $(INSTALL_PATH)/lib && \ - ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED3) && \ - ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED2) && \ - ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED1) + install -d $(INSTALL_LIBDIR) + install -C -m 755 $(SHARED4) $(INSTALL_LIBDIR) + ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED3) + ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED2) + ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED1) # install static by default + install shared if it exists install: install-static [ -e $(SHARED4) ] && $(MAKE) install-shared || : +# Generate the pkg-config file +gen-pc: + -echo 'prefix=$(PREFIX)' > rocksdb.pc + -echo 'exec_prefix=$${prefix}' >> rocksdb.pc + -echo 'includedir=$${prefix}/include' >> rocksdb.pc + -echo 'libdir=$(LIBDIR)' >> rocksdb.pc + -echo '' >> rocksdb.pc + -echo 'Name: rocksdb' >> rocksdb.pc + -echo 'Description: An embeddable persistent key-value store for fast storage' >> rocksdb.pc + -echo Version: $(shell 
./build_tools/version.sh full) >> rocksdb.pc + -echo 'Libs: -L$${libdir} $(EXEC_LDFLAGS) -lrocksdb' >> rocksdb.pc + -echo 'Libs.private: $(PLATFORM_LDFLAGS)' >> rocksdb.pc + -echo 'Cflags: -I$${includedir} $(PLATFORM_CXXFLAGS)' >> rocksdb.pc + -echo 'Requires: $(subst ",,$(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))' >> rocksdb.pc + #------------------------------------------------- # --------------------------------------------------------------------------- # Jni stuff # --------------------------------------------------------------------------- - JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter amd64 ppc64 ppc64le arm64 aarch64 sparc64, $(MACHINE))) + ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64, $(MACHINE))) ARCH := 64 else ARCH := 32 @@ -1783,37 +2072,48 @@ JNI_LIBC_POSTFIX = -$(JNI_LIBC) endif -ifneq (,$(filter ppc% arm64 aarch64 sparc64, $(MACHINE))) +ifeq (,$(ROCKSDBJNILIB)) +ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE))) ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so else ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so endif -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar -ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar -ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar +endif +ROCKSDB_JAVA_VERSION ?= $(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH) +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar +ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar +ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar +ROCKSDB_SOURCES_JAR = 
rocksdbjni-$(ROCKSDB_JAVA_VERSION)-sources.jar SHA256_CMD = sha256sum ZLIB_VER ?= 1.2.11 ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1 -ZLIB_DOWNLOAD_BASE ?= http://zlib.net -BZIP2_VER ?= 1.0.6 -BZIP2_SHA256 ?= a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd -BZIP2_DOWNLOAD_BASE ?= https://downloads.sourceforge.net/project/bzip2 -SNAPPY_VER ?= 1.1.7 -SNAPPY_SHA256 ?= 3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4 +ZLIB_DOWNLOAD_BASE ?= https://zlib.net/fossils +BZIP2_VER ?= 1.0.8 +BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 +BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2 +SNAPPY_VER ?= 1.1.8 +SNAPPY_SHA256 ?= 16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.9.2 -LZ4_SHA256 ?= 658ba6191fa44c92280d4aa2c271b0f4fbc0e34d249578dd05e50e76d0e5efcc +LZ4_VER ?= 1.9.3 +LZ4_SHA256 ?= 030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.4.4 -ZSTD_SHA256 ?= a364f5162c7d1a455cc915e8e3cf5f4bd8b75d09bc0f53965b0c9ca1383c52c8 +ZSTD_VER ?= 1.4.9 +ZSTD_SHA256 ?= acf714d98e3db7b876e5b540cbf6dee298f60eb3c0723104f6d3f065cd60d6a8 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 ifeq ($(PLATFORM), OS_MACOSX) +ifeq (,$(findstring librocksdbjni-osx,$(ROCKSDBJNILIB))) +ifeq ($(MACHINE),arm64) + ROCKSDBJNILIB = librocksdbjni-osx-arm64.jnilib +else ifeq ($(MACHINE),x86_64) + ROCKSDBJNILIB = librocksdbjni-osx-x86_64.jnilib +else ROCKSDBJNILIB = librocksdbjni-osx.jnilib - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar +endif +endif + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-osx.jar SHA256_CMD = openssl sha256 -r ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") JAVA_INCLUDE = -I$(JAVA_HOME)/include -I 
$(JAVA_HOME)/include/darwin @@ -1821,10 +2121,11 @@ JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif endif + ifeq ($(PLATFORM), OS_FREEBSD) JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/freebsd ROCKSDBJNILIB = librocksdbjni-freebsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-freebsd$(ARCH).jar + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-freebsd$(ARCH).jar endif ifeq ($(PLATFORM), OS_SOLARIS) ROCKSDBJNILIB = librocksdbjni-solaris$(ARCH).so @@ -1839,142 +2140,186 @@ SNAPPY_MAKE_TARGET = libsnappy.la endif ifeq ($(PLATFORM), OS_OPENBSD) - JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd ROCKSDBJNILIB = librocksdbjni-openbsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-openbsd$(ARCH).jar + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar endif -libz.a: - -rm -rf zlib-$(ZLIB_VER) -ifeq (,$(wildcard ./zlib-$(ZLIB_VER).tar.gz)) +zlib-$(ZLIB_VER).tar.gz: curl --fail --output zlib-$(ZLIB_VER).tar.gz --location ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz -endif ZLIB_SHA256_ACTUAL=`$(SHA256_CMD) zlib-$(ZLIB_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZLIB_SHA256)" != "$$ZLIB_SHA256_ACTUAL" ]; then \ echo zlib-$(ZLIB_VER).tar.gz checksum mismatch, expected=\"$(ZLIB_SHA256)\" actual=\"$$ZLIB_SHA256_ACTUAL\"; \ exit 1; \ fi + +libz.a: zlib-$(ZLIB_VER).tar.gz + -rm -rf zlib-$(ZLIB_VER) tar xvzf zlib-$(ZLIB_VER).tar.gz - cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${EXTRA_CFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' ./configure --static && $(MAKE) + if [ -n"$(ARCHFLAG)" ]; then \ + cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static --archs="$(ARCHFLAG)" && $(MAKE); \ + else \ + cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} 
${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static && $(MAKE); \ + fi cp zlib-$(ZLIB_VER)/libz.a . -libbz2.a: - -rm -rf bzip2-$(BZIP2_VER) -ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz)) +bzip2-$(BZIP2_VER).tar.gz: curl --fail --output bzip2-$(BZIP2_VER).tar.gz --location ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz -endif BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \ echo bzip2-$(BZIP2_VER).tar.gz checksum mismatch, expected=\"$(BZIP2_SHA256)\" actual=\"$$BZIP2_SHA256_ACTUAL\"; \ exit 1; \ fi + +libbz2.a: bzip2-$(BZIP2_VER).tar.gz + -rm -rf bzip2-$(BZIP2_VER) tar xvzf bzip2-$(BZIP2_VER).tar.gz - cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${EXTRA_CFLAGS}' AR='ar ${EXTRA_ARFLAGS}' + cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' AR='ar ${EXTRA_ARFLAGS}' libbz2.a cp bzip2-$(BZIP2_VER)/libbz2.a . -libsnappy.a: - -rm -rf snappy-$(SNAPPY_VER) -ifeq (,$(wildcard ./snappy-$(SNAPPY_VER).tar.gz)) +snappy-$(SNAPPY_VER).tar.gz: curl --fail --output snappy-$(SNAPPY_VER).tar.gz --location ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER).tar.gz -endif SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \ echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \ exit 1; \ fi + +libsnappy.a: snappy-$(SNAPPY_VER).tar.gz + -rm -rf snappy-$(SNAPPY_VER) tar xvzf snappy-$(SNAPPY_VER).tar.gz mkdir snappy-$(SNAPPY_VER)/build - cd snappy-$(SNAPPY_VER)/build && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. 
&& $(MAKE) ${SNAPPY_MAKE_TARGET} + cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET} cp snappy-$(SNAPPY_VER)/build/libsnappy.a . -liblz4.a: - -rm -rf lz4-$(LZ4_VER) -ifeq (,$(wildcard ./lz4-$(LZ4_VER).tar.gz)) +lz4-$(LZ4_VER).tar.gz: curl --fail --output lz4-$(LZ4_VER).tar.gz --location ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz -endif LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \ echo lz4-$(LZ4_VER).tar.gz checksum mismatch, expected=\"$(LZ4_SHA256)\" actual=\"$$LZ4_SHA256_ACTUAL\"; \ exit 1; \ fi + +liblz4.a: lz4-$(LZ4_VER).tar.gz + -rm -rf lz4-$(LZ4_VER) tar xvzf lz4-$(LZ4_VER).tar.gz - cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' all + cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' all cp lz4-$(LZ4_VER)/lib/liblz4.a . -libzstd.a: - -rm -rf zstd-$(ZSTD_VER) -ifeq (,$(wildcard ./zstd-$(ZSTD_VER).tar.gz)) +zstd-$(ZSTD_VER).tar.gz: curl --fail --output zstd-$(ZSTD_VER).tar.gz --location ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz -endif ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \ echo zstd-$(ZSTD_VER).tar.gz checksum mismatch, expected=\"$(ZSTD_SHA256)\" actual=\"$$ZSTD_SHA256_ACTUAL\"; \ exit 1; \ fi + +libzstd.a: zstd-$(ZSTD_VER).tar.gz + -rm -rf zstd-$(ZSTD_VER) tar xvzf zstd-$(ZSTD_VER).tar.gz - cd zstd-$(ZSTD_VER)/lib && DESTDIR=. PREFIX= $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' install + cd zstd-$(ZSTD_VER)/lib && DESTDIR=. 
PREFIX= $(MAKE) CFLAGS='-fPIC -O2 $(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' libzstd.a cp zstd-$(ZSTD_VER)/lib/libzstd.a . -# A version of each $(LIBOBJECTS) compiled with -fPIC and a fixed set of static compression libraries -java_static_libobjects = $(patsubst %,jls/%,$(LIB_CC_OBJECTS)) -CLEAN_FILES += jls -java_static_all_libobjects = $(java_static_libobjects) - +# A version of each $(LIB_OBJECTS) compiled with -fPIC and a fixed set of static compression libraries ifneq ($(ROCKSDB_JAVA_NO_COMPRESSION), 1) JAVA_COMPRESSIONS = libz.a libbz2.a libsnappy.a liblz4.a libzstd.a endif JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 -DZSTD -JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib/include +JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./snappy-$(SNAPPY_VER)/build -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib -I./zstd-$(ZSTD_VER)/lib/dictBuilder -ifeq ($(HAVE_POWER8),1) -JAVA_STATIC_C_LIBOBJECTS = $(patsubst %.c.o,jls/%.c.o,$(LIB_SOURCES_C:.c=.o)) -JAVA_STATIC_ASM_LIBOBJECTS = $(patsubst %.S.o,jls/%.S.o,$(LIB_SOURCES_ASM:.S=.o)) - -java_static_ppc_libobjects = $(JAVA_STATIC_C_LIBOBJECTS) $(JAVA_STATIC_ASM_LIBOBJECTS) - -jls/util/crc32c_ppc.o: util/crc32c_ppc.c - $(AM_V_CC)$(CC) $(CFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -c $< -o $@ - -jls/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S - $(AM_V_CC)$(CC) $(CFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -c $< -o $@ - -java_static_all_libobjects += $(java_static_ppc_libobjects) +ifneq ($(findstring rocksdbjavastatic, $(filter-out rocksdbjavastatic_deps, $(MAKECMDGOALS))),) +CXXFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +endif +rocksdbjavastatic: +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(MAKE) 
rocksdbjavastatic_deps + $(MAKE) rocksdbjavastatic_libobjects + $(MAKE) rocksdbjavastatic_javalib + $(MAKE) rocksdbjava_jar + +rocksdbjavastaticosx: rocksdbjavastaticosx_archs + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + +rocksdbjavastaticosx_ub: rocksdbjavastaticosx_archs + cd java/target; lipo -create -output librocksdbjni-osx.jnilib librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + +rocksdbjavastaticosx_archs: + $(MAKE) rocksdbjavastaticosx_arch_x86_64 + $(MAKE) rocksdbjavastaticosx_arch_arm64 + +rocksdbjavastaticosx_arch_%: +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(MAKE) clean-ext-libraries-bin + $(MAKE) clean-rocks + ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_deps + ARCHFLAG="-arch $*" $(MAKE) rocksdbjavastatic_libobjects + ARCHFLAG="-arch $*" ROCKSDBJNILIB="librocksdbjni-osx-$*.jnilib" $(MAKE) rocksdbjavastatic_javalib + +ifeq ($(JAR_CMD),) +ifneq ($(JAVA_HOME),) +JAR_CMD := $(JAVA_HOME)/bin/jar +else +JAR_CMD := jar endif - -$(java_static_libobjects): jls/%.o: %.cc $(JAVA_COMPRESSIONS) - $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -fPIC -c $< -o $@ $(COVERAGEFLAGS) - -rocksdbjavastatic: $(java_static_all_libobjects) - cd java;$(MAKE) javalib; - rm -f ./java/target/$(ROCKSDBJNILIB) +endif 
+rocksdbjavastatic_javalib: + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib + rm -f java/target/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \ -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \ - $(java_static_all_libobjects) $(COVERAGEFLAGS) \ + $(LIB_OBJECTS) $(COVERAGEFLAGS) \ $(JAVA_COMPRESSIONS) $(JAVA_STATIC_LDFLAGS) cd java/target;if [ "$(DEBUG_LEVEL)" == "0" ]; then \ strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \ fi - cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class - cd java/target/apidocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * - cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org -rocksdbjavastaticrelease: rocksdbjavastatic +rocksdbjava_jar: + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + +rocksdbjava_javadocs_jar: + cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) * + openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1 + +rocksdbjava_sources_jar: + cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org + openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1 + +rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS) + +rocksdbjavastatic_libobjects: $(LIB_OBJECTS) + +rocksdbjavastaticrelease: rocksdbjavastaticosx rocksdbjava_javadocs_jar rocksdbjava_sources_jar cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt 
linux64 && vagrant up linux64-musl && vagrant halt linux64-musl - cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class - -rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl - cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1 + +rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl rocksdbjava_javadocs_jar rocksdbjava_sources_jar + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1 rocksdbjavastaticdockerx86: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target 
--env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target @@ -1988,87 +2333,83 @@ mkdir -p java/target docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh +rocksdbjavastaticdockers390x: + mkdir -p java/target + docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + rocksdbjavastaticdockerx86musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-musl-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro 
--volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticdockerx86_64musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticdockerppc64lemusl: mkdir -p java/target - docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticdockerarm64v8musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach 
stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh + +rocksdbjavastaticdockers390xmusl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral rocksdbjavastaticpublishdocker: rocksdbjavastaticreleasedocker rocksdbjavastaticpublishcentral -rocksdbjavastaticpublishcentral: - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ 
-DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64-musl.jar -Dclassifier=linux64-musl - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32-musl.jar -Dclassifier=linux32-musl - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-win64.jar -Dclassifier=win64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar - -# A version of each $(LIBOBJECTS) compiled with -fPIC -ifeq ($(HAVE_POWER8),1) -JAVA_CC_OBJECTS = $(SHARED_CC_OBJECTS) -JAVA_C_OBJECTS = $(SHARED_C_OBJECTS) 
-JAVA_ASM_OBJECTS = $(SHARED_ASM_OBJECTS) - -JAVA_C_LIBOBJECTS = $(patsubst %.c.o,jl/%.c.o,$(JAVA_C_OBJECTS)) -JAVA_ASM_LIBOBJECTS = $(patsubst %.S.o,jl/%.S.o,$(JAVA_ASM_OBJECTS)) -endif - -java_libobjects = $(patsubst %,jl/%,$(LIB_CC_OBJECTS)) -CLEAN_FILES += jl -java_all_libobjects = $(java_libobjects) +ROCKSDB_JAVA_RELEASE_CLASSIFIERS = javadoc sources linux64 linux32 linux64-musl linux32-musl osx win64 -ifeq ($(HAVE_POWER8),1) -java_ppc_libobjects = $(JAVA_C_LIBOBJECTS) $(JAVA_ASM_LIBOBJECTS) +rocksdbjavastaticpublishcentral: rocksdbjavageneratepom + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -Dclassifier=$(classifier);) + +rocksdbjavageneratepom: + cd java;cat pom.xml.template | sed 's/\$${ROCKSDB_JAVA_VERSION}/$(ROCKSDB_JAVA_VERSION)/' > pom.xml + +rocksdbjavastaticnexusbundlejar: rocksdbjavageneratepom + openssl sha1 -r java/pom.xml | awk '{ print $$1 }' > java/target/pom.xml.sha1 + openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar | awk '{ print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1 + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), openssl sha1 -r java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar | awk '{ print $$1 }' > java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1;) + gpg --yes --output java/target/pom.xml.asc -ab java/pom.xml + gpg --yes -ab java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), gpg --yes -ab 
java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar;) + $(JAR_CMD) cvf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java pom.xml -C java/target pom.xml.sha1 -C java/target pom.xml.asc -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar.asc + $(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), $(JAR_CMD) uf java/target/nexus-bundle-rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.sha1 -C java/target rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar.asc;) -jl/crc32c_ppc.o: util/crc32c_ppc.c - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -jl/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S - $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -java_all_libobjects += $(java_ppc_libobjects) -endif +# A version of each $(LIBOBJECTS) compiled with -fPIC -$(java_libobjects): jl/%.o: %.cc +jl/%.o: %.cc $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) - - -rocksdbjava: $(java_all_libobjects) - $(AM_V_GEN)cd java;$(MAKE) javalib; +rocksdbjava: $(LIB_OBJECTS) +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(AM_V_GEN)cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib; $(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB) - $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(java_all_libobjects) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) - $(AM_V_at)cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md - $(AM_V_at)cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) - $(AM_V_at)cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) + $(AM_V_at)cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + $(AM_V_at)cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + $(AM_V_at)cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + $(AM_V_at)openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 jclean: cd java;$(MAKE) clean; jtest_compile: rocksdbjava - cd java;$(MAKE) java_test + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) java_test jtest_run: cd java;$(MAKE) run_test jtest: rocksdbjava - cd java;$(MAKE) sample;$(MAKE) test; - python tools/check_all_python.py # TODO peterd: find a better place for this check in CI targets + cd java;$(MAKE) sample; SHA256_CMD='$(SHA256_CMD)' $(MAKE) test; + $(PYTHON) tools/check_all_python.py # TODO peterd: find a better place for this check in CI targets jdb_bench: cd java;$(MAKE) db_bench; @@ -2107,30 +2448,32 @@ else ifeq ($(HAVE_POWER8),1) -util/crc32c_ppc.o: util/crc32c_ppc.c +$(OBJ_DIR)/util/crc32c_ppc.o: util/crc32c_ppc.c $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ -util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S +$(OBJ_DIR)/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ endif -.cc.o: - $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) +$(OBJ_DIR)/%.o: %.cc + $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) -.cpp.o: - $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) +$(OBJ_DIR)/%.o: %.cpp + $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) -.c.o: +$(OBJ_DIR)/%.o: %.c $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ endif + # --------------------------------------------------------------------------- # Source files dependencies detection # --------------------------------------------------------------------------- - -all_sources = 
$(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) -DEPFILES = $(all_sources:.cc=.cc.d) - +# If skip dependencies is ON, skip including the dep files +ifneq ($(SKIP_DEPENDS), 1) +DEPFILES = $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES)) +DEPFILES+ = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C)) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - DEPFILES += $(FOLLY_SOURCES:.cpp=.cpp.d) + DEPFILES +=$(patsubst %.cpp, $(OBJ_DIR)/%.cpp.d, $(FOLLY_SOURCES)) +endif endif # Add proper dependency support so changing a .h file forces a .cc file to @@ -2138,23 +2481,25 @@ # The .d file indicates .cc file's dependencies on .h files. We generate such # dependency by g++'s -MM option, whose output is a make dependency rule. -%.cc.d: %.cc - @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ - -MM -MT'$@' -MT'$(<:.cc=.o)' "$<" -o '$@' - -%.cpp.d: %.cpp - @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ - -MM -MT'$@' -MT'$(<:.cpp=.o)' "$<" -o '$@' +$(OBJ_DIR)/%.cc.d: %.cc + @mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ + -MM -MT'$@' -MT'$(<:.cc=.o)' -MT'$(<:%.cc=$(OBJ_DIR)/%.o)' \ + "$<" -o '$@' + +$(OBJ_DIR)/%.cpp.d: %.cpp + @mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ + -MM -MT'$@' -MT'$(<:.cpp=.o)' -MT'$(<:%.cpp=$(OBJ_DIR)/%.o)' \ + "$<" -o '$@' ifeq ($(HAVE_POWER8),1) -DEPFILES_C = $(LIB_SOURCES_C:.c=.c.d) -DEPFILES_ASM = $(LIB_SOURCES_ASM:.S=.S.d) +DEPFILES_C = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C)) +DEPFILES_ASM = $(patsubst %.S, $(OBJ_DIR)/%.S.d, $(LIB_SOURCES_ASM)) -%.c.d: %.c +$(OBJ_DIR)/%.c.d: %.c @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ -MM -MT'$@' -MT'$(<:.c=.o)' "$<" -o '$@' -%.S.d: %.S +$(OBJ_DIR)/%.S.d: %.S @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ -MM -MT'$@' -MT'$(<:.S=.o)' "$<" -o '$@' @@ -2166,20 +2511,12 @@ depend: $(DEPFILES) endif -# if the make goal is either "clean" 
or "format", we shouldn't -# try to import the *.d files. -# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly -# working solution. -ifneq ($(MAKECMDGOALS),clean) -ifneq ($(MAKECMDGOALS),format) -ifneq ($(MAKECMDGOALS),jclean) -ifneq ($(MAKECMDGOALS),jtest) -ifneq ($(MAKECMDGOALS),package) -ifneq ($(MAKECMDGOALS),analyze) +build_subset_tests: $(ROCKSDBTESTS_SUBSET) + $(AM_V_GEN)if [ -n "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}" ]; then echo "$(ROCKSDBTESTS_SUBSET)" > "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}"; else echo "$(ROCKSDBTESTS_SUBSET)"; fi + +# Remove the rules for which dependencies should not be generated and see if any are left. +#If so, include the dependencies; if not, do not include the dependency files +ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test, $(MAKECMDGOALS)) +ifneq ("$(ROCKS_DEP_RULES)", "") -include $(DEPFILES) endif -endif -endif -endif -endif -endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/PLUGINS.md mariadb-10.11.13/storage/rocksdb/rocksdb/PLUGINS.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/PLUGINS.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/PLUGINS.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,4 @@ +This is the list of all known third-party plugins for RocksDB. If something is missing, please open a pull request to add it. 
+ +* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference +* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/README.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -1,8 +1,9 @@ ## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage -[![Linux/Mac Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) -[![Windows Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/master?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/master) -[![PPC64le Build Status](http://140.211.168.68:8080/buildStatus/icon?job=Rocksdb)](http://140.211.168.68:8080/job/Rocksdb) +[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb) +[![TravisCI Status](https://api.travis-ci.com/facebook/rocksdb.svg?branch=main)](https://travis-ci.com/github/facebook/rocksdb) +[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main) +[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb) RocksDB is developed and maintained by Facebook Database Engineering Team. It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com) @@ -16,7 +17,7 @@ making it especially suitable for storing multiple terabytes of data in a single database. 
-Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples +Start with example usage here: https://github.com/facebook/rocksdb/tree/main/examples See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation. @@ -24,7 +25,7 @@ rely on the details of any other header files in this package. Those internal APIs may be changed without warning. -Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ +Questions and discussions are welcome on the [RocksDB Developers Public](https://www.facebook.com/groups/rocksdb.dev/) Facebook group and [email list](https://groups.google.com/g/rocksdb) on Google Groups. ## License diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/TARGETS mariadb-10.11.13/storage/rocksdb/rocksdb/TARGETS --- mariadb-10.11.11/storage/rocksdb/rocksdb/TARGETS 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/TARGETS 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,5 @@ -# This file @generated by `python buckifier/buckify_rocksdb.py` +# This file @generated by: +#$ python3 buckifier/buckify_rocksdb.py # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. 
@@ -9,7 +10,7 @@ REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -24,19 +25,25 @@ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), - ("tbb", None), - ("googletest", None, "gtest"), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", - ["third-party//numa:numa", "third-party//liburing:uring"], + [ + "third-party//numa:numa", + "third-party//liburing:uring", + "third-party//tbb:tbb", + ], + ), + ( + "macos", + ["third-party//tbb:tbb"], ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -50,17 +57,33 @@ "-DHAVE_SSE42", "-DLIBURING", "-DNUMA", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", ], ), ( "macos", - ["-DOS_MACOSX"], + [ + "-DOS_MACOSX", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", + ], + ), + ( + "windows", + [ + "-DOS_WIN", + "-DWIN32", + "-D_MBCS", + "-DWIN64", + "-DNOMINMAX", + ], ), ] ROCKSDB_PREPROCESSOR_FLAGS = [ - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", "-DROCKSDB_SUPPORT_THREAD_LOCAL", # Flags to enable libs we include @@ -71,14 +94,15 @@ "-DZSTD", "-DZSTD_STATIC_LINKING_ONLY", "-DGFLAGS=gflags", - "-DTBB", # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { @@ -93,33 +117,53 @@ # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. 
-ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) +ROCKSDB_LIB_DEPS = [ + ":rocksdb_lib", + ":rocksdb_test_lib", +] if not is_opt_mode else [":rocksdb_lib"] + cpp_library( name = "rocksdb_lib", srcs = [ + "cache/cache.cc", + "cache/cache_entry_roles.cc", + "cache/cache_key.cc", + "cache/cache_reservation_manager.cc", "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", + "db/blob/blob_file_addition.cc", + "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", + "db/blob/blob_file_garbage.cc", + "db/blob/blob_file_meta.cc", + "db/blob/blob_file_reader.cc", + "db/blob/blob_garbage_meter.cc", + "db/blob/blob_log_format.cc", + "db/blob/blob_log_sequential_reader.cc", + "db/blob/blob_log_writer.cc", + "db/blob/prefetch_buffer_collection.cc", "db/builder.cc", "db/c.cc", "db/column_family.cc", - "db/compacted_db_impl.cc", "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", "db/compaction/compaction_job.cc", @@ -127,8 +171,10 @@ "db/compaction/compaction_picker_fifo.cc", "db/compaction/compaction_picker_level.cc", "db/compaction/compaction_picker_universal.cc", + "db/compaction/sst_partitioner.cc", "db/convenience.cc", "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", "db/db_impl/db_impl.cc", "db/db_impl/db_impl_compaction_flush.cc", "db/db_impl/db_impl_debug.cc", @@ 
-159,6 +205,8 @@ "db/memtable_list.cc", "db/merge_helper.cc", "db/merge_operator.cc", + "db/output_validator.cc", + "db/periodic_work_scheduler.cc", "db/range_del_aggregator.cc", "db/range_tombstone_fragmenter.cc", "db/repair.cc", @@ -169,25 +217,32 @@ "db/trim_history_scheduler.cc", "db/version_builder.cc", "db/version_edit.cc", + "db/version_edit_handler.cc", "db/version_set.cc", + "db/wal_edit.cc", "db/wal_manager.cc", "db/write_batch.cc", "db/write_batch_base.cc", "db/write_controller.cc", "db/write_thread.cc", + "env/composite_env.cc", "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", "env/env_hdfs.cc", "env/env_posix.cc", "env/file_system.cc", + "env/file_system_tracer.cc", "env/fs_posix.cc", + "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", + "env/unique_id_gen.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", "file/filename.cc", + "file/line_file_reader.cc", "file/random_access_file_reader.cc", "file/read_write_util.cc", "file/readahead_raf.cc", @@ -200,6 +255,8 @@ "memory/arena.cc", "memory/concurrent_arena.cc", "memory/jemalloc_nodump_allocator.cc", + "memory/memkind_kmem_allocator.cc", + "memory/memory_allocator.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -221,20 +278,30 @@ "monitoring/thread_status_util.cc", "monitoring/thread_status_util_debug.cc", "options/cf_options.cc", + "options/configurable.cc", + "options/customizable.cc", "options/db_options.cc", "options/options.cc", "options/options_helper.cc", "options/options_parser.cc", - "options/options_sanity_check.cc", "port/port_posix.cc", "port/stack_trace.cc", + "port/win/env_default.cc", + "port/win/env_win.cc", + "port/win/io_win.cc", + "port/win/port_win.cc", + "port/win/win_logger.cc", + "port/win/win_thread.cc", "table/adaptive/adaptive_table_factory.cc", + "table/block_based/binary_search_index_reader.cc", "table/block_based/block.cc", 
"table/block_based/block_based_filter_block.cc", "table/block_based/block_based_table_builder.cc", "table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_iterator.cc", "table/block_based/block_based_table_reader.cc", "table/block_based/block_builder.cc", + "table/block_based/block_prefetcher.cc", "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", "table/block_based/data_block_hash_index.cc", @@ -242,9 +309,14 @@ "table/block_based/filter_policy.cc", "table/block_based/flush_block_policy.cc", "table/block_based/full_filter_block.cc", + "table/block_based/hash_index_reader.cc", "table/block_based/index_builder.cc", + "table/block_based/index_reader_common.cc", "table/block_based/parsed_full_filter_block.cc", "table/block_based/partitioned_filter_block.cc", + "table/block_based/partitioned_index_iterator.cc", + "table/block_based/partitioned_index_reader.cc", + "table/block_based/reader_common.cc", "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", "table/cuckoo/cuckoo_table_builder.cc", @@ -262,18 +334,26 @@ "table/plain/plain_table_index.cc", "table/plain/plain_table_key_coding.cc", "table/plain/plain_table_reader.cc", + "table/sst_file_dumper.cc", "table/sst_file_reader.cc", "table/sst_file_writer.cc", + "table/table_factory.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "table/unique_id.cc", "test_util/sync_point.cc", "test_util/sync_point_impl.cc", "test_util/transaction_test_util.cc", "tools/dump/db_dump_tool.cc", + "tools/io_tracer_parser_tool.cc", "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", "trace_replay/block_cache_tracer.cc", + "trace_replay/io_tracer.cc", + "trace_replay/trace_record.cc", + "trace_replay/trace_record_handler.cc", + "trace_replay/trace_record_result.cc", "trace_replay/trace_replay.cc", "util/build_version.cc", "util/coding.cc", @@ -282,12 +362,15 @@ "util/compression_context_cache.cc", 
"util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", + "util/crc32c_arm64.cc", "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", + "util/regex.cc", + "util/ribbon_config.cc", "util/slice.cc", "util/status.cc", "util/string_util.cc", @@ -301,20 +384,24 @@ "utilities/blob_db/blob_db_impl_filesnapshot.cc", "utilities/blob_db/blob_dump_tool.cc", "utilities/blob_db/blob_file.cc", - "utilities/blob_db/blob_log_format.cc", - "utilities/blob_db/blob_log_reader.cc", - "utilities/blob_db/blob_log_writer.cc", + "utilities/cache_dump_load.cc", + "utilities/cache_dump_load_impl.cc", "utilities/cassandra/cassandra_compaction_filter.cc", "utilities/cassandra/format.cc", "utilities/cassandra/merge_operator.cc", "utilities/checkpoint/checkpoint_impl.cc", + "utilities/compaction_filters.cc", "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", "utilities/convenience/info_log_finder.cc", "utilities/debug.cc", "utilities/env_mirror.cc", "utilities/env_timed.cc", + "utilities/fault_injection_env.cc", + "utilities/fault_injection_fs.cc", + "utilities/fault_injection_secondary_cache.cc", "utilities/leveldb_options/leveldb_options.cc", "utilities/memory/memory_util.cc", + "utilities/merge_operators.cc", "utilities/merge_operators/bytesxor.cc", "utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", @@ -334,6 +421,24 @@ "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", + "utilities/trace/replayer_impl.cc", + "utilities/transactions/lock/lock_manager.cc", + "utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + 
"utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -341,24 +446,356 @@ "utilities/transactions/snapshot_checker.cc", "utilities/transactions/transaction_base.cc", "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_lock_mgr.cc", "utilities/transactions/transaction_util.cc", "utilities/transactions/write_prepared_txn.cc", "utilities/transactions/write_prepared_txn_db.cc", "utilities/transactions/write_unprepared_txn.cc", "utilities/transactions/write_unprepared_txn_db.cc", "utilities/ttl/db_ttl_impl.cc", + "utilities/wal_filter.cc", "utilities/write_batch_with_index/write_batch_with_index.cc", "utilities/write_batch_with_index/write_batch_with_index_internal.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = 
ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, +) + +cpp_library( + name = "rocksdb_whole_archive_lib", + srcs = [ + "cache/cache.cc", + "cache/cache_entry_roles.cc", + "cache/cache_key.cc", + "cache/cache_reservation_manager.cc", + "cache/clock_cache.cc", + "cache/lru_cache.cc", + "cache/sharded_cache.cc", + "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", + "db/blob/blob_file_addition.cc", + "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", + "db/blob/blob_file_garbage.cc", + "db/blob/blob_file_meta.cc", + "db/blob/blob_file_reader.cc", + "db/blob/blob_garbage_meter.cc", + "db/blob/blob_log_format.cc", + "db/blob/blob_log_sequential_reader.cc", + "db/blob/blob_log_writer.cc", + "db/blob/prefetch_buffer_collection.cc", + "db/builder.cc", + "db/c.cc", + "db/column_family.cc", + "db/compaction/compaction.cc", + "db/compaction/compaction_iterator.cc", + "db/compaction/compaction_job.cc", + "db/compaction/compaction_picker.cc", + "db/compaction/compaction_picker_fifo.cc", + "db/compaction/compaction_picker_level.cc", + "db/compaction/compaction_picker_universal.cc", + "db/compaction/sst_partitioner.cc", + "db/convenience.cc", + "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", + "db/db_impl/db_impl.cc", + "db/db_impl/db_impl_compaction_flush.cc", + "db/db_impl/db_impl_debug.cc", + "db/db_impl/db_impl_experimental.cc", + "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_open.cc", + "db/db_impl/db_impl_readonly.cc", + "db/db_impl/db_impl_secondary.cc", + "db/db_impl/db_impl_write.cc", + "db/db_info_dumper.cc", + "db/db_iter.cc", + "db/dbformat.cc", + "db/error_handler.cc", + "db/event_helpers.cc", + "db/experimental.cc", + "db/external_sst_file_ingestion_job.cc", + "db/file_indexer.cc", + "db/flush_job.cc", + "db/flush_scheduler.cc", + "db/forward_iterator.cc", + 
"db/import_column_family_job.cc", + "db/internal_stats.cc", + "db/log_reader.cc", + "db/log_writer.cc", + "db/logs_with_prep_tracker.cc", + "db/malloc_stats.cc", + "db/memtable.cc", + "db/memtable_list.cc", + "db/merge_helper.cc", + "db/merge_operator.cc", + "db/output_validator.cc", + "db/periodic_work_scheduler.cc", + "db/range_del_aggregator.cc", + "db/range_tombstone_fragmenter.cc", + "db/repair.cc", + "db/snapshot_impl.cc", + "db/table_cache.cc", + "db/table_properties_collector.cc", + "db/transaction_log_impl.cc", + "db/trim_history_scheduler.cc", + "db/version_builder.cc", + "db/version_edit.cc", + "db/version_edit_handler.cc", + "db/version_set.cc", + "db/wal_edit.cc", + "db/wal_manager.cc", + "db/write_batch.cc", + "db/write_batch_base.cc", + "db/write_controller.cc", + "db/write_thread.cc", + "env/composite_env.cc", + "env/env.cc", + "env/env_chroot.cc", + "env/env_encryption.cc", + "env/env_hdfs.cc", + "env/env_posix.cc", + "env/file_system.cc", + "env/file_system_tracer.cc", + "env/fs_posix.cc", + "env/fs_remap.cc", + "env/io_posix.cc", + "env/mock_env.cc", + "env/unique_id_gen.cc", + "file/delete_scheduler.cc", + "file/file_prefetch_buffer.cc", + "file/file_util.cc", + "file/filename.cc", + "file/line_file_reader.cc", + "file/random_access_file_reader.cc", + "file/read_write_util.cc", + "file/readahead_raf.cc", + "file/sequence_file_reader.cc", + "file/sst_file_manager_impl.cc", + "file/writable_file_writer.cc", + "logging/auto_roll_logger.cc", + "logging/event_logger.cc", + "logging/log_buffer.cc", + "memory/arena.cc", + "memory/concurrent_arena.cc", + "memory/jemalloc_nodump_allocator.cc", + "memory/memkind_kmem_allocator.cc", + "memory/memory_allocator.cc", + "memtable/alloc_tracker.cc", + "memtable/hash_linklist_rep.cc", + "memtable/hash_skiplist_rep.cc", + "memtable/skiplistrep.cc", + "memtable/vectorrep.cc", + "memtable/write_buffer_manager.cc", + "monitoring/histogram.cc", + "monitoring/histogram_windowing.cc", + 
"monitoring/in_memory_stats_history.cc", + "monitoring/instrumented_mutex.cc", + "monitoring/iostats_context.cc", + "monitoring/perf_context.cc", + "monitoring/perf_level.cc", + "monitoring/persistent_stats_history.cc", + "monitoring/statistics.cc", + "monitoring/thread_status_impl.cc", + "monitoring/thread_status_updater.cc", + "monitoring/thread_status_updater_debug.cc", + "monitoring/thread_status_util.cc", + "monitoring/thread_status_util_debug.cc", + "options/cf_options.cc", + "options/configurable.cc", + "options/customizable.cc", + "options/db_options.cc", + "options/options.cc", + "options/options_helper.cc", + "options/options_parser.cc", + "port/port_posix.cc", + "port/stack_trace.cc", + "port/win/env_default.cc", + "port/win/env_win.cc", + "port/win/io_win.cc", + "port/win/port_win.cc", + "port/win/win_logger.cc", + "port/win/win_thread.cc", + "table/adaptive/adaptive_table_factory.cc", + "table/block_based/binary_search_index_reader.cc", + "table/block_based/block.cc", + "table/block_based/block_based_filter_block.cc", + "table/block_based/block_based_table_builder.cc", + "table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_iterator.cc", + "table/block_based/block_based_table_reader.cc", + "table/block_based/block_builder.cc", + "table/block_based/block_prefetcher.cc", + "table/block_based/block_prefix_index.cc", + "table/block_based/data_block_footer.cc", + "table/block_based/data_block_hash_index.cc", + "table/block_based/filter_block_reader_common.cc", + "table/block_based/filter_policy.cc", + "table/block_based/flush_block_policy.cc", + "table/block_based/full_filter_block.cc", + "table/block_based/hash_index_reader.cc", + "table/block_based/index_builder.cc", + "table/block_based/index_reader_common.cc", + "table/block_based/parsed_full_filter_block.cc", + "table/block_based/partitioned_filter_block.cc", + "table/block_based/partitioned_index_iterator.cc", + "table/block_based/partitioned_index_reader.cc", + 
"table/block_based/reader_common.cc", + "table/block_based/uncompression_dict_reader.cc", + "table/block_fetcher.cc", + "table/cuckoo/cuckoo_table_builder.cc", + "table/cuckoo/cuckoo_table_factory.cc", + "table/cuckoo/cuckoo_table_reader.cc", + "table/format.cc", + "table/get_context.cc", + "table/iterator.cc", + "table/merging_iterator.cc", + "table/meta_blocks.cc", + "table/persistent_cache_helper.cc", + "table/plain/plain_table_bloom.cc", + "table/plain/plain_table_builder.cc", + "table/plain/plain_table_factory.cc", + "table/plain/plain_table_index.cc", + "table/plain/plain_table_key_coding.cc", + "table/plain/plain_table_reader.cc", + "table/sst_file_dumper.cc", + "table/sst_file_reader.cc", + "table/sst_file_writer.cc", + "table/table_factory.cc", + "table/table_properties.cc", + "table/two_level_iterator.cc", + "table/unique_id.cc", + "test_util/sync_point.cc", + "test_util/sync_point_impl.cc", + "test_util/transaction_test_util.cc", + "tools/dump/db_dump_tool.cc", + "tools/io_tracer_parser_tool.cc", + "tools/ldb_cmd.cc", + "tools/ldb_tool.cc", + "tools/sst_dump_tool.cc", + "trace_replay/block_cache_tracer.cc", + "trace_replay/io_tracer.cc", + "trace_replay/trace_record.cc", + "trace_replay/trace_record_handler.cc", + "trace_replay/trace_record_result.cc", + "trace_replay/trace_replay.cc", + "util/build_version.cc", + "util/coding.cc", + "util/compaction_job_stats_impl.cc", + "util/comparator.cc", + "util/compression_context_cache.cc", + "util/concurrent_task_limiter_impl.cc", + "util/crc32c.cc", + "util/crc32c_arm64.cc", + "util/dynamic_bloom.cc", + "util/file_checksum_helper.cc", + "util/hash.cc", + "util/murmurhash.cc", + "util/random.cc", + "util/rate_limiter.cc", + "util/regex.cc", + "util/ribbon_config.cc", + "util/slice.cc", + "util/status.cc", + "util/string_util.cc", + "util/thread_local.cc", + "util/threadpool_imp.cc", + "util/xxhash.cc", + "utilities/backupable/backupable_db.cc", + "utilities/blob_db/blob_compaction_filter.cc", + 
"utilities/blob_db/blob_db.cc", + "utilities/blob_db/blob_db_impl.cc", + "utilities/blob_db/blob_db_impl_filesnapshot.cc", + "utilities/blob_db/blob_dump_tool.cc", + "utilities/blob_db/blob_file.cc", + "utilities/cache_dump_load.cc", + "utilities/cache_dump_load_impl.cc", + "utilities/cassandra/cassandra_compaction_filter.cc", + "utilities/cassandra/format.cc", + "utilities/cassandra/merge_operator.cc", + "utilities/checkpoint/checkpoint_impl.cc", + "utilities/compaction_filters.cc", + "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", + "utilities/convenience/info_log_finder.cc", + "utilities/debug.cc", + "utilities/env_mirror.cc", + "utilities/env_timed.cc", + "utilities/fault_injection_env.cc", + "utilities/fault_injection_fs.cc", + "utilities/fault_injection_secondary_cache.cc", + "utilities/leveldb_options/leveldb_options.cc", + "utilities/memory/memory_util.cc", + "utilities/merge_operators.cc", + "utilities/merge_operators/bytesxor.cc", + "utilities/merge_operators/max.cc", + "utilities/merge_operators/put.cc", + "utilities/merge_operators/sortlist.cc", + "utilities/merge_operators/string_append/stringappend.cc", + "utilities/merge_operators/string_append/stringappend2.cc", + "utilities/merge_operators/uint64add.cc", + "utilities/object_registry.cc", + "utilities/option_change_migration/option_change_migration.cc", + "utilities/options/options_util.cc", + "utilities/persistent_cache/block_cache_tier.cc", + "utilities/persistent_cache/block_cache_tier_file.cc", + "utilities/persistent_cache/block_cache_tier_metadata.cc", + "utilities/persistent_cache/persistent_cache_tier.cc", + "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/simulator_cache/cache_simulator.cc", + "utilities/simulator_cache/sim_cache.cc", + "utilities/table_properties_collectors/compact_on_deletion_collector.cc", + "utilities/trace/file_trace_reader_writer.cc", + "utilities/trace/replayer_impl.cc", + "utilities/transactions/lock/lock_manager.cc", + 
"utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", + "utilities/transactions/optimistic_transaction.cc", + "utilities/transactions/optimistic_transaction_db_impl.cc", + "utilities/transactions/pessimistic_transaction.cc", + "utilities/transactions/pessimistic_transaction_db.cc", + "utilities/transactions/snapshot_checker.cc", + "utilities/transactions/transaction_base.cc", + "utilities/transactions/transaction_db_mutex_impl.cc", + "utilities/transactions/transaction_util.cc", + "utilities/transactions/write_prepared_txn.cc", + "utilities/transactions/write_prepared_txn_db.cc", + "utilities/transactions/write_unprepared_txn.cc", + "utilities/transactions/write_unprepared_txn_db.cc", + "utilities/ttl/db_ttl_impl.cc", + "utilities/wal_filter.cc", + "utilities/write_batch_with_index/write_batch_with_index.cc", + 
"utilities/write_batch_with_index/write_batch_with_index_internal.cc", + ], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = True, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + exported_deps = [], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) cpp_library( @@ -366,7 +803,7 @@ srcs = [ "db/db_test_util.cc", "table/mock_table.cc", - "test_util/fault_injection_test_env.cc", + "test_util/mock_time_env.cc", "test_util/testharness.cc", "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", @@ -376,11 +813,15 @@ auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [":rocksdb_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS + [ + ("googletest", None, "gtest"), + ], ) cpp_library( @@ -389,16 +830,34 @@ "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", + "tools/simulated_hybrid_file_system.cc", "tools/trace_analyzer_tool.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + exported_deps = [":rocksdb_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, +) + 
+cpp_library( + name = "rocksdb_cache_bench_tools_lib", + srcs = ["cache/cache_bench_tool.cc"], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [":rocksdb_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) cpp_library( @@ -409,9 +868,13 @@ "db_stress_tool/db_stress_common.cc", "db_stress_tool/db_stress_driver.cc", "db_stress_tool/db_stress_gflags.cc", + "db_stress_tool/db_stress_listener.cc", "db_stress_tool/db_stress_shared_state.cc", + "db_stress_tool/db_stress_stat.cc", "db_stress_tool/db_stress_test_base.cc", "db_stress_tool/db_stress_tool.cc", + "db_stress_tool/expected_state.cc", + "db_stress_tool/multi_ops_txns_stress.cc", "db_stress_tool/no_batched_ops_stress.cc", "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", @@ -420,24 +883,47 @@ auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = ROCKSDB_LIB_DEPS, + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"], +) 
if not is_opt_mode else None + +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None + cpp_library( name = "env_basic_test_lib", srcs = ["env/env_basic_test.cc"], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = False, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_test_lib"], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [":rocksdb_test_lib"], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) # [test_name, test_src, test_type, extra_deps, extra_compiler_flags] @@ -445,21 +931,21 @@ [ "arena_test", "memory/arena_test.cc", - "serial", + "parallel", [], [], ], [ "auto_roll_logger_test", "logging/auto_roll_logger_test.cc", - "serial", + "parallel", [], [], ], [ "autovector_test", "util/autovector_test.cc", - "serial", + "parallel", [], [], ], @@ -471,233 +957,345 @@ [], ], [ + "blob_counting_iterator_test", + "db/blob/blob_counting_iterator_test.cc", + "parallel", + [], + [], + ], + [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "blob_file_addition_test", + "db/blob/blob_file_addition_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_builder_test", + "db/blob/blob_file_builder_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_cache_test", + "db/blob/blob_file_cache_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_garbage_test", + "db/blob/blob_file_garbage_test.cc", + "parallel", + [], + [], + ], + [ + "blob_file_reader_test", + "db/blob/blob_file_reader_test.cc", + "parallel", + [], + [], + ], + [ + "blob_garbage_meter_test", + "db/blob/blob_garbage_meter_test.cc", + 
"parallel", [], [], ], [ "block_based_filter_block_test", "table/block_based/block_based_filter_block_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "block_based_table_reader_test", + "table/block_based/block_based_table_reader_test.cc", + "parallel", [], [], ], [ "block_cache_trace_analyzer_test", "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", - "serial", + "parallel", [], [], ], [ "block_cache_tracer_test", "trace_replay/block_cache_tracer_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "block_fetcher_test", + "table/block_fetcher_test.cc", + "parallel", [], [], ], [ "block_test", "table/block_based/block_test.cc", - "serial", + "parallel", [], [], ], [ "bloom_test", "util/bloom_test.cc", - "serial", + "parallel", [], [], ], [ - "c_test", - "db/c_test.c", - "serial", + "cache_reservation_manager_test", + "cache/cache_reservation_manager_test.cc", + "parallel", [], [], ], [ "cache_simulator_test", "utilities/simulator_cache/cache_simulator_test.cc", - "serial", + "parallel", [], [], ], [ "cache_test", "cache/cache_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", - "serial", + "parallel", [], [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", - "serial", + "parallel", [], [], ], [ "cleanable_test", "table/cleanable_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "clipping_iterator_test", + "db/compaction/clipping_iterator_test.cc", + "parallel", [], [], ], [ "coding_test", "util/coding_test.cc", - "serial", + "parallel", [], [], ], [ "column_family_test", 
"db/column_family_test.cc", - "serial", + "parallel", [], [], ], [ "compact_files_test", "db/compact_files_test.cc", - "serial", + "parallel", [], [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_iterator_test", "db/compaction/compaction_iterator_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_stats_test", "db/compaction/compaction_job_stats_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_test", "db/compaction/compaction_job_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_picker_test", "db/compaction/compaction_picker_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "compaction_service_test", + "db/compaction/compaction_service_test.cc", + "parallel", [], [], ], [ "comparator_db_test", "db/comparator_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "configurable_test", + "options/configurable_test.cc", + "parallel", [], [], ], [ "corruption_test", "db/corruption_test.cc", - "serial", + "parallel", [], [], ], [ "crc32c_test", "util/crc32c_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_builder_test", "table/cuckoo/cuckoo_table_builder_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_reader_test", "table/cuckoo/cuckoo_table_reader_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "customizable_test", + "options/customizable_test.cc", + "parallel", [], [], ], [ "data_block_hash_index_test", "table/block_based/data_block_hash_index_test.cc", - "serial", + "parallel", [], [], ], [ "db_basic_test", "db/db_basic_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_blob_basic_test", + "db/blob/db_blob_basic_test.cc", + "parallel", + [], + [], + ], + [ + "db_blob_compaction_test", + "db/blob/db_blob_compaction_test.cc", + "parallel", + [], + 
[], + ], + [ + "db_blob_corruption_test", + "db/blob/db_blob_corruption_test.cc", + "parallel", [], [], ], [ "db_blob_index_test", - "db/db_blob_index_test.cc", - "serial", + "db/blob/db_blob_index_test.cc", + "parallel", [], [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -725,77 +1323,91 @@ [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", - "serial", + "parallel", [], [], ], [ "db_encryption_test", "db/db_encryption_test.cc", - "serial", + "parallel", [], [], ], [ "db_flush_test", "db/db_flush_test.cc", - "serial", + "parallel", [], [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", - "serial", + "parallel", [], [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_test", "db/db_iter_test.cc", - "serial", + "parallel", [], [], ], [ "db_iterator_test", "db/db_iterator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_kv_checksum_test", + "db/db_kv_checksum_test.cc", + "parallel", [], [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_logical_block_size_cache_test", + "db/db_logical_block_size_cache_test.cc", + "parallel", [], [], ], [ "db_memtable_test", "db/db_memtable_test.cc", - "serial", + "parallel", [], [], ], [ "db_merge_operand_test", "db/db_merge_operand_test.cc", - "serial", + "parallel", [], [], ], @@ -809,28 +1421,28 @@ [ "db_options_test", "db/db_options_test.cc", - "serial", + "parallel", [], [], ], [ "db_properties_test", "db/db_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_range_del_test", "db/db_range_del_test.cc", - "serial", + "parallel", [], [], ], [ "db_secondary_test", - "db/db_impl/db_secondary_test.cc", - "serial", + "db/db_secondary_test.cc", + "parallel", [], [], ], @@ -844,21 +1456,21 @@ [ "db_statistics_test", 
"db/db_statistics_test.cc", - "serial", + "parallel", [], [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", - "serial", + "parallel", [], [], ], @@ -872,7 +1484,7 @@ [ "db_test2", "db/db_test2.cc", - "serial", + "parallel", [], [], ], @@ -891,30 +1503,51 @@ [], ], [ + "db_with_timestamp_basic_test", + "db/db_with_timestamp_basic_test.cc", + "parallel", + [], + [], + ], + [ + "db_with_timestamp_compaction_test", + "db/db_with_timestamp_compaction_test.cc", + "parallel", + [], + [], + ], + [ + "db_write_buffer_manager_test", + "db/db_write_buffer_manager_test.cc", + "parallel", + [], + [], + ], + [ "db_write_test", "db/db_write_test.cc", - "serial", + "parallel", [], [], ], [ "dbformat_test", "db/dbformat_test.cc", - "serial", + "parallel", [], [], ], [ "defer_test", "util/defer_test.cc", - "serial", + "parallel", [], [], ], [ "delete_scheduler_test", "file/delete_scheduler_test.cc", - "serial", + "parallel", [], [], ], @@ -928,21 +1561,21 @@ [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", - "serial", + "parallel", [], [], ], [ "env_basic_test", "env/env_basic_test.cc", - "serial", + "parallel", [], [], ], [ "env_logger_test", "logging/env_logger_test.cc", - "serial", + "parallel", [], [], ], @@ -956,28 +1589,28 @@ [ "env_timed_test", "utilities/env_timed_test.cc", - "serial", + "parallel", [], [], ], [ - "error_handler_test", - "db/error_handler_test.cc", - "serial", + "error_handler_fs_test", + "db/error_handler_fs_test.cc", + "parallel", [], [], ], [ "event_logger_test", "logging/event_logger_test.cc", - "serial", + "parallel", [], [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", - "serial", + "parallel", [], [], ], @@ -998,7 +1631,7 @@ [ "file_indexer_test", "db/file_indexer_test.cc", - "serial", + "parallel", [], [], ], @@ -1012,56 +1645,56 @@ [ "filelock_test", "util/filelock_test.cc", - "serial", + "parallel", 
[], [], ], [ "filename_test", "db/filename_test.cc", - "serial", + "parallel", [], [], ], [ "flush_job_test", "db/flush_job_test.cc", - "serial", + "parallel", [], [], ], [ "full_filter_block_test", "table/block_based/full_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", - "serial", + "parallel", [], [], ], [ "hash_test", "util/hash_test.cc", - "serial", + "parallel", [], [], ], [ "heap_test", "util/heap_test.cc", - "serial", + "parallel", [], [], ], [ "histogram_test", "monitoring/histogram_test.cc", - "serial", + "parallel", [], [], ], @@ -1080,37 +1713,58 @@ [], ], [ + "io_posix_test", + "env/io_posix_test.cc", + "parallel", + [], + [], + ], + [ + "io_tracer_parser_test", + "tools/io_tracer_parser_test.cc", + "parallel", + [], + [], + ], + [ + "io_tracer_test", + "trace_replay/io_tracer_test.cc", + "parallel", + [], + [], + ], + [ "iostats_context_test", "monitoring/iostats_context_test.cc", - "serial", + "parallel", [], [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", - "serial", + "parallel", [], [], ], [ "listener_test", "db/listener_test.cc", - "serial", + "parallel", [], [], ], [ "log_test", "db/log_test.cc", - "serial", + "parallel", [], [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -1122,114 +1776,128 @@ [], ], [ + "memory_allocator_test", + "memory/memory_allocator_test.cc", + "parallel", + [], + [], + ], + [ "memory_test", "utilities/memory/memory_test.cc", - "serial", + "parallel", [], [], ], [ "memtable_list_test", "db/memtable_list_test.cc", - "serial", + "parallel", [], [], ], [ "merge_helper_test", "db/merge_helper_test.cc", - "serial", + "parallel", [], [], ], [ "merge_test", "db/merge_test.cc", - "serial", + "parallel", [], [], ], [ "merger_test", "table/merger_test.cc", - "serial", + "parallel", [], [], ], [ "mock_env_test", "env/mock_env_test.cc", - "serial", + "parallel", [], [], ], [ "object_registry_test", 
"utilities/object_registry_test.cc", - "serial", + "parallel", [], [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", - "serial", + "parallel", [], [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", - "serial", + "parallel", [], [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", - "serial", + "parallel", [], [], ], [ "options_file_test", "db/options_file_test.cc", - "serial", + "parallel", [], [], ], [ "options_settable_test", "options/options_settable_test.cc", - "serial", + "parallel", [], [], ], [ "options_test", "options/options_test.cc", - "serial", + "parallel", [], [], ], [ "options_util_test", "utilities/options/options_util_test.cc", - "serial", + "parallel", [], [], ], [ "partitioned_filter_block_test", "table/block_based/partitioned_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "perf_context_test", "db/perf_context_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "periodic_work_scheduler_test", + "db/periodic_work_scheduler_test.cc", + "parallel", [], [], ], @@ -1243,133 +1911,168 @@ [ "plain_table_db_test", "db/plain_table_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "point_lock_manager_test", + "utilities/transactions/lock/point/point_lock_manager_test.cc", + "parallel", + [], + [], + ], + [ + "prefetch_test", + "file/prefetch_test.cc", + "parallel", [], [], ], [ "prefix_test", "db/prefix_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "random_access_file_reader_test", + "file/random_access_file_reader_test.cc", + "parallel", [], [], ], [ "random_test", "util/random_test.cc", - "serial", + "parallel", [], [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "range_locking_test", + "utilities/transactions/lock/range/range_locking_test.cc", + "parallel", [], [], ], [ "range_tombstone_fragmenter_test", 
"db/range_tombstone_fragmenter_test.cc", - "serial", + "parallel", [], [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", - "serial", + "parallel", [], [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", - "serial", + "parallel", [], [], ], [ "repair_test", "db/repair_test.cc", - "serial", + "parallel", [], [], ], [ "repeatable_thread_test", "util/repeatable_thread_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "ribbon_test", + "util/ribbon_test.cc", + "parallel", [], [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", - "serial", + "parallel", [], [], ], [ "skiplist_test", "memtable/skiplist_test.cc", - "serial", + "parallel", [], [], ], [ "slice_test", "util/slice_test.cc", - "serial", + "parallel", [], [], ], [ "slice_transform_test", "util/slice_transform_test.cc", - "serial", + "parallel", [], [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", - "serial", + "parallel", [], [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "statistics_test", "monitoring/statistics_test.cc", - "serial", + "parallel", [], [], ], [ "stats_history_test", "monitoring/stats_history_test.cc", - "serial", + "parallel", [], [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", - "serial", + "parallel", [], [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", - "serial", + "parallel", [], [], ], @@ -1381,30 +2084,44 @@ [], ], [ + "testutil_test", + "test_util/testutil_test.cc", + "parallel", + [], + [], + ], + [ "thread_list_test", "util/thread_list_test.cc", - "serial", + "parallel", [], [], ], [ "thread_local_test", "util/thread_local_test.cc", - "serial", + "parallel", [], [], ], [ "timer_queue_test", "util/timer_queue_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "timer_test", + "util/timer_test.cc", + "parallel", [], [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", - "serial", + 
"parallel", [], [], ], @@ -1418,77 +2135,84 @@ [ "ttl_test", "utilities/ttl/ttl_test.cc", - "serial", + "parallel", [], [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", - "serial", + "parallel", [], [], ], [ "version_builder_test", "db/version_builder_test.cc", - "serial", + "parallel", [], [], ], [ "version_edit_test", "db/version_edit_test.cc", - "serial", + "parallel", [], [], ], [ "version_set_test", "db/version_set_test.cc", - "serial", + "parallel", [], [], ], [ "wal_manager_test", "db/wal_manager_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "work_queue_test", + "util/work_queue_test.cc", + "parallel", [], [], ], [ "write_batch_test", "db/write_batch_test.cc", - "serial", + "parallel", [], [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", - "serial", + "parallel", [], [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", - "serial", + "parallel", [], [], ], [ "write_callback_test", "db/write_callback_test.cc", - "serial", + "parallel", [], [], ], [ "write_controller_test", "db/write_controller_test.cc", - "serial", + "parallel", [], [], ], @@ -1512,18 +2236,18 @@ # Do not build the tests in opt mode, since SyncPoint and other test code # will not be included. 
[ - test_binary( - extra_compiler_flags = extra_compiler_flags, - extra_deps = extra_deps, - parallelism = parallelism, - rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, - rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, - rocksdb_os_deps = ROCKSDB_OS_DEPS, - rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - test_cc = test_cc, - test_name = test_name, + cpp_unittest( + name = test_name, + srcs = [test_cc], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"] + extra_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS + [ + ("googletest", None, "gtest"), + ], ) for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/USERS.md mariadb-10.11.13/storage/rocksdb/rocksdb/USERS.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/USERS.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/USERS.md 2025-05-19 16:14:27.000000000 +0000 @@ -26,6 +26,9 @@ ## Yahoo Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights +## Baidu +[Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata. + ## CockroachDB CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. 
Check out their github: https://github.com/cockroachdb/cockroach @@ -44,7 +47,7 @@ Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters. Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf -## Santanader UK/Cloudera Profession Services +## Santander UK/Cloudera Profession Services Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/ ## Airbnb @@ -67,7 +70,7 @@ [VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed. ## quasardb -[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. +[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. quasardb uses a heavily tuned RocksDB as its persistence layer. ## Netflix @@ -86,7 +89,7 @@ [Uber](http://eng.uber.com/cherami/) uses RocksDB as a durable and scalable task queue. ## 360 Pika -[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been widely used in many company +[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been used in many companies. ## LzLabs LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data. 
@@ -96,13 +99,28 @@ ## IOTA Foundation [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. - + ## Avrio Project [Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances and data and other blockchain-releated data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions. - + ## Crux [Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability. ## Nebula Graph - [Nebula Graph](https://github.com/vesoft-inc/nebula) is a distributed, scalable, lightning-fast, open source graph database capable of hosting super large scale graphs with dozens of billions of vertices (nodes) and trillions of edges, with milliseconds of latency. + +## YugabyteDB +[YugabyteDB](https://www.yugabyte.com/) is an open source, high performance, distributed SQL database that uses RocksDB as its storage layer. For more information, please see https://github.com/yugabyte/yugabyte-db/. + +## ArangoDB +[ArangoDB](https://www.arangodb.com/) is a native multi-model database with flexible data models for documents, graphs, and key-values, for building high performance applications using a convenient SQL-like query language or JavaScript extensions. It uses RocksDB as its storage engine. + +## Milvus +[Milvus](https://milvus.io/) is an open source vector database for unstructured data. It uses RocksDB not only as one of the supported kv storage engines, but also as a message queue. 
+ +## Kafka +[Kafka](https://kafka.apache.org/) is an open-source distributed event streaming platform, it uses RocksDB to store state in Kafka Streams: https://www.confluent.io/blog/how-to-tune-rocksdb-kafka-streams-state-stores-performance/. + +## Others +More databases using RocksDB can be found at [dbdb.io](https://dbdb.io/browse?embeds=rocksdb). + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/WINDOWS_PORT.md mariadb-10.11.13/storage/rocksdb/rocksdb/WINDOWS_PORT.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/WINDOWS_PORT.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/WINDOWS_PORT.md 2025-05-19 16:14:27.000000000 +0000 @@ -24,7 +24,7 @@ * make all unit test pass both in debug and release builds. * Note: latest introduction of SyncPoint seems to disable running db_test in Release. * make performance on par with published benchmarks accounting for HW differences -* we would like to keep the port code inline with the master branch with no forking +* we would like to keep the port code inline with the main branch with no forking ## Build system We have chosen CMake as a widely accepted build system to build the Windows port. It is very fast and convenient. @@ -66,7 +66,7 @@ Even though Windows provides its own efficient thread-pool implementation we chose to replicate posix logic using `std::thread` primitives. This allows anyone to quickly detect any changes within the posix source code and replicate them within windows env. This has proven to work very well. At the same time for anyone who wishes to replace the built-in thread-pool can do so using RocksDB stackable environments. For disk access we implemented all of the functionality present within the posix_env which includes memory mapped files, random access, rate-limiter support etc. -The `use_os_buffer` flag on Posix platforms currently denotes disabling read-ahead log via `fadvise` mechanism. Windows does not have `fadvise` system call. 
What is more, it implements disk cache in a way that differs from Linux greatly. It’s not an uncommon practice on Windows to perform un-buffered disk access to gain control of the memory consumption. We think that in our use case this may also be a good configuration option at the expense of disk throughput. To compensate one may increase the configured in-memory cache size instead. Thus we have chosen `use_os_buffer=false` to disable OS disk buffering for `WinWritableFile` and `WinRandomAccessFile`. The OS imposes restrictions on the alignment of the disk offsets, buffers used and the amount of data that is read/written when accessing files in un-buffered mode. When the option is true, the classes behave in a standard way. This allows to perform writes and reads in cases when un-buffered access does not make sense such as WAL and MANIFEST. +The `use_os_buffer` flag on Posix platforms currently denotes disabling read-ahead log via `fadvise` mechanism. Windows does not have `fadvise` system call. What is more, it implements disk cache in a way that differs from Linux greatly. It's not an uncommon practice on Windows to perform un-buffered disk access to gain control of the memory consumption. We think that in our use case this may also be a good configuration option at the expense of disk throughput. To compensate one may increase the configured in-memory cache size instead. Thus we have chosen `use_os_buffer=false` to disable OS disk buffering for `WinWritableFile` and `WinRandomAccessFile`. The OS imposes restrictions on the alignment of the disk offsets, buffers used and the amount of data that is read/written when accessing files in un-buffered mode. When the option is true, the classes behave in a standard way. This allows to perform writes and reads in cases when un-buffered access does not make sense such as WAL and MANIFEST. 
We have replaced `pread/pwrite` with `WriteFile/ReadFile` with `OVERLAPPED` structure so we can atomically seek to the position of the disk operation but still perform the operation synchronously. Thus we able to emulate that functionality of `pread/pwrite` reasonably well. The only difference is that the file pointer is not returned to its original position but that hardly matters given the random nature of access. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/appveyor.yml mariadb-10.11.13/storage/rocksdb/rocksdb/appveyor.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/appveyor.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/appveyor.yml 2025-05-19 16:14:27.000000000 +0000 @@ -1,6 +1,6 @@ version: 1.0.{build} -image: Visual Studio 2017 +image: Visual Studio 2019 environment: JAVA_HOME: C:\Program Files\Java\jdk1.8.0 @@ -21,9 +21,6 @@ - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 CMAKE_GENERATOR: Visual Studio 14 Win64 DEV_ENV: C:\Program Files (x86)\Microsoft Visual Studio 14.0\Common7\IDE\devenv.com - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - CMAKE_GENERATOR: Visual Studio 15 Win64 - DEV_ENV: C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com install: - md %THIRDPARTY_HOME% @@ -34,7 +31,8 @@ - cd snappy-1.1.7 - mkdir build - cd build - - cmake -G "%CMAKE_GENERATOR%" .. + - if DEFINED CMAKE_PLATEFORM_NAME (set "PLATEFORM_OPT=-A %CMAKE_PLATEFORM_NAME%") + - cmake .. -G "%CMAKE_GENERATOR%" %PLATEFORM_OPT% - msbuild Snappy.sln /p:Configuration=Debug /p:Platform=x64 - msbuild Snappy.sln /p:Configuration=Release /p:Platform=x64 - echo "Building LZ4 dependency..." @@ -57,7 +55,8 @@ before_build: - md %APPVEYOR_BUILD_FOLDER%\build - cd %APPVEYOR_BUILD_FOLDER%\build - - cmake -G "%CMAKE_GENERATOR%" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 .. 
+ - if DEFINED CMAKE_PLATEFORM_NAME (set "PLATEFORM_OPT=-A %CMAKE_PLATEFORM_NAME%") + - cmake .. -G "%CMAKE_GENERATOR%" %PLATEFORM_OPT% %CMAKE_OPT% -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 -DWITH_ALL_TESTS=0 - cd .. build: @@ -68,7 +67,7 @@ test: test_script: - - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test,db_merge_operand_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,env_basic_test -Concurrency 8 on_failure: - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/buckify_rocksdb.py 2025-05-19 16:14:27.000000000 +0000 @@ -20,14 +20,14 @@ # User can pass extra dependencies as a JSON object via command line, and this # script can include these dependencies in the generate TARGETS file. # Usage: -# $python buckifier/buckify_rocksdb.py +# $python3 buckifier/buckify_rocksdb.py # (This generates a TARGET file without user-specified dependency for unit # tests.) 
-# $python buckifier/buckify_rocksdb.py \ -# '{"fake": { \ -# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ -# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ -# } \ +# $python3 buckifier/buckify_rocksdb.py \ +# '{"fake": { +# "extra_deps": [":test_dep", "//fakes/module:mock1"], +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"] +# } # }' # (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB # unit tests, and will use the extra_compiler_flags to compile the unit test @@ -48,8 +48,8 @@ if '=' in line: current_src = line.split('=')[0].strip() src_files[current_src] = [] - elif '.cc' in line: - src_path = line.split('.cc')[0].strip() + '.cc' + elif '.c' in line: + src_path = line.split('\\')[0].strip() src_files[current_src].append(src_path) return src_files @@ -69,45 +69,28 @@ return cc_files -# Get tests from Makefile -def get_tests(repo_path): +# Get non_parallel tests from Makefile +def get_non_parallel_tests(repo_path): Makefile = repo_path + "/Makefile" - # Dictionary TEST_NAME => IS_PARALLEL - tests = {} + s = set({}) - found_tests = False + found_non_parallel_tests = False for line in open(Makefile): line = line.strip() - if line.startswith("TESTS ="): - found_tests = True - elif found_tests: + if line.startswith("NON_PARALLEL_TEST ="): + found_non_parallel_tests = True + elif found_non_parallel_tests: if line.endswith("\\"): # remove the trailing \ line = line[:-1] line = line.strip() - tests[line] = False + s.add(line) else: - # we consumed all the tests + # we consumed all the non_parallel tests break - found_parallel_tests = False - for line in open(Makefile): - line = line.strip() - if line.startswith("PARALLEL_TEST ="): - found_parallel_tests = True - elif found_parallel_tests: - if line.endswith("\\"): - # remove the trailing \ - line = line[:-1] - line = line.strip() - tests[line] = True - else: - # we consumed all the parallel tests - break - - return tests - + return s # Parse extra dependencies passed by user 
from command line def get_dependencies(): @@ -140,18 +123,38 @@ src_mk = parse_src_mk(repo_path) # get all .cc files cc_files = get_cc_files(repo_path) - # get tests from Makefile - tests = get_tests(repo_path) + # get non_parallel tests from Makefile + non_parallel_tests = get_non_parallel_tests(repo_path) - if src_mk is None or cc_files is None or tests is None: + if src_mk is None or cc_files is None or non_parallel_tests is None: return False - TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path) + extra_argv = "" + if len(sys.argv) >= 2: + # Heuristically quote and canonicalize whitespace for inclusion + # in how the file was generated. + extra_argv = " '{0}'".format(" ".join(sys.argv[1].split())) + + TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv) + # rocksdb_lib TARGETS.add_library( "rocksdb_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"]) + # rocksdb_whole_archive_lib + TARGETS.add_library( + "rocksdb_whole_archive_lib", + src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + + src_mk["TOOL_LIB_SOURCES"], + deps=None, + headers=None, + extra_external_deps="", + link_whole=True) # rocksdb_test_lib TARGETS.add_library( "rocksdb_test_lib", @@ -159,7 +162,10 @@ src_mk.get("TEST_LIB_SOURCES", []) + src_mk.get("EXP_LIB_SOURCES", []) + src_mk.get("ANALYZER_LIB_SOURCES", []), - [":rocksdb_lib"]) + [":rocksdb_lib"], + extra_external_deps=""" + [ + ("googletest", None, "gtest"), + ]""") # rocksdb_tools_lib TARGETS.add_library( "rocksdb_tools_lib", @@ -167,41 +173,56 @@ src_mk.get("ANALYZER_LIB_SOURCES", []) + ["test_util/testutil.cc"], [":rocksdb_lib"]) - # rocksdb_stress_lib + # rocksdb_cache_bench_tools_lib TARGETS.add_library( + "rocksdb_cache_bench_tools_lib", + src_mk.get("CACHE_BENCH_LIB_SOURCES", []), + [":rocksdb_lib"]) + # 
rocksdb_stress_lib + TARGETS.add_rocksdb_library( "rocksdb_stress_lib", src_mk.get("ANALYZER_LIB_SOURCES", []) + src_mk.get('STRESS_LIB_SOURCES', []) - + ["test_util/testutil.cc"], - [":rocksdb_lib"]) + + ["test_util/testutil.cc"]) + + print("Extra dependencies:\n{0}".format(json.dumps(deps_map))) + + # Dictionary test executable name -> relative source file path + test_source_map = {} + print(src_mk) + + # c_test.c is added through TARGETS.add_c_test(). If there + # are more than one .c test file, we need to extend + # TARGETS.add_c_test() to include other C tests too. + for test_src in src_mk.get("TEST_MAIN_SOURCES_C", []): + if test_src != 'db/c_test.c': + print("Don't know how to deal with " + test_src) + return False + TARGETS.add_c_test() + + for test_src in src_mk.get("TEST_MAIN_SOURCES", []): + test = test_src.split('.c')[0].strip().split('/')[-1].strip() + test_source_map[test] = test_src + print("" + test + " " + test_src) - print("Extra dependencies:\n{0}".format(str(deps_map))) - # test for every test we found in the Makefile for target_alias, deps in deps_map.items(): - for test in sorted(tests): - match_src = [src for src in cc_files if ("/%s.c" % test) in src] - if len(match_src) == 0: - print(ColorString.warning("Cannot find .cc file for %s" % test)) - continue - elif len(match_src) > 1: - print(ColorString.warning("Found more than one .cc for %s" % test)) - print(match_src) + for test, test_src in sorted(test_source_map.items()): + if len(test) == 0: + print(ColorString.warning("Failed to get test name for %s" % test_src)) continue - assert(len(match_src) == 1) - is_parallel = tests[test] test_target_name = \ test if not target_alias else test + "_" + target_alias TARGETS.register_test( test_target_name, - match_src[0], - is_parallel, - deps['extra_deps'], - deps['extra_compiler_flags']) + test_src, + test not in non_parallel_tests, + json.dumps(deps['extra_deps']), + json.dumps(deps['extra_compiler_flags'])) if test in _EXPORTED_TEST_LIBS: 
test_library = "%s_lib" % test_target_name - TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) + TARGETS.add_library(test_library, [test_src], [":rocksdb_test_lib"]) TARGETS.flush_tests() print(ColorString.info("Generated TARGETS Summary:")) @@ -220,6 +241,7 @@ return rocksdb_path + def exit_with_error(msg): print(ColorString.error(msg)) sys.exit(1) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/check_buck_targets.sh 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# If clang_format_diff.py command is not specfied, we assume we are able to +# access directly without any path. + +TGT_DIFF=`git diff TARGETS | head -n 1` + +if [ ! -z "$TGT_DIFF" ] +then + echo "TARGETS file has uncommitted changes. Skip this check." + exit 0 +fi + +echo Backup original TARGETS file. + +cp TARGETS TARGETS.bkp + +${PYTHON:-python3} buckifier/buckify_rocksdb.py + +TGT_DIFF=`git diff TARGETS | head -n 1` + +if [ -z "$TGT_DIFF" ] +then + mv TARGETS.bkp TARGETS + exit 0 +else + echo "Please run '${PYTHON:-python3} buckifier/buckify_rocksdb.py' to update TARGETS file." + echo "Do not manually update TARGETS file." 
+ ${PYTHON:-python3} --version + mv TARGETS.bkp TARGETS + exit 1 +fi diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_builder.py mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_builder.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_builder.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_builder.py 2025-05-19 16:14:27.000000000 +0000 @@ -25,10 +25,12 @@ class TARGETSBuilder(object): - def __init__(self, path): + def __init__(self, path, extra_argv): self.path = path - self.targets_file = open(path, 'w') - self.targets_file.write(targets_cfg.rocksdb_target_header) + self.targets_file = open(path, 'wb') + header = targets_cfg.rocksdb_target_header_template.format( + extra_argv=extra_argv) + self.targets_file.write(header.encode("utf-8")) self.total_lib = 0 self.total_bin = 0 self.total_test = 0 @@ -37,26 +39,68 @@ def __del__(self): self.targets_file.close() - def add_library(self, name, srcs, deps=None, headers=None): + def add_library(self, name, srcs, deps=None, headers=None, + extra_external_deps="", link_whole=False): headers_attr_prefix = "" if headers is None: headers_attr_prefix = "auto_" headers = "AutoHeaders.RECURSIVE_GLOB" + else: + headers = "[" + pretty_list(headers) + "]" self.targets_file.write(targets_cfg.library_template.format( name=name, srcs=pretty_list(srcs), headers_attr_prefix=headers_attr_prefix, headers=headers, - deps=pretty_list(deps))) + deps=pretty_list(deps), + extra_external_deps=extra_external_deps, + link_whole=link_whole).encode("utf-8")) + self.total_lib = self.total_lib + 1 + + def add_rocksdb_library(self, name, srcs, headers=None): + headers_attr_prefix = "" + if headers is None: + headers_attr_prefix = "auto_" + headers = "AutoHeaders.RECURSIVE_GLOB" + else: + headers = "[" + pretty_list(headers) + "]" + self.targets_file.write(targets_cfg.rocksdb_library_template.format( + name=name, + srcs=pretty_list(srcs), + 
headers_attr_prefix=headers_attr_prefix, + headers=headers).encode("utf-8")) self.total_lib = self.total_lib + 1 def add_binary(self, name, srcs, deps=None): - self.targets_file.write(targets_cfg.binary_template % ( - name, - pretty_list(srcs), - pretty_list(deps))) + self.targets_file.write(targets_cfg.binary_template.format( + name=name, + srcs=pretty_list(srcs), + deps=pretty_list(deps)).encode("utf-8")) self.total_bin = self.total_bin + 1 + def add_c_test(self): + self.targets_file.write(b""" +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"], +) if not is_opt_mode else None + +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None +""") + def register_test(self, test_name, src, @@ -76,5 +120,5 @@ self.total_test = self.total_test + 1 def flush_tests(self): - self.targets_file.write(targets_cfg.unittests_template % self.tests_cfg) + self.targets_file.write(targets_cfg.unittests_template.format(tests=self.tests_cfg).encode("utf-8")) self.tests_cfg = "" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_cfg.py mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_cfg.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/buckifier/targets_cfg.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/buckifier/targets_cfg.py 2025-05-19 16:14:27.000000000 +0000 @@ -4,7 +4,9 @@ from __future__ import print_function from __future__ import unicode_literals -rocksdb_target_header = """# This file \100generated by `python buckifier/buckify_rocksdb.py` +rocksdb_target_header_template = \ + 
"""# This file \100generated by: +#$ python3 buckifier/buckify_rocksdb.py{extra_argv} # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. @@ -15,7 +17,7 @@ REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -30,19 +32,25 @@ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), - ("tbb", None), - ("googletest", None, "gtest"), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", - ["third-party//numa:numa", "third-party//liburing:uring"], + [ + "third-party//numa:numa", + "third-party//liburing:uring", + "third-party//tbb:tbb", + ], + ), + ( + "macos", + ["third-party//tbb:tbb"], ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -56,17 +64,33 @@ "-DHAVE_SSE42", "-DLIBURING", "-DNUMA", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", ], ), ( "macos", - ["-DOS_MACOSX"], + [ + "-DOS_MACOSX", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DTBB", + ], + ), + ( + "windows", + [ + "-DOS_WIN", + "-DWIN32", + "-D_MBCS", + "-DWIN64", + "-DNOMINMAX", + ], ), ] ROCKSDB_PREPROCESSOR_FLAGS = [ - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", "-DROCKSDB_SUPPORT_THREAD_LOCAL", # Flags to enable libs we include @@ -77,21 +101,22 @@ "-DZSTD", "-DZSTD_STATIC_LINKING_ONLY", "-DGFLAGS=gflags", - "-DTBB", # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] -ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { +ROCKSDB_ARCH_PREPROCESSOR_FLAGS = {{ "x86_64": [ "-DHAVE_PCLMUL", ], -} +}} build_mode = read_config("fbcode", "build_mode") 
@@ -99,21 +124,26 @@ # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. -ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) + +ROCKSDB_LIB_DEPS = [ + ":rocksdb_lib", + ":rocksdb_test_lib", +] if not is_opt_mode else [":rocksdb_lib"] """ @@ -124,22 +154,41 @@ {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + link_whole = {link_whole}, os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [{deps}], - external_deps = ROCKSDB_EXTERNAL_DEPS, + exported_deps = [{deps}], + exported_external_deps = ROCKSDB_EXTERNAL_DEPS{extra_external_deps}, +) +""" + +rocksdb_library_template = """ +cpp_library( + name = "{name}", + srcs = [{srcs}], + {headers_attr_prefix}headers = {headers}, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + exported_deps = ROCKSDB_LIB_DEPS, + exported_external_deps = ROCKSDB_EXTERNAL_DEPS, ) """ binary_template = """ cpp_binary( - name = "%s", - srcs = 
[%s], + name = "{name}", + srcs = [{srcs}], arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [%s], + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, ) """ @@ -156,24 +205,24 @@ unittests_template = """ # [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ -%s] +{tests}] # Generate a test rule for each entry in ROCKS_TESTS # Do not build the tests in opt mode, since SyncPoint and other test code # will not be included. [ - test_binary( - extra_compiler_flags = extra_compiler_flags, - extra_deps = extra_deps, - parallelism = parallelism, - rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, - rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, - rocksdb_os_deps = ROCKSDB_OS_DEPS, - rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - test_cc = test_cc, - test_name = test_name, + cpp_unittest( + name = test_name, + srcs = [test_cc], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, + include_paths = ROCKSDB_INCLUDE_PATHS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_test_lib"] + extra_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS + [ + ("googletest", None, "gtest"), + ], ) for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/build_detect_platform mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/build_detect_platform --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/build_detect_platform 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/build_detect_platform 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ # PLATFORM_LDFLAGS Linker flags # JAVA_LDFLAGS Linker flags for RocksDBJava # JAVA_STATIC_LDFLAGS Linker flags for RocksDBJava static build +# JAVAC_ARGS Arguments for javac # PLATFORM_SHARED_EXT Extension for shared libraries # PLATFORM_SHARED_LDFLAGS Flags for building shared library # PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library @@ -27,6 +28,7 @@ # -DZSTD if the ZSTD library is present # -DNUMA if the NUMA library is present # -DTBB if the TBB library is present +# -DMEMKIND if the memkind library is present # # Using gflags in rocksdb: # Our project depends on gflags, which requires users to take some extra steps @@ -43,8 +45,13 @@ exit 1 fi -# we depend on C++11 -PLATFORM_CXXFLAGS="-std=c++11" +# we depend on C++11, but should be compatible with newer standards +if [ "$ROCKSDB_CXX_STANDARD" ]; then + PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD" +else + PLATFORM_CXXFLAGS="-std=c++11" +fi + # we currently depend on POSIX platform COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX" @@ -58,8 +65,12 @@ source "$PWD/build_tools/fbcode_config4.8.1.sh" elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_5xx" ]; then source "$PWD/build_tools/fbcode_config.sh" - else + elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007" ]; then source "$PWD/build_tools/fbcode_config_platform007.sh" + elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then + source "$PWD/build_tools/fbcode_config_platform009.sh" + else + source "$PWD/build_tools/fbcode_config_platform009.sh" fi fi @@ -87,6 +98,16 @@ fi fi +if test -z "$AR"; then + if [ -x "$(command -v gcc-ar)" ]; then + AR=gcc-ar + elif [ -x "$(command -v llvm-ar)" ]; then + AR=llvm-ar + else + AR=ar + fi +fi + # Detect OS if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` @@ -149,10 +170,13 @@ else PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS 
-lpthread -lrt" - if test $ROCKSDB_USE_IO_URING; then + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl" + if test -z "$ROCKSDB_USE_IO_URING"; then + ROCKSDB_USE_IO_URING=1 + fi + if test "$ROCKSDB_USE_IO_URING" -ne 0; then # check for liburing - $CXX $CFLAGS -x c++ - -luring -o /dev/null 2>/dev/null </dev/null < int main() { struct io_uring ring; @@ -165,9 +189,6 @@ COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_IOURING_PRESENT" fi fi - if test -z "$USE_FOLLY_DISTRIBUTED_MUTEX"; then - USE_FOLLY_DISTRIBUTED_MUTEX=1 - fi # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) @@ -190,6 +211,17 @@ PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" # PORT_FILES=port/freebsd/freebsd_specific.cc ;; + GNU/kFreeBSD) + PLATFORM=OS_GNU_KFREEBSD + COMMON_FLAGS="$COMMON_FLAGS -DOS_GNU_KFREEBSD" + if [ -z "$USE_CLANG" ]; then + COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" + else + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" + fi + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + # PORT_FILES=port/gnu_kfreebsd/gnu_kfreebsd_specific.cc + ;; NetBSD) PLATFORM=OS_NETBSD COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD" @@ -239,15 +271,20 @@ PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" JAVA_LDFLAGS="$PLATFORM_LDFLAGS" JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS" +JAVAC_ARGS="-source 7" if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Cross-compiling; do not try any compilation tests. # Also don't need any compilation tests if compiling on fbcode + if [ "$FBCODE_BUILD" = "true" ]; then + # Enable backtrace on fbcode since the necessary libraries are present + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" + fi true else if ! test $ROCKSDB_DISABLE_FALLOCATE; then # Test whether fallocate is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -263,7 +300,7 @@ if ! 
test $ROCKSDB_DISABLE_SNAPPY; then # Test whether Snappy library is installed # http://code.google.com/p/snappy/ - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -278,30 +315,38 @@ # Test whether gflags library is installed # http://gflags.github.io/gflags/ # check if the namespace is gflags - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + if $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF #include + using namespace GFLAGS_NAMESPACE; int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" - else - # check if namespace is google - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + # check if namespace is gflags + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF + #include + using namespace gflags; + int main() {} +EOF + then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + # check if namespace is google + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF #include using namespace google; int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" - fi + then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=google" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" fi fi if ! test $ROCKSDB_DISABLE_ZLIB; then # Test whether zlib library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -314,7 +359,7 @@ if ! test $ROCKSDB_DISABLE_BZIP; then # Test whether bzip library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -327,7 +372,7 @@ if ! 
test $ROCKSDB_DISABLE_LZ4; then # Test whether lz4 library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() {} @@ -341,7 +386,7 @@ if ! test $ROCKSDB_DISABLE_ZSTD; then # Test whether zstd library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -354,7 +399,7 @@ if ! test $ROCKSDB_DISABLE_NUMA; then # Test whether numa is available - $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null </dev/null < #include int main() {} @@ -368,7 +413,7 @@ if ! test $ROCKSDB_DISABLE_TBB; then # Test whether tbb is available - $CXX $CFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null </dev/null < int main() {} EOF @@ -381,7 +426,7 @@ if ! test $ROCKSDB_DISABLE_JEMALLOC; then # Test whether jemalloc is available - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -ljemalloc \ 2>/dev/null; then # This will enable some preprocessor identifiers in the Makefile JEMALLOC=1 @@ -402,7 +447,7 @@ fi if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then # jemalloc is not available. Let's try tcmalloc - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o \ -ltcmalloc 2>/dev/null; then PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc" JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc" @@ -411,7 +456,7 @@ if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then # Test whether malloc_usable_size is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { size_t res = malloc_usable_size(0); @@ -424,9 +469,25 @@ fi fi + if ! test $ROCKSDB_DISABLE_MEMKIND; then + # Test whether memkind library is installed + $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -lmemkind -x c++ - -o test.o 2>/dev/null < + int main() { + memkind_malloc(MEMKIND_DAX_KMEM, 1024); + return 0; + } +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DMEMKIND" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lmemkind" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lmemkind" + fi + fi + if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; @@ -441,7 +502,7 @@ if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -453,7 +514,7 @@ COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" else # Test whether execinfo library is installed - $CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -470,7 +531,7 @@ if ! test $ROCKSDB_DISABLE_PG; then # Test if -pg is supported - $CXX $CFLAGS -pg -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null </dev/null < int main() { int fd = open("/dev/null", 0); @@ -496,7 +557,7 @@ if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then # Test whether sched_getcpu is supported - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int cpuid = sched_getcpu(); @@ -508,9 +569,23 @@ fi fi + if ! test $ROCKSDB_DISABLE_AUXV_GETAUXVAL; then + # Test whether getauxval is supported + $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null < + int main() { + uint64_t auxv = getauxval(AT_HWCAP); + (void)auxv; + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_AUXV_GETAUXVAL_PRESENT" + fi + fi + if ! test $ROCKSDB_DISABLE_ALIGNED_NEW; then # Test whether c++17 aligned-new is supported - $CXX $PLATFORM_CXXFLAGS -faligned-new -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null < + int main() {} +EOF + if [ "$?" = 0 ]; then + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbenchmark" + fi + fi fi # TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning. 
-# -Wshorten-64-to-32 breaks compilation on FreeBSD i386 -if ! [ "$TARGET_OS" = FreeBSD -a "$TARGET_ARCHITECTURE" = i386 ]; then +# -Wshorten-64-to-32 breaks compilation on FreeBSD aarch64 and i386 +if ! { [ "$TARGET_OS" = FreeBSD ] && [ "$TARGET_ARCHITECTURE" = arm64 -o "$TARGET_ARCHITECTURE" = i386 ]; }; then # Test whether -Wshorten-64-to-32 is available - $CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null </dev/null </dev/null; then + COMMON_FLAGS="$COMMON_FLAGS -march=native " + else + COMMON_FLAGS="$COMMON_FLAGS -march=z196 " + fi + COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then @@ -575,6 +666,40 @@ if test "$USE_SSE"; then TRY_SSE_ETC="1" fi + + if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then + COMMON_FLAGS="$COMMON_FLAGS -march=z196 " + fi + + if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then + # For portability compile for macOS 10.12 (2016) or newer + COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.12" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.12" + # -mmacosx-version-min must come first here. + PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.12 $PLATFORM_SHARED_LDFLAGS" + PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.12" + JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.12" + JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" + JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" + JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" + fi +fi + +if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then + # check for GNU libc on ppc64 + $CXX -x c++ - -o /dev/null 2>/dev/null < + #include + #include + + int main(int argc, char *argv[]) { + printf("GNU libc version: %s\n", gnu_get_libc_version()); + return 0; + } +EOF + if [ "$?" 
!= 0 ]; then + PPC_LIBC_IS_GNU=0 + fi fi if test "$TRY_SSE_ETC"; then @@ -584,14 +709,21 @@ # It doesn't even really check that your current CPU is compatible. # # SSE4.2 available since nehalem, ca. 2008-2010 + # Includes POPCNT for BitsSetToOne, BitParity TRY_SSE42="-msse4.2" # PCLMUL available since westmere, ca. 2010-2011 TRY_PCLMUL="-mpclmul" # AVX2 available since haswell, ca. 2013-2015 TRY_AVX2="-mavx2" + # BMI available since haswell, ca. 2013-2015 + # Primarily for TZCNT for CountTrailingZeroBits + TRY_BMI="-mbmi" + # LZCNT available since haswell, ca. 2013-2015 + # For FloorLog2 + TRY_LZCNT="-mlzcnt" fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -605,7 +737,7 @@ echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -622,7 +754,7 @@ echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -637,7 +769,35 @@ echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < + #include + int main(int argc, char *argv[]) { + (void)argv; + return (int)_tzcnt_u64((uint64_t)argc); + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI" +elif test "$USE_SSE"; then + echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null < + #include + int main(int argc, char *argv[]) { + (void)argv; + return (int)_lzcnt_u64((uint64_t)argc); + } +EOF +if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT" +elif test "$USE_SSE"; then + echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null < int main() { uint64_t a = 0xffffFFFFffffFFFF; @@ -654,7 +814,7 @@ # succeed because the cross-compiler flags are added by the Makefile, not this # script. if [ "$PLATFORM" != IOS ]; then - $CXX $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null </dev/null + $CXX $COMMON_FLAGS $PLATFORM_SHARED_LDFLAGS test_dl.o -o test.o 2>/dev/null if [ "$?" = 0 ]; then EXEC_LDFLAGS+="-ldl" rm -f test_dl.o @@ -681,6 +842,20 @@ fi fi +# check for F_FULLFSYNC +$CXX $PLATFORM_CXXFALGS -x c++ - -o test.o 2>/dev/null < + int main() { + fcntl(0, F_FULLFSYNC); + return 0; + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DHAVE_FULLFSYNC" +fi + +rm -f test.o test_dl.o + PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" @@ -692,10 +867,16 @@ echo "CC=$CC" >> "$OUTPUT" echo "CXX=$CXX" >> "$OUTPUT" +echo "AR=$AR" >> "$OUTPUT" echo "PLATFORM=$PLATFORM" >> "$OUTPUT" echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT" +echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" >> "$OUTPUT" echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT" echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" >> "$OUTPUT" +echo "JAVAC_ARGS=$JAVAC_ARGS" >> "$OUTPUT" echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT" echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT" echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT" @@ -728,3 +909,6 @@ if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" fi 
+if test -n "$PPC_LIBC_IS_GNU"; then + echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT" +fi diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/check-sources.sh mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/check-sources.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/check-sources.sh 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/check-sources.sh 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# Check for some simple mistakes that should prevent commit or push + +BAD="" + +git grep 'namespace rocksdb' -- '*.[ch]*' +if [ "$?" != "1" ]; then + echo "^^^^^ Do not hardcode namespace rocksdb. Use ROCKSDB_NAMESPACE" + BAD=1 +fi + +git grep -i 'nocommit' -- ':!build_tools/check-sources.sh' +if [ "$?" != "1" ]; then + echo "^^^^^ Code was not intended to be committed" + BAD=1 +fi + +git grep ' /dev/null -then - echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!" - echo "You can download clang-format-diff.py by running: " - echo " curl --location http://goo.gl/iUW1u2 -o ${CLANG_FORMAT_DIFF}" - echo "You can download clang-format by running:" - echo " brew install clang-format" - echo " Or" - echo " apt install clang-format" - echo " This might work too:" - echo " yum install git-clang-format" - echo "Then, move both files (i.e. ${CLANG_FORMAT_DIFF} and clang-format) to some directory within PATH=${PATH}" - echo "and make sure ${CLANG_FORMAT_DIFF} is executable." - exit 128 -fi - -# Check argparse, a library that clang-format-diff.py requires. -python 2>/dev/null << EOF -import argparse -EOF - -if [ "$?" != 0 ] -then - echo "To run clang-format-diff.py, we'll need the library "argparse" to be" - echo "installed. You can try either of the follow ways to install it:" - echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" - echo " 2. 
easy_install argparse (if you have easy_install)" - echo " 3. pip install argparse (if you have pip)" - exit 129 +print_usage () { + echo "Usage:" + echo "format-diff.sh [OPTIONS]" + echo "-c: check only." + echo "-h: print this message." +} + +while getopts ':ch' OPTION; do + case "$OPTION" in + c) + CHECK_ONLY=1 + ;; + h) + print_usage + exit 1 + ;; + ?) + print_usage + exit 1 + ;; + esac +done + +REPO_ROOT="$(git rev-parse --show-toplevel)" + +if [ "$CLANG_FORMAT_DIFF" ]; then + echo "Note: CLANG_FORMAT_DIFF='$CLANG_FORMAT_DIFF'" + # Dry run to confirm dependencies like argparse + if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then + true #Good + else + exit 128 + fi +else + # First try directly executing the possibilities + if clang-format-diff --help &> /dev/null < /dev/null; then + CLANG_FORMAT_DIFF=clang-format-diff + elif clang-format-diff.py --help &> /dev/null < /dev/null; then + CLANG_FORMAT_DIFF=clang-format-diff.py + elif $REPO_ROOT/clang-format-diff.py --help &> /dev/null < /dev/null; then + CLANG_FORMAT_DIFF=$REPO_ROOT/clang-format-diff.py + else + # This probably means we need to directly invoke the interpreter. + # But first find clang-format-diff.py + if [ -f "$REPO_ROOT/clang-format-diff.py" ]; then + CFD_PATH="$REPO_ROOT/clang-format-diff.py" + elif which clang-format-diff.py &> /dev/null; then + CFD_PATH="$(which clang-format-diff.py)" + else + echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!" + echo "You can download clang-format-diff.py by running: " + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." 
+ echo "You can download clang-format by running:" + echo " brew install clang-format" + echo " Or" + echo " apt install clang-format" + echo " This might work too:" + echo " yum install git-clang-format" + echo "Then make sure clang-format is available and executable from \$PATH:" + echo " clang-format --version" + exit 128 + fi + # Check argparse pre-req on interpreter, or it will fail + if echo import argparse | ${PYTHON:-python3}; then + true # Good + else + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 + fi + # Unfortunately, some machines have a Python2 clang-format-diff.py + # installed but only a Python3 interpreter installed. Unfortunately, + # automatic 2to3 migration is insufficient, so suggest downloading latest. + if grep -q "print '" "$CFD_PATH" && \ + ${PYTHON:-python3} --version | grep -q 'ython 3'; then + echo "You have clang-format-diff.py for Python 2 but are using a Python 3" + echo "interpreter (${PYTHON:-python3})." + echo "You can download clang-format-diff.py for Python 3 by running: " + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." 
+ exit 130 + fi + CLANG_FORMAT_DIFF="${PYTHON:-python3} $CFD_PATH" + # This had better work after all those checks + if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then + true #Good + else + exit 128 + fi + fi fi # TODO(kailiu) following work is not complete since we still need to figure @@ -62,31 +122,41 @@ # If there's no uncommitted changes, we assume user are doing post-commit # format check, in which case we'll try to check the modified lines vs. the -# facebook/rocksdb.git master branch. Otherwise, we'll check format of the +# facebook/rocksdb.git main branch. Otherwise, we'll check format of the # uncommitted code only. if [ -z "$uncommitted_code" ] then # Attempt to get name of facebook/rocksdb.git remote. - [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)" + [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(LC_ALL=POSIX LANG=POSIX git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)" # Fall back on 'origin' if that fails [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE=origin - # Use master branch from that remote - [ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/master" + # Use main branch from that remote + [ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/$(LC_ALL=POSIX LANG=POSIX git remote show $FORMAT_REMOTE | sed -n '/HEAD branch/s/.*: //p')" # Get the common ancestor with that remote branch. Everything after that # common ancestor would be considered the contents of a pull request, so # should be relevant for formatting fixes. FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)" # Get the differences diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of changes not yet in $FORMAT_UPSTREAM..." else # Check the format of uncommitted lines, diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of uncommitted changes..." 
fi if [ -z "$diffs" ] then echo "Nothing needs to be reformatted!" exit 0 +elif [ $CHECK_ONLY ] +then + echo "Your change has unformatted code. Please run make format!" + if [ $VERBOSE_CHECK ]; then + clang-format --version + echo "$diffs" + fi + exit 1 fi # Highlight the insertion/deletion from the clang-format-diff.py's output @@ -121,7 +191,7 @@ then git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1 else - git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 + git diff -U0 HEAD | $CLANG_FORMAT_DIFF -i -p 1 fi echo "Files reformatted!" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/gnu_parallel mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/gnu_parallel --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/gnu_parallel 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/gnu_parallel 2025-05-19 16:14:27.000000000 +0000 @@ -1561,6 +1561,7 @@ ::die_bug("Can't dup STDERR: $!"); open $Global::original_stdin, "<&", "STDIN" or ::die_bug("Can't dup STDIN: $!"); + $Global::is_terminal = (-t $Global::original_stderr); } sub enough_file_handles { @@ -1840,12 +1841,17 @@ } } +$opt::min_progress_interval = 0; + sub init_progress { # Uses: # $opt::bar # Returns: # list of computers for progress output $|=1; + if (not $Global::is_terminal) { + $opt::min_progress_interval = 30; + } if($opt::bar) { return("",""); } @@ -1870,6 +1876,9 @@ } my $last_header=""; my $sleep = 0.2; + my $last_left = 1000000000; + my $last_progress_time = 0; + my $ps_reported = 0; do { while($Global::total_running > 0) { debug($Global::total_running, "==", scalar @@ -1880,14 +1889,38 @@ close $job->fh(0,"w"); } } - if($opt::progress) { + # When not connected to terminal, assume CI (e.g. CircleCI). 
In + # that case we want occasional progress output to prevent abort + # due to timeout with no output, but we also need to stop sending + # progress output if there has been no actual progress, so that + # the job can time out appropriately (CirecleCI: 10m) in case of + # a hung test. But without special output, it is extremely + # annoying to diagnose which test is hung, so we add that using + # `ps` below. + if($opt::progress and + ($Global::is_terminal or (time() - $last_progress_time) >= 30)) { my %progress = progress(); if($last_header ne $progress{'header'}) { print $Global::original_stderr "\n", $progress{'header'}, "\n"; $last_header = $progress{'header'}; } - print $Global::original_stderr "\r",$progress{'status'}; - flush $Global::original_stderr; + if ($Global::is_terminal) { + print $Global::original_stderr "\r",$progress{'status'}; + } + if ($last_left > $Global::left) { + if (not $Global::is_terminal) { + print $Global::original_stderr $progress{'status'},"\n"; + } + $last_progress_time = time(); + $ps_reported = 0; + } elsif (not $ps_reported and (time() - $last_progress_time) >= 60) { + # No progress in at least 60 seconds: run ps + print $Global::original_stderr "\n"; + system("ps", "-wf"); + $ps_reported = 1; + } + $last_left = $Global::left; + flush $Global::original_stderr; } if($Global::total_running < $Global::max_jobs_running and not $Global::JobQueue->empty()) { @@ -1921,7 +1954,7 @@ not $Global::start_no_new_jobs and not $Global::JobQueue->empty()); if($opt::progress) { my %progress = progress(); - print $Global::original_stderr "\r", $progress{'status'}, "\n"; + print $Global::original_stderr $opt::progress_sep, $progress{'status'}, "\n"; flush $Global::original_stderr; } } @@ -1954,10 +1987,11 @@ my $eta = ""; my ($status,$header)=("",""); if($opt::eta) { - my($total, $completed, $left, $pctcomplete, $avgtime, $this_eta) = - compute_eta(); - $eta = sprintf("ETA: %ds Left: %d AVG: %.2fs ", - $this_eta, $left, $avgtime); + my($total, 
$completed, $left, $pctcomplete, $avgtime, $this_eta) = + compute_eta(); + $eta = sprintf("ETA: %ds Left: %d AVG: %.2fs ", + $this_eta, $left, $avgtime); + $Global::left = $left; } my $termcols = terminal_columns(); my @workers = sort keys %Global::host; @@ -5801,7 +5835,7 @@ . "-" . $self->seq(); } else { $workdir = $opt::workdir; - # Rsync treats /./ special. We dont want that + # Rsync treats /./ special. We don't want that $workdir =~ s:/\./:/:g; # Remove /./ $workdir =~ s:/+$::; # Remove ending / if any $workdir =~ s:^\./::g; # Remove starting ./ if any diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/make_package.sh mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/make_package.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/make_package.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/make_package.sh 2025-05-19 16:14:27.000000000 +0000 @@ -103,31 +103,26 @@ gem_install fpm make static_lib - make install INSTALL_PATH=package - - cd package - - LIB_DIR=lib - if [[ -z "$ARCH" ]]; then - ARCH=$(getconf LONG_BIT) - fi - if [[ ("$FPM_OUTPUT" = "rpm") && ($ARCH -eq 64) ]]; then - mv lib lib64 - LIB_DIR=lib64 + LIBDIR=/usr/lib + if [[ $FPM_OUTPUT = "rpm" ]]; then + LIBDIR=$(rpm --eval '%_libdir') fi + rm -rf package + make install DESTDIR=package PREFIX=/usr LIBDIR=$LIBDIR + fpm \ -s dir \ -t $FPM_OUTPUT \ + -C package \ -n rocksdb \ -v $1 \ - --prefix /usr \ --url http://rocksdb.org/ \ -m rocksdb@fb.com \ --license BSD \ --vendor Facebook \ --description "RocksDB is an embeddable persistent key-value store for fast storage." 
\ - include $LIB_DIR + usr } # shellcheck disable=SC2068 diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/regression_build_test.sh 2025-05-19 16:14:27.000000000 +0000 @@ -20,26 +20,11 @@ function cleanup { rm -rf $DATA_DIR - rm -f $STAT_FILE.fillseq - rm -f $STAT_FILE.readrandom - rm -f $STAT_FILE.overwrite - rm -f $STAT_FILE.memtablefillreadrandom + rm -f $STAT_FILE.* } trap cleanup EXIT -if [ -z $GIT_BRANCH ]; then - git_br=`git rev-parse --abbrev-ref HEAD` -else - git_br=$(basename $GIT_BRANCH) -fi - -if [ $git_br == "master" ]; then - git_br="" -else - git_br="."$git_br -fi - make release # measure fillseq + fill up the DB for overwrite benchmark @@ -286,12 +271,10 @@ --sync=0 \ --verify_checksum=1 \ --delete_obsolete_files_period_micros=314572800 \ - --max_grandparent_overlap_factor=10 \ --use_plain_table=1 \ --open_files=-1 \ --mmap_read=1 \ --mmap_write=0 \ - --memtablerep=prefix_hash \ --bloom_bits=10 \ --bloom_locality=1 \ --perf_level=0" @@ -378,7 +361,7 @@ echo >&2 "ERROR: Key $key doesn't have a value." 
return fi - curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \ + curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \ --connect-timeout 60 } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/rocksdb-lego-determinator 2025-05-19 16:14:27.000000000 +0000 @@ -3,7 +3,7 @@ # to determine next steps to run # Usage: -# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= rocks_ci.py +# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= WORKINGDIR= rocksdb-lego-determinator # # Input Value # ------------------------------------------------------------------------- @@ -11,7 +11,7 @@ # ONCALL Email address to raise a task on failure # TRIGGER Trigger conditions for email. Valid values are fail, warn, all # SUBSCRIBER Email addresss to add as subscriber for task -# +# WORKINGDIR Working directory # # Report configuration @@ -24,22 +24,22 @@ REPORT_EMAIL=" { - 'type':'email', - 'triggers': [ '$TRIGGER' ], - 'emails':['$EMAIL'] - }," + \"type\":\"email\", + \"triggers\": [ \"$TRIGGER\" ], + \"emails\":[\"$EMAIL\"] + }" fi CREATE_TASK= if [ ! -z $ONCALL ]; then CREATE_TASK=" { - 'type':'task', - 'triggers':[ 'fail' ], - 'priority':0, - 'subscribers':[ '$SUBSCRIBER' ], - 'tags':[ 'rocksdb', 'ci' ], - }," + \"type\":\"task\", + \"triggers\":[ \"fail\" ], + \"priority\":0, + \"subscribers\":[ \"$SUBSCRIBER\" ], + \"tags\":[ \"rocksdb\", \"ci\" ] + }" fi # For now, create the tasks using only the dedicated task creation tool. @@ -47,47 +47,54 @@ REPORT= if [[ ! -z $REPORT_EMAIL || ! 
-z $CREATE_TASK ]]; then - REPORT="'report': [ - $REPORT_EMAIL + REPORT=",\"report\": [ + $REPORT_EMAIL, $CREATE_TASK ]" fi +# Working directory for the following command, default to current directory +WORKING_DIR=. +if [ ! -z $WORKINGDIR ]; then + WORKING_DIR=$WORKINGDIR +fi + # # Helper variables # CLEANUP_ENV=" { - 'name':'Cleanup environment', - 'shell':'rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean', - 'user':'root' + \"name\":\"Cleanup environment\", + \"shell\":\"cd $WORKING_DIR; rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean\", + \"user\":\"root\" }" UPLOAD_DB_DIR=" { - 'name':'Upload database directory', - 'shell':'tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/', - 'user':'root', - 'cleanup':true, - 'provide_artifacts': [ - { - 'name':'rocksdb_db_dir', - 'paths': ['rocksdb_db.tar.gz'], - 'bundle': false, - }, - ], + \"name\":\"Upload database directory\", + \"shell\":\"tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/\", + \"user\":\"root\", + \"cleanup\":true, + \"provide_artifacts\": [ + { + \"name\":\"rocksdb_db_dir\", + \"paths\": [\"rocksdb_db.tar.gz\"], + \"bundle\": false + } + ] }" -# We will eventually set the RATIO to 1, but we want do this -# in steps. RATIO=$(nproc) will make it work as J=1 +# set default RATIO to 1, which sets J=$(nproc) and j=$(nproc) if [ -z $RATIO ]; then - RATIO=$(nproc) + RATIO=1 fi +# Should probably be called PARALLEL_TEST if [ -z $PARALLEL_J ]; then PARALLEL_J="J=$(expr $(nproc) / ${RATIO})" fi +# Should probably be called PARALLEL_MAKE if [ -z $PARALLEL_j ]; then PARALLEL_j="-j$(expr $(nproc) / ${RATIO})" fi @@ -100,18 +107,18 @@ GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1" ASAN="COMPILE_WITH_ASAN=1" CLANG="USE_CLANG=1" -# in gcc-5 there are known problems with TSAN like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090. -# using platform007 gives us gcc-8 or higher which has that bug fixed. 
-TSAN="ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1" +TSAN="COMPILE_WITH_TSAN=1" UBSAN="COMPILE_WITH_UBSAN=1" -TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"' +ASAN_CRASH="ASAN_OPTIONS=disable_coredump=0" +TSAN_CRASH="CRASH_TEST_EXT_ARGS=\\\"--compression_type=zstd --log2_keys_per_lock=22\\\"" NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd" DISABLE_JEMALLOC="DISABLE_JEMALLOC=1" HTTP_PROXY="https_proxy=http://fwdproxy.29.prn1:8080 http_proxy=http://fwdproxy.29.prn1:8080 ftp_proxy=http://fwdproxy.29.prn1:8080" SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-8u60-64/; export PATH=\$JAVA_HOME/bin:\$PATH" -PARSER="'parser':'python build_tools/error_filter.py $1'" +PARSER="\"parser\":\"/usr/bin/env python3 build_tools/error_filter.py $1\"" CONTRUN_NAME="ROCKSDB_CONTRUN_NAME" +SKIP_FORMAT_CHECKS="SKIP_FORMAT_BUCK_CHECKS=1" # This code is getting called under various scenarios. What we care about is to # understand when it's called from nightly contruns because in that case we'll @@ -129,15 +136,15 @@ # DISABLE_COMMANDS="[ { - 'name':'Disable test', - 'oncall':'$ONCALL', - 'steps': [ - { - 'name':'Job disabled. Please contact test owner', - 'shell':'exit 1', - 'user':'root' - }, - ], + \"name\":\"Disable test\", + \"oncall\":\"$ONCALL\", + \"steps\": [ + { + \"name\":\"Job disabled. 
Please contact test owner\", + \"shell\":\"exit 1\", + \"user\":\"root\" + } + ] } ]" @@ -146,18 +153,18 @@ # UNIT_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and test RocksDB debug version\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -167,20 +174,20 @@ # UNIT_TEST_NON_SHM_COMMANDS="[ { - 'name':'Rocksdb Unit Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Unit Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug version', - 'timeout': 86400, - 'shell':'$NON_SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and test RocksDB debug version\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $NON_SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -190,18 +197,18 @@ # RELEASE_BUILD_COMMANDS="[ { - 'name':'Rocksdb Release Build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Release Build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB release', - 'shell':'make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB release\", + \"shell\":\"cd 
$WORKING_DIR; make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -211,18 +218,18 @@ # UNIT_TEST_COMMANDS_481="[ { - 'name':'Rocksdb Unit Test on GCC 4.8.1', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test on GCC 4.8.1\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $GCC_481 $DEBUG make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and test RocksDB debug version\", + \"shell\":\"cd $WORKING_DIR; $SHM $GCC_481 $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -232,18 +239,18 @@ # RELEASE_BUILD_COMMANDS_481="[ { - 'name':'Rocksdb Release on GCC 4.8.1', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Release on GCC 4.8.1\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB release on GCC 4.8.1', - 'shell':'$GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB release on GCC 4.8.1\", + \"shell\":\"cd $WORKING_DIR; $GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -253,18 +260,18 @@ # CLANG_UNIT_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and test RocksDB debug', - 'shell':'$CLANG $SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL', - 'user':'root', + 
\"name\":\"Build and test RocksDB debug\", + \"shell\":\"cd $WORKING_DIR; $CLANG $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -274,18 +281,18 @@ # CLANG_RELEASE_BUILD_COMMANDS="[ { - 'name':'Rocksdb CLANG Release Build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb CLANG Release Build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB release', - 'shell':'$CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB release\", + \"shell\":\"cd $WORKING_DIR; $CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -295,18 +302,18 @@ # CLANG_ANALYZE_COMMANDS="[ { - 'name':'Rocksdb analyze', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb analyze\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'RocksDB build and analyze', - 'shell':'$CLANG $SHM $DEBUG make $PARALLEL_j analyze || $CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"RocksDB build and analyze\", + \"shell\":\"cd $WORKING_DIR; $CLANG $SHM $DEBUG make $PARALLEL_j analyze || $CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -316,18 +323,18 @@ # CODE_COV_COMMANDS="[ { - 'name':'Rocksdb Unit Test Code Coverage', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test Code Coverage\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build, test and collect code coverage info', - 'shell':'$SHM $DEBUG make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL', - 
'user':'root', + \"name\":\"Build, test and collect code coverage info\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -337,18 +344,18 @@ # UNITY_COMMANDS="[ { - 'name':'Rocksdb Unity', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unity\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build, test unity test', - 'shell':'$SHM $DEBUG V=1 make J=1 unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build, test unity test\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG V=1 make $PARALLELISM unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -358,65 +365,108 @@ # LITE_BUILD_COMMANDS="[ { - 'name':'Rocksdb Lite build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Lite build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB debug version', - 'shell':'make J=1 LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB debug version\", + \"shell\":\"cd $WORKING_DIR; $SKIP_FORMAT_CHECKS make $PARALLELISM LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + } + ] + $REPORT + } +]" + +# +# RocksDB stress/crash test +# +STRESS_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - ], + { + 
\"name\":\"Build and run RocksDB debug crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" # -# Report RocksDB lite binary size to scuba -REPORT_LITE_BINARY_SIZE_COMMANDS="[ +# RocksDB blackbox stress/crash test +# +BLACKBOX_STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Lite Binary Size', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Blackbox Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Report RocksDB Lite binary size to scuba', - 'shell':'tools/report_lite_binary_size.sh', - 'user':'root', + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER }, - ], + { + \"name\":\"Build and run RocksDB debug blackbox crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM blackbox_crash_test || $CONTRUN_NAME=blackbox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } ]" # -# RocksDB stress/crash test +# RocksDB whitebox stress/crash test # -STRESS_CRASH_TEST_COMMANDS="[ +WHITEBOX_STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Stress and Crash Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Whitebox Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', - 'user':'root', + 
\"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, { - 'name':'Build and run RocksDB debug crash tests', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug whitebox crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM whitebox_crash_test || $CONTRUN_NAME=whitebox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -426,27 +476,27 @@ # STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Stress and Crash Test with atomic flush', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Stress and Crash Test with atomic flush\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, { - 'name':'Build and run RocksDB debug crash tests with atomic flush', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug crash tests with atomic flush\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM 
crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -456,27 +506,57 @@ # STRESS_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb Stress and Crash Test with txn', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Stress and Crash Test with txn\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, { - 'name':'Build and run RocksDB debug crash tests with txn', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug crash tests with txn\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB stress/crash test with timestamp +# +STRESS_CRASH_TEST_WITH_TS_COMMANDS="[ + { + \"name\":\"Rocksdb Stress and Crash Test with ts\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug stress tests\", + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", + 
$PARSER + }, + { + \"name\":\"Build and run RocksDB debug crash tests with ts\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make $PARALLELISM crash_test_with_ts || $CONTRUN_NAME=crash_test_with_ts $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -486,19 +566,19 @@ # because we want to add some randomness to fsync commands WRITE_STRESS_COMMANDS="[ { - 'name':'Rocksdb Write Stress Test', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Write Stress Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB write stress tests', - 'shell':'make write_stress && python tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB write stress tests\", + \"shell\":\"cd $WORKING_DIR; make write_stress && /usr/bin/env python3 tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER } ], - 'artifacts': [{'name': 'database', 'paths': ['/tmp/rocksdb_write_stress']}], + \"artifacts\": [{\"name\": \"database\", \"paths\": [\"/tmp/rocksdb_write_stress\"]}] $REPORT } ]" @@ -509,18 +589,18 @@ # ASAN_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Test RocksDB debug under ASAN', -'shell':'set -o pipefail && ($SHM $ASAN $DEBUG make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL) |& /usr/facebook/ops/scripts/asan_symbolize.py -d', - 'user':'root', + \"name\":\"Test RocksDB debug under ASAN\", +\"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM 
$ASAN $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER } - ], + ] $REPORT } ]" @@ -530,21 +610,69 @@ # ASAN_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb crash test under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug asan_crash_test', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug asan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB blackbox crash testing under address sanitizer +# +ASAN_BLACKBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb blackbox crash test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug blackbox asan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM blackbox_asan_crash_test || $CONTRUN_NAME=blackbox_asan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB whitebox crash testing under address sanitizer +# +ASAN_WHITEBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb whitebox crash test under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + 
\"name\":\"Build and run RocksDB debug whitebox asan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM whitebox_asan_crash_test || $CONTRUN_NAME=whitebox_asan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -554,21 +682,21 @@ # ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test with atomic flush under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with atomic flush under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug asan_crash_test_with_atomic_flush', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug asan_crash_test_with_atomic_flush\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -578,21 +706,21 @@ # ASAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb crash test with txn under ASAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with txn under ASAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug asan_crash_test_with_txn', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL', - 
'user':'root', + \"name\":\"Build and run RocksDB debug asan_crash_test_with_txn\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $ASAN_CRASH $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make $PARALLELISM asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -602,42 +730,90 @@ # UBSAN_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Unit Test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Test RocksDB debug under UBSAN', - 'shell':'set -o pipefail && $SHM $UBSAN $CLANG $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Test RocksDB debug under UBSAN\", + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $UBSAN $CLANG $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER } - ], + ] $REPORT } ]" # -# RocksDB crash testing under udnefined behavior sanitizer +# RocksDB crash testing under undefined behavior sanitizer # UBSAN_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb crash test under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug ubsan_crash_test', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug ubsan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM 
ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB crash testing under undefined behavior sanitizer +# +UBSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb blackbox crash test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug blackbox ubsan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM blackbox_ubsan_crash_test || $CONTRUN_NAME=blackbox_ubsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB crash testing under undefined behavior sanitizer +# +UBSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb whitebox crash test under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Build and run RocksDB debug whitebox ubsan_crash_test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM whitebox_ubsan_crash_test || $CONTRUN_NAME=whitebox_ubsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -647,21 +823,21 @@ # UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test with atomic flush under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with atomic flush under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug ubsan_crash_test_with_atomic_flush', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 
ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug ubsan_crash_test_with_atomic_flush\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -671,21 +847,21 @@ # UBSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb crash test with txn under UBSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb crash test with txn under UBSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build and run RocksDB debug ubsan_crash_test_with_txn', - 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build and run RocksDB debug ubsan_crash_test_with_txn\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -695,20 +871,20 @@ # VALGRIND_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under valgrind', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Unit Test under valgrind\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit tests', - 'timeout': 86400, - 'shell':'$SHM $DEBUG make $PARALLELISM valgrind_test || 
$CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; $SHM $DEBUG make $PARALLELISM valgrind_test || $CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -718,20 +894,20 @@ # TSAN_UNIT_TEST_COMMANDS="[ { - 'name':'Rocksdb Unit Test under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Unit Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit test', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit test\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -741,21 +917,69 @@ # TSAN_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Crash Test under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Crash Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Compile and run', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + 
$UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB blackbox crash test under TSAN +# +TSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Blackbox Crash Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM blackbox_crash_test || $CONTRUN_NAME=tsan_blackbox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] + $REPORT + } +]" + +# +# RocksDB whitebox crash test under TSAN +# +TSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Whitebox Crash Test under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + $CLEANUP_ENV, + { + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM whitebox_crash_test || $CONTRUN_NAME=tsan_whitebox_crash_test $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -765,21 +989,21 @@ # TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Crash Test with atomic flush under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Crash Test with atomic flush under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Compile and run', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN 
$TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -789,21 +1013,21 @@ # TSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { - 'name':'Rocksdb Crash Test with txn under TSAN', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'timeout': 86400, - 'steps': [ + \"name\":\"Rocksdb Crash Test with txn under TSAN\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ $CLEANUP_ENV, { - 'name':'Compile and run', - 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Compile and run\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make $PARALLELISM crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - $UPLOAD_DB_DIR, - ], + $UPLOAD_DB_DIR + ] $REPORT } ]" @@ -818,23 +1042,25 @@ rm -rf /dev/shm/rocksdb mkdir /dev/shm/rocksdb + export https_proxy="fwdproxy:8080" + tools/check_format_compatible.sh } FORMAT_COMPATIBLE_COMMANDS="[ { - 'name':'Rocksdb Format Compatible tests', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Format Compatible tests\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit test\", + \"shell\":\"cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible 
$TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -852,23 +1078,24 @@ mv .tmp.fbcode_config.sh build_tools/fbcode_config.sh cat Makefile | grep -v tools/ldb_test.py > .tmp.Makefile mv .tmp.Makefile Makefile - make $DEBUG J=1 check + export $SKIP_FORMAT_CHECKS + make $DEBUG $PARALLELISM check } NO_COMPRESSION_COMMANDS="[ { - 'name':'Rocksdb No Compression tests', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb No Compression tests\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Run RocksDB debug unit test\", + \"shell\":\"cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -878,7 +1105,7 @@ # run_regression() { - time -v bash -vx ./build_tools/regression_build_test.sh $(mktemp -d $WORKSPACE/leveldb.XXXX) $(mktemp leveldb_test_stats.XXXX) + time bash -vx ./build_tools/regression_build_test.sh $(mktemp -d $WORKING_DIR/rocksdb.XXXX) $(mktemp rocksdb_test_stats.XXXX) # ======= report size to ODS ======== @@ -895,6 +1122,7 @@ strip librocksdb.a send_size_to_ods static_lib_stripped $(stat --printf="%s" librocksdb.a) + make clean make -j$(nproc) shared_lib send_size_to_ods shared_lib $(stat --printf="%s" `readlink -f librocksdb.so`) strip `readlink -f librocksdb.so` @@ -907,6 +1135,7 @@ strip librocksdb.a send_size_to_ods static_lib_lite_stripped $(stat --printf="%s" librocksdb.a) + make clean make LITE=1 -j$(nproc) shared_lib send_size_to_ods shared_lib_lite $(stat --printf="%s" `readlink -f librocksdb.so`) strip `readlink -f librocksdb.so` @@ -915,17 +1144,18 @@ REGRESSION_COMMANDS="[ { - 'name':'Rocksdb regression commands', - 
'oncall':'$ONCALL', - 'steps': [ + \"name\":\"Rocksdb regression commands\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Make and run script', - 'shell':'build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Make and run script\", + \"shell\":\"cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER - }, - ], + } + ] $REPORT } ]" @@ -935,18 +1165,52 @@ # JAVA_BUILD_TEST_COMMANDS="[ { - 'name':'Rocksdb Java Build', - 'oncall':'$ONCALL', - 'executeLocal': 'true', - 'steps': [ + \"name\":\"Rocksdb Java Build\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"steps\": [ $CLEANUP_ENV, { - 'name':'Build RocksDB for Java', - 'shell':'$SETUP_JAVA_ENV; $SHM make rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL', - 'user':'root', + \"name\":\"Build RocksDB for Java\", + \"shell\":\"cd $WORKING_DIR; $SETUP_JAVA_ENV; $SHM make $PARALLELISM rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + } + ] + $REPORT + } +]" + +# +# RocksDB fbcode stress/crash test +# +FBCODE_STRESS_CRASH_TEST_COMMANDS="[ + { + \"name\":\"Rocksdb Fbcode Stress and Crash Test\", + \"oncall\":\"$ONCALL\", + \"executeLocal\": \"true\", + \"timeout\": 86400, + \"steps\": [ + { + \"name\":\"Copy RocksDB code to fbcode repo\", + \"shell\":\"cd internal_repo_rocksdb/repo && git init -b main && git add * && git commit -a -m \\\"Make internal_repo_rocksdb/repo a git repo\\\" && cd ../.. 
&& echo Y | python3 rocks/release_script/release_to_fbcode.py -u internal_repo_rocksdb/repo main || $CONTRUN_NAME=db_stress_fbcode $TASK_CREATION_TOOL\", + \"user\":\"root\", $PARSER }, - ], + { + \"name\":\"Build RocksDB fbcode stress tests\", + \"shell\":\"cd $WORKING_DIR; buck build @mode/dbg rocks/tools:rocks_db_stress || $CONTRUN_NAME=db_stress_fbcode $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + }, + { + \"name\":\"Run RocksDB whitebox crash tests\", + \"timeout\": 86400, + \"shell\":\"cd $WORKING_DIR; mkdir /dev/shm/rocksdb_fbcode_crash_test && TEST_TMPDIR=\$(mktemp -d --tmpdir=/dev/shm/rocksdb_fbcode_crash_test) python3 rocksdb/src/tools/db_crashtest.py --stress_cmd=buck-out/dbg/gen/rocks/tools/rocks_db_stress -secondary_cache_uri=\\\"$SECONDARY_CACHE_URI\\\" --env_uri=$ENV_URI $EXTRA_DB_STRESS_ARGS -logtostderr=false $TEST_TYPE || $CONTRUN_NAME=db_stress_fbcode $TASK_CREATION_TOOL\", + \"user\":\"root\", + $PARSER + } + ] $REPORT } ]" @@ -986,18 +1250,24 @@ lite) echo $LITE_BUILD_COMMANDS ;; - report_lite_binary_size) - echo $REPORT_LITE_BINARY_SIZE_COMMANDS - ;; stress_crash) echo $STRESS_CRASH_TEST_COMMANDS ;; + blackbox_stress_crash) + echo $BLACKBOX_STRESS_CRASH_TEST_COMMANDS + ;; + whitebox_stress_crash) + echo $WHITEBOX_STRESS_CRASH_TEST_COMMANDS + ;; stress_crash_with_atomic_flush) echo $STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; stress_crash_with_txn) echo $STRESS_CRASH_TEST_WITH_TXN_COMMANDS ;; + stress_crash_with_ts) + echo $STRESS_CRASH_TEST_WITH_TS_COMMANDS + ;; write_stress) echo $WRITE_STRESS_COMMANDS ;; @@ -1007,6 +1277,12 @@ asan_crash) echo $ASAN_CRASH_TEST_COMMANDS ;; + blackbox_asan_crash) + echo $ASAN_BLACKBOX_CRASH_TEST_COMMANDS + ;; + whitebox_asan_crash) + echo $ASAN_WHITEBOX_CRASH_TEST_COMMANDS + ;; asan_crash_with_atomic_flush) echo $ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; @@ -1019,6 +1295,12 @@ ubsan_crash) echo $UBSAN_CRASH_TEST_COMMANDS ;; + blackbox_ubsan_crash) + echo $UBSAN_BLACKBOX_CRASH_TEST_COMMANDS + 
;; + whitebox_ubsan_crash) + echo $UBSAN_WHITEBOX_CRASH_TEST_COMMANDS + ;; ubsan_crash_with_atomic_flush) echo $UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; @@ -1034,6 +1316,12 @@ tsan_crash) echo $TSAN_CRASH_TEST_COMMANDS ;; + blackbox_tsan_crash) + echo $TSAN_BLACKBOX_CRASH_TEST_COMMANDS + ;; + whitebox_tsan_crash) + echo $TSAN_WHITEBOX_CRASH_TEST_COMMANDS + ;; tsan_crash_with_atomic_flush) echo $TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS ;; @@ -1056,11 +1344,18 @@ echo $REGRESSION_COMMANDS ;; run_regression) + set -e run_regression + set +e ;; java_build) echo $JAVA_BUILD_TEST_COMMANDS ;; + fbcode_stress_crash) + set -f + echo $FBCODE_STRESS_CRASH_TEST_COMMANDS + set +f + ;; *) echo "Invalid determinator command" exit 1 diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/run_ci_db_test.ps1 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ if($WorkFolder -eq "") { - # If TEST_TMPDIR is set use it + # If TEST_TMPDIR is set use it [string]$var = $Env:TEST_TMPDIR if($var -eq "") { $WorkFolder = -Join($RootFolder, "\db_tests\") @@ -93,7 +93,7 @@ if($ExcludeCases -ne "") { Write-Host "ExcludeCases: $ExcludeCases" $l = $ExcludeCases -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeCasesSet.Add($t) | Out-Null } } @@ -102,7 +102,7 @@ if($ExcludeExes -ne "") { Write-Host "ExcludeExe: $ExcludeExes" $l = $ExcludeExes -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeExesSet.Add($t) | Out-Null } } @@ -118,6 +118,10 @@ # MultiThreaded/MultiThreadedDBTest. # MultiThreaded/0 # GetParam() = 0 # MultiThreaded/1 # GetParam() = 1 +# RibbonTypeParamTest/0. 
# TypeParam = struct DefaultTypesAndSettings +# CompactnessAndBacktrackAndFpRate +# Extremes +# FindOccupancyForSuccessRate # # into this: # @@ -125,6 +129,9 @@ # DBTest.WriteEmptyBatch # MultiThreaded/MultiThreadedDBTest.MultiThreaded/0 # MultiThreaded/MultiThreadedDBTest.MultiThreaded/1 +# RibbonTypeParamTest/0.CompactnessAndBacktrackAndFpRate +# RibbonTypeParamTest/0.Extremes +# RibbonTypeParamTest/0.FindOccupancyForSuccessRate # # Output into the parameter in a form TestName -> Log File Name function ExtractTestCases([string]$GTestExe, $HashTable) { @@ -138,6 +145,8 @@ ForEach( $l in $Tests) { + # remove trailing comment if any + $l = $l -replace '\s+\#.*','' # Leading whitespace is fine $l = $l -replace '^\s+','' # Trailing dot is a test group but no whitespace @@ -146,8 +155,7 @@ } else { # Otherwise it is a test name, remove leading space $test = $l - # remove trailing comment if any and create a log name - $test = $test -replace '\s+\#.*','' + # create a log name $test = "$Group$test" if($ExcludeCasesSet.Contains($test)) { @@ -253,7 +261,7 @@ $DiscoveredExe = @() dir -Path $search_path | ForEach-Object { - $DiscoveredExe += ($_.Name) + $DiscoveredExe += ($_.Name) } # Remove exclusions @@ -293,7 +301,7 @@ $ListOfExe = @() dir -Path $search_path | ForEach-Object { - $ListOfExe += ($_.Name) + $ListOfExe += ($_.Name) } # Exclude those in RunOnly from running as suites @@ -348,7 +356,7 @@ # Wait for all to finish and get the results while(($JobToLog.Count -gt 0) -or - ($TestCmds.Count -gt 0) -or + ($TestCmds.Count -gt 0) -or ($Suites.Count -gt 0)) { # Make sure we have maximum concurrent jobs running if anything @@ -468,8 +476,8 @@ $EndDate = (Get-Date) -New-TimeSpan -Start $StartDate -End $EndDate | - ForEach-Object { +New-TimeSpan -Start $StartDate -End $EndDate | + ForEach-Object { "Elapsed time: {0:g}" -f $_ } @@ -484,4 +492,4 @@ exit 0 - + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh 
mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/build_tools/setup_centos7.sh 2025-05-19 16:14:27.000000000 +0000 @@ -1,9 +1,9 @@ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -set -e +set -ex -ROCKSDB_VERSION="5.10.3" -ZSTD_VERSION="1.1.3" +ROCKSDB_VERSION="6.7.3" +ZSTD_VERSION="1.4.4" echo "This script configures CentOS with everything needed to build and run RocksDB" @@ -40,5 +40,6 @@ chown -R vagrant:vagrant /usr/local/rocksdb/ sudo -u vagrant make static_lib cd examples/ -sudo -u vagrant make all -sudo -u vagrant ./c_simple_example +sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ make all +sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ ./c_simple_example + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/cache.h" + +#include "cache/lru_cache.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map + lru_cache_options_type_info = { + {"capacity", + {offsetof(struct LRUCacheOptions, capacity), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"num_shard_bits", + {offsetof(struct LRUCacheOptions, num_shard_bits), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strict_capacity_limit", + {offsetof(struct LRUCacheOptions, strict_capacity_limit), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"high_pri_pool_ratio", + {offsetof(struct LRUCacheOptions, high_pri_pool_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; +#endif // ROCKSDB_LITE + +Status SecondaryCache::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + return LoadSharedObject(config_options, value, nullptr, + result); +} + +Status Cache::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + Status status; + std::shared_ptr cache; + if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { +#ifndef ROCKSDB_LITE + LRUCacheOptions cache_opts; + status = OptionTypeInfo::ParseStruct(config_options, "", + &lru_cache_options_type_info, "", + value, &cache_opts); + if (status.ok()) { + cache = NewLRUCache(cache_opts); + } +#else + (void)config_options; + status = Status::NotSupported("Cannot load cache in LITE mode ", value); +#endif //! 
ROCKSDB_LITE + } + if (status.ok()) { + result->swap(cache); + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -1,8 +1,11 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). - +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef GFLAGS #include int main() { @@ -10,272 +13,8 @@ return 1; } #else - -#include -#include -#include - -#include "port/port.h" -#include "rocksdb/cache.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "util/gflags_compat.h" -#include "util/mutexlock.h" -#include "util/random.h" - -using GFLAGS_NAMESPACE::ParseCommandLineFlags; - -static const uint32_t KB = 1024; - -DEFINE_int32(threads, 16, "Number of concurrent threads to run."); -DEFINE_int64(cache_size, 8 * KB * KB, - "Number of bytes to use as a cache of uncompressed data."); -DEFINE_int32(num_shard_bits, 4, "shard_bits."); - -DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache"); -DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); - -DEFINE_bool(populate_cache, false, "Populate cache before operations"); -DEFINE_int32(insert_percent, 40, - "Ratio of insert to total workload (expressed as a percentage)"); -DEFINE_int32(lookup_percent, 50, - "Ratio of lookup to total workload (expressed as a percentage)"); -DEFINE_int32(erase_percent, 10, - "Ratio of erase to total workload (expressed as a percentage)"); - -DEFINE_bool(use_clock_cache, false, ""); - -namespace ROCKSDB_NAMESPACE { - -class CacheBench; -namespace { -void deleter(const Slice& /*key*/, void* value) { - delete reinterpret_cast(value); -} - -// State shared by all concurrent executions of the same benchmark. 
-class SharedState { - public: - explicit SharedState(CacheBench* cache_bench) - : cv_(&mu_), - num_threads_(FLAGS_threads), - num_initialized_(0), - start_(false), - num_done_(0), - cache_bench_(cache_bench) { - } - - ~SharedState() {} - - port::Mutex* GetMutex() { - return &mu_; - } - - port::CondVar* GetCondVar() { - return &cv_; - } - - CacheBench* GetCacheBench() const { - return cache_bench_; - } - - void IncInitialized() { - num_initialized_++; - } - - void IncDone() { - num_done_++; - } - - bool AllInitialized() const { - return num_initialized_ >= num_threads_; - } - - bool AllDone() const { - return num_done_ >= num_threads_; - } - - void SetStart() { - start_ = true; - } - - bool Started() const { - return start_; - } - - private: - port::Mutex mu_; - port::CondVar cv_; - - const uint64_t num_threads_; - uint64_t num_initialized_; - bool start_; - uint64_t num_done_; - - CacheBench* cache_bench_; -}; - -// Per-thread state for concurrent executions of the same benchmark. -struct ThreadState { - uint32_t tid; - Random rnd; - SharedState* shared; - - ThreadState(uint32_t index, SharedState* _shared) - : tid(index), rnd(1000 + index), shared(_shared) {} -}; -} // namespace - -class CacheBench { - public: - CacheBench() : num_threads_(FLAGS_threads) { - if (FLAGS_use_clock_cache) { - cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); - if (!cache_) { - fprintf(stderr, "Clock cache not supported.\n"); - exit(1); - } - } else { - cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits); - } - } - - ~CacheBench() {} - - void PopulateCache() { - Random rnd(1); - for (int64_t i = 0; i < FLAGS_cache_size; i++) { - uint64_t rand_key = rnd.Next() % FLAGS_max_key; - // Cast uint64* to be char*, data would be copied to cache - Slice key(reinterpret_cast(&rand_key), 8); - // do insert - cache_->Insert(key, new char[10], 1, &deleter); - } - } - - bool Run() { - ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default(); - - PrintEnv(); - 
SharedState shared(this); - std::vector threads(num_threads_); - for (uint32_t i = 0; i < num_threads_; i++) { - threads[i] = new ThreadState(i, &shared); - env->StartThread(ThreadBody, threads[i]); - } - { - MutexLock l(shared.GetMutex()); - while (!shared.AllInitialized()) { - shared.GetCondVar()->Wait(); - } - // Record start time - uint64_t start_time = env->NowMicros(); - - // Start all threads - shared.SetStart(); - shared.GetCondVar()->SignalAll(); - - // Wait threads to complete - while (!shared.AllDone()) { - shared.GetCondVar()->Wait(); - } - - // Record end time - uint64_t end_time = env->NowMicros(); - double elapsed = static_cast(end_time - start_time) * 1e-6; - uint32_t qps = static_cast( - static_cast(FLAGS_threads * FLAGS_ops_per_thread) / elapsed); - fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps); - } - return true; - } - - private: - std::shared_ptr cache_; - uint32_t num_threads_; - - static void ThreadBody(void* v) { - ThreadState* thread = reinterpret_cast(v); - SharedState* shared = thread->shared; - - { - MutexLock l(shared->GetMutex()); - shared->IncInitialized(); - if (shared->AllInitialized()) { - shared->GetCondVar()->SignalAll(); - } - while (!shared->Started()) { - shared->GetCondVar()->Wait(); - } - } - thread->shared->GetCacheBench()->OperateCache(thread); - - { - MutexLock l(shared->GetMutex()); - shared->IncDone(); - if (shared->AllDone()) { - shared->GetCondVar()->SignalAll(); - } - } - } - - void OperateCache(ThreadState* thread) { - for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key; - // Cast uint64* to be char*, data would be copied to cache - Slice key(reinterpret_cast(&rand_key), 8); - int32_t prob_op = thread->rnd.Uniform(100); - if (prob_op >= 0 && prob_op < FLAGS_insert_percent) { - // do insert - cache_->Insert(key, new char[10], 1, &deleter); - } else if (prob_op -= FLAGS_insert_percent && - prob_op < FLAGS_lookup_percent) { - // do lookup - 
auto handle = cache_->Lookup(key); - if (handle) { - cache_->Release(handle); - } - } else if (prob_op -= FLAGS_lookup_percent && - prob_op < FLAGS_erase_percent) { - // do erase - cache_->Erase(key); - } - } - } - - void PrintEnv() const { - printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); - printf("Number of threads : %d\n", FLAGS_threads); - printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); - printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); - printf("Num shard bits : %d\n", FLAGS_num_shard_bits); - printf("Max key : %" PRIu64 "\n", FLAGS_max_key); - printf("Populate cache : %d\n", FLAGS_populate_cache); - printf("Insert percentage : %d%%\n", FLAGS_insert_percent); - printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); - printf("Erase percentage : %d%%\n", FLAGS_erase_percent); - printf("----------------------------\n"); - } -}; -} // namespace ROCKSDB_NAMESPACE - +#include "rocksdb/cache_bench_tool.h" int main(int argc, char** argv) { - ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_threads <= 0) { - fprintf(stderr, "threads number <= 0\n"); - exit(1); - } - - ROCKSDB_NAMESPACE::CacheBench bench; - if (FLAGS_populate_cache) { - bench.PopulateCache(); - } - if (bench.Run()) { - return 0; - } else { - return 1; - } + return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv); } - #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_bench_tool.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,794 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl/db_impl.h" +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table_properties.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/cachable_entry.h" +#include "util/coding.h" +#include "util/gflags_compat.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +static constexpr uint32_t KiB = uint32_t{1} << 10; +static constexpr uint32_t MiB = KiB << 10; +static constexpr uint64_t GiB = MiB << 10; + +DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); +DEFINE_uint64(cache_size, 1 * GiB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_uint32(num_shard_bits, 6, "shard_bits."); + +DEFINE_double(resident_ratio, 0.25, + "Ratio of keys fitting in cache to keyspace."); +DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread."); +DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); + +DEFINE_uint32(skew, 5, "Degree of skew in key selection"); +DEFINE_bool(populate_cache, true, "Populate cache before operations"); + +DEFINE_uint32(lookup_insert_percent, 87, + "Ratio of lookup (+ insert on not found) to total workload " + "(expressed as a percentage)"); +DEFINE_uint32(insert_percent, 2, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_uint32(lookup_percent, 10, + "Ratio of lookup to total workload 
(expressed as a percentage)"); +DEFINE_uint32(erase_percent, 1, + "Ratio of erase to total workload (expressed as a percentage)"); +DEFINE_bool(gather_stats, false, + "Whether to periodically simulate gathering block cache stats, " + "using one more thread."); +DEFINE_uint32( + gather_stats_sleep_ms, 1000, + "How many milliseconds to sleep between each gathering of stats."); + +DEFINE_uint32(gather_stats_entries_per_lock, 256, + "For Cache::ApplyToAllEntries"); +DEFINE_bool(skewed, false, "If true, skew the key access distribution"); +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a custom secondary cache object"); +static class std::shared_ptr secondary_cache; +#endif // ROCKSDB_LITE + +DEFINE_bool(use_clock_cache, false, ""); + +// ## BEGIN stress_cache_key sub-tool options ## +DEFINE_bool(stress_cache_key, false, + "If true, run cache key stress test instead"); +DEFINE_uint32(sck_files_per_day, 2500000, + "(-stress_cache_key) Simulated files generated per day"); +DEFINE_uint32(sck_duration, 90, + "(-stress_cache_key) Number of days to simulate in each run"); +DEFINE_uint32( + sck_min_collision, 15, + "(-stress_cache_key) Keep running until this many collisions seen"); +DEFINE_uint32( + sck_file_size_mb, 32, + "(-stress_cache_key) Simulated file size in MiB, for accounting purposes"); +DEFINE_uint32(sck_reopen_nfiles, 100, + "(-stress_cache_key) Re-opens DB average every n files"); +DEFINE_uint32( + sck_restarts_per_day, 24, + "(-stress_cache_key) Simulated process restarts per day (across DBs)"); +DEFINE_uint32(sck_db_count, 100, + "(-stress_cache_key) Parallel DBs in operation"); +DEFINE_uint32(sck_table_bits, 20, + "(-stress_cache_key) Log2 number of tracked files"); +DEFINE_uint32(sck_keep_bits, 50, + "(-stress_cache_key) Number of cache key bits to keep"); +DEFINE_bool(sck_randomize, false, + "(-stress_cache_key) Randomize (hash) cache key"); +DEFINE_bool(sck_footer_unique_id, false, + "(-stress_cache_key) Simulate 
using proposed footer unique id"); +// ## END stress_cache_key sub-tool options ## + +namespace ROCKSDB_NAMESPACE { + +class CacheBench; +namespace { +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) {} + + ~SharedState() {} + + port::Mutex* GetMutex() { return &mu_; } + + port::CondVar* GetCondVar() { return &cv_; } + + CacheBench* GetCacheBench() const { return cache_bench_; } + + void IncInitialized() { num_initialized_++; } + + void IncDone() { num_done_++; } + + bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; } + + bool AllDone() const { return num_done_ >= FLAGS_threads; } + + void SetStart() { start_ = true; } + + bool Started() const { return start_; } + + private: + port::Mutex mu_; + port::CondVar cv_; + + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. 
+struct ThreadState { + uint32_t tid; + Random64 rnd; + SharedState* shared; + HistogramImpl latency_ns_hist; + uint64_t duration_us = 0; + + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rnd(1000 + index), shared(_shared) {} +}; + +struct KeyGen { + char key_data[27]; + + Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) { + uint64_t key = 0; + if (!FLAGS_skewed) { + uint64_t raw = rnd.Next(); + // Skew according to setting + for (uint32_t i = 0; i < FLAGS_skew; ++i) { + raw = std::min(raw, rnd.Next()); + } + key = FastRange64(raw, max_key); + } else { + key = rnd.Skewed(max_log); + if (key > max_key) { + key -= max_key; + } + } + // Variable size and alignment + size_t off = key % 8; + key_data[0] = char{42}; + EncodeFixed64(key_data + 1, key); + key_data[9] = char{11}; + EncodeFixed64(key_data + 10, key); + key_data[18] = char{4}; + EncodeFixed64(key_data + 19, key); + return Slice(&key_data[off], sizeof(key_data) - off); + } +}; + +char* createValue(Random64& rnd) { + char* rv = new char[FLAGS_value_bytes]; + // Fill with some filler data, and take some CPU time + for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { + EncodeFixed64(rv + i, rnd.Next()); + } + return rv; +} + +// Callbacks for secondary cache +size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; } + +Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) { + memcpy(out, obj, size); + return Status::OK(); +} + +// Different deleters to simulate using deleter to gather +// stats on the code origin and kind of cache entries. 
+void deleter1(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter2(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter3(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} + +Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1); +Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2); +Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3); +} // namespace + +class CacheBench { + static constexpr uint64_t kHundredthUint64 = + std::numeric_limits::max() / 100U; + + public: + CacheBench() + : max_key_(static_cast(FLAGS_cache_size / FLAGS_resident_ratio / + FLAGS_value_bytes)), + lookup_insert_threshold_(kHundredthUint64 * + FLAGS_lookup_insert_percent), + insert_threshold_(lookup_insert_threshold_ + + kHundredthUint64 * FLAGS_insert_percent), + lookup_threshold_(insert_threshold_ + + kHundredthUint64 * FLAGS_lookup_percent), + erase_threshold_(lookup_threshold_ + + kHundredthUint64 * FLAGS_erase_percent), + skewed_(FLAGS_skewed) { + if (erase_threshold_ != 100U * kHundredthUint64) { + fprintf(stderr, "Percentages must add to 100.\n"); + exit(1); + } + + max_log_ = 0; + if (skewed_) { + uint64_t max_key = max_key_; + while (max_key >>= 1) max_log_++; + if (max_key > (static_cast(1) << max_log_)) max_log_++; + } + + if (FLAGS_use_clock_cache) { + cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); + if (!cache_) { + fprintf(stderr, "Clock cache not supported.\n"); + exit(1); + } + } else { + LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false, 0.5); +#ifndef ROCKSDB_LITE + if (!FLAGS_secondary_cache_uri.empty()) { + Status s = SecondaryCache::CreateFromString( + ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf( + stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + opts.secondary_cache = 
secondary_cache; + } +#endif // ROCKSDB_LITE + + cache_ = NewLRUCache(opts); + } + } + + ~CacheBench() {} + + void PopulateCache() { + Random64 rnd(1); + KeyGen keygen; + for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { + cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), createValue(rnd), + &helper1, FLAGS_value_bytes); + } + } + + bool Run() { + const auto clock = SystemClock::Default().get(); + + PrintEnv(); + SharedState shared(this); + std::vector > threads(FLAGS_threads); + for (uint32_t i = 0; i < FLAGS_threads; i++) { + threads[i].reset(new ThreadState(i, &shared)); + std::thread(ThreadBody, threads[i].get()).detach(); + } + + HistogramImpl stats_hist; + std::string stats_report; + std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report); + + uint64_t start_time; + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + start_time = clock->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + } + + // Stats gathering is considered background work. This time measurement + // is for foreground work, and not really ideal for that. See below. + uint64_t end_time = clock->NowMicros(); + stats_thread.join(); + + // Wall clock time - includes idle time if threads + // finish at different times (not ideal). 
+ double elapsed_secs = static_cast(end_time - start_time) * 1e-6; + uint32_t ops_per_sec = static_cast( + 1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs); + printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs, + ops_per_sec); + + // Total time in each thread (more accurate throughput measure) + elapsed_secs = 0; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + elapsed_secs += threads[i]->duration_us * 1e-6; + } + ops_per_sec = static_cast(1.0 * FLAGS_threads * + FLAGS_ops_per_thread / elapsed_secs); + printf("Thread ops/sec = %u\n", ops_per_sec); + + printf("\nOperation latency (ns):\n"); + HistogramImpl combined; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + combined.Merge(threads[i]->latency_ns_hist); + } + printf("%s", combined.ToString().c_str()); + + if (FLAGS_gather_stats) { + printf("\nGather stats latency (us):\n"); + printf("%s", stats_hist.ToString().c_str()); + } + + printf("\n%s", stats_report.c_str()); + + return true; + } + + private: + std::shared_ptr cache_; + const uint64_t max_key_; + // Cumulative thresholds in the space of a random uint64_t + const uint64_t lookup_insert_threshold_; + const uint64_t insert_threshold_; + const uint64_t lookup_threshold_; + const uint64_t erase_threshold_; + const bool skewed_; + int max_log_; + + // A benchmark version of gathering stats on an active block cache by + // iterating over it. The primary purpose is to measure the impact of + // gathering stats with ApplyToAllEntries on throughput- and + // latency-sensitive Cache users. Performance of stats gathering is + // also reported. The last set of gathered stats is also reported, for + // manual sanity checking for logical errors or other unexpected + // behavior of cache_bench or the underlying Cache. 
+ static void StatsBody(SharedState* shared, HistogramImpl* stats_hist, + std::string* stats_report) { + if (!FLAGS_gather_stats) { + return; + } + const auto clock = SystemClock::Default().get(); + uint64_t total_key_size = 0; + uint64_t total_charge = 0; + uint64_t total_entry_count = 0; + std::set deleters; + StopWatchNano timer(clock); + + for (;;) { + uint64_t time; + time = clock->NowMicros(); + uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000; + + { + MutexLock l(shared->GetMutex()); + for (;;) { + if (shared->AllDone()) { + std::ostringstream ostr; + ostr << "Most recent cache entry stats:\n" + << "Number of entries: " << total_entry_count << "\n" + << "Total charge: " << BytesToHumanString(total_charge) << "\n" + << "Average key size: " + << (1.0 * total_key_size / total_entry_count) << "\n" + << "Average charge: " + << BytesToHumanString(static_cast( + 1.0 * total_charge / total_entry_count)) + << "\n" + << "Unique deleters: " << deleters.size() << "\n"; + *stats_report = ostr.str(); + return; + } + if (clock->NowMicros() >= deadline) { + break; + } + uint64_t diff = deadline - std::min(clock->NowMicros(), deadline); + shared->GetCondVar()->TimedWait(diff + 1); + } + } + + // Now gather stats, outside of mutex + total_key_size = 0; + total_charge = 0; + total_entry_count = 0; + deleters.clear(); + auto fn = [&](const Slice& key, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + total_key_size += key.size(); + total_charge += charge; + ++total_entry_count; + // Something slightly more expensive as in (future) stats by category + deleters.insert(deleter); + }; + timer.Start(); + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; + shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); + stats_hist->Add(timer.ElapsedNanos() / 1000); + } + } + + static void ThreadBody(ThreadState* thread) { + SharedState* shared = thread->shared; + + { + MutexLock 
l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + // To use looked-up values + uint64_t result = 0; + // To hold handles for a non-trivial amount of time + Cache::Handle* handle = nullptr; + KeyGen gen; + const auto clock = SystemClock::Default().get(); + uint64_t start_time = clock->NowMicros(); + StopWatchNano timer(clock); + + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + timer.Start(); + Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); + uint64_t random_op = thread->rnd.Next(); + Cache::CreateCallback create_cb = + [](void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + *out_obj = reinterpret_cast(new char[size]); + memcpy(*out_obj, buf, size); + *charge = size; + return Status::OK(); + }; + + if (random_op < lookup_insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } else { + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper2, + FLAGS_value_bytes, &handle); + } + } else if (random_op < insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper3, + FLAGS_value_bytes, &handle); + } else if (random_op < lookup_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, 
Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } + } else if (random_op < erase_threshold_) { + // do erase + cache_->Erase(key); + } else { + // Should be extremely unlikely (noop) + assert(random_op >= kHundredthUint64 * 100U); + } + thread->latency_ns_hist.Add(timer.ElapsedNanos()); + } + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // Ensure computations on `result` are not optimized away. + if (result == 1) { + printf("You are extremely unlucky(2). Try again.\n"); + exit(1); + } + thread->duration_us = clock->NowMicros() - start_time; + } + + void PrintEnv() const { + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Number of threads : %u\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %s\n", + BytesToHumanString(FLAGS_cache_size).c_str()); + printf("Num shard bits : %u\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", max_key_); + printf("Resident ratio : %g\n", FLAGS_resident_ratio); + printf("Skew degree : %u\n", FLAGS_skew); + printf("Populate cache : %d\n", int{FLAGS_populate_cache}); + printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); + printf("Insert percentage : %u%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %u%%\n", FLAGS_erase_percent); + std::ostringstream stats; + if (FLAGS_gather_stats) { + stats << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, " + << FLAGS_gather_stats_entries_per_lock << "/lock)"; + } else { + stats << "disabled"; + } + printf("Gather stats : %s\n", stats.str().c_str()); + printf("----------------------------\n"); + } +}; + +// TODO: better description (see PR #9126 for some info) +class StressCacheKey { + public: + void Run() { + if (FLAGS_sck_footer_unique_id) { + FLAGS_sck_db_count = 1; + } + + 
uint64_t mb_per_day = + uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb; + printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n", + FLAGS_sck_file_size_mb / 1024.0 / 1024.0 * + std::pow(2.0, FLAGS_sck_table_bits), + mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0); + multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) / + (FLAGS_sck_file_size_mb * 1024.0 * 1024.0); + printf( + "Multiply by %g to correct for simulation losses (but still assume " + "whole file cached)\n", + multiplier_); + restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day; + double without_ejection = + std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day; + printf( + "Without ejection, expect random collision after %g days (%g " + "corrected)\n", + without_ejection, without_ejection * multiplier_); + double with_full_table = + std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) / + FLAGS_sck_files_per_day; + printf( + "With ejection and full table, expect random collision after %g " + "days (%g corrected)\n", + with_full_table, with_full_table * multiplier_); + collisions_ = 0; + + for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) { + RunOnce(); + if (collisions_ == 0) { + printf( + "No collisions after %d x %u days " + " \n", + i, FLAGS_sck_duration); + } else { + double est = 1.0 * i * FLAGS_sck_duration / collisions_; + printf("%" PRIu64 + " collisions after %d x %u days, est %g days between (%g " + "corrected) \n", + collisions_, i, FLAGS_sck_duration, est, est * multiplier_); + } + } + } + + void RunOnce() { + const size_t db_count = FLAGS_sck_db_count; + dbs_.reset(new TableProperties[db_count]{}); + const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1; + table_.reset(new uint64_t[table_mask + 1]{}); + if (FLAGS_sck_keep_bits > 64) { + FLAGS_sck_keep_bits = 64; + } + uint32_t shift_away = 64 - FLAGS_sck_keep_bits; + uint32_t shift_away_b = shift_away / 3; + uint32_t shift_away_a = shift_away - 
shift_away_b; + + process_count_ = 0; + session_count_ = 0; + ResetProcess(); + + Random64 r{std::random_device{}()}; + + uint64_t max_file_count = + uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_duration; + uint64_t file_count = 0; + uint32_t report_count = 0; + uint32_t collisions_this_run = 0; + // Round robin through DBs + for (size_t db_i = 0;; ++db_i) { + if (db_i >= db_count) { + db_i = 0; + } + if (file_count >= max_file_count) { + break; + } + if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) { + ResetSession(db_i); + } else if (r.OneIn(restart_nfiles_)) { + ResetProcess(); + } + OffsetableCacheKey ock; + dbs_[db_i].orig_file_number += 1; + // skip some file numbers, unless 1 DB so that that can simulate + // better (DB-independent) unique IDs + if (db_count > 1) { + dbs_[db_i].orig_file_number += (r.Next() & 3); + } + BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], "", 42, 42, &ock); + CacheKey ck = ock.WithOffset(0); + uint64_t stripped; + if (FLAGS_sck_randomize) { + stripped = GetSliceHash64(ck.AsSlice()) >> shift_away; + } else if (FLAGS_sck_footer_unique_id) { + uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a; + uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b; + stripped = (uint64_t{a} << 32) + b; + } else { + uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a; + uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b; + stripped = (uint64_t{a} << 32) + b; + } + if (stripped == 0) { + // Unlikely, but we need to exclude tracking this value + printf("Hit Zero! 
\n"); + continue; + } + file_count++; + uint64_t h = NPHash64(reinterpret_cast(&stripped), 8); + // Skew lifetimes + size_t pos = + std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask); + if (table_[pos] == stripped) { + collisions_this_run++; + // To predict probability of no collisions, we have to get rid of + // correlated collisions, which this takes care of: + ResetProcess(); + } else { + // Replace + table_[pos] = stripped; + } + + if (++report_count == FLAGS_sck_files_per_day) { + report_count = 0; + // Estimate fill % + size_t incr = table_mask / 1000; + size_t sampled_count = 0; + for (size_t i = 0; i <= table_mask; i += incr) { + if (table_[i] != 0) { + sampled_count++; + } + } + // Report + printf( + "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 + " sess, %u coll, occ %g%%, ejected %g%% \r", + file_count / FLAGS_sck_files_per_day, process_count_, + session_count_, collisions_this_run, 100.0 * sampled_count / 1000.0, + 100.0 * (1.0 - sampled_count / 1000.0 * table_mask / file_count)); + fflush(stdout); + } + } + collisions_ += collisions_this_run; + } + + void ResetSession(size_t i) { + dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr); + session_count_++; + } + + void ResetProcess() { + process_count_++; + DBImpl::TEST_ResetDbSessionIdGen(); + for (size_t i = 0; i < FLAGS_sck_db_count; ++i) { + ResetSession(i); + } + if (FLAGS_sck_footer_unique_id) { + dbs_[0].orig_file_number = 0; + } + } + + private: + // Use db_session_id and orig_file_number from TableProperties + std::unique_ptr dbs_; + std::unique_ptr table_; + uint64_t process_count_ = 0; + uint64_t session_count_ = 0; + uint64_t collisions_ = 0; + uint32_t restart_nfiles_ = 0; + double multiplier_ = 0.0; +}; + +int cache_bench_tool(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_stress_cache_key) { + // Alternate tool + StressCacheKey().Run(); + return 0; + } + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + 
exit(1); + } + + ROCKSDB_NAMESPACE::CacheBench bench; + if (FLAGS_populate_cache) { + bench.PopulateCache(); + printf("Population complete\n"); + printf("----------------------------\n"); + } + if (bench.Run()) { + return 0; + } else { + return 1; + } +} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,70 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/cache_entry_roles.h" + +#include + +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { + +std::array kCacheEntryRoleToCamelString{{ + "DataBlock", + "FilterBlock", + "FilterMetaBlock", + "DeprecatedFilterBlock", + "IndexBlock", + "OtherBlock", + "WriteBuffer", + "CompressionDictionaryBuildingBuffer", + "FilterConstruction", + "Misc", +}}; + +std::array kCacheEntryRoleToHyphenString{{ + "data-block", + "filter-block", + "filter-meta-block", + "deprecated-filter-block", + "index-block", + "other-block", + "write-buffer", + "compression-dictionary-building-buffer", + "filter-construction", + "misc", +}}; + +namespace { + +struct Registry { + std::mutex mutex; + std::unordered_map role_map; + void Register(Cache::DeleterFn fn, CacheEntryRole role) { + std::lock_guard lock(mutex); + role_map[fn] = role; + } + std::unordered_map Copy() { + std::lock_guard lock(mutex); + return role_map; + } +}; + +Registry& GetRegistry() { + STATIC_AVOID_DESTRUCTION(Registry, registry); + return 
registry; +} + +} // namespace + +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) { + GetRegistry().Register(fn, role); +} + +std::unordered_map CopyCacheDeleterRoleMap() { + return GetRegistry().Copy(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_roles.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_roles.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,134 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/cache.h" + +namespace ROCKSDB_NAMESPACE { + +// Classifications of block cache entries, for reporting statistics +// Adding new enum to this class requires corresponding updates to +// kCacheEntryRoleToCamelString and kCacheEntryRoleToHyphenString +enum class CacheEntryRole { + // Block-based table data block + kDataBlock, + // Block-based table filter block (full or partitioned) + kFilterBlock, + // Block-based table metadata block for partitioned filter + kFilterMetaBlock, + // Block-based table deprecated filter block (old "block-based" filter) + kDeprecatedFilterBlock, + // Block-based table index block + kIndexBlock, + // Other kinds of block-based table block + kOtherBlock, + // WriteBufferManager reservations to account for memtable usage + kWriteBuffer, + // BlockBasedTableBuilder reservations to account for + // compression dictionary building buffer's memory usage + kCompressionDictionaryBuildingBuffer, + // Filter reservations to account for + // (new) bloom and ribbon 
filter construction's memory usage + kFilterConstruction, + // Default bucket, for miscellaneous cache entries. Do not use for + // entries that could potentially add up to large usage. + kMisc, +}; +constexpr uint32_t kNumCacheEntryRoles = + static_cast(CacheEntryRole::kMisc) + 1; + +extern std::array + kCacheEntryRoleToCamelString; +extern std::array + kCacheEntryRoleToHyphenString; + +// To associate cache entries with their role, we use a hack on the +// existing Cache interface. Because the deleter of an entry can authenticate +// the code origin of an entry, we can elaborate the choice of deleter to +// also encode role information, without inferring false role information +// from entries not choosing to encode a role. +// +// The rest of this file is for handling mappings between deleters and +// roles. + +// To infer a role from a deleter, the deleter must be registered. This +// can be done "manually" with this function. This function is thread-safe, +// and the registration mappings go into private but static storage. (Note +// that DeleterFn is a function pointer, not std::function. Registrations +// should not be too many.) +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role); + +// Gets a copy of the registered deleter -> role mappings. This is the only +// function for reading the mappings made with RegisterCacheDeleterRole. +// Why only this interface for reading? +// * This function has to be thread safe, which could incur substantial +// overhead. We should not pay this overhead for every deleter look-up. +// * This is suitable for preparing for batch operations, like with +// CacheEntryStatsCollector. +// * The number of mappings should be sufficiently small (dozens). +std::unordered_map CopyCacheDeleterRoleMap(); + +// ************************************************************** // +// An automatic registration infrastructure. 
This enables code +// to simply ask for a deleter associated with a particular type +// and role, and registration is automatic. In a sense, this is +// a small dependency injection infrastructure, because linking +// in new deleter instantiations is essentially sufficient for +// making stats collection (using CopyCacheDeleterRoleMap) aware +// of them. + +namespace cache_entry_roles_detail { + +template +struct RegisteredDeleter { + RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); } + + // These have global linkage to help ensure compiler optimizations do not + // break uniqueness for each + static void Delete(const Slice& /* key */, void* value) { + // Supports T == Something[], unlike delete operator + std::default_delete()( + static_cast::type*>(value)); + } +}; + +template +struct RegisteredNoopDeleter { + RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); } + + static void Delete(const Slice& /* key */, void* /* value */) { + // Here was `assert(value == nullptr);` but we can also put pointers + // to static data in Cache, for testing at least. + } +}; + +} // namespace cache_entry_roles_detail + +// Get an automatically registered deleter for value type T and role R. +// Based on C++ semantics, registration is invoked exactly once in a +// thread-safe way on first call to this function, for each . +template +Cache::DeleterFn GetCacheEntryDeleterForRole() { + static cache_entry_roles_detail::RegisteredDeleter reg; + return reg.Delete; +} + +// Get an automatically registered no-op deleter (value should be nullptr) +// and associated with role R. This is used for Cache "reservation" entries +// such as for WriteBufferManager. 
+template +Cache::DeleterFn GetNoopDeleterForRole() { + static cache_entry_roles_detail::RegisteredNoopDeleter reg; + return reg.Delete; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_entry_stats.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_entry_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,183 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "cache/cache_helpers.h" +#include "cache/cache_key.h" +#include "port/lang.h" +#include "rocksdb/cache.h" +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding_lean.h" + +namespace ROCKSDB_NAMESPACE { + +// A generic helper object for gathering stats about cache entries by +// iterating over them with ApplyToAllEntries. This class essentially +// solves the problem of slowing down a Cache with too many stats +// collectors that could be sharing stat results, such as from multiple +// column families or multiple DBs sharing a Cache. We employ a few +// mitigations: +// * Only one collector for a particular kind of Stats is alive +// for each Cache. This is guaranteed using the Cache itself to hold +// the collector. +// * A mutex ensures only one thread is gathering stats for this +// collector. +// * The most recent gathered stats are saved and simply copied to +// satisfy requests within a time window (default: 3 minutes) of +// completion of the most recent stat gathering. 
+// +// Template parameter Stats must be copyable and trivially constructable, +// as well as... +// concept Stats { +// // Notification before applying callback to all entries +// void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); +// // Get the callback to apply to all entries. `callback` +// // type must be compatible with Cache::ApplyToAllEntries +// callback GetEntryCallback(); +// // Notification after applying callback to all entries +// void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); +// // Notification that a collection was skipped because of +// // sufficiently recent saved results. +// void SkippedCollection(); +// } +template +class CacheEntryStatsCollector { + public: + // Gather and save stats if saved stats are too old. (Use GetStats() to + // read saved stats.) + // + // Maximum allowed age for a "hit" on saved results is determined by the + // two interval parameters. Both set to 0 forces a re-scan. For example + // with min_interval_seconds=300 and min_interval_factor=100, if the last + // scan took 10s, we would only rescan ("miss") if the age in seconds of + // the saved results is > max(300, 100*10). + // Justification: scans can vary wildly in duration, e.g. from 0.02 sec + // to as much as 20 seconds, so we want to be able to cap the absolute + // and relative frequency of scans. 
+ void CollectStats(int min_interval_seconds, int min_interval_factor) { + // Waits for any pending reader or writer (collector) + std::lock_guard lock(working_mutex_); + + uint64_t max_age_micros = + static_cast(std::max(min_interval_seconds, 0)) * 1000000U; + + if (last_end_time_micros_ > last_start_time_micros_ && + min_interval_factor > 0) { + max_age_micros = std::max( + max_age_micros, min_interval_factor * (last_end_time_micros_ - + last_start_time_micros_)); + } + + uint64_t start_time_micros = clock_->NowMicros(); + if ((start_time_micros - last_end_time_micros_) > max_age_micros) { + last_start_time_micros_ = start_time_micros; + working_stats_.BeginCollection(cache_, clock_, start_time_micros); + + cache_->ApplyToAllEntries(working_stats_.GetEntryCallback(), {}); + TEST_SYNC_POINT_CALLBACK( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr); + + uint64_t end_time_micros = clock_->NowMicros(); + last_end_time_micros_ = end_time_micros; + working_stats_.EndCollection(cache_, clock_, end_time_micros); + } else { + working_stats_.SkippedCollection(); + } + + // Save so that we don't need to wait for an outstanding collection in + // order to make of copy of the last saved stats + std::lock_guard lock2(saved_mutex_); + saved_stats_ = working_stats_; + } + + // Gets saved stats, regardless of age + void GetStats(Stats *stats) { + std::lock_guard lock(saved_mutex_); + *stats = saved_stats_; + } + + Cache *GetCache() const { return cache_; } + + // Gets or creates a shared instance of CacheEntryStatsCollector in the + // cache itself, and saves into `ptr`. This shared_ptr will hold the + // entry in cache until all refs are destroyed. + static Status GetShared(Cache *cache, SystemClock *clock, + std::shared_ptr *ptr) { + const Slice &cache_key = GetCacheKey(); + + Cache::Handle *h = cache->Lookup(cache_key); + if (h == nullptr) { + // Not yet in cache, but Cache doesn't provide a built-in way to + // avoid racing insert. 
So we double-check under a shared mutex, + // inspired by TableCache. + STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex); + std::lock_guard lock(static_mutex); + + h = cache->Lookup(cache_key); + if (h == nullptr) { + auto new_ptr = new CacheEntryStatsCollector(cache, clock); + // TODO: non-zero charge causes some tests that count block cache + // usage to go flaky. Fix the problem somehow so we can use an + // accurate charge. + size_t charge = 0; + Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h, + Cache::Priority::HIGH); + if (!s.ok()) { + assert(h == nullptr); + delete new_ptr; + return s; + } + } + } + // If we reach here, shared entry is in cache with handle `h`. + assert(cache->GetDeleter(h) == Deleter); + + // Build an aliasing shared_ptr that keeps `ptr` in cache while there + // are references. + *ptr = MakeSharedCacheHandleGuard(cache, h); + return Status::OK(); + } + + private: + explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock) + : saved_stats_(), + working_stats_(), + last_start_time_micros_(0), + last_end_time_micros_(/*pessimistic*/ 10000000), + cache_(cache), + clock_(clock) {} + + static void Deleter(const Slice &, void *value) { + delete static_cast(value); + } + + static const Slice &GetCacheKey() { + // For each template instantiation + static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime(); + static Slice ckey_slice = ckey.AsSlice(); + return ckey_slice; + } + + std::mutex saved_mutex_; + Stats saved_stats_; + + std::mutex working_mutex_; + Stats working_stats_; + uint64_t last_start_time_micros_; + uint64_t last_end_time_micros_; + + Cache *const cache_; + SystemClock *const clock_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_helpers.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_helpers.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_helpers.h 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_helpers.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,125 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the cached value given a cache handle. +template +T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) { + assert(cache); + assert(handle); + + return static_cast(cache->Value(handle)); +} + +// Simple generic deleter for Cache (to be used with Cache::Insert). +template +void DeleteCacheEntry(const Slice& /* key */, void* value) { + delete static_cast(value); +} + +// Turns a T* into a Slice so it can be used as a key with Cache. +template +Slice GetSlice(const T* t) { + return Slice(reinterpret_cast(t), sizeof(T)); +} + +// Generic resource management object for cache handles that releases the handle +// when destroyed. Has unique ownership of the handle, so copying it is not +// allowed, while moving it transfers ownership. 
+template +class CacheHandleGuard { + public: + CacheHandleGuard() = default; + + CacheHandleGuard(Cache* cache, Cache::Handle* handle) + : cache_(cache), + handle_(handle), + value_(GetFromCacheHandle(cache, handle)) { + assert(cache_ && handle_ && value_); + } + + CacheHandleGuard(const CacheHandleGuard&) = delete; + CacheHandleGuard& operator=(const CacheHandleGuard&) = delete; + + CacheHandleGuard(CacheHandleGuard&& rhs) noexcept + : cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) { + assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_)); + + rhs.ResetFields(); + } + + CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept { + if (this == &rhs) { + return *this; + } + + ReleaseHandle(); + + cache_ = rhs.cache_; + handle_ = rhs.handle_; + value_ = rhs.value_; + + assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_)); + + rhs.ResetFields(); + + return *this; + } + + ~CacheHandleGuard() { ReleaseHandle(); } + + bool IsEmpty() const { return !handle_; } + + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return handle_; } + T* GetValue() const { return value_; } + + void Reset() { + ReleaseHandle(); + ResetFields(); + } + + private: + void ReleaseHandle() { + if (IsEmpty()) { + return; + } + + assert(cache_); + cache_->Release(handle_); + } + + void ResetFields() { + cache_ = nullptr; + handle_ = nullptr; + value_ = nullptr; + } + + private: + Cache* cache_ = nullptr; + Cache::Handle* handle_ = nullptr; + T* value_ = nullptr; +}; + +// Build an aliasing shared_ptr that keeps `handle` in cache while there +// are references, but the pointer is to the value for that cache entry, +// which must be of type T. This is copyable, unlike CacheHandleGuard, but +// does not provide access to caching details. 
+template +std::shared_ptr MakeSharedCacheHandleGuard(Cache* cache, + Cache::Handle* handle) { + auto wrapper = std::make_shared>(cache, handle); + return std::shared_ptr(wrapper, static_cast(cache->Value(handle))); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,271 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/cache_key.h" + +#include +#include + +#include "rocksdb/cache.h" +#include "table/unique_id_impl.h" +#include "util/hash.h" +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { + +// Value space plan for CacheKey: +// +// session_etc64_ | offset_etc64_ | Only generated by +// ---------------+---------------+------------------------------------------ +// 0 | 0 | Reserved for "empty" CacheKey() +// 0 | > 0, < 1<<63 | CreateUniqueForCacheLifetime +// 0 | >= 1<<63 | CreateUniqueForProcessLifetime +// > 0 | any | OffsetableCacheKey.WithOffset + +CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) { + // +1 so that we can reserve all zeros for "unset" cache key + uint64_t id = cache->NewId() + 1; + // Ensure we don't collide with CreateUniqueForProcessLifetime + assert((id >> 63) == 0U); + return CacheKey(0, id); +} + +CacheKey CacheKey::CreateUniqueForProcessLifetime() { + // To avoid colliding with CreateUniqueForCacheLifetime, assuming + // Cache::NewId counts up from zero, here we count down from UINT64_MAX. 
+ // If this ever becomes a point of contention, we could use CoreLocalArray. + static std::atomic counter{UINT64_MAX}; + uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed); + // Ensure we don't collide with CreateUniqueForCacheLifetime + assert((id >> 63) == 1U); + return CacheKey(0, id); +} + +// Value plan for CacheKeys from OffsetableCacheKey, assuming that +// db_session_ids are generated from a base_session_id and +// session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId +// in DBImpl::GenerateDbSessionId): +// +// Conceptual inputs: +// db_id (unstructured, from GenerateRawUniqueId or equiv) +// * could be shared between cloned DBs but rare +// * could be constant, if session id suffices +// base_session_id (unstructured, from GenerateRawUniqueId) +// session_id_counter (structured) +// * usually much smaller than 2**24 +// file_number (structured) +// * usually smaller than 2**24 +// offset_in_file (structured, might skip lots of values) +// * usually smaller than 2**32 +// max_offset determines placement of file_number to prevent +// overlapping with offset +// +// Outputs come from bitwise-xor of the constituent pieces, low bits on left: +// +// |------------------------- session_etc64 -------------------------| +// | +++++++++++++++ base_session_id (lower 64 bits) +++++++++++++++ | +// |-----------------------------------------------------------------| +// | session_id_counter ...| | +// |-----------------------------------------------------------------| +// | | ... file_number | +// | | overflow & meta | +// |-----------------------------------------------------------------| +// +// +// |------------------------- offset_etc64 --------------------------| +// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ | +// | * base_session_id (upper ~39 bits) | +// | * db_id (~122 bits entropy) | +// |-----------------------------------------------------------------| +// | offset_in_file ............... 
| | +// |-----------------------------------------------------------------| +// | | file_number, 0-3 | +// | | lower bytes | +// |-----------------------------------------------------------------| +// +// Based on max_offset, a maximal number of bytes 0..3 is chosen for +// including from lower bits of file_number in offset_etc64. The choice +// is encoded in two bits of metadata going into session_etc64, though +// the common case of 3 bytes is encoded as 0 so that session_etc64 +// is unmodified by file_number concerns in the common case. +// +// There is nothing preventing "file number overflow & meta" from meeting +// and overlapping with session_id_counter, but reaching such a case requires +// an intractable combination of large file offsets (thus at least some large +// files), large file numbers (thus large number of files generated), and +// large number of session IDs generated in a single process. A trillion each +// (2**40) of session ids, offsets, and file numbers comes to 120 bits. +// With two bits of metadata and byte granularity, this is on the verge of +// overlap, but even in the overlap case, it doesn't seem likely that +// a file from billions of files or session ids ago will still be live +// or cached. +// +// In fact, if our SST files are all < 4TB (see +// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated +// in a single process are guaranteed to have unique cache keys, unless/until +// number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in +// a single process and 64 trillion files generated. Even at that point, to +// see a collision we would need a miraculous re-synchronization of session +// id and file number, along with a live file or stale cache entry from +// trillions of files ago. +// +// How https://github.com/pdillinger/unique_id applies here: +// Every bit of output always includes "unstructured" uniqueness bits and +// often combines with "structured" uniqueness bits. 
The "unstructured" bits +// change infrequently: only when we cannot guarantee our state tracking for +// "structured" uniqueness hasn't been cloned. Using a static +// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an +// "all new" session id when a new process uses RocksDB. (Between processes, +// we don't know if a DB or other persistent storage has been cloned.) Within +// a process, only the session_lower of the db_session_id changes +// incrementally ("structured" uniqueness). +// +// This basically means that our offsets, counters and file numbers allow us +// to do somewhat "better than random" (birthday paradox) while in the +// degenerate case of completely new session for each tiny file, we still +// have strong uniqueness properties from the birthday paradox, with ~103 +// bit session IDs or up to 128 bits entropy with different DB IDs sharing a +// cache. +// +// More collision probability analysis: +// Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD) +// with average process/session lifetime of (pessimistically) 4 minutes. +// In 180 days (generous allowable data lifespan), we generate 31 million GB +// of data, or 2^55 bytes, and 2^16 "all new" session IDs. +// +// First, suppose this is in a single DB (lifetime 180 days): +// 128 bits cache key size +// - 55 <- ideal size for byte offsets + file numbers +// - 2 <- bits for offsets and file numbers not exactly powers of two +// - 2 <- bits for file number encoding metadata +// + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey +// ---- +// 71 <- bits remaining for distinguishing session IDs +// The probability of a collision in 71 bits of session ID data is less than +// 1 in 2**(71 - (2 * 16)), or roughly 1 in a trillion. 
And this assumes all +// data from the last 180 days is in cache for potential collision, and that +// cache keys under each session id exhaustively cover the remaining 57 bits +// while in reality they'll only cover a small fraction of it. +// +// Although data could be transferred between hosts, each host has its own +// cache and we are already assuming a high rate of "all new" session ids. +// So this doesn't really change the collision calculation. Across a fleet +// of 1 million, each with <1 in a trillion collision possibility, +// fleetwide collision probability is <1 in a million. +// +// Now suppose we have many DBs per host, say 2**10, with same host-wide write +// rate and process/session lifetime. File numbers will be ~10 bits smaller +// and we will have 2**10 times as many session IDs because of simultaneous +// lifetimes. So now collision chance is less than 1 in 2**(81 - (2 * 26)), +// or roughly 1 in a billion. +// +// Suppose instead we generated random or hashed cache keys for each +// (compressed) block. For 1KB compressed block size, that is 2^45 cache keys +// in 180 days. Collision probability is more easily estimated at roughly +// 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all +// data from the last 180 days is in cache, but NOT the other assumption +// for the 1 in a trillion estimate above). +// +// Conclusion: Burning through session IDs, particularly "all new" IDs that +// only arise when a new process is started, is the only way to have a +// plausible chance of cache key collision. When processes live for hours +// or days, the chance of a cache key collision seems more plausibly due +// to bad hardware than to bad luck in random session ID data. 
+// +OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, + uint64_t max_offset) { +#ifndef NDEBUG + max_offset_ = max_offset; +#endif + // Closely related to GetSstInternalUniqueId, but only need 128 bits and + // need to include an offset within the file. + // See also https://github.com/pdillinger/unique_id for background. + uint64_t session_upper = 0; // Assignment to appease clang-analyze + uint64_t session_lower = 0; // Assignment to appease clang-analyze + { + Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); + if (!s.ok()) { + // A reasonable fallback in case malformed + Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper, + &session_lower); + } + } + + // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) + // for more global uniqueness entropy. + // (It is possible that many DBs descended from one common DB id are copied + // around and proliferate, in which case session id is critical, but it is + // more common for different DBs to have different DB ids.) + uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper); + + // This establishes the db+session id part of the cache key. + // + // Exactly preserve (in common cases; see modifiers below) session lower to + // ensure that session ids generated during the same process lifetime are + // guaranteed unique. + // + // We put this first for CommonPrefixSlice(), so that a small-ish set of + // cache key prefixes to cover entries relevant to any DB. + session_etc64_ = session_lower; + // This provides extra entopy in case of different DB id or process + // generating a session id, but is also partly/variably obscured by + // file_number and offset (see below). + offset_etc64_ = db_hash; + + // Into offset_etc64_ we are (eventually) going to pack & xor in an offset and + // a file_number, but we might need the file_number to overflow into + // session_etc64_. 
(There must only be one session_etc64_ value per + // file, and preferably shared among many files.) + // + // Figure out how many bytes of file_number we are going to be able to + // pack in with max_offset, though our encoding will only support packing + // in up to 3 bytes of file_number. (16M file numbers is enough for a new + // file number every second for half a year.) + int file_number_bytes_in_offset_etc = + (63 - FloorLog2(max_offset | 0x100000000U)) / 8; + int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8; + + // Assert two bits of metadata + assert(file_number_bytes_in_offset_etc >= 0 && + file_number_bytes_in_offset_etc <= 3); + // Assert we couldn't have used a larger allowed number of bytes (shift + // would chop off bytes). + assert(file_number_bytes_in_offset_etc == 3 || + (max_offset << (file_number_bits_in_offset_etc + 8) >> + (file_number_bits_in_offset_etc + 8)) != max_offset); + + uint64_t mask = (uint64_t{1} << (file_number_bits_in_offset_etc)) - 1; + // Pack into high bits of etc so that offset can go in low bits of etc + // TODO: could be EndianSwapValue? + uint64_t offset_etc_modifier = ReverseBits(file_number & mask); + assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U); + + // Overflow and 3 - byte count (likely both zero) go into session_id part + uint64_t session_etc_modifier = + (file_number >> file_number_bits_in_offset_etc << 2) | + static_cast(3 - file_number_bytes_in_offset_etc); + // Packed into high bits to minimize interference with session id counter. 
+ session_etc_modifier = ReverseBits(session_etc_modifier); + + // Assert session_id part is only modified in extreme cases + assert(session_etc_modifier == 0 || file_number > /*3 bytes*/ 0xffffffU || + max_offset > /*5 bytes*/ 0xffffffffffU); + + // Xor in the modifiers + session_etc64_ ^= session_etc_modifier; + offset_etc64_ ^= offset_etc_modifier; + + // Although DBImpl guarantees (in recent versions) that session_lower is not + // zero, that's not entirely sufficient to guarantee that session_etc64_ is + // not zero (so that the 0 case can be used by CacheKey::CreateUnique*) + if (session_etc64_ == 0U) { + session_etc64_ = session_upper | 1U; + } + assert(session_etc64_ != 0); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_key.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_key.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,132 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; + +// A standard holder for fixed-size block cache keys (and for related caches). 
+// They are created through one of these, each using its own range of values: +// * CacheKey::CreateUniqueForCacheLifetime +// * CacheKey::CreateUniqueForProcessLifetime +// * Default ctor ("empty" cache key) +// * OffsetableCacheKey->WithOffset +// +// The first two use atomic counters to guarantee uniqueness over the given +// lifetime and the last uses a form of universally unique identifier for +// uniqueness with very high probabilty (and guaranteed for files generated +// during a single process lifetime). +// +// CacheKeys are currently used by calling AsSlice() to pass as a key to +// Cache. For performance, the keys are endianness-dependent (though otherwise +// portable). (Persistable cache entries are not intended to cross platforms.) +class CacheKey { + public: + // For convenience, constructs an "empty" cache key that is never returned + // by other means. + inline CacheKey() : session_etc64_(), offset_etc64_() {} + + inline bool IsEmpty() const { + return (session_etc64_ == 0) & (offset_etc64_ == 0); + } + + // Use this cache key as a Slice (byte order is endianness-dependent) + inline Slice AsSlice() const { + static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key"); + assert(!IsEmpty()); + return Slice(reinterpret_cast(this), sizeof(*this)); + } + + // Create a CacheKey that is unique among others associated with this Cache + // instance. Depends on Cache::NewId. This is useful for block cache + // "reservations". + static CacheKey CreateUniqueForCacheLifetime(Cache *cache); + + // Create a CacheKey that is unique among others for the lifetime of this + // process. This is useful for saving in a static data member so that + // different DB instances can agree on a cache key for shared entities, + // such as for CacheEntryStatsCollector. 
+ static CacheKey CreateUniqueForProcessLifetime(); + + protected: + friend class OffsetableCacheKey; + CacheKey(uint64_t session_etc64, uint64_t offset_etc64) + : session_etc64_(session_etc64), offset_etc64_(offset_etc64) {} + uint64_t session_etc64_; + uint64_t offset_etc64_; +}; + +// A file-specific generator of cache keys, sometimes referred to as the +// "base" cache key for a file because all the cache keys for various offsets +// within the file are computed using simple arithmetic. The basis for the +// general approach is dicussed here: https://github.com/pdillinger/unique_id +// Heavily related to GetUniqueIdFromTableProperties. +// +// If the db_id, db_session_id, and file_number come from the file's table +// properties, then the keys will be stable across DB::Open/Close, backup/ +// restore, import/export, etc. +// +// This class "is a" CacheKey only privately so that it is not misused as +// a ready-to-use CacheKey. +class OffsetableCacheKey : private CacheKey { + public: + // For convenience, constructs an "empty" cache key that should not be used. + inline OffsetableCacheKey() : CacheKey() {} + + // Constructs an OffsetableCacheKey with the given information about a file. + // max_offset is based on file size (see WithOffset) and is required here to + // choose an appropriate (sub-)encoding. This constructor never generates an + // "empty" base key. + OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id, + uint64_t file_number, uint64_t max_offset); + + inline bool IsEmpty() const { + bool result = session_etc64_ == 0; + assert(!(offset_etc64_ > 0 && result)); + return result; + } + + // Construct a CacheKey for an offset within a file, which must be + // <= max_offset provided in constructor. An offset is not necessarily a + // byte offset if a smaller unique identifier of keyable offsets is used. + // + // This class was designed to make this hot code extremely fast. 
+ inline CacheKey WithOffset(uint64_t offset) const { + assert(!IsEmpty()); + assert(offset <= max_offset_); + return CacheKey(session_etc64_, offset_etc64_ ^ offset); + } + + // The "common prefix" is a shared prefix for all the returned CacheKeys, + // that also happens to usually be the same among many files in the same DB, + // so is efficient and highly accurate (not perfectly) for DB-specific cache + // dump selection (but not file-specific). + static constexpr size_t kCommonPrefixSize = 8; + inline Slice CommonPrefixSlice() const { + static_assert(sizeof(session_etc64_) == kCommonPrefixSize, + "8 byte common prefix expected"); + assert(!IsEmpty()); + assert(&this->session_etc64_ == static_cast(this)); + + return Slice(reinterpret_cast(this), kCommonPrefixSize); + } + + // For any max_offset <= this value, the same encoding scheme is guaranteed. + static constexpr uint64_t kMaxOffsetStandardEncoding = 0xffffffffffU; + + private: +#ifndef NDEBUG + uint64_t max_offset_ = 0; +#endif +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,188 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "cache/cache_reservation_manager.h" + +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +CacheReservationManager::CacheReservationManager(std::shared_ptr cache, + bool delayed_decrease) + : delayed_decrease_(delayed_decrease), + cache_allocated_size_(0), + memory_used_(0) { + assert(cache != nullptr); + cache_ = cache; +} + +CacheReservationManager::~CacheReservationManager() { + for (auto* handle : dummy_handles_) { + cache_->Release(handle, true); + } +} + +template +Status CacheReservationManager::UpdateCacheReservation( + std::size_t new_mem_used) { + memory_used_ = new_mem_used; + std::size_t cur_cache_allocated_size = + cache_allocated_size_.load(std::memory_order_relaxed); + if (new_mem_used == cur_cache_allocated_size) { + return Status::OK(); + } else if (new_mem_used > cur_cache_allocated_size) { + Status s = IncreaseCacheReservation(new_mem_used); + return s; + } else { + // In delayed decrease mode, we don't decrease cache reservation + // untill the memory usage is less than 3/4 of what we reserve + // in the cache. + // We do this because + // (1) Dummy entry insertion is expensive in block cache + // (2) Delayed releasing previously inserted dummy entries can save such + // expensive dummy entry insertion on memory increase in the near future, + // which is likely to happen when the memory usage is greater than or equal + // to 3/4 of what we reserve + if (delayed_decrease_ && new_mem_used >= cur_cache_allocated_size / 4 * 3) { + return Status::OK(); + } else { + Status s = DecreaseCacheReservation(new_mem_used); + return s; + } + } +} + +// Explicitly instantiate templates for "CacheEntryRole" values we use. +// This makes it possible to keep the template definitions in the .cc file. 
+template Status CacheReservationManager::UpdateCacheReservation< + CacheEntryRole::kWriteBuffer>(std::size_t new_mem_used); +template Status CacheReservationManager::UpdateCacheReservation< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>( + std::size_t new_mem_used); +// For cache reservation manager unit tests +template Status CacheReservationManager::UpdateCacheReservation< + CacheEntryRole::kMisc>(std::size_t new_mem_used); + +template +Status CacheReservationManager::MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr>* handle) { + assert(handle != nullptr); + Status s = + UpdateCacheReservation(GetTotalMemoryUsed() + incremental_memory_used); + (*handle).reset(new CacheReservationHandle(incremental_memory_used, + shared_from_this())); + return s; +} + +template Status +CacheReservationManager::MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr>* handle); +template Status CacheReservationManager::MakeCacheReservation< + CacheEntryRole::kFilterConstruction>( + std::size_t incremental_memory_used, + std::unique_ptr< + CacheReservationHandle>* handle); + +template +Status CacheReservationManager::IncreaseCacheReservation( + std::size_t new_mem_used) { + Status return_status = Status::OK(); + while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) { + Cache::Handle* handle = nullptr; + return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry, + GetNoopDeleterForRole(), &handle); + + if (return_status != Status::OK()) { + return return_status; + } + + dummy_handles_.push_back(handle); + cache_allocated_size_ += kSizeDummyEntry; + } + return return_status; +} + +Status CacheReservationManager::DecreaseCacheReservation( + std::size_t new_mem_used) { + Status return_status = Status::OK(); + + // Decrease to the smallest multiple of kSizeDummyEntry that is greater than + // or equal to new_mem_used We do addition instead of new_mem_used <= + // 
cache_allocated_size_.load(std::memory_order_relaxed) - kSizeDummyEntry to + // avoid underflow of size_t when cache_allocated_size_ = 0 + while (new_mem_used + kSizeDummyEntry <= + cache_allocated_size_.load(std::memory_order_relaxed)) { + assert(!dummy_handles_.empty()); + auto* handle = dummy_handles_.back(); + cache_->Release(handle, true); + dummy_handles_.pop_back(); + cache_allocated_size_ -= kSizeDummyEntry; + } + return return_status; +} + +std::size_t CacheReservationManager::GetTotalReservedCacheSize() { + return cache_allocated_size_.load(std::memory_order_relaxed); +} + +std::size_t CacheReservationManager::GetTotalMemoryUsed() { + return memory_used_; +} + +Slice CacheReservationManager::GetNextCacheKey() { + // Calling this function will have the side-effect of changing the + // underlying cache_key_ that is shared among other keys generated from this + // fucntion. Therefore please make sure the previous keys are saved/copied + // before calling this function. + cache_key_ = CacheKey::CreateUniqueForCacheLifetime(cache_.get()); + return cache_key_.AsSlice(); +} + +template +Cache::DeleterFn CacheReservationManager::TEST_GetNoopDeleterForRole() { + return GetNoopDeleterForRole(); +} + +template Cache::DeleterFn CacheReservationManager::TEST_GetNoopDeleterForRole< + CacheEntryRole::kFilterConstruction>(); + +template +CacheReservationHandle::CacheReservationHandle( + std::size_t incremental_memory_used, + std::shared_ptr cache_res_mgr) + : incremental_memory_used_(incremental_memory_used) { + assert(cache_res_mgr != nullptr); + cache_res_mgr_ = cache_res_mgr; +} + +template +CacheReservationHandle::~CacheReservationHandle() { + assert(cache_res_mgr_ != nullptr); + assert(cache_res_mgr_->GetTotalMemoryUsed() >= incremental_memory_used_); + + Status s = cache_res_mgr_->UpdateCacheReservation( + cache_res_mgr_->GetTotalMemoryUsed() - incremental_memory_used_); + s.PermitUncheckedError(); +} + +// Explicitly instantiate templates for "CacheEntryRole" 
values we use. +// This makes it possible to keep the template definitions in the .cc file. +template class CacheReservationHandle; +template class CacheReservationHandle; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +template +class CacheReservationHandle; + +// CacheReservationManager is for reserving cache space for the memory used +// through inserting/releasing dummy entries in the cache. +// +// This class is NOT thread-safe, except that GetTotalReservedCacheSize() +// can be called without external synchronization. 
+class CacheReservationManager + : public std::enable_shared_from_this { + public: + // Construct a CacheReservationManager + // @param cache The cache where dummy entries are inserted and released for + // reserving cache space + // @param delayed_decrease If set true, then dummy entries won't be released + // immediately when memory usage decreases. + // Instead, it will be released when the memory usage + // decreases to 3/4 of what we have reserved so far. + // This is for saving some future dummy entry + // insertion when memory usage increases are likely to + // happen in the near future. + explicit CacheReservationManager(std::shared_ptr cache, + bool delayed_decrease = false); + + // no copy constructor, copy assignment, move constructor, move assignment + CacheReservationManager(const CacheReservationManager &) = delete; + CacheReservationManager &operator=(const CacheReservationManager &) = delete; + CacheReservationManager(CacheReservationManager &&) = delete; + CacheReservationManager &operator=(CacheReservationManager &&) = delete; + + ~CacheReservationManager(); + + template + + // One of the two ways of reserving/releasing cache, + // see CacheReservationManager::MakeCacheReservation() for the other. + // Use ONLY one of them to prevent unexpected behavior. + // + // Insert and release dummy entries in the cache to + // match the size of total dummy entries with the least multiple of + // kSizeDummyEntry greater than or equal to new_mem_used + // + // Insert dummy entries if new_memory_used > cache_allocated_size_; + // + // Release dummy entries if new_memory_used < cache_allocated_size_ + // (and new_memory_used < cache_allocated_size_ * 3/4 + // when delayed_decrease is set true); + // + // Keey dummy entries the same if (1) new_memory_used == cache_allocated_size_ + // or (2) new_memory_used is in the interval of + // [cache_allocated_size_ * 3/4, cache_allocated_size) when delayed_decrease + // is set true. 
+ // + // @param new_memory_used The number of bytes used by new memory + // The most recent new_memoy_used passed in will be returned + // in GetTotalMemoryUsed() even when the call return non-ok status. + // + // Since the class is NOT thread-safe, external synchronization on the + // order of calling UpdateCacheReservation() is needed if you want + // GetTotalMemoryUsed() indeed returns the latest memory used. + // + // @return On inserting dummy entries, it returns Status::OK() if all dummy + // entry insertions succeed. + // Otherwise, it returns the first non-ok status; + // On releasing dummy entries, it always returns Status::OK(). + // On keeping dummy entries the same, it always returns Status::OK(). + Status UpdateCacheReservation(std::size_t new_memory_used); + + // One of the two ways of reserving/releasing cache, + // see CacheReservationManager::UpdateCacheReservation() for the other. + // Use ONLY one of them to prevent unexpected behavior. + // + // Insert dummy entries in the cache for the incremental memory usage + // to match the size of total dummy entries with the least multiple of + // kSizeDummyEntry greater than or equal to the total memory used. + // + // A CacheReservationHandle is returned as an output parameter. + // The reserved dummy entries are automatically released on the destruction of + // this handle, which achieves better RAII per cache reservation. + // + // WARNING: Deallocate all the handles of the CacheReservationManager object + // before deallocating the object to prevent unexpected behavior. + // + // @param incremental_memory_used The number of bytes increased in memory + // usage. + // + // Calling GetTotalMemoryUsed() afterward will return the total memory + // increased by this number, even when calling MakeCacheReservation() + // returns non-ok status. 
+ // + // Since the class is NOT thread-safe, external synchronization in + // calling MakeCacheReservation() is needed if you want + // GetTotalMemoryUsed() indeed returns the latest memory used. + // + // @param handle An pointer to std::unique_ptr> that + // manages the lifetime of the handle and its cache reservation. + // + // @return It returns Status::OK() if all dummy + // entry insertions succeed. + // Otherwise, it returns the first non-ok status; + // + // REQUIRES: handle != nullptr + // REQUIRES: The CacheReservationManager object is NOT managed by + // std::unique_ptr as CacheReservationHandle needs to + // shares ownership to the CacheReservationManager object. + template + Status MakeCacheReservation( + std::size_t incremental_memory_used, + std::unique_ptr> *handle); + + // Return the size of the cache (which is a multiple of kSizeDummyEntry) + // successfully reserved by calling UpdateCacheReservation(). + // + // When UpdateCacheReservation() returns non-ok status, + // calling GetTotalReservedCacheSize() after that might return a slightly + // smaller number than the actual reserved cache size due to + // the returned number will always be a multiple of kSizeDummyEntry + // and cache full might happen in the middle of inserting a dummy entry. 
+ std::size_t GetTotalReservedCacheSize(); + + // Return the latest total memory used indicated by the most recent call of + // UpdateCacheReservation(std::size_t new_memory_used); + std::size_t GetTotalMemoryUsed(); + + static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; } + + // For testing only - it is to help ensure the NoopDeleterForRole + // accessed from CacheReservationManager and the one accessed from the test + // are from the same translation units + template + static Cache::DeleterFn TEST_GetNoopDeleterForRole(); + + private: + static constexpr std::size_t kSizeDummyEntry = 256 * 1024; + + Slice GetNextCacheKey(); + template + Status IncreaseCacheReservation(std::size_t new_mem_used); + Status DecreaseCacheReservation(std::size_t new_mem_used); + + std::shared_ptr cache_; + bool delayed_decrease_; + std::atomic cache_allocated_size_; + std::size_t memory_used_; + std::vector dummy_handles_; + CacheKey cache_key_; +}; + +// CacheReservationHandle is for managing the lifetime of a cache reservation +// This class is NOT thread-safe +template +class CacheReservationHandle { + public: + // REQUIRES: cache_res_mgr != nullptr + explicit CacheReservationHandle( + std::size_t incremental_memory_used, + std::shared_ptr cache_res_mgr); + + ~CacheReservationHandle(); + + private: + std::size_t incremental_memory_used_; + std::shared_ptr cache_res_mgr_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,506 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "cache/cache_reservation_manager.h" + +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "table/block_based/block_based_table_reader.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +class CacheReservationManagerTest : public ::testing::Test { + protected: + static constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + static constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + static constexpr int kNumShardBits = 0; // 2^0 shard + static constexpr std::size_t kMetaDataChargeOverhead = 10000; + + std::shared_ptr cache = NewLRUCache(kCacheCapacity, kNumShardBits); + std::unique_ptr test_cache_rev_mng; + + CacheReservationManagerTest() { + test_cache_rev_mng.reset(new CacheReservationManager(cache)); + } +}; + +TEST_F(CacheReservationManagerTest, GenerateCacheKey) { + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Next unique Cache key + CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + // Back it up to the one used by CRM (using CacheKey implementation details) + using PairU64 = std::pair; + auto& ckey_pair = *reinterpret_cast(&ckey); + ckey_pair.second--; + + // Specific key (subject to 
implementation details) + EXPECT_EQ(ckey_pair, PairU64(0, 2)); + + Cache::Handle* handle = cache->Lookup(ckey.AsSlice()); + EXPECT_NE(handle, nullptr) + << "Failed to generate the cache key for the dummy entry correctly"; + // Clean up the returned handle from Lookup() to prevent memory leak + cache->Release(handle); +} + +TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) { + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + std::size_t initial_pinned_usage = cache->GetPinnedUsage(); + ASSERT_GE(initial_pinned_usage, 1 * kSizeDummyEntry); + ASSERT_LT(initial_pinned_usage, + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to keep cache reservation the same when new_mem_used equals " + "to current cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly when new_mem_used equals to current " + "cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly when new_mem_used " + "equals to current cache reservation"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to keep underlying dummy entries the same when new_mem_used " + "equals to current cache reservation"; +} + +TEST_F(CacheReservationManagerTest, + IncreaseCacheReservationByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation correctly"; + 
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry) + << "Failed to increase underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to increase underlying dummy entries in cache correctly"; +} + +TEST_F(CacheReservationManagerTest, + IncreaseCacheReservationNotByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry + kSizeDummyEntry / 2; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 3 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 3 * kSizeDummyEntry) + << "Failed to increase underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 3 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to increase underlying dummy entries in cache correctly"; +} + +TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest, + IncreaseCacheReservationOnFullCache) { + ; + constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + constexpr std::size_t kSmallCacheCapacity = 4 * kSizeDummyEntry; + constexpr std::size_t kBigCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kSmallCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr 
cache = NewLRUCache(lo); + std::unique_ptr test_cache_rev_mng( + new CacheReservationManager(cache)); + + std::size_t new_mem_used = kSmallCacheCapacity + 1; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::Incomplete()) + << "Failed to return status to indicate failure of dummy entry insertion " + "during cache reservation on full cache"; + EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly before cache resevation failure happens " + "due to full cache"; + EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(), + kSmallCacheCapacity) + << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy " + "entry insertions) when encountering cache resevation failure due to " + "full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + + new_mem_used = kSmallCacheCapacity / 2; // 2 dummy entries + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation after encountering cache " + "reservation failure due to full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly after " + "encountering cache reservation due to full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry) + << "Failed to 
release underlying dummy entries correctly on cache " + "reservation decrease after encountering cache resevation failure due " + "to full cache"; + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to release underlying dummy entries correctly on cache " + "reservation decrease after encountering cache resevation failure due " + "to full cache"; + + // Create cache full again for subsequent tests + new_mem_used = kSmallCacheCapacity + 1; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::Incomplete()) + << "Failed to return status to indicate failure of dummy entry insertion " + "during cache reservation on full cache"; + EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep correctly before cache resevation failure happens " + "due to full cache"; + EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(), + kSmallCacheCapacity) + << "Failed to bookkeep correctly (i.e, bookkeep only successful dummy " + "entry insertions) when encountering cache resevation failure due to " + "full cache"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity) + << "Failed to insert underlying dummy entries correctly when " + "encountering cache resevation failure due to full cache"; + + // Increase cache capacity so the previously failed insertion can fully + // succeed + cache->SetCapacity(kBigCacheCapacity); + new_mem_used = kSmallCacheCapacity + 1; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to increase cache reservation after increasing cache capacity " + "and 
mitigating cache full error"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 5 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation increase correctly after " + "increasing cache capacity and mitigating cache full error"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 5 * kSizeDummyEntry) + << "Failed to insert underlying dummy entries correctly after increasing " + "cache capacity and mitigating cache full error"; + EXPECT_LT(cache->GetPinnedUsage(), + 5 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to insert underlying dummy entries correctly after increasing " + "cache capacity and mitigating cache full error"; +} + +TEST_F(CacheReservationManagerTest, + DecreaseCacheReservationByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = 1 * kSizeDummyEntry; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + 
kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache correctly"; +} + +TEST_F(CacheReservationManagerTest, + DecreaseCacheReservationNotByMultiplesOfDummyEntrySize) { + std::size_t new_mem_used = 2 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 2 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = kSizeDummyEntry / 2; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 1 * kSizeDummyEntry) + << "Failed to bookkeep cache reservation decrease correctly"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache correctly"; + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache correctly"; +} + +TEST(CacheReservationManagerWithDelayedDecreaseTest, + DecreaseCacheReservationWithDelayedDecrease) { + constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; + std::shared_ptr cache = NewLRUCache(lo); + std::unique_ptr test_cache_rev_mng( + new CacheReservationManager(cache, true /* delayed_decrease */)); + + std::size_t new_mem_used = 8 * 
kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry); + ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used); + std::size_t initial_pinned_usage = cache->GetPinnedUsage(); + ASSERT_GE(initial_pinned_usage, 8 * kSizeDummyEntry); + ASSERT_LT(initial_pinned_usage, + 8 * kSizeDummyEntry + kMetaDataChargeOverhead); + + new_mem_used = 6 * kSizeDummyEntry; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry) + << "Failed to bookkeep correctly when delaying cache reservation " + "decrease"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to delay decreasing underlying dummy entries in cache"; + + new_mem_used = 7 * kSizeDummyEntry; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation"; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 8 * kSizeDummyEntry) + << "Failed to bookkeep correctly when delaying cache reservation " + "decrease"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage) + << "Failed to delay decreasing underlying dummy entries in cache"; + + new_mem_used = 6 * kSizeDummyEntry - 1; + s = test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + EXPECT_EQ(s, Status::OK()) + << "Failed to decrease cache reservation correctly when new_mem_used < " + "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode"; + 
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), + 6 * kSizeDummyEntry) + << "Failed to bookkeep correctly when new_mem_used < " + "GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode"; + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used) + << "Failed to bookkeep the used memory correctly"; + EXPECT_GE(cache->GetPinnedUsage(), 6 * kSizeDummyEntry) + << "Failed to decrease underlying dummy entries in cache when " + "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed " + "decrease mode"; + EXPECT_LT(cache->GetPinnedUsage(), + 6 * kSizeDummyEntry + kMetaDataChargeOverhead) + << "Failed to decrease underlying dummy entries in cache when " + "new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed " + "decrease mode"; +} + +TEST(CacheReservationManagerDestructorTest, + ReleaseRemainingDummyEntriesOnDestruction) { + constexpr std::size_t kSizeDummyEntry = + CacheReservationManager::GetDummyEntrySize(); + constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; + std::shared_ptr cache = NewLRUCache(lo); + { + std::unique_ptr test_cache_rev_mng( + new CacheReservationManager(cache)); + std::size_t new_mem_used = 1 * kSizeDummyEntry; + Status s = + test_cache_rev_mng + ->UpdateCacheReservation( + new_mem_used); + ASSERT_EQ(s, Status::OK()); + ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + } + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry) + << "Failed to release remaining underlying dummy entries in cache in " + "CacheReservationManager's destructor"; +} + +TEST(CacheReservationHandleTest, HandleTest) { + constexpr std::size_t kOneGigabyte = 1024 * 1024 * 1024; + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + + 
LRUCacheOptions lo; + lo.capacity = kOneGigabyte; + lo.num_shard_bits = 0; + std::shared_ptr cache = NewLRUCache(lo); + + std::shared_ptr test_cache_rev_mng( + std::make_shared(cache)); + + std::size_t mem_used = 0; + const std::size_t incremental_mem_used_handle_1 = 1 * kSizeDummyEntry; + const std::size_t incremental_mem_used_handle_2 = 2 * kSizeDummyEntry; + std::unique_ptr> handle_1, + handle_2; + + // To test consecutive CacheReservationManager::MakeCacheReservation works + // correctly in terms of returning the handle as well as updating cache + // reservation and the latest total memory used + Status s = test_cache_rev_mng->MakeCacheReservation( + incremental_mem_used_handle_1, &handle_1); + mem_used = mem_used + incremental_mem_used_handle_1; + ASSERT_EQ(s, Status::OK()); + EXPECT_TRUE(handle_1 != nullptr); + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + s = test_cache_rev_mng->MakeCacheReservation( + incremental_mem_used_handle_2, &handle_2); + mem_used = mem_used + incremental_mem_used_handle_2; + ASSERT_EQ(s, Status::OK()); + EXPECT_TRUE(handle_2 != nullptr); + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + // To test CacheReservationHandle::~CacheReservationHandle() works correctly + // in releasing the cache reserved for the handle + handle_1.reset(); + EXPECT_TRUE(handle_1 == nullptr); + mem_used = mem_used - incremental_mem_used_handle_1; + EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used); + EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + 
EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + // To test the actual CacheReservationManager object won't be deallocated + // as long as there remain handles pointing to it. + // We strongly recommend deallocating CacheReservationManager object only + // after all its handles are deallocated to keep things easy to reasonate + test_cache_rev_mng.reset(); + EXPECT_GE(cache->GetPinnedUsage(), mem_used); + EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead); + + handle_2.reset(); + // The CacheReservationManager object is now deallocated since all the handles + // and its original pointer is gone + mem_used = mem_used - incremental_mem_used_handle_2; + EXPECT_EQ(mem_used, 0); + EXPECT_EQ(cache->GetPinnedUsage(), mem_used); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/cache_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -117,8 +117,8 @@ void Insert(std::shared_ptr cache, int key, int value, int charge = 1) { - cache->Insert(EncodeKey(key), EncodeValue(value), charge, - &CacheTest::Deleter); + EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter)); } void Erase(std::shared_ptr cache, int key) { @@ -167,9 +167,10 @@ for (int i = 1; i < 100; ++i) { std::string key(i, 'a'); auto kv_size = key.size() + 5; - cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter); - precise_cache->Insert(key, reinterpret_cast(value), kv_size, - dumbDeleter); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + kv_size, 
dumbDeleter)); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); ASSERT_LT(usage, precise_cache->GetUsage()); @@ -183,10 +184,10 @@ // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { auto key = ToString(i); - cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); - precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + key.size() + 5, dumbDeleter)); } // the usage should be close to the capacity @@ -215,11 +216,12 @@ auto kv_size = key.size() + 5; Cache::Handle* handle; Cache::Handle* handle_in_precise_cache; - cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter, - &handle); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter, &handle)); assert(handle); - precise_cache->Insert(key, reinterpret_cast(value), kv_size, - dumbDeleter, &handle_in_precise_cache); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + kv_size, dumbDeleter, + &handle_in_precise_cache)); assert(handle_in_precise_cache); pinned_usage += kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); @@ -254,10 +256,10 @@ // check that overloading the cache does not change the pinned usage for (uint64_t i = 1; i < 2 * kCapacity; ++i) { auto key = ToString(i); - cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); - precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, - dumbDeleter); + ASSERT_OK(cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter)); + ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), + key.size() + 5, dumbDeleter)); } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); @@ -607,6 +609,9 @@ for (size_t i = 5; i < 10; i++) { cache->Release(handles[i]); } + 
+ // Make sure this doesn't crash or upset ASAN/valgrind + cache->DisownData(); } TEST_P(LRUCacheTest, SetStrictCapacityLimit) { @@ -710,25 +715,98 @@ } namespace { -std::vector> callback_state; -void callback(void* entry, size_t charge) { - callback_state.push_back({DecodeValue(entry), static_cast(charge)}); +std::vector> legacy_callback_state; +void legacy_callback(void* value, size_t charge) { + legacy_callback_state.push_back( + {DecodeValue(value), static_cast(charge)}); } }; -TEST_P(CacheTest, ApplyToAllCacheEntiresTest) { +TEST_P(CacheTest, ApplyToAllCacheEntriesTest) { std::vector> inserted; - callback_state.clear(); + legacy_callback_state.clear(); for (int i = 0; i < 10; ++i) { Insert(i, i * 2, i + 1); inserted.push_back({i * 2, i + 1}); } - cache_->ApplyToAllCacheEntries(callback, true); + cache_->ApplyToAllCacheEntries(legacy_callback, true); + + std::sort(inserted.begin(), inserted.end()); + std::sort(legacy_callback_state.begin(), legacy_callback_state.end()); + ASSERT_EQ(inserted.size(), legacy_callback_state.size()); + for (size_t i = 0; i < inserted.size(); ++i) { + EXPECT_EQ(inserted[i], legacy_callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesTest) { + std::vector callback_state; + const auto callback = [&](const Slice& key, void* value, size_t charge, + Cache::DeleterFn deleter) { + callback_state.push_back(ToString(DecodeKey(key)) + "," + + ToString(DecodeValue(value)) + "," + + ToString(charge)); + assert(deleter == &CacheTest::Deleter); + }; + + std::vector inserted; + callback_state.clear(); + + for (int i = 0; i < 10; ++i) { + Insert(i, i * 2, i + 1); + inserted.push_back(ToString(i) + "," + ToString(i * 2) + "," + + ToString(i + 1)); + } + cache_->ApplyToAllEntries(callback, /*opts*/ {}); std::sort(inserted.begin(), inserted.end()); std::sort(callback_state.begin(), callback_state.end()); - ASSERT_TRUE(inserted == callback_state); + ASSERT_EQ(inserted.size(), callback_state.size()); + for (size_t i = 0; i < inserted.size(); 
++i) { + EXPECT_EQ(inserted[i], callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { + // This is a mini-stress test of ApplyToAllEntries, to ensure + // items in the cache that are neither added nor removed + // during ApplyToAllEntries are counted exactly once. + + // Insert some entries that we expect to be seen exactly once + // during iteration. + constexpr int kSpecialCharge = 2; + constexpr int kNotSpecialCharge = 1; + constexpr int kSpecialCount = 100; + for (int i = 0; i < kSpecialCount; ++i) { + Insert(i, i * 2, kSpecialCharge); + } + + // For callback + int special_count = 0; + const auto callback = [&](const Slice&, void*, size_t charge, + Cache::DeleterFn) { + if (charge == static_cast(kSpecialCharge)) { + ++special_count; + } + }; + + // Start counting + std::thread apply_thread([&]() { + // Use small average_entries_per_lock to make the problem difficult + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = 2; + cache_->ApplyToAllEntries(callback, opts); + }); + + // In parallel, add more entries, enough to cause resize but not enough + // to cause ejections + for (int i = kSpecialCount * 1; i < kSpecialCount * 6; ++i) { + Insert(i, i * 2, kNotSpecialCharge); + } + + apply_thread.join(); + ASSERT_EQ(special_count, kSpecialCount); } TEST_P(CacheTest, DefaultShardBits) { @@ -747,11 +825,12 @@ ASSERT_EQ(6, sc->GetNumShardBits()); } -TEST_P(CacheTest, GetCharge) { +TEST_P(CacheTest, GetChargeAndDeleter) { Insert(1, 2); Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); ASSERT_EQ(1, cache_->GetCharge(h1)); + ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1)); cache_->Release(h1); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/clock_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/clock_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/clock_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/cache/clock_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,11 +33,11 @@ #ifndef ROCKSDB_USE_RTTI #define TBB_USE_EXCEPTIONS 0 #endif -#include "tbb/concurrent_hash_map.h" - #include "cache/sharded_cache.h" +#include "port/lang.h" #include "port/malloc.h" #include "port/port.h" +#include "tbb/concurrent_hash_map.h" #include "util/autovector.h" #include "util/mutexlock.h" @@ -176,13 +176,16 @@ // Cache entry meta data. struct CacheHandle { Slice key; - uint32_t hash; void* value; size_t charge; - void (*deleter)(const Slice&, void* value); + Cache::DeleterFn deleter; + uint32_t hash; + + // Addition to "charge" to get "total charge" under metadata policy. + uint32_t meta_charge; // Flags and counters associated with the cache handle: - // lowest bit: n-cache bit + // lowest bit: in-cache bit // second lowest bit: usage bit // the rest bits: reference count // The handle is unused when flags equals to 0. The thread decreases the count @@ -205,9 +208,8 @@ return *this; } - inline static size_t CalcTotalCharge( - Slice key, size_t charge, - CacheMetadataChargePolicy metadata_charge_policy) { + inline static uint32_t CalcMetadataCharge( + Slice key, CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; if (metadata_charge_policy == kFullChargeCacheMetadata) { meta_charge += sizeof(CacheHandle); @@ -218,32 +220,30 @@ meta_charge += key.size(); #endif } - return charge + meta_charge; + assert(meta_charge <= UINT32_MAX); + return static_cast(meta_charge); } - inline size_t CalcTotalCharge( - CacheMetadataChargePolicy metadata_charge_policy) { - return CalcTotalCharge(key, charge, metadata_charge_policy); - } + inline size_t GetTotalCharge() { return charge + meta_charge; } }; // Key of hash map. We store hash value with the key for convenience. 
-struct CacheKey { +struct ClockCacheKey { Slice key; uint32_t hash_value; - CacheKey() = default; + ClockCacheKey() = default; - CacheKey(const Slice& k, uint32_t h) { + ClockCacheKey(const Slice& k, uint32_t h) { key = k; hash_value = h; } - static bool equal(const CacheKey& a, const CacheKey& b) { + static bool equal(const ClockCacheKey& a, const ClockCacheKey& b) { return a.hash_value == b.hash_value && a.key == b.key; } - static size_t hash(const CacheKey& a) { + static size_t hash(const ClockCacheKey& a) { return static_cast(a.hash_value); } }; @@ -260,7 +260,8 @@ class ClockCacheShard final : public CacheShard { public: // Hash map type. - typedef tbb::concurrent_hash_map HashTable; + using HashTable = + tbb::concurrent_hash_map; ClockCacheShard(); ~ClockCacheShard() override; @@ -271,7 +272,26 @@ Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Cache::Handle** handle, Cache::Priority priority) override; + Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, Cache::Priority priority) override { + return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + } Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/, + Statistics* /*stats*/) override { + return Lookup(key, hash); + } + bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + bool IsReady(Cache::Handle* /*handle*/) override { return true; } + void Wait(Cache::Handle* /*handle*/) override {} + // If the entry in in cache, increase reference count and return true. // Return false otherwise. 
// @@ -284,8 +304,10 @@ size_t GetUsage() const override; size_t GetPinnedUsage() const override; void EraseUnRefEntries() override; - void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; private: static const uint32_t kInCacheBit = 1; @@ -341,7 +363,8 @@ CacheHandle* Insert(const Slice& key, uint32_t hash, void* value, size_t change, void (*deleter)(const Slice& key, void* value), - bool hold_reference, CleanupContext* context); + bool hold_reference, CleanupContext* context, + bool* overwritten); // Guards list_, head_, and recycle_. In addition, updating table_ also has // to hold the mutex, to avoid the cache being in inconsistent state. @@ -403,22 +426,46 @@ return pinned_usage_.load(std::memory_order_relaxed); } -void ClockCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - if (thread_safe) { - mutex_.Lock(); +void ClockCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + assert(average_entries_per_lock > 0); + MutexLock lock(&mutex_); + + // Figure out the range to iterate, update `state` + size_t list_size = list_.size(); + size_t start_idx = *state; + size_t end_idx = start_idx + average_entries_per_lock; + if (start_idx > list_size) { + // Shouldn't reach here, but recoverable + assert(false); + // Mark finished with all + *state = UINT32_MAX; + return; + } + if (end_idx >= list_size || end_idx >= UINT32_MAX) { + // This also includes the hypothetical case of >4 billion + // cache handles. + end_idx = list_size; + // Mark finished with all + *state = UINT32_MAX; + } else { + *state = static_cast(end_idx); } - for (auto& handle : list_) { - // Use relaxed semantics instead of acquire semantics since we are either - // holding mutex, or don't have thread safe requirement. 
+ + // Do the iteration + auto cur = list_.begin() + start_idx; + auto end = list_.begin() + end_idx; + for (; cur != end; ++cur) { + const CacheHandle& handle = *cur; + // Use relaxed semantics instead of acquire semantics since we are + // holding mutex uint32_t flags = handle.flags.load(std::memory_order_relaxed); if (InCache(flags)) { - callback(handle.value, handle.charge); + callback(handle.key, handle.value, handle.charge, handle.deleter); } } - if (thread_safe) { - mutex_.Unlock(); - } } void ClockCacheShard::RecycleHandle(CacheHandle* handle, @@ -427,10 +474,8 @@ assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); context->to_delete_key.push_back(handle->key.data()); context->to_delete_value.emplace_back(*handle); - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); - handle->key.clear(); - handle->value = nullptr; - handle->deleter = nullptr; + size_t total_charge = handle->GetTotalCharge(); + // clearing `handle` fields would go here but not strictly required recycle_.push_back(handle); usage_.fetch_sub(total_charge, std::memory_order_relaxed); } @@ -458,7 +503,7 @@ std::memory_order_relaxed)) { if (CountRefs(flags) == 0) { // No reference count before the operation. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + size_t total_charge = handle->GetTotalCharge(); pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } return true; @@ -472,6 +517,11 @@ if (set_usage) { handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed); } + // If the handle reaches state refs=0 and InCache=true after this + // atomic operation then we cannot access `handle` afterward, because + // it could be evicted before we access the `handle`. + size_t total_charge = handle->GetTotalCharge(); + // Use acquire-release semantics as previous operations on the cache entry // has to be order before reference count is decreased, and potential cleanup // of the entry has to be order after. 
@@ -479,7 +529,6 @@ assert(CountRefs(flags) > 0); if (CountRefs(flags) == 1) { // this is the last reference. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); // Cleanup if it is the last reference. if (!InCache(flags)) { @@ -511,7 +560,7 @@ if (handle->flags.compare_exchange_strong(flags, 0, std::memory_order_acquire, std::memory_order_relaxed)) { bool erased __attribute__((__unused__)) = - table_.erase(CacheKey(handle->key, handle->hash)); + table_.erase(ClockCacheKey(handle->key, handle->hash)); assert(erased); RecycleHandle(handle, context); return true; @@ -564,9 +613,11 @@ CacheHandle* ClockCacheShard::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), bool hold_reference, - CleanupContext* context) { - size_t total_charge = - CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_); + CleanupContext* context, bool* overwritten) { + assert(overwritten != nullptr && *overwritten == false); + uint32_t meta_charge = + CacheHandle::CalcMetadataCharge(key, metadata_charge_policy_); + size_t total_charge = charge + meta_charge; MutexLock l(&mutex_); bool success = EvictFromCache(total_charge, context); bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); @@ -592,16 +643,27 @@ handle->hash = hash; handle->value = value; handle->charge = charge; + handle->meta_charge = meta_charge; handle->deleter = deleter; uint32_t flags = hold_reference ? 
kInCacheBit + kOneRef : kInCacheBit; + + // TODO investigate+fix suspected race condition: + // [thread 1] Lookup starts, up to Ref() + // [thread 2] Erase/evict the entry just looked up + // [thread 1] Ref() the handle, even though it's in the recycle bin + // [thread 2] Insert with recycling that handle + // Here we obliterate the other thread's Ref + // Possible fix: never blindly overwrite the flags, but only make + // relative updates (fetch_add, etc). handle->flags.store(flags, std::memory_order_relaxed); HashTable::accessor accessor; - if (table_.find(accessor, CacheKey(key, hash))) { + if (table_.find(accessor, ClockCacheKey(key, hash))) { + *overwritten = true; CacheHandle* existing_handle = accessor->second; table_.erase(accessor); UnsetInCache(existing_handle, context); } - table_.insert(HashTable::value_type(CacheKey(key, hash), handle)); + table_.insert(HashTable::value_type(ClockCacheKey(key, hash), handle)); if (hold_reference) { pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } @@ -619,8 +681,9 @@ char* key_data = new char[key.size()]; memcpy(key_data, key.data(), key.size()); Slice key_copy(key_data, key.size()); + bool overwritten = false; CacheHandle* handle = Insert(key_copy, hash, value, charge, deleter, - out_handle != nullptr, &context); + out_handle != nullptr, &context, &overwritten); Status s; if (out_handle != nullptr) { if (handle == nullptr) { @@ -629,13 +692,17 @@ *out_handle = reinterpret_cast(handle); } } + if (overwritten) { + assert(s.ok()); + s = Status::OkOverwritten(); + } Cleanup(context); return s; } Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) { HashTable::const_accessor accessor; - if (!table_.find(accessor, CacheKey(key, hash))) { + if (!table_.find(accessor, ClockCacheKey(key, hash))) { return nullptr; } CacheHandle* handle = accessor->second; @@ -680,7 +747,7 @@ MutexLock l(&mutex_); HashTable::accessor accessor; bool erased = false; - if (table_.find(accessor, CacheKey(key, 
hash))) { + if (table_.find(accessor, ClockCacheKey(key, hash))) { CacheHandle* handle = accessor->second; table_.erase(accessor); erased = UnsetInCache(handle, context); @@ -718,11 +785,11 @@ const char* Name() const override { return "ClockCache"; } - CacheShard* GetShard(int shard) override { + CacheShard* GetShard(uint32_t shard) override { return reinterpret_cast(&shards_[shard]); } - const CacheShard* GetShard(int shard) const override { + const CacheShard* GetShard(uint32_t shard) const override { return reinterpret_cast(&shards_[shard]); } @@ -738,7 +805,18 @@ return reinterpret_cast(handle)->hash; } - void DisownData() override { shards_ = nullptr; } + DeleterFn GetDeleter(Handle* handle) const override { + return reinterpret_cast(handle)->deleter; + } + + void DisownData() override { + // Leak data only if that won't generate an ASAN/valgrind warning + if (!kMustFreeHeapAllocations) { + shards_ = nullptr; + } + } + + void WaitAll(std::vector& /*handles*/) override {} private: ClockCacheShard* shards_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,26 +9,31 @@ #include "cache/lru_cache.h" -#include -#include -#include -#include - +#include +#include +#include + +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "port/lang.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { - Resize(); -} +LRUHandleTable::LRUHandleTable(int max_upper_hash_bits) + : length_bits_(/* historical starting size*/ 4), + list_(new LRUHandle* [size_t{1} << length_bits_] {}), + elems_(0), + max_length_bits_(max_upper_hash_bits) {} LRUHandleTable::~LRUHandleTable() { - 
ApplyToAllCacheEntries([](LRUHandle* h) { - if (!h->HasRefs()) { - h->Free(); - } - }); - delete[] list_; + ApplyToEntriesRange( + [](LRUHandle* h) { + if (!h->HasRefs()) { + h->Free(); + } + }, + 0, uint32_t{1} << length_bits_); } LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { @@ -42,7 +47,7 @@ *ptr = h; if (old == nullptr) { ++elems_; - if (elems_ > length_) { + if ((elems_ >> length_bits_) > 0) { // elems_ >= length // Since each cache entry is fairly large, we aim for a small // average linked list length (<= 1). Resize(); @@ -62,7 +67,7 @@ } LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { - LRUHandle** ptr = &list_[hash & (length_ - 1)]; + LRUHandle** ptr = &list_[hash >> (32 - length_bits_)]; while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) { ptr = &(*ptr)->next_hash; } @@ -70,19 +75,29 @@ } void LRUHandleTable::Resize() { - uint32_t new_length = 16; - while (new_length < elems_ * 1.5) { - new_length *= 2; + if (length_bits_ >= max_length_bits_) { + // Due to reaching limit of hash information, if we made the table + // bigger, we would allocate more addresses but only the same + // number would be used. 
+ return; + } + if (length_bits_ >= 31) { + // Avoid undefined behavior shifting uint32_t by 32 + return; } - LRUHandle** new_list = new LRUHandle*[new_length]; - memset(new_list, 0, sizeof(new_list[0]) * new_length); + + uint32_t old_length = uint32_t{1} << length_bits_; + int new_length_bits = length_bits_ + 1; + std::unique_ptr new_list { + new LRUHandle* [size_t{1} << new_length_bits] {} + }; uint32_t count = 0; - for (uint32_t i = 0; i < length_; i++) { + for (uint32_t i = 0; i < old_length; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { LRUHandle* next = h->next_hash; uint32_t hash = h->hash; - LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + LRUHandle** ptr = &new_list[hash >> (32 - new_length_bits)]; h->next_hash = *ptr; *ptr = h; h = next; @@ -90,23 +105,25 @@ } } assert(elems_ == count); - delete[] list_; - list_ = new_list; - length_ = new_length; + list_ = std::move(new_list); + length_bits_ = new_length_bits; } -LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, - double high_pri_pool_ratio, - bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) +LRUCacheShard::LRUCacheShard( + size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, + bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache) : capacity_(0), high_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), high_pri_pool_ratio_(high_pri_pool_ratio), high_pri_pool_capacity_(0), + table_(max_upper_hash_bits), usage_(0), lru_usage_(0), - mutex_(use_adaptive_mutex) { + mutex_(use_adaptive_mutex), + secondary_cache_(secondary_cache) { set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list lru_.next = &lru_; @@ -138,19 +155,40 @@ } } -void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - const auto applyCallback = [&]() { - 
table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - }; +void LRUCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. + MutexLock l(&mutex_); + uint32_t length_bits = table_.GetLengthBits(); + uint32_t length = uint32_t{1} << length_bits; - if (thread_safe) { - MutexLock l(&mutex_); - applyCallback(); + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow) + assert(average_entries_per_lock < length || *state == 0); + + uint32_t index_begin = *state >> (32 - length_bits); + uint32_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end + index_end = length; + *state = UINT32_MAX; } else { - applyCallback(); + *state = index_end << (32 - length_bits); } + + table_.ApplyToEntriesRange( + [callback](LRUHandle* h) { + DeleterFn deleter = h->IsSecondaryCacheCompatible() + ? 
h->info_.helper->del_cb + : h->info_.deleter; + callback(h->key(), h->value, h->charge, deleter); + }, + index_begin, index_end); } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { @@ -257,8 +295,14 @@ EvictFromLRU(0, &last_reference_list); } + // Try to insert the evicted entries into tiered cache // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } entry->Free(); } } @@ -268,17 +312,181 @@ strict_capacity_limit_ = strict_capacity_limit; } -Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { - MutexLock l(&mutex_); - LRUHandle* e = table_.Lookup(key, hash); - if (e != nullptr) { - assert(e->InCache()); - if (!e->HasRefs()) { - // The entry is in LRU since it's in hash and has no external references - LRU_Remove(e); +Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle, + bool free_handle_on_fail) { + Status s = Status::OK(); + autovector last_reference_list; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + + { + MutexLock l(&mutex_); + + // Free the space following strict LRU policy until enough space + // is freed or the lru list is empty + EvictFromLRU(total_charge, &last_reference_list); + + if ((usage_ + total_charge) > capacity_ && + (strict_capacity_limit_ || handle == nullptr)) { + e->SetInCache(false); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry inserted + // into cache and get evicted immediately. + last_reference_list.push_back(e); + } else { + if (free_handle_on_fail) { + delete[] reinterpret_cast(e); + *handle = nullptr; + } + s = Status::Incomplete("Insert failed due to LRU cache being full."); + } + } else { + // Insert into the cache. 
Note that the cache might get larger than its + // capacity if not enough space was freed up. + LRUHandle* old = table_.Insert(e); + usage_ += total_charge; + if (old != nullptr) { + s = Status::OkOverwritten(); + assert(old->InCache()); + old->SetInCache(false); + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 + LRU_Remove(old); + size_t old_total_charge = + old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; + last_reference_list.push_back(old); + } + } + if (handle == nullptr) { + LRU_Insert(e); + } else { + // If caller already holds a ref, no need to take one here + if (!e->HasRefs()) { + e->Ref(); + } + *handle = reinterpret_cast(e); + } + } + } + + // Try to insert the evicted entries into the secondary cache + // Free the entries here outside of mutex for performance reasons + for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } + entry->Free(); + } + + return s; +} + +void LRUCacheShard::Promote(LRUHandle* e) { + SecondaryCacheResultHandle* secondary_handle = e->sec_handle; + + assert(secondary_handle->IsReady()); + e->SetIncomplete(false); + e->SetInCache(true); + e->SetPromoted(true); + e->value = secondary_handle->Value(); + e->charge = secondary_handle->Size(); + delete secondary_handle; + + // This call could fail if the cache is over capacity and + // strict_capacity_limit_ is true. In such a case, we don't want + // InsertItem() to free the handle, since the item is already in memory + // and the caller will most likely just read from disk if we erase it here. 
+ if (e->value) { + Cache::Handle* handle = reinterpret_cast(e); + Status s = InsertItem(e, &handle, /*free_handle_on_fail=*/false); + if (!s.ok()) { + // Item is in memory, but not accounted against the cache capacity. + // When the handle is released, the item should get deleted + assert(!e->InCache()); + } + } else { + // Since the secondary cache lookup failed, mark the item as not in cache + // Don't charge the cache as its only metadata that'll shortly be released + MutexLock l(&mutex_); + e->charge = 0; + e->SetInCache(false); + } +} + +Cache::Handle* LRUCacheShard::Lookup( + const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, Cache::Priority priority, + bool wait, Statistics* stats) { + LRUHandle* e = nullptr; + { + MutexLock l(&mutex_); + e = table_.Lookup(key, hash); + if (e != nullptr) { + assert(e->InCache()); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references + LRU_Remove(e); + } + e->Ref(); + e->SetHit(); + } + } + + // If handle table lookup failed, then allocate a handle outside the + // mutex if we're going to lookup in the secondary cache + // Only support synchronous for now + // TODO: Support asynchronous lookup in secondary cache + if (!e && secondary_cache_ && helper && helper->saveto_cb) { + // For objects from the secondary cache, we expect the caller to provide + // a way to create/delete the primary cache object. The only case where + // a deleter would not be required is for dummy entries inserted for + // accounting purposes, which we won't demote to the secondary cache + // anyway. 
+ assert(create_cb && helper->del_cb); + std::unique_ptr secondary_handle = + secondary_cache_->Lookup(key, create_cb, wait); + if (secondary_handle != nullptr) { + e = reinterpret_cast( + new char[sizeof(LRUHandle) - 1 + key.size()]); + + e->flags = 0; + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + e->key_length = key.size(); + e->hash = hash; + e->refs = 0; + e->next = e->prev = nullptr; + e->SetPriority(priority); + memcpy(e->key_data, key.data(), key.size()); + e->value = nullptr; + e->sec_handle = secondary_handle.release(); + e->Ref(); + + if (wait) { + Promote(e); + if (!e->value) { + // The secondary cache returned a handle, but the lookup failed + e->Unref(); + e->Free(); + e = nullptr; + } else { + PERF_COUNTER_ADD(secondary_cache_hit_count, 1); + RecordTick(stats, SECONDARY_CACHE_HITS); + } + } else { + // If wait is false, we always return a handle and let the caller + // release the handle after checking for success or failure + e->SetIncomplete(true); + // This may be slightly inaccurate, if the lookup eventually fails. + // But the probability is very low. + PERF_COUNTER_ADD(secondary_cache_hit_count, 1); + RecordTick(stats, SECONDARY_CACHE_HITS); + } } - e->Ref(); - e->SetHit(); } return reinterpret_cast(e); } @@ -322,7 +530,12 @@ last_reference = false; } } - if (last_reference) { + // If it was the last reference, and the entry is either not secondary + // cache compatible (i.e a dummy entry for accounting), or is secondary + // cache compatible and has a non-null value, then decrement the cache + // usage. If value is null in the latter case, taht means the lookup + // failed and we didn't charge the cache. 
+ if (last_reference && (!e->IsSecondaryCacheCompatible() || e->value)) { size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); assert(usage_ >= total_charge); usage_ -= total_charge; @@ -339,80 +552,35 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), + const Cache::CacheItemHelper* helper, Cache::Handle** handle, Cache::Priority priority) { // Allocate the memory here outside of the mutex // If the cache is full, we'll have to release it // It shouldn't happen very often though. LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s = Status::OK(); - autovector last_reference_list; e->value = value; - e->deleter = deleter; + e->flags = 0; + if (helper) { + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + } else { +#ifdef __SANITIZE_THREAD__ + e->is_secondary_cache_compatible_for_tsan = false; +#endif // __SANITIZE_THREAD__ + e->info_.deleter = deleter; + } e->charge = charge; e->key_length = key.size(); - e->flags = 0; e->hash = hash; e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); - size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); - - { - MutexLock l(&mutex_); - - // Free the space following strict LRU policy until enough space - // is freed or the lru list is empty - EvictFromLRU(total_charge, &last_reference_list); - - if ((usage_ + total_charge) > capacity_ && - (strict_capacity_limit_ || handle == nullptr)) { - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry inserted - // into cache and get evicted immediately. - e->SetInCache(false); - last_reference_list.push_back(e); - } else { - delete[] reinterpret_cast(e); - *handle = nullptr; - s = Status::Incomplete("Insert failed due to LRU cache being full."); - } - } else { - // Insert into the cache. 
Note that the cache might get larger than its - // capacity if not enough space was freed up. - LRUHandle* old = table_.Insert(e); - usage_ += total_charge; - if (old != nullptr) { - assert(old->InCache()); - old->SetInCache(false); - if (!old->HasRefs()) { - // old is on LRU because it's in cache and its reference count is 0 - LRU_Remove(old); - size_t old_total_charge = - old->CalcTotalCharge(metadata_charge_policy_); - assert(usage_ >= old_total_charge); - usage_ -= old_total_charge; - last_reference_list.push_back(old); - } - } - if (handle == nullptr) { - LRU_Insert(e); - } else { - e->Ref(); - *handle = reinterpret_cast(e); - } - } - } - - // Free the entries here outside of mutex for performance reasons - for (auto entry : last_reference_list) { - entry->Free(); - } - return s; + return InsertItem(e, handle, /* free_handle_on_fail */ true); } void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { @@ -442,6 +610,18 @@ } } +bool LRUCacheShard::IsReady(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + MutexLock l(&mutex_); + bool ready = true; + if (e->IsPending()) { + assert(secondary_cache_); + assert(e->sec_handle); + ready = e->sec_handle->IsReady(); + } + return ready; +} + size_t LRUCacheShard::GetUsage() const { MutexLock l(&mutex_); return usage_; @@ -468,7 +648,8 @@ bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, std::move(allocator)) { num_shards_ = 1 << num_shard_bits; @@ -476,10 +657,12 @@ port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_)); size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; for (int i = 0; i < num_shards_; i++) { - new (&shards_[i]) - LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, - 
use_adaptive_mutex, metadata_charge_policy); + new (&shards_[i]) LRUCacheShard( + per_shard, strict_capacity_limit, high_pri_pool_ratio, + use_adaptive_mutex, metadata_charge_policy, + /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache); } + secondary_cache_ = secondary_cache; } LRUCache::~LRUCache() { @@ -492,11 +675,11 @@ } } -CacheShard* LRUCache::GetShard(int shard) { +CacheShard* LRUCache::GetShard(uint32_t shard) { return reinterpret_cast(&shards_[shard]); } -const CacheShard* LRUCache::GetShard(int shard) const { +const CacheShard* LRUCache::GetShard(uint32_t shard) const { return reinterpret_cast(&shards_[shard]); } @@ -508,23 +691,25 @@ return reinterpret_cast(handle)->charge; } +Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { + auto h = reinterpret_cast(handle); + if (h->IsSecondaryCacheCompatible()) { + return h->info_.helper->del_cb; + } else { + return h->info_.deleter; + } +} + uint32_t LRUCache::GetHash(Handle* handle) const { return reinterpret_cast(handle)->hash; } void LRUCache::DisownData() { -// Do not drop data if compile with ASAN to suppress leak warning. 
-#if defined(__clang__) -#if !defined(__has_feature) || !__has_feature(address_sanitizer) - shards_ = nullptr; - num_shards_ = 0; -#endif -#else // __clang__ -#ifndef __SANITIZE_ADDRESS__ - shards_ = nullptr; - num_shards_ = 0; -#endif // !__SANITIZE_ADDRESS__ -#endif // __clang__ + // Leak data only if that won't generate an ASAN/valgrind warning + if (!kMustFreeHeapAllocations) { + shards_ = nullptr; + num_shards_ = 0; + } } size_t LRUCache::TEST_GetLRUSize() { @@ -543,19 +728,42 @@ return result; } -std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { - return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, - cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, - cache_opts.metadata_charge_policy); +void LRUCache::WaitAll(std::vector& handles) { + if (secondary_cache_) { + std::vector sec_handles; + sec_handles.reserve(handles.size()); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + sec_handles.emplace_back(lru_handle->sec_handle); + } + secondary_cache_->WaitAll(sec_handles); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + uint32_t hash = GetHash(handle); + LRUCacheShard* shard = static_cast(GetShard(Shard(hash))); + shard->Promote(lru_handle); + } + } } std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) { + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -568,7 +776,25 @@ } return 
std::make_shared( capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy); + std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy, + secondary_cache); +} + +std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { + return NewLRUCache( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); } +std::shared_ptr NewLRUCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, + std::shared_ptr memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, + metadata_charge_policy, nullptr); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -1,4 +1,4 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). @@ -8,12 +8,14 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once +#include #include #include "cache/sharded_cache.h" - +#include "port/lang.h" #include "port/malloc.h" #include "port/port.h" +#include "rocksdb/secondary_cache.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -49,8 +51,18 @@ struct LRUHandle { void* value; - void (*deleter)(const Slice&, void* value); - LRUHandle* next_hash; + union Info { + Info() {} + ~Info() {} + Cache::DeleterFn deleter; + const ShardedCache::CacheItemHelper* helper; + } info_; + // An entry is not added to the LRUHandleTable until the secondary cache + // lookup is complete, so its safe to have this union. + union { + LRUHandle* next_hash; + SecondaryCacheResultHandle* sec_handle; + }; LRUHandle* next; LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? @@ -67,12 +79,26 @@ IS_HIGH_PRI = (1 << 1), // Whether this entry is in high-pri pool. IN_HIGH_PRI_POOL = (1 << 2), - // Wwhether this entry has had any lookups (hits). + // Whether this entry has had any lookups (hits). HAS_HIT = (1 << 3), + // Can this be inserted into the secondary cache + IS_SECONDARY_CACHE_COMPATIBLE = (1 << 4), + // Is the handle still being read from a lower tier + IS_PENDING = (1 << 5), + // Has the item been promoted from a lower tier + IS_PROMOTED = (1 << 6), }; uint8_t flags; +#ifdef __SANITIZE_THREAD__ + // TSAN can report a false data race on flags, where one thread is writing + // to one of the mutable bits and another thread is reading this immutable + // bit. So precisely suppress that TSAN warning, we separate out this bit + // during TSAN runs. + bool is_secondary_cache_compatible_for_tsan; +#endif // __SANITIZE_THREAD__ + // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!) 
char key_data[1]; @@ -95,6 +121,15 @@ bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } bool HasHit() const { return flags & HAS_HIT; } + bool IsSecondaryCacheCompatible() const { +#ifdef __SANITIZE_THREAD__ + return is_secondary_cache_compatible_for_tsan; +#else + return flags & IS_SECONDARY_CACHE_COMPATIBLE; +#endif // __SANITIZE_THREAD__ + } + bool IsPending() const { return flags & IS_PENDING; } + bool IsPromoted() const { return flags & IS_PROMOTED; } void SetInCache(bool in_cache) { if (in_cache) { @@ -122,15 +157,58 @@ void SetHit() { flags |= HAS_HIT; } + void SetSecondaryCacheCompatible(bool compat) { + if (compat) { + flags |= IS_SECONDARY_CACHE_COMPATIBLE; + } else { + flags &= ~IS_SECONDARY_CACHE_COMPATIBLE; + } +#ifdef __SANITIZE_THREAD__ + is_secondary_cache_compatible_for_tsan = compat; +#endif // __SANITIZE_THREAD__ + } + + void SetIncomplete(bool incomp) { + if (incomp) { + flags |= IS_PENDING; + } else { + flags &= ~IS_PENDING; + } + } + + void SetPromoted(bool promoted) { + if (promoted) { + flags |= IS_PROMOTED; + } else { + flags &= ~IS_PROMOTED; + } + } + void Free() { assert(refs == 0); - if (deleter) { - (*deleter)(key(), value); +#ifdef __SANITIZE_THREAD__ + // Here we can safely assert they are the same without a data race reported + assert(((flags & IS_SECONDARY_CACHE_COMPATIBLE) != 0) == + is_secondary_cache_compatible_for_tsan); +#endif // __SANITIZE_THREAD__ + if (!IsSecondaryCacheCompatible() && info_.deleter) { + (*info_.deleter)(key(), value); + } else if (IsSecondaryCacheCompatible()) { + if (IsPending()) { + assert(sec_handle != nullptr); + SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; + tmp_sec_handle->Wait(); + value = tmp_sec_handle->Value(); + delete tmp_sec_handle; + } + if (value) { + (*info_.helper->del_cb)(key(), value); + } } delete[] reinterpret_cast(this); } - // Caclculate the memory usage by metadata + // Calculate the memory usage by 
metadata inline size_t CalcTotalCharge( CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; @@ -153,7 +231,10 @@ // 4.4.3's builtin hashtable. class LRUHandleTable { public: - LRUHandleTable(); + // If the table uses more hash bits than `max_upper_hash_bits`, + // it will eat into the bits used for sharding, which are constant + // for a given LRUHandleTable. + explicit LRUHandleTable(int max_upper_hash_bits); ~LRUHandleTable(); LRUHandle* Lookup(const Slice& key, uint32_t hash); @@ -161,8 +242,8 @@ LRUHandle* Remove(const Slice& key, uint32_t hash); template - void ApplyToAllCacheEntries(T func) { - for (uint32_t i = 0; i < length_; i++) { + void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { + for (uint32_t i = index_begin; i < index_end; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { auto n = h->next_hash; @@ -173,6 +254,8 @@ } } + int GetLengthBits() const { return length_bits_; } + private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a @@ -181,11 +264,19 @@ void Resize(); + // Number of hash bits (upper because lower bits used for sharding) + // used for table index. Length == 1 << length_bits_ + int length_bits_; + // The table consists of an array of buckets where each bucket is // a linked list of cache entries that hash into the bucket. - LRUHandle** list_; - uint32_t length_; + std::unique_ptr list_; + + // Number of elements currently in the table uint32_t elems_; + + // Set from max_upper_hash_bits (see constructor) + const int max_length_bits_; }; // A single shard of sharded cache. 
@@ -193,7 +284,9 @@ public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy); + CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache); virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache @@ -209,11 +302,35 @@ // Like Cache methods, but with an extra "hash" parameter. virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, - Cache::Priority priority) override; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Priority priority) override { + return Insert(key, hash, value, charge, deleter, nullptr, handle, priority); + } + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, + Cache::Priority priority) override { + assert(helper); + return Insert(key, hash, value, charge, nullptr, helper, handle, priority); + } + // If helper_cb is null, the values of the following arguments don't + // matter + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, + ShardedCache::Priority priority, bool wait, + Statistics* stats) override; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override { + return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true, + nullptr); + } + virtual bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + virtual bool IsReady(Cache::Handle* /*handle*/) override; + virtual void Wait(Cache::Handle* /*handle*/) override {} virtual bool 
Ref(Cache::Handle* handle) override; virtual bool Release(Cache::Handle* handle, bool force_erase = false) override; @@ -226,8 +343,10 @@ virtual size_t GetUsage() const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; virtual void EraseUnRefEntries() override; @@ -239,10 +358,27 @@ // not threadsafe size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: + friend class LRUCache; + // Insert an item into the hash table and, if handle is null, insert into + // the LRU list. Older items are evicted as necessary. If the cache is full + // and free_handle_on_fail is true, the item is deleted and handle is set to. + Status InsertItem(LRUHandle* item, Cache::Handle** handle, + bool free_handle_on_fail); + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + DeleterFn deleter, const Cache::CacheItemHelper* helper, + Cache::Handle** handle, Cache::Priority priority); + // Promote an item looked up from the secondary cache to the LRU cache. The + // item is only inserted into the hash table and not the LRU list, and only + // if the cache is not at full capacity, as is the case during Insert. The + // caller should hold a reference on the LRUHandle. When the caller releases + // the last reference, the item is added to the LRU list. + // The item is promoted to the high pri or low pri pool as specified by the + // caller in Lookup. + void Promote(LRUHandle* e); void LRU_Remove(LRUHandle* e); void LRU_Insert(LRUHandle* e); @@ -303,6 +439,8 @@ // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. 
mutable port::Mutex mutex_; + + std::shared_ptr secondary_cache_; }; class LRUCache @@ -316,24 +454,28 @@ std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex, CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata); + kDontChargeCacheMetadata, + const std::shared_ptr& secondary_cache = nullptr); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } - virtual CacheShard* GetShard(int shard) override; - virtual const CacheShard* GetShard(int shard) const override; + virtual CacheShard* GetShard(uint32_t shard) override; + virtual const CacheShard* GetShard(uint32_t shard) const override; virtual void* Value(Handle* handle) override; virtual size_t GetCharge(Handle* handle) const override; virtual uint32_t GetHash(Handle* handle) const override; + virtual DeleterFn GetDeleter(Handle* handle) const override; virtual void DisownData() override; + virtual void WaitAll(std::vector& handles) override; // Retrieves number of elements in LRU, for unit test purpose only size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: LRUCacheShard* shards_ = nullptr; int num_shards_ = 0; + std::shared_ptr secondary_cache_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/lru_cache_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/lru_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,21 @@ #include #include + +#include "cache/cache_key.h" +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" +#include 
"rocksdb/utilities/cache_dump_load.h" #include "test_util/testharness.h" +#include "util/coding.h" +#include "util/random.h" +#include "utilities/cache_dump_load_impl.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -30,15 +43,17 @@ DeleteCache(); cache_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); - new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/, - high_pri_pool_ratio, use_adaptive_mutex, - kDontChargeCacheMetadata); + new (cache_) LRUCacheShard( + capacity, false /*strict_capcity_limit*/, high_pri_pool_ratio, + use_adaptive_mutex, kDontChargeCacheMetadata, + 24 /*max_upper_hash_bits*/, nullptr /*secondary_cache*/); } void Insert(const std::string& key, Cache::Priority priority = Cache::Priority::LOW) { - cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/, - nullptr /*deleter*/, nullptr /*handle*/, priority); + EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/, + nullptr /*deleter*/, nullptr /*handle*/, + priority)); } void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) { @@ -190,6 +205,1641 @@ ValidateLRUList({"e", "f", "g", "Z", "d"}, 2); } +class TestSecondaryCache : public SecondaryCache { + public: + // Specifies what action to take on a lookup for a particular key + enum ResultType { + SUCCESS, + // Fail lookup immediately + FAIL, + // Defer the result. 
It will returned after Wait/WaitAll is called + DEFER, + // Defer the result and eventually return failure + DEFER_AND_FAIL + }; + + using ResultMap = std::unordered_map; + + explicit TestSecondaryCache(size_t capacity) + : num_inserts_(0), num_lookups_(0), inject_failure_(false) { + cache_ = NewLRUCache(capacity, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + } + ~TestSecondaryCache() override { cache_.reset(); } + + const char* Name() const override { return "TestSecondaryCache"; } + + void InjectFailure() { inject_failure_ = true; } + + void ResetInjectFailure() { inject_failure_ = false; } + + void SetDbSessionId(const std::string& db_session_id) { + // NOTE: we assume the file is smaller than kMaxFileSizeStandardEncoding + // for this to work, but that's safe in a test. + auto base = OffsetableCacheKey("unknown", db_session_id, 1, 1); + ckey_prefix_ = base.CommonPrefixSlice().ToString(); + } + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) override { + if (inject_failure_) { + return Status::Corruption("Insertion Data Corrupted"); + } + EXPECT_TRUE(IsDbSessionLowerAsKeyPrefix(key)); + size_t size; + char* buf; + Status s; + + num_inserts_++; + size = (*helper->size_cb)(value); + buf = new char[size + sizeof(uint64_t)]; + EncodeFixed64(buf, size); + s = (*helper->saveto_cb)(value, 0, size, buf + sizeof(uint64_t)); + if (!s.ok()) { + delete[] buf; + return s; + } + return cache_->Insert(key, buf, size, + [](const Slice& /*key*/, void* val) -> void { + delete[] static_cast(val); + }); + } + + std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, + bool /*wait*/) override { + std::string key_str = key.ToString(); + TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); + + std::unique_ptr secondary_handle; + ResultType type = ResultType::SUCCESS; + auto iter = result_map_.find(key.ToString()); + if (iter != result_map_.end()) { + type = iter->second; + 
} + if (type == ResultType::FAIL) { + return secondary_handle; + } + + Cache::Handle* handle = cache_->Lookup(key); + num_lookups_++; + if (handle) { + void* value = nullptr; + size_t charge = 0; + Status s; + if (type != ResultType::DEFER_AND_FAIL) { + char* ptr = (char*)cache_->Value(handle); + size_t size = DecodeFixed64(ptr); + ptr += sizeof(uint64_t); + s = create_cb(ptr, size, &value, &charge); + } + if (s.ok()) { + secondary_handle.reset(new TestSecondaryCacheResultHandle( + cache_.get(), handle, value, charge, type)); + } else { + cache_->Release(handle); + } + } + return secondary_handle; + } + + void Erase(const Slice& /*key*/) override {} + + void WaitAll(std::vector handles) override { + for (SecondaryCacheResultHandle* handle : handles) { + TestSecondaryCacheResultHandle* sec_handle = + static_cast(handle); + sec_handle->SetReady(); + } + } + + std::string GetPrintableOptions() const override { return ""; } + + void SetResultMap(ResultMap&& map) { result_map_ = std::move(map); } + + uint32_t num_inserts() { return num_inserts_; } + + uint32_t num_lookups() { return num_lookups_; } + + bool IsDbSessionLowerAsKeyPrefix(const Slice& key) { + return key.starts_with(ckey_prefix_); + } + + private: + class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle { + public: + TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle, + void* value, size_t size, ResultType type) + : cache_(cache), + handle_(handle), + value_(value), + size_(size), + is_ready_(true) { + if (type != ResultType::SUCCESS) { + is_ready_ = false; + } + } + + ~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); } + + bool IsReady() override { return is_ready_; } + + void Wait() override {} + + void* Value() override { + assert(is_ready_); + return value_; + } + + size_t Size() override { return Value() ? 
size_ : 0; } + + void SetReady() { is_ready_ = true; } + + private: + Cache* cache_; + Cache::Handle* handle_; + void* value_; + size_t size_; + bool is_ready_; + }; + + std::shared_ptr cache_; + uint32_t num_inserts_; + uint32_t num_lookups_; + bool inject_failure_; + std::string ckey_prefix_; + ResultMap result_map_; +}; + +class DBSecondaryCacheTest : public DBTestBase { + public: + DBSecondaryCacheTest() + : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +class LRUSecondaryCacheTest : public LRUCacheTest { + public: + LRUSecondaryCacheTest() : fail_create_(false) {} + ~LRUSecondaryCacheTest() {} + + protected: + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() {} + + char* Buf() { return buf_.get(); } + size_t Size() { return size_; } + std::string ToString() { return std::string(Buf(), Size()); } + + private: + std::unique_ptr buf_; + size_t size_; + }; + + static size_t SizeCallback(void* obj) { + return reinterpret_cast(obj)->Size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + TestItem* item = reinterpret_cast(from_obj); + char* buf = item->Buf(); + EXPECT_EQ(length, item->Size()); + EXPECT_EQ(from_offset, 0); + memcpy(out, buf, length); + return Status::OK(); + } + + static void DeletionCallback(const Slice& /*key*/, void* obj) { + delete reinterpret_cast(obj); + } + + static Cache::CacheItemHelper helper_; + + static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/, + size_t /*size*/, void* /*out*/) { + return Status::NotSupported(); + } + + static Cache::CacheItemHelper helper_fail_; + + Cache::CreateCallback test_item_creator = + [&](void* buf, size_t size, void** 
out_obj, size_t* charge) -> Status { + if (fail_create_) { + return Status::NotSupported(); + } + *out_obj = reinterpret_cast(new TestItem((char*)buf, size)); + *charge = size; + return Status::OK(); + }; + + void SetFailCreate(bool fail) { fail_create_ = fail; } + + private: + bool fail_create_; +}; + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_( + LRUSecondaryCacheTest::SizeCallback, LRUSecondaryCacheTest::SaveToCallback, + LRUSecondaryCacheTest::DeletionCallback); + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_fail_( + LRUSecondaryCacheTest::SizeCallback, + LRUSecondaryCacheTest::SaveToCallbackFail, + LRUSecondaryCacheTest::DeletionCallback); + +TEST_F(LRUSecondaryCacheTest, BasicTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr stats = CreateDBStatistics(); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + get_perf_context()->Reset(); + Cache::Handle* handle; + handle = + cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, test_item_creator, + Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should promote k1 and demote k2 + handle = + cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, test_item_creator, + Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 2u); 
+ ASSERT_EQ(secondary_cache->num_lookups(), 1u); + ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_HITS), + secondary_cache->num_lookups()); + PerfContext perf_ctx = *get_perf_context(); + ASSERT_EQ(perf_ctx.secondary_cache_hit_count, secondary_cache->num_lookups()); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, BasicFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_NOK(cache->Insert("k1", item1, nullptr, str1.length())); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", nullptr, test_item_creator, Cache::Priority::LOW, + true); + ASSERT_EQ(handle, nullptr); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false); + ASSERT_EQ(handle, nullptr); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, SaveFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_fail_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_fail_, + 
str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 demotion would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, CreateFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + SetFailCreate(true); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 creation would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, 
nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, FullCapacityTest) { + LRUCacheOptions opts(1024, 0, /*_strict_capacity_limit=*/true, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + // k1 promotion should fail due to the block cache being at capacity, + // but the lookup should still succeed + Cache::Handle* handle2; + handle2 = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle2, nullptr); + // Since k1 didn't get inserted, k2 should still be in cache + cache->Release(handle); + cache->Release(handle2); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + 
ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. 
Note that, block_1 is never successfully + // inserted to the block cache. Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + + Destroy(options); +} + +// In this test, the block cache size is set to 6100, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// insert and cache block_1 in the block cache (this is the different place +// from TestSecondaryCacheCorrectness1) +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { + LRUCacheOptions opts(6100, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST 
file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Thefore, block_1 is evicted from block + // cache and successfully inserted to the secondary cache. Here are 2 + // lookups in the secondary cache for block_1 and block_2. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. After Flush, only block_2 is cached in block cache + // and block_1 is in the secondary cache. So when read block_1, it is + // read out from secondary cache and inserted to block cache. At the same + // time, block_2 is inserted to secondary cache. Now, secondary cache has + // both block_1 and block_2. After compaction, block_1 is in the cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is not in the block cache. So + // it will lookup the secondary cache for block_2 and cache it in the + // block_cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is already in the block cache. + // No need to lookup secondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is not in block cache + // there is one econdary cache lookup. Then, block_1 is cached in the + // block cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + Destroy(options); +} + +// The block cache size is set to 1024*1024, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// cache all the blocks in the block cache and there is not secondary cache +// insertion. 2 lookup is needed for the blocks. 
+TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { + LRUCacheOptions opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. Now, block cache is large enough, it cache + // both block_1 and block_2. When first time read block_1 and block_2 + // there are cache misses. So 2 secondary cache lookups are needed for + // the 2 blocks + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will iterate the whole SST file. Since all the data blocks + // are in the block cache. No need to lookup the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1000, v.size()); + // Since the block cache is large enough, all the blocks are cached. we + // do not need to lookup the seondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { + LRUCacheOptions opts(8 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + Compact("a", "z"); + + Random r_index(47); + std::string v; + for (int i = 0; i < 1000; i++) { + uint32_t key_i = r_index.Next() % N; + v = Get(Key(key_i)); + } + + // We have over 200 data blocks there will be multiple insertion + // and lookups. + ASSERT_GE(secondary_cache->num_inserts(), 1u); + ASSERT_GE(secondary_cache->num_lookups(), 1u); + + Destroy(options); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. 
+TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB will do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Note that, block_1 is never successfully + // inserted to the block cache. Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + // Fail the insertion, in LRU cache, the secondary insertion returned status + // is not checked, therefore, the DB will not be influenced. + secondary_cache->InjectFailure(); + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + secondary_cache->ResetInjectFailure(); + + Destroy(options); +} + +TEST_F(LRUSecondaryCacheTest, BasicWaitAllTest) { + LRUCacheOptions opts(1024, 2, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(32 * 1024); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + const int num_keys = 32; + + Random rnd(301); + std::vector values; + for (int i = 0; i < num_keys; ++i) { + std::string str = rnd.RandomString(1020); + values.emplace_back(str); + TestItem* item = new TestItem(str.data(), str.length()); + ASSERT_OK(cache->Insert("k" + std::to_string(i), item, + &LRUSecondaryCacheTest::helper_, str.length())); + } + // Force all entries to be evicted to the secondary cache + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 32u); + cache->SetCapacity(32 * 1024); + + secondary_cache->SetResultMap( + {{"k3", TestSecondaryCache::ResultType::DEFER}, + {"k4", TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {"k5", TestSecondaryCache::ResultType::FAIL}}); + std::vector results; + for (int i = 0; i < 6; ++i) { + results.emplace_back( + cache->Lookup("k" + std::to_string(i), &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false)); + } + cache->WaitAll(results); + for (int i = 0; i < 6; ++i) { + if (i == 4) { + ASSERT_EQ(cache->Value(results[i]), nullptr); + } else if (i == 5) { + ASSERT_EQ(results[i], nullptr); + continue; + } else { + TestItem* item = static_cast(cache->Value(results[i])); + ASSERT_EQ(item->ToString(), values[i]); + } + cache->Release(results[i]); + } + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, we have one KV pair per data block. 
We indirectly determine +// the cache key associated with each data block (and thus each KV) by using +// a sync point callback in TestSecondaryCache::Lookup. We then control the +// lookup result by setting the ResultMap. +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { + LRUCacheOptions opts(1 << 20, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 8; + std::vector keys; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(4000); + keys.emplace_back(p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB does the paranoid check for the new + // SST file. This will try to lookup all data blocks in the secondary + // cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 8u); + + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 8u); + cache->SetCapacity(1 << 20); + + std::vector cache_keys; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TestSecondaryCache::Lookup", [&cache_keys](void* key) -> void { + cache_keys.emplace_back(*(static_cast(key))); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < N; ++i) { + std::string v = Get(Key(i)); + ASSERT_EQ(4000, v.size()); + ASSERT_EQ(v, keys[i]); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(secondary_cache->num_lookups(), 16u); + cache->SetCapacity(0); + cache->SetCapacity(1 << 20); + + ASSERT_EQ(Get(Key(2)), keys[2]); + ASSERT_EQ(Get(Key(7)), keys[7]); + secondary_cache->SetResultMap( + {{cache_keys[3], TestSecondaryCache::ResultType::DEFER}, + {cache_keys[4], TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {cache_keys[5], TestSecondaryCache::ResultType::FAIL}}); + + std::vector mget_keys( + {Key(0), Key(1), Key(2), Key(3), Key(4), Key(5), Key(6), Key(7)}); + std::vector values(mget_keys.size()); + std::vector s(keys.size()); + std::vector key_slices; + for (const std::string& key : mget_keys) { + key_slices.emplace_back(key); + } + uint32_t num_lookups = secondary_cache->num_lookups(); + dbfull()->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), + key_slices.size(), key_slices.data(), values.data(), + s.data(), false); + ASSERT_EQ(secondary_cache->num_lookups(), num_lookups + 5); + for (int i = 0; i < N; ++i) { + ASSERT_OK(s[i]); + ASSERT_EQ(values[i].ToString(), keys[i]); + values[i].Reset(); + } + Destroy(options); +} + +class LRUCacheWithStat : public LRUCache { + public: + LRUCacheWithStat( + size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, + double _high_pri_pool_ratio, + std::shared_ptr _memory_allocator = nullptr, + bool _use_adaptive_mutex = 
kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDontChargeCacheMetadata, + const std::shared_ptr& _secondary_cache = nullptr) + : LRUCache(_capacity, _num_shard_bits, _strict_capacity_limit, + _high_pri_pool_ratio, _memory_allocator, _use_adaptive_mutex, + _metadata_charge_policy, _secondary_cache) { + insert_count_ = 0; + lookup_count_ = 0; + } + ~LRUCacheWithStat() {} + + Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, + Handle** handle, Priority priority) override { + insert_count_++; + return LRUCache::Insert(key, value, charge, deleter, handle, priority); + } + Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, + size_t chargge, Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + insert_count_++; + return LRUCache::Insert(key, value, helper, chargge, handle, priority); + } + Handle* Lookup(const Slice& key, Statistics* stats) override { + lookup_count_++; + return LRUCache::Lookup(key, stats); + } + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, bool wait, + Statistics* stats = nullptr) override { + lookup_count_++; + return LRUCache::Lookup(key, helper, create_cb, priority, wait, stats); + } + + uint32_t GetInsertCount() { return insert_count_; } + uint32_t GetLookupcount() { return lookup_count_; } + void ResetCount() { + insert_count_ = 0; + lookup_count_ = 0; + } + + private: + uint32_t insert_count_; + uint32_t lookup_count_; +}; + +#ifndef ROCKSDB_LITE + +TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { + LRUCacheOptions cache_opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + 
cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache(tmp_cache); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + DestroyAndReopen(options); + fault_fs_->SetFailGetUniqueId(true); + + Random rnd(301); + const int N = 256; + std::vector value; + char buf[1000]; + memset(buf, 'a', 1000); + value.resize(N); + for (int i = 0; i < N; i++) { + // std::string p_v = rnd.RandomString(1000); + std::string p_v(buf, 1000); + value[i] = p_v; + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + Compact("a", "z"); + + // do th eread for all the key value pairs, so all the blocks should be in + // cache + uint32_t start_insert = tmp_cache->GetInsertCount(); + uint32_t start_lookup = tmp_cache->GetLookupcount(); + std::string v; + for (int i = 0; i < N; i++) { + v = Get(Key(i)); + ASSERT_EQ(v, value[i]); + } + uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + ASSERT_EQ(63, + static_cast(dump_insert)); // the insert in the block cache + ASSERT_EQ(256, + static_cast(dump_lookup)); // the lookup in the block cache + // We have enough blocks in the block cache + + CacheDumpOptions cd_options; + cd_options.clock = fault_env_->GetSystemClock().get(); + std::string dump_path = db_->GetName() + "/cache_dump"; + std::unique_ptr dump_writer; + Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path, + &dump_writer); + ASSERT_OK(s); + std::unique_ptr cache_dumper; + s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer), + &cache_dumper); + ASSERT_OK(s); + std::vector db_list; + db_list.push_back(db_); + s = cache_dumper->SetDumpFilter(db_list); + ASSERT_OK(s); + s = 
cache_dumper->DumpCacheEntriesToWriter(); + ASSERT_OK(s); + cache_dumper.reset(); + + // we have a new cache it is empty, then, before we do the Get, we do the + // dumpload + std::shared_ptr secondary_cache = + std::make_shared(2048 * 1024); + cache_opts.secondary_cache = secondary_cache; + tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache_new(tmp_cache); + table_options.block_cache = cache_new; + table_options.block_size = 4 * 1024; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + + // start to load the data to new block cache + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + std::unique_ptr dump_reader; + s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path, + &dump_reader); + ASSERT_OK(s); + std::unique_ptr cache_loader; + s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache, + std::move(dump_reader), &cache_loader); + ASSERT_OK(s); + s = cache_loader->RestoreCacheEntriesToSecondaryCache(); + ASSERT_OK(s); + uint32_t load_insert = secondary_cache->num_inserts() - start_insert; + uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup; + // check the number we inserted + ASSERT_EQ(64, static_cast(load_insert)); + ASSERT_EQ(0, static_cast(load_lookup)); + ASSERT_OK(s); + + Reopen(options); + + // After load, we do the Get again + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + uint32_t cache_insert = tmp_cache->GetInsertCount(); + uint32_t cache_lookup = tmp_cache->GetLookupcount(); + for (int i = 0; i < N; i++) { + v = Get(Key(i)); + ASSERT_EQ(v, value[i]); + } + uint32_t 
final_insert = secondary_cache->num_inserts() - start_insert; + uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup; + // no insert to secondary cache + ASSERT_EQ(0, static_cast(final_insert)); + // lookup the secondary to get all blocks + ASSERT_EQ(64, static_cast(final_lookup)); + uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + // Check the new block cache insert and lookup, should be no insert since all + // blocks are from the secondary cache. + ASSERT_EQ(0, static_cast(block_insert)); + ASSERT_EQ(256, static_cast(block_lookup)); + + fault_fs_->SetFailGetUniqueId(false); + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { + LRUCacheOptions cache_opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache(tmp_cache); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + std::string dbname1 = test::PerThreadDBPath("db_1"); + ASSERT_OK(DestroyDB(dbname1, options)); + DB* db1 = nullptr; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + std::string dbname2 = test::PerThreadDBPath("db_2"); + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + ASSERT_OK(DB::Open(options, dbname2, &db2)); + fault_fs_->SetFailGetUniqueId(true); + + // write the KVs to db1 + Random rnd(301); + const int N = 256; + std::vector value1; + 
WriteOptions wo; + char buf[1000]; + memset(buf, 'a', 1000); + value1.resize(N); + for (int i = 0; i < N; i++) { + std::string p_v(buf, 1000); + value1[i] = p_v; + ASSERT_OK(db1->Put(wo, Key(i), p_v)); + } + ASSERT_OK(db1->Flush(FlushOptions())); + Slice bg("a"); + Slice ed("b"); + ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed)); + + // Write the KVs to DB2 + std::vector value2; + memset(buf, 'b', 1000); + value2.resize(N); + for (int i = 0; i < N; i++) { + std::string p_v(buf, 1000); + value2[i] = p_v; + ASSERT_OK(db2->Put(wo, Key(i), p_v)); + } + ASSERT_OK(db2->Flush(FlushOptions())); + ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed)); + + // do th eread for all the key value pairs, so all the blocks should be in + // cache + uint32_t start_insert = tmp_cache->GetInsertCount(); + uint32_t start_lookup = tmp_cache->GetLookupcount(); + ReadOptions ro; + std::string v; + for (int i = 0; i < N; i++) { + ASSERT_OK(db1->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value1[i]); + } + for (int i = 0; i < N; i++) { + ASSERT_OK(db2->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value2[i]); + } + uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + ASSERT_EQ(128, + static_cast(dump_insert)); // the insert in the block cache + ASSERT_EQ(512, + static_cast(dump_lookup)); // the lookup in the block cache + // We have enough blocks in the block cache + + CacheDumpOptions cd_options; + cd_options.clock = fault_env_->GetSystemClock().get(); + std::string dump_path = db1->GetName() + "/cache_dump"; + std::unique_ptr dump_writer; + Status s = NewToFileCacheDumpWriter(fault_fs_, FileOptions(), dump_path, + &dump_writer); + ASSERT_OK(s); + std::unique_ptr cache_dumper; + s = NewDefaultCacheDumper(cd_options, cache, std::move(dump_writer), + &cache_dumper); + ASSERT_OK(s); + std::vector db_list; + db_list.push_back(db1); + s = cache_dumper->SetDumpFilter(db_list); + ASSERT_OK(s); + s = 
cache_dumper->DumpCacheEntriesToWriter(); + ASSERT_OK(s); + cache_dumper.reset(); + + // we have a new cache it is empty, then, before we do the Get, we do the + // dumpload + std::shared_ptr secondary_cache = + std::make_shared(2048 * 1024); + cache_opts.secondary_cache = secondary_cache; + tmp_cache = new LRUCacheWithStat( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); + std::shared_ptr cache_new(tmp_cache); + table_options.block_cache = cache_new; + table_options.block_size = 4 * 1024; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + + // Start the cache loading process + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + std::unique_ptr dump_reader; + s = NewFromFileCacheDumpReader(fault_fs_, FileOptions(), dump_path, + &dump_reader); + ASSERT_OK(s); + std::unique_ptr cache_loader; + s = NewDefaultCacheDumpedLoader(cd_options, table_options, secondary_cache, + std::move(dump_reader), &cache_loader); + ASSERT_OK(s); + s = cache_loader->RestoreCacheEntriesToSecondaryCache(); + ASSERT_OK(s); + uint32_t load_insert = secondary_cache->num_inserts() - start_insert; + uint32_t load_lookup = secondary_cache->num_lookups() - start_lookup; + // check the number we inserted + ASSERT_EQ(64, static_cast(load_insert)); + ASSERT_EQ(0, static_cast(load_lookup)); + ASSERT_OK(s); + + ASSERT_OK(db1->Close()); + delete db1; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + + // After load, we do the Get again. To validate the cache, we do not allow any + // I/O, so we set the file system to false. 
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + fault_fs_->SetFilesystemActive(false, error_msg); + start_insert = secondary_cache->num_inserts(); + start_lookup = secondary_cache->num_lookups(); + uint32_t cache_insert = tmp_cache->GetInsertCount(); + uint32_t cache_lookup = tmp_cache->GetLookupcount(); + for (int i = 0; i < N; i++) { + ASSERT_OK(db1->Get(ro, Key(i), &v)); + ASSERT_EQ(v, value1[i]); + } + uint32_t final_insert = secondary_cache->num_inserts() - start_insert; + uint32_t final_lookup = secondary_cache->num_lookups() - start_lookup; + // no insert to secondary cache + ASSERT_EQ(0, static_cast(final_insert)); + // lookup the secondary to get all blocks + ASSERT_EQ(64, static_cast(final_lookup)); + uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + // Check the new block cache insert and lookup, should be no insert since all + // blocks are from the secondary cache. + ASSERT_EQ(0, static_cast(block_insert)); + ASSERT_EQ(256, static_cast(block_lookup)); + fault_fs_->SetFailGetUniqueId(false); + fault_fs_->SetFilesystemActive(true); + delete db1; + delete db2; + ASSERT_OK(DestroyDB(dbname1, options)); + ASSERT_OK(DestroyDB(dbname2, options)); +} + +// Test the option not to use the secondary cache in a certain DB. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + options.lowest_used_cache_tier = CacheTier::kVolatileTier; + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i + 70), p_v)); + } + + ASSERT_OK(Flush()); + + // Flush will trigger the paranoid check and read blocks. But only block cache + // will be read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Compact("a", "z"); + + // Compaction will also insert and evict blocks, no operations to the block + // cache. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + + // Check the data in first block. Cache miss, direclty read from SST file. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // Check the second block. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // block cache hit + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(70)); + ASSERT_EQ(1007, v.size()); + + // Check the first block in the second SST file. Cache miss and trigger SST + // file read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(75)); + ASSERT_EQ(1007, v.size()); + + // Check the second block in the second SST file. Cache miss and trigger SST + // file read. No operations for secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Destroy(options); +} + +// We disable the secondary cache in DBOptions at first. Close and reopen the DB +// with new options, which set the lowest_used_cache_tier to +// kNonVolatileBlockTier. So secondary cache will be used. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + options.lowest_used_cache_tier = CacheTier::kVolatileTier; + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i + 70), p_v)); + } + + ASSERT_OK(Flush()); + + // Flush will trigger the paranoid check and read blocks. But only block cache + // will be read. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + Compact("a", "z"); + + // Compaction will also insert and evict blocks, no operations to the block + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + + // Check the data in first block. Cache miss, direclty read from SST file. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // Check the second block. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + // block cache hit + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + + // Change the option to enable secondary cache after we Reopen the DB + options.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; + Reopen(options); + + v = Get(Key(70)); + ASSERT_EQ(1007, v.size()); + + // Enable the secondary cache, trigger lookup of the first block in second SST + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + v = Get(Key(75)); + ASSERT_EQ(1007, v.size()); + + // trigger lookup of the second block in second SST + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + Destroy(options); +} + +// Two DB test. We create 2 DBs sharing the same block cache and secondary +// cache. We diable the secondary cache option for DB2. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + options.paranoid_file_checks = true; + std::string dbname1 = test::PerThreadDBPath("db_t_1"); + ASSERT_OK(DestroyDB(dbname1, options)); + DB* db1 = nullptr; + ASSERT_OK(DB::Open(options, dbname1, &db1)); + std::string dbname2 = test::PerThreadDBPath("db_t_2"); + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + Options options2 = options; + options2.lowest_used_cache_tier = CacheTier::kVolatileTier; + ASSERT_OK(DB::Open(options2, dbname2, &db2)); + fault_fs_->SetFailGetUniqueId(true); + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. 
+ std::string session_id; + ASSERT_OK(db1->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + + WriteOptions wo; + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(db1->Put(wo, Key(i), p_v)); + } + + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 0u); + ASSERT_OK(db1->Flush(FlushOptions())); + + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(db2->Put(wo, Key(i), p_v)); + } + + // No change in the secondary cache, since it is disabled in DB2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + ASSERT_OK(db2->Flush(FlushOptions())); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Slice bg("a"); + Slice ed("b"); + ASSERT_OK(db1->CompactRange(CompactRangeOptions(), &bg, &ed)); + ASSERT_OK(db2->CompactRange(CompactRangeOptions(), &bg, &ed)); + + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + ReadOptions ro; + std::string v; + ASSERT_OK(db1->Get(ro, Key(0), &v)); + ASSERT_EQ(1007, v.size()); + + // DB 1 has lookup block 1 and it is miss in block cache, trigger secondary + // cache lookup + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + ASSERT_OK(db1->Get(ro, Key(5), &v)); + ASSERT_EQ(1007, v.size()); + + // DB 1 lookup the second block and it is miss in block cache, trigger + // secondary cache lookup + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + ASSERT_OK(db2->Get(ro, Key(0), &v)); + ASSERT_EQ(1007, v.size()); + + // For db2, it is not enabled with secondary cache, so no search in the + // secondary cache + 
ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + ASSERT_OK(db2->Get(ro, Key(5), &v)); + ASSERT_EQ(1007, v.size()); + + // For db2, it is not enabled with secondary cache, so no search in the + // secondary cache + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + fault_fs_->SetFailGetUniqueId(false); + fault_fs_->SetFilesystemActive(true); + delete db1; + delete db2; + ASSERT_OK(DestroyDB(dbname1, options)); + ASSERT_OK(DestroyDB(dbname2, options)); +} + +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,53 +9,96 @@ #include "cache/sharded_cache.h" -#include +#include +#include +#include +#include "util/hash.h" +#include "util/math.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +namespace { + +inline uint32_t HashSlice(const Slice& s) { + return Lower32of64(GetSliceNPHash64(s)); +} + +} // namespace + ShardedCache::ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr allocator) : Cache(std::move(allocator)), - num_shard_bits_(num_shard_bits), + shard_mask_((uint32_t{1} << num_shard_bits) - 1), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), last_id_(1) {} void ShardedCache::SetCapacity(size_t capacity) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetCapacity(per_shard); } 
capacity_ = capacity; } void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit); } strict_capacity_limit_ = strict_capacity_limit; } Status ShardedCache::Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) { + DeleterFn deleter, Handle** handle, + Priority priority) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash)) ->Insert(key, hash, value, charge, deleter, handle, priority); } +Status ShardedCache::Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle, Priority priority) { + uint32_t hash = HashSlice(key); + if (!helper) { + return Status::InvalidArgument(); + } + return GetShard(Shard(hash)) + ->Insert(key, hash, value, helper, charge, handle, priority); +} + Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash))->Lookup(key, hash); } +Cache::Handle* ShardedCache::Lookup(const Slice& key, + const CacheItemHelper* helper, + const CreateCallback& create_cb, + Priority priority, bool wait, + Statistics* stats) { + uint32_t hash = HashSlice(key); + return GetShard(Shard(hash)) + ->Lookup(key, hash, helper, create_cb, priority, wait, stats); +} + +bool ShardedCache::IsReady(Handle* handle) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->IsReady(handle); +} + +void ShardedCache::Wait(Handle* handle) { + uint32_t hash = GetHash(handle); + GetShard(Shard(hash))->Wait(handle); +} + bool ShardedCache::Ref(Handle* handle) { uint32_t hash = GetHash(handle); return GetShard(Shard(hash))->Ref(handle); @@ -66,6 +109,11 @@ return 
GetShard(Shard(hash))->Release(handle, force_erase); } +bool ShardedCache::Release(Handle* handle, bool useful, bool force_erase) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->Release(handle, useful, force_erase); +} + void ShardedCache::Erase(const Slice& key) { uint32_t hash = HashSlice(key); GetShard(Shard(hash))->Erase(key, hash); @@ -87,9 +135,9 @@ size_t ShardedCache::GetUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetUsage(); } return usage; @@ -101,25 +149,42 @@ size_t ShardedCache::GetPinnedUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetPinnedUsage(); } return usage; } -void ShardedCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { - GetShard(s)->ApplyToAllCacheEntries(callback, thread_safe); - } +void ShardedCache::ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) { + uint32_t num_shards = GetNumShards(); + // Iterate over part of each shard, rotating between shards, to + // minimize impact on latency of concurrent operations. 
+ std::unique_ptr states(new uint32_t[num_shards]{}); + + uint32_t aepl_in_32 = static_cast( + std::min(size_t{UINT32_MAX}, opts.average_entries_per_lock)); + aepl_in_32 = std::min(aepl_in_32, uint32_t{1}); + + bool remaining_work; + do { + remaining_work = false; + for (uint32_t s = 0; s < num_shards; s++) { + if (states[s] != UINT32_MAX) { + GetShard(s)->ApplyToSomeEntries(callback, aepl_in_32, &states[s]); + remaining_work |= states[s] != UINT32_MAX; + } + } + } while (remaining_work); } void ShardedCache::EraseUnRefEntries() { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->EraseUnRefEntries(); } } @@ -134,7 +199,8 @@ snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n", capacity_); ret.append(buffer); - snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", num_shard_bits_); + snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", + GetNumShardBits()); ret.append(buffer); snprintf(buffer, kBufferSize, " strict_capacity_limit : %d\n", strict_capacity_limit_); @@ -159,4 +225,8 @@ return num_shard_bits; } +int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); } + +uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; } + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/cache/sharded_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cache/sharded_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,7 +14,6 @@ #include "port/port.h" #include "rocksdb/cache.h" -#include "util/hash.h" namespace ROCKSDB_NAMESPACE { @@ -24,20 +23,38 @@ CacheShard() = default; virtual ~CacheShard() = default; + using DeleterFn = Cache::DeleterFn; virtual Status Insert(const Slice& key, uint32_t hash, 
void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, DeleterFn deleter, + Cache::Handle** handle, Cache::Priority priority) = 0; + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, Cache::Handle** handle, Cache::Priority priority) = 0; virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait, + Statistics* stats) = 0; + virtual bool Release(Cache::Handle* handle, bool useful, + bool force_erase) = 0; + virtual bool IsReady(Cache::Handle* handle) = 0; + virtual void Wait(Cache::Handle* handle) = 0; virtual bool Ref(Cache::Handle* handle) = 0; - virtual bool Release(Cache::Handle* handle, bool force_erase = false) = 0; + virtual bool Release(Cache::Handle* handle, bool force_erase) = 0; virtual void Erase(const Slice& key, uint32_t hash) = 0; virtual void SetCapacity(size_t capacity) = 0; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; virtual size_t GetUsage() const = 0; virtual size_t GetPinnedUsage() const = 0; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + // Handles iterating over roughly `average_entries_per_lock` entries, using + // `state` to somehow record where it last ended up. Caller initially uses + // *state == 0 and implementation sets *state = UINT32_MAX to indicate + // completion. 
+ virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } void set_metadata_charge_policy( @@ -57,22 +74,29 @@ ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr memory_allocator = nullptr); virtual ~ShardedCache() = default; - virtual const char* Name() const override = 0; - virtual CacheShard* GetShard(int shard) = 0; - virtual const CacheShard* GetShard(int shard) const = 0; - virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const override = 0; + virtual CacheShard* GetShard(uint32_t shard) = 0; + virtual const CacheShard* GetShard(uint32_t shard) const = 0; virtual uint32_t GetHash(Handle* handle) const = 0; - virtual void DisownData() override = 0; virtual void SetCapacity(size_t capacity) override; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) override; + DeleterFn deleter, Handle** handle, + Priority priority) override; + virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t chargge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) override; virtual Handle* Lookup(const Slice& key, Statistics* stats) override; + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, + bool wait, Statistics* stats = nullptr) override; + virtual bool Release(Handle* handle, bool useful, + bool force_erase = false) override; + virtual bool IsReady(Handle* handle) override; + virtual void Wait(Handle* handle) override; virtual bool Ref(Handle* handle) override; virtual bool Release(Handle* handle, bool force_erase = 
false) override; virtual void Erase(const Slice& key) override; @@ -82,24 +106,21 @@ virtual size_t GetUsage() const override; virtual size_t GetUsage(Handle* handle) const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override; virtual void EraseUnRefEntries() override; virtual std::string GetPrintableOptions() const override; - int GetNumShardBits() const { return num_shard_bits_; } - - private: - static inline uint32_t HashSlice(const Slice& s) { - return static_cast(GetSliceNPHash64(s)); - } + int GetNumShardBits() const; + uint32_t GetNumShards() const; - uint32_t Shard(uint32_t hash) { - // Note, hash >> 32 yields hash in gcc, not the zero we expect! - return (num_shard_bits_ > 0) ? (hash >> (32 - num_shard_bits_)) : 0; - } + protected: + inline uint32_t Shard(uint32_t hash) { return hash & shard_mask_; } - int num_shard_bits_; + private: + const uint32_t shard_mask_; mutable port::Mutex capacity_mutex_; size_t capacity_; bool strict_capacity_limit_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/RocksDBConfig.cmake.in 2025-05-19 16:14:27.000000000 +0000 @@ -1,3 +1,54 @@ @PACKAGE_INIT@ + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") + +include(CMakeFindDependencyMacro) + +set(GFLAGS_USE_TARGET_NAMESPACE @GFLAGS_USE_TARGET_NAMESPACE@) + +if(@WITH_JEMALLOC@) + find_dependency(JeMalloc) +endif() + +if(@WITH_GFLAGS@) + find_dependency(gflags CONFIG) + if(NOT gflags_FOUND) + find_dependency(gflags) + endif() +endif() + +if(@WITH_SNAPPY@) + find_dependency(Snappy 
CONFIG) + if(NOT Snappy_FOUND) + find_dependency(Snappy) + endif() +endif() + +if(@WITH_ZLIB@) + find_dependency(ZLIB) +endif() + +if(@WITH_BZ2@) + find_dependency(BZip2) +endif() + +if(@WITH_LZ4@) + find_dependency(lz4) +endif() + +if(@WITH_ZSTD@) + find_dependency(zstd) +endif() + +if(@WITH_NUMA@) + find_dependency(NUMA) +endif() + +if(@WITH_TBB@) + find_dependency(TBB) +endif() + +find_dependency(Threads) + include("${CMAKE_CURRENT_LIST_DIR}/RocksDBTargets.cmake") check_required_components(RocksDB) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/CxxFlags.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,7 @@ +macro(get_cxx_std_flags FLAGS_VARIABLE) + if( CMAKE_CXX_STANDARD_REQUIRED ) + set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION}) + else() + set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION}) + endif() +endmacro() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/FindSnappy.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,29 @@ +# - Find Snappy +# Find the snappy compression library and includes +# +# Snappy_INCLUDE_DIRS - where to find snappy.h, etc. +# Snappy_LIBRARIES - List of libraries when using snappy. +# Snappy_FOUND - True if snappy found. 
+ +find_path(Snappy_INCLUDE_DIRS + NAMES snappy.h + HINTS ${snappy_ROOT_DIR}/include) + +find_library(Snappy_LIBRARIES + NAMES snappy + HINTS ${snappy_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Snappy DEFAULT_MSG Snappy_LIBRARIES Snappy_INCLUDE_DIRS) + +mark_as_advanced( + Snappy_LIBRARIES + Snappy_INCLUDE_DIRS) + +if(Snappy_FOUND AND NOT (TARGET Snappy::snappy)) + add_library (Snappy::snappy UNKNOWN IMPORTED) + set_target_properties(Snappy::snappy + PROPERTIES + IMPORTED_LOCATION ${Snappy_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${Snappy_INCLUDE_DIRS}) +endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findgflags.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -1,8 +1,8 @@ # - Find gflags library # Find the gflags includes and library # -# gflags_INCLUDE_DIR - where to find gflags.h. -# gflags_LIBRARIES - List of libraries when using gflags. +# GFLAGS_INCLUDE_DIR - where to find gflags.h. +# GFLAGS_LIBRARIES - List of libraries when using gflags. # gflags_FOUND - True if gflags found. find_path(GFLAGS_INCLUDE_DIR diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Findsnappy.cmake 1970-01-01 00:00:00.000000000 +0000 @@ -1,29 +0,0 @@ -# - Find Snappy -# Find the snappy compression library and includes -# -# snappy_INCLUDE_DIRS - where to find snappy.h, etc. -# snappy_LIBRARIES - List of libraries when using snappy. -# snappy_FOUND - True if snappy found. 
- -find_path(snappy_INCLUDE_DIRS - NAMES snappy.h - HINTS ${snappy_ROOT_DIR}/include) - -find_library(snappy_LIBRARIES - NAMES snappy - HINTS ${snappy_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(snappy DEFAULT_MSG snappy_LIBRARIES snappy_INCLUDE_DIRS) - -mark_as_advanced( - snappy_LIBRARIES - snappy_INCLUDE_DIRS) - -if(snappy_FOUND AND NOT (TARGET snappy::snappy)) - add_library (snappy::snappy UNKNOWN IMPORTED) - set_target_properties(snappy::snappy - PROPERTIES - IMPORTED_LOCATION ${snappy_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${snappy_INCLUDE_DIRS}) -endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake --- mariadb-10.11.11/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/cmake/modules/Finduring.cmake 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,26 @@ +# - Find liburing +# +# uring_INCLUDE_DIR - Where to find liburing.h +# uring_LIBRARIES - List of libraries when using uring. +# uring_FOUND - True if uring found. 
+ +find_path(uring_INCLUDE_DIR + NAMES liburing.h) +find_library(uring_LIBRARIES + NAMES liburing.a liburing) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(uring + DEFAULT_MSG uring_LIBRARIES uring_INCLUDE_DIR) + +mark_as_advanced( + uring_INCLUDE_DIR + uring_LIBRARIES) + +if(uring_FOUND AND NOT TARGET uring::uring) + add_library(uring::uring UNKNOWN IMPORTED) + set_target_properties(uring::uring PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${uring_INCLUDE_DIR}" + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION "${uring_LIBRARIES}") +endif() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/coverage_test.sh mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/coverage_test.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/coverage_test.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/coverage_test.sh 2025-05-19 16:14:27.000000000 +0000 @@ -12,21 +12,24 @@ ROOT=".." # Fetch right version of gcov if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then - source $ROOT/build_tools/fbcode_config.sh + source $ROOT/build_tools/fbcode_config_platform007.sh GCOV=$GCC_BASE/bin/gcov else GCOV=$(which gcov) fi +echo -e "Using $GCOV" COVERAGE_DIR="$PWD/COVERAGE_REPORT" mkdir -p $COVERAGE_DIR # Find all gcno files to generate the coverage report +PYTHON=${1:-`which python3`} +echo -e "Using $PYTHON" GCNO_FILES=`find $ROOT -name "*.gcno"` $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | # Parse the raw gcov report to more human readable form. - python $ROOT/coverage/parse_gcov_output.py | + $PYTHON $ROOT/coverage/parse_gcov_output.py | # Write the output to both stdout and report file. 
tee $COVERAGE_DIR/coverage_report_all.txt && echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n" @@ -41,7 +44,7 @@ echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT $GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null | - python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES | + $PYTHON $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES | tee -a $RECENT_REPORT && echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py --- mariadb-10.11.11/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/coverage/parse_gcov_output.py 2025-05-19 16:14:27.000000000 +0000 @@ -1,10 +1,12 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +from __future__ import print_function + +import optparse import re import sys -from optparse import OptionParser - # the gcov report follows certain pattern. Each file will have two lines # of report, from which we can extract the file name, total lines and coverage # percentage. @@ -48,7 +50,7 @@ def get_option_parser(): usage = "Parse the gcov output and generate more human-readable code " +\ "coverage report." 
- parser = OptionParser(usage) + parser = optparse.OptionParser(usage) parser.add_option( "--interested-files", "-i", @@ -73,8 +75,8 @@ header_template = \ "%" + str(max_file_name_length) + "s\t%s\t%s" separator = "-" * (max_file_name_length + 10 + 20) - print header_template % ("Filename", "Coverage", "Lines") # noqa: E999 T25377293 Grandfathered in - print separator + print(header_template % ("Filename", "Coverage", "Lines")) # noqa: E999 T25377293 Grandfathered in + print(separator) # -- Print body # template for printing coverage report for each file. @@ -82,12 +84,12 @@ for fname, coverage_info in per_file_coverage.items(): coverage, lines = coverage_info - print record_template % (fname, coverage, lines) + print(record_template % (fname, coverage, lines)) # -- Print footer if total_coverage: - print separator - print record_template % ("Total", total_coverage[0], total_coverage[1]) + print(separator) + print(record_template % ("Total", total_coverage[0], total_coverage[1])) def report_coverage(): parser = get_option_parser() @@ -111,7 +113,7 @@ total_coverage = None if not len(per_file_coverage): - print >> sys.stderr, "Cannot find coverage info for the given files." 
+ print("Cannot find coverage info for the given files.", file=sys.stderr) return display_file_coverage(per_file_coverage, total_coverage) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,21 +30,20 @@ return db_iter_->GetProperty(prop_name, prop); } -void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob, - bool allow_refresh) { +void ArenaWrappedDBIter::Init( + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, - true, max_sequential_skip_in_iteration, - read_callback, db_impl, cfd, allow_blob); + db_iter_ = + new (mem) DBIter(env, read_options, ioptions, mutable_cf_options, + ioptions.user_comparator, /* iter */ nullptr, version, + sequence, true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, expose_blob_index); sv_number_ = version_number; + read_options_ = read_options; allow_refresh_ = allow_refresh; } @@ -56,48 +55,74 @@ // TODO(yiwu): For 
last_seq_same_as_publish_seq_==false, this is not the // correct behavior. Will be corrected automatically when we take a snapshot // here for the case of WritePreparedTxnDB. - SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); - if (sv_number_ != cur_sv_number) { - Env* env = db_iter_->env(); - db_iter_->~DBIter(); - arena_.~Arena(); - new (&arena_) Arena(); - - SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_); - if (read_callback_) { - read_callback_->Refresh(latest_seq); + TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1"); + TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2"); + while (true) { + if (sv_number_ != cur_sv_number) { + Env* env = db_iter_->env(); + db_iter_->~DBIter(); + arena_.~Arena(); + new (&arena_) Arena(); + + SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_); + SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + if (read_callback_) { + read_callback_->Refresh(latest_seq); + } + Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, + sv->current, latest_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_, + allow_refresh_); + + InternalIterator* internal_iter = db_impl_->NewInternalIterator( + read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), + latest_seq, /* allow_unprepared_value */ true); + SetIterUnderDBIter(internal_iter); + break; + } else { + SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + // Refresh range-tombstones in MemTable + if (!read_options_.ignore_range_deletions) { + SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_); + ReadRangeDelAggregator* range_del_agg = + db_iter_->GetRangeDelAggregator(); + std::unique_ptr range_del_iter; + range_del_iter.reset( + sv->mem->NewRangeTombstoneIterator(read_options_, latest_seq)); + range_del_agg->AddTombstones(std::move(range_del_iter)); + 
cfd_->ReturnThreadLocalSuperVersion(sv); + } + // Refresh latest sequence number + db_iter_->set_sequence(latest_seq); + db_iter_->set_valid(false); + // Check again if the latest super version number is changed + uint64_t latest_sv_number = cfd_->GetSuperVersionNumber(); + if (latest_sv_number != cur_sv_number) { + // If the super version number is changed after refreshing, + // fallback to Re-Init the InternalIterator + cur_sv_number = latest_sv_number; + continue; + } + break; } - Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, - latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, - allow_refresh_); - - InternalIterator* internal_iter = db_impl_->NewInternalIterator( - read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), - latest_seq); - SetIterUnderDBIter(internal_iter); - } else { - db_iter_->set_sequence(latest_seq); - db_iter_->set_valid(false); } return Status::OK(); } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh) { + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, + iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence, 
max_sequential_skip_in_iterations, version_number, read_callback, - db_impl, cfd, allow_blob, allow_refresh); + db_impl, cfd, expose_blob_index, allow_refresh); if (db_impl != nullptr && cfd != nullptr && allow_refresh) { - iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, - allow_blob); + iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index); } return iter; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/arena_wrapped_db_iter.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include "db/db_impl/db_impl.h" #include "db/db_iter.h" -#include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" #include "options/cf_options.h" @@ -23,6 +22,7 @@ namespace ROCKSDB_NAMESPACE { class Arena; +class Version; // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed to be allocated. This class is used as an entry point of @@ -33,7 +33,13 @@ // the same as the inner DBIter. class ArenaWrappedDBIter : public Iterator { public: - virtual ~ArenaWrappedDBIter() { db_iter_->~DBIter(); } + ~ArenaWrappedDBIter() override { + if (db_iter_ != nullptr) { + db_iter_->~DBIter(); + } else { + assert(false); + } + } // Get the arena to be used to allocate memory for DBIter to be wrapped, // as well as child iterators in it. @@ -41,6 +47,7 @@ virtual ReadRangeDelAggregator* GetRangeDelAggregator() { return db_iter_->GetRangeDelAggregator(); } + const ReadOptions& GetReadOptions() { return read_options_; } // Set the internal iterator wrapped inside the DB Iterator. Usually it is // a merging iterator. 
@@ -51,6 +58,8 @@ bool Valid() const override { return db_iter_->Valid(); } void SeekToFirst() override { db_iter_->SeekToFirst(); } void SeekToLast() override { db_iter_->SeekToLast(); } + // 'target' does not contain timestamp, even if user timestamp feature is + // enabled. void Seek(const Slice& target) override { db_iter_->Seek(target); } void SeekForPrev(const Slice& target) override { db_iter_->SeekForPrev(target); @@ -60,6 +69,7 @@ Slice key() const override { return db_iter_->key(); } Slice value() const override { return db_iter_->value(); } Status status() const override { return db_iter_->status(); } + Slice timestamp() const override { return db_iter_->timestamp(); } bool IsBlob() const { return db_iter_->IsBlob(); } Status GetProperty(std::string prop_name, std::string* prop) override; @@ -67,34 +77,32 @@ Status Refresh() override; void Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh); + bool expose_blob_index, bool allow_refresh); // Store some parameters so we can refresh the iterator at a later point // with these same params - void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, - ColumnFamilyData* cfd, ReadCallback* read_callback, - bool allow_blob) { - read_options_ = read_options; + void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd, + ReadCallback* read_callback, bool expose_blob_index) { db_impl_ = db_impl; cfd_ = cfd; read_callback_ = read_callback; - allow_blob_ = allow_blob; + expose_blob_index_ = expose_blob_index; } private: - DBIter* db_iter_; + DBIter* db_iter_ = nullptr; Arena arena_; uint64_t sv_number_; 
ColumnFamilyData* cfd_ = nullptr; DBImpl* db_impl_ = nullptr; ReadOptions read_options_; ReadCallback* read_callback_; - bool allow_blob_ = false; + bool expose_blob_index_ = false; bool allow_refresh_ = true; }; @@ -102,11 +110,10 @@ // `db_impl` and `cfd` are used for reneweal. If left null, renewal will not // be supported. extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false, - bool allow_refresh = true); + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false, bool allow_refresh = true); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_constants.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_constants.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_constants.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_constants.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,16 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint64_t kInvalidBlobFileNumber = 0; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "db/blob/blob_garbage_meter.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "table/internal_iterator.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that passes each key-value encountered to +// BlobGarbageMeter as inflow in order to measure the total number and size of +// blobs in the compaction input on a per-blob file basis. 
+class BlobCountingIterator : public InternalIterator { + public: + BlobCountingIterator(InternalIterator* iter, + BlobGarbageMeter* blob_garbage_meter) + : iter_(iter), blob_garbage_meter_(blob_garbage_meter) { + assert(iter_); + assert(blob_garbage_meter_); + + UpdateAndCountBlobIfNeeded(); + } + + bool Valid() const override { return iter_->Valid() && status_.ok(); } + + void SeekToFirst() override { + iter_->SeekToFirst(); + UpdateAndCountBlobIfNeeded(); + } + + void SeekToLast() override { + iter_->SeekToLast(); + UpdateAndCountBlobIfNeeded(); + } + + void Seek(const Slice& target) override { + iter_->Seek(target); + UpdateAndCountBlobIfNeeded(); + } + + void SeekForPrev(const Slice& target) override { + iter_->SeekForPrev(target); + UpdateAndCountBlobIfNeeded(); + } + + void Next() override { + assert(Valid()); + + iter_->Next(); + UpdateAndCountBlobIfNeeded(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + + const bool res = iter_->NextAndGetResult(result); + UpdateAndCountBlobIfNeeded(); + return res; + } + + void Prev() override { + assert(Valid()); + + iter_->Prev(); + UpdateAndCountBlobIfNeeded(); + } + + Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + Slice user_key() const override { + assert(Valid()); + return iter_->user_key(); + } + + Slice value() const override { + assert(Valid()); + return iter_->value(); + } + + Status status() const override { return status_; } + + bool PrepareValue() override { + assert(Valid()); + return iter_->PrepareValue(); + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + return iter_->UpperBoundCheckResult(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(Valid()); + return 
iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(Valid()); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateAndCountBlobIfNeeded() { + assert(!iter_->Valid() || iter_->status().ok()); + + if (!iter_->Valid()) { + status_ = iter_->status(); + return; + } + + TEST_SYNC_POINT( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow"); + + status_ = blob_garbage_meter_->ProcessInFlow(key(), value()); + } + + InternalIterator* iter_; + BlobGarbageMeter* blob_garbage_meter_; + Status status_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_counting_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,326 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_counting_iterator.h" + +#include +#include + +#include "db/blob/blob_garbage_meter.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter, + uint64_t blob_file_number, uint64_t count, uint64_t bytes) { + const auto& flows = blob_garbage_meter.flows(); + + const auto it = flows.find(blob_file_number); + if (it == flows.end()) { + ASSERT_EQ(count, 0); + ASSERT_EQ(bytes, 0); + return; + } + + const auto& in = it->second.GetInFlow(); + + ASSERT_EQ(in.GetCount(), count); + ASSERT_EQ(in.GetBytes(), bytes); +} + +TEST(BlobCountingIteratorTest, CountBlobs) { + // Note: the input consists of three key-values: two are blob references to + // different blob files, while the third one is a plain value. + constexpr char user_key0[] = "key0"; + constexpr char user_key1[] = "key1"; + constexpr char user_key2[] = "key2"; + + const std::vector keys{ + test::KeyStr(user_key0, 1, kTypeBlobIndex), + test::KeyStr(user_key1, 2, kTypeBlobIndex), + test::KeyStr(user_key2, 3, kTypeValue)}; + + constexpr uint64_t first_blob_file_number = 4; + constexpr uint64_t first_offset = 1000; + constexpr uint64_t first_size = 2000; + + std::string first_blob_index; + BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset, + first_size, kNoCompression); + + constexpr uint64_t second_blob_file_number = 6; + constexpr uint64_t second_offset = 2000; + constexpr uint64_t second_size = 4000; + + std::string second_blob_index; + BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number, + second_offset, second_size, kNoCompression); + + const std::vector values{first_blob_index, second_blob_index, + "raw_value"}; + + assert(keys.size() == values.size()); + + VectorIterator input(keys, values); + BlobGarbageMeter 
blob_garbage_meter; + + BlobCountingIterator blob_counter(&input, &blob_garbage_meter); + + constexpr uint64_t first_expected_bytes = + first_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1); + constexpr uint64_t second_expected_bytes = + second_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1); + + // Call SeekToFirst and iterate forward + blob_counter.SeekToFirst(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0); + + blob_counter.Next(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + blob_counter.Next(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + blob_counter.Next(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 1, + first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + // Do it again using NextAndGetResult + blob_counter.SeekToFirst(); + ASSERT_TRUE(blob_counter.Valid()); + 
ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 1, + second_expected_bytes); + + { + IterateResult result; + ASSERT_TRUE(blob_counter.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + { + IterateResult result; + ASSERT_TRUE(blob_counter.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + { + IterateResult result; + ASSERT_FALSE(blob_counter.NextAndGetResult(&result)); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + } + + // Call SeekToLast and iterate backward + blob_counter.SeekToLast(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), 
values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 2, + 2 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 2, + 2 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 3, + 3 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Prev(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 3, + 3 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + // Call Seek for all keys (plus one that's greater than all of them) + blob_counter.Seek(keys[0]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 3, + 3 * second_expected_bytes); + + blob_counter.Seek(keys[1]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); 
+ ASSERT_EQ(blob_counter.value(), values[1]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.Seek(keys[2]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.Seek("zzz"); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + // Call SeekForPrev for all keys (plus one that's less than all of them) + blob_counter.SeekForPrev("aaa"); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 4, + 4 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[0]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[0]); + ASSERT_EQ(blob_counter.user_key(), user_key0); + ASSERT_EQ(blob_counter.value(), values[0]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 4, + 4 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[1]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[1]); + ASSERT_EQ(blob_counter.user_key(), user_key1); + ASSERT_EQ(blob_counter.value(), values[1]); + 
CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 5, + 5 * second_expected_bytes); + + blob_counter.SeekForPrev(keys[2]); + ASSERT_TRUE(blob_counter.Valid()); + ASSERT_OK(blob_counter.status()); + ASSERT_EQ(blob_counter.key(), keys[2]); + ASSERT_EQ(blob_counter.user_key(), user_key2); + ASSERT_EQ(blob_counter.value(), values[2]); + CheckInFlow(blob_garbage_meter, first_blob_file_number, 5, + 5 * first_expected_bytes); + CheckInFlow(blob_garbage_meter, second_blob_file_number, 5, + 5 * second_expected_bytes); +} + +TEST(BlobCountingIteratorTest, CorruptBlobIndex) { + const std::vector keys{ + test::KeyStr("user_key", 1, kTypeBlobIndex)}; + const std::vector values{"i_am_not_a_blob_index"}; + + assert(keys.size() == values.size()); + + VectorIterator input(keys, values); + BlobGarbageMeter blob_garbage_meter; + + BlobCountingIterator blob_counter(&input, &blob_garbage_meter); + + blob_counter.SeekToFirst(); + ASSERT_FALSE(blob_counter.Valid()); + ASSERT_NOK(blob_counter.status()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_fetcher.h" + +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index_slice, + prefetch_buffer, blob_value, bytes_read); +} + +Status BlobFetcher::FetchBlob(const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, + uint64_t* bytes_read) const { + assert(version_); + + return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer, + blob_value, bytes_read); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_fetcher.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Version; +class Slice; +class FilePrefetchBuffer; +class PinnableSlice; +class BlobIndex; + +// A thin wrapper around the blob retrieval functionality of Version. 
+class BlobFetcher { + public: + BlobFetcher(const Version* version, const ReadOptions& read_options) + : version_(version), read_options_(read_options) {} + + Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* blob_value, uint64_t* bytes_read) const; + + private: + const Version* version_; + ReadOptions read_options_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_addition.h" + +#include +#include + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. 
+enum BlobFileAddition::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileAddition::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, total_blob_count_); + PutVarint64(output, total_blob_bytes_); + PutLengthPrefixedSlice(output, checksum_method_); + PutLengthPrefixedSlice(output, checksum_value_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. + + TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileAddition::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileAddition"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &total_blob_count_)) { + return Status::Corruption(class_name, "Error decoding total blob count"); + } + + if (!GetVarint64(input, &total_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding total blob bytes"); + } + + Slice checksum_method; + if (!GetLengthPrefixedSlice(input, &checksum_method)) { + return Status::Corruption(class_name, "Error decoding checksum method"); + } + checksum_method_ = checksum_method.ToString(); + + Slice checksum_value; + if (!GetLengthPrefixedSlice(input, &checksum_value)) { + return Status::Corruption(class_name, "Error decoding checksum value"); + } + checksum_value_ = checksum_value.ToString(); + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return 
Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileAddition::DebugString() const { + std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileAddition::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() && + lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() && + lhs.GetChecksumMethod() == rhs.GetChecksumMethod() && + lhs.GetChecksumValue() == rhs.GetChecksumValue(); +} + +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition) { + os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber() + << " total_blob_count: " << blob_file_addition.GetTotalBlobCount() + << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes() + << " checksum_method: " << blob_file_addition.GetChecksumMethod() + << " checksum_value: " + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition) { + jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber() + << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount() + << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes() + << 
"ChecksumMethod" << blob_file_addition.GetChecksumMethod() + << "ChecksumValue" + << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileAddition { + public: + BlobFileAddition() = default; + + BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + void EncodeTo(std::string* output) const; + Status 
DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t total_blob_count_ = 0; + uint64_t total_blob_bytes_ = 0; + std::string checksum_method_; + std::string checksum_value_; +}; + +bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs); +bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileAddition& blob_file_addition); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileAddition& blob_file_addition); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_addition_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,210 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_addition.h" + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileAdditionTest : public testing::Test { + public: + static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) { + std::string encoded; + blob_file_addition.EncodeTo(&encoded); + + BlobFileAddition decoded; + Slice input(encoded); + ASSERT_OK(decoded.DecodeFrom(&input)); + + ASSERT_EQ(blob_file_addition, decoded); + } +}; + +TEST_F(BlobFileAdditionTest, Empty) { + BlobFileAddition blob_file_addition; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0); + ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty()); + ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty()); + + TestEncodeDecode(blob_file_addition); +} + +TEST_F(BlobFileAdditionTest, NonEmpty) { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t total_blob_count = 2; + constexpr uint64_t total_blob_bytes = 123456; + const std::string checksum_method("SHA1"); + const std::string checksum_value( + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"); + + BlobFileAddition blob_file_addition(blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, + checksum_value); + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method); + ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value); + + TestEncodeDecode(blob_file_addition); +} + +TEST_F(BlobFileAdditionTest, DecodeErrors) { + std::string str; + Slice slice(str); + + BlobFileAddition 
blob_file_addition; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "blob file number")); + } + + constexpr uint64_t blob_file_number = 123; + PutVarint64(&str, blob_file_number); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "total blob count")); + } + + constexpr uint64_t total_blob_count = 4567; + PutVarint64(&str, total_blob_count); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes")); + } + + constexpr uint64_t total_blob_bytes = 12345678; + PutVarint64(&str, total_blob_bytes); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "checksum method")); + } + + constexpr char checksum_method[] = "SHA1"; + PutLengthPrefixedSlice(&str, checksum_method); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "checksum value")); + } + + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + PutLengthPrefixedSlice(&str, checksum_value); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field tag")); + } + + constexpr uint32_t custom_tag = 2; + PutVarint32(&str, custom_tag); + slice = str; + + { + const Status s = blob_file_addition.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field value")); + } +} + +TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + 
"BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_compatible_tag = 2; + PutVarint32(output, forward_compatible_tag); + + PutLengthPrefixedSlice(output, "deadbeef"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 678; + constexpr uint64_t total_blob_count = 9999; + constexpr uint64_t total_blob_bytes = 100000000; + const std::string checksum_method("CRC32"); + const std::string checksum_value("\x3d\x87\xff\x57"); + + BlobFileAddition blob_file_addition(blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, + checksum_value); + + TestEncodeDecode(blob_file_addition); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1; + PutVarint32(output, forward_incompatible_tag); + + PutLengthPrefixedSlice(output, "foobar"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 456; + constexpr uint64_t total_blob_count = 100; + constexpr uint64_t total_blob_bytes = 2000000; + const std::string checksum_method("CRC32B"); + const std::string checksum_value("\x6d\xbd\xf2\x3a"); + + BlobFileAddition blob_file_addition(blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, + checksum_value); + + std::string encoded; + blob_file_addition.EncodeTo(&encoded); + + BlobFileAddition decoded_blob_file_addition; + Slice input(encoded); + const Status s = decoded_blob_file_addition.DecodeFrom(&input); + + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible")); + + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,375 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_builder.h" + +#include + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_completion_callback.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "db/event_helpers.h" +#include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "logging/logging.h" +#include "options/cf_options.h" +#include "options/options_helper.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileBuilder::BlobFileBuilder( + VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + int job_id, uint32_t column_family_id, + const std::string& column_family_name, Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + 
BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, + immutable_options, mutable_cf_options, file_options, + job_id, column_family_id, column_family_name, io_priority, + write_hint, io_tracer, blob_callback, creation_reason, + blob_file_paths, blob_file_additions) {} + +BlobFileBuilder::BlobFileBuilder( + std::function file_number_generator, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, + int job_id, uint32_t column_family_id, + const std::string& column_family_name, Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions) + : file_number_generator_(std::move(file_number_generator)), + fs_(fs), + immutable_options_(immutable_options), + min_blob_size_(mutable_cf_options->min_blob_size), + blob_file_size_(mutable_cf_options->blob_file_size), + blob_compression_type_(mutable_cf_options->blob_compression_type), + file_options_(file_options), + job_id_(job_id), + column_family_id_(column_family_id), + column_family_name_(column_family_name), + io_priority_(io_priority), + write_hint_(write_hint), + io_tracer_(io_tracer), + blob_callback_(blob_callback), + creation_reason_(creation_reason), + blob_file_paths_(blob_file_paths), + blob_file_additions_(blob_file_additions), + blob_count_(0), + blob_bytes_(0) { + assert(file_number_generator_); + assert(fs_); + assert(immutable_options_); + assert(file_options_); + assert(blob_file_paths_); + assert(blob_file_paths_->empty()); + assert(blob_file_additions_); + assert(blob_file_additions_->empty()); +} + +BlobFileBuilder::~BlobFileBuilder() = default; + +Status BlobFileBuilder::Add(const Slice& key, const 
Slice& value, + std::string* blob_index) { + assert(blob_index); + assert(blob_index->empty()); + + if (value.size() < min_blob_size_) { + return Status::OK(); + } + + { + const Status s = OpenBlobFileIfNeeded(); + if (!s.ok()) { + return s; + } + } + + Slice blob = value; + std::string compressed_blob; + + { + const Status s = CompressBlobIfNeeded(&blob, &compressed_blob); + if (!s.ok()) { + return s; + } + } + + uint64_t blob_file_number = 0; + uint64_t blob_offset = 0; + + { + const Status s = + WriteBlobToFile(key, blob, &blob_file_number, &blob_offset); + if (!s.ok()) { + return s; + } + } + + { + const Status s = CloseBlobFileIfNeeded(); + if (!s.ok()) { + return s; + } + } + + BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(), + blob_compression_type_); + + return Status::OK(); +} + +Status BlobFileBuilder::Finish() { + if (!IsBlobFileOpen()) { + return Status::OK(); + } + + return CloseBlobFile(); +} + +bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; } + +Status BlobFileBuilder::OpenBlobFileIfNeeded() { + if (IsBlobFileOpen()) { + return Status::OK(); + } + + assert(!blob_count_); + assert(!blob_bytes_); + + assert(file_number_generator_); + const uint64_t blob_file_number = file_number_generator_(); + + assert(immutable_options_); + assert(!immutable_options_->cf_paths.empty()); + std::string blob_file_path = + BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number); + + if (blob_callback_) { + blob_callback_->OnBlobFileCreationStarted( + blob_file_path, column_family_name_, job_id_, creation_reason_); + } + + std::unique_ptr file; + + { + assert(file_options_); + Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s); + + if (!s.ok()) { + return s; + } + } + + // Note: files get added to blob_file_paths_ right after the open, so they + // can be cleaned up upon failure. 
Contrast this with blob_file_additions_, + // which only contains successfully written files. + assert(blob_file_paths_); + blob_file_paths_->emplace_back(std::move(blob_file_path)); + + assert(file); + file->SetIOPriority(io_priority_); + file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; + Statistics* const statistics = immutable_options_->stats; + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_paths_->back(), *file_options_, + immutable_options_->clock, io_tracer_, statistics, + immutable_options_->listeners, + immutable_options_->file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kBlobFile), false)); + + constexpr bool do_flush = false; + + std::unique_ptr blob_log_writer(new BlobLogWriter( + std::move(file_writer), immutable_options_->clock, statistics, + blob_file_number, immutable_options_->use_fsync, do_flush)); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl, + expiration_range); + + { + Status s = blob_log_writer->WriteHeader(header); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s); + + if (!s.ok()) { + return s; + } + } + + writer_ = std::move(blob_log_writer); + + assert(IsBlobFileOpen()); + + return Status::OK(); +} + +Status BlobFileBuilder::CompressBlobIfNeeded( + Slice* blob, std::string* compressed_blob) const { + assert(blob); + assert(compressed_blob); + assert(compressed_blob->empty()); + assert(immutable_options_); + + if (blob_compression_type_ == kNoCompression) { + return Status::OK(); + } + + CompressionOptions opts; + CompressionContext context(blob_compression_type_); + constexpr uint64_t sample_for_compression = 0; + + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + blob_compression_type_, sample_for_compression); + + constexpr uint32_t 
compression_format_version = 2; + + bool success = false; + + { + StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats, + BLOB_DB_COMPRESSION_MICROS); + success = + CompressData(*blob, info, compression_format_version, compressed_blob); + } + + if (!success) { + return Status::Corruption("Error compressing blob"); + } + + *blob = Slice(*compressed_blob); + + return Status::OK(); +} + +Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob, + uint64_t* blob_file_number, + uint64_t* blob_offset) { + assert(IsBlobFileOpen()); + assert(blob_file_number); + assert(blob_offset); + + uint64_t key_offset = 0; + + Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s); + + if (!s.ok()) { + return s; + } + + *blob_file_number = writer_->get_log_number(); + + ++blob_count_; + blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size(); + + return Status::OK(); +} + +Status BlobFileBuilder::CloseBlobFile() { + assert(IsBlobFileOpen()); + + BlobLogFooter footer; + footer.blob_count = blob_count_; + + std::string checksum_method; + std::string checksum_value; + + Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s); + + if (!s.ok()) { + return s; + } + + const uint64_t blob_file_number = writer_->get_log_number(); + + if (blob_callback_) { + s = blob_callback_->OnBlobFileCompleted( + blob_file_paths_->back(), column_family_name_, job_id_, + blob_file_number, creation_reason_, s, checksum_value, checksum_method, + blob_count_, blob_bytes_); + } + + assert(blob_file_additions_); + blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_, + std::move(checksum_method), + std::move(checksum_value)); + + assert(immutable_options_); + ROCKS_LOG_INFO(immutable_options_->logger, + "[%s] [JOB %d] Generated blob file #%" 
PRIu64 ": %" PRIu64 + " total blobs, %" PRIu64 " total bytes", + column_family_name_.c_str(), job_id_, blob_file_number, + blob_count_, blob_bytes_); + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; + + return s; +} + +Status BlobFileBuilder::CloseBlobFileIfNeeded() { + assert(IsBlobFileOpen()); + + const WritableFileWriter* const file_writer = writer_->file(); + assert(file_writer); + + if (file_writer->GetFileSize() < blob_file_size_) { + return Status::OK(); + } + + return CloseBlobFile(); +} + +void BlobFileBuilder::Abandon(const Status& s) { + if (!IsBlobFileOpen()) { + return; + } + if (blob_callback_) { + // BlobFileBuilder::Abandon() is called because of error while writing to + // Blob files. So we can ignore the below error. + blob_callback_ + ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_, + job_id_, writer_->get_log_number(), + creation_reason_, s, "", "", blob_count_, + blob_bytes_) + .PermitUncheckedError(); + } + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/compression_type.h" +#include "rocksdb/env.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class VersionSet; +class FileSystem; +class SystemClock; +struct ImmutableOptions; +struct MutableCFOptions; +struct FileOptions; +class BlobFileAddition; +class Status; +class Slice; +class BlobLogWriter; +class IOTracer; +class BlobFileCompletionCallback; + +class BlobFileBuilder { + public: + BlobFileBuilder(VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, + const FileOptions* file_options, int job_id, + uint32_t column_family_id, + const std::string& column_family_name, + Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions); + + BlobFileBuilder(std::function file_number_generator, + FileSystem* fs, const ImmutableOptions* immutable_options, + const MutableCFOptions* mutable_cf_options, + const FileOptions* file_options, int job_id, + uint32_t column_family_id, + const std::string& column_family_name, + Env::IOPriority io_priority, + Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, + BlobFileCreationReason creation_reason, + std::vector* blob_file_paths, + std::vector* blob_file_additions); + + BlobFileBuilder(const BlobFileBuilder&) = delete; + BlobFileBuilder& operator=(const BlobFileBuilder&) = delete; + + ~BlobFileBuilder(); + + Status Add(const Slice& key, const Slice& value, std::string* blob_index); + Status Finish(); + void Abandon(const Status& s); + + private: + bool IsBlobFileOpen() const; + Status OpenBlobFileIfNeeded(); + Status CompressBlobIfNeeded(Slice* blob, 
std::string* compressed_blob) const; + Status WriteBlobToFile(const Slice& key, const Slice& blob, + uint64_t* blob_file_number, uint64_t* blob_offset); + Status CloseBlobFile(); + Status CloseBlobFileIfNeeded(); + + std::function file_number_generator_; + FileSystem* fs_; + const ImmutableOptions* immutable_options_; + uint64_t min_blob_size_; + uint64_t blob_file_size_; + CompressionType blob_compression_type_; + const FileOptions* file_options_; + int job_id_; + uint32_t column_family_id_; + std::string column_family_name_; + Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; + std::shared_ptr io_tracer_; + BlobFileCompletionCallback* blob_callback_; + BlobFileCreationReason creation_reason_; + std::vector* blob_file_paths_; + std::vector* blob_file_additions_; + std::unique_ptr writer_; + uint64_t blob_count_; + uint64_t blob_bytes_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,672 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_builder.h" + +#include +#include +#include +#include +#include + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_sequential_reader.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/options.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/compression.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class TestFileNumberGenerator { + public: + uint64_t operator()() { return ++next_file_number_; } + + private: + uint64_t next_file_number_ = 1; +}; + +class BlobFileBuilderTest : public testing::Test { + protected: + BlobFileBuilderTest() { + mock_env_.reset(MockEnv::Create(Env::Default())); + fs_ = mock_env_->GetFileSystem().get(); + clock_ = mock_env_->GetSystemClock().get(); + } + + void VerifyBlobFile(uint64_t blob_file_number, + const std::string& blob_file_path, + uint32_t column_family_id, + CompressionType blob_compression_type, + const std::vector>& + expected_key_value_pairs, + const std::vector& blob_indexes) { + assert(expected_key_value_pairs.size() == blob_indexes.size()); + + std::unique_ptr file; + constexpr IODebugContext* dbg = nullptr; + ASSERT_OK( + fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg)); + + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file), blob_file_path, clock_)); + + constexpr Statistics* statistics = nullptr; + BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_, + statistics); + + BlobLogHeader header; + ASSERT_OK(blob_log_reader.ReadHeader(&header)); + ASSERT_EQ(header.version, kVersion1); + ASSERT_EQ(header.column_family_id, column_family_id); + ASSERT_EQ(header.compression, blob_compression_type); + 
ASSERT_FALSE(header.has_ttl); + ASSERT_EQ(header.expiration_range, ExpirationRange()); + + for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) { + BlobLogRecord record; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_reader.ReadRecord( + &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset)); + + // Check the contents of the blob file + const auto& expected_key_value = expected_key_value_pairs[i]; + const auto& key = expected_key_value.first; + const auto& value = expected_key_value.second; + + ASSERT_EQ(record.key_size, key.size()); + ASSERT_EQ(record.value_size, value.size()); + ASSERT_EQ(record.expiration, 0); + ASSERT_EQ(record.key, key); + ASSERT_EQ(record.value, value); + + // Make sure the blob reference returned by the builder points to the + // right place + BlobIndex blob_index; + ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i])); + ASSERT_FALSE(blob_index.IsInlined()); + ASSERT_FALSE(blob_index.HasTTL()); + ASSERT_EQ(blob_index.file_number(), blob_file_number); + ASSERT_EQ(blob_index.offset(), blob_offset); + ASSERT_EQ(blob_index.size(), value.size()); + } + + BlobLogFooter footer; + ASSERT_OK(blob_log_reader.ReadFooter(&footer)); + ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size()); + ASSERT_EQ(footer.expiration_range, ExpirationRange()); + } + + std::unique_ptr mock_env_; + FileSystem* fs_; + SystemClock* clock_; + FileOptions file_options_; +}; + +TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { + // Build a single blob file + constexpr size_t number_of_blobs = 10; + constexpr size_t key_size = 1; + constexpr size_t value_size = 4; + constexpr size_t value_offset = 1234; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderTest_BuildAndCheckOneFile"), + 0); + options.enable_blob_files = true; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + 
constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + std::vector> expected_key_value_pairs( + number_of_blobs); + std::vector blob_indexes(number_of_blobs); + + for (size_t i = 0; i < number_of_blobs; ++i) { + auto& expected_key_value = expected_key_value_pairs[i]; + + auto& key = expected_key_value.first; + key = std::to_string(i); + assert(key.size() == key_size); + + auto& value = expected_key_value.second; + value = std::to_string(i + value_offset); + assert(value.size() == value_size); + + auto& blob_index = blob_indexes[i]; + + ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + } + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + + const std::string& blob_file_path = blob_file_paths[0]; + + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_EQ(blob_file_additions.size(), 1); + + const auto& blob_file_addition = blob_file_additions[0]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs); + ASSERT_EQ( + blob_file_addition.GetTotalBlobBytes(), + number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size)); + + // Verify the contents of the new blob file as well as the blob references + 
VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kNoCompression, expected_key_value_pairs, blob_indexes); +} + +TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { + // Build multiple blob files: file size limit is set to the size of a single + // value, so each blob ends up in a file of its own + constexpr size_t number_of_blobs = 10; + constexpr size_t key_size = 1; + constexpr size_t value_size = 10; + constexpr size_t value_offset = 1234567890; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderTest_BuildAndCheckMultipleFiles"), + 0); + options.enable_blob_files = true; + options.blob_file_size = value_size; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + std::vector> expected_key_value_pairs( + number_of_blobs); + std::vector blob_indexes(number_of_blobs); + + for (size_t i = 0; i < number_of_blobs; ++i) { + auto& expected_key_value = expected_key_value_pairs[i]; + + auto& key = expected_key_value.first; + key = std::to_string(i); + assert(key.size() == key_size); + + auto& value = expected_key_value.second; + value = std::to_string(i + value_offset); + assert(value.size() == value_size); + + auto& blob_index = blob_indexes[i]; + + 
ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + } + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + ASSERT_EQ(blob_file_paths.size(), number_of_blobs); + ASSERT_EQ(blob_file_additions.size(), number_of_blobs); + + for (size_t i = 0; i < number_of_blobs; ++i) { + const uint64_t blob_file_number = i + 2; + + ASSERT_EQ(blob_file_paths[i], + BlobFileName(immutable_options.cf_paths.front().path, + blob_file_number)); + + const auto& blob_file_addition = blob_file_additions[i]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), + BlobLogRecord::kHeaderSize + key_size + value_size); + } + + // Verify the contents of the new blob files as well as the blob references + for (size_t i = 0; i < number_of_blobs; ++i) { + std::vector> expected_key_value_pair{ + expected_key_value_pairs[i]}; + std::vector blob_index{blob_indexes[i]}; + + VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression, + expected_key_value_pair, blob_index); + } +} + +TEST_F(BlobFileBuilderTest, InlinedValues) { + // All values are below the min_blob_size threshold; no blob files get written + constexpr size_t number_of_blobs = 10; + constexpr size_t key_size = 1; + constexpr size_t value_size = 10; + constexpr size_t value_offset = 1234567890; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderTest_InlinedValues"), + 0); + options.enable_blob_files = true; + options.min_blob_size = 1024; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint 
= Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + for (size_t i = 0; i < number_of_blobs; ++i) { + const std::string key = std::to_string(i); + assert(key.size() == key_size); + + const std::string value = std::to_string(i + value_offset); + assert(value.size() == value_size); + + std::string blob_index; + ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_TRUE(blob_index.empty()); + } + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + ASSERT_TRUE(blob_file_paths.empty()); + ASSERT_TRUE(blob_file_additions.empty()); +} + +TEST_F(BlobFileBuilderTest, Compression) { + // Build a blob file with a compressed blob + if (!Snappy_Supported()) { + return; + } + + constexpr size_t key_size = 1; + constexpr size_t value_size = 100; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"), + 0); + options.enable_blob_files = true; + options.blob_compression_type = kSnappyCompression; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, 
nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + const std::string key("1"); + const std::string uncompressed_value(value_size, 'x'); + + std::string blob_index; + + ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + + const std::string& blob_file_path = blob_file_paths[0]; + + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_EQ(blob_file_additions.size(), 1); + + const auto& blob_file_addition = blob_file_additions[0]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); + + CompressionOptions opts; + CompressionContext context(kSnappyCompression); + constexpr uint64_t sample_for_compression = 0; + + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + kSnappyCompression, sample_for_compression); + + std::string compressed_value; + ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(), + uncompressed_value.size(), &compressed_value)); + + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), + BlobLogRecord::kHeaderSize + key_size + compressed_value.size()); + + // Verify the contents of the new blob file as well as the blob reference + std::vector> expected_key_value_pairs{ + {key, compressed_value}}; + std::vector blob_indexes{blob_index}; + + VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kSnappyCompression, expected_key_value_pairs, blob_indexes); +} + +TEST_F(BlobFileBuilderTest, CompressionError) { + // Simulate an error during compression + if (!Snappy_Supported()) { + return; + } + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + 
"BlobFileBuilderTest_CompressionError"), + 0); + options.enable_blob_files = true; + options.blob_compression_type = kSnappyCompression; + options.env = mock_env_.get(); + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue", + [](void* arg) { + bool* ret = static_cast(arg); + *ret = false; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr char key[] = "1"; + constexpr char value[] = "deadbeef"; + + std::string blob_index; + + ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + ASSERT_EQ( + blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_TRUE(blob_file_additions.empty()); +} + +TEST_F(BlobFileBuilderTest, Checksum) { + // Build a blob file with checksum + + class DummyFileChecksumGenerator : public FileChecksumGenerator { + public: + void Update(const char* /* data */, size_t /* n */) override {} + + void Finalize() override {} + + std::string GetChecksum() const override { return std::string("dummy"); } + + const char* Name() const 
override { return "DummyFileChecksum"; } + }; + + class DummyFileChecksumGenFactory : public FileChecksumGenFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& /* context */) override { + return std::unique_ptr( + new DummyFileChecksumGenerator); + } + + const char* Name() const override { return "DummyFileChecksumGenFactory"; } + }; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"), + 0); + options.enable_blob_files = true; + options.file_checksum_gen_factory = + std::make_shared(); + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + const std::string key("1"); + const std::string value("deadbeef"); + + std::string blob_index; + + ASSERT_OK(builder.Add(key, value, &blob_index)); + ASSERT_FALSE(blob_index.empty()); + + ASSERT_OK(builder.Finish()); + + // Check the metadata generated + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + + const std::string& blob_file_path = blob_file_paths[0]; + + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); + + ASSERT_EQ(blob_file_additions.size(), 1); + + const auto& blob_file_addition = 
blob_file_additions[0]; + + ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); + ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), + BlobLogRecord::kHeaderSize + key.size() + value.size()); + ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum"); + ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy"); + + // Verify the contents of the new blob file as well as the blob reference + std::vector> expected_key_value_pairs{ + {key, value}}; + std::vector blob_indexes{blob_index}; + + VerifyBlobFile(blob_file_number, blob_file_path, column_family_id, + kNoCompression, expected_key_value_pairs, blob_indexes); +} + +class BlobFileBuilderIOErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + fs_ = mock_env_->GetFileSystem().get(); + } + + std::unique_ptr mock_env_; + FileSystem* fs_; + FileOptions file_options_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P( + BlobFileBuilderTest, BlobFileBuilderIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(BlobFileBuilderIOErrorTest, IOError) { + // Simulate an I/O error during the specified step of Add() + // Note: blob_file_size will be set to value_size in order for the first blob + // to trigger close + constexpr size_t value_size = 8; + + Options options; + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileBuilderIOErrorTest_IOError"), + 0); + options.enable_blob_files = true; + options.blob_file_size = value_size; + options.env = mock_env_.get(); + + ImmutableOptions immutable_options(options); + MutableCFOptions 
mutable_cf_options(options); + + constexpr int job_id = 1; + constexpr uint32_t column_family_id = 123; + constexpr char column_family_name[] = "foobar"; + constexpr Env::IOPriority io_priority = Env::IO_HIGH; + constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM; + + std::vector blob_file_paths; + std::vector blob_file_additions; + + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr char key[] = "1"; + constexpr char value[] = "deadbeef"; + + std::string blob_index; + + ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") { + ASSERT_TRUE(blob_file_paths.empty()); + } else { + constexpr uint64_t blob_file_number = 2; + + ASSERT_EQ(blob_file_paths.size(), 1); + ASSERT_EQ(blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, + blob_file_number)); + } + + ASSERT_TRUE(blob_file_additions.empty()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileCache::BlobFileCache(Cache* cache, + const ImmutableOptions* immutable_options, + const FileOptions* file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer) + : cache_(cache), + mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + immutable_options_(immutable_options), + file_options_(file_options), + column_family_id_(column_family_id), + blob_file_read_hist_(blob_file_read_hist), + io_tracer_(io_tracer) { + assert(cache_); + assert(immutable_options_); + assert(file_options_); +} + +Status BlobFileCache::GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader) { + assert(blob_file_reader); + assert(blob_file_reader->IsEmpty()); + + const Slice key = GetSlice(&blob_file_number); + + assert(cache_); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); + + // Check again while holding mutex + MutexLock lock(mutex_.get(key)); + + handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + assert(immutable_options_); + Statistics* 
const statistics = immutable_options_->stats; + + RecordTick(statistics, NO_FILE_OPENS); + + std::unique_ptr reader; + + { + assert(file_options_); + const Status s = BlobFileReader::Create( + *immutable_options_, *file_options_, column_family_id_, + blob_file_read_hist_, blob_file_number, io_tracer_, &reader); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + { + constexpr size_t charge = 1; + + const Status s = cache_->Insert(key, reader.get(), charge, + &DeleteCacheEntry, &handle); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + reader.release(); + + *blob_file_reader = CacheHandleGuard(cache_, handle); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include + +#include "cache/cache_helpers.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +class Status; +class BlobFileReader; +class Slice; +class IOTracer; + +class BlobFileCache { + public: + BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options, + const FileOptions* file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer); + + BlobFileCache(const BlobFileCache&) = delete; + BlobFileCache& operator=(const BlobFileCache&) = delete; + + Status GetBlobFileReader(uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader); + + private: + Cache* cache_; + // Note: mutex_ below is used to guard against multiple threads racing to open + // the same file. + Striped mutex_; + const ImmutableOptions* immutable_options_; + const FileOptions* file_options_; + uint32_t column_family_id_; + HistogramImpl* blob_file_read_hist_; + std::shared_ptr io_tracer_; + + static constexpr size_t kNumberOfMutexStripes = 1 << 7; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,268 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with a single blob in it. +void WriteBlobFile(uint32_t column_family_id, + const ImmutableOptions& immutable_options, + uint64_t blob_file_number) { + assert(!immutable_options.cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + std::string compressed_blob; + + uint64_t key_offset = 0; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + + BlobLogFooter footer; + footer.blob_count = 1; + footer.expiration_range = expiration_range; + + 
std::string checksum_method; + std::string checksum_value; + + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +class BlobFileCacheTest : public testing::Test { + protected: + BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + + std::unique_ptr mock_env_; +}; + +TEST_F(BlobFileCacheTest, GetBlobFileReader) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // First try: reader should be opened and put in cache + CacheHandleGuard first; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + // Second try: reader should be served from cache + CacheHandleGuard second; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { + Options 
options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_Race"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + CacheHandleGuard first; + CacheHandleGuard second; + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { + // Disabling sync points to prevent infinite recursion + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + 
test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_IOError"), + 0); + options.enable_blob_files = true; + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + ImmutableOptions immutable_options(options); + FileOptions file_options; + constexpr uint32_t column_family_id = 1; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Note: there is no blob file with the below number + constexpr uint64_t blob_file_number = 123; + + CacheHandleGuard reader; + + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { + Options options; + options.env = mock_env_.get(); + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileCacheTest_GetBlobFileReader_CacheFull"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 0; + constexpr int num_shard_bits = -1; // determined automatically + constexpr bool strict_capacity_limit = true; + std::shared_ptr backing_cache = + NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // 
Insert into cache should fail since it has zero capacity and + // strict_capacity_limit is set + CacheHandleGuard reader; + + ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) + .IsIncomplete()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_completion_callback.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,101 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include "db/error_handler.h" +#include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileCompletionCallback { + public: + BlobFileCompletionCallback( + SstFileManager* sst_file_manager, InstrumentedMutex* mutex, + ErrorHandler* error_handler, EventLogger* event_logger, + const std::vector>& listeners, + const std::string& dbname) + : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) { +#ifndef ROCKSDB_LITE + sst_file_manager_ = sst_file_manager; + mutex_ = mutex; + error_handler_ = error_handler; +#else + (void)sst_file_manager; + (void)mutex; + (void)error_handler; +#endif // ROCKSDB_LITE + } + + void OnBlobFileCreationStarted(const std::string& file_name, + const std::string& column_family_name, + int job_id, + BlobFileCreationReason creation_reason) { +#ifndef ROCKSDB_LITE + // Notify the listeners. + EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_, + column_family_name, file_name, + job_id, creation_reason); +#else + (void)file_name; + (void)column_family_name; + (void)job_id; + (void)creation_reason; +#endif + } + + Status OnBlobFileCompleted(const std::string& file_name, + const std::string& column_family_name, int job_id, + uint64_t file_number, + BlobFileCreationReason creation_reason, + const Status& report_status, + const std::string& checksum_value, + const std::string& checksum_method, + uint64_t blob_count, uint64_t blob_bytes) { + Status s; + +#ifndef ROCKSDB_LITE + auto sfm = static_cast(sst_file_manager_); + if (sfm) { + // Report new blob files to SstFileManagerImpl + s = sfm->OnAddFile(file_name); + if (sfm->IsMaxAllowedSpaceReached()) { + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached"); + InstrumentedMutexLock l(mutex_); + error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); + } + } +#endif // 
!ROCKSDB_LITE + + // Notify the listeners. + EventHelpers::LogAndNotifyBlobFileCreationFinished( + event_logger_, listeners_, dbname_, column_family_name, file_name, + job_id, file_number, creation_reason, + (!report_status.ok() ? report_status : s), + (checksum_value.empty() ? kUnknownFileChecksum : checksum_value), + (checksum_method.empty() ? kUnknownFileChecksumFuncName + : checksum_method), + blob_count, blob_bytes); + return s; + } + + private: +#ifndef ROCKSDB_LITE + SstFileManager* sst_file_manager_; + InstrumentedMutex* mutex_; + ErrorHandler* error_handler_; +#endif // ROCKSDB_LITE + EventLogger* event_logger_; + std::vector> listeners_; + std::string dbname_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_garbage.h" + +#include +#include + +#include "logging/event_logger.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +// Tags for custom fields. Note that these get persisted in the manifest, +// so existing tags should not be modified. 
+enum BlobFileGarbage::CustomFieldTags : uint32_t { + kEndMarker, + + // Add forward compatible fields here + + ///////////////////////////////////////////////////////////////////// + + kForwardIncompatibleMask = 1 << 6, + + // Add forward incompatible fields here +}; + +void BlobFileGarbage::EncodeTo(std::string* output) const { + PutVarint64(output, blob_file_number_); + PutVarint64(output, garbage_blob_count_); + PutVarint64(output, garbage_blob_bytes_); + + // Encode any custom fields here. The format to use is a Varint32 tag (see + // CustomFieldTags above) followed by a length prefixed slice. Unknown custom + // fields will be ignored during decoding unless they're in the forward + // incompatible range. + + TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output); + + PutVarint32(output, kEndMarker); +} + +Status BlobFileGarbage::DecodeFrom(Slice* input) { + constexpr char class_name[] = "BlobFileGarbage"; + + if (!GetVarint64(input, &blob_file_number_)) { + return Status::Corruption(class_name, "Error decoding blob file number"); + } + + if (!GetVarint64(input, &garbage_blob_count_)) { + return Status::Corruption(class_name, "Error decoding garbage blob count"); + } + + if (!GetVarint64(input, &garbage_blob_bytes_)) { + return Status::Corruption(class_name, "Error decoding garbage blob bytes"); + } + + while (true) { + uint32_t custom_field_tag = 0; + if (!GetVarint32(input, &custom_field_tag)) { + return Status::Corruption(class_name, "Error decoding custom field tag"); + } + + if (custom_field_tag == kEndMarker) { + break; + } + + if (custom_field_tag & kForwardIncompatibleMask) { + return Status::Corruption( + class_name, "Forward incompatible custom field encountered"); + } + + Slice custom_field_value; + if (!GetLengthPrefixedSlice(input, &custom_field_value)) { + return Status::Corruption(class_name, + "Error decoding custom field value"); + } + } + + return Status::OK(); +} + +std::string BlobFileGarbage::DebugString() const { + 
std::ostringstream oss; + + oss << *this; + + return oss.str(); +} + +std::string BlobFileGarbage::DebugJSON() const { + JSONWriter jw; + + jw << *this; + + jw.EndObject(); + + return jw.Get(); +} + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() && + lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() && + lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes(); +} + +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) { + return !(lhs == rhs); +} + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage) { + os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber() + << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount() + << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes(); + + return os; +} + +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage) { + jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber() + << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount() + << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes(); + + return jw; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +class BlobFileGarbage { + public: + BlobFileGarbage() = default; + + BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : blob_file_number_(blob_file_number), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) {} + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + void EncodeTo(std::string* output) const; + Status DecodeFrom(Slice* input); + + std::string DebugString() const; + std::string DebugJSON() const; + + private: + enum CustomFieldTags : uint32_t; + + uint64_t blob_file_number_ = kInvalidBlobFileNumber; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; +}; + +bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); +bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs); + +std::ostream& operator<<(std::ostream& os, + const BlobFileGarbage& blob_file_garbage); +JSONWriter& operator<<(JSONWriter& jw, + const BlobFileGarbage& blob_file_garbage); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_garbage_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,173 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_garbage.h" + +#include +#include +#include + +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileGarbageTest : public testing::Test { + public: + static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) { + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded; + Slice input(encoded); + ASSERT_OK(decoded.DecodeFrom(&input)); + + ASSERT_EQ(blob_file_garbage, decoded); + } +}; + +TEST_F(BlobFileGarbageTest, Empty) { + BlobFileGarbage blob_file_garbage; + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, NonEmpty) { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t garbage_blob_count = 1; + constexpr uint64_t garbage_blob_bytes = 9876; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); +} + +TEST_F(BlobFileGarbageTest, DecodeErrors) { + std::string str; + Slice slice(str); + + BlobFileGarbage blob_file_garbage; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "blob file number")); + } + + constexpr uint64_t blob_file_number = 123; + PutVarint64(&str, blob_file_number); + slice = str; + + { + 
const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count")); + } + + constexpr uint64_t garbage_blob_count = 4567; + PutVarint64(&str, garbage_blob_count); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes")); + } + + constexpr uint64_t garbage_blob_bytes = 12345678; + PutVarint64(&str, garbage_blob_bytes); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field tag")); + } + + constexpr uint32_t custom_tag = 2; + PutVarint32(&str, custom_tag); + slice = str; + + { + const Status s = blob_file_garbage.DecodeFrom(&slice); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "custom field value")); + } +} + +TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t forward_compatible_tag = 2; + PutVarint32(output, forward_compatible_tag); + + PutLengthPrefixedSlice(output, "deadbeef"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 678; + constexpr uint64_t garbage_blob_count = 9999; + constexpr uint64_t garbage_blob_bytes = 100000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + TestEncodeDecode(blob_file_garbage); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) { + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) { + std::string* output = static_cast(arg); + + constexpr uint32_t 
forward_incompatible_tag = (1 << 6) + 1; + PutVarint32(output, forward_incompatible_tag); + + PutLengthPrefixedSlice(output, "foobar"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t blob_file_number = 456; + constexpr uint64_t garbage_blob_count = 100; + constexpr uint64_t garbage_blob_bytes = 2000000; + + BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + std::string encoded; + blob_file_garbage.EncodeTo(&encoded); + + BlobFileGarbage decoded_blob_file_addition; + Slice input(encoded); + const Status s = decoded_blob_file_addition.DecodeFrom(&input); + + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_meta.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { +uint64_t SharedBlobFileMetaData::GetBlobFileSize() const { + return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize; +} + +std::string SharedBlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta) { + os << "blob_file_number: " << shared_meta.GetBlobFileNumber() + << " total_blob_count: " << shared_meta.GetTotalBlobCount() + << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes() + << " checksum_method: " << shared_meta.GetChecksumMethod() + << " checksum_value: " + << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true); + + return os; +} + +std::string BlobFileMetaData::DebugString() const { + std::ostringstream oss; + oss << (*this); + + return oss.str(); +} + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) { + const auto& shared_meta = meta.GetSharedMeta(); + assert(shared_meta); + os << (*shared_meta); + + os << " linked_ssts: {"; + for (uint64_t file_number : meta.GetLinkedSsts()) { + os << ' ' << file_number; + } + os << " }"; + + os << " garbage_blob_count: " << meta.GetGarbageBlobCount() + << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes(); + + return os; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_meta.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// SharedBlobFileMetaData represents the immutable part of blob files' metadata, +// like the blob file number, total number and size of blobs, or checksum +// method and value. There is supposed to be one object of this class per blob +// file (shared across all versions that include the blob file in question); +// hence, the type is neither copyable nor movable. A blob file can be marked +// obsolete when the corresponding SharedBlobFileMetaData object is destroyed. + +class SharedBlobFileMetaData { + public: + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) { + return std::shared_ptr(new SharedBlobFileMetaData( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value))); + } + + template + static std::shared_ptr Create( + uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value, Deleter deleter) { + return std::shared_ptr( + new SharedBlobFileMetaData(blob_file_number, total_blob_count, + total_blob_bytes, std::move(checksum_method), + std::move(checksum_value)), + deleter); + } + + SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete; + SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete; + + SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete; + SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete; + + uint64_t GetBlobFileSize() const; + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + uint64_t 
GetTotalBlobCount() const { return total_blob_count_; } + uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; } + const std::string& GetChecksumMethod() const { return checksum_method_; } + const std::string& GetChecksumValue() const { return checksum_value_; } + + std::string DebugString() const; + + private: + SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) + : blob_file_number_(blob_file_number), + total_blob_count_(total_blob_count), + total_blob_bytes_(total_blob_bytes), + checksum_method_(std::move(checksum_method)), + checksum_value_(std::move(checksum_value)) { + assert(checksum_method_.empty() == checksum_value_.empty()); + } + + uint64_t blob_file_number_; + uint64_t total_blob_count_; + uint64_t total_blob_bytes_; + std::string checksum_method_; + std::string checksum_value_; +}; + +std::ostream& operator<<(std::ostream& os, + const SharedBlobFileMetaData& shared_meta); + +// BlobFileMetaData contains the part of the metadata for blob files that can +// vary across versions, like the amount of garbage in the blob file. In +// addition, BlobFileMetaData objects point to and share the ownership of the +// SharedBlobFileMetaData object for the corresponding blob file. Similarly to +// SharedBlobFileMetaData, BlobFileMetaData are not copyable or movable. They +// are meant to be jointly owned by the versions in which the blob file has the +// same (immutable *and* mutable) state. 
+ +class BlobFileMetaData { + public: + using LinkedSsts = std::unordered_set; + + static std::shared_ptr Create( + std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) { + return std::shared_ptr( + new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes)); + } + + BlobFileMetaData(const BlobFileMetaData&) = delete; + BlobFileMetaData& operator=(const BlobFileMetaData&) = delete; + + BlobFileMetaData(BlobFileMetaData&&) = delete; + BlobFileMetaData& operator=(BlobFileMetaData&&) = delete; + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileSize() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileSize(); + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + uint64_t GetTotalBlobCount() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobCount(); + } + uint64_t GetTotalBlobBytes() const { + assert(shared_meta_); + return shared_meta_->GetTotalBlobBytes(); + } + const std::string& GetChecksumMethod() const { + assert(shared_meta_); + return shared_meta_->GetChecksumMethod(); + } + const std::string& GetChecksumValue() const { + assert(shared_meta_); + return shared_meta_->GetChecksumValue(); + } + + const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + std::string DebugString() const; + + private: + BlobFileMetaData(std::shared_ptr shared_meta, + LinkedSsts linked_ssts, uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) + : shared_meta_(std::move(shared_meta)), + linked_ssts_(std::move(linked_ssts)), + garbage_blob_count_(garbage_blob_count), + garbage_blob_bytes_(garbage_blob_bytes) { + assert(shared_meta_); + assert(garbage_blob_count_ <= 
shared_meta_->GetTotalBlobCount()); + assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes()); + } + + std::shared_ptr shared_meta_; + LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_; + uint64_t garbage_blob_bytes_; +}; + +std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,582 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_reader.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/file_prefetch_buffer.h" +#include "file/filename.h" +#include "monitoring/statistics.h" +#include "options/cf_options.h" +#include "rocksdb/file_system.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "test_util/sync_point.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFileReader::Create( + const ImmutableOptions& immutable_options, const FileOptions& file_options, + uint32_t column_family_id, HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, const std::shared_ptr& io_tracer, + std::unique_ptr* blob_file_reader) { + assert(blob_file_reader); + assert(!*blob_file_reader); + + uint64_t file_size = 0; + std::unique_ptr file_reader; + + { + const Status s = + OpenFile(immutable_options, file_options, blob_file_read_hist, + blob_file_number, io_tracer, 
&file_size, &file_reader); + if (!s.ok()) { + return s; + } + } + + assert(file_reader); + + Statistics* const statistics = immutable_options.stats; + + CompressionType compression_type = kNoCompression; + + { + const Status s = ReadHeader(file_reader.get(), column_family_id, statistics, + &compression_type); + if (!s.ok()) { + return s; + } + } + + { + const Status s = ReadFooter(file_reader.get(), file_size, statistics); + if (!s.ok()) { + return s; + } + } + + blob_file_reader->reset( + new BlobFileReader(std::move(file_reader), file_size, compression_type, + immutable_options.clock, statistics)); + + return Status::OK(); +} + +Status BlobFileReader::OpenFile( + const ImmutableOptions& immutable_options, const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, uint64_t* file_size, + std::unique_ptr* file_reader) { + assert(file_size); + assert(file_reader); + + const auto& cf_paths = immutable_options.cf_paths; + assert(!cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(cf_paths.front().path, blob_file_number); + + FileSystem* const fs = immutable_options.fs.get(); + assert(fs); + + constexpr IODebugContext* dbg = nullptr; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize"); + + const Status s = + fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg); + if (!s.ok()) { + return s; + } + } + + if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) { + return Status::Corruption("Malformed blob file"); + } + + std::unique_ptr file; + + { + TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile"); + + const Status s = + fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg); + if (!s.ok()) { + return s; + } + } + + assert(file); + + if (immutable_options.advise_random_on_open) { + file->Hint(FSRandomAccessFile::kRandom); + } + + file_reader->reset(new RandomAccessFileReader( + std::move(file), blob_file_path, 
immutable_options.clock, io_tracer, + immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS, + blob_file_read_hist, immutable_options.rate_limiter.get(), + immutable_options.listeners)); + + return Status::OK(); +} + +Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + uint32_t column_family_id, + Statistics* statistics, + CompressionType* compression_type) { + assert(file_reader); + assert(compression_type); + + Slice header_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile"); + + constexpr uint64_t read_offset = 0; + constexpr size_t read_size = BlobLogHeader::kSize; + + const Status s = + ReadFromFile(file_reader, read_offset, read_size, statistics, + &header_slice, &buf, &aligned_buf); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult", + &header_slice); + } + + BlobLogHeader header; + + { + const Status s = header.DecodeFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (header.has_ttl || header.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + if (header.column_family_id != column_family_id) { + return Status::Corruption("Column family ID mismatch"); + } + + *compression_type = header.compression; + + return Status::OK(); +} + +Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + uint64_t file_size, Statistics* statistics) { + assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); + assert(file_reader); + + Slice footer_slice; + Buffer buf; + AlignedBuf aligned_buf; + + { + TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile"); + + const uint64_t read_offset = file_size - BlobLogFooter::kSize; + constexpr size_t read_size = BlobLogFooter::kSize; + + const Status s = + ReadFromFile(file_reader, read_offset, read_size, statistics, + &footer_slice, &buf, 
&aligned_buf); + if (!s.ok()) { + return s; + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult", + &footer_slice); + } + + BlobLogFooter footer; + + { + const Status s = footer.DecodeFrom(footer_slice); + if (!s.ok()) { + return s; + } + } + + constexpr ExpirationRange no_expiration_range; + + if (footer.expiration_range != no_expiration_range) { + return Status::Corruption("Unexpected TTL blob file"); + } + + return Status::OK(); +} + +Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, + Buffer* buf, AlignedBuf* aligned_buf) { + assert(slice); + assert(buf); + assert(aligned_buf); + + assert(file_reader); + + RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size); + + Status s; + + if (file_reader->use_direct_io()) { + constexpr char* scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch, + aligned_buf); + } else { + buf->reset(new char[read_size]); + constexpr AlignedBuf* aligned_scratch = nullptr; + + s = file_reader->Read(IOOptions(), read_offset, read_size, slice, + buf->get(), aligned_scratch); + } + + if (!s.ok()) { + return s; + } + + if (slice->size() != read_size) { + return Status::Corruption("Failed to read data from blob file"); + } + + return Status::OK(); +} + +BlobFileReader::BlobFileReader( + std::unique_ptr&& file_reader, uint64_t file_size, + CompressionType compression_type, SystemClock* clock, + Statistics* statistics) + : file_reader_(std::move(file_reader)), + file_size_(file_size), + compression_type_(compression_type), + clock_(clock), + statistics_(statistics) { + assert(file_reader_); +} + +BlobFileReader::~BlobFileReader() = default; + +Status BlobFileReader::GetBlob(const ReadOptions& read_options, + const Slice& user_key, uint64_t offset, + uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, + 
PinnableSlice* value, + uint64_t* bytes_read) const { + assert(value); + + const uint64_t key_size = user_key.size(); + + if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) { + return Status::Corruption("Invalid blob offset"); + } + + if (compression_type != compression_type_) { + return Status::Corruption("Compression type mismatch when reading blob"); + } + + // Note: if verify_checksum is set, we read the entire blob record to be able + // to perform the verification; otherwise, we just read the blob itself. Since + // the offset in BlobIndex actually points to the blob value, we need to make + // an adjustment in the former case. + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(offset >= adjustment); + + const uint64_t record_offset = offset - adjustment; + const uint64_t record_size = value_size + adjustment; + + Slice record_slice; + Buffer buf; + AlignedBuf aligned_buf; + + bool prefetched = false; + + if (prefetch_buffer) { + Status s; + constexpr bool for_compaction = true; + + prefetched = prefetch_buffer->TryReadFromCache( + IOOptions(), file_reader_.get(), record_offset, + static_cast(record_size), &record_slice, &s, for_compaction); + if (!s.ok()) { + return s; + } + } + + if (!prefetched) { + TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile"); + + const Status s = ReadFromFile(file_reader_.get(), record_offset, + static_cast(record_size), statistics_, + &record_slice, &buf, &aligned_buf); + if (!s.ok()) { + return s; + } + } + + TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult", + &record_slice); + + if (read_options.verify_checksums) { + const Status s = VerifyBlob(record_slice, user_key, value_size); + if (!s.ok()) { + return s; + } + } + + const Slice value_slice(record_slice.data() + adjustment, value_size); + + { + const Status s = UncompressBlobIfNeeded(value_slice, compression_type, + clock_, statistics_, value); + 
if (!s.ok()) { + return s; + } + } + + if (bytes_read) { + *bytes_read = record_size; + } + + return Status::OK(); +} + +void BlobFileReader::MultiGetBlob( + const ReadOptions& read_options, + const autovector>& user_keys, + const autovector& offsets, + const autovector& value_sizes, autovector& statuses, + autovector& values, uint64_t* bytes_read) const { + const size_t num_blobs = user_keys.size(); + assert(num_blobs > 0); + assert(num_blobs == offsets.size()); + assert(num_blobs == value_sizes.size()); + assert(num_blobs == statuses.size()); + assert(num_blobs == values.size()); + +#ifndef NDEBUG + for (size_t i = 0; i < offsets.size() - 1; ++i) { + assert(offsets[i] <= offsets[i + 1]); + } +#endif // !NDEBUG + + std::vector read_reqs(num_blobs); + autovector adjustments; + uint64_t total_len = 0; + for (size_t i = 0; i < num_blobs; ++i) { + const size_t key_size = user_keys[i].get().size(); + assert(IsValidBlobOffset(offsets[i], key_size, value_sizes[i], file_size_)); + const uint64_t adjustment = + read_options.verify_checksums + ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + : 0; + assert(offsets[i] >= adjustment); + adjustments.push_back(adjustment); + read_reqs[i].offset = offsets[i] - adjustment; + read_reqs[i].len = value_sizes[i] + adjustment; + total_len += read_reqs[i].len; + } + + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len); + + Buffer buf; + AlignedBuf aligned_buf; + + Status s; + bool direct_io = file_reader_->use_direct_io(); + if (direct_io) { + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = nullptr; + } + } else { + buf.reset(new char[total_len]); + std::ptrdiff_t pos = 0; + for (size_t i = 0; i < read_reqs.size(); ++i) { + read_reqs[i].scratch = buf.get() + pos; + pos += read_reqs[i].len; + } + } + TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile"); + s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(), + direct_io ? 
&aligned_buf : nullptr); + if (!s.ok()) { + for (auto& req : read_reqs) { + req.status.PermitUncheckedError(); + } + for (size_t i = 0; i < num_blobs; ++i) { + assert(statuses[i]); + *statuses[i] = s; + } + return; + } + + assert(s.ok()); + for (size_t i = 0; i < num_blobs; ++i) { + auto& req = read_reqs[i]; + assert(statuses[i]); + if (req.status.ok() && req.result.size() != req.len) { + req.status = IOStatus::Corruption("Failed to read data from blob file"); + } + *statuses[i] = req.status; + } + + if (read_options.verify_checksums) { + for (size_t i = 0; i < num_blobs; ++i) { + assert(statuses[i]); + if (!statuses[i]->ok()) { + continue; + } + const Slice& record_slice = read_reqs[i].result; + s = VerifyBlob(record_slice, user_keys[i], value_sizes[i]); + if (!s.ok()) { + assert(statuses[i]); + *statuses[i] = s; + } + } + } + + for (size_t i = 0; i < num_blobs; ++i) { + assert(statuses[i]); + if (!statuses[i]->ok()) { + continue; + } + const Slice& record_slice = read_reqs[i].result; + const Slice value_slice(record_slice.data() + adjustments[i], + value_sizes[i]); + s = UncompressBlobIfNeeded(value_slice, compression_type_, clock_, + statistics_, values[i]); + if (!s.ok()) { + *statuses[i] = s; + } + } + + if (bytes_read) { + uint64_t total_bytes = 0; + for (const auto& req : read_reqs) { + total_bytes += req.result.size(); + } + *bytes_read = total_bytes; + } +} + +Status BlobFileReader::VerifyBlob(const Slice& record_slice, + const Slice& user_key, uint64_t value_size) { + BlobLogRecord record; + + const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize); + + { + const Status s = record.DecodeHeaderFrom(header_slice); + if (!s.ok()) { + return s; + } + } + + if (record.key_size != user_key.size()) { + return Status::Corruption("Key size mismatch when reading blob"); + } + + if (record.value_size != value_size) { + return Status::Corruption("Value size mismatch when reading blob"); + } + + record.key = + Slice(record_slice.data() + 
BlobLogRecord::kHeaderSize, record.key_size); + if (record.key != user_key) { + return Status::Corruption("Key mismatch when reading blob"); + } + + record.value = Slice(record.key.data() + record.key_size, value_size); + + { + TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC", + &record); + + const Status s = record.CheckBlobCRC(); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + +Status BlobFileReader::UncompressBlobIfNeeded(const Slice& value_slice, + CompressionType compression_type, + SystemClock* clock, + Statistics* statistics, + PinnableSlice* value) { + assert(value); + + if (compression_type == kNoCompression) { + SaveValue(value_slice, value); + + return Status::OK(); + } + + UncompressionContext context(compression_type); + UncompressionInfo info(context, UncompressionDict::GetEmptyDict(), + compression_type); + + size_t uncompressed_size = 0; + constexpr uint32_t compression_format_version = 2; + constexpr MemoryAllocator* allocator = nullptr; + + CacheAllocationPtr output; + + { + StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS); + output = UncompressData(info, value_slice.data(), value_slice.size(), + &uncompressed_size, compression_format_version, + allocator); + } + + TEST_SYNC_POINT_CALLBACK( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output); + + if (!output) { + return Status::Corruption("Unable to uncompress blob"); + } + + SaveValue(Slice(output.get(), uncompressed_size), value); + + return Status::OK(); +} + +void BlobFileReader::SaveValue(const Slice& src, PinnableSlice* dst) { + assert(dst); + + if (dst->IsPinned()) { + dst->Reset(); + } + + dst->PinSelf(src); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "file/random_access_file_reader.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class Status; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +struct ReadOptions; +class Slice; +class FilePrefetchBuffer; +class PinnableSlice; +class Statistics; + +class BlobFileReader { + public: + static Status Create(const ImmutableOptions& immutable_options, + const FileOptions& file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + std::unique_ptr* reader); + + BlobFileReader(const BlobFileReader&) = delete; + BlobFileReader& operator=(const BlobFileReader&) = delete; + + ~BlobFileReader(); + + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + uint64_t offset, uint64_t value_size, + CompressionType compression_type, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + // offsets must be sorted in ascending order by caller. 
+ void MultiGetBlob( + const ReadOptions& read_options, + const autovector>& user_keys, + const autovector& offsets, + const autovector& value_sizes, autovector& statuses, + autovector& values, uint64_t* bytes_read) const; + + CompressionType GetCompressionType() const { return compression_type_; } + + uint64_t GetFileSize() const { return file_size_; } + + private: + BlobFileReader(std::unique_ptr&& file_reader, + uint64_t file_size, CompressionType compression_type, + SystemClock* clock, Statistics* statistics); + + static Status OpenFile(const ImmutableOptions& immutable_options, + const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + uint64_t* file_size, + std::unique_ptr* file_reader); + + static Status ReadHeader(const RandomAccessFileReader* file_reader, + uint32_t column_family_id, Statistics* statistics, + CompressionType* compression_type); + + static Status ReadFooter(const RandomAccessFileReader* file_reader, + uint64_t file_size, Statistics* statistics); + + using Buffer = std::unique_ptr; + + static Status ReadFromFile(const RandomAccessFileReader* file_reader, + uint64_t read_offset, size_t read_size, + Statistics* statistics, Slice* slice, Buffer* buf, + AlignedBuf* aligned_buf); + + static Status VerifyBlob(const Slice& record_slice, const Slice& user_key, + uint64_t value_size); + + static Status UncompressBlobIfNeeded(const Slice& value_slice, + CompressionType compression_type, + SystemClock* clock, + Statistics* statistics, + PinnableSlice* value); + + static void SaveValue(const Slice& src, PinnableSlice* dst); + + std::unique_ptr file_reader_; + uint64_t file_size_; + CompressionType compression_type_; + SystemClock* clock_; + Statistics* statistics_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,974 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_reader.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/compression.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with `num` blobs in it. 
+void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const std::vector& keys, + const std::vector& blobs, CompressionType compression, + std::vector& blob_offsets, + std::vector& blob_sizes) { + assert(!immutable_options.cf_paths.empty()); + size_t num = keys.size(); + assert(num == blobs.size()); + assert(num == blob_offsets.size()); + assert(num == blob_sizes.size()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + BlobLogHeader header(column_family_id, compression, has_ttl, + expiration_range_header); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + std::vector compressed_blobs(num); + std::vector blobs_to_write(num); + if (kNoCompression == compression) { + for (size_t i = 0; i < num; ++i) { + blobs_to_write[i] = blobs[i]; + blob_sizes[i] = blobs[i].size(); + } + } else { + CompressionOptions opts; + CompressionContext context(compression); + constexpr uint64_t sample_for_compression = 0; + CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), + compression, sample_for_compression); + + constexpr uint32_t compression_format_version = 2; + + for (size_t i = 0; i < num; ++i) { + ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version, + &compressed_blobs[i])); + blobs_to_write[i] 
= compressed_blobs[i]; + blob_sizes[i] = compressed_blobs[i].size(); + } + } + + for (size_t i = 0; i < num; ++i) { + uint64_t key_offset = 0; + ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset, + &blob_offsets[i])); + } + + BlobLogFooter footer; + footer.blob_count = num; + footer.expiration_range = expiration_range_footer; + + std::string checksum_method; + std::string checksum_value; + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +// Creates a test blob file with a single blob in it. Note: this method +// makes it possible to test various corner cases by allowing the caller +// to specify the contents of various blob file header/footer fields. +void WriteBlobFile(const ImmutableOptions& immutable_options, + uint32_t column_family_id, bool has_ttl, + const ExpirationRange& expiration_range_header, + const ExpirationRange& expiration_range_footer, + uint64_t blob_file_number, const Slice& key, + const Slice& blob, CompressionType compression, + uint64_t* blob_offset, uint64_t* blob_size) { + std::vector keys{key}; + std::vector blobs{blob}; + std::vector blob_offsets{0}; + std::vector blob_sizes{0}; + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, keys, blobs, compression, blob_offsets, + blob_sizes); + if (blob_offset) { + *blob_offset = blob_offsets[0]; + } + if (blob_size) { + *blob_size = blob_sizes[0]; + } +} + +} // anonymous namespace + +class BlobFileReaderTest : public testing::Test { + protected: + BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); } + std::unique_ptr mock_env_; +}; + +TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_CreateReaderAndGetBlob"), + 0); + options.enable_blob_files = true; + + ImmutableOptions 
immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr size_t num_blobs = 3; + const std::vector key_strs = {"key1", "key2", "key3"}; + const std::vector blob_strs = {"blob1", "blob2", "blob3"}; + + const std::vector keys = {key_strs[0], key_strs[1], key_strs[2]}; + const std::vector blobs = {blob_strs[0], blob_strs[1], blob_strs[2]}; + + std::vector blob_offsets(keys.size()); + std::vector blob_sizes(keys.size()); + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, keys, blobs, kNoCompression, + blob_offsets, blob_sizes); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read)); + ASSERT_EQ(value, blobs[0]); + ASSERT_EQ(bytes_read, blob_sizes[0]); + + // MultiGetBlob + bytes_read = 0; + size_t total_size = 0; + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + autovector offsets{blob_offsets[0], blob_offsets[1], + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1], blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + 
reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + ASSERT_OK(statuses_buf[i]); + ASSERT_EQ(value_buf[i], blobs[i]); + total_size += blob_sizes[i]; + } + ASSERT_EQ(bytes_read, total_size); + } + + read_options.verify_checksums = true; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1], kNoCompression, prefetch_buffer, + &value, &bytes_read)); + ASSERT_EQ(value, blobs[1]); + + const uint64_t key_size = keys[1].size(); + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_sizes[1]); + } + + // Invalid offset (too close to start of file) + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0] - 1, + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + // Invalid offset (too close to end of file) + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[2], blob_offsets[2] + 1, + blob_sizes[2], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect compression type + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[0], blob_offsets[0], + blob_sizes[0], kZSTD, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + // Incorrect key size + { + constexpr char shorter_key[] = "k"; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, shorter_key, + blob_offsets[0] - + (keys[0].size() - sizeof(shorter_key) + 1), + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + 
autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1); + key_refs[1] = std::cref(shorter_key_slice); + + autovector offsets{ + blob_offsets[0], + blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()), + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1], blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + if (i == 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect key + { + constexpr char incorrect_key[] = "foo1"; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, incorrect_key, blob_offsets[0], + blob_sizes[0], kNoCompression, prefetch_buffer, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1); + key_refs[2] = std::cref(wrong_key_slice); + + autovector offsets{blob_offsets[0], blob_offsets[1], + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1], blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + if (i == num_blobs - 1) { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } else { + 
ASSERT_OK(statuses_buf[i]); + } + } + } + + // Incorrect value size + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(read_options, keys[1], blob_offsets[1], + blob_sizes[1] + 1, kNoCompression, + prefetch_buffer, &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + // MultiGetBlob + autovector> key_refs; + for (const auto& key_ref : keys) { + key_refs.emplace_back(std::cref(key_ref)); + } + autovector offsets{blob_offsets[0], blob_offsets[1], + blob_offsets[2]}; + autovector sizes{blob_sizes[0], blob_sizes[1] + 1, blob_sizes[2]}; + std::array statuses_buf; + autovector statuses{&statuses_buf[0], &statuses_buf[1], + &statuses_buf[2]}; + std::array value_buf; + autovector values{&value_buf[0], &value_buf[1], + &value_buf[2]}; + reader->MultiGetBlob(read_options, key_refs, offsets, sizes, statuses, + values, &bytes_read); + for (size_t i = 0; i < num_blobs; ++i) { + if (i != 1) { + ASSERT_OK(statuses_buf[i]); + } else { + ASSERT_TRUE(statuses_buf[i].IsCorruption()); + } + } + } +} + +TEST_F(BlobFileReaderTest, Malformed) { + // Write a blob file consisting of nothing but a header, and make sure we + // detect the error when we open it for reading + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr uint64_t blob_file_number = 1; + + { + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), + 
immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), + immutable_options.clock, statistics, + blob_file_number, use_fsync, do_flush); + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + } + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, TTL) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = true; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInHeader"), + 0); + options.enable_blob_files = 
true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + const ExpirationRange expiration_range_header( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr ExpirationRange expiration_range_footer; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_ExpirationRangeInFooter"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range_header; + const ExpirationRange expiration_range_footer( + 1, 2); // can be made constexpr when we adopt C++14 + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, + expiration_range_header, expiration_range_footer, + blob_file_number, key, blob, kNoCompression, &blob_offset, + &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, 
FileOptions(), + column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, + &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_IncorrectColumnFamily"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + constexpr uint32_t incorrect_column_family_id = 2; + + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), + incorrect_column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) + .IsCorruption()); +} + +TEST_F(BlobFileReaderTest, BlobCRCError) { + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + 
&blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { + BlobLogRecord* const record = static_cast(arg); + assert(record); + + record->blob_crc = 0xfaceb00c; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileReaderTest, Compression) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, 
&reader)); + + // Make sure the blob can be retrieved with and without checksum verification + ReadOptions read_options; + read_options.verify_checksums = false; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, &value, + &bytes_read)); + ASSERT_EQ(value, blob); + ASSERT_EQ(bytes_read, blob_size); + } + + read_options.verify_checksums = true; + + { + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, &value, + &bytes_read)); + ASSERT_EQ(value, blob); + + constexpr uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); + } +} + +TEST_F(BlobFileReaderTest, UncompressionError) { + if (!Snappy_Supported()) { + return; + } + + Options options; + options.env = mock_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderTest_UncompressionError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, + kSnappyCompression, &blob_offset, &blob_size); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); + + 
SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { + CacheAllocationPtr* const output = + static_cast(arg); + assert(output); + + output->reset(); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kSnappyCompression, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class BlobFileReaderIOErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileReaderIOErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get())); + } + + std::unique_ptr mock_env_; + std::unique_ptr fault_injection_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:GetFileSize", + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::ReadHeader:ReadFromFile", + "BlobFileReader::ReadFooter:ReadFromFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +TEST_P(BlobFileReaderIOErrorTest, IOError) { + // Simulates an I/O error during the specified step + + Options options; + options.env = fault_injection_env_.get(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(fault_injection_env_.get(), + "BlobFileReaderIOErrorTest_IOError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + 
constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + + const bool fail_during_create = + (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); + + if (fail_during_create) { + ASSERT_TRUE(s.IsIOError()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, &value, + &bytes_read) + .IsIOError()); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class BlobFileReaderDecodingErrorTest + : public testing::Test, + public testing::WithParamInterface { + protected: + BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) { + mock_env_.reset(MockEnv::Create(Env::Default())); + } + + std::unique_ptr mock_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::ReadHeader:TamperWithResult", + "BlobFileReader::ReadFooter:TamperWithResult", + "BlobFileReader::GetBlob:TamperWithResult"})); + +TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { + Options options; + options.env = mock_env_.get(); 
+ options.cf_paths.emplace_back( + test::PerThreadDBPath(mock_env_.get(), + "BlobFileReaderDecodingErrorTest_DecodingError"), + 0); + options.enable_blob_files = true; + + ImmutableOptions immutable_options(options); + + constexpr uint32_t column_family_id = 1; + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + constexpr uint64_t blob_file_number = 1; + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + uint64_t blob_offset = 0; + uint64_t blob_size = 0; + + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) { + Slice* const slice = static_cast(arg); + assert(slice); + assert(!slice->empty()); + + slice->remove_prefix(1); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + std::unique_ptr reader; + + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); + + const bool fail_during_create = + sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; + + if (fail_during_create) { + ASSERT_TRUE(s.IsCorruption()); + } else { + ASSERT_OK(s); + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + PinnableSlice value; + uint64_t bytes_read = 0; + + ASSERT_TRUE(reader + ->GetBlob(ReadOptions(), key, blob_offset, blob_size, + kNoCompression, prefetch_buffer, &value, + &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_garbage_meter.h" + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + flows_[blob_file_number].AddInFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) { + uint64_t blob_file_number = kInvalidBlobFileNumber; + uint64_t bytes = 0; + + const Status s = Parse(key, value, &blob_file_number, &bytes); + if (!s.ok()) { + return s; + } + + if (blob_file_number == kInvalidBlobFileNumber) { + return Status::OK(); + } + + // Note: in order to measure the amount of additional garbage, we only need to + // track the outflow for preexisting files, i.e. those that also had inflow. + // (Newly written files would only have outflow.) 
+ auto it = flows_.find(blob_file_number); + if (it == flows_.end()) { + return Status::OK(); + } + + it->second.AddOutFlow(bytes); + + return Status::OK(); +} + +Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes) { + assert(blob_file_number); + assert(*blob_file_number == kInvalidBlobFileNumber); + assert(bytes); + assert(*bytes == 0); + + ParsedInternalKey ikey; + + { + constexpr bool log_err_key = false; + const Status s = ParseInternalKey(key, &ikey, log_err_key); + if (!s.ok()) { + return s; + } + } + + if (ikey.type != kTypeBlobIndex) { + return Status::OK(); + } + + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return s; + } + } + + if (blob_index.IsInlined() || blob_index.HasTTL()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + *blob_file_number = blob_index.file_number(); + *bytes = + blob_index.size() + + BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size()); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "db/blob/blob_constants.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +// A class that can be used to compute the amount of additional garbage +// generated by a compaction. It parses the keys and blob references in the +// input and output of a compaction, and aggregates the "inflow" and "outflow" +// on a per-blob file basis. The amount of additional garbage for any given blob +// file can then be computed by subtracting the outflow from the inflow. +class BlobGarbageMeter { + public: + // A class to store the number and total size of blobs on a per-blob file + // basis. + class BlobStats { + public: + void Add(uint64_t bytes) { + ++count_; + bytes_ += bytes; + } + void Add(uint64_t count, uint64_t bytes) { + count_ += count; + bytes_ += bytes; + } + + uint64_t GetCount() const { return count_; } + uint64_t GetBytes() const { return bytes_; } + + private: + uint64_t count_ = 0; + uint64_t bytes_ = 0; + }; + + // A class to keep track of the "inflow" and the "outflow" and to compute the + // amount of additional garbage for a given blob file. 
+ class BlobInOutFlow { + public: + void AddInFlow(uint64_t bytes) { + in_flow_.Add(bytes); + assert(IsValid()); + } + void AddOutFlow(uint64_t bytes) { + out_flow_.Add(bytes); + assert(IsValid()); + } + + const BlobStats& GetInFlow() const { return in_flow_; } + const BlobStats& GetOutFlow() const { return out_flow_; } + + bool IsValid() const { + return in_flow_.GetCount() >= out_flow_.GetCount() && + in_flow_.GetBytes() >= out_flow_.GetBytes(); + } + bool HasGarbage() const { + assert(IsValid()); + return in_flow_.GetCount() > out_flow_.GetCount(); + } + uint64_t GetGarbageCount() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetCount() - out_flow_.GetCount(); + } + uint64_t GetGarbageBytes() const { + assert(IsValid()); + assert(HasGarbage()); + return in_flow_.GetBytes() - out_flow_.GetBytes(); + } + + private: + BlobStats in_flow_; + BlobStats out_flow_; + }; + + Status ProcessInFlow(const Slice& key, const Slice& value); + Status ProcessOutFlow(const Slice& key, const Slice& value); + + const std::unordered_map& flows() const { + return flows_; + } + + private: + static Status Parse(const Slice& key, const Slice& value, + uint64_t* blob_file_number, uint64_t* bytes); + + std::unordered_map flows_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_garbage_meter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_garbage_meter.h" + +#include +#include + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/dbformat.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(BlobGarbageMeterTest, MeasureGarbage) { + BlobGarbageMeter blob_garbage_meter; + + struct BlobDescriptor { + std::string user_key; + uint64_t blob_file_number; + uint64_t offset; + uint64_t size; + CompressionType compression_type; + bool has_in_flow; + bool has_out_flow; + + uint64_t GetExpectedBytes() const { + return size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(user_key.size()); + } + }; + + // Note: blob file 4 has the same inflow and outflow and hence no additional + // garbage. Blob file 5 has less outflow than inflow and thus it does have + // additional garbage. Blob file 6 is a newly written file (i.e. no inflow, + // only outflow) and is thus not tracked by the meter. + std::vector blobs{ + {"key", 4, 1234, 555, kLZ4Compression, true, true}, + {"other_key", 4, 6789, 101010, kLZ4Compression, true, true}, + {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true}, + {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true}, + {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false}, + {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false}, + {"new_key", 6, 7777, 9999, kNoCompression, false, true}}; + + for (const auto& blob : blobs) { + constexpr SequenceNumber seq = 123; + const InternalKey key(blob.user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + std::string value; + BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size, + blob.compression_type); + const Slice value_slice(value); + + if (blob.has_in_flow) { + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + } + if (blob.has_out_flow) { + ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + } + } + + const auto& flows = blob_garbage_meter.flows(); + 
ASSERT_EQ(flows.size(), 2); + + { + const auto it = flows.find(4); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + constexpr uint64_t expected_count = 2; + const uint64_t expected_bytes = + blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes(); + + const auto& in = flow.GetInFlow(); + ASSERT_EQ(in.GetCount(), expected_count); + ASSERT_EQ(in.GetBytes(), expected_bytes); + + const auto& out = flow.GetOutFlow(); + ASSERT_EQ(out.GetCount(), expected_count); + ASSERT_EQ(out.GetBytes(), expected_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_FALSE(flow.HasGarbage()); + } + + { + const auto it = flows.find(5); + ASSERT_NE(it, flows.end()); + + const auto& flow = it->second; + + const auto& in = flow.GetInFlow(); + + constexpr uint64_t expected_in_count = 4; + const uint64_t expected_in_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() + + blobs[4].GetExpectedBytes() + blobs[5].GetExpectedBytes(); + + ASSERT_EQ(in.GetCount(), expected_in_count); + ASSERT_EQ(in.GetBytes(), expected_in_bytes); + + const auto& out = flow.GetOutFlow(); + + constexpr uint64_t expected_out_count = 2; + const uint64_t expected_out_bytes = + blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes(); + + ASSERT_EQ(out.GetCount(), expected_out_count); + ASSERT_EQ(out.GetBytes(), expected_out_bytes); + + ASSERT_TRUE(flow.IsValid()); + ASSERT_TRUE(flow.HasGarbage()); + ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count); + ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes); + } +} + +TEST(BlobGarbageMeterTest, PlainValue) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeValue); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + 
ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); + ASSERT_TRUE(blob_garbage_meter.flows().empty()); +} + +TEST(BlobGarbageMeterTest, CorruptInternalKey) { + constexpr char corrupt_key[] = "i_am_corrupt"; + const Slice key_slice(corrupt_key); + + constexpr char value[] = "value"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +TEST(BlobGarbageMeterTest, CorruptBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr char value[] = "i_am_not_a_blob_index"; + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) { + constexpr char user_key[] = "user_key"; + constexpr SequenceNumber seq = 123; + + const InternalKey key(user_key, seq, kTypeBlobIndex); + const Slice key_slice = key.Encode(); + + constexpr uint64_t expiration = 1234567890; + constexpr char inlined_value[] = "inlined"; + + std::string value; + BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value); + + const Slice value_slice(value); + + BlobGarbageMeter blob_garbage_meter; + + ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice)); + ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_index.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_index.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_index.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/compression_type.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// BlobIndex is a pointer to the blob and metadata of the blob. The index is +// stored in base DB as ValueType::kTypeBlobIndex. +// There are three types of blob index: +// +// kInlinedTTL: +// +------+------------+---------------+ +// | type | expiration | value | +// +------+------------+---------------+ +// | char | varint64 | variable size | +// +------+------------+---------------+ +// +// kBlob: +// +------+-------------+----------+----------+-------------+ +// | type | file number | offset | size | compression | +// +------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | char | +// +------+-------------+----------+----------+-------------+ +// +// kBlobTTL: +// +------+------------+-------------+----------+----------+-------------+ +// | type | expiration | file number | offset | size | compression | +// +------+------------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | varint64 | char | +// +------+------------+-------------+----------+----------+-------------+ +// +// There isn't a kInlined (without TTL) type since we can store it as a plain +// value (i.e. ValueType::kTypeValue). 
+class BlobIndex { + public: + enum class Type : unsigned char { + kInlinedTTL = 0, + kBlob = 1, + kBlobTTL = 2, + kUnknown = 3, + }; + + BlobIndex() : type_(Type::kUnknown) {} + + BlobIndex(const BlobIndex&) = default; + BlobIndex& operator=(const BlobIndex&) = default; + + bool IsInlined() const { return type_ == Type::kInlinedTTL; } + + bool HasTTL() const { + return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; + } + + uint64_t expiration() const { + assert(HasTTL()); + return expiration_; + } + + const Slice& value() const { + assert(IsInlined()); + return value_; + } + + uint64_t file_number() const { + assert(!IsInlined()); + return file_number_; + } + + uint64_t offset() const { + assert(!IsInlined()); + return offset_; + } + + uint64_t size() const { + assert(!IsInlined()); + return size_; + } + + CompressionType compression() const { + assert(!IsInlined()); + return compression_; + } + + Status DecodeFrom(Slice slice) { + static const std::string kErrorMessage = "Error while decoding blob index"; + assert(slice.size() > 0); + type_ = static_cast(*slice.data()); + if (type_ >= Type::kUnknown) { + return Status::Corruption( + kErrorMessage, + "Unknown blob index type: " + ToString(static_cast(type_))); + } + slice = Slice(slice.data() + 1, slice.size() - 1); + if (HasTTL()) { + if (!GetVarint64(&slice, &expiration_)) { + return Status::Corruption(kErrorMessage, "Corrupted expiration"); + } + } + if (IsInlined()) { + value_ = slice; + } else { + if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && + GetVarint64(&slice, &size_) && slice.size() == 1) { + compression_ = static_cast(*slice.data()); + } else { + return Status::Corruption(kErrorMessage, "Corrupted blob offset"); + } + } + return Status::OK(); + } + + std::string DebugString(bool output_hex) const { + std::ostringstream oss; + + if (IsInlined()) { + oss << "[inlined blob] value:" << value_.ToString(output_hex); + } else { + oss << "[blob ref] file:" << file_number_ << 
" offset:" << offset_ + << " size:" << size_ + << " compression: " << CompressionTypeToString(compression_); + } + + if (HasTTL()) { + oss << " exp:" << expiration_; + } + + return oss.str(); + } + + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, + const Slice& value) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(1 + kMaxVarint64Length + value.size()); + dst->push_back(static_cast(Type::kInlinedTTL)); + PutVarint64(dst, expiration); + dst->append(value.data(), value.size()); + } + + static void EncodeBlob(std::string* dst, uint64_t file_number, + uint64_t offset, uint64_t size, + CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 3 + 2); + dst->push_back(static_cast(Type::kBlob)); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + static void EncodeBlobTTL(std::string* dst, uint64_t expiration, + uint64_t file_number, uint64_t offset, + uint64_t size, CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 4 + 2); + dst->push_back(static_cast(Type::kBlobTTL)); + PutVarint64(dst, expiration); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + private: + Type type_ = Type::kUnknown; + uint64_t expiration_ = 0; + Slice value_; + uint64_t file_number_ = 0; + uint64_t offset_ = 0; + uint64_t size_ = 0; + CompressionType compression_ = kNoCompression; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 
+1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "db/blob/blob_log_format.h" + +#include "util/coding.h" +#include "util/crc32c.h" + +namespace ROCKSDB_NAMESPACE { + +void BlobLogHeader::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogHeader::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed32(dst, version); + PutFixed32(dst, column_family_id); + unsigned char flags = (has_ttl ? 1 : 0); + dst->push_back(flags); + dst->push_back(compression); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); +} + +Status BlobLogHeader::DecodeFrom(Slice src) { + static const std::string kErrorMessage = + "Error while decoding blob log header"; + if (src.size() != BlobLogHeader::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file header size"); + } + uint32_t magic_number; + unsigned char flags; + if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) || + !GetFixed32(&src, &column_family_id)) { + return Status::Corruption( + kErrorMessage, + "Error decoding magic number, version and column family id"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (version != kVersion1) { + return Status::Corruption(kErrorMessage, "Unknown header version"); + } + flags = src.data()[0]; + compression = static_cast(src.data()[1]); + has_ttl = (flags & 1) == 1; + src.remove_prefix(2); + if (!GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second)) { + return Status::Corruption(kErrorMessage, "Error decoding expiration range"); + } + return Status::OK(); +} + +void BlobLogFooter::EncodeTo(std::string* dst) { + assert(dst != nullptr); + 
dst->clear(); + dst->reserve(BlobLogFooter::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed64(dst, blob_count); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); + crc = crc32c::Value(dst->c_str(), dst->size()); + crc = crc32c::Mask(crc); + PutFixed32(dst, crc); +} + +Status BlobLogFooter::DecodeFrom(Slice src) { + static const std::string kErrorMessage = + "Error while decoding blob log footer"; + if (src.size() != BlobLogFooter::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file footer size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t)); + src_crc = crc32c::Mask(src_crc); + uint32_t magic_number = 0; + if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) || + !GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (src_crc != crc) { + return Status::Corruption(kErrorMessage, "CRC mismatch"); + } + return Status::OK(); +} + +void BlobLogRecord::EncodeHeaderTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size()); + PutFixed64(dst, key.size()); + PutFixed64(dst, value.size()); + PutFixed64(dst, expiration); + header_crc = crc32c::Value(dst->c_str(), dst->size()); + header_crc = crc32c::Mask(header_crc); + PutFixed32(dst, header_crc); + blob_crc = crc32c::Value(key.data(), key.size()); + blob_crc = crc32c::Extend(blob_crc, value.data(), value.size()); + blob_crc = crc32c::Mask(blob_crc); + PutFixed32(dst, blob_crc); +} + +Status BlobLogRecord::DecodeHeaderFrom(Slice src) { + static const std::string kErrorMessage = "Error while decoding blob record"; + if (src.size() != 
BlobLogRecord::kHeaderSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob record header size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8); + src_crc = crc32c::Mask(src_crc); + if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) || + !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) || + !GetFixed32(&src, &blob_crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (src_crc != header_crc) { + return Status::Corruption(kErrorMessage, "Header CRC mismatch"); + } + return Status::OK(); +} + +Status BlobLogRecord::CheckBlobCRC() const { + uint32_t expected_crc = 0; + expected_crc = crc32c::Value(key.data(), key.size()); + expected_crc = crc32c::Extend(expected_crc, value.data(), value.size()); + expected_crc = crc32c::Mask(expected_crc); + if (expected_crc != blob_crc) { + return Status::Corruption("Blob CRC mismatch"); + } + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_format.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_format.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,149 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Log format information shared by reader and writer. 
+ +#pragma once + +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37 +constexpr uint32_t kVersion1 = 1; + +using ExpirationRange = std::pair; + +// Format of blob log file header (30 bytes): +// +// +--------------+---------+---------+-------+-------------+-------------------+ +// | magic number | version | cf id | flags | compression | expiration range | +// +--------------+---------+---------+-------+-------------+-------------------+ +// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 | +// +--------------+---------+---------+-------+-------------+-------------------+ +// +// List of flags: +// has_ttl: Whether the file contain TTL data. +// +// Expiration range in the header is a rough range based on +// blob_db_options.ttl_range_secs. +struct BlobLogHeader { + static constexpr size_t kSize = 30; + + BlobLogHeader() = default; + BlobLogHeader(uint32_t _column_family_id, CompressionType _compression, + bool _has_ttl, const ExpirationRange& _expiration_range) + : column_family_id(_column_family_id), + compression(_compression), + has_ttl(_has_ttl), + expiration_range(_expiration_range) {} + + uint32_t version = kVersion1; + uint32_t column_family_id = 0; + CompressionType compression = kNoCompression; + bool has_ttl = false; + ExpirationRange expiration_range; + + void EncodeTo(std::string* dst); + + Status DecodeFrom(Slice slice); +}; + +// Format of blob log file footer (32 bytes): +// +// +--------------+------------+-------------------+------------+ +// | magic number | blob count | expiration range | footer CRC | +// +--------------+------------+-------------------+------------+ +// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed32 | +// +--------------+------------+-------------------+------------+ +// +// The footer will be presented only when the blob file is 
properly closed. +// +// Unlike the same field in file header, expiration range in the footer is the +// range of smallest and largest expiration of the data in this file. +struct BlobLogFooter { + static constexpr size_t kSize = 32; + + uint64_t blob_count = 0; + ExpirationRange expiration_range = std::make_pair(0, 0); + uint32_t crc = 0; + + void EncodeTo(std::string* dst); + + Status DecodeFrom(Slice slice); +}; + +// Blob record format (32 bytes header + key + value): +// +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | key length | value length | expiration | header CRC | blob CRC | key | value | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// +// If file has has_ttl = false, expiration field is always 0, and the blob +// doesn't has expiration. +// +// Also note that if compression is used, value is compressed value and value +// length is compressed value length. +// +// Header CRC is the checksum of (key_len + val_len + expiration), while +// blob CRC is the checksum of (key + value). +// +// We could use variable length encoding (Varint64) to save more space, but it +// make reader more complicated. +struct BlobLogRecord { + // header include fields up to blob CRC + static constexpr size_t kHeaderSize = 32; + + // Note that the offset field of BlobIndex actually points to the blob value + // as opposed to the start of the blob record. The following method can + // be used to calculate the adjustment needed to read the blob record header. 
+ static constexpr uint64_t CalculateAdjustmentForRecordHeader( + uint64_t key_size) { + return key_size + kHeaderSize; + } + + uint64_t key_size = 0; + uint64_t value_size = 0; + uint64_t expiration = 0; + uint32_t header_crc = 0; + uint32_t blob_crc = 0; + Slice key; + Slice value; + std::unique_ptr key_buf; + std::unique_ptr value_buf; + + uint64_t record_size() const { return kHeaderSize + key_size + value_size; } + + void EncodeHeaderTo(std::string* dst); + + Status DecodeHeaderFrom(Slice src); + + Status CheckBlobCRC() const; +}; + +// Checks whether a blob offset is potentially valid or not. +inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size, + uint64_t value_size, uint64_t file_size) { + if (value_offset < + BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) { + return false; + } + + if (value_offset + value_size + BlobLogFooter::kSize > file_size) { + return false; + } + + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// + +#include "db/blob/blob_log_sequential_reader.h" + +#include "file/random_access_file_reader.h" +#include "monitoring/statistics.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogSequentialReader::BlobLogSequentialReader( + std::unique_ptr&& file_reader, SystemClock* clock, + Statistics* statistics) + : file_(std::move(file_reader)), + clock_(clock), + statistics_(statistics), + next_byte_(0) {} + +BlobLogSequentialReader::~BlobLogSequentialReader() = default; + +Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, + char* buf) { + assert(slice); + assert(file_); + + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + Status s = file_->Read(IOOptions(), next_byte_, static_cast(size), + slice, buf, nullptr); + next_byte_ += size; + if (!s.ok()) { + return s; + } + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size()); + if (slice->size() != size) { + return Status::Corruption("EOF reached while reading record"); + } + return s; +} + +Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) { + assert(header); + assert(next_byte_ == 0); + + static_assert(BlobLogHeader::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogHeader::kSize"); + + Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogHeader::kSize) { + return Status::Corruption("EOF reached before file header"); + } + + return header->DecodeFrom(buffer_); +} + +Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record, + ReadLevel level, + uint64_t* blob_offset) { + assert(record); + static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogRecord::kHeaderSize"); + + Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + if (buffer_.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption("EOF reached before record 
header"); + } + + s = record->DecodeHeaderFrom(buffer_); + if (!s.ok()) { + return s; + } + + uint64_t kb_size = record->key_size + record->value_size; + if (blob_offset != nullptr) { + *blob_offset = next_byte_ + record->key_size; + } + + switch (level) { + case kReadHeader: + next_byte_ += kb_size; + break; + + case kReadHeaderKey: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); + next_byte_ += record->value_size; + break; + + case kReadHeaderKeyBlob: + record->key_buf.reset(new char[record->key_size]); + s = ReadSlice(record->key_size, &record->key, record->key_buf.get()); + if (s.ok()) { + record->value_buf.reset(new char[record->value_size]); + s = ReadSlice(record->value_size, &record->value, + record->value_buf.get()); + } + if (s.ok()) { + s = record->CheckBlobCRC(); + } + break; + } + return s; +} + +Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) { + assert(footer); + static_assert(BlobLogFooter::kSize <= sizeof(header_buf_), + "Buffer is smaller than BlobLogFooter::kSize"); + + Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_); + if (!s.ok()) { + return s; + } + + if (buffer_.size() != BlobLogFooter::kSize) { + return Status::Corruption("EOF reached before file footer"); + } + + return footer->DecodeFrom(buffer_); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_sequential_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" + +#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c)) + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFileReader; +class Env; +class Statistics; +class Status; +class SystemClock; + +/** + * BlobLogSequentialReader is a general purpose log stream reader + * implementation. The actual job of reading from the device is implemented by + * the RandomAccessFileReader interface. + * + * Please see BlobLogWriter for details on the file and record layout. + */ + +class BlobLogSequentialReader { + public: + enum ReadLevel { + kReadHeader, + kReadHeaderKey, + kReadHeaderKeyBlob, + }; + + // Create a reader that will return log records from "*file_reader". + BlobLogSequentialReader(std::unique_ptr&& file_reader, + SystemClock* clock, Statistics* statistics); + + // No copying allowed + BlobLogSequentialReader(const BlobLogSequentialReader&) = delete; + BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete; + + ~BlobLogSequentialReader(); + + Status ReadHeader(BlobLogHeader* header); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. The contents filled in + // *record will only be valid until the next mutating operation on this + // reader. + // If blob_offset is non-null, return offset of the blob through it. 
+ Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader, + uint64_t* blob_offset = nullptr); + + Status ReadFooter(BlobLogFooter* footer); + + void ResetNextByte() { next_byte_ = 0; } + + uint64_t GetNextByte() const { return next_byte_; } + + private: + Status ReadSlice(uint64_t size, Slice* slice, char* buf); + + const std::unique_ptr file_; + SystemClock* clock_; + + Statistics* statistics_; + + Slice buffer_; + char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize, + BlobLogRecord::kHeaderSize)]; + + // which byte to read next + uint64_t next_byte_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#undef MAX_HEADER_SIZE \ No newline at end of file diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,172 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_log_writer.h" + +#include +#include + +#include "db/blob/blob_log_format.h" +#include "file/writable_file_writer.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { + +BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, + SystemClock* clock, Statistics* statistics, + uint64_t log_number, bool use_fs, bool do_flush, + uint64_t boffset) + : dest_(std::move(dest)), + clock_(clock), + statistics_(statistics), + log_number_(log_number), + block_offset_(boffset), + use_fsync_(use_fs), + do_flush_(do_flush), + last_elem_type_(kEtNone) {} + +BlobLogWriter::~BlobLogWriter() = default; + +Status BlobLogWriter::Sync() { + TEST_SYNC_POINT("BlobLogWriter::Sync"); + + StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); + Status s = dest_->Sync(use_fsync_); + RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); + return s; +} + +Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { + assert(block_offset_ == 0); + assert(last_elem_type_ == kEtNone); + std::string str; + header.EncodeTo(&str); + + Status s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + if (do_flush_) { + s = dest_->Flush(); + } + } + last_elem_type_ = kEtFileHdr; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogHeader::kSize); + return s; +} + +Status BlobLogWriter::AppendFooter(BlobLogFooter& footer, + std::string* checksum_method, + std::string* checksum_value) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string str; + footer.EncodeTo(&str); + + Status s = dest_->Append(Slice(str)); + if (s.ok()) { + block_offset_ += str.size(); + + s = Sync(); + + if (s.ok()) { + s = dest_->Close(); + + if (s.ok()) { + assert(!!checksum_method == !!checksum_value); + + if (checksum_method) { + 
assert(checksum_method->empty()); + + std::string method = dest_->GetFileChecksumFuncName(); + if (method != kUnknownFileChecksumFuncName) { + *checksum_method = std::move(method); + } + } + if (checksum_value) { + assert(checksum_value->empty()); + + std::string value = dest_->GetFileChecksum(); + if (value != kUnknownFileChecksum) { + *checksum_value = std::move(value); + } + } + } + } + + dest_.reset(); + } + + last_elem_type_ = kEtFileFooter; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogFooter::kSize); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, expiration); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val, + uint64_t* key_offset, uint64_t* blob_offset) { + assert(block_offset_ != 0); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); + + std::string buf; + ConstructBlobHeader(&buf, key, val, 0); + + Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); + return s; +} + +void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration) { + BlobLogRecord record; + record.key = key; + record.value = val; + record.expiration = expiration; + record.EncodeHeaderTo(buf); +} + +Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, + const Slice& key, const Slice& val, + uint64_t* key_offset, + uint64_t* blob_offset) { + StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); + Status s = dest_->Append(Slice(headerbuf)); + if (s.ok()) { + s = dest_->Append(key); + } + if (s.ok()) { + s = dest_->Append(val); + } + if (do_flush_ && s.ok()) { + s = 
dest_->Flush(); + } + + *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; + *blob_offset = *key_offset + key.size(); + block_offset_ = *blob_offset + val.size(); + last_elem_type_ = kEtRecord; + RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + BlobLogRecord::kHeaderSize + key.size() + val.size()); + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/blob_log_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include +#include + +#include "db/blob/blob_log_format.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriter; +class SystemClock; +/** + * BlobLogWriter is the blob log stream writer. It provides an append-only + * abstraction for writing blob data. + * + * + * Look at blob_db_format.h to see the details of the record formats. + */ + +class BlobLogWriter { + public: + // Create a writer that will append data to "*dest". + // "*dest" must be initially empty. + // "*dest" must remain live while this BlobLogWriter is in use. 
+ BlobLogWriter(std::unique_ptr&& dest, SystemClock* clock, + Statistics* statistics, uint64_t log_number, bool use_fsync, + bool do_flush, uint64_t boffset = 0); + // No copying allowed + BlobLogWriter(const BlobLogWriter&) = delete; + BlobLogWriter& operator=(const BlobLogWriter&) = delete; + + ~BlobLogWriter(); + + static void ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration); + + Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, + uint64_t* blob_offset); + + Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration, + uint64_t* key_offset, uint64_t* blob_offset); + + Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, + const Slice& val, uint64_t* key_offset, + uint64_t* blob_offset); + + Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method, + std::string* checksum_value); + + Status WriteHeader(BlobLogHeader& header); + + WritableFileWriter* file() { return dest_.get(); } + + const WritableFileWriter* file() const { return dest_.get(); } + + uint64_t get_log_number() const { return log_number_; } + + Status Sync(); + + private: + std::unique_ptr dest_; + SystemClock* clock_; + Statistics* statistics_; + uint64_t log_number_; + uint64_t block_offset_; // Current offset in block + bool use_fsync_; + bool do_flush_; + + public: + enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter }; + ElemType last_elem_type_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1026 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include + +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobBasicTest : public DBTestBase { + protected: + DBBlobBasicTest() + : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {} +}; + +TEST_F(DBBlobBasicTest, GetBlob) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get(key), blob_value); + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches; however, the blob itself can only be + // read from the blob file, so the read should return Incomplete. + ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + + PinnableSlice result; + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); +} + +TEST_F(DBBlobBasicTest, MultiGetBlobs) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. 
+ constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + std::array keys{{first_key, second_key, third_key}}; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. 
+ read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) { + Options options = GetDefaultOptions(); + + // First, create an external SST file ["b"]. + const std::string file_path = dbname_ + "/test.sst"; + { + SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions()); + Status s = sst_file_writer.Open(file_path); + ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put("b", "b_value")); + ASSERT_OK(sst_file_writer.Finish()); + } + + options.enable_blob_files = true; + options.min_blob_size = 1000; + options.use_direct_reads = true; + options.allow_ingest_behind = true; + + // Open DB with fixed-prefix sst-partitioner so that compaction will cut + // new table file when encountering a new key whose 1-byte prefix changes. + constexpr size_t key_len = 1; + options.sst_partitioner_factory = + NewSstPartitionerFixedPrefixFactory(key_len); + + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + ROCKSDB_GTEST_SKIP("This test requires direct IO support"); + return; + } + ASSERT_OK(s); + + constexpr size_t num_keys = 3; + constexpr size_t blob_size = 3000; + + constexpr char first_key[] = "a"; + const std::string first_blob(blob_size, 'a'); + ASSERT_OK(Put(first_key, first_blob)); + + constexpr char second_key[] = "b"; + const std::string second_blob(2 * blob_size, 'b'); + ASSERT_OK(Put(second_key, second_blob)); + + constexpr char third_key[] = "d"; + const std::string third_blob(blob_size, 'd'); + ASSERT_OK(Put(third_key, third_blob)); + + // first_blob, second_blob and third_blob in the same blob file. 
+ // SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| + // | | | ^ ^ ^ + // | | | | | | + // | | +---------|-------|--------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + constexpr char fourth_key[] = "c"; + const std::string fourth_blob(blob_size, 'c'); + ASSERT_OK(Put(fourth_key, fourth_blob)); + // fourth_blob in another blob file. + // SST Blob file SST Blob file + // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| ["c"] |'cccc'| + // | | | ^ ^ ^ | ^ + // | | | | | | | | + // | | +---------|-------|--------+ +-------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + // Due to the above sst partitioner, we get 4 L1 files. The blob files are + // unchanged. + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1)); + + { + // Ingest the external SST file into bottommost level. + std::vector ext_files{file_path}; + IngestExternalFileOptions opts; + opts.ingest_behind = true; + ASSERT_OK( + db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts)); + } + + // Now the database becomes as follows. + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["b"] ["c"] | | ["d"] | + // | | | | | | + // | | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------------------------+ + // + // L6 ["b"] + + { + // Compact ["b"] to bottommost level. 
+ Slice begin = Slice(second_key); + Slice end = Slice(second_key); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, &begin, &end)); + } + + // |'aaaa', 'bbbb', 'dddd'| |'cccc'| + // ^ ^ ^ ^ + // | | | | + // L0 | | | | + // L1 ["a"] ["c"] | | ["d"] | + // | | | | | + // | +---------|-------|---------------+ + // | +-----------------|-------+ + // +-------|-----------------+ + // | + // L6 ["b"] + ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1)); + ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6)); + + bool called = false; + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) { + auto* aligned_reqs = static_cast*>(arg); + assert(aligned_reqs); + ASSERT_EQ(1, aligned_reqs->size()); + called = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::array keys{{first_key, third_key, second_key}}; + + { + std::array values; + std::array statuses; + + // The MultiGet(), when constructing the KeyContexts, will process the keys + // in such order: a, d, b. The reason is that ["a"] and ["d"] are in L1, + // while ["b"] resides in L6. + // Consequently, the original FSReadRequest list prepared by + // Version::MultiGetblob() will be for "a", "d" and "b". It is unsorted as + // follows: + // + // ["a", offset=30, len=3033], + // ["d", offset=9096, len=3033], + // ["b", offset=3063, len=6033] + // + // If we do not sort them before calling MultiRead() in DirectIO, then the + // underlying IO merging logic will yield two requests. + // + // [offset=0, len=4096] (for "a") + // [offset=0, len=12288] (result of merging the request for "d" and "b") + // + // We need to sort them in Version::MultiGetBlob() so that the underlying + // IO merging logic in DirectIO mode works as expected. 
The correct + // behavior will be one aligned request: + // + // [offset=0, len=12288] + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(called); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_blob); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], third_blob); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], second_blob); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t kNumBlobFiles = 3; + constexpr size_t kNumBlobsPerFile = 3; + constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles; + + std::vector key_strs; + std::vector value_strs; + for (size_t i = 0; i < kNumBlobFiles; ++i) { + for (size_t j = 0; j < kNumBlobsPerFile; ++j) { + std::string key = "key" + std::to_string(i) + "_" + std::to_string(j); + std::string value = + "value_as_blob" + std::to_string(i) + "_" + std::to_string(j); + ASSERT_OK(Put(key, value)); + key_strs.push_back(key); + value_strs.push_back(value); + } + ASSERT_OK(Flush()); + } + assert(key_strs.size() == kNumKeys); + std::array keys; + for (size_t i = 0; i < keys.size(); ++i) { + keys[i] = key_strs[i]; + } + std::array values; + std::array statuses; + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), kNumKeys, &keys[0], + &values[0], &statuses[0]); + + for (size_t i = 0; i < kNumKeys; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(value_strs[i], values[i]); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a corrupt blob index. 
+ const std::string blob_index("foobar"); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + + DestroyAndReopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array key_strs; + std::array value_strs; + std::array keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + key_strs[i] = "foo" + std::to_string(i); + value_strs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_strs[i], value_strs[i])); + keys[i] = key_strs[i]; + } + + constexpr char key[] = "key"; + { + // Fake a corrupt blob index. + const std::string blob_index("foobar"); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + keys[kNumOfKeys] = Slice(static_cast(key), sizeof(key) - 1); + } + + ASSERT_OK(Flush()); + + std::array values; + std::array statuses; + db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/false); + for (size_t i = 0; i < kNumOfKeys + 1; ++i) { + if (i != kNumOfKeys) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("blob_value" + std::to_string(i), values[i]); + } else { + ASSERT_TRUE(statuses[i].IsCorruption()); + } + } +} + +TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t kNumOfKeys = 3; + std::array key_bufs; + std::array value_bufs; + std::array keys; + for (size_t i = 0; i < kNumOfKeys; ++i) { + 
key_bufs[i] = "foo" + std::to_string(i); + value_bufs[i] = "blob_value" + std::to_string(i); + ASSERT_OK(Put(key_bufs[i], value_bufs[i])); + keys[i] = key_bufs[i]; + } + ASSERT_OK(Flush()); + + std::array values; + std::array statuses; + ReadOptions read_opts; + read_opts.value_size_soft_limit = 1; + db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys, + keys.data(), values.data(), statuses.data(), + /*sorted_input=*/true); + for (const auto& s : statuses) { + ASSERT_TRUE(s.IsAborted()); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a blob index referencing a non-existent blob file. 
+ std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, GenerateIOTracing) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + std::string trace_file = dbname_ + "/io_trace_file"; + + Reopen(options); + { + // Create IO trace file + std::unique_ptr trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer)); + ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer))); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + ASSERT_EQ(Get(key), blob_value); + + ASSERT_OK(db_->EndIOTrace()); + ASSERT_OK(env_->FileExists(trace_file)); + } + { + // Parse trace file to check file operations related to blob files are + // recorded. + std::unique_ptr trace_reader; + ASSERT_OK( + NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader)); + IOTraceReader reader(std::move(trace_reader)); + + IOTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, static_cast(header.rocksdb_major_version)); + ASSERT_EQ(kMinorVersion, static_cast(header.rocksdb_minor_version)); + + // Read records. 
+ int blob_files_op_count = 0; + Status status; + while (true) { + IOTraceRecord record; + status = reader.ReadIOOp(&record); + if (!status.ok()) { + break; + } + if (record.file_name.find("blob") != std::string::npos) { + blob_files_op_count++; + } + } + // Assuming blob files will have Append, Close and then Read operations. + ASSERT_GT(blob_files_op_count, 2); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + Reopen(options); + + ASSERT_OK(dbfull()->DisableFileDeletions()); + constexpr int kNumTableFiles = 2; + for (int i = 0; i < kNumTableFiles; ++i) { + for (char ch = 'a'; ch != 'c'; ++ch) { + std::string key(1, ch); + ASSERT_OK(Put(key, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + Close(); + + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + std::string blob_file_path; + uint64_t max_blob_file_num = kInvalidBlobFileNumber; + for (const auto& fname : files) { + uint64_t file_num = 0; + FileType type; + if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) && + type == kBlobFile) { + if (file_num > max_blob_file_num) { + max_blob_file_num = file_num; + blob_file_path = dbname_ + "/" + fname; + } + } + } + ASSERT_OK(env_->DeleteFile(blob_file_path)); + + options.best_efforts_recovery = true; + Reopen(options); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "a", &value)); + ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value); +} + +TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v2")); + ASSERT_OK(Flush()); + 
ASSERT_OK(Merge("Key1", "v3")); + ASSERT_OK(Flush()); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value)); + ASSERT_EQ(Get("Key1"), "v1,v2,v3"); +} + +TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { + constexpr size_t num_keys = 3; + + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key0", "v0_0")); + ASSERT_OK(Put("Key1", "v1_0")); + ASSERT_OK(Put("Key2", "v2_0")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_1")); + ASSERT_OK(Merge("Key1", "v1_1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_2")); + ASSERT_OK(Flush()); + + std::array keys{{"Key0", "Key1", "Key2"}}; + std::array values; + std::array statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], "v1_0,v1_1"); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], "v2_0"); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, Properties) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key1[] = "key1"; + constexpr size_t key1_size = sizeof(key1) - 1; + + constexpr char key2[] = "key2"; + constexpr size_t key2_size = sizeof(key2) - 1; + + constexpr char key3[] = "key3"; + constexpr size_t key3_size = sizeof(key3) - 1; + + constexpr char blob[] = "0000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + ASSERT_OK(Put(key1, blob)); + ASSERT_OK(Put(key2, blob)); + ASSERT_OK(Flush()); + + constexpr size_t first_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + blob_size + + 
BlobLogFooter::kSize; + + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr size_t second_blob_file_expected_size = + BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size + + BlobLogFooter::kSize; + + constexpr size_t total_expected_size = + first_blob_file_expected_size + second_blob_file_expected_size; + + // Number of blob files + uint64_t num_blob_files = 0; + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files)); + ASSERT_EQ(num_blob_files, 2); + + // Total size of live blob files + uint64_t live_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize, + &live_blob_file_size)); + ASSERT_EQ(live_blob_file_size, total_expected_size); + + // Total size of all blob files across all versions + // Note: this should be the same as above since we only have one + // version at this point. + uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, total_expected_size); + + // Delete key2 to create some garbage + ASSERT_OK(Delete(key2)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + constexpr size_t expected_garbage_size = + BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) + blob_size; + + // Blob file stats + std::string blob_stats; + ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats)); + + std::ostringstream oss; + oss << "Number of blob files: 2\nTotal size of blob files: " + << total_expected_size + << "\nTotal size of garbage in blob files: " << expected_garbage_size + << '\n'; + + ASSERT_EQ(blob_stats, oss.str()); +} + +TEST_F(DBBlobBasicTest, PropertiesMultiVersion) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + 
Reopen(options); + + constexpr char key1[] = "key1"; + constexpr char key2[] = "key2"; + constexpr char key3[] = "key3"; + + constexpr size_t key_size = sizeof(key1) - 1; + static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2"); + static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3"); + + constexpr char blob[] = "0000000000"; + constexpr size_t blob_size = sizeof(blob) - 1; + + ASSERT_OK(Put(key1, blob)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(key2, blob)); + ASSERT_OK(Flush()); + + // Create an iterator to keep the current version alive + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); + + // Note: the Delete and subsequent compaction results in the first blob file + // not making it to the final version. (It is still part of the previous + // version kept alive by the iterator though.) On the other hand, the Put + // results in a third blob file. + ASSERT_OK(Delete(key1)); + ASSERT_OK(Put(key3, blob)); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + // Total size of all blob files across all versions: between the two versions, + // we should have three blob files of the same size with one blob each. + // The version kept alive by the iterator contains the first and the second + // blob file, while the final version contains the second and the third blob + // file. (The second blob file is thus shared by the two versions but should + // be counted only once.) 
+ uint64_t total_blob_file_size = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize, + &total_blob_file_size)); + ASSERT_EQ(total_blob_file_size, + 3 * (BlobLogHeader::kSize + + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size + BlobLogFooter::kSize)); +} +#endif // !ROCKSDB_LITE + +class DBBlobBasicIOErrorTest : public DBBlobBasicTest, + public testing::WithParamInterface { + protected: + DBBlobBasicIOErrorTest() : sync_point_(GetParam()) { + fault_injection_env_.reset(new FaultInjectionTestEnv(env_)); + } + ~DBBlobBasicIOErrorTest() { Close(); } + + std::unique_ptr fault_injection_env_; + std::string sync_point_; +}; + +class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest { + public: + DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {} +}; + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::MultiGetBlob:ReadFromFile"})); + +TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) { + Options options; + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + std::array keys{{first_key, second_key}}; + std::array values; + std::array statuses; + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(statuses[0].IsIOError()); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char key1[] = "key1"; + constexpr char value1[] = "blob1"; + + ASSERT_OK(Put(key1, value1)); + ASSERT_OK(Flush()); + + constexpr char key2[] = "key2"; + constexpr char value2[] = "blob2"; + + ASSERT_OK(Put(key2, value2)); + ASSERT_OK(Flush()); + + std::array keys{{key1, key2}}; + std::array values; + std::array statuses; + + bool first_blob_file = true; + SyncPoint::GetInstance()->SetCallBack( + sync_point_, [&first_blob_file, this](void* /* arg */) { + if (first_blob_file) { + 
first_blob_file = false; + return; + } + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(statuses[0]); + ASSERT_EQ(value1, values[0]); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +namespace { + +class ReadBlobCompactionFilter : public CompactionFilter { + public: + ReadBlobCompactionFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.read.blob"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const override { + if (value_type != CompactionFilter::ValueType::kValue) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + return CompactionFilter::Decision::kChangeValue; + } +}; + +} // anonymous namespace + +TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + std::unique_ptr compaction_filter_guard( + new ReadBlobCompactionFilter); + options.compaction_filter = compaction_filter_guard.get(); + + DestroyAndReopen(options); + constexpr char key[] = "foo"; + constexpr char blob_value[] = "foo_blob_value"; + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + 
ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,718 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCompactionTest : public DBTestBase { + public: + explicit DBBlobCompactionTest() + : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {} + +#ifndef ROCKSDB_LITE + const std::vector& GetCompactionStats() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } +#endif // ROCKSDB_LITE +}; + +namespace { + +class FilterByKeyLength : public CompactionFilter { + public: + explicit FilterByKeyLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.key.length"; + } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class BadBlobCompactionFilter : public CompactionFilter { + public: + explicit BadBlobCompactionFilter(std::string prefix, + CompactionFilter::Decision filter_by_key, + CompactionFilter::Decision filter_v2) + : prefix_(std::move(prefix)), + filter_blob_by_key_(filter_by_key), + filter_v2_(filter_v2) {} + const char* Name() const override { return "rocksdb.compaction.filter.bad"; } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() >= prefix_.size() && + 
0 == strncmp(prefix_.data(), key.data(), prefix_.size())) { + return CompactionFilter::Decision::kUndetermined; + } + return filter_blob_by_key_; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return filter_v2_; + } + + private: + const std::string prefix_; + const CompactionFilter::Decision filter_blob_by_key_; + const CompactionFilter::Decision filter_v2_; +}; + +class ValueBlindWriteFilter : public CompactionFilter { + public: + explicit ValueBlindWriteFilter(std::string new_val) + : new_value_(std::move(new_val)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.blind.write"; + } + CompactionFilter::Decision FilterBlobByKey( + int level, const Slice& key, std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string new_value_; +}; + +CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey( + int /*level*/, const Slice& /*key*/, std::string* new_value, + std::string* /*skip_until*/) const { + assert(new_value); + new_value->assign(new_value_); + return CompactionFilter::Decision::kChangeValue; +} + +class ValueMutationFilter : public CompactionFilter { + public: + explicit ValueMutationFilter(std::string padding) + : padding_(std::move(padding)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.value.mutation"; + } + CompactionFilter::Decision FilterV2(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string padding_; +}; + +CompactionFilter::Decision ValueMutationFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + 
assert(CompactionFilter::ValueType::kBlobIndex != value_type); + if (CompactionFilter::ValueType::kValue != value_type) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + new_value->append(padding_); + return CompactionFilter::Decision::kChangeValue; +} + +class AlwaysKeepFilter : public CompactionFilter { + public: + explicit AlwaysKeepFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.always.keep"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return CompactionFilter::Decision::kKeep; + } +}; + +class SkipUntilFilter : public CompactionFilter { + public: + explicit SkipUntilFilter(std::string skip_until) + : skip_until_(std::move(skip_until)) {} + + const char* Name() const override { + return "rocksdb.compaction.filter.skip.until"; + } + + CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */, + ValueType /* value_type */, + const Slice& /* existing_value */, + std::string* /* new_value */, + std::string* skip_until) const override { + assert(skip_until); + *skip_until = skip_until_; + + return CompactionFilter::Decision::kRemoveAndSkipUntil; + } + + private: + std::string skip_until_; +}; + +} // anonymous namespace + +class DBBlobBadCompactionFilterTest + : public DBBlobCompactionTest, + public testing::WithParamInterface< + std::tuple> { + public: + explicit DBBlobBadCompactionFilterTest() + : compaction_filter_guard_(new BadBlobCompactionFilter( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()))) {} + + protected: + std::unique_ptr compaction_filter_guard_; +}; + +INSTANTIATE_TEST_CASE_P( + BadCompactionFilter, DBBlobBadCompactionFilterTest, + testing::Combine( + testing::Values("a"), + 
testing::Values(CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError), + testing::Values(CompactionFilter::Decision::kUndetermined, + CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError))); + +TEST_F(DBBlobCompactionTest, FilterByKeyLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr size_t kKeyLength = 2; + std::unique_ptr compaction_filter_guard( + new FilterByKeyLength(kKeyLength)); + options.compaction_filter = compaction_filter_guard.get(); + + constexpr char short_key[] = "a"; + constexpr char long_key[] = "abc"; + constexpr char blob_value[] = "value"; + + DestroyAndReopen(options); + ASSERT_OK(Put(short_key, blob_value)); + ASSERT_OK(Put(long_key, blob_value)); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound()); + value.clear(); + ASSERT_OK(db_->Get(ReadOptions(), long_key, &value)); + ASSERT_EQ("value", value); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove solely based on key; + // this involves neither reading nor writing blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, BlindWriteFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr char new_blob_value[] = "new_blob_value"; + std::unique_ptr compaction_filter_guard( + new ValueBlindWriteFilter(new_blob_value)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const 
std::vector keys = {"a", "b", "c"}; + const std::vector values = {"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& key : keys) { + ASSERT_EQ(new_blob_value, Get(key)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter unconditionally changes value in FilterBlobByKey; + // this involves writing but not reading blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, SkipUntilFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + std::unique_ptr compaction_filter_guard( + new SkipUntilFilter("z")); + options.compaction_filter = compaction_filter_guard.get(); + + Reopen(options); + + const std::vector keys{"a", "b", "c"}; + const std::vector values{"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + + ASSERT_OK(Flush()); + + int process_in_flow_called = 0; + + SyncPoint::GetInstance()->SetCallBack( + "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow", + [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr, + /* end */ nullptr)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + for (const auto& key : keys) { + ASSERT_EQ(Get(key), "NOT_FOUND"); + } + + // Make sure SkipUntil was performed using iteration rather than Seek + ASSERT_EQ(process_in_flow_called, 
keys.size()); + + Close(); +} + +TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.compaction_filter = compaction_filter_guard_.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); + + DestroyAndReopen(options); + std::string key(std::get<0>(GetParam())); + ASSERT_OK(Put(key, "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + // Fake an inlined TTL blob index. 
+ std::string blob_index; + constexpr uint64_t expiration = 1234567890; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + constexpr char padding[] = "_delta"; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter(padding)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector> kvs = { + {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}}; + for (const auto& kv : kvs) { + ASSERT_OK(Put(kv.first, kv.second)); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& kv : kvs) { + ASSERT_EQ(kv.second + std::string(padding), Get(kv.first)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter changes the value using the previous value in FilterV2; + // this involves reading and writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + // Mock a corrupted blob index + constexpr char 
key[] = "key"; + std::string blob_idx("blob_idx"); + WriteBatch write_batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&write_batch, 0, key, blob_idx)); + ASSERT_OK(db_->Write(WriteOptions(), &write_batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new AlwaysKeepFilter()); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + std::vector blob_files = GetBlobFileNumbers(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(blob_files, GetBlobFileNumbers()); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides to keep the existing value in FilterV2; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, TrackGarbage) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + Reopen(options); + + // First table+blob file pair: 4 blobs with different keys + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + 
ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + // Second table+blob file pair: overwrite 2 existing keys + constexpr char new_first_value[] = "new_first_value"; + constexpr char new_second_value[] = "new_second_value"; + + ASSERT_OK(Put(first_key, new_first_value)); + ASSERT_OK(Put(second_key, new_second_value)); + ASSERT_OK(Flush()); + + // Compact them together. The first blob file should have 2 garbage blobs + // corresponding to the 2 overwritten keys. + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 2); + + { + auto it = blob_files.begin(); + const auto& meta = it->second; + assert(meta); + + constexpr uint64_t first_expected_bytes = + sizeof(first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t second_expected_bytes = + sizeof(second_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + constexpr uint64_t third_expected_bytes = + sizeof(third_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) - + 1); + constexpr uint64_t fourth_expected_bytes = + sizeof(fourth_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) - + 1); + + ASSERT_EQ(meta->GetTotalBlobCount(), 4); + 
ASSERT_EQ(meta->GetTotalBlobBytes(), + first_expected_bytes + second_expected_bytes + + third_expected_bytes + fourth_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 2); + ASSERT_EQ(meta->GetGarbageBlobBytes(), + first_expected_bytes + second_expected_bytes); + } + + { + auto it = blob_files.rbegin(); + const auto& meta = it->second; + assert(meta); + + constexpr uint64_t new_first_expected_bytes = + sizeof(new_first_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) - + 1); + constexpr uint64_t new_second_expected_bytes = + sizeof(new_second_value) - 1 + + BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) - + 1); + + ASSERT_EQ(meta->GetTotalBlobCount(), 2); + ASSERT_EQ(meta->GetTotalBlobBytes(), + new_first_expected_bytes + new_second_expected_bytes); + ASSERT_EQ(meta->GetGarbageBlobCount(), 0); + ASSERT_EQ(meta->GetGarbageBlobBytes(), 0); + } +} + +TEST_F(DBBlobCompactionTest, MergeBlobWithBase) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + ASSERT_OK(Put("Key1", "v1_1")); + ASSERT_OK(Put("Key2", "v2_1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_2")); + ASSERT_OK(Merge("Key2", "v2_2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("Key1", "v1_3")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3"); + ASSERT_EQ(Get("Key2"), "v2_1,v2_2"); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + options.blob_compaction_readahead_size = 1 << 10; + 
options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "pie")); + ASSERT_OK(Put("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "pie"); + ASSERT_EQ(Get("foo"), "baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) { + Options options = GetDefaultOptions(); + + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("pie")); + + options.compaction_filter = compaction_filter_guard.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.blob_compaction_readahead_size = 1 << 10; + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "limepie"); + ASSERT_EQ(Get("foo"), "barpie"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + 
+TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.blob_compaction_readahead_size = 1 << 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.disable_auto_compactions = true; + + Reopen(options); + + ASSERT_OK(Put("key", "lime")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(Merge("key", "pie")); + ASSERT_OK(Merge("foo", "baz")); + ASSERT_OK(Flush()); + + size_t num_non_prefetch_reads = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:ReadFromFile", + [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(Get("key"), "lime,pie"); + ASSERT_EQ(Get("foo"), "bar,baz"); + ASSERT_EQ(num_non_prefetch_reads, 0); + + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_corruption_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,82 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCorruptionTest : public DBTestBase { + protected: + DBBlobCorruptionTest() + : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {} + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + uint64_t picked_number = kInvalidBlobFileNumber; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == filetype && + number > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(options); + + ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1"))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2"))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + Close(); + + Corrupt(kBlobFile, 0, 2); + + ASSERT_OK(TryReopen(options)); + + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + const Status* s = static_cast(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + 
SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/db_blob_index_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,572 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include + +#include "db/arena_wrapped_db_iter.h" +#include "db/column_family.h" +#include "db/db_iter.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { + +// kTypeBlobIndex is a value type used by BlobDB only. 
The base rocksdb +// should accept the value type on write, and report not supported value +// for reads, unless caller request for it explicitly. The base rocksdb +// doesn't understand format of actual blob index (the value). +class DBBlobIndexTest : public DBTestBase { + public: + enum Tier { + kMemtable = 0, + kImmutableMemtables = 1, + kL0SstFile = 2, + kLnSstFile = 3, + }; + const std::vector kAllTiers = {Tier::kMemtable, + Tier::kImmutableMemtables, + Tier::kL0SstFile, Tier::kLnSstFile}; + + DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {} + + ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } + + ColumnFamilyData* cfd() { + return static_cast_with_check(cfh())->cfd(); + } + + Status PutBlobIndex(WriteBatch* batch, const Slice& key, + const Slice& blob_index) { + return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key, + blob_index); + } + + Status Write(WriteBatch* batch) { + return dbfull()->Write(WriteOptions(), batch); + } + + std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr, + const Snapshot* snapshot = nullptr) { + ReadOptions read_options; + read_options.snapshot = snapshot; + PinnableSlice value; + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); + if (s.IsNotFound()) { + return "NOT_FOUND"; + } + if (s.IsCorruption()) { + return "CORRUPTION"; + } + if (s.IsNotSupported()) { + return "NOT_SUPPORTED"; + } + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } + + std::string GetBlobIndex(const Slice& key, + const Snapshot* snapshot = nullptr) { + bool is_blob_index = false; + std::string value = GetImpl(key, &is_blob_index, snapshot); + if (!is_blob_index) { + return "NOT_BLOB"; + } + return value; + } + + ArenaWrappedDBIter* GetBlobIterator() { + return 
dbfull()->NewIteratorImpl( + ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), + nullptr /*read_callback*/, true /*expose_blob_index*/); + } + + Options GetTestOptions() { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + options.num_levels = 2; + options.disable_auto_compactions = true; + // Disable auto flushes. + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + return options; + } + + void MoveDataTo(Tier tier) { + switch (tier) { + case Tier::kMemtable: + break; + case Tier::kImmutableMemtables: + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + break; + case Tier::kL0SstFile: + ASSERT_OK(Flush()); + break; + case Tier::kLnSstFile: + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "dummy")); + ASSERT_OK(Put("z", "dummy")); + ASSERT_OK(Flush()); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + break; + } + } +}; + +// Should be able to write kTypeBlobIndex to memtables and SST files. +TEST_F(DBBlobIndexTest, Write) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + WriteBatch batch; + ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index)); + ASSERT_OK(Write(&batch)); + } + MoveDataTo(tier); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + ASSERT_EQ("blob" + index, GetBlobIndex("key" + index)); + } + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should be able to return blob index if is_blob_index is +// provided, otherwise it should return Status::NotSupported (when reading from +// memtable) or Status::Corruption (when reading from SST). 
Reading from SST +// returns Corruption because we can't differentiate between the application +// accidentally opening the base DB of a stacked BlobDB and actual corruption +// when using the integrated BlobDB. +TEST_F(DBBlobIndexTest, Get) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + ASSERT_OK(batch.Put("key", "value")); + ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index")); + ASSERT_OK(Write(&batch)); + MoveDataTo(tier); + // Verify normal value + bool is_blob_index = false; + PinnableSlice value; + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("value", GetImpl("key")); + ASSERT_EQ("value", GetImpl("key", &is_blob_index)); + ASSERT_FALSE(is_blob_index); + // Verify blob index + if (tier <= kImmutableMemtables) { + ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + } else { + ASSERT_TRUE(Get("blob_key", &value).IsCorruption()); + ASSERT_EQ("CORRUPTION", GetImpl("blob_key")); + } + ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); + ASSERT_TRUE(is_blob_index); + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should NOT return Status::NotSupported/Status::Corruption +// if blob index is updated with a normal value. See the test case above for +// more details. +TEST_F(DBBlobIndexTest, Updated) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + for (int i = 0; i < 10; i++) { + ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index")); + } + ASSERT_OK(Write(&batch)); + // Avoid blob values from being purged. 
+ const Snapshot* snapshot = dbfull()->GetSnapshot(); + ASSERT_OK(Put("key1", "new_value")); + ASSERT_OK(Merge("key2", "a")); + ASSERT_OK(Merge("key2", "b")); + ASSERT_OK(Merge("key2", "c")); + ASSERT_OK(Delete("key3")); + ASSERT_OK(SingleDelete("key4")); + ASSERT_OK(Delete("key5")); + ASSERT_OK(Merge("key5", "a")); + ASSERT_OK(Merge("key5", "b")); + ASSERT_OK(Merge("key5", "c")); + ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9")); + MoveDataTo(tier); + for (int i = 0; i < 10; i++) { + ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); + } + ASSERT_EQ("new_value", Get("key1")); + if (tier <= kImmutableMemtables) { + ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + } else { + ASSERT_EQ("CORRUPTION", GetImpl("key2")); + } + ASSERT_EQ("NOT_FOUND", Get("key3")); + ASSERT_EQ("NOT_FOUND", Get("key4")); + ASSERT_EQ("a,b,c", GetImpl("key5")); + for (int i = 6; i < 9; i++) { + ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i))); + } + ASSERT_EQ("blob_index", GetBlobIndex("key9")); + dbfull()->ReleaseSnapshot(snapshot); + } +} + +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. When a blob iterator is used, it should set the +// expose_blob_index flag for the underlying DBIter, and retrieve/return the +// corresponding blob value. If a regular DBIter is created (i.e. +// expose_blob_index is not set), it should return Status::Corruption. 
+TEST_F(DBBlobIndexTest, Iterate) { + const std::vector> data = { + /*00*/ {kTypeValue}, + /*01*/ {kTypeBlobIndex}, + /*02*/ {kTypeValue}, + /*03*/ {kTypeBlobIndex, kTypeValue}, + /*04*/ {kTypeValue}, + /*05*/ {kTypeValue, kTypeBlobIndex}, + /*06*/ {kTypeValue}, + /*07*/ {kTypeDeletion, kTypeBlobIndex}, + /*08*/ {kTypeValue}, + /*09*/ {kTypeSingleDeletion, kTypeBlobIndex}, + /*10*/ {kTypeValue}, + /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex}, + /*12*/ {kTypeValue}, + /*13*/ + {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex}, + /*14*/ {kTypeValue}, + /*15*/ {kTypeBlobIndex}, + /*16*/ {kTypeValue}, + }; + + auto get_key = [](int index) { + char buf[20]; + snprintf(buf, sizeof(buf), "%02d", index); + return "key" + std::string(buf); + }; + + auto get_value = [&](int index, int version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status::Code expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status().code()); + if (expected_status == Status::kOk) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto create_normal_iterator = [&]() -> Iterator* { + return dbfull()->NewIterator(ReadOptions()); + }; + + auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); }; + + auto check_is_blob = [&](bool is_blob) { + return [is_blob](Iterator* iterator) { + ASSERT_EQ(is_blob, + reinterpret_cast(iterator)->IsBlob()); + }; + }; + + auto verify = [&](int index, Status::Code expected_status, + const Slice& forward_value, const Slice& backward_value, + std::function create_iterator, + std::function extra_check = nullptr) { + // Seek + auto* iterator = create_iterator(); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, forward_value); + if 
(extra_check) { + extra_check(iterator); + } + delete iterator; + + // Next + iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // SeekForPrev + iterator = create_iterator(); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // Prev + iterator = create_iterator(); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + }; + + for (auto tier : {Tier::kMemtable} /*kAllTiers*/) { + // Avoid values from being purged. 
+ std::vector snapshots; + DestroyAndReopen(GetTestOptions()); + + // fill data + for (int i = 0; i < static_cast(data.size()); i++) { + for (int j = static_cast(data[i].size()) - 1; j >= 0; j--) { + std::string key = get_key(i); + std::string value = get_value(i, j); + WriteBatch batch; + switch (data[i][j]) { + case kTypeValue: + ASSERT_OK(Put(key, value)); + break; + case kTypeDeletion: + ASSERT_OK(Delete(key)); + break; + case kTypeSingleDeletion: + ASSERT_OK(SingleDelete(key)); + break; + case kTypeMerge: + ASSERT_OK(Merge(key, value)); + break; + case kTypeBlobIndex: + ASSERT_OK(PutBlobIndex(&batch, key, value)); + ASSERT_OK(Write(&batch)); + break; + default: + FAIL(); + }; + } + snapshots.push_back(dbfull()->GetSnapshot()); + } + ASSERT_OK( + dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16))); + snapshots.push_back(dbfull()->GetSnapshot()); + MoveDataTo(tier); + + // Normal iterator + verify(1, Status::kCorruption, "", "", create_normal_iterator); + verify(3, Status::kCorruption, "", "", create_normal_iterator); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_normal_iterator); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_normal_iterator); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_normal_iterator); + verify(11, Status::kCorruption, "", "", create_normal_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_normal_iterator); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_normal_iterator); + + // Iterator with blob support + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, 
check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); + +#ifndef ROCKSDB_LITE + // Iterator with blob support and using seek. + ASSERT_OK(dbfull()->SetOptions( + cfh(), {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); +#endif // !ROCKSDB_LITE + + for (auto* snapshot : 
snapshots) { + dbfull()->ReleaseSnapshot(snapshot); + } + } +} + +TEST_F(DBBlobIndexTest, IntegratedBlobIterate) { + const std::vector> data = { + /*00*/ {"Put"}, + /*01*/ {"Put", "Merge", "Merge", "Merge"}, + /*02*/ {"Put"}}; + + auto get_key = [](size_t index) { return ("key" + std::to_string(index)); }; + + auto get_value = [&](size_t index, size_t version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status()); + if (expected_status.ok()) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto verify = [&](size_t index, Status expected_status, + const Slice& expected_value) { + // Seek + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Next + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, expected_value); + } + // SeekForPrev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Prev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + 
check_iterator(iterator, expected_status, expected_value); + } + }; + + Options options = GetTestOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + DestroyAndReopen(options); + + // fill data + for (size_t i = 0; i < data.size(); i++) { + for (size_t j = 0; j < data[i].size(); j++) { + std::string key = get_key(i); + std::string value = get_value(i, j); + if (data[i][j] == "Put") { + ASSERT_OK(Put(key, value)); + ASSERT_OK(Flush()); + } else if (data[i][j] == "Merge") { + ASSERT_OK(Merge(key, value)); + ASSERT_OK(Flush()); + } + } + } + + std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," + + get_value(1, 2) + "," + get_value(1, 3); + Status expected_status; + verify(1, expected_status, expected_value); + +#ifndef ROCKSDB_LITE + // Test DBIter::FindValueForCurrentKeyUsingSeek flow. + ASSERT_OK(dbfull()->SetOptions(cfh(), + {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, expected_status, expected_value); +#endif // !ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/prefetch_buffer_collection.h" + +namespace ROCKSDB_NAMESPACE { + +FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer( + uint64_t file_number) { + auto& prefetch_buffer = prefetch_buffers_[file_number]; + if (!prefetch_buffer) { + prefetch_buffer.reset( + new FilePrefetchBuffer(readahead_size_, readahead_size_)); + } + + return prefetch_buffer.get(); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob/prefetch_buffer_collection.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "file/file_prefetch_buffer.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// A class that owns a collection of FilePrefetchBuffers using the file number +// as key. Used for implementing compaction readahead for blob files. Designed +// to be accessed by a single thread only: every (sub)compaction needs its own +// buffers since they are guaranteed to read different blobs from different +// positions even when reading the same file. 
+class PrefetchBufferCollection { + public: + explicit PrefetchBufferCollection(uint64_t readahead_size) + : readahead_size_(readahead_size) { + assert(readahead_size_ > 0); + } + + FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number); + + private: + uint64_t readahead_size_; + std::unordered_map> + prefetch_buffers_; // maps file number to prefetch buffer +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob_index.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/blob_index.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/blob_index.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,179 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -#pragma once -#ifndef ROCKSDB_LITE - -#include -#include - -#include "rocksdb/options.h" -#include "util/coding.h" -#include "util/string_util.h" - -namespace ROCKSDB_NAMESPACE { - -// BlobIndex is a pointer to the blob and metadata of the blob. The index is -// stored in base DB as ValueType::kTypeBlobIndex. 
-// There are three types of blob index: -// -// kInlinedTTL: -// +------+------------+---------------+ -// | type | expiration | value | -// +------+------------+---------------+ -// | char | varint64 | variable size | -// +------+------------+---------------+ -// -// kBlob: -// +------+-------------+----------+----------+-------------+ -// | type | file number | offset | size | compression | -// +------+-------------+----------+----------+-------------+ -// | char | varint64 | varint64 | varint64 | char | -// +------+-------------+----------+----------+-------------+ -// -// kBlobTTL: -// +------+------------+-------------+----------+----------+-------------+ -// | type | expiration | file number | offset | size | compression | -// +------+------------+-------------+----------+----------+-------------+ -// | char | varint64 | varint64 | varint64 | varint64 | char | -// +------+------------+-------------+----------+----------+-------------+ -// -// There isn't a kInlined (without TTL) type since we can store it as a plain -// value (i.e. ValueType::kTypeValue). 
-class BlobIndex { - public: - enum class Type : unsigned char { - kInlinedTTL = 0, - kBlob = 1, - kBlobTTL = 2, - kUnknown = 3, - }; - - BlobIndex() : type_(Type::kUnknown) {} - - bool IsInlined() const { return type_ == Type::kInlinedTTL; } - - bool HasTTL() const { - return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; - } - - uint64_t expiration() const { - assert(HasTTL()); - return expiration_; - } - - const Slice& value() const { - assert(IsInlined()); - return value_; - } - - uint64_t file_number() const { - assert(!IsInlined()); - return file_number_; - } - - uint64_t offset() const { - assert(!IsInlined()); - return offset_; - } - - uint64_t size() const { - assert(!IsInlined()); - return size_; - } - - Status DecodeFrom(Slice slice) { - static const std::string kErrorMessage = "Error while decoding blob index"; - assert(slice.size() > 0); - type_ = static_cast(*slice.data()); - if (type_ >= Type::kUnknown) { - return Status::Corruption( - kErrorMessage, - "Unknown blob index type: " + ToString(static_cast(type_))); - } - slice = Slice(slice.data() + 1, slice.size() - 1); - if (HasTTL()) { - if (!GetVarint64(&slice, &expiration_)) { - return Status::Corruption(kErrorMessage, "Corrupted expiration"); - } - } - if (IsInlined()) { - value_ = slice; - } else { - if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && - GetVarint64(&slice, &size_) && slice.size() == 1) { - compression_ = static_cast(*slice.data()); - } else { - return Status::Corruption(kErrorMessage, "Corrupted blob offset"); - } - } - return Status::OK(); - } - - std::string DebugString(bool output_hex) const { - std::ostringstream oss; - - if (IsInlined()) { - oss << "[inlined blob] value:" << value_.ToString(output_hex); - } else { - oss << "[blob ref] file:" << file_number_ << " offset:" << offset_ - << " size:" << size_; - } - - if (HasTTL()) { - oss << " exp:" << expiration_; - } - - return oss.str(); - } - - static void EncodeInlinedTTL(std::string* dst, 
uint64_t expiration, - const Slice& value) { - assert(dst != nullptr); - dst->clear(); - dst->reserve(1 + kMaxVarint64Length + value.size()); - dst->push_back(static_cast(Type::kInlinedTTL)); - PutVarint64(dst, expiration); - dst->append(value.data(), value.size()); - } - - static void EncodeBlob(std::string* dst, uint64_t file_number, - uint64_t offset, uint64_t size, - CompressionType compression) { - assert(dst != nullptr); - dst->clear(); - dst->reserve(kMaxVarint64Length * 3 + 2); - dst->push_back(static_cast(Type::kBlob)); - PutVarint64(dst, file_number); - PutVarint64(dst, offset); - PutVarint64(dst, size); - dst->push_back(static_cast(compression)); - } - - static void EncodeBlobTTL(std::string* dst, uint64_t expiration, - uint64_t file_number, uint64_t offset, - uint64_t size, CompressionType compression) { - assert(dst != nullptr); - dst->clear(); - dst->reserve(kMaxVarint64Length * 4 + 2); - dst->push_back(static_cast(Type::kBlobTTL)); - PutVarint64(dst, expiration); - PutVarint64(dst, file_number); - PutVarint64(dst, offset); - PutVarint64(dst, size); - dst->push_back(static_cast(compression)); - } - - private: - Type type_ = Type::kUnknown; - uint64_t expiration_ = 0; - Slice value_; - uint64_t file_number_ = 0; - uint64_t offset_ = 0; - uint64_t size_ = 0; - CompressionType compression_ = kNoCompression; -}; - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,19 +13,22 @@ #include #include +#include "db/blob/blob_file_builder.h" #include "db/compaction/compaction_iterator.h" -#include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" #include "db/merge_helper.h" +#include "db/output_validator.h" #include 
"db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/file_util.h" #include "file/filename.h" #include "file/read_write_util.h" #include "file/writable_file_writer.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/options_helper.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -41,125 +44,172 @@ class TableFactory; -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - int level, const bool skip_filters, const uint64_t creation_time, - const uint64_t oldest_key_time, const uint64_t target_file_size, - const uint64_t file_creation_time) { - assert((column_family_id == +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); - return ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, - skip_filters, column_family_name, level, - creation_time, oldest_key_time, target_file_size, - file_creation_time), - column_family_id, file); + tboptions.column_family_name.empty()); + return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file); } Status BuildTable( - const std::string& dbname, Env* env, FileSystem* fs, - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - 
TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, - FileMetaData* meta, const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, + FileMetaData* meta, std::vector* blob_file_additions, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - bool paranoid_file_checks, InternalStats* internal_stats, - TableFileCreationReason reason, EventLogger* event_logger, int job_id, - const Env::IOPriority io_priority, TableProperties* table_properties, - int level, const uint64_t creation_time, const uint64_t oldest_key_time, - Env::WriteLifeTimeHint write_hint, const uint64_t file_creation_time) { - assert((column_family_id == + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, + const std::shared_ptr& io_tracer, + BlobFileCreationReason blob_creation_reason, EventLogger* event_logger, + int job_id, const Env::IOPriority io_priority, + TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, + const std::string* full_history_ts_low, + BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries, + uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); + tboptions.column_family_name.empty()); + auto& mutable_cf_options = tboptions.moptions; + auto& ioptions = tboptions.ioptions; // Reports the IOStats for flush for 
every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; + OutputValidator output_validator( + tboptions.internal_comparator, + /*enable_order_check=*/ + mutable_cf_options.check_flush_compaction_key_order, + /*enable_hash=*/paranoid_file_checks); Status s; meta->fd.file_size = 0; iter->SeekToFirst(); std::unique_ptr range_del_agg( - new CompactionRangeDelAggregator(&internal_comparator, snapshots)); + new CompactionRangeDelAggregator(&tboptions.internal_comparator, + snapshots)); + uint64_t num_unfragmented_tombstones = 0; + uint64_t total_tombstone_payload_bytes = 0; for (auto& range_del_iter : range_del_iters) { + num_unfragmented_tombstones += + range_del_iter->num_unfragmented_tombstones(); + total_tombstone_payload_bytes += + range_del_iter->total_tombstone_payload_bytes(); range_del_agg->AddTombstones(std::move(range_del_iter)); } std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); + std::vector blob_file_paths; + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; #ifndef ROCKSDB_LITE - EventHelpers::NotifyTableFileCreationStarted( - ioptions.listeners, dbname, column_family_name, fname, job_id, reason); + EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, + tboptions.column_family_name, + fname, job_id, tboptions.reason); #endif // !ROCKSDB_LITE - TableProperties tp; + Env* env = db_options.env; + assert(env); + FileSystem* fs = db_options.fs.get(); + assert(fs); + TableProperties tp; if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + tboptions.reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = tboptions.column_family_id; + context.reason = 
tboptions.reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s.PermitUncheckedError(); + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + TableBuilder* builder; std::unique_ptr file_writer; - // Currently we only enable dictionary compression during compaction to the - // bottommost level. - CompressionOptions compression_opts_for_flush(compression_opts); - compression_opts_for_flush.max_dict_bytes = 0; - compression_opts_for_flush.zstd_max_train_bytes = 0; { std::unique_ptr file; #ifndef NDEBUG bool use_direct_writes = file_options.use_direct_writes; TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); #endif // !NDEBUG - s = NewWritableFile(fs, fname, &file, file_options); + IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); + assert(s.ok()); + s = io_s; + if (io_status->ok()) { + *io_status = io_s; + } if (!s.ok()) { EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s); + event_logger, ioptions.listeners, dbname, + tboptions.column_family_name, fname, job_id, meta->fd, + kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum, + file_checksum_func_name); return s; } + FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; file->SetIOPriority(io_priority); file->SetWriteLifeTimeHint(write_hint); - file_writer.reset(new WritableFileWriter( - std::move(file), fname, file_options, env, ioptions.statistics, - ioptions.listeners, ioptions.sst_file_checksum_func)); + std::move(file), fname, file_options, ioptions.clock, io_tracer, + ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); + + builder = 
NewTableBuilder(tboptions, file_writer.get()); + } - builder = NewTableBuilder( - ioptions, mutable_cf_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, - column_family_name, file_writer.get(), compression, - sample_for_compression, compression_opts_for_flush, level, - false /* skip_filters */, creation_time, oldest_key_time, - 0 /*target_file_size*/, file_creation_time); - } - - MergeHelper merge(env, internal_comparator.user_comparator(), - ioptions.merge_operator, nullptr, ioptions.info_log, - true /* internal key corruption is not ok */, - snapshots.empty() ? 0 : snapshots.back(), - snapshot_checker); + MergeHelper merge( + env, tboptions.internal_comparator.user_comparator(), + ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger, + true /* internal key corruption is not ok */, + snapshots.empty() ? 0 : snapshots.back(), snapshot_checker); + + std::unique_ptr blob_file_builder( + (mutable_cf_options.enable_blob_files && blob_file_additions) + ? 
new BlobFileBuilder( + versions, fs, &ioptions, &mutable_cf_options, &file_options, + job_id, tboptions.column_family_id, + tboptions.column_family_name, io_priority, write_hint, + io_tracer, blob_callback, blob_creation_reason, + &blob_file_paths, blob_file_additions) + : nullptr); CompactionIterator c_iter( - iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber, - &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, - ShouldReportDetailedTime(env, ioptions.statistics), - true /* internal key corruption is not ok */, range_del_agg.get()); + iter, tboptions.internal_comparator.user_comparator(), &merge, + kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, + snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + blob_file_builder.get(), ioptions.allow_data_in_errors, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, + /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, db_options.info_log, + full_history_ts_low); + c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); const ParsedInternalKey& ikey = c_iter.ikey(); + // Generate a rolling 64-bit hash of the key and values + // Note : + // Here "key" integrates 'sequence_number'+'kType'+'user key'. 
+ s = output_validator.Add(key, value); + if (!s.ok()) { + break; + } builder->Add(key, value); meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); @@ -170,26 +220,39 @@ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); } } + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } - auto range_del_it = range_del_agg->NewIterator(); - for (range_del_it->SeekToFirst(); range_del_it->Valid(); - range_del_it->Next()) { - auto tombstone = range_del_it->Tombstone(); - auto kv = tombstone.Serialize(); - builder->Add(kv.first.Encode(), kv.second); - meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), - tombstone.seq_, internal_comparator); + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + auto kv = tombstone.Serialize(); + builder->Add(kv.first.Encode(), kv.second); + meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), + tombstone.seq_, + tboptions.internal_comparator); + } } - // Finish and check for builder errors - tp = builder->GetTableProperties(); - bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0; - s = c_iter.status(); + TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); + const bool empty = builder->IsEmpty(); + if (num_input_entries != nullptr) { + *num_input_entries = + c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + } if (!s.ok() || empty) { builder->Abandon(); } else { s = builder->Finish(); } + if (io_status->ok()) { + *io_status = builder->io_status(); + } if (s.ok() && !empty) { uint64_t file_size = builder->FileSize(); @@ -197,24 +260,64 @@ meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); tp = builder->GetTableProperties(); // refresh now that builder is finished + if (memtable_payload_bytes != nullptr && 
+ memtable_garbage_bytes != nullptr) { + const CompactionIterationStats& ci_stats = c_iter.iter_stats(); + uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes + + ci_stats.total_input_raw_value_bytes + + total_tombstone_payload_bytes; + uint64_t total_payload_bytes_written = + (tp.raw_key_size + tp.raw_value_size); + // Prevent underflow, which may still happen at this point + // since we only support inserts, deletes, and deleteRanges. + if (total_payload_bytes_written <= total_payload_bytes) { + *memtable_payload_bytes = total_payload_bytes; + *memtable_garbage_bytes = + total_payload_bytes - total_payload_bytes_written; + } else { + *memtable_payload_bytes = 0; + *memtable_garbage_bytes = 0; + } + } if (table_properties) { *table_properties = tp; } - // Add the checksum information to file metadata. - meta->file_checksum = builder->GetFileChecksum(); - meta->file_checksum_func_name = builder->GetFileChecksumFuncName(); } delete builder; // Finish and check for file errors + TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); if (s.ok() && !empty) { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); - s = file_writer->Sync(ioptions.use_fsync); + StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); + *io_status = file_writer->Sync(ioptions.use_fsync); } - if (s.ok() && !empty) { - s = file_writer->Close(); + TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); + if (s.ok() && io_status->ok() && !empty) { + *io_status = file_writer->Close(); + } + if (s.ok() && io_status->ok() && !empty) { + // Add the checksum information to file metadata. 
+ meta->file_checksum = file_writer->GetFileChecksum(); + meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName(); + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; + } + + if (s.ok()) { + s = *io_status; + } + + if (blob_file_builder) { + if (s.ok()) { + s = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(s); + } + blob_file_builder.reset(); } + // TODO Also check the IO status when create the Iterator. + if (s.ok() && !empty) { // Verify that the table is usable // We set for_compaction to false and don't OptimizeForCompactionTableRead @@ -222,20 +325,32 @@ // No matter whether use_direct_io_for_flush_and_compaction is true, // we will regrad this verification as user reads since the goal is // to cache it here for further user reads + ReadOptions read_options; std::unique_ptr it(table_cache->NewIterator( - ReadOptions(), file_options, internal_comparator, *meta, - nullptr /* range_del_agg */, - mutable_cf_options.prefix_extractor.get(), nullptr, + read_options, file_options, tboptions.internal_comparator, *meta, + nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, + nullptr, (internal_stats == nullptr) ? 
nullptr : internal_stats->GetFileReadHist(0), TableReaderCaller::kFlush, /*arena=*/nullptr, - /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key*/ nullptr)); + /*skip_filter=*/false, tboptions.level_at_creation, + MaxFileSizeForL0MetaPin(mutable_cf_options), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr, + /*allow_unprepared_value*/ false)); s = it->status(); if (s.ok() && paranoid_file_checks) { + OutputValidator file_validator(tboptions.internal_comparator, + /*enable_order_check=*/true, + /*enable_hash=*/true); for (it->SeekToFirst(); it->Valid(); it->Next()) { + // Generate a rolling 64-bit hash of the key and values + file_validator.Add(it->key(), it->value()).PermitUncheckedError(); } s = it->status(); + if (s.ok() && !output_validator.CompareValidator(file_validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } } } } @@ -246,16 +361,38 @@ } if (!s.ok() || meta->fd.GetFileSize() == 0) { - fs->DeleteFile(fname, IOOptions(), nullptr); + TEST_SYNC_POINT("BuildTable:BeforeDeleteFile"); + + constexpr IODebugContext* dbg = nullptr; + + Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); + ignored.PermitUncheckedError(); + + assert(blob_file_additions || blob_file_paths.empty()); + + if (blob_file_additions) { + for (const std::string& blob_file_path : blob_file_paths) { + ignored = DeleteDBFile(&db_options, blob_file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); + ignored.PermitUncheckedError(); + TEST_SYNC_POINT("BuildTable::AfterDeleteFile"); + } + } } + Status status_for_listener = s; if (meta->fd.GetFileSize() == 0) { fname = "(nil)"; + if (s.ok()) { + status_for_listener = Status::Aborted("Empty SST file not kept"); + } } // Output to event logger and fire events. 
EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s); + event_logger, ioptions.listeners, dbname, tboptions.column_family_name, + fname, job_id, meta->fd, meta->oldest_blob_file_number, tp, + tboptions.reason, status_for_listener, file_checksum, + file_checksum_func_name); return s; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,34 +24,20 @@ namespace ROCKSDB_NAMESPACE { -struct Options; struct FileMetaData; -class Env; -struct EnvOptions; -class Iterator; +class VersionSet; +class BlobFileAddition; class SnapshotChecker; class TableCache; -class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; +class BlobFileCompletionCallback; -// @param column_family_name Name of the column family that is also identified -// by column_family_id, or empty string if unknown. It must outlive the -// TableBuilder returned by this function. -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& options, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, int level, - const bool skip_filters = false, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0); +// Convenience function for NewTableBuilder on the embedded table_factory. 
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -62,27 +48,27 @@ // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. extern Status BuildTable( - const std::string& dbname, Env* env, FileSystem* fs, - const ImmutableCFOptions& options, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, - FileMetaData* meta, const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, + FileMetaData* meta, std::vector* blob_file_additions, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, bool paranoid_file_checks, - InternalStats* internal_stats, TableFileCreationReason reason, + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, + const std::shared_ptr& io_tracer, + BlobFileCreationReason blob_creation_reason, EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, - TableProperties* table_properties = nullptr, int level = -1, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + TableProperties* table_properties = nullptr, Env::WriteLifeTimeHint write_hint = 
Env::WLTH_NOT_SET, - const uint64_t file_creation_time = 0); + const std::string* full_history_ts_low = nullptr, + BlobFileCompletionCallback* blob_callback = nullptr, + uint64_t* num_input_entries = nullptr, + uint64_t* memtable_payload_bytes = nullptr, + uint64_t* memtable_garbage_bytes = nullptr); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/c.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/c.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/c.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/c.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,7 +11,11 @@ #include "rocksdb/c.h" -#include +#include +#include +#include +#include + #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -24,6 +28,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/perf_context.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" @@ -35,17 +40,13 @@ #include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/table_properties_collectors.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_batch.h" -#include "rocksdb/perf_context.h" #include "utilities/merge_operators.h" -#include -#include -#include - using ROCKSDB_NAMESPACE::BackupableDBOptions; using ROCKSDB_NAMESPACE::BackupEngine; using ROCKSDB_NAMESPACE::BackupID; @@ -60,7 +61,6 @@ using ROCKSDB_NAMESPACE::ColumnFamilyHandle; using ROCKSDB_NAMESPACE::ColumnFamilyOptions; using ROCKSDB_NAMESPACE::CompactionFilter; -using ROCKSDB_NAMESPACE::CompactionFilterContext; using ROCKSDB_NAMESPACE::CompactionFilterFactory; using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; using 
ROCKSDB_NAMESPACE::CompactRangeOptions; @@ -80,12 +80,15 @@ using ROCKSDB_NAMESPACE::Iterator; using ROCKSDB_NAMESPACE::LiveFileMetaData; using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::LRUCacheOptions; +using ROCKSDB_NAMESPACE::MemoryAllocator; using ROCKSDB_NAMESPACE::MemoryUtil; using ROCKSDB_NAMESPACE::MergeOperator; -using ROCKSDB_NAMESPACE::MergeOperators; using ROCKSDB_NAMESPACE::NewBloomFilterPolicy; +using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; using ROCKSDB_NAMESPACE::NewGenericRateLimiter; using ROCKSDB_NAMESPACE::NewLRUCache; +using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy; using ROCKSDB_NAMESPACE::OptimisticTransactionDB; using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; using ROCKSDB_NAMESPACE::Options; @@ -104,6 +107,7 @@ using ROCKSDB_NAMESPACE::Snapshot; using ROCKSDB_NAMESPACE::SstFileWriter; using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory; using ROCKSDB_NAMESPACE::Transaction; using ROCKSDB_NAMESPACE::TransactionDB; using ROCKSDB_NAMESPACE::TransactionDBOptions; @@ -115,10 +119,8 @@ using ROCKSDB_NAMESPACE::WriteBatchWithIndex; using ROCKSDB_NAMESPACE::WriteOptions; -using std::shared_ptr; using std::vector; using std::unordered_set; -using std::map; extern "C" { @@ -154,6 +156,12 @@ struct rocksdb_logger_t { std::shared_ptr rep; }; +struct rocksdb_lru_cache_options_t { + LRUCacheOptions rep; +}; +struct rocksdb_memory_allocator_t { + std::shared_ptr rep; +}; struct rocksdb_cache_t { std::shared_ptr rep; }; @@ -181,6 +189,9 @@ struct rocksdb_transaction_t { Transaction* rep; }; +struct rocksdb_backupable_db_options_t { + BackupableDBOptions rep; +}; struct rocksdb_checkpoint_t { Checkpoint* rep; }; @@ -504,13 +515,13 @@ return result; } -rocksdb_t* rocksdb_open_for_read_only( - const rocksdb_options_t* options, - const char* name, - unsigned char error_if_log_file_exist, - char** errptr) { +rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options, + const 
char* name, + unsigned char error_if_wal_file_exists, + char** errptr) { DB* db; - if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), &db, error_if_log_file_exist))) { + if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name), + &db, error_if_wal_file_exists))) { return nullptr; } rocksdb_t* result = new rocksdb_t; @@ -549,6 +560,18 @@ return result; } +rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts( + const rocksdb_backupable_db_options_t* options, rocksdb_env_t* env, + char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr) { @@ -595,6 +618,15 @@ restore_options->rep)); } +void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr) { + SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir), + std::string(wal_dir), + restore_options->rep)); +} + const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( rocksdb_backup_engine_t* be) { rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t; @@ -636,6 +668,128 @@ delete be; } +rocksdb_backupable_db_options_t* rocksdb_backupable_db_options_create( + const char* backup_dir) { + return new rocksdb_backupable_db_options_t{ + BackupableDBOptions(std::string(backup_dir))}; +} + +void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir) { + options->rep.backup_dir = std::string(backup_dir); +} + +void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env) { + 
options->rep.backup_env = (env ? env->rep : nullptr); +} + +void rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.share_table_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.share_table_files; +} + +void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.sync = val; +} + +unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options) { + return options->rep.sync; +} + +void rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.destroy_old_data = val; +} + +unsigned char rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options) { + return options->rep.destroy_old_data; +} + +void rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.backup_log_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_log_files; +} + +void rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.backup_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_rate_limit; +} + +void rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.restore_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.restore_rate_limit; +} + +void 
rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_background_operations = val; +} + +int rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_background_operations; +} + +void rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size) { + options->rep.callback_trigger_interval_size = size; +} + +uint64_t rocksdb_backupable_db_options_get_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options) { + return options->rep.callback_trigger_interval_size; +} + +void rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_valid_backups_to_open = val; +} + +int rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_valid_backups_to_open; +} + +void rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.share_files_with_checksum_naming = + static_cast(val); +} + +int rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options) { + return static_cast(options->rep.share_files_with_checksum_naming); +} + +void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t* options) { + delete options; +} + rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr) { Checkpoint* checkpoint; @@ -698,12 +852,47 @@ return result; } +rocksdb_t* rocksdb_open_column_families_with_ttl( + const rocksdb_options_t* db_options, const char* name, + int num_column_families, const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, 
const int* ttls, + char** errptr) { + std::vector ttls_vec; + std::vector column_families; + for (int i = 0; i < num_column_families; i++) { + ttls_vec.push_back(ttls[i]); + + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + ROCKSDB_NAMESPACE::DBWithTTL* db; + std::vector handles; + if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open( + DBOptions(db_options->rep), std::string(name), + column_families, &handles, &db, ttls_vec))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_t* rocksdb_open_for_read_only_column_families( const rocksdb_options_t* db_options, const char* name, int num_column_families, const char* const* column_family_names, const rocksdb_options_t* const* column_family_options, rocksdb_column_family_handle_t** column_family_handles, - unsigned char error_if_log_file_exist, char** errptr) { + unsigned char error_if_wal_file_exists, char** errptr) { std::vector column_families; for (int i = 0; i < num_column_families; i++) { column_families.push_back(ColumnFamilyDescriptor( @@ -713,8 +902,10 @@ DB* db; std::vector handles; - if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep), - std::string(name), column_families, &handles, &db, error_if_log_file_exist))) { + if (SaveError(errptr, + DB::OpenForReadOnly(DBOptions(db_options->rep), + std::string(name), column_families, + &handles, &db, error_if_wal_file_exists))) { return nullptr; } @@ -796,6 +987,18 @@ return handle; } +rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, int ttl, char** errptr) { + 
ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl = + static_cast(db->rep); + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep), ttl)); + return handle; +} + void rocksdb_drop_column_family( rocksdb_t* db, rocksdb_column_family_handle_t* handle, @@ -996,6 +1199,55 @@ } } +unsigned char rocksdb_key_may_exist(rocksdb_t* db, + const rocksdb_readoptions_t* options, + const char* key, size_t key_len, + char** value, size_t* val_len, + const char* timestamp, size_t timestamp_len, + unsigned char* value_found) { + std::string tmp; + std::string time; + if (timestamp) { + time.assign(timestamp, timestamp_len); + } + bool found = false; + const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len), + &tmp, timestamp ? &time : nullptr, + value_found ? &found : nullptr); + if (value_found) { + *value_found = found; + if (found) { + *val_len = tmp.size(); + *value = CopyString(tmp); + } + } + return result; +} + +unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found) { + std::string tmp; + std::string time; + if (timestamp) { + time.assign(timestamp, timestamp_len); + } + bool found = false; + const bool result = db->rep->KeyMayExist( + options->rep, column_family->rep, Slice(key, key_len), &tmp, + timestamp ? &time : nullptr, value_found ? 
&found : nullptr); + if (value_found) { + *value_found = found; + if (found) { + *val_len = tmp.size(); + *value = CopyString(tmp); + } + } + return result; +} + rocksdb_iterator_t* rocksdb_create_iterator( rocksdb_t* db, const rocksdb_readoptions_t* options) { @@ -1148,34 +1400,39 @@ } } -void rocksdb_approximate_sizes( - rocksdb_t* db, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { +void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges, + const char* const* range_start_key, + const size_t* range_start_key_len, + const char* const* range_limit_key, + const size_t* range_limit_key_len, + uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } void rocksdb_approximate_sizes_cf( - rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - 
db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(column_family->rep, ranges, + num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } @@ -1256,6 +1513,10 @@ SaveError(errptr, db->rep->Flush(options->rep, column_family->rep)); } +void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) { + SaveError(errptr, db->rep->FlushWAL(sync)); +} + void rocksdb_disable_file_deletions( rocksdb_t* db, char** errptr) { @@ -1466,6 +1727,11 @@ b->rep.Delete(Slice(key, klen)); } +void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key, + size_t klen) { + b->rep.SingleDelete(Slice(key, klen)); +} + void rocksdb_writebatch_delete_cf( rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, @@ -1473,6 +1739,12 @@ b->rep.Delete(column_family->rep, Slice(key, klen)); } +void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen) { + b->rep.SingleDelete(column_family->rep, Slice(key, klen)); +} + void rocksdb_writebatch_deletev( rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, @@ -1723,6 +1995,11 @@ b->rep->Delete(Slice(key, klen)); } +void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b, + const char* key, size_t klen) { + b->rep->SingleDelete(Slice(key, klen)); +} + void rocksdb_writebatch_wi_delete_cf( rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, @@ -1730,6 +2007,12 @@ b->rep->Delete(column_family->rep, Slice(key, klen)); } +void rocksdb_writebatch_wi_singledelete_cf( + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen) { + b->rep->SingleDelete(column_family->rep, Slice(key, klen)); +} + void rocksdb_writebatch_wi_deletev( rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, @@ -2154,6 +2437,10 @@ delete options; } 
+rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) { + return new rocksdb_options_t(*options); +} + void rocksdb_options_increase_parallelism( rocksdb_options_t* opt, int total_threads) { opt->rep.IncreaseParallelism(total_threads); @@ -2179,6 +2466,10 @@ opt->rep.allow_ingest_behind = v; } +unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) { + return opt->rep.allow_ingest_behind; +} + void rocksdb_options_set_compaction_filter( rocksdb_options_t* opt, rocksdb_compactionfilter_t* filter) { @@ -2196,6 +2487,10 @@ opt->rep.compaction_readahead_size = s; } +size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) { + return opt->rep.compaction_readahead_size; +} + void rocksdb_options_set_comparator( rocksdb_options_t* opt, rocksdb_comparator_t* cmp) { @@ -2208,27 +2503,43 @@ opt->rep.merge_operator = std::shared_ptr(merge_operator); } - void rocksdb_options_set_create_if_missing( rocksdb_options_t* opt, unsigned char v) { opt->rep.create_if_missing = v; } +unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) { + return opt->rep.create_if_missing; +} + void rocksdb_options_set_create_missing_column_families( rocksdb_options_t* opt, unsigned char v) { opt->rep.create_missing_column_families = v; } +unsigned char rocksdb_options_get_create_missing_column_families( + rocksdb_options_t* opt) { + return opt->rep.create_missing_column_families; +} + void rocksdb_options_set_error_if_exists( rocksdb_options_t* opt, unsigned char v) { opt->rep.error_if_exists = v; } +unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) { + return opt->rep.error_if_exists; +} + void rocksdb_options_set_paranoid_checks( rocksdb_options_t* opt, unsigned char v) { opt->rep.paranoid_checks = v; } +unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) { + return opt->rep.paranoid_checks; +} + void rocksdb_options_set_db_paths(rocksdb_options_t* opt, const 
rocksdb_dbpath_t** dbpath_values, size_t num_paths) { @@ -2254,57 +2565,107 @@ opt->rep.info_log_level = static_cast(v); } +int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) { + return static_cast(opt->rep.info_log_level); +} + void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.db_write_buffer_size = s; } +size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) { + return opt->rep.db_write_buffer_size; +} + void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.write_buffer_size = s; } +size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) { + return opt->rep.write_buffer_size; +} + void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { opt->rep.max_open_files = n; } +int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) { + return opt->rep.max_open_files; +} + void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, int n) { opt->rep.max_file_opening_threads = n; } +int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) { + return opt->rep.max_file_opening_threads; +} + void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) { opt->rep.max_total_wal_size = n; } +uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) { + return opt->rep.max_total_wal_size; +} + void rocksdb_options_set_target_file_size_base( rocksdb_options_t* opt, uint64_t n) { opt->rep.target_file_size_base = n; } +uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) { + return opt->rep.target_file_size_base; +} + void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t* opt, int n) { opt->rep.target_file_size_multiplier = n; } +int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) { + return opt->rep.target_file_size_multiplier; +} + void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t* opt, uint64_t n) { 
opt->rep.max_bytes_for_level_base = n; } +uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) { + return opt->rep.max_bytes_for_level_base; +} + void rocksdb_options_set_level_compaction_dynamic_level_bytes( rocksdb_options_t* opt, unsigned char v) { opt->rep.level_compaction_dynamic_level_bytes = v; } +unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes( + rocksdb_options_t* opt) { + return opt->rep.level_compaction_dynamic_level_bytes; +} + void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt, double n) { opt->rep.max_bytes_for_level_multiplier = n; } +double rocksdb_options_get_max_bytes_for_level_multiplier( + rocksdb_options_t* opt) { + return opt->rep.max_bytes_for_level_multiplier; +} + void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt, uint64_t n) { opt->rep.max_compaction_bytes = n; } +uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) { + return opt->rep.max_compaction_bytes; +} + void rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t* opt, int* level_values, size_t num_levels) { opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels); @@ -2322,30 +2683,129 @@ opt->rep.skip_stats_update_on_db_open = val; } +unsigned char rocksdb_options_get_skip_stats_update_on_db_open( + rocksdb_options_t* opt) { + return opt->rep.skip_stats_update_on_db_open; +} + void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( rocksdb_options_t* opt, unsigned char val) { opt->rep.skip_checking_sst_file_sizes_on_db_open = val; } +unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt) { + return opt->rep.skip_checking_sst_file_sizes_on_db_open; +} + +/* Blob Options Settings */ +void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_files = val; +} +extern ROCKSDB_LIBRARY_API unsigned char 
rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt) { + return opt->rep.enable_blob_files; +} + +void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.min_blob_size = val; +} + +uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) { + return opt->rep.min_blob_size; +} + +void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.blob_file_size = val; +} + +uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) { + return opt->rep.blob_file_size; +} + +void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt, + int val) { + opt->rep.blob_compression_type = static_cast(val); +} + +int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) { + return opt->rep.blob_compression_type; +} + +void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_garbage_collection = val; +} + +unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) { + return opt->rep.enable_blob_garbage_collection; +} + +void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_age_cutoff = val; +} + +double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_age_cutoff; +} + +void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_force_threshold = val; +} + +double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_force_threshold; +} + +void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val) { + opt->rep.blob_compaction_readahead_size = val; +} + +uint64_t rocksdb_options_get_blob_compaction_readahead_size( + rocksdb_options_t* opt) { + return opt->rep.blob_compaction_readahead_size; +} + void 
rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } +int rocksdb_options_get_num_levels(rocksdb_options_t* opt) { + return opt->rep.num_levels; +} + void rocksdb_options_set_level0_file_num_compaction_trigger( rocksdb_options_t* opt, int n) { opt->rep.level0_file_num_compaction_trigger = n; } +int rocksdb_options_get_level0_file_num_compaction_trigger( + rocksdb_options_t* opt) { + return opt->rep.level0_file_num_compaction_trigger; +} + void rocksdb_options_set_level0_slowdown_writes_trigger( rocksdb_options_t* opt, int n) { opt->rep.level0_slowdown_writes_trigger = n; } +int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) { + return opt->rep.level0_slowdown_writes_trigger; +} + void rocksdb_options_set_level0_stop_writes_trigger( rocksdb_options_t* opt, int n) { opt->rep.level0_stop_writes_trigger = n; } +int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) { + return opt->rep.level0_stop_writes_trigger; +} + void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* /*opt*/, int /*n*/) {} @@ -2353,12 +2813,28 @@ opt->rep.wal_recovery_mode = static_cast(mode); } +int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) { + return static_cast(opt->rep.wal_recovery_mode); +} + void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) { opt->rep.compression = static_cast(t); } +int rocksdb_options_get_compression(rocksdb_options_t* opt) { + return opt->rep.compression; +} + +void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) { + opt->rep.bottommost_compression = static_cast(t); +} + +int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) { + return opt->rep.bottommost_compression; +} + void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, - int* level_values, + const int* level_values, size_t num_levels) { opt->rep.compression_per_level.resize(num_levels); for (size_t i = 0; i < num_levels; 
++i) { @@ -2371,7 +2847,7 @@ int w_bits, int level, int strategy, int max_dict_bytes, - bool enabled) { + unsigned char enabled) { opt->rep.bottommost_compression_opts.window_bits = w_bits; opt->rep.bottommost_compression_opts.level = level; opt->rep.bottommost_compression_opts.strategy = strategy; @@ -2379,6 +2855,21 @@ opt->rep.bottommost_compression_opts.enabled = enabled; } +void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) { + opt->rep.bottommost_compression_opts.zstd_max_train_bytes = + zstd_max_train_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + +void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes, + unsigned char enabled) { + opt->rep.bottommost_compression_opts.max_dict_buffer_bytes = + max_dict_buffer_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, int level, int strategy, int max_dict_bytes) { @@ -2388,6 +2879,36 @@ opt->rep.compression_opts.max_dict_bytes = max_dict_bytes; } +void rocksdb_options_set_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt, int zstd_max_train_bytes) { + opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes; +} + +int rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.zstd_max_train_bytes; +} + +void rocksdb_options_set_compression_options_parallel_threads( + rocksdb_options_t* opt, int value) { + opt->rep.compression_opts.parallel_threads = value; +} + +int rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.parallel_threads; +} + +void rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t 
max_dict_buffer_bytes) { + opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes; +} + +uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.max_dict_buffer_bytes; +} + void rocksdb_options_set_prefix_extractor( rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { opt->rep.prefix_extractor.reset(prefix_extractor); @@ -2398,6 +2919,10 @@ opt->rep.use_fsync = use_fsync; } +int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) { + return opt->rep.use_fsync; +} + void rocksdb_options_set_db_log_dir( rocksdb_options_t* opt, const char* db_log_dir) { opt->rep.db_log_dir = db_log_dir; @@ -2412,16 +2937,28 @@ opt->rep.WAL_ttl_seconds = ttl; } +uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) { + return opt->rep.WAL_ttl_seconds; +} + void rocksdb_options_set_WAL_size_limit_MB( rocksdb_options_t* opt, uint64_t limit) { opt->rep.WAL_size_limit_MB = limit; } +uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) { + return opt->rep.WAL_size_limit_MB; +} + void rocksdb_options_set_manifest_preallocation_size( rocksdb_options_t* opt, size_t v) { opt->rep.manifest_preallocation_size = v; } +size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) { + return opt->rep.manifest_preallocation_size; +} + // noop void rocksdb_options_set_purge_redundant_kvs_while_flush( rocksdb_options_t* /*opt*/, unsigned char /*v*/) {} @@ -2431,41 +2968,91 @@ opt->rep.use_direct_reads = v; } +unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) { + return opt->rep.use_direct_reads; +} + void rocksdb_options_set_use_direct_io_for_flush_and_compaction( rocksdb_options_t* opt, unsigned char v) { opt->rep.use_direct_io_for_flush_and_compaction = v; } +unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction( + rocksdb_options_t* opt) { + return 
opt->rep.use_direct_io_for_flush_and_compaction; +} + void rocksdb_options_set_allow_mmap_reads( rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_mmap_reads = v; } +unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) { + return opt->rep.allow_mmap_reads; +} + void rocksdb_options_set_allow_mmap_writes( rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_mmap_writes = v; } +unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) { + return opt->rep.allow_mmap_writes; +} + void rocksdb_options_set_is_fd_close_on_exec( rocksdb_options_t* opt, unsigned char v) { opt->rep.is_fd_close_on_exec = v; } +unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) { + return opt->rep.is_fd_close_on_exec; +} + void rocksdb_options_set_skip_log_error_on_recovery( rocksdb_options_t* opt, unsigned char v) { opt->rep.skip_log_error_on_recovery = v; } +unsigned char rocksdb_options_get_skip_log_error_on_recovery( + rocksdb_options_t* opt) { + return opt->rep.skip_log_error_on_recovery; +} + void rocksdb_options_set_stats_dump_period_sec( rocksdb_options_t* opt, unsigned int v) { opt->rep.stats_dump_period_sec = v; } +unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) { + return opt->rep.stats_dump_period_sec; +} + +void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt, + unsigned int v) { + opt->rep.stats_persist_period_sec = v; +} + +unsigned int rocksdb_options_get_stats_persist_period_sec( + rocksdb_options_t* opt) { + return opt->rep.stats_persist_period_sec; +} + void rocksdb_options_set_advise_random_on_open( rocksdb_options_t* opt, unsigned char v) { opt->rep.advise_random_on_open = v; } +unsigned char rocksdb_options_get_advise_random_on_open( + rocksdb_options_t* opt) { + return opt->rep.advise_random_on_open; +} + +void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt, + double v) { + opt->rep.experimental_mempurge_threshold 
= v; +} + void rocksdb_options_set_access_hint_on_compaction_start( rocksdb_options_t* opt, int v) { switch(v) { @@ -2485,142 +3072,276 @@ opt->rep.access_hint_on_compaction_start = ROCKSDB_NAMESPACE::Options::WILLNEED; break; + default: + assert(0); } } +int rocksdb_options_get_access_hint_on_compaction_start( + rocksdb_options_t* opt) { + return opt->rep.access_hint_on_compaction_start; +} + void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t* opt, unsigned char v) { opt->rep.use_adaptive_mutex = v; } +unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) { + return opt->rep.use_adaptive_mutex; +} + void rocksdb_options_set_wal_bytes_per_sync( rocksdb_options_t* opt, uint64_t v) { opt->rep.wal_bytes_per_sync = v; } +uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) { + return opt->rep.wal_bytes_per_sync; +} + void rocksdb_options_set_bytes_per_sync( rocksdb_options_t* opt, uint64_t v) { opt->rep.bytes_per_sync = v; } +uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) { + return opt->rep.bytes_per_sync; +} + void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt, uint64_t v) { opt->rep.writable_file_max_buffer_size = static_cast(v); } +uint64_t rocksdb_options_get_writable_file_max_buffer_size( + rocksdb_options_t* opt) { + return opt->rep.writable_file_max_buffer_size; +} + void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_concurrent_memtable_write = v; } +unsigned char rocksdb_options_get_allow_concurrent_memtable_write( + rocksdb_options_t* opt) { + return opt->rep.allow_concurrent_memtable_write; +} + void rocksdb_options_set_enable_write_thread_adaptive_yield( rocksdb_options_t* opt, unsigned char v) { opt->rep.enable_write_thread_adaptive_yield = v; } +unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield( + rocksdb_options_t* opt) { + return 
opt->rep.enable_write_thread_adaptive_yield; +} + void rocksdb_options_set_max_sequential_skip_in_iterations( rocksdb_options_t* opt, uint64_t v) { opt->rep.max_sequential_skip_in_iterations = v; } +uint64_t rocksdb_options_get_max_sequential_skip_in_iterations( + rocksdb_options_t* opt) { + return opt->rep.max_sequential_skip_in_iterations; +} + void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) { opt->rep.max_write_buffer_number = n; } +int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_number; +} + void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) { opt->rep.min_write_buffer_number_to_merge = n; } +int rocksdb_options_get_min_write_buffer_number_to_merge( + rocksdb_options_t* opt) { + return opt->rep.min_write_buffer_number_to_merge; +} + void rocksdb_options_set_max_write_buffer_number_to_maintain( rocksdb_options_t* opt, int n) { opt->rep.max_write_buffer_number_to_maintain = n; } +int rocksdb_options_get_max_write_buffer_number_to_maintain( + rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_number_to_maintain; +} + void rocksdb_options_set_max_write_buffer_size_to_maintain( rocksdb_options_t* opt, int64_t n) { opt->rep.max_write_buffer_size_to_maintain = n; } +int64_t rocksdb_options_get_max_write_buffer_size_to_maintain( + rocksdb_options_t* opt) { + return opt->rep.max_write_buffer_size_to_maintain; +} + void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.enable_pipelined_write = v; } +unsigned char rocksdb_options_get_enable_pipelined_write( + rocksdb_options_t* opt) { + return opt->rep.enable_pipelined_write; +} + void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.unordered_write = v; } +unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) { + return opt->rep.unordered_write; +} + void 
rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, uint32_t n) { opt->rep.max_subcompactions = n; } +uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) { + return opt->rep.max_subcompactions; +} + void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) { opt->rep.max_background_jobs = n; } +int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) { + return opt->rep.max_background_jobs; +} + void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { opt->rep.max_background_compactions = n; } +int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) { + return opt->rep.max_background_compactions; +} + void rocksdb_options_set_base_background_compactions(rocksdb_options_t* opt, int n) { opt->rep.base_background_compactions = n; } +int rocksdb_options_get_base_background_compactions(rocksdb_options_t* opt) { + return opt->rep.base_background_compactions; +} + void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) { opt->rep.max_background_flushes = n; } +int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) { + return opt->rep.max_background_flushes; +} + void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) { opt->rep.max_log_file_size = v; } +size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) { + return opt->rep.max_log_file_size; +} + void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) { opt->rep.log_file_time_to_roll = v; } +size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) { + return opt->rep.log_file_time_to_roll; +} + void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) { opt->rep.keep_log_file_num = v; } +size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) { + return opt->rep.keep_log_file_num; +} + void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt, size_t v) { 
opt->rep.recycle_log_file_num = v; } +size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) { + return opt->rep.recycle_log_file_num; +} + void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) { opt->rep.soft_rate_limit = v; } +double rocksdb_options_get_soft_rate_limit(rocksdb_options_t* opt) { + return opt->rep.soft_rate_limit; +} + void rocksdb_options_set_hard_rate_limit(rocksdb_options_t* opt, double v) { opt->rep.hard_rate_limit = v; } +double rocksdb_options_get_hard_rate_limit(rocksdb_options_t* opt) { + return opt->rep.hard_rate_limit; +} + void rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { opt->rep.soft_pending_compaction_bytes_limit = v; } +size_t rocksdb_options_get_soft_pending_compaction_bytes_limit( + rocksdb_options_t* opt) { + return opt->rep.soft_pending_compaction_bytes_limit; +} + void rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { opt->rep.hard_pending_compaction_bytes_limit = v; } +size_t rocksdb_options_get_hard_pending_compaction_bytes_limit( + rocksdb_options_t* opt) { + return opt->rep.hard_pending_compaction_bytes_limit; +} + void rocksdb_options_set_rate_limit_delay_max_milliseconds( rocksdb_options_t* opt, unsigned int v) { opt->rep.rate_limit_delay_max_milliseconds = v; } +unsigned int rocksdb_options_get_rate_limit_delay_max_milliseconds( + rocksdb_options_t* opt) { + return opt->rep.rate_limit_delay_max_milliseconds; +} + void rocksdb_options_set_max_manifest_file_size( rocksdb_options_t* opt, size_t v) { opt->rep.max_manifest_file_size = v; } +size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) { + return opt->rep.max_manifest_file_size; +} + void rocksdb_options_set_table_cache_numshardbits( rocksdb_options_t* opt, int v) { opt->rep.table_cache_numshardbits = v; } +int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) { + return opt->rep.table_cache_numshardbits; 
+} + void rocksdb_options_set_table_cache_remove_scan_count_limit( rocksdb_options_t* /*opt*/, int /*v*/) { // this option is deprecated @@ -2631,19 +3352,38 @@ opt->rep.arena_block_size = v; } +size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) { + return opt->rep.arena_block_size; +} + void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) { opt->rep.disable_auto_compactions = disable; } +unsigned char rocksdb_options_get_disable_auto_compactions( + rocksdb_options_t* opt) { + return opt->rep.disable_auto_compactions; +} + void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, int v) { opt->rep.optimize_filters_for_hits = v; } +unsigned char rocksdb_options_get_optimize_filters_for_hits( + rocksdb_options_t* opt) { + return opt->rep.optimize_filters_for_hits; +} + void rocksdb_options_set_delete_obsolete_files_period_micros( rocksdb_options_t* opt, uint64_t v) { opt->rep.delete_obsolete_files_period_micros = v; } +uint64_t rocksdb_options_get_delete_obsolete_files_period_micros( + rocksdb_options_t* opt) { + return opt->rep.delete_obsolete_files_period_micros; +} + void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { opt->rep.PrepareForBulkLoad(); } @@ -2657,11 +3397,20 @@ opt->rep.memtable_prefix_bloom_size_ratio = v; } +double rocksdb_options_get_memtable_prefix_bloom_size_ratio( + rocksdb_options_t* opt) { + return opt->rep.memtable_prefix_bloom_size_ratio; +} + void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt, size_t v) { opt->rep.memtable_huge_page_size = v; } +size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) { + return opt->rep.memtable_huge_page_size; +} + void rocksdb_options_set_hash_skip_list_rep( rocksdb_options_t *opt, size_t bucket_count, int32_t skiplist_height, int32_t skiplist_branching_factor) { @@ -2696,31 +3445,56 @@ opt->rep.max_successive_merges = v; } +size_t 
rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) { + return opt->rep.max_successive_merges; +} + void rocksdb_options_set_bloom_locality( rocksdb_options_t* opt, uint32_t v) { opt->rep.bloom_locality = v; } +uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) { + return opt->rep.bloom_locality; +} + void rocksdb_options_set_inplace_update_support( rocksdb_options_t* opt, unsigned char v) { opt->rep.inplace_update_support = v; } +unsigned char rocksdb_options_get_inplace_update_support( + rocksdb_options_t* opt) { + return opt->rep.inplace_update_support; +} + void rocksdb_options_set_inplace_update_num_locks( rocksdb_options_t* opt, size_t v) { opt->rep.inplace_update_num_locks = v; } +size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) { + return opt->rep.inplace_update_num_locks; +} + void rocksdb_options_set_report_bg_io_stats( rocksdb_options_t* opt, int v) { opt->rep.report_bg_io_stats = v; } +unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) { + return opt->rep.report_bg_io_stats; +} + void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { opt->rep.compaction_style = static_cast(style); } +int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) { + return opt->rep.compaction_style; +} + void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) { opt->rep.compaction_options_universal = *(uco->rep); } @@ -2750,6 +3524,19 @@ opt->rep.atomic_flush = atomic_flush; } +unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) { + return opt->rep.atomic_flush; +} + +void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt, + unsigned char manual_wal_flush) { + opt->rep.manual_wal_flush = manual_wal_flush; +} + +unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) { + return opt->rep.manual_wal_flush; +} + rocksdb_ratelimiter_t* 
rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, @@ -2771,6 +3558,14 @@ } } +void rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) { + std::shared_ptr + compact_on_del = + NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + void rocksdb_set_perf_level(int v) { PerfLevel level = static_cast(v); SetPerfLevel(level); @@ -3064,7 +3859,8 @@ delete filter; } -rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(int bits_per_key, bool original_format) { +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format( + double bits_per_key, bool original_format) { // Make a rocksdb_filterpolicy_t, but override all of its methods so // they delegate to a NewBloomFilterPolicy() instead of user // supplied C functions. @@ -3099,14 +3895,63 @@ return wrapper; } -rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(int bits_per_key) { +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full( + double bits_per_key) { return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false); } -rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(int bits_per_key) { +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) { return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true); } +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + // Make a rocksdb_filterpolicy_t, but override all of its methods so + // they delegate to a NewRibbonFilterPolicy() instead of user + // supplied C functions. 
+ struct Wrapper : public rocksdb_filterpolicy_t { + const FilterPolicy* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + void CreateFilter(const Slice* keys, int n, + std::string* dst) const override { + return rep_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + return rep_->KeyMayMatch(key, filter); + } + ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext( + const ROCKSDB_NAMESPACE::FilterBuildingContext& context) + const override { + return rep_->GetBuilderWithContext(context); + } + ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader( + const Slice& contents) const override { + return rep_->GetFilterBitsReader(contents); + } + static void DoNothing(void*) {} + }; + Wrapper* wrapper = new Wrapper; + wrapper->rep_ = + NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level); + wrapper->state_ = nullptr; + wrapper->delete_filter_ = nullptr; + wrapper->destructor_ = &Wrapper::DoNothing; + return wrapper; +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon( + double bloom_equivalent_bits_per_key) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1); +} + +rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid( + double bloom_equivalent_bits_per_key, int bloom_before_level) { + return rocksdb_filterpolicy_create_ribbon_format( + bloom_equivalent_bits_per_key, bloom_before_level); +} + rocksdb_mergeoperator_t* rocksdb_mergeoperator_create( void* state, void (*destructor)(void*), char* (*full_merge)(void*, const char* key, size_t key_length, @@ -3149,11 +3994,20 @@ opt->rep.verify_checksums = v; } +unsigned char rocksdb_readoptions_get_verify_checksums( + rocksdb_readoptions_t* opt) { + return opt->rep.verify_checksums; +} + void rocksdb_readoptions_set_fill_cache( rocksdb_readoptions_t* opt, unsigned char v) { 
opt->rep.fill_cache = v; } +unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) { + return opt->rep.fill_cache; +} + void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t* opt, const rocksdb_snapshot_t* snap) { @@ -3190,11 +4044,19 @@ opt->rep.read_tier = static_cast(v); } +int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) { + return static_cast(opt->rep.read_tier); +} + void rocksdb_readoptions_set_tailing( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.tailing = v; } +unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) { + return opt->rep.tailing; +} + void rocksdb_readoptions_set_managed( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.managed = v; @@ -3205,37 +4067,89 @@ opt->rep.readahead_size = v; } +size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) { + return opt->rep.readahead_size; +} + void rocksdb_readoptions_set_prefix_same_as_start( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.prefix_same_as_start = v; } +unsigned char rocksdb_readoptions_get_prefix_same_as_start( + rocksdb_readoptions_t* opt) { + return opt->rep.prefix_same_as_start; +} + void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.pin_data = v; } +unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) { + return opt->rep.pin_data; +} + void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.total_order_seek = v; } +unsigned char rocksdb_readoptions_get_total_order_seek( + rocksdb_readoptions_t* opt) { + return opt->rep.total_order_seek; +} + void rocksdb_readoptions_set_max_skippable_internal_keys( rocksdb_readoptions_t* opt, uint64_t v) { opt->rep.max_skippable_internal_keys = v; } +uint64_t rocksdb_readoptions_get_max_skippable_internal_keys( + rocksdb_readoptions_t* opt) { + return opt->rep.max_skippable_internal_keys; +} + void 
rocksdb_readoptions_set_background_purge_on_iterator_cleanup( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.background_purge_on_iterator_cleanup = v; } +unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t* opt) { + return opt->rep.background_purge_on_iterator_cleanup; +} + void rocksdb_readoptions_set_ignore_range_deletions( rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.ignore_range_deletions = v; } +unsigned char rocksdb_readoptions_get_ignore_range_deletions( + rocksdb_readoptions_t* opt) { + return opt->rep.ignore_range_deletions; +} + +void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.deadline = std::chrono::microseconds(microseconds); +} + +uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) { + return opt->rep.deadline.count(); +} + +void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.io_timeout = std::chrono::microseconds(microseconds); +} + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) { + return opt->rep.io_timeout.count(); +} + rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } @@ -3249,33 +4163,61 @@ opt->rep.sync = v; } +unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) { + return opt->rep.sync; +} + void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) { opt->rep.disableWAL = disable; } +unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt) { + return opt->rep.disableWAL; +} + void rocksdb_writeoptions_set_ignore_missing_column_families( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.ignore_missing_column_families = v; } +unsigned char rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t* opt) { + return opt->rep.ignore_missing_column_families; +} + 
void rocksdb_writeoptions_set_no_slowdown( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.no_slowdown = v; } +unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t* opt) { + return opt->rep.no_slowdown; +} + void rocksdb_writeoptions_set_low_pri( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.low_pri = v; } +unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) { + return opt->rep.low_pri; +} + void rocksdb_writeoptions_set_memtable_insert_hint_per_batch( rocksdb_writeoptions_t* opt, unsigned char v) { opt->rep.memtable_insert_hint_per_batch = v; } +unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t* opt) { + return opt->rep.memtable_insert_hint_per_batch; +} + rocksdb_compactoptions_t* rocksdb_compactoptions_create() { return new rocksdb_compactoptions_t; } @@ -3289,21 +4231,40 @@ opt->rep.bottommost_level_compaction = static_cast(v); } +unsigned char rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t* opt) { + return static_cast(opt->rep.bottommost_level_compaction); +} + void rocksdb_compactoptions_set_exclusive_manual_compaction( rocksdb_compactoptions_t* opt, unsigned char v) { opt->rep.exclusive_manual_compaction = v; } +unsigned char rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t* opt) { + return opt->rep.exclusive_manual_compaction; +} + void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt, unsigned char v) { opt->rep.change_level = v; } +unsigned char rocksdb_compactoptions_get_change_level( + rocksdb_compactoptions_t* opt) { + return opt->rep.change_level; +} + void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt, int n) { opt->rep.target_level = n; } +int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) { + return opt->rep.target_level; +} + rocksdb_flushoptions_t* rocksdb_flushoptions_create() { return new 
rocksdb_flushoptions_t; } @@ -3317,20 +4278,70 @@ opt->rep.wait = v; } +unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) { + return opt->rep.wait; +} + +rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create( + char** errptr) { + rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t; + ROCKSDB_NAMESPACE::JemallocAllocatorOptions options; + SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator( + options, &allocator->rep)); + return allocator; +} + +void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) { + delete allocator; +} + +rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() { + return new rocksdb_lru_cache_options_t; +} + +void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) { + delete opt; +} + +void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt, + size_t capacity) { + opt->rep.capacity = capacity; +} + +void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) { + opt->rep.memory_allocator = allocator->rep; +} + rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(capacity); return c; } +rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t* opt) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(opt->rep); + return c; +} + void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } +void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { + cache->rep->DisownData(); +} + void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { cache->rep->SetCapacity(capacity); } +size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) { + return cache->rep->GetCapacity(); +} + size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) { return cache->rep->GetUsage(); } @@ -3368,10 +4379,36 @@ 
env->rep->SetBackgroundThreads(n); } +int rocksdb_env_get_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(); +} + +void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::BOTTOM); +} + +int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::BOTTOM); +} + void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::HIGH); } +int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::HIGH); +} + +void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env, + int n) { + env->rep->SetBackgroundThreads(n, Env::LOW); +} + +int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::LOW); +} + void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } @@ -3558,10 +4595,11 @@ delete st; } -struct Wrapper : public rocksdb_slicetransform_t { +struct SliceTransformWrapper : public rocksdb_slicetransform_t { const SliceTransform* rep_; - ~Wrapper() override { delete rep_; } + ~SliceTransformWrapper() override { delete rep_; } const char* Name() const override { return rep_->Name(); } + std::string GetId() const override { return rep_->GetId(); } Slice Transform(const Slice& src) const override { return rep_->Transform(src); } @@ -3573,18 +4611,18 @@ }; rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) { - Wrapper* wrapper = new Wrapper; + SliceTransformWrapper* wrapper = new SliceTransformWrapper; wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen); wrapper->state_ = nullptr; - wrapper->destructor_ = &Wrapper::DoNothing; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; return wrapper; } rocksdb_slicetransform_t* 
rocksdb_slicetransform_create_noop() { - Wrapper* wrapper = new Wrapper; + SliceTransformWrapper* wrapper = new SliceTransformWrapper; wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform(); wrapper->state_ = nullptr; - wrapper->destructor_ = &Wrapper::DoNothing; + wrapper->destructor_ = &SliceTransformWrapper::DoNothing; return wrapper; } @@ -3599,32 +4637,62 @@ uco->rep->size_ratio = ratio; } +int rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->size_ratio; +} + void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->min_merge_width = w; } +int rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->min_merge_width; +} + void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->max_merge_width = w; } +int rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_merge_width; +} + void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->max_size_amplification_percent = p; } +int rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_size_amplification_percent; +} + void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->compression_size_percent = p; } +int rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->compression_size_percent; +} + void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t* uco, int style) { uco->rep->stop_style = 
static_cast(style); } +int rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t* uco) { + return static_cast(uco->rep->stop_style); +} + void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t* uco) { delete uco->rep; @@ -3642,6 +4710,11 @@ fifo_opts->rep.max_table_files_size = size; } +uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.max_table_files_size; +} + void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts) { delete fifo_opts; @@ -3665,6 +4738,11 @@ return static_cast(lf->rep.size()); } +const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].column_family_name.c_str(); +} + const char* rocksdb_livefiles_name( const rocksdb_livefiles_t* lf, int index) { @@ -3831,6 +4909,27 @@ opt->rep.set_snapshot = v; } +char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. + return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +int rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val) { + if (db->rep->GetIntProperty(Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family( rocksdb_transactiondb_t* txn_db, const rocksdb_options_t* column_family_options, @@ -3901,6 +5000,27 @@ delete snapshot; } +char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db, + const char* propname) { + std::string tmp; + if (db->rep->GetProperty(Slice(propname), &tmp)) { + // We use strdup() since we expect human readable output. 
+ return strdup(tmp.c_str()); + } else { + return nullptr; + } +} + +int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db, + const char* propname, + uint64_t* out_val) { + if (db->rep->GetIntProperty(Slice(propname), out_val)) { + return 0; + } else { + return -1; + } +} + rocksdb_transaction_t* rocksdb_transaction_begin( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* write_options, @@ -3940,7 +5060,10 @@ const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot( rocksdb_transaction_t* txn) { - rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + // This will be freed later on using free, so use malloc here to avoid a + // mismatch + rocksdb_snapshot_t* result = + (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t)); result->rep = txn->rep->GetSnapshot(); return result; } @@ -4300,12 +5423,31 @@ return old_txn; } +// Write batch into OptimisticTransactionDB +void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep)); +} + void rocksdb_optimistictransactiondb_close( rocksdb_optimistictransactiondb_t* otxn_db) { delete otxn_db->rep; delete otxn_db; } +rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) { + Checkpoint* checkpoint; + if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) { + return nullptr; + } + rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t; + result->rep = checkpoint; + return result; +} + void rocksdb_free(void* ptr) { free(ptr); } rocksdb_pinnableslice_t* rocksdb_get_pinned( @@ -4441,11 +5583,25 @@ return memory_usage->cache_total; } +void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.dump_malloc_stats = val; +} + +void 
rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.memtable_whole_key_filtering = val; +} + // deletes container with memory usage estimates void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) { delete usage; } +void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) { + CancelAllBackgroundWork(db->rep, wait); +} + } // end extern "C" #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/c_test.c mariadb-10.11.13/storage/rocksdb/rocksdb/db/c_test.c --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/c_test.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/c_test.c 2025-05-19 16:14:27.000000000 +0000 @@ -7,12 +7,13 @@ #ifndef ROCKSDB_LITE // Lite does not support C API -#include "rocksdb/c.h" - +#include #include #include #include #include + +#include "rocksdb/c.h" #ifndef OS_WIN #include #endif @@ -58,7 +59,11 @@ static const char* GetTempDir(void) { const char* ret = getenv("TEST_TMPDIR"); if (ret == NULL || ret[0] == '\0') - ret = "/tmp"; +#ifdef OS_WIN + ret = getenv("TEMP"); +#else + ret = "/tmp"; +#endif return ret; } #ifdef _MSC_VER @@ -85,10 +90,8 @@ // ok return; } else { - fprintf(stderr, "%s: expected '%s', got '%s'\n", - phase, - (expected ? expected : "(null)"), - (v ? v : "(null")); + fprintf(stderr, "%s: expected '%s', got '%s'\n", phase, + (expected ? expected : "(null)"), (v ? 
v : "(null)")); abort(); } } @@ -513,6 +516,9 @@ coptions = rocksdb_compactoptions_create(); rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1); + rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000, + 10001); + StartPhase("destroy"); rocksdb_destroy_db(options, dbname, &err); Free(&err); @@ -984,7 +990,9 @@ &err); CheckNoError(err); } - rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes, + &err); + CheckNoError(err); CheckCondition(sizes[0] > 0); CheckCondition(sizes[1] > 0); } @@ -1010,7 +1018,36 @@ CheckGet(db, roptions, "foo", NULL); rocksdb_release_snapshot(db, snap); } - + StartPhase("snapshot_with_memtable_inplace_update"); + { + rocksdb_close(db); + const rocksdb_snapshot_t* snap = NULL; + const char* s_key = "foo_snap"; + const char* value1 = "hello_s1"; + const char* value2 = "hello_s2"; + rocksdb_options_set_allow_concurrent_memtable_write(options, 0); + rocksdb_options_set_inplace_update_support(options, 1); + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, s_key, 8, value1, 8, &err); + snap = rocksdb_create_snapshot(db); + assert(snap != NULL); + rocksdb_put(db, woptions, s_key, 8, value2, 8, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", NULL); + // snapshot syntax is invalid, because of inplace update supported is set + CheckGet(db, roptions, s_key, value2); + // restore the data and options + rocksdb_delete(db, woptions, s_key, 8, &err); + CheckGet(db, roptions, s_key, NULL); + rocksdb_release_snapshot(db, snap); + rocksdb_readoptions_set_snapshot(roptions, NULL); + rocksdb_options_set_inplace_update_support(options, 0); + rocksdb_options_set_allow_concurrent_memtable_write(options, 1); + rocksdb_options_set_error_if_exists(options, 1); + } 
StartPhase("repair"); { // If we do not compact here, then the lazy deletion of @@ -1034,19 +1071,25 @@ } StartPhase("filter"); - for (run = 0; run <= 2; run++) { - // First run uses custom filter - // Second run uses old block-based bloom filter - // Third run uses full bloom filter + for (run = 0; run <= 4; run++) { + // run=0 uses custom filter + // run=1 uses old block-based bloom filter + // run=2 run uses full bloom filter + // run=3 uses Ribbon + // run=4 uses Ribbon-Bloom hybrid configuration CheckNoError(err); rocksdb_filterpolicy_t* policy; if (run == 0) { policy = rocksdb_filterpolicy_create(NULL, FilterDestroy, FilterCreate, FilterKeyMatch, NULL, FilterName); } else if (run == 1) { - policy = rocksdb_filterpolicy_create_bloom(8); + policy = rocksdb_filterpolicy_create_bloom(8.0); + } else if (run == 2) { + policy = rocksdb_filterpolicy_create_bloom_full(8.0); + } else if (run == 3) { + policy = rocksdb_filterpolicy_create_ribbon(8.0); } else { - policy = rocksdb_filterpolicy_create_bloom_full(8); + policy = rocksdb_filterpolicy_create_ribbon_hybrid(8.0, 1); } rocksdb_block_based_options_set_filter_policy(table_options, policy); @@ -1112,10 +1155,12 @@ } else if (run == 1) { // Essentially a fingerprint of the block-based Bloom schema CheckCondition(hits == 241); + } else if (run == 2 || run == 4) { + // Essentially a fingerprint of full Bloom schema, format_version=5 + CheckCondition(hits == 188); } else { - // Essentially a fingerprint of the full Bloom schema(s), - // format_version < 5, which vary for three different CACHE_LINE_SIZEs - CheckCondition(hits == 224 || hits == 180 || hits == 125); + // Essentially a fingerprint of Ribbon schema + CheckCondition(hits == 226); } CheckCondition( (keys_to_query - hits) == @@ -1271,6 +1316,9 @@ CheckPinGetCF(db, roptions, handles[1], "box", "c"); rocksdb_writebatch_destroy(wb); + rocksdb_flush_wal(db, 1, &err); + CheckNoError(err); + const char* keys[3] = { "box", "box", "barfooxx" }; const 
rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] }; const size_t keys_sizes[3] = { 3, 3, 8 }; @@ -1296,6 +1344,29 @@ Free(&vals[i]); } + { + unsigned char value_found = 0; + + CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11, + NULL, NULL, NULL, 0, NULL)); + CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11, + &vals[0], &vals_sizes[0], NULL, 0, + &value_found)); + if (value_found) { + Free(&vals[0]); + } + + CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1], + "invalid_key", 11, NULL, NULL, + NULL, 0, NULL)); + CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1], + "invalid_key", 11, &vals[0], + &vals_sizes[0], NULL, 0, NULL)); + if (value_found) { + Free(&vals[0]); + } + } + rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]); CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_seek_to_first(iter); @@ -1461,6 +1532,1079 @@ rocksdb_cuckoo_options_destroy(cuckoo_options); } + StartPhase("options"); + { + rocksdb_options_t* o; + o = rocksdb_options_create(); + + // Set and check options. 
+ rocksdb_options_set_allow_ingest_behind(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o)); + + rocksdb_options_compaction_readahead_size(o, 10); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o)); + + rocksdb_options_set_create_if_missing(o, 1); + CheckCondition(1 == rocksdb_options_get_create_if_missing(o)); + + rocksdb_options_set_create_missing_column_families(o, 1); + CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o)); + + rocksdb_options_set_error_if_exists(o, 1); + CheckCondition(1 == rocksdb_options_get_error_if_exists(o)); + + rocksdb_options_set_paranoid_checks(o, 1); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(o)); + + rocksdb_options_set_info_log_level(o, 3); + CheckCondition(3 == rocksdb_options_get_info_log_level(o)); + + rocksdb_options_set_write_buffer_size(o, 100); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(o)); + + rocksdb_options_set_db_write_buffer_size(o, 1000); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o)); + + rocksdb_options_set_max_open_files(o, 21); + CheckCondition(21 == rocksdb_options_get_max_open_files(o)); + + rocksdb_options_set_max_file_opening_threads(o, 5); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o)); + + rocksdb_options_set_max_total_wal_size(o, 400); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o)); + + rocksdb_options_set_num_levels(o, 7); + CheckCondition(7 == rocksdb_options_get_num_levels(o)); + + rocksdb_options_set_level0_file_num_compaction_trigger(o, 4); + CheckCondition(4 == + rocksdb_options_get_level0_file_num_compaction_trigger(o)); + + rocksdb_options_set_level0_slowdown_writes_trigger(o, 6); + CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o)); + + rocksdb_options_set_level0_stop_writes_trigger(o, 8); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o)); + + rocksdb_options_set_target_file_size_base(o, 
256); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(o)); + + rocksdb_options_set_target_file_size_multiplier(o, 3); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o)); + + rocksdb_options_set_max_bytes_for_level_base(o, 1024); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o)); + + rocksdb_options_set_level_compaction_dynamic_level_bytes(o, 1); + CheckCondition(1 == + rocksdb_options_get_level_compaction_dynamic_level_bytes(o)); + + rocksdb_options_set_max_bytes_for_level_multiplier(o, 2.0); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(o)); + + rocksdb_options_set_skip_stats_update_on_db_open(o, 1); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); + + rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); + + rocksdb_options_set_max_write_buffer_number(o, 97); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); + + rocksdb_options_set_min_write_buffer_number_to_merge(o, 23); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(o)); + + rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64); + CheckCondition(64 == + rocksdb_options_get_max_write_buffer_number_to_maintain(o)); + + rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(o)); + + rocksdb_options_set_enable_pipelined_write(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o)); + + rocksdb_options_set_unordered_write(o, 1); + CheckCondition(1 == rocksdb_options_get_unordered_write(o)); + + rocksdb_options_set_max_subcompactions(o, 123456); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o)); + + rocksdb_options_set_max_background_jobs(o, 2); + CheckCondition(2 == 
rocksdb_options_get_max_background_jobs(o)); + + rocksdb_options_set_max_background_compactions(o, 3); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(o)); + + rocksdb_options_set_base_background_compactions(o, 4); + CheckCondition(4 == rocksdb_options_get_base_background_compactions(o)); + + rocksdb_options_set_max_background_flushes(o, 5); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(o)); + + rocksdb_options_set_max_log_file_size(o, 6); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(o)); + + rocksdb_options_set_log_file_time_to_roll(o, 7); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o)); + + rocksdb_options_set_keep_log_file_num(o, 8); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o)); + + rocksdb_options_set_recycle_log_file_num(o, 9); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o)); + + rocksdb_options_set_soft_rate_limit(o, 2.0); + CheckCondition(2.0 == rocksdb_options_get_soft_rate_limit(o)); + + rocksdb_options_set_hard_rate_limit(o, 4.0); + CheckCondition(4.0 == rocksdb_options_get_hard_rate_limit(o)); + + rocksdb_options_set_soft_pending_compaction_bytes_limit(o, 10); + CheckCondition(10 == + rocksdb_options_get_soft_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_hard_pending_compaction_bytes_limit(o, 11); + CheckCondition(11 == + rocksdb_options_get_hard_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_rate_limit_delay_max_milliseconds(o, 1); + CheckCondition(1 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(o)); + + rocksdb_options_set_max_manifest_file_size(o, 12); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o)); + + rocksdb_options_set_table_cache_numshardbits(o, 13); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o)); + + rocksdb_options_set_arena_block_size(o, 14); + CheckCondition(14 == rocksdb_options_get_arena_block_size(o)); + + rocksdb_options_set_use_fsync(o, 1); 
+ CheckCondition(1 == rocksdb_options_get_use_fsync(o)); + + rocksdb_options_set_WAL_ttl_seconds(o, 15); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o)); + + rocksdb_options_set_WAL_size_limit_MB(o, 16); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o)); + + rocksdb_options_set_manifest_preallocation_size(o, 17); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o)); + + rocksdb_options_set_allow_mmap_reads(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o)); + + rocksdb_options_set_allow_mmap_writes(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o)); + + rocksdb_options_set_use_direct_reads(o, 1); + CheckCondition(1 == rocksdb_options_get_use_direct_reads(o)); + + rocksdb_options_set_use_direct_io_for_flush_and_compaction(o, 1); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o)); + + rocksdb_options_set_is_fd_close_on_exec(o, 1); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o)); + + rocksdb_options_set_skip_log_error_on_recovery(o, 1); + CheckCondition(1 == rocksdb_options_get_skip_log_error_on_recovery(o)); + + rocksdb_options_set_stats_dump_period_sec(o, 18); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o)); + + rocksdb_options_set_stats_persist_period_sec(o, 5); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o)); + + rocksdb_options_set_advise_random_on_open(o, 1); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); + + rocksdb_options_set_access_hint_on_compaction_start(o, 3); + CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); + + rocksdb_options_set_use_adaptive_mutex(o, 1); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); + + rocksdb_options_set_bytes_per_sync(o, 19); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o)); + + rocksdb_options_set_wal_bytes_per_sync(o, 20); + CheckCondition(20 == 
rocksdb_options_get_wal_bytes_per_sync(o)); + + rocksdb_options_set_writable_file_max_buffer_size(o, 21); + CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o)); + + rocksdb_options_set_allow_concurrent_memtable_write(o, 1); + CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o)); + + rocksdb_options_set_enable_write_thread_adaptive_yield(o, 1); + CheckCondition(1 == + rocksdb_options_get_enable_write_thread_adaptive_yield(o)); + + rocksdb_options_set_max_sequential_skip_in_iterations(o, 22); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(o)); + + rocksdb_options_set_disable_auto_compactions(o, 1); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o)); + + rocksdb_options_set_optimize_filters_for_hits(o, 1); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o)); + + rocksdb_options_set_delete_obsolete_files_period_micros(o, 23); + CheckCondition(23 == + rocksdb_options_get_delete_obsolete_files_period_micros(o)); + + rocksdb_options_set_memtable_prefix_bloom_size_ratio(o, 2.0); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(o)); + + rocksdb_options_set_max_compaction_bytes(o, 24); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o)); + + rocksdb_options_set_memtable_huge_page_size(o, 25); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o)); + + rocksdb_options_set_max_successive_merges(o, 26); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(o)); + + rocksdb_options_set_bloom_locality(o, 27); + CheckCondition(27 == rocksdb_options_get_bloom_locality(o)); + + rocksdb_options_set_inplace_update_support(o, 1); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(o)); + + rocksdb_options_set_inplace_update_num_locks(o, 28); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o)); + + rocksdb_options_set_report_bg_io_stats(o, 1); + CheckCondition(1 == 
rocksdb_options_get_report_bg_io_stats(o)); + + rocksdb_options_set_wal_recovery_mode(o, 2); + CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o)); + + rocksdb_options_set_compression(o, 5); + CheckCondition(5 == rocksdb_options_get_compression(o)); + + rocksdb_options_set_bottommost_compression(o, 4); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(o)); + + rocksdb_options_set_compaction_style(o, 2); + CheckCondition(2 == rocksdb_options_get_compaction_style(o)); + + rocksdb_options_set_atomic_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + + rocksdb_options_set_manual_wal_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o)); + + /* Blob Options */ + rocksdb_options_set_enable_blob_files(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_files(o)); + + rocksdb_options_set_min_blob_size(o, 29); + CheckCondition(29 == rocksdb_options_get_min_blob_size(o)); + + rocksdb_options_set_blob_file_size(o, 30); + CheckCondition(30 == rocksdb_options_get_blob_file_size(o)); + + rocksdb_options_set_blob_compression_type(o, 4); + CheckCondition(4 == rocksdb_options_get_blob_compression_type(o)); + + rocksdb_options_set_enable_blob_gc(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o)); + + rocksdb_options_set_blob_gc_age_cutoff(o, 0.5); + CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o)); + + rocksdb_options_set_blob_gc_force_threshold(o, 0.75); + CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o)); + + rocksdb_options_set_blob_compaction_readahead_size(o, 262144); + CheckCondition(262144 == + rocksdb_options_get_blob_compaction_readahead_size(o)); + + // Create a copy that should be equal to the original. 
+ rocksdb_options_t* copy; + copy = rocksdb_options_create_copy(o); + + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(copy)); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(copy)); + CheckCondition(1 == rocksdb_options_get_create_if_missing(copy)); + CheckCondition(1 == + rocksdb_options_get_create_missing_column_families(copy)); + CheckCondition(1 == rocksdb_options_get_error_if_exists(copy)); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(copy)); + CheckCondition(3 == rocksdb_options_get_info_log_level(copy)); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(copy)); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(copy)); + CheckCondition(21 == rocksdb_options_get_max_open_files(copy)); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(copy)); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(copy)); + CheckCondition(7 == rocksdb_options_get_num_levels(copy)); + CheckCondition( + 4 == rocksdb_options_get_level0_file_num_compaction_trigger(copy)); + CheckCondition(6 == + rocksdb_options_get_level0_slowdown_writes_trigger(copy)); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(copy)); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(copy)); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(copy)); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(copy)); + CheckCondition( + 1 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy)); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(copy)); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy)); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(copy)); + CheckCondition( + 64 == 
rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy)); + CheckCondition(1 == rocksdb_options_get_unordered_write(copy)); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(copy)); + CheckCondition(2 == rocksdb_options_get_max_background_jobs(copy)); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(copy)); + CheckCondition(4 == rocksdb_options_get_base_background_compactions(copy)); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(copy)); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(copy)); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(copy)); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(copy)); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(copy)); + CheckCondition(2.0 == rocksdb_options_get_soft_rate_limit(copy)); + CheckCondition(4.0 == rocksdb_options_get_hard_rate_limit(copy)); + CheckCondition( + 10 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy)); + CheckCondition( + 11 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy)); + CheckCondition(1 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(copy)); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(copy)); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(copy)); + CheckCondition(14 == rocksdb_options_get_arena_block_size(copy)); + CheckCondition(1 == rocksdb_options_get_use_fsync(copy)); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(copy)); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(copy)); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(copy)); + CheckCondition(1 == 
rocksdb_options_get_use_direct_reads(copy)); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy)); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(copy)); + CheckCondition(1 == rocksdb_options_get_skip_log_error_on_recovery(copy)); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(copy)); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(copy)); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(copy)); + CheckCondition(3 == + rocksdb_options_get_access_hint_on_compaction_start(copy)); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(copy)); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(copy)); + CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(copy)); + CheckCondition(21 == + rocksdb_options_get_writable_file_max_buffer_size(copy)); + CheckCondition(1 == + rocksdb_options_get_allow_concurrent_memtable_write(copy)); + CheckCondition( + 1 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy)); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(copy)); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(copy)); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(copy)); + CheckCondition( + 23 == rocksdb_options_get_delete_obsolete_files_period_micros(copy)); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy)); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(copy)); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(copy)); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(copy)); + CheckCondition(27 == rocksdb_options_get_bloom_locality(copy)); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(copy)); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(copy)); + CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(copy)); + CheckCondition(2 == 
rocksdb_options_get_wal_recovery_mode(copy)); + CheckCondition(5 == rocksdb_options_get_compression(copy)); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(copy)); + CheckCondition(2 == rocksdb_options_get_compaction_style(copy)); + CheckCondition(1 == rocksdb_options_get_atomic_flush(copy)); + + // Copies should be independent. + rocksdb_options_set_allow_ingest_behind(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_ingest_behind(copy)); + CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o)); + + rocksdb_options_compaction_readahead_size(copy, 20); + CheckCondition(20 == rocksdb_options_get_compaction_readahead_size(copy)); + CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o)); + + rocksdb_options_set_create_if_missing(copy, 0); + CheckCondition(0 == rocksdb_options_get_create_if_missing(copy)); + CheckCondition(1 == rocksdb_options_get_create_if_missing(o)); + + rocksdb_options_set_create_missing_column_families(copy, 0); + CheckCondition(0 == + rocksdb_options_get_create_missing_column_families(copy)); + CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o)); + + rocksdb_options_set_error_if_exists(copy, 0); + CheckCondition(0 == rocksdb_options_get_error_if_exists(copy)); + CheckCondition(1 == rocksdb_options_get_error_if_exists(o)); + + rocksdb_options_set_paranoid_checks(copy, 0); + CheckCondition(0 == rocksdb_options_get_paranoid_checks(copy)); + CheckCondition(1 == rocksdb_options_get_paranoid_checks(o)); + + rocksdb_options_set_info_log_level(copy, 2); + CheckCondition(2 == rocksdb_options_get_info_log_level(copy)); + CheckCondition(3 == rocksdb_options_get_info_log_level(o)); + + rocksdb_options_set_write_buffer_size(copy, 200); + CheckCondition(200 == rocksdb_options_get_write_buffer_size(copy)); + CheckCondition(100 == rocksdb_options_get_write_buffer_size(o)); + + rocksdb_options_set_db_write_buffer_size(copy, 2000); + CheckCondition(2000 == 
rocksdb_options_get_db_write_buffer_size(copy)); + CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o)); + + rocksdb_options_set_max_open_files(copy, 42); + CheckCondition(42 == rocksdb_options_get_max_open_files(copy)); + CheckCondition(21 == rocksdb_options_get_max_open_files(o)); + + rocksdb_options_set_max_file_opening_threads(copy, 3); + CheckCondition(3 == rocksdb_options_get_max_file_opening_threads(copy)); + CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o)); + + rocksdb_options_set_max_total_wal_size(copy, 4000); + CheckCondition(4000 == rocksdb_options_get_max_total_wal_size(copy)); + CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o)); + + rocksdb_options_set_num_levels(copy, 6); + CheckCondition(6 == rocksdb_options_get_num_levels(copy)); + CheckCondition(7 == rocksdb_options_get_num_levels(o)); + + rocksdb_options_set_level0_file_num_compaction_trigger(copy, 14); + CheckCondition( + 14 == rocksdb_options_get_level0_file_num_compaction_trigger(copy)); + CheckCondition(4 == + rocksdb_options_get_level0_file_num_compaction_trigger(o)); + + rocksdb_options_set_level0_slowdown_writes_trigger(copy, 61); + CheckCondition(61 == + rocksdb_options_get_level0_slowdown_writes_trigger(copy)); + CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o)); + + rocksdb_options_set_level0_stop_writes_trigger(copy, 17); + CheckCondition(17 == rocksdb_options_get_level0_stop_writes_trigger(copy)); + CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o)); + + rocksdb_options_set_target_file_size_base(copy, 128); + CheckCondition(128 == rocksdb_options_get_target_file_size_base(copy)); + CheckCondition(256 == rocksdb_options_get_target_file_size_base(o)); + + rocksdb_options_set_target_file_size_multiplier(copy, 13); + CheckCondition(13 == rocksdb_options_get_target_file_size_multiplier(copy)); + CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o)); + + 
rocksdb_options_set_max_bytes_for_level_base(copy, 900); + CheckCondition(900 == rocksdb_options_get_max_bytes_for_level_base(copy)); + CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o)); + + rocksdb_options_set_level_compaction_dynamic_level_bytes(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy)); + CheckCondition(1 == + rocksdb_options_get_level_compaction_dynamic_level_bytes(o)); + + rocksdb_options_set_max_bytes_for_level_multiplier(copy, 8.0); + CheckCondition(8.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(copy)); + CheckCondition(2.0 == + rocksdb_options_get_max_bytes_for_level_multiplier(o)); + + rocksdb_options_set_skip_stats_update_on_db_open(copy, 0); + CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); + CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); + + rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy)); + CheckCondition( + 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o)); + + rocksdb_options_set_max_write_buffer_number(copy, 2000); + CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy)); + CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o)); + + rocksdb_options_set_min_write_buffer_number_to_merge(copy, 146); + CheckCondition(146 == + rocksdb_options_get_min_write_buffer_number_to_merge(copy)); + CheckCondition(23 == + rocksdb_options_get_min_write_buffer_number_to_merge(o)); + + rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128); + CheckCondition( + 128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy)); + CheckCondition(64 == + rocksdb_options_get_max_write_buffer_number_to_maintain(o)); + + rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000); + CheckCondition(9000 == + 
rocksdb_options_get_max_write_buffer_size_to_maintain(copy)); + CheckCondition(50000 == + rocksdb_options_get_max_write_buffer_size_to_maintain(o)); + + rocksdb_options_set_enable_pipelined_write(copy, 0); + CheckCondition(0 == rocksdb_options_get_enable_pipelined_write(copy)); + CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o)); + + rocksdb_options_set_unordered_write(copy, 0); + CheckCondition(0 == rocksdb_options_get_unordered_write(copy)); + CheckCondition(1 == rocksdb_options_get_unordered_write(o)); + + rocksdb_options_set_max_subcompactions(copy, 90001); + CheckCondition(90001 == rocksdb_options_get_max_subcompactions(copy)); + CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o)); + + rocksdb_options_set_max_background_jobs(copy, 12); + CheckCondition(12 == rocksdb_options_get_max_background_jobs(copy)); + CheckCondition(2 == rocksdb_options_get_max_background_jobs(o)); + + rocksdb_options_set_max_background_compactions(copy, 13); + CheckCondition(13 == rocksdb_options_get_max_background_compactions(copy)); + CheckCondition(3 == rocksdb_options_get_max_background_compactions(o)); + + rocksdb_options_set_base_background_compactions(copy, 14); + CheckCondition(14 == rocksdb_options_get_base_background_compactions(copy)); + CheckCondition(4 == rocksdb_options_get_base_background_compactions(o)); + + rocksdb_options_set_max_background_flushes(copy, 15); + CheckCondition(15 == rocksdb_options_get_max_background_flushes(copy)); + CheckCondition(5 == rocksdb_options_get_max_background_flushes(o)); + + rocksdb_options_set_max_log_file_size(copy, 16); + CheckCondition(16 == rocksdb_options_get_max_log_file_size(copy)); + CheckCondition(6 == rocksdb_options_get_max_log_file_size(o)); + + rocksdb_options_set_log_file_time_to_roll(copy, 17); + CheckCondition(17 == rocksdb_options_get_log_file_time_to_roll(copy)); + CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o)); + + rocksdb_options_set_keep_log_file_num(copy, 18); + 
CheckCondition(18 == rocksdb_options_get_keep_log_file_num(copy)); + CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o)); + + rocksdb_options_set_recycle_log_file_num(copy, 19); + CheckCondition(19 == rocksdb_options_get_recycle_log_file_num(copy)); + CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o)); + + rocksdb_options_set_soft_rate_limit(copy, 4.0); + CheckCondition(4.0 == rocksdb_options_get_soft_rate_limit(copy)); + CheckCondition(2.0 == rocksdb_options_get_soft_rate_limit(o)); + + rocksdb_options_set_hard_rate_limit(copy, 2.0); + CheckCondition(2.0 == rocksdb_options_get_hard_rate_limit(copy)); + CheckCondition(4.0 == rocksdb_options_get_hard_rate_limit(o)); + + rocksdb_options_set_soft_pending_compaction_bytes_limit(copy, 110); + CheckCondition( + 110 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy)); + CheckCondition(10 == + rocksdb_options_get_soft_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_hard_pending_compaction_bytes_limit(copy, 111); + CheckCondition( + 111 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy)); + CheckCondition(11 == + rocksdb_options_get_hard_pending_compaction_bytes_limit(o)); + + rocksdb_options_set_rate_limit_delay_max_milliseconds(copy, 0); + CheckCondition(0 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(copy)); + CheckCondition(1 == + rocksdb_options_get_rate_limit_delay_max_milliseconds(o)); + + rocksdb_options_set_max_manifest_file_size(copy, 112); + CheckCondition(112 == rocksdb_options_get_max_manifest_file_size(copy)); + CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o)); + + rocksdb_options_set_table_cache_numshardbits(copy, 113); + CheckCondition(113 == rocksdb_options_get_table_cache_numshardbits(copy)); + CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o)); + + rocksdb_options_set_arena_block_size(copy, 114); + CheckCondition(114 == rocksdb_options_get_arena_block_size(copy)); + CheckCondition(14 == 
rocksdb_options_get_arena_block_size(o)); + + rocksdb_options_set_use_fsync(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_fsync(copy)); + CheckCondition(1 == rocksdb_options_get_use_fsync(o)); + + rocksdb_options_set_WAL_ttl_seconds(copy, 115); + CheckCondition(115 == rocksdb_options_get_WAL_ttl_seconds(copy)); + CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o)); + + rocksdb_options_set_WAL_size_limit_MB(copy, 116); + CheckCondition(116 == rocksdb_options_get_WAL_size_limit_MB(copy)); + CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o)); + + rocksdb_options_set_manifest_preallocation_size(copy, 117); + CheckCondition(117 == + rocksdb_options_get_manifest_preallocation_size(copy)); + CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o)); + + rocksdb_options_set_allow_mmap_reads(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_mmap_reads(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o)); + + rocksdb_options_set_allow_mmap_writes(copy, 0); + CheckCondition(0 == rocksdb_options_get_allow_mmap_writes(copy)); + CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o)); + + rocksdb_options_set_use_direct_reads(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_direct_reads(copy)); + CheckCondition(1 == rocksdb_options_get_use_direct_reads(o)); + + rocksdb_options_set_use_direct_io_for_flush_and_compaction(copy, 0); + CheckCondition( + 0 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy)); + CheckCondition( + 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o)); + + rocksdb_options_set_is_fd_close_on_exec(copy, 0); + CheckCondition(0 == rocksdb_options_get_is_fd_close_on_exec(copy)); + CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o)); + + rocksdb_options_set_skip_log_error_on_recovery(copy, 0); + CheckCondition(0 == rocksdb_options_get_skip_log_error_on_recovery(copy)); + CheckCondition(1 == 
rocksdb_options_get_skip_log_error_on_recovery(o)); + + rocksdb_options_set_stats_dump_period_sec(copy, 218); + CheckCondition(218 == rocksdb_options_get_stats_dump_period_sec(copy)); + CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o)); + + rocksdb_options_set_stats_persist_period_sec(copy, 600); + CheckCondition(600 == rocksdb_options_get_stats_persist_period_sec(copy)); + CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o)); + + rocksdb_options_set_advise_random_on_open(copy, 0); + CheckCondition(0 == rocksdb_options_get_advise_random_on_open(copy)); + CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o)); + + rocksdb_options_set_access_hint_on_compaction_start(copy, 2); + CheckCondition(2 == + rocksdb_options_get_access_hint_on_compaction_start(copy)); + CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o)); + + rocksdb_options_set_use_adaptive_mutex(copy, 0); + CheckCondition(0 == rocksdb_options_get_use_adaptive_mutex(copy)); + CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o)); + + rocksdb_options_set_bytes_per_sync(copy, 219); + CheckCondition(219 == rocksdb_options_get_bytes_per_sync(copy)); + CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o)); + + rocksdb_options_set_wal_bytes_per_sync(copy, 120); + CheckCondition(120 == rocksdb_options_get_wal_bytes_per_sync(copy)); + CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o)); + + rocksdb_options_set_writable_file_max_buffer_size(copy, 121); + CheckCondition(121 == + rocksdb_options_get_writable_file_max_buffer_size(copy)); + CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o)); + + rocksdb_options_set_allow_concurrent_memtable_write(copy, 0); + CheckCondition(0 == + rocksdb_options_get_allow_concurrent_memtable_write(copy)); + CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o)); + + rocksdb_options_set_enable_write_thread_adaptive_yield(copy, 0); + CheckCondition( + 0 
== rocksdb_options_get_enable_write_thread_adaptive_yield(copy)); + CheckCondition(1 == + rocksdb_options_get_enable_write_thread_adaptive_yield(o)); + + rocksdb_options_set_max_sequential_skip_in_iterations(copy, 122); + CheckCondition(122 == + rocksdb_options_get_max_sequential_skip_in_iterations(copy)); + CheckCondition(22 == + rocksdb_options_get_max_sequential_skip_in_iterations(o)); + + rocksdb_options_set_disable_auto_compactions(copy, 0); + CheckCondition(0 == rocksdb_options_get_disable_auto_compactions(copy)); + CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o)); + + rocksdb_options_set_optimize_filters_for_hits(copy, 0); + CheckCondition(0 == rocksdb_options_get_optimize_filters_for_hits(copy)); + CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o)); + + rocksdb_options_set_delete_obsolete_files_period_micros(copy, 123); + CheckCondition( + 123 == rocksdb_options_get_delete_obsolete_files_period_micros(copy)); + CheckCondition(23 == + rocksdb_options_get_delete_obsolete_files_period_micros(o)); + + rocksdb_options_set_memtable_prefix_bloom_size_ratio(copy, 4.0); + CheckCondition(4.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy)); + CheckCondition(2.0 == + rocksdb_options_get_memtable_prefix_bloom_size_ratio(o)); + + rocksdb_options_set_max_compaction_bytes(copy, 124); + CheckCondition(124 == rocksdb_options_get_max_compaction_bytes(copy)); + CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o)); + + rocksdb_options_set_memtable_huge_page_size(copy, 125); + CheckCondition(125 == rocksdb_options_get_memtable_huge_page_size(copy)); + CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o)); + + rocksdb_options_set_max_successive_merges(copy, 126); + CheckCondition(126 == rocksdb_options_get_max_successive_merges(copy)); + CheckCondition(26 == rocksdb_options_get_max_successive_merges(o)); + + rocksdb_options_set_bloom_locality(copy, 127); + CheckCondition(127 == 
rocksdb_options_get_bloom_locality(copy)); + CheckCondition(27 == rocksdb_options_get_bloom_locality(o)); + + rocksdb_options_set_inplace_update_support(copy, 0); + CheckCondition(0 == rocksdb_options_get_inplace_update_support(copy)); + CheckCondition(1 == rocksdb_options_get_inplace_update_support(o)); + + rocksdb_options_set_inplace_update_num_locks(copy, 128); + CheckCondition(128 == rocksdb_options_get_inplace_update_num_locks(copy)); + CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o)); + + rocksdb_options_set_report_bg_io_stats(copy, 0); + CheckCondition(0 == rocksdb_options_get_report_bg_io_stats(copy)); + CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o)); + + rocksdb_options_set_wal_recovery_mode(copy, 1); + CheckCondition(1 == rocksdb_options_get_wal_recovery_mode(copy)); + CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o)); + + rocksdb_options_set_compression(copy, 4); + CheckCondition(4 == rocksdb_options_get_compression(copy)); + CheckCondition(5 == rocksdb_options_get_compression(o)); + + rocksdb_options_set_bottommost_compression(copy, 3); + CheckCondition(3 == rocksdb_options_get_bottommost_compression(copy)); + CheckCondition(4 == rocksdb_options_get_bottommost_compression(o)); + + rocksdb_options_set_compaction_style(copy, 1); + CheckCondition(1 == rocksdb_options_get_compaction_style(copy)); + CheckCondition(2 == rocksdb_options_get_compaction_style(o)); + + rocksdb_options_set_atomic_flush(copy, 0); + CheckCondition(0 == rocksdb_options_get_atomic_flush(copy)); + CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + + rocksdb_options_destroy(copy); + rocksdb_options_destroy(o); + } + + StartPhase("read_options"); + { + rocksdb_readoptions_t* ro; + ro = rocksdb_readoptions_create(); + + rocksdb_readoptions_set_verify_checksums(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_verify_checksums(ro)); + + rocksdb_readoptions_set_fill_cache(ro, 1); + CheckCondition(1 == 
rocksdb_readoptions_get_fill_cache(ro)); + + rocksdb_readoptions_set_read_tier(ro, 2); + CheckCondition(2 == rocksdb_readoptions_get_read_tier(ro)); + + rocksdb_readoptions_set_tailing(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_tailing(ro)); + + rocksdb_readoptions_set_readahead_size(ro, 100); + CheckCondition(100 == rocksdb_readoptions_get_readahead_size(ro)); + + rocksdb_readoptions_set_prefix_same_as_start(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_prefix_same_as_start(ro)); + + rocksdb_readoptions_set_pin_data(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_pin_data(ro)); + + rocksdb_readoptions_set_total_order_seek(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_total_order_seek(ro)); + + rocksdb_readoptions_set_max_skippable_internal_keys(ro, 200); + CheckCondition(200 == + rocksdb_readoptions_get_max_skippable_internal_keys(ro)); + + rocksdb_readoptions_set_background_purge_on_iterator_cleanup(ro, 1); + CheckCondition( + 1 == rocksdb_readoptions_get_background_purge_on_iterator_cleanup(ro)); + + rocksdb_readoptions_set_ignore_range_deletions(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro)); + + rocksdb_readoptions_set_deadline(ro, 300); + CheckCondition(300 == rocksdb_readoptions_get_deadline(ro)); + + rocksdb_readoptions_set_io_timeout(ro, 400); + CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro)); + + rocksdb_readoptions_destroy(ro); + } + + StartPhase("write_options"); + { + rocksdb_writeoptions_t* wo; + wo = rocksdb_writeoptions_create(); + + rocksdb_writeoptions_set_sync(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_sync(wo)); + + rocksdb_writeoptions_disable_WAL(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_disable_WAL(wo)); + + rocksdb_writeoptions_set_ignore_missing_column_families(wo, 1); + CheckCondition(1 == + rocksdb_writeoptions_get_ignore_missing_column_families(wo)); + + rocksdb_writeoptions_set_no_slowdown(wo, 1); + CheckCondition(1 == 
rocksdb_writeoptions_get_no_slowdown(wo)); + + rocksdb_writeoptions_set_low_pri(wo, 1); + CheckCondition(1 == rocksdb_writeoptions_get_low_pri(wo)); + + rocksdb_writeoptions_set_memtable_insert_hint_per_batch(wo, 1); + CheckCondition(1 == + rocksdb_writeoptions_get_memtable_insert_hint_per_batch(wo)); + + rocksdb_writeoptions_destroy(wo); + } + + StartPhase("compact_options"); + { + rocksdb_compactoptions_t* co; + co = rocksdb_compactoptions_create(); + + rocksdb_compactoptions_set_exclusive_manual_compaction(co, 1); + CheckCondition(1 == + rocksdb_compactoptions_get_exclusive_manual_compaction(co)); + + rocksdb_compactoptions_set_bottommost_level_compaction(co, 1); + CheckCondition(1 == + rocksdb_compactoptions_get_bottommost_level_compaction(co)); + + rocksdb_compactoptions_set_change_level(co, 1); + CheckCondition(1 == rocksdb_compactoptions_get_change_level(co)); + + rocksdb_compactoptions_set_target_level(co, 1); + CheckCondition(1 == rocksdb_compactoptions_get_target_level(co)); + + rocksdb_compactoptions_destroy(co); + } + + StartPhase("flush_options"); + { + rocksdb_flushoptions_t* fo; + fo = rocksdb_flushoptions_create(); + + rocksdb_flushoptions_set_wait(fo, 1); + CheckCondition(1 == rocksdb_flushoptions_get_wait(fo)); + + rocksdb_flushoptions_destroy(fo); + } + + StartPhase("cache_options"); + { + rocksdb_cache_t* co; + co = rocksdb_cache_create_lru(100); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_set_capacity(co, 200); + CheckCondition(200 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + } + + StartPhase("jemalloc_nodump_allocator"); + { + rocksdb_memory_allocator_t* allocator; + allocator = rocksdb_jemalloc_nodump_allocator_create(&err); + if (err != NULL) { + // not supported on all platforms, allow unsupported error + const char* ni = "Not implemented: "; + size_t ni_len = strlen(ni); + size_t err_len = strlen(err); + + CheckCondition(err_len >= ni_len); + CheckCondition(memcmp(ni, err, ni_len) == 
0); + Free(&err); + } else { + rocksdb_cache_t* co; + rocksdb_lru_cache_options_t* copts; + + copts = rocksdb_lru_cache_options_create(); + + rocksdb_lru_cache_options_set_capacity(copts, 100); + rocksdb_lru_cache_options_set_memory_allocator(copts, allocator); + + co = rocksdb_cache_create_lru_opts(copts); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + rocksdb_lru_cache_options_destroy(copts); + } + rocksdb_memory_allocator_destroy(allocator); + } + + StartPhase("env"); + { + rocksdb_env_t* e; + e = rocksdb_create_default_env(); + + rocksdb_env_set_background_threads(e, 10); + CheckCondition(10 == rocksdb_env_get_background_threads(e)); + + rocksdb_env_set_high_priority_background_threads(e, 20); + CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e)); + + rocksdb_env_set_low_priority_background_threads(e, 30); + CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e)); + + rocksdb_env_set_bottom_priority_background_threads(e, 40); + CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e)); + + rocksdb_env_destroy(e); + } + + StartPhase("universal_compaction_options"); + { + rocksdb_universal_compaction_options_t* uco; + uco = rocksdb_universal_compaction_options_create(); + + rocksdb_universal_compaction_options_set_size_ratio(uco, 5); + CheckCondition(5 == + rocksdb_universal_compaction_options_get_size_ratio(uco)); + + rocksdb_universal_compaction_options_set_min_merge_width(uco, 15); + CheckCondition( + 15 == rocksdb_universal_compaction_options_get_min_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_merge_width(uco, 25); + CheckCondition( + 25 == rocksdb_universal_compaction_options_get_max_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco, + 35); + CheckCondition( + 35 == + rocksdb_universal_compaction_options_get_max_size_amplification_percent( + uco)); + + 
rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45); + CheckCondition( + 45 == + rocksdb_universal_compaction_options_get_compression_size_percent(uco)); + + rocksdb_universal_compaction_options_set_stop_style(uco, 1); + CheckCondition(1 == + rocksdb_universal_compaction_options_get_stop_style(uco)); + + rocksdb_universal_compaction_options_destroy(uco); + } + + StartPhase("fifo_compaction_options"); + { + rocksdb_fifo_compaction_options_t* fco; + fco = rocksdb_fifo_compaction_options_create(); + + rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000); + CheckCondition( + 100000 == + rocksdb_fifo_compaction_options_get_max_table_files_size(fco)); + + rocksdb_fifo_compaction_options_destroy(fco); + } + + StartPhase("backupable_db_option"); + { + rocksdb_backupable_db_options_t* bdo; + bdo = rocksdb_backupable_db_options_create("path"); + + rocksdb_backupable_db_options_set_share_table_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_share_table_files(bdo)); + + rocksdb_backupable_db_options_set_sync(bdo, 1); + CheckCondition(1 == rocksdb_backupable_db_options_get_sync(bdo)); + + rocksdb_backupable_db_options_set_destroy_old_data(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_destroy_old_data(bdo)); + + rocksdb_backupable_db_options_set_backup_log_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_backup_log_files(bdo)); + + rocksdb_backupable_db_options_set_backup_rate_limit(bdo, 123); + CheckCondition(123 == + rocksdb_backupable_db_options_get_backup_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_restore_rate_limit(bdo, 37); + CheckCondition(37 == + rocksdb_backupable_db_options_get_restore_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_max_background_operations(bdo, 20); + CheckCondition( + 20 == rocksdb_backupable_db_options_get_max_background_operations(bdo)); + + rocksdb_backupable_db_options_set_callback_trigger_interval_size(bdo, 9000); + 
CheckCondition( + 9000 == + rocksdb_backupable_db_options_get_callback_trigger_interval_size(bdo)); + + rocksdb_backupable_db_options_set_max_valid_backups_to_open(bdo, 40); + CheckCondition( + 40 == rocksdb_backupable_db_options_get_max_valid_backups_to_open(bdo)); + + rocksdb_backupable_db_options_set_share_files_with_checksum_naming(bdo, 2); + CheckCondition( + 2 == rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + bdo)); + + rocksdb_backupable_db_options_destroy(bdo); + } + + StartPhase("compression_options"); + { + rocksdb_options_t* co; + co = rocksdb_options_create(); + + rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100); + CheckCondition( + 100 == + rocksdb_options_get_compression_options_zstd_max_train_bytes(co)); + + rocksdb_options_set_compression_options_parallel_threads(co, 2); + CheckCondition( + 2 == rocksdb_options_get_compression_options_parallel_threads(co)); + + rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200); + CheckCondition( + 200 == + rocksdb_options_get_compression_options_max_dict_buffer_bytes(co)); + + rocksdb_options_destroy(co); + } + StartPhase("iterate_upper_bound"); { // Create new empty database @@ -1840,6 +2984,54 @@ CheckNoError(err); } + StartPhase("filter_with_prefix_seek"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_prefix_extractor( + options, rocksdb_slicetransform_create_fixed_prefix(1)); + rocksdb_filterpolicy_t* filter_policy = + rocksdb_filterpolicy_create_bloom_full(8.0); + rocksdb_block_based_options_set_filter_policy(table_options, filter_policy); + rocksdb_options_set_block_based_table_factory(options, table_options); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + int i; + for (i = 0; i < 10; ++i) { + char key = '0' + (char)i; + rocksdb_put(db, woptions, &key, 1, "", 1, &err); + CheckNoError(err); + } + + // Flush to generate an L0 so that filter will be used 
later. + rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_options, 1); + rocksdb_flush(db, flush_options, &err); + rocksdb_flushoptions_destroy(flush_options); + CheckNoError(err); + + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, ropts); + + rocksdb_iter_seek(iter, "0", 1); + int cnt = 0; + while (rocksdb_iter_valid(iter)) { + ++cnt; + rocksdb_iter_next(iter); + } + CheckCondition(10 == cnt); + + rocksdb_iter_destroy(iter); + rocksdb_readoptions_destroy(ropts); + } + + StartPhase("cancel_all_background_work"); + rocksdb_cancel_all_background_work(db, 1); + StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); @@ -1858,7 +3050,7 @@ #else -int main() { +int main(void) { fprintf(stderr, "SKIPPED\n"); return 0; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,9 +12,11 @@ #include #include #include +#include #include #include +#include "db/blob/blob_file_cache.h" #include "db/compaction/compaction_picker.h" #include "db/compaction/compaction_picker_fifo.h" #include "db/compaction/compaction_picker_level.h" @@ -27,13 +29,15 @@ #include "db/version_set.h" #include "db/write_controller.h" #include "file/sst_file_manager_impl.h" -#include "memtable/hash_skiplist_rep.h" +#include "logging/logging.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "port/port.h" -#include "table/block_based/block_based_table_factory.h" +#include "rocksdb/convenience.h" +#include "rocksdb/table.h" #include "table/merging_iterator.h" #include "util/autovector.h" +#include "util/cast_util.h" #include 
"util/compression.h" namespace ROCKSDB_NAMESPACE { @@ -71,11 +75,6 @@ bool defer_purge = db_->immutable_db_options().avoid_unnecessary_blocking_io; db_->PurgeObsoleteFiles(job_context, defer_purge); - if (defer_purge) { - mutex_->Lock(); - db_->SchedulePurge(); - mutex_->Unlock(); - } } job_context.Clean(); } @@ -105,8 +104,9 @@ void GetIntTblPropCollectorFactory( const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories) { + IntTblPropCollectorFactories* int_tbl_prop_collector_factories) { + assert(int_tbl_prop_collector_factories); + auto& collector_factories = ioptions.table_properties_collector_factories; for (size_t i = 0; i < ioptions.table_properties_collector_factories.size(); ++i) { @@ -147,6 +147,16 @@ "should be nonzero if we're using zstd's dictionary generator."); } } + + if (!CompressionTypeSupported(cf_options.blob_compression_type)) { + std::ostringstream oss; + oss << "The specified blob compression type " + << CompressionTypeToString(cf_options.blob_compression_type) + << " is not available."; + + return Status::InvalidArgument(oss.str()); + } + return Status::OK(); } @@ -188,7 +198,7 @@ namespace { const uint64_t kDefaultTtl = 0xfffffffffffffffe; const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; -}; // namespace +} // namespace ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src) { @@ -196,11 +206,13 @@ size_t clamp_max = std::conditional< sizeof(size_t) == 4, std::integral_constant, std::integral_constant>::type::value; - ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, clamp_max); + ClipToRange(&result.write_buffer_size, (static_cast(64)) << 10, + clamp_max); // if user sets arena_block_size, we trust user to use this value. 
Otherwise, // calculate a proper value from writer_buffer_size; if (result.arena_block_size <= 0) { - result.arena_block_size = result.write_buffer_size / 8; + result.arena_block_size = + std::min(size_t{1024 * 1024}, result.write_buffer_size / 8); // Align up to 4k const size_t align = 4 * 1024; @@ -269,7 +281,7 @@ } if (result.level0_file_num_compaction_trigger == 0) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "level0_file_num_compaction_trigger cannot be 0"); result.level0_file_num_compaction_trigger = 1; } @@ -278,7 +290,7 @@ result.level0_slowdown_writes_trigger || result.level0_slowdown_writes_trigger < result.level0_file_num_compaction_trigger) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "This condition must be satisfied: " "level0_stop_writes_trigger(%d) >= " "level0_slowdown_writes_trigger(%d) >= " @@ -295,7 +307,7 @@ result.level0_slowdown_writes_trigger) { result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger; } - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "Adjust the value to " "level0_stop_writes_trigger(%d)" "level0_slowdown_writes_trigger(%d)" @@ -322,7 +334,9 @@ // was not used) auto sfm = static_cast(db_options.sst_file_manager.get()); for (size_t i = 0; i < result.cf_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path); + DeleteScheduler::CleanupDirectory(db_options.env, sfm, + result.cf_paths[i].path) + .PermitUncheckedError(); } #endif @@ -331,12 +345,18 @@ } if (result.level_compaction_dynamic_level_bytes) { - if (result.compaction_style != kCompactionStyleLevel || - result.cf_paths.size() > 1U) { - // 1. level_compaction_dynamic_level_bytes only makes sense for - // level-based compaction. - // 2. we don't yet know how to make both of this feature and multiple - // DB path work. 
+ if (result.compaction_style != kCompactionStyleLevel) { + ROCKS_LOG_WARN(db_options.info_log.get(), + "level_compaction_dynamic_level_bytes only makes sense" + "for level-based compaction"); + result.level_compaction_dynamic_level_bytes = false; + } else if (result.cf_paths.size() > 1U) { + // we don't yet know how to make both of this feature and multiple + // DB path work. + ROCKS_LOG_WARN(db_options.info_log.get(), + "multiple cf_paths/db_paths and" + "level_compaction_dynamic_level_bytes" + "can't be used together"); result.level_compaction_dynamic_level_bytes = false; } } @@ -345,8 +365,8 @@ result.max_compaction_bytes = result.target_file_size_base * 25; } - bool is_block_based_table = - (result.table_factory->Name() == BlockBasedTableFactory().Name()); + bool is_block_based_table = (result.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())); const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; if (result.ttl == kDefaultTtl) { @@ -427,6 +447,9 @@ void SuperVersion::Cleanup() { assert(refs.load(std::memory_order_relaxed) == 0); + // Since this SuperVersion object is being deleted, + // decrement reference to the immutable MemtableList + // this SV object was pointing to. imm->Unref(&to_delete); MemTable* m = mem->Unref(); if (m != nullptr) { @@ -436,9 +459,7 @@ to_delete.push_back(m); } current->Unref(); - if (cfd->Unref()) { - delete cfd; - } + cfd->UnrefAndTryDelete(); } void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, @@ -456,10 +477,10 @@ namespace { void SuperVersionUnrefHandle(void* ptr) { - // UnrefHandle is called when a thread exists or a ThreadLocalPtr gets - // destroyed. When former happens, the thread shouldn't see kSVInUse. - // When latter happens, we are in ~ColumnFamilyData(), no get should happen as - // well. + // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets + // destroyed. When the former happens, the thread shouldn't see kSVInUse. 
+ // When the latter happens, only super_version_ holds a reference + // to ColumnFamilyData, so no further queries are possible. SuperVersion* sv = static_cast(ptr); bool was_last_ref __attribute__((__unused__)); was_last_ref = sv->Unref(); @@ -471,12 +492,25 @@ } } // anonymous namespace +std::vector ColumnFamilyData::GetDbPaths() const { + std::vector paths; + paths.reserve(ioptions_.cf_paths.size()); + for (const DbPath& db_path : ioptions_.cf_paths) { + paths.emplace_back(db_path.path); + } + return paths; +} + +const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId = port::kMaxUint32; + ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, - const FileOptions& file_options, ColumnFamilySet* column_family_set, - BlockCacheTracer* const block_cache_tracer) + const FileOptions* file_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -507,7 +541,23 @@ queued_for_compaction_(false), prev_compaction_needed_bytes_(0), allow_2pc_(db_options.allow_2pc), - last_memtable_id_(0) { + last_memtable_id_(0), + db_paths_registered_(false) { + if (id_ != kDummyColumnFamilyDataId) { + // TODO(cc): RegisterDbPaths can be expensive, considering moving it + // outside of this constructor which might be called with db mutex held. + // TODO(cc): considering using ioptions_.fs, currently some tests rely on + // EnvWrapper, that's the main reason why we use env here. 
+ Status s = ioptions_.env->RegisterDbPaths(GetDbPaths()); + if (s.ok()) { + db_paths_registered_ = true; + } else { + ROCKS_LOG_ERROR( + ioptions_.logger, + "Failed to register data paths of column family (id: %d, name: %s)", + id_, name_.c_str()); + } + } Ref(); // Convert user defined table properties collector factories to internal ones. @@ -516,9 +566,14 @@ // if _dummy_versions is nullptr, then this is a dummy column family. if (_dummy_versions != nullptr) { internal_stats_.reset( - new InternalStats(ioptions_.num_levels, db_options.env, this)); + new InternalStats(ioptions_.num_levels, ioptions_.clock, this)); table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache, - block_cache_tracer)); + block_cache_tracer, io_tracer, + db_session_id)); + blob_file_cache_.reset( + new BlobFileCache(_table_cache, ioptions(), soptions(), id_, + internal_stats_->GetBlobFileReadHist(), io_tracer)); + if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -532,13 +587,13 @@ } else if (ioptions_.compaction_style == kCompactionStyleNone) { compaction_picker_.reset(new NullCompactionPicker( ioptions_, &internal_comparator_)); - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "Column family %s does not use any background compaction. " "Compactions can only be done via CompactFiles\n", GetName().c_str()); #endif // !ROCKSDB_LITE } else { - ROCKS_LOG_ERROR(ioptions_.info_log, + ROCKS_LOG_ERROR(ioptions_.logger, "Unable to recognize the specified compaction style %d. 
" "Column family %s will use kCompactionStyleLevel.\n", ioptions_.compaction_style, GetName().c_str()); @@ -547,12 +602,12 @@ } if (column_family_set_->NumberOfColumnFamilies() < 10) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "--------------- Options for column family [%s]:\n", name.c_str()); - initial_cf_options_.Dump(ioptions_.info_log); + initial_cf_options_.Dump(ioptions_.logger); } else { - ROCKS_LOG_INFO(ioptions_.info_log, "\t(skipping printing options)\n"); + ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n"); } } @@ -587,7 +642,7 @@ if (dummy_versions_ != nullptr) { // List must be empty - assert(dummy_versions_->TEST_Next() == dummy_versions_); + assert(dummy_versions_->Next() == dummy_versions_); bool deleted __attribute__((__unused__)); deleted = dummy_versions_->Unref(); assert(deleted); @@ -601,6 +656,18 @@ for (MemTable* m : to_delete) { delete m; } + + if (db_paths_registered_) { + // TODO(cc): considering using ioptions_.fs, currently some tests rely on + // EnvWrapper, that's the main reason why we use env here. + Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths()); + if (!s.ok()) { + ROCKS_LOG_ERROR( + ioptions_.logger, + "Failed to unregister data paths of column family (id: %d, name: %s)", + id_, name_.c_str()); + } + } } bool ColumnFamilyData::UnrefAndTryDelete() { @@ -617,14 +684,13 @@ // Only the super_version_ holds me SuperVersion* sv = super_version_; super_version_ = nullptr; - // Release SuperVersion reference kept in ThreadLocalPtr. - // This must be done outside of mutex_ since unref handler can lock mutex. - sv->db_mutex->Unlock(); + + // Release SuperVersion references kept in ThreadLocalPtr. 
local_sv_.reset(); - sv->db_mutex->Lock(); if (sv->Unref()) { - // May delete this ColumnFamilyData after calling Cleanup() + // Note: sv will delete this ColumnFamilyData during Cleanup() + assert(sv->cfd == this); sv->Cleanup(); delete sv; return true; @@ -651,9 +717,7 @@ auto current_log = GetLogNumber(); if (allow_2pc_) { - autovector empty_list; - auto imm_prep_log = - imm()->PrecomputeMinLogContainingPrepSection(empty_list); + auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection(); auto mem_prep_log = mem()->GetMinLogContainingPrepSection(); if (imm_prep_log > 0 && imm_prep_log < current_log) { @@ -775,7 +839,8 @@ ColumnFamilyData::GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options) { if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) { return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && @@ -789,7 +854,9 @@ WriteStallCause::kPendingCompactionBytes}; } else if (mutable_cf_options.max_write_buffer_number > 3 && num_unflushed_memtables >= - mutable_cf_options.max_write_buffer_number - 1) { + mutable_cf_options.max_write_buffer_number - 1 && + num_unflushed_memtables - 1 >= + immutable_cf_options.min_write_buffer_number_to_merge) { return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && mutable_cf_options.level0_slowdown_writes_trigger >= 0 && @@ -817,7 +884,8 @@ auto write_stall_condition_and_cause = GetWriteStallConditionAndCause( imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(), - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options); + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options, + *ioptions()); write_stall_condition = 
write_stall_condition_and_cause.first; auto write_stall_cause = write_stall_condition_and_cause.second; @@ -829,7 +897,7 @@ write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), @@ -842,7 +910,7 @@ internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); } else if (write_stall_condition == WriteStallCondition::kStopped && @@ -851,7 +919,7 @@ internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because of estimated pending compaction " "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); @@ -863,7 +931,7 @@ mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d " "rate %" PRIu64, @@ -885,7 +953,7 @@ internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " "rate %" PRIu64, name_.c_str(), vstorage->l0_delay_trigger_count(), @@ -910,7 +978,7 @@ internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " "bytes %" 
PRIu64 " rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), @@ -924,7 +992,7 @@ write_controller_token_ = write_controller->GetCompactionPressureToken(); ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because we have %d level-0 " "files ", name_.c_str(), vstorage->l0_delay_trigger_count()); @@ -938,7 +1006,7 @@ write_controller->GetCompactionPressureToken(); if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because of estimated pending " "compaction " "bytes %" PRIu64, @@ -983,6 +1051,10 @@ return VersionSet::GetTotalSstFilesSize(dummy_versions_); } +uint64_t ColumnFamilyData::GetTotalBlobFileSize() const { + return VersionSet::GetTotalBlobFileSize(dummy_versions_); +} + uint64_t ColumnFamilyData::GetLiveSstFilesSize() const { return current_->GetSstFilesSize(); } @@ -1003,17 +1075,19 @@ } bool ColumnFamilyData::NeedsCompaction() const { - return compaction_picker_->NeedsCompaction(current_->storage_info()); + return !mutable_cf_options_.disable_auto_compactions && + compaction_picker_->NeedsCompaction(current_->storage_info()); } Compaction* ColumnFamilyData::PickCompaction( - const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { + const MutableCFOptions& mutable_options, + const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) { SequenceNumber earliest_mem_seqno = std::min(mem_->GetEarliestSequenceNumber(), imm_.current()->GetEarliestSequenceNumber(false)); auto* result = compaction_picker_->PickCompaction( - GetName(), mutable_options, current_->storage_info(), log_buffer, - earliest_mem_seqno); + GetName(), mutable_options, mutable_db_options, current_->storage_info(), + log_buffer, earliest_mem_seqno); if (result != nullptr) { result->SetInputVersion(current_); } @@ -1029,7 +1103,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( const autovector& 
ranges, SuperVersion* super_version, - bool* overlap) { + bool allow_data_in_errors, bool* overlap) { assert(overlap != nullptr); *overlap = false; // Create an InternalIterator over all unflushed memtables @@ -1048,10 +1122,12 @@ super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq); range_del_agg.AddTombstones( std::unique_ptr(active_range_del_iter)); - super_version->imm->AddRangeTombstoneIterators(read_opts, nullptr /* arena */, - &range_del_agg); - Status status; + status = super_version->imm->AddRangeTombstoneIterators( + read_opts, nullptr /* arena */, &range_del_agg); + // AddRangeTombstoneIterators always return Status::OK. + assert(status.ok()); + for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) { auto* vstorage = super_version->current->storage_info(); auto* ucmp = vstorage->InternalComparator()->user_comparator(); @@ -1060,12 +1136,12 @@ memtable_iter->Seek(range_start.Encode()); status = memtable_iter->status(); ParsedInternalKey seek_result; - if (status.ok()) { - if (memtable_iter->Valid() && - !ParseInternalKey(memtable_iter->key(), &seek_result)) { - status = Status::Corruption("DB have corrupted keys"); - } + + if (status.ok() && memtable_iter->Valid()) { + status = ParseInternalKey(memtable_iter->key(), &seek_result, + allow_data_in_errors); } + if (status.ok()) { if (memtable_iter->Valid() && ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { @@ -1083,14 +1159,16 @@ const int ColumnFamilyData::kCompactToBaseLevel = -2; Compaction* ColumnFamilyData::CompactRange( - const MutableCFOptions& mutable_cf_options, int input_level, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* conflict, uint64_t max_file_num_to_ignore) { auto* result = compaction_picker_->CompactRange( - GetName(), 
mutable_cf_options, current_->storage_info(), input_level, - output_level, compact_range_options, begin, end, compaction_end, conflict, + GetName(), mutable_cf_options, mutable_db_options, + current_->storage_info(), input_level, output_level, + compact_range_options, begin, end, compaction_end, conflict, max_file_num_to_ignore); if (result != nullptr) { result->SetInputVersion(current_); @@ -1133,11 +1211,11 @@ SuperVersion* sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv && sv->Unref()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS); db->mutex()->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. 
@@ -1181,14 +1259,13 @@ void ColumnFamilyData::InstallSuperVersion( SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); - return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_); + return InstallSuperVersion(sv_context, mutable_cf_options_); } void ColumnFamilyData::InstallSuperVersion( - SuperVersionContext* sv_context, InstrumentedMutex* db_mutex, + SuperVersionContext* sv_context, const MutableCFOptions& mutable_cf_options) { SuperVersion* new_superversion = sv_context->new_superversion.release(); - new_superversion->db_mutex = db_mutex; new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(this, mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; @@ -1260,7 +1337,8 @@ } if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) { - if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + if (!cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { return Status::NotSupported( "TTL is only supported in Block-Based Table format. "); } @@ -1268,30 +1346,53 @@ if (cf_options.periodic_compaction_seconds > 0 && cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) { - if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + if (!cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { return Status::NotSupported( "Periodic Compaction is only supported in " "Block-Based Table format. 
"); } } + + if (cf_options.enable_blob_garbage_collection) { + if (cf_options.blob_garbage_collection_age_cutoff < 0.0 || + cf_options.blob_garbage_collection_age_cutoff > 1.0) { + return Status::InvalidArgument( + "The age cutoff for blob garbage collection should be in the range " + "[0.0, 1.0]."); + } + if (cf_options.blob_garbage_collection_force_threshold < 0.0 || + cf_options.blob_garbage_collection_force_threshold > 1.0) { + return Status::InvalidArgument( + "The garbage ratio threshold for forcing blob garbage collection " + "should be in the range [0.0, 1.0]."); + } + } + + if (cf_options.compaction_style == kCompactionStyleFIFO && + db_options.max_open_files != -1 && cf_options.ttl > 0) { + return Status::NotSupported( + "FIFO compaction only supported with max_open_files = -1."); + } + return s; } #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const DBOptions& db_options, + const DBOptions& db_opts, const std::unordered_map& options_map) { - MutableCFOptions new_mutable_cf_options; - Status s = - GetMutableOptionsFromStrings(mutable_cf_options_, options_map, - ioptions_.info_log, &new_mutable_cf_options); + ColumnFamilyOptions cf_opts = + BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); + ConfigOptions config_opts; + config_opts.mutable_options_only = true; + Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map, + &cf_opts); if (s.ok()) { - ColumnFamilyOptions cf_options = - BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); - s = ValidateOptions(db_options, cf_options); + s = ValidateOptions(db_opts, cf_opts); } if (s.ok()) { - mutable_cf_options_ = new_mutable_cf_options; + mutable_cf_options_ = MutableCFOptions(cf_opts); mutable_cf_options_.RefreshDerivedOptions(ioptions_); } return s; @@ -1321,7 +1422,7 @@ } Status ColumnFamilyData::AddDirectories( - std::map>* created_dirs) { + std::map>* created_dirs) { Status s; assert(created_dirs != nullptr); assert(data_dirs_.empty()); @@ 
-1329,8 +1430,9 @@ auto existing_dir = created_dirs->find(p.path); if (existing_dir == created_dirs->end()) { - std::unique_ptr path_directory; - s = DBImpl::CreateAndNewDirectory(ioptions_.env, p.path, &path_directory); + std::unique_ptr path_directory; + s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path, + &path_directory); if (!s.ok()) { return s; } @@ -1345,7 +1447,7 @@ return s; } -Directory* ColumnFamilyData::GetDataDir(size_t path_id) const { +FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const { if (data_dirs_.empty()) { return nullptr; } @@ -1358,21 +1460,26 @@ const ImmutableDBOptions* db_options, const FileOptions& file_options, Cache* table_cache, - WriteBufferManager* write_buffer_manager, - WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer) + WriteBufferManager* _write_buffer_manager, + WriteController* _write_controller, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : max_column_family_(0), + file_options_(file_options), dummy_cfd_(new ColumnFamilyData( - 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, - file_options, nullptr, block_cache_tracer)), + ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr, + nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr, + block_cache_tracer, io_tracer, db_session_id)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), - file_options_(file_options), table_cache_(table_cache), - write_buffer_manager_(write_buffer_manager), - write_controller_(write_controller), - block_cache_tracer_(block_cache_tracer) { + write_buffer_manager_(_write_buffer_manager), + write_controller_(_write_controller), + block_cache_tracer_(block_cache_tracer), + io_tracer_(io_tracer), + db_session_id_(db_session_id) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1438,7 +1545,8 @@ 
assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, file_options_, this, block_cache_tracer_); + *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_, + db_session_id_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); @@ -1454,20 +1562,6 @@ return new_cfd; } -// REQUIRES: DB mutex held -void ColumnFamilySet::FreeDeadColumnFamilies() { - autovector to_delete; - for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { - if (cfd->refs_.load(std::memory_order_relaxed) == 0) { - to_delete.push_back(cfd); - } - } - for (auto cfd : to_delete) { - // this is very rare, so it's not a problem that we do it under a mutex - delete cfd; - } -} - // under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { auto cfd_iter = column_family_data_.find(cfd->GetID()); @@ -1506,7 +1600,7 @@ uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { uint32_t column_family_id = 0; if (column_family != nullptr) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); column_family_id = cfh->GetID(); } return column_family_id; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family.h 2025-05-19 16:14:27.000000000 +0000 @@ -44,6 +44,7 @@ class InstrumentedMutex; class InstrumentedMutexLock; struct SuperVersionContext; +class BlobFileCache; extern const double kIncSlowdownRatio; // This file contains a list of data structures for managing column family @@ -207,8 +208,6 @@ uint64_t version_number; 
WriteStallCondition write_stall_condition; - InstrumentedMutex* db_mutex; - // should be called outside the mutex SuperVersion() = default; ~SuperVersion(); @@ -252,13 +251,12 @@ extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src); -// Wrap user defined table proproties collector factories `from cf_options` +// Wrap user defined table properties collector factories `from cf_options` // into internal ones in int_tbl_prop_collector_factories. Add a system internal // one too. extern void GetIntTblPropCollectorFactory( const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories); + IntTblPropCollectorFactories* int_tbl_prop_collector_factories); class ColumnFamilySet; @@ -278,17 +276,6 @@ // holding a DB mutex, or as the leader in a write batch group). void Ref() { refs_.fetch_add(1); } - // Unref decreases the reference count, but does not handle deletion - // when the count goes to 0. If this method returns true then the - // caller should delete the instance immediately, or later, by calling - // FreeDeadColumnFamilies(). Unref() can only be called while holding - // a DB mutex, or during single-threaded recovery. - bool Unref() { - int old_refs = refs_.fetch_sub(1); - assert(old_refs > 0); - return old_refs == 1; - } - // UnrefAndTryDelete() decreases the reference count and do free if needed, // return true if this is freed else false, UnrefAndTryDelete() can only // be called while holding a DB mutex, or during single-threaded recovery. @@ -325,7 +312,7 @@ FlushReason GetFlushReason() const { return flush_reason_; } // thread-safe const FileOptions* soptions() const; - const ImmutableCFOptions* ioptions() const { return &ioptions_; } + const ImmutableOptions* ioptions() const { return &ioptions_; } // REQUIRES: DB mutex held // This returns the MutableCFOptions used by current SuperVersion // You should use this API to reference MutableCFOptions most of the time. 
@@ -359,12 +346,18 @@ MemTableList* imm() { return &imm_; } MemTable* mem() { return mem_; } + + bool IsEmpty() { + return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0; + } + Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } void SetCurrent(Version* _current); uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held + uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held void SetMemtable(MemTable* new_mem) { uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; new_mem->SetID(memtable_id); @@ -381,12 +374,14 @@ SequenceNumber earliest_seq); TableCache* table_cache() const { return table_cache_.get(); } + BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); } // See documentation in compaction_picker.h // REQUIRES: DB mutex held bool NeedsCompaction() const; // REQUIRES: DB mutex held Compaction* PickCompaction(const MutableCFOptions& mutable_options, + const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer); // Check if the passed range overlap with any running compactions. @@ -403,7 +398,8 @@ // // Thread-safe Status RangesOverlapWithMemtables(const autovector& ranges, - SuperVersion* super_version, bool* overlap); + SuperVersion* super_version, + bool allow_data_in_errors, bool* overlap); // A flag to tell a manual compaction is to compact all levels together // instead of a specific level. 
@@ -412,6 +408,7 @@ static const int kCompactToBaseLevel; // REQUIRES: DB mutex held Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, @@ -428,8 +425,7 @@ return internal_comparator_; } - const std::vector>* - int_tbl_prop_collector_factories() const { + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const { return &int_tbl_prop_collector_factories_; } @@ -441,7 +437,7 @@ // Get SuperVersion stored in thread local storage. If it does not exist, // get a reference from a current SuperVersion. SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); - // Try to return SuperVersion back to thread local storage. Retrun true on + // Try to return SuperVersion back to thread local storage. Return true on // success and false on failure. It fails when the thread local storage // contains anything other than SuperVersion::kSVInUse flag. bool ReturnThreadLocalSuperVersion(SuperVersion* sv); @@ -455,7 +451,6 @@ // the clients to allocate SuperVersion outside of mutex. 
// IMPORTANT: Only call this from DBImpl::InstallSuperVersion() void InstallSuperVersion(SuperVersionContext* sv_context, - InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options); void InstallSuperVersion(SuperVersionContext* sv_context, InstrumentedMutex* db_mutex); @@ -475,9 +470,11 @@ kPendingCompactionBytes, }; static std::pair - GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, - uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options); + GetWriteStallConditionAndCause( + int num_unflushed_memtables, int num_l0_files, + uint64_t num_compaction_needed_bytes, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options); // Recalculate some small conditions, which are changed only during // compaction, adding new memtable and/or @@ -500,11 +497,29 @@ // created_dirs remembers directory created, so that we don't need to call // the same data creation operation again. Status AddDirectories( - std::map>* created_dirs); + std::map>* created_dirs); + + FSDirectory* GetDataDir(size_t path_id) const; + + // full_history_ts_low_ can only increase. 
+ void SetFullHistoryTsLow(std::string ts_low) { + assert(!ts_low.empty()); + const Comparator* ucmp = user_comparator(); + assert(ucmp); + if (full_history_ts_low_.empty() || + ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) { + full_history_ts_low_ = std::move(ts_low); + } + } - Directory* GetDataDir(size_t path_id) const; + const std::string& GetFullHistoryTsLow() const { + return full_history_ts_low_; + } ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } + WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } + + static const uint32_t kDummyColumnFamilyDataId; private: friend class ColumnFamilySet; @@ -513,9 +528,13 @@ WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, - const FileOptions& file_options, + const FileOptions* file_options, ColumnFamilySet* column_family_set, - BlockCacheTracer* const block_cache_tracer); + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); + + std::vector GetDbPaths() const; uint32_t id_; const std::string name_; @@ -527,16 +546,16 @@ std::atomic dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; - std::vector> - int_tbl_prop_collector_factories_; + IntTblPropCollectorFactories int_tbl_prop_collector_factories_; const ColumnFamilyOptions initial_cf_options_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; const bool is_delete_range_supported_; std::unique_ptr table_cache_; + std::unique_ptr blob_file_cache_; std::unique_ptr internal_stats_; @@ -592,7 +611,11 @@ std::atomic last_memtable_id_; // Directories corresponding to cf_paths. 
- std::vector> data_dirs_; + std::vector> data_dirs_; + + bool db_paths_registered_; + + std::string full_history_ts_low_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -605,10 +628,8 @@ // held and it needs to be executed from the write thread. SetDropped() also // guarantees that it will be called only from single-threaded LogAndApply(), // but this condition is not that important. -// * Iteration -- hold DB mutex, but you can release it in the body of -// iteration. If you release DB mutex in body, reference the column -// family before the mutex and unreference after you unlock, since the column -// family might get dropped when the DB mutex is released +// * Iteration -- hold DB mutex. If you want to release the DB mutex in the +// body of the iteration, wrap in a RefedColumnFamilySet. // * GetDefault() -- thread safe // * GetColumnFamily() -- either inside of DB mutex or from a write thread // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), @@ -620,17 +641,12 @@ public: explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {} + // NOTE: minimum operators for for-loop iteration iterator& operator++() { - // dropped column families might still be included in this iteration - // (we're only removing them when client drops the last reference to the - // column family). 
- // dummy is never dead, so this will never be infinite - do { - current_ = current_->next_; - } while (current_->refs_.load(std::memory_order_relaxed) == 0); + current_ = current_->next_; return *this; } - bool operator!=(const iterator& other) { + bool operator!=(const iterator& other) const { return this->current_ != other.current_; } ColumnFamilyData* operator*() { return current_; } @@ -642,9 +658,11 @@ ColumnFamilySet(const std::string& dbname, const ImmutableDBOptions* db_options, const FileOptions& file_options, Cache* table_cache, - WriteBufferManager* write_buffer_manager, - WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer); + WriteBufferManager* _write_buffer_manager, + WriteController* _write_controller, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -667,12 +685,12 @@ iterator begin() { return iterator(dummy_cfd_->next_); } iterator end() { return iterator(dummy_cfd_); } - // REQUIRES: DB mutex held - // Don't call while iterating over ColumnFamilySet - void FreeDeadColumnFamilies(); - Cache* get_table_cache() { return table_cache_; } + WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } + + WriteController* write_controller() { return write_controller_; } + private: friend class ColumnFamilyData; // helper function that gets called from cfd destructor @@ -690,6 +708,8 @@ std::unordered_map column_family_data_; uint32_t max_column_family_; + const FileOptions file_options_; + ColumnFamilyData* dummy_cfd_; // We don't hold the refcount here, since default column family always exists // We are also not responsible for cleaning up default_cfd_cache_. 
This is @@ -699,11 +719,61 @@ const std::string db_name_; const ImmutableDBOptions* const db_options_; - const FileOptions file_options_; Cache* table_cache_; WriteBufferManager* write_buffer_manager_; WriteController* write_controller_; BlockCacheTracer* const block_cache_tracer_; + std::shared_ptr io_tracer_; + std::string db_session_id_; +}; + +// A wrapper for ColumnFamilySet that supports releasing DB mutex during each +// iteration over the iterator, because the cfd is Refed and Unrefed during +// each iteration to prevent concurrent CF drop from destroying it (until +// Unref). +class RefedColumnFamilySet { + public: + explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {} + + class iterator { + public: + explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) { + MaybeRef(*wrapped_); + } + ~iterator() { MaybeUnref(*wrapped_); } + inline void MaybeRef(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->Ref(); + } + } + inline void MaybeUnref(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->UnrefAndTryDelete(); + } + } + // NOTE: minimum operators for for-loop iteration + inline iterator& operator++() { + ColumnFamilyData* old = *wrapped_; + ++wrapped_; + // Can only unref & potentially free cfd after accessing its next_ + MaybeUnref(old); + MaybeRef(*wrapped_); + return *this; + } + inline bool operator!=(const iterator& other) const { + return this->wrapped_ != other.wrapped_; + } + inline ColumnFamilyData* operator*() { return *wrapped_; } + + private: + ColumnFamilySet::iterator wrapped_; + }; + + iterator begin() { return iterator(wrapped_->begin()); } + iterator end() { return iterator(wrapped_->end()); } + + private: + ColumnFamilySet* wrapped_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family_test.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/column_family_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/column_family_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,45 +8,37 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include -#include #include #include +#include #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" -#include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/utilities/object_registry.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" #include "util/string_util.h" +#include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { static const int kValueSize = 1000; -namespace { -std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} -} // anonymous namespace - // counts how many operations were performed -class EnvCounter : public EnvWrapper { +class EnvCounter : public SpecialEnv { public: explicit EnvCounter(Env* base) - : EnvWrapper(base), num_new_writable_file_(0) {} + : SpecialEnv(base), num_new_writable_file_(0) {} int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; } @@ -64,33 +56,30 @@ public: explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - 
EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); env_ = new EnvCounter(base_env); + env_->skip_fsync_ = true; dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; db_options_.env = env_; - DestroyDB(dbname_, Options(db_options_, column_family_options_)); + EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); } ~ColumnFamilyTestBase() override { std::vector column_families; for (auto h : handles_) { ColumnFamilyDescriptor cfdescriptor; - h->GetDescriptor(&cfdescriptor); + Status s = h->GetDescriptor(&cfdescriptor); +#ifdef ROCKSDB_LITE + EXPECT_TRUE(s.IsNotSupported()); +#else + EXPECT_OK(s); +#endif // ROCKSDB_LITE column_families.push_back(cfdescriptor); } - Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Destroy(column_families); delete env_; @@ -109,11 +98,11 @@ // preserves the implementation that was in place when all of the // magic values in this file were picked. 
*storage = std::string(kValueSize, ' '); - return Slice(*storage); } else { Random r(k); - return test::RandomString(&r, kValueSize, storage); + *storage = r.RandomString(kValueSize); } + return Slice(*storage); } void Build(int base, int n, int flush_every = 0) { @@ -122,7 +111,7 @@ for (int i = 0; i < n; i++) { if (flush_every != 0 && i != 0 && i % flush_every == 0) { - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); dbi->TEST_FlushMemTable(); } @@ -176,7 +165,7 @@ void Close() { for (auto h : handles_) { if (h) { - db_->DestroyColumnFamilyHandle(h); + ASSERT_OK(db_->DestroyColumnFamilyHandle(h)); } } handles_.clear(); @@ -190,8 +179,8 @@ std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); @@ -202,8 +191,8 @@ std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_, @@ -227,7 +216,7 @@ Open({"default"}); } - DBImpl* dbfull() { return reinterpret_cast(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_); } int GetProperty(int cf, std::string property) { std::string value; @@ -287,7 +276,11 @@ // Verify the CF options of the returned CF handle. 
ColumnFamilyDescriptor desc; ASSERT_OK(handles_[cfi]->GetDescriptor(&desc)); - RocksDBOptionsParser::VerifyCFOptions(desc.options, current_cf_opt); + // Need to sanitize the default column family options before comparing + // them. + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + ConfigOptions(), desc.options, + SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt))); #endif // !ROCKSDB_LITE cfi++; } @@ -313,7 +306,7 @@ void DropColumnFamilies(const std::vector& cfs) { for (auto cf : cfs) { ASSERT_OK(db_->DropColumnFamily(handles_[cf])); - db_->DestroyColumnFamilyHandle(handles_[cf]); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[cf])); handles_[cf] = nullptr; names_[cf] = ""; } @@ -327,14 +320,14 @@ // 10 bytes for key, rest is value if (!save) { ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11), - RandomString(&rnd_, key_value_size - 10))); + rnd_.RandomString(key_value_size - 10))); } else { std::string key = test::RandomKey(&rnd_, 11); keys_[cf].insert(key); - ASSERT_OK(Put(cf, key, RandomString(&rnd_, key_value_size - 10))); + ASSERT_OK(Put(cf, key, rnd_.RandomString(key_value_size - 10))); } } - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(/*sync=*/false)); } #ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite @@ -561,14 +554,14 @@ INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest, testing::Values(test::kDefaultFormatVersion)); INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest, - testing::Values(test::kLatestFormatVersion)); + testing::Values(kLatestFormatVersion)); TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) { for (int iter = 0; iter < 3; ++iter) { Open(); CreateColumnFamilies({"one", "two", "three"}); for (size_t i = 0; i < handles_.size(); ++i) { - auto cfh = reinterpret_cast(handles_[i]); + auto cfh = static_cast_with_check(handles_[i]); ASSERT_EQ(i, cfh->GetID()); } if (iter == 1) { @@ -584,7 +577,7 @@ CreateColumnFamilies({"three2"}); // ID 3 that was used for dropped column family "three" should not 
be // reused - auto cfh3 = reinterpret_cast(handles_[3]); + auto cfh3 = static_cast_with_check(handles_[3]); ASSERT_EQ(4U, cfh3->GetID()); Close(); Destroy(); @@ -652,11 +645,11 @@ // after flushing file B is deleted. At the same time, the min log number of // default CF is not written to manifest. Log file A still remains. // Flushed to SST file Y. - Flush(1); - Flush(0); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ASSERT_OK(Put(1, "bar", "v3")); // seqID 4 ASSERT_OK(Put(1, "foo", "v4")); // seqID 5 - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(/*sync=*/false)); // Preserve file system state up to here to simulate a crash condition. fault_env->SetFilesystemActive(false); @@ -707,19 +700,19 @@ // and is set to current. Both CFs' min log number is set to file C so after // flushing file B is deleted. Log file A still remains. // Flushed to SST file Y. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(0, "bar", "v2")); // seqID 4 ASSERT_OK(Put(2, "bar", "v2")); // seqID 5 ASSERT_OK(Put(1, "bar", "v3")); // seqID 6 // Flushing all column families. This forces all CFs' min log to current. This // is written to the manifest file. Log file C is cleared. - Flush(0); - Flush(1); - Flush(2); + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(2)); // Write to log file D ASSERT_OK(Put(1, "bar", "v4")); // seqID 7 ASSERT_OK(Put(1, "bar", "v5")); // seqID 8 - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(/*sync=*/false)); // Preserve file system state up to here to simulate a crash condition. 
fault_env->SetFilesystemActive(false); std::vector names; @@ -753,8 +746,8 @@ std::make_tuple(test::kDefaultFormatVersion, false))); INSTANTIATE_TEST_CASE_P( FormatLatest, FlushEmptyCFTestWithParam, - testing::Values(std::make_tuple(test::kLatestFormatVersion, true), - std::make_tuple(test::kLatestFormatVersion, false))); + testing::Values(std::make_tuple(kLatestFormatVersion, true), + std::make_tuple(kLatestFormatVersion, false))); TEST_P(ColumnFamilyTest, AddDrop) { Open(); @@ -821,7 +814,7 @@ } TEST_P(ColumnFamilyTest, DropTest) { - // first iteration - dont reopen DB before dropping + // first iteration - don't reopen DB before dropping // second iteration - reopen DB before dropping for (int iter = 0; iter < 2; ++iter) { Open({"default"}); @@ -848,13 +841,15 @@ Open(); CreateColumnFamiliesAndReopen({"one", "two"}); WriteBatch batch; - batch.Put(handles_[0], Slice("existing"), Slice("column-family")); - batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); + ASSERT_OK(batch.Put(handles_[0], Slice("existing"), Slice("column-family"))); + ASSERT_OK( + batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); DropColumnFamilies({1}); WriteOptions woptions_ignore_missing_cf; woptions_ignore_missing_cf.ignore_missing_column_families = true; - batch.Put(handles_[0], Slice("still here"), Slice("column-family")); + ASSERT_OK( + batch.Put(handles_[0], Slice("still here"), Slice("column-family"))); ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch)); ASSERT_EQ("column-family", Get(0, "still here")); Status s = db_->Write(WriteOptions(), &batch); @@ -893,11 +888,9 @@ ASSERT_OK(env_->CreateDirIfMissing(dbname_)); ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); std::vector old_files; - env_->GetChildren(backup_logs, &old_files); + ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." 
&& file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } column_family_options_.merge_operator = @@ -924,11 +917,9 @@ // copy the logs to backup std::vector logs; - env_->GetChildren(db_options_.wal_dir, &logs); + ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -953,9 +944,7 @@ if (iter == 0) { // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); } } } @@ -982,13 +971,14 @@ for (int i = 0; i < 3; ++i) { uint64_t max_total_in_memory_state = MaxTotalInMemoryState(); - Flush(i); + ASSERT_OK(Flush(i)); AssertMaxTotalInMemoryState(max_total_in_memory_state); } ASSERT_OK(Put(1, "foofoo", "bar")); ASSERT_OK(Put(0, "foofoo", "bar")); for (auto* it : iterators) { + ASSERT_OK(it->status()); delete it; } } @@ -1086,10 +1076,10 @@ CreateColumnFamilies({"one"}); WriteBatch batch; - batch.Put(handles_[0], Slice("foo"), Slice("bar")); - batch.Put(handles_[1], Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar"))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); - Flush(0); + ASSERT_OK(Flush(0)); fault_env->SetFilesystemActive(false); std::vector names; @@ -1099,7 +1089,7 @@ } } Close(); - fault_env->DropUnsyncedFileData(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); fault_env->ResetState(); Open(names, {}); @@ -2073,6 +2063,7 @@ if (iter->Valid()) { result = iter->key().ToString() + "->" + iter->value().ToString(); } else { + EXPECT_OK(iter->status()); result = "(invalid)"; } 
return result; @@ -2231,7 +2222,7 @@ // files for column family [one], because it's empty AssertCountLiveFiles(4); - Flush(0); + ASSERT_OK(Flush(0)); ASSERT_EQ(0, dbfull()->TEST_total_log_size()); Close(); } @@ -2287,6 +2278,8 @@ // not a multiple of 4k, round up 4k expected_arena_block_size += 4 * 1024; } + expected_arena_block_size = + std::min(size_t{1024 * 1024}, expected_arena_block_size); ASSERT_EQ(expected_arena_block_size, result.arena_block_size); } } @@ -2327,7 +2320,7 @@ ASSERT_OK(db_->DropColumnFamily(handles_[2])); } else { // delete CF two - db_->DestroyColumnFamilyHandle(handles_[2]); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[2])); handles_[2] = nullptr; } // Make sure iterator created can still be used. @@ -2383,7 +2376,6 @@ // 1MB should create ~10 files for each CF int kKeysNum = 10000; PutRandomData(1, kKeysNum, 100); - { std::unique_ptr iterator( db_->NewIterator(ReadOptions(), handles_[1])); @@ -2430,6 +2422,9 @@ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. + sleeping_task.WaitUntilSleeping(); // 1MB should create ~10 files for each CF int kKeysNum = 10000; @@ -2444,6 +2439,9 @@ // now we sleep again. this is just so we're certain that flush job finished env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. 
+ sleeping_task.WaitUntilSleeping(); sleeping_task.WakeUp(); sleeping_task.WaitUntilDone(); @@ -2977,7 +2975,8 @@ SpecialEnv env(Env::Default()); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); @@ -2993,6 +2992,9 @@ test::SleepingBackgroundTask sleeping_task; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. + sleeping_task.WaitUntilSleeping(); WriteOptions wo; wo.sync = true; @@ -3019,14 +3021,16 @@ SpecialEnv env(Env::Default()); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); // Create an iterator holding the current super version. Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3038,6 +3042,9 @@ test::SleepingBackgroundTask sleeping_task; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task, Env::Priority::HIGH); + // Make sure the task is sleeping. Otherwise, it might start to execute + // after sleeping_task.WaitUntilDone() and cause TSAN warning. 
+ sleeping_task.WaitUntilSleeping(); WriteOptions wo; wo.sync = true; @@ -3066,7 +3073,8 @@ env.SetBackgroundThreads(2, Env::HIGH); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(2)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(2)); Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); @@ -3074,8 +3082,9 @@ ReadOptions ro; ro.background_purge_on_iterator_cleanup = true; Iterator* it = db_->NewIterator(ro, handles_[1]); + ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3123,13 +3132,14 @@ env.SetBackgroundThreads(2, Env::HIGH); db_options_.env = &env; db_options_.max_background_flushes = 1; - column_family_options_.memtable_factory.reset(new SpecialSkipListFactory(3)); + column_family_options_.memtable_factory.reset( + test::NewSpecialSkipListFactory(3)); column_family_options_.level0_file_num_compaction_trigger = 2; Open(); CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); // Create an iterator holding the current super version, as well as // the SST file just flushed. 
@@ -3141,7 +3151,7 @@ ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); WaitForCompaction(); @@ -3168,6 +3178,8 @@ // Deleting the iterator will clear its super version, triggering // closing all files it->Seek(""); + ASSERT_OK(it->status()); + ASSERT_EQ(2, env.num_open_wal_file_.load()); ASSERT_EQ(0, env.delete_count_.load()); @@ -3198,8 +3210,8 @@ Open(); CreateColumnFamiliesAndReopen({"one", "two"}); - Put(0, "", ""); - Put(1, "foo", "bar"); + ASSERT_OK(Put(0, "", "")); + ASSERT_OK(Put(1, "foo", "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1", @@ -3209,12 +3221,12 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - ROCKSDB_NAMESPACE::port::Thread thread([&] { db_->SyncWAL(); }); + ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); }); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1"); - Flush(1); - Put(1, "foo", "bar"); - Flush(1); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Flush(1)); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2"); @@ -3236,7 +3248,7 @@ Build(0, 100); // Flush the 0th column family to force a roll of the wal log - Flush(0); + ASSERT_OK(Flush(0)); // Add some more entries Build(100, 100); @@ -3251,7 +3263,7 @@ FileType type; if (!(ParseFileName(filenames[i], &number, &type))) continue; - if (type != kLogFile) continue; + if (type != kWalFile) continue; logfs.push_back(filenames[i]); } @@ -3296,7 +3308,7 @@ Close(); // cleanup - env_->DeleteDir(backup_logs); + ASSERT_OK(env_->DeleteDir(backup_logs)); } TEST_P(ColumnFamilyTest, DefaultCfPathsTest) { @@ -3312,14 +3324,14 @@ // Fill Column family 1. 
PutRandomData(1, 100, 100); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); // Fill column family 2 PutRandomData(2, 100, 100); - Flush(2); + ASSERT_OK(Flush(2)); // SST from Column family 2 should be generated in // db_paths which is dbname_ in this case. @@ -3338,29 +3350,31 @@ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); PutRandomData(1, 100, 100, true /* save */); - Flush(1); + ASSERT_OK(Flush(1)); // Check that files are generated in appropriate paths. ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); PutRandomData(2, 100, 100, true /* save */); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); // Re-open and verify the keys. Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); for (int cf = 1; cf != 3; ++cf) { ReadOptions read_options; read_options.readahead_size = 0; auto it = dbi->NewIterator(read_options, handles_[cf]); for (it->SeekToFirst(); it->Valid(); it->Next()) { + ASSERT_OK(it->status()); Slice key(it->key()); ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString())); } + ASSERT_OK(it->status()); delete it; for (const auto& key : keys_[cf]) { @@ -3369,15 +3383,55 @@ } } -} // namespace ROCKSDB_NAMESPACE +TEST(ColumnFamilyTest, ValidateBlobGCCutoff) { + DBOptions db_options; -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_age_cutoff = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_age_cutoff = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, 
cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_force_threshold = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_force_threshold = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); +} + +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compact_files_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compact_files_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compact_files_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compact_files_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,6 +16,7 @@ #include "rocksdb/env.h" #include 
"test_util/sync_point.h" #include "test_util/testharness.h" +#include "util/cast_util.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -90,9 +91,9 @@ // create couple files // Background compaction starts and waits in BackgroundCallCompaction:0 for (int i = 0; i < kLevel0Trigger * 4; ++i) { - db->Put(WriteOptions(), ToString(i), ""); - db->Put(WriteOptions(), ToString(100 - i), ""); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Put(WriteOptions(), ToString(100 - i), "")); + ASSERT_OK(db->Flush(FlushOptions())); } ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -117,6 +118,78 @@ delete db; } +TEST_F(CompactFilesTest, MultipleLevel) { + Options options; + options.create_if_missing = true; + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = 6; + // Add listener + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + + // create couple files in L0, L3, L4 and L5 + for (int i = 5; i > 2; --i) { + collector->ClearFlushedFiles(); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Flush(FlushOptions())); + auto l0_files = collector->GetFlushedFiles(); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i)); + + std::string prop; + ASSERT_TRUE( + db->GetProperty("rocksdb.num-files-at-level" + ToString(i), &prop)); + ASSERT_EQ("1", prop); + } + ASSERT_OK(db->Put(WriteOptions(), ToString(0), "")); + ASSERT_OK(db->Flush(FlushOptions())); + + ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + // Compact files except the file in L3 + std::vector files; + for (int i = 0; i < 6; ++i) { + if (i == 3) continue; + for (auto& file : meta.levels[i].files) { + files.push_back(file.db_path + "/" + file.name); + } + } + + 
SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"}, + {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread thread([&] { + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0"); + ASSERT_OK(db->Put(WriteOptions(), "bar", "v2")); + ASSERT_OK(db->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db->Flush(FlushOptions())); + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1"); + }); + + // Compaction cannot move up the data to higher level + // here we have input file from level 5, so the output level has to be >= 5 + for (int invalid_output_level = 0; invalid_output_level < 5; + invalid_output_level++) { + s = db->CompactFiles(CompactionOptions(), files, invalid_output_level); + std::cout << s.ToString() << std::endl; + ASSERT_TRUE(s.IsInvalidArgument()); + } + + ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5)); + SyncPoint::GetInstance()->DisableProcessing(); + thread.join(); + + delete db; +} + TEST_F(CompactFilesTest, ObsoleteFiles) { Options options; // to trigger compaction more easily @@ -137,18 +210,18 @@ DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); - assert(db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); // create couple files for (int i = 1000; i < 2000; ++i) { - db->Put(WriteOptions(), ToString(i), - std::string(kWriteBufferSize / 10, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(kWriteBufferSize / 10, 'a' + (i % 26)))); } auto l0_files = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); - reinterpret_cast(db)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForCompact()); // verify all compaction input files are deleted for (auto fname : l0_files) { @@ -181,15 +254,17 @@ // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), 
ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - reinterpret_cast(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); collector->ClearFlushedFiles(); for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - reinterpret_cast(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_2 = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0)); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0)); @@ -212,13 +287,13 @@ DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // Create 5 files. for (int i = 0; i < 5; ++i) { - db->Put(WriteOptions(), "key" + ToString(i), "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key" + ToString(i), "value")); + ASSERT_OK(db->Flush(FlushOptions())); } auto l0_files = collector->GetFlushedFiles(); @@ -236,8 +311,8 @@ // In the meantime flush another file. TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0"); - db->Put(WriteOptions(), "key5", "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key5", "value")); + ASSERT_OK(db->Flush(FlushOptions())); TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1"); compaction_thread.join(); @@ -248,7 +323,7 @@ // Make sure we can reopen the DB. 
s = DB::Open(options, db_name_, &db); - ASSERT_TRUE(s.ok()); + ASSERT_OK(s); assert(db); delete db; } @@ -292,8 +367,8 @@ cf->SetDB(db); // Write one L0 file - db->Put(WriteOptions(), "K1", "V1"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "K1", "V1")); + ASSERT_OK(db->Flush(FlushOptions())); // Compact all L0 files using CompactFiles ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -336,8 +411,8 @@ DB* db = nullptr; ASSERT_OK(DB::Open(options, db_name_, &db)); - db->Put(WriteOptions(), "key", "val"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db->Flush(FlushOptions())); auto l0_files = collector->GetFlushedFiles(); ASSERT_EQ(1, l0_files.size()); @@ -376,14 +451,15 @@ DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - reinterpret_cast(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); CompactionOptions co; co.compression = CompressionType::kLZ4Compression; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,160 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#ifndef ROCKSDB_LITE -#include "db/compacted_db_impl.h" -#include "db/db_impl/db_impl.h" -#include "db/version_set.h" -#include "table/get_context.h" - -namespace ROCKSDB_NAMESPACE { - -extern void MarkKeyMayExist(void* arg); -extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v, bool hit_and_return); - -CompactedDBImpl::CompactedDBImpl( - const DBOptions& options, const std::string& dbname) - : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr), - user_comparator_(nullptr) { -} - -CompactedDBImpl::~CompactedDBImpl() { -} - -size_t CompactedDBImpl::FindFile(const Slice& key) { - size_t right = files_.num_files - 1; - auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { - return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0; - }; - return static_cast(std::lower_bound(files_.files, - files_.files + right, key, cmp) - files_.files); -} - -Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, - const Slice& key, PinnableSlice* value) { - GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, - GetContext::kNotFound, key, value, nullptr, nullptr, - true, nullptr, nullptr); - LookupKey lkey(key, kMaxSequenceNumber); - files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(), - &get_context, nullptr); - if (get_context.State() == GetContext::kFound) { - return Status::OK(); - } - return Status::NotFound(); -} - -std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, - const std::vector&, - const std::vector& keys, std::vector* values) { - autovector reader_list; - for (const auto& key : keys) { - const FdWithKeyRange& f = files_.files[FindFile(key)]; - if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) { - reader_list.push_back(nullptr); - } else { - LookupKey lkey(key, kMaxSequenceNumber); - f.fd.table_reader->Prepare(lkey.internal_key()); - reader_list.push_back(f.fd.table_reader); - } - } - std::vector 
statuses(keys.size(), Status::NotFound()); - values->resize(keys.size()); - int idx = 0; - for (auto* r : reader_list) { - if (r != nullptr) { - PinnableSlice pinnable_val; - std::string& value = (*values)[idx]; - GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, - GetContext::kNotFound, keys[idx], &pinnable_val, - nullptr, nullptr, true, nullptr, nullptr); - LookupKey lkey(keys[idx], kMaxSequenceNumber); - r->Get(options, lkey.internal_key(), &get_context, nullptr); - value.assign(pinnable_val.data(), pinnable_val.size()); - if (get_context.State() == GetContext::kFound) { - statuses[idx] = Status::OK(); - } - } - ++idx; - } - return statuses; -} - -Status CompactedDBImpl::Init(const Options& options) { - SuperVersionContext sv_context(/* create_superversion */ true); - mutex_.Lock(); - ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, - ColumnFamilyOptions(options)); - Status s = Recover({cf}, true /* read only */, false, true); - if (s.ok()) { - cfd_ = reinterpret_cast( - DefaultColumnFamily())->cfd(); - cfd_->InstallSuperVersion(&sv_context, &mutex_); - } - mutex_.Unlock(); - sv_context.Clean(); - if (!s.ok()) { - return s; - } - NewThreadStatusCfInfo(cfd_); - version_ = cfd_->GetSuperVersion()->current; - user_comparator_ = cfd_->user_comparator(); - auto* vstorage = version_->storage_info(); - if (vstorage->num_non_empty_levels() == 0) { - return Status::NotSupported("no file exists"); - } - const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0); - // L0 should not have files - if (l0.num_files > 1) { - return Status::NotSupported("L0 contain more than 1 file"); - } - if (l0.num_files == 1) { - if (vstorage->num_non_empty_levels() > 1) { - return Status::NotSupported("Both L0 and other level contain files"); - } - files_ = l0; - return Status::OK(); - } - - for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) { - if (vstorage->LevelFilesBrief(i).num_files > 0) { - return Status::NotSupported("Other levels also contain files"); 
- } - } - - int level = vstorage->num_non_empty_levels() - 1; - if (vstorage->LevelFilesBrief(level).num_files > 0) { - files_ = vstorage->LevelFilesBrief(level); - return Status::OK(); - } - return Status::NotSupported("no file exists"); -} - -Status CompactedDBImpl::Open(const Options& options, - const std::string& dbname, DB** dbptr) { - *dbptr = nullptr; - - if (options.max_open_files != -1) { - return Status::InvalidArgument("require max_open_files = -1"); - } - if (options.merge_operator.get() != nullptr) { - return Status::InvalidArgument("merge operator is not supported"); - } - DBOptions db_options(options); - std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); - Status s = db->Init(options); - if (s.ok()) { - db->StartTimedTasks(); - ROCKS_LOG_INFO(db->immutable_db_options_.info_log, - "Opened the db as fully compacted mode"); - LogFlush(db->immutable_db_options_.info_log); - *dbptr = db.release(); - } - return s; -} - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compacted_db_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compacted_db_impl.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,113 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#pragma once -#ifndef ROCKSDB_LITE -#include -#include -#include "db/db_impl/db_impl.h" - -namespace ROCKSDB_NAMESPACE { - -class CompactedDBImpl : public DBImpl { - public: - CompactedDBImpl(const DBOptions& options, const std::string& dbname); - // No copying allowed - CompactedDBImpl(const CompactedDBImpl&) = delete; - void operator=(const CompactedDBImpl&) = delete; - - virtual ~CompactedDBImpl(); - - static Status Open(const Options& options, const std::string& dbname, - DB** dbptr); - - // Implementations of the DB interface - using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; - using DB::MultiGet; - virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector&, - const std::vector& keys, std::vector* values) - override; - - using DBImpl::Put; - virtual Status Put(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::Merge; - virtual Status Merge(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::Delete; - virtual Status Delete(const WriteOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice& /*key*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/, - const Slice* /*begin*/, - const Slice* /*end*/) override { - return 
Status::NotSupported("Not supported in compacted db mode."); - } - - virtual Status DisableFileDeletions() override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status EnableFileDeletions(bool /*force*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - virtual Status GetLiveFiles(std::vector& ret, - uint64_t* manifest_file_size, - bool /*flush_memtable*/) override { - return DBImpl::GetLiveFiles(ret, manifest_file_size, - false /* flush_memtable */); - } - using DBImpl::Flush; - virtual Status Flush(const FlushOptions& /*options*/, - ColumnFamilyHandle* /*column_family*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DB::IngestExternalFile; - virtual Status IngestExternalFile( - ColumnFamilyHandle* /*column_family*/, - const std::vector& /*external_files*/, - const IngestExternalFileOptions& /*ingestion_options*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - using DB::CreateColumnFamilyWithImport; - virtual Status CreateColumnFamilyWithImport( - const ColumnFamilyOptions& /*options*/, - const std::string& /*column_family_name*/, - const ImportColumnFamilyOptions& /*import_options*/, - const ExportImportFilesMetaData& /*metadata*/, - ColumnFamilyHandle** /*handle*/) override { - return Status::NotSupported("Not supported in compacted db mode."); - } - - private: - friend class DB; - inline size_t FindFile(const Slice& key); - Status Init(const Options& options); - - ColumnFamilyData* cfd_; - Version* version_; - const Comparator* user_comparator_; - LevelFilesBrief files_; -}; -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h 1970-01-01 00:00:00.000000000 
+0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,275 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that wraps another one and ensures that any keys +// returned are strictly within a range [start, end). If the underlying +// iterator has already performed the bounds checking, it relies on that result; +// otherwise, it performs the necessary key comparisons itself. Both bounds +// are optional. +class ClippingIterator : public InternalIterator { + public: + ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end, + const Comparator* cmp) + : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) { + assert(iter_); + assert(cmp_); + assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0); + + UpdateAndEnforceBounds(); + } + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + if (start_) { + iter_->Seek(*start_); + } else { + iter_->SeekToFirst(); + } + + UpdateAndEnforceUpperBound(); + } + + void SeekToLast() override { + if (end_) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + } else { + iter_->SeekToLast(); + } + + UpdateAndEnforceLowerBound(); + } + + void Seek(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + iter_->Seek(*start_); + UpdateAndEnforceUpperBound(); + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + valid_ = false; + return; + } + + iter_->Seek(target); + 
UpdateAndEnforceUpperBound(); + } + + void SeekForPrev(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + valid_ = false; + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + + UpdateAndEnforceLowerBound(); + return; + } + + iter_->SeekForPrev(target); + UpdateAndEnforceLowerBound(); + } + + void Next() override { + assert(valid_); + iter_->Next(); + UpdateAndEnforceUpperBound(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(valid_); + assert(result); + + IterateResult res; + valid_ = iter_->NextAndGetResult(&res); + + if (!valid_) { + return false; + } + + if (end_) { + EnforceUpperBoundImpl(res.bound_check_result); + + if (!valid_) { + return false; + } + } + + res.bound_check_result = IterBoundCheck::kInbound; + *result = res; + + return true; + } + + void Prev() override { + assert(valid_); + iter_->Prev(); + UpdateAndEnforceLowerBound(); + } + + Slice key() const override { + assert(valid_); + return iter_->key(); + } + + Slice user_key() const override { + assert(valid_); + return iter_->user_key(); + } + + Slice value() const override { + assert(valid_); + return iter_->value(); + } + + Status status() const override { return iter_->status(); } + + bool PrepareValue() override { + assert(valid_); + + if (iter_->PrepareValue()) { + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } + + bool MayBeOutOfLowerBound() override { + assert(valid_); + return false; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(valid_); + return IterBoundCheck::kInbound; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(valid_); + 
return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(valid_); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateValid() { + assert(!iter_->Valid() || iter_->status().ok()); + + valid_ = iter_->Valid(); + } + + void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) { + if (bound_check_result == IterBoundCheck::kInbound) { + return; + } + + if (bound_check_result == IterBoundCheck::kOutOfBound) { + valid_ = false; + return; + } + + assert(bound_check_result == IterBoundCheck::kUnknown); + + if (cmp_->Compare(key(), *end_) >= 0) { + valid_ = false; + } + } + + void EnforceUpperBound() { + if (!valid_) { + return; + } + + if (!end_) { + return; + } + + EnforceUpperBoundImpl(iter_->UpperBoundCheckResult()); + } + + void EnforceLowerBound() { + if (!valid_) { + return; + } + + if (!start_) { + return; + } + + if (!iter_->MayBeOutOfLowerBound()) { + return; + } + + if (cmp_->Compare(key(), *start_) < 0) { + valid_ = false; + } + } + + void AssertBounds() { + assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0); + assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0); + } + + void UpdateAndEnforceBounds() { + UpdateValid(); + EnforceUpperBound(); + EnforceLowerBound(); + AssertBounds(); + } + + void UpdateAndEnforceUpperBound() { + UpdateValid(); + EnforceUpperBound(); + AssertBounds(); + } + + void UpdateAndEnforceLowerBound() { + UpdateValid(); + EnforceLowerBound(); + AssertBounds(); + } + + InternalIterator* iter_; + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; + bool valid_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/clipping_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,258 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/compaction/clipping_iterator.h" + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/vector_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// A vector iterator which does its own bounds checking. This is for testing the +// optimizations in the clipping iterator where we bypass the bounds checking if +// the input iterator has already performed it. +class BoundsCheckingVectorIterator : public VectorIterator { + public: + BoundsCheckingVectorIterator(const std::vector& keys, + const std::vector& values, + const Slice* start, const Slice* end, + const Comparator* cmp) + : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) { + assert(cmp_); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + assert(result); + + Next(); + + if (!Valid()) { + return false; + } + + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = true; + + return true; + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + + if (!start_) { + return false; + } + + return cmp_->Compare(key(), *start_) < 0; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + + if (!end_) { + return IterBoundCheck::kInbound; + } + + return cmp_->Compare(key(), *end_) >= 0 ? 
IterBoundCheck::kOutOfBound + : IterBoundCheck::kInbound; + } + + private: + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; +}; + +class ClippingIteratorTest + : public ::testing::Test, + public ::testing::WithParamInterface> {}; + +TEST_P(ClippingIteratorTest, Clip) { + const std::vector keys{"key0", "key1", "key2", "key3", "key4", + "key5", "key6", "key7", "key8", "key9"}; + const std::vector values{ + "unused0", "value1", "value2", "value3", "unused4", + "unused5", "unused6", "unused7", "unused8", "unused9"}; + + assert(keys.size() == values.size()); + + // Note: the input always contains key1, key2, and key3; however, the clipping + // window is based on the test parameters: its left edge is a value in the + // range [0, 4], and its size is a value in the range [0, 5] + const std::vector input_keys{keys[1], keys[2], keys[3]}; + const std::vector input_values{values[1], values[2], values[3]}; + + const bool use_bounds_checking_vec_it = std::get<0>(GetParam()); + + const size_t clip_start_idx = std::get<1>(GetParam()); + const size_t clip_window_size = std::get<2>(GetParam()); + const size_t clip_end_idx = clip_start_idx + clip_window_size; + + const Slice start(keys[clip_start_idx]); + const Slice end(keys[clip_end_idx]); + + std::unique_ptr input( + use_bounds_checking_vec_it + ? new BoundsCheckingVectorIterator(input_keys, input_values, &start, + &end, BytewiseComparator()) + : new VectorIterator(input_keys, input_values, BytewiseComparator())); + + ClippingIterator clip(input.get(), &start, &end, BytewiseComparator()); + + // The range the clipping iterator should return values from. 
This is + // essentially the intersection of the input range [1, 4) and the clipping + // window [clip_start_idx, clip_end_idx) + const size_t data_start_idx = + std::max(clip_start_idx, static_cast(1)); + const size_t data_end_idx = std::min(clip_end_idx, static_cast(4)); + + // Range is empty; all Seeks should fail + if (data_start_idx >= data_end_idx) { + clip.SeekToFirst(); + ASSERT_FALSE(clip.Valid()); + + clip.SeekToLast(); + ASSERT_FALSE(clip.Valid()); + + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + ASSERT_FALSE(clip.Valid()); + + clip.SeekForPrev(keys[i]); + ASSERT_FALSE(clip.Valid()); + } + + return; + } + + // Range is non-empty; call SeekToFirst and iterate forward + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + clip.Next(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Next(); + ASSERT_FALSE(clip.Valid()); + + // Do it again using NextAndGetResult + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + IterateResult result; + ASSERT_TRUE(clip.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[i]); + ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + 
ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + IterateResult result; + ASSERT_FALSE(clip.NextAndGetResult(&result)); + ASSERT_FALSE(clip.Valid()); + + // Call SeekToLast and iterate backward + clip.SeekToLast(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) { + clip.Prev(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Prev(); + ASSERT_FALSE(clip.Valid()); + + // Call Seek/SeekForPrev for all keys; Seek should return the smallest key + // which is >= the target; SeekForPrev should return the largest key which is + // <= the target + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + + if (i < data_start_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_FALSE(clip.Valid()); + } + + clip.SeekForPrev(keys[i]); + + if (i < data_start_idx) { + ASSERT_FALSE(clip.Valid()); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), 
IterBoundCheck::kInbound); + } else { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + } +} + +INSTANTIATE_TEST_CASE_P( + ClippingIteratorTest, ClippingIteratorTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Range(static_cast(0), static_cast(5)), + ::testing::Range(static_cast(0), static_cast(6)))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,12 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/compaction/compaction.h" + #include #include #include "db/column_family.h" -#include "db/compaction/compaction.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/sst_partitioner.h" #include "test_util/sync_point.h" #include "util/string_util.h" @@ -23,7 +25,7 @@ int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, const InternalKey& b) { - auto c = user_cmp->Compare(a.user_key(), b.user_key()); + auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key()); if (c != 0) { return c; } @@ -202,26 +204,24 @@ return num_files_in_compaction == total_num_files; } -Compaction::Compaction(VersionStorageInfo* vstorage, - const ImmutableCFOptions& _immutable_cf_options, - const MutableCFOptions& _mutable_cf_options, - std::vector _inputs, - int _output_level, uint64_t _target_file_size, - uint64_t _max_compaction_bytes, uint32_t _output_path_id, - CompressionType _compression, - CompressionOptions _compression_opts, - uint32_t _max_subcompactions, - std::vector _grandparents, - bool _manual_compaction, double _score, - bool _deletion_compaction, - CompactionReason _compaction_reason) +Compaction::Compaction( + VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options, + const MutableCFOptions& _mutable_cf_options, + const MutableDBOptions& _mutable_db_options, + std::vector _inputs, int _output_level, + uint64_t _target_file_size, uint64_t _max_compaction_bytes, + uint32_t _output_path_id, CompressionType _compression, + CompressionOptions _compression_opts, Temperature _output_temperature, + uint32_t _max_subcompactions, std::vector _grandparents, + bool _manual_compaction, double _score, bool _deletion_compaction, + CompactionReason _compaction_reason) : input_vstorage_(vstorage), start_level_(_inputs[0].level), output_level_(_output_level), max_output_file_size_(_target_file_size), max_compaction_bytes_(_max_compaction_bytes), max_subcompactions_(_max_subcompactions), - 
immutable_cf_options_(_immutable_cf_options), + immutable_options_(_immutable_options), mutable_cf_options_(_mutable_cf_options), input_version_(nullptr), number_levels_(vstorage->num_levels()), @@ -229,6 +229,7 @@ output_path_id_(_output_path_id), output_compression_(_compression), output_compression_opts_(_compression_opts), + output_temperature_(_output_temperature), deletion_compaction_(_deletion_compaction), inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), @@ -237,19 +238,14 @@ is_full_compaction_(IsFullCompaction(vstorage, inputs_)), is_manual_compaction_(_manual_compaction), is_trivial_move_(false), - compaction_reason_(_compaction_reason) { + compaction_reason_(_compaction_reason), + notify_on_compaction_completion_(false) { MarkFilesBeingCompacted(true); if (is_manual_compaction_) { compaction_reason_ = CompactionReason::kManualCompaction; } if (max_subcompactions_ == 0) { - max_subcompactions_ = immutable_cf_options_.max_subcompactions; - } - if (!bottommost_level_) { - // Currently we only enable dictionary compression during compaction to the - // bottommost level. 
- output_compression_opts_.max_dict_bytes = 0; - output_compression_opts_.zstd_max_train_bytes = 0; + max_subcompactions_ = _mutable_db_options.max_subcompactions; } #ifndef NDEBUG @@ -281,7 +277,7 @@ bool Compaction::InputCompressionMatchesOutput() const { int base_level = input_vstorage_->base_level(); - bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_, + bool matches = (GetCompressionType(immutable_options_, input_vstorage_, mutable_cf_options_, start_level_, base_level) == output_compression_); if (matches) { @@ -306,13 +302,19 @@ } if (is_manual_compaction_ && - (immutable_cf_options_.compaction_filter != nullptr || - immutable_cf_options_.compaction_filter_factory != nullptr)) { + (immutable_options_.compaction_filter != nullptr || + immutable_options_.compaction_filter_factory != nullptr)) { // This is a manual compaction and we have a compaction filter that should // be executed, we cannot do a trivial move return false; } + if (start_level_ == output_level_) { + // It doesn't make sense if compaction picker picks files just to trivial + // move to the same level. 
+ return false; + } + // Used in universal compaction, where trivial move can be done if the // input files are non overlapping if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && @@ -328,6 +330,8 @@ // assert inputs_.size() == 1 + std::unique_ptr partitioner = CreateSstPartitioner(); + for (const auto& file : inputs_.front().files) { std::vector file_grand_parents; if (output_level_ + 1 >= number_levels_) { @@ -340,6 +344,13 @@ if (compaction_size > max_compaction_bytes_) { return false; } + + if (partitioner.get() != nullptr) { + if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), + file->largest.user_key())) { + return false; + } + } } return true; @@ -371,7 +382,13 @@ auto* f = files[level_ptrs->at(lvl)]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // In the presence of user-defined timestamp, we may need to handle + // the case in which f->smallest.user_key() (including ts) has the + // same user key, but the ts part is smaller. If so, + // Compare(user_key, f->smallest.user_key()) returns -1. + // That's why we need CompareWithoutTimestamp(). + if (user_cmp->CompareWithoutTimestamp(user_key, + f->smallest.user_key()) >= 0) { // Key falls in this file's range, so it may // exist beyond output level return false; @@ -500,14 +517,14 @@ } if (max_output_file_size_ != port::kMaxUint64 && - (immutable_cf_options_.compaction_style == kCompactionStyleLevel || + (immutable_options_.compaction_style == kCompactionStyleLevel || output_level() > 0)) { preallocation_size = std::min(max_output_file_size_, preallocation_size); } // Over-estimate slightly so we don't end up just barely crossing // the threshold - // No point to prellocate more than 1GB. + // No point to preallocate more than 1GB. 
return std::min(uint64_t{1073741824}, preallocation_size + (preallocation_size / 10)); } @@ -517,14 +534,35 @@ return nullptr; } + if (!cfd_->ioptions() + ->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kCompaction)) { + return nullptr; + } + CompactionFilter::Context context; context.is_full_compaction = is_full_compaction_; context.is_manual_compaction = is_manual_compaction_; context.column_family_id = cfd_->GetID(); + context.reason = TableFileCreationReason::kCompaction; return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); } +std::unique_ptr Compaction::CreateSstPartitioner() const { + if (!immutable_options_.sst_partitioner_factory) { + return nullptr; + } + + SstPartitioner::Context context; + context.is_full_compaction = is_full_compaction_; + context.is_manual_compaction = is_manual_compaction_; + context.output_level = output_level_; + context.smallest_user_key = smallest_user_key_; + context.largest_user_key = largest_user_key_; + return immutable_options_.sst_partitioner_factory->CreatePartitioner(context); +} + bool Compaction::IsOutputLevelEmpty() const { return inputs_.back().level != output_level_ || inputs_.back().empty(); } @@ -533,6 +571,14 @@ if (max_subcompactions_ <= 1 || cfd_ == nullptr) { return false; } + + // Note: the subcompaction boundary picking logic does not currently guarantee + // that all user keys that differ only by timestamp get processed by the same + // subcompaction. 
+ if (cfd_->user_comparator()->timestamp_size() > 0) { + return false; + } + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && !IsOutputLevelEmpty(); @@ -543,10 +589,42 @@ } } -uint64_t Compaction::MinInputFileOldestAncesterTime() const { +bool Compaction::DoesInputReferenceBlobFiles() const { + assert(input_version_); + + const VersionStorageInfo* storage_info = input_version_->storage_info(); + assert(storage_info); + + if (storage_info->GetBlobFiles().empty()) { + return false; + } + + for (size_t i = 0; i < inputs_.size(); ++i) { + for (const FileMetaData* meta : inputs_[i].files) { + assert(meta); + + if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) { + return true; + } + } + } + + return false; +} + +uint64_t Compaction::MinInputFileOldestAncesterTime( + const InternalKey* start, const InternalKey* end) const { uint64_t min_oldest_ancester_time = port::kMaxUint64; + const InternalKeyComparator& icmp = + column_family_data()->internal_comparator(); for (const auto& level_files : inputs_) { for (const auto& file : level_files.files) { + if (start != nullptr && icmp.Compare(file->largest, *start) < 0) { + continue; + } + if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) { + continue; + } uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); if (oldest_ancester_time != 0) { min_oldest_ancester_time = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,7 @@ #include "db/version_set.h" #include "memory/arena.h" #include "options/cf_options.h" +#include "rocksdb/sst_partitioner.h" #include "util/autovector.h" namespace 
ROCKSDB_NAMESPACE { @@ -69,12 +70,14 @@ class Compaction { public: Compaction(VersionStorageInfo* input_version, - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, std::vector inputs, int output_level, uint64_t target_file_size, uint64_t max_compaction_bytes, uint32_t output_path_id, CompressionType compression, - CompressionOptions compression_opts, uint32_t max_subcompactions, + CompressionOptions compression_opts, + Temperature output_temperature, uint32_t max_subcompactions, std::vector grandparents, bool manual_compaction = false, double score = -1, bool deletion_compaction = false, @@ -160,7 +163,7 @@ CompressionType output_compression() const { return output_compression_; } // What compression options for output - CompressionOptions output_compression_opts() const { + const CompressionOptions& output_compression_opts() const { return output_compression_opts_; } @@ -221,10 +224,10 @@ // How many total levels are there? int number_levels() const { return number_levels_; } - // Return the ImmutableCFOptions that should be used throughout the compaction + // Return the ImmutableOptions that should be used throughout the compaction // procedure - const ImmutableCFOptions* immutable_cf_options() const { - return &immutable_cf_options_; + const ImmutableOptions* immutable_options() const { + return &immutable_options_; } // Return the MutableCFOptions that should be used throughout the compaction @@ -255,12 +258,20 @@ // Create a CompactionFilter from compaction_filter_factory std::unique_ptr CreateCompactionFilter() const; + // Create a SstPartitioner from sst_partitioner_factory + std::unique_ptr CreateSstPartitioner() const; + // Is the input level corresponding to output_level_ empty? bool IsOutputLevelEmpty() const; // Should this compaction be broken up into smaller ones run in parallel? 
bool ShouldFormSubcompactions() const; + // Returns true iff at least one input file references a blob file. + // + // PRE: input version has been set. + bool DoesInputReferenceBlobFiles() const; + // test function to validate the functionality of IsBottommostLevel() // function -- determines if compaction with inputs and storage is bottommost static bool TEST_IsBottommostLevel( @@ -289,9 +300,24 @@ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } + Temperature output_temperature() const { return output_temperature_; } + uint32_t max_subcompactions() const { return max_subcompactions_; } - uint64_t MinInputFileOldestAncesterTime() const; + // start and end are sub compact range. Null if no boundary. + // This is used to filter out some input files' ancester's time range. + uint64_t MinInputFileOldestAncesterTime(const InternalKey* start, + const InternalKey* end) const; + + // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of + // compaction begin and compaction completion callbacks match. + void SetNotifyOnCompactionCompleted() { + notify_on_compaction_completion_ = true; + } + + bool ShouldNotifyOnCompactionCompleted() const { + return notify_on_compaction_completion_; + } private: // mark (or clear) all files that are being compacted @@ -325,7 +351,7 @@ uint64_t max_output_file_size_; uint64_t max_compaction_bytes_; uint32_t max_subcompactions_; - const ImmutableCFOptions immutable_cf_options_; + const ImmutableOptions immutable_options_; const MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit edit_; @@ -336,7 +362,8 @@ const uint32_t output_path_id_; CompressionType output_compression_; CompressionOptions output_compression_opts_; - // If true, then the comaction can be done by simply deleting input files. + Temperature output_temperature_; + // If true, then the compaction can be done by simply deleting input files. const bool deletion_compaction_; // Compaction input files organized by level. 
Constant after construction @@ -376,6 +403,10 @@ // Reason for compaction CompactionReason compaction_reason_; + + // Notify on compaction completion only if listener was notified on compaction + // begin. + bool notify_on_compaction_completion_; }; // Return sum of sizes of all files in `files`. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iteration_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,12 @@ #pragma once +#include + #include "rocksdb/rocksdb_namespace.h" +namespace ROCKSDB_NAMESPACE { + struct CompactionIterationStats { // Compaction statistics @@ -34,4 +38,12 @@ // Single-Delete diagnostics for exceptional situations uint64_t num_single_del_fallthru = 0; uint64_t num_single_del_mismatch = 0; + + // Blob related statistics + uint64_t num_blobs_read = 0; + uint64_t total_blob_bytes_read = 0; + uint64_t num_blobs_relocated = 0; + uint64_t total_blob_bytes_relocated = 0; }; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,53 +3,48 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include - #include "db/compaction/compaction_iterator.h" + +#include +#include + +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_builder.h" +#include "db/blob/blob_index.h" +#include "db/blob/prefetch_buffer_collection.h" #include "db/snapshot_checker.h" +#include "logging/logging.h" #include "port/likely.h" #include "rocksdb/listener.h" #include "table/internal_iterator.h" #include "test_util/sync_point.h" -#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ - ((seq) <= (snapshot) && \ - (snapshot_checker_ == nullptr || \ - LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ - SnapshotCheckerResult::kInSnapshot))) - -#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \ - ((seq) > (snapshot) || \ - (snapshot_checker_ != nullptr && \ - UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ - SnapshotCheckerResult::kNotInSnapshot))) - -#define IN_EARLIEST_SNAPSHOT(seq) \ - ((seq) <= earliest_snapshot_ && \ - (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))) - namespace ROCKSDB_NAMESPACE { - CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction, - const CompactionFilter* compaction_filter, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + const Compaction* compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const 
std::string* full_history_ts_low) : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, report_detailed_time, expect_valid_internal_key, range_del_agg, + blob_file_builder, allow_data_in_errors, std::unique_ptr( - compaction ? new CompactionProxy(compaction) : nullptr), + compaction ? new RealCompaction(compaction) : nullptr), compaction_filter, shutting_down, preserve_deletes_seqnum, - manual_compaction_paused, info_log) {} + manual_compaction_paused, manual_compaction_canceled, info_log, + full_history_ts_low) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -58,36 +53,54 @@ const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) - : input_(input), + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const std::string* full_history_ts_low) + : input_(input, cmp, + !compaction || compaction->DoesInputReferenceBlobFiles()), cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), env_(env), + clock_(env_->GetSystemClock().get()), report_detailed_time_(report_detailed_time), expect_valid_internal_key_(expect_valid_internal_key), range_del_agg_(range_del_agg), + blob_file_builder_(blob_file_builder), compaction_(std::move(compaction)), compaction_filter_(compaction_filter), shutting_down_(shutting_down), 
manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), + info_log_(info_log), + allow_data_in_errors_(allow_data_in_errors), + timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0), + full_history_ts_low_(full_history_ts_low), current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), + blob_garbage_collection_cutoff_file_number_( + ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())), + blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())), + prefetch_buffers_( + CreatePrefetchBufferCollectionIfNeeded(compaction_.get())), current_key_committed_(false), - info_log_(info_log) { - assert(compaction_filter_ == nullptr || compaction_ != nullptr); + cmp_with_history_ts_low_(0), + level_(compaction_ == nullptr ? 0 : compaction_->level()) { assert(snapshots_ != nullptr); - bottommost_level_ = - compaction_ == nullptr ? false : compaction_->bottommost_level(); + bottommost_level_ = compaction_ == nullptr + ? 
false + : compaction_->bottommost_level() && + !compaction_->allow_ingest_behind(); if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } @@ -108,14 +121,16 @@ for (size_t i = 1; i < snapshots_->size(); ++i) { assert(snapshots_->at(i - 1) < snapshots_->at(i)); } + assert(timestamp_size_ == 0 || !full_history_ts_low_ || + timestamp_size_ == full_history_ts_low_->size()); #endif - input_->SetPinnedItersMgr(&pinned_iters_mgr_); + input_.SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } CompactionIterator::~CompactionIterator() { - // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime - input_->SetPinnedItersMgr(nullptr); + // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime + input_.SetPinnedItersMgr(nullptr); } void CompactionIterator::ResetRecordCounts() { @@ -142,14 +157,13 @@ if (merge_out_iter_.Valid()) { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - bool valid_key __attribute__((__unused__)); - valid_key = ParseInternalKey(key_, &ikey_); + Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. - assert(valid_key); - if (!valid_key) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. %s", + s.getState()); } // Keep current_key_ in sync. @@ -169,7 +183,7 @@ // Only advance the input iterator if there is no merge output and the // iterator is not already at the next record. 
if (!at_next_) { - input_->Next(); + AdvanceInputIter(); } NextFromInput(); } @@ -182,90 +196,191 @@ PrepareOutput(); } -void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, +bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until) { - if (compaction_filter_ != nullptr && - (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. If the return value of the compaction filter is true, - // replace the entry with a deletion marker. - CompactionFilter::Decision filter; - compaction_filter_value_.clear(); - compaction_filter_skip_until_.Clear(); - CompactionFilter::ValueType value_type = - ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : CompactionFilter::ValueType::kBlobIndex; - // Hack: pass internal key to BlobIndexCompactionFilter since it needs - // to get sequence number. - Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; - { - StopWatchNano timer(env_, report_detailed_time_); + if (!compaction_filter_ || + (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) { + return true; + } + bool error = false; + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. + CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. 
+ assert(compaction_filter_); + Slice& filter_key = + (ikey_.type == kTypeValue || + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) + ? ikey_.user_key + : key_; + { + StopWatchNano timer(clock_, report_detailed_time_); + if (kTypeBlobIndex == ikey_.type) { + blob_value_.Reset(); + filter = compaction_filter_->FilterBlobByKey( + level_, filter_key, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); + if (CompactionFilter::Decision::kUndetermined == filter && + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + if (compaction_ == nullptr) { + status_ = + Status::Corruption("Unexpected blob index outside of compaction"); + valid_ = false; + return false; + } + + // For integrated BlobDB impl, CompactionIterator reads blob value. + // For Stacked BlobDB impl, the corresponding CompactionFilter's + // FilterV2 method should read the blob value. + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value_); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher_); + + s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index, + prefetch_buffer, &blob_value_, + &bytes_read); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_type = CompactionFilter::ValueType::kValue; + } + } + if (CompactionFilter::Decision::kUndetermined == filter) { filter = compaction_filter_->FilterV2( - compaction_->level(), filter_key, value_type, value_, - &compaction_filter_value_, compaction_filter_skip_until_.rep()); - iter_stats_.total_filter_time += - env_ != nullptr && report_detailed_time_ ? 
timer.ElapsedNanos() : 0; - } - - if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && - cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= - 0) { - // Can't skip to a key smaller than the current one. - // Keep the key as per FilterV2 documentation. - filter = CompactionFilter::Decision::kKeep; - } - - if (filter == CompactionFilter::Decision::kRemove) { - // convert the current key to a delete; key_ is pointing into - // current_key_ at this point, so updating current_key_ updates key() - ikey_.type = kTypeDeletion; - current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); - // no value associated with delete - value_.clear(); - iter_stats_.num_record_drop_user++; - } else if (filter == CompactionFilter::Decision::kChangeValue) { - value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { - *need_skip = true; - compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, - kValueTypeForSeek); - *skip_until = compaction_filter_skip_until_.Encode(); + level_, filter_key, value_type, + blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); } + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } + + if (CompactionFilter::Decision::kUndetermined == filter) { + // Should not reach here, since FilterV2 should never return kUndetermined. + status_ = + Status::NotSupported("FilterV2() should never return kUndetermined"); + valid_ = false; + return false; + } + + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. 
+ filter = CompactionFilter::Decision::kKeep; } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + if (ikey_.type == kTypeBlobIndex) { + // value transfer from blob file to inlined data + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { + // Only the StackableDB-based BlobDB impl's compaction filter should return + // kChangeBlobIndex. Decision about rewriting blob and changing blob index + // in the integrated BlobDB impl is made in subsequent call to + // PrepareOutput() and its callees. 
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "Only stacked BlobDB's internal compaction filter can return " + "kChangeBlobIndex."); + valid_ = false; + return false; + } + if (ikey_.type == kTypeValue) { + // value transfer from inlined data to blob file + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kIOError) { + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "CompactionFilter for integrated BlobDB should not return kIOError"); + valid_ = false; + return false; + } + status_ = Status::IOError("Failed to access blob during compaction filter"); + error = true; + } + return !error; } void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; - while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + while (!valid_ && input_.Valid() && !IsPausingManualCompaction() && !IsShuttingDown()) { - key_ = input_->key(); - value_ = input_->value(); + key_ = input_.key(); + value_ = input_.value(); iter_stats_.num_input_records++; - if (!ParseInternalKey(key_, &ikey_)) { + Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + if (!pik_status.ok()) { + iter_stats_.num_input_corrupt_records++; + // If `expect_valid_internal_key_` is false, return the corrupted key // and let the caller decide what to do with it. - // TODO(noetzli): We should have a more elegant solution for this. 
if (expect_valid_internal_key_) { - assert(!"Corrupted internal key not expected."); - status_ = Status::Corruption("Corrupted internal key not expected."); - break; + status_ = pik_status; + return; } key_ = current_key_.SetInternalKey(key_); has_current_user_key_ = false; current_user_key_sequence_ = kMaxSequenceNumber; current_user_key_snapshot_ = 0; - iter_stats_.num_input_corrupt_records++; valid_ = true; break; } TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); // Update input statistics - if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || + ikey_.type == kTypeDeletionWithTimestamp) { iter_stats_.num_input_deletion_records++; } iter_stats_.total_input_raw_key_bytes += key_.size(); @@ -278,25 +393,71 @@ // merge_helper_->compaction_filter_skip_until_. Slice skip_until; + bool user_key_equal_without_ts = false; + int cmp_ts = 0; + if (has_current_user_key_) { + user_key_equal_without_ts = + cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_); + // if timestamp_size_ > 0, then curr_ts_ has been initialized by a + // previous key. + cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp( + ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_), + curr_ts_) + : 0; + } + // Check whether the user key changed. After this if statement current_key_ // is a copy of the current input key (maybe converted to a delete by the // compaction filter). ikey_.user_key is pointing to the copy. - if (!has_current_user_key_ || - !cmp_->Equal(ikey_.user_key, current_user_key_)) { + if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) { // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); + + int prev_cmp_with_ts_low = + !full_history_ts_low_ ? 0 + : curr_ts_.empty() + ? 
0 + : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_); + + // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use + // in next iteration to compare with the timestamp of next key. + UpdateTimestampAndCompareWithFullHistoryLow(); + + // If + // (1) !has_current_user_key_, OR + // (2) timestamp is disabled, OR + // (3) all history will be preserved, OR + // (4) user key (excluding timestamp) is different from previous key, OR + // (5) timestamp is NO older than *full_history_ts_low_, OR + // (6) timestamp is the largest one older than full_history_ts_low_, + // then current_user_key_ must be treated as a different user key. + // This means, if a user key (excluding ts) is the same as the previous + // user key, and its ts is older than *full_history_ts_low_, then we + // consider this key for GC, e.g. it may be dropped if certain conditions + // match. + if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ || + !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 || + prev_cmp_with_ts_low >= 0) { + // Initialize for future comparison for rule (A) and etc. + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + has_current_user_key_ = true; + } current_user_key_ = ikey_.user_key; - has_current_user_key_ = true; + has_outputted_key_ = false; - current_user_key_sequence_ = kMaxSequenceNumber; - current_user_key_snapshot_ = 0; + + last_key_seq_zeroed_ = false; + current_key_committed_ = KeyCommitted(ikey_.sequence); // Apply the compaction filter to the first committed version of the user // key. 
- if (current_key_committed_) { - InvokeFilterIfNeeded(&need_skip, &skip_until); + if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; } } else { // Update the current key to reflect the new sequence number/type without @@ -316,8 +477,9 @@ current_key_committed_ = KeyCommitted(ikey_.sequence); // Apply the compaction filter to the first committed version of the // user key. - if (current_key_committed_) { - InvokeFilterIfNeeded(&need_skip, &skip_until); + if (current_key_committed_ && + !InvokeFilterIfNeeded(&need_skip, &skip_until)) { + break; } } } @@ -331,8 +493,7 @@ // If there are no snapshots, then this kv affect visibility at tip. // Otherwise, search though all existing snapshots to find the earliest // snapshot that is affected by this kv. - SequenceNumber last_sequence __attribute__((__unused__)); - last_sequence = current_user_key_sequence_; + SequenceNumber last_sequence = current_user_key_sequence_; current_user_key_sequence_ = ikey_.sequence; SequenceNumber last_snapshot = current_user_key_snapshot_; SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot @@ -347,20 +508,25 @@ // In the previous iteration we encountered a single delete that we could // not compact out. We will keep this Put, but can drop it's data. // (See Optimization 3, below.) 
- assert(ikey_.type == kTypeValue); - if (ikey_.type != kTypeValue) { + assert(ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex); + if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex) { ROCKS_LOG_FATAL(info_log_, "Unexpected key type %d for compaction output", ikey_.type); } - assert(current_user_key_snapshot_ == last_snapshot); - if (current_user_key_snapshot_ != last_snapshot) { + assert(current_user_key_snapshot_ >= last_snapshot); + if (current_user_key_snapshot_ < last_snapshot) { ROCKS_LOG_FATAL(info_log_, "current_user_key_snapshot_ (%" PRIu64 - ") != last_snapshot (%" PRIu64 ")", + ") < last_snapshot (%" PRIu64 ")", current_user_key_snapshot_, last_snapshot); } + if (ikey_.type == kTypeBlobIndex) { + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_.clear(); valid_ = true; clear_and_output_next_key_ = false; @@ -372,6 +538,25 @@ // 2) We've already returned a record in this snapshot -OR- // there are no earlier earliest_write_conflict_snapshot. // + // A note about 2) above: + // we try to determine whether there is any earlier write conflict + // checking snapshot by calling DefinitelyInSnapshot() with seq and + // earliest_write_conflict_snapshot as arguments. For write-prepared + // and write-unprepared transactions, if earliest_write_conflict_snapshot + // is evicted from WritePreparedTxnDB::commit_cache, then + // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns + // false, even if the seq is actually visible within + // earliest_write_conflict_snapshot. Consequently, CompactionIterator + // may try to zero out its sequence number, thus hitting assertion error + // in debug mode or cause incorrect DBIter return result. + // We observe that earliest_write_conflict_snapshot >= earliest_snapshot, + // and the seq zeroing logic depends on + // DefinitelyInSnapshot(seq, earliest_snapshot). 
Therefore, if we cannot + // determine whether seq is **definitely** in + // earliest_write_conflict_snapshot, then we can additionally check if + // seq is definitely in earliest_snapshot. If the latter holds, then the + // former holds too. + // // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to // allow Transactions to do write-conflict checking (if we compacted away // all keys, then we wouldn't know that a write happened in this @@ -396,33 +581,78 @@ // we can choose how to handle such a combinations of operations. We will // try to compact out as much as we can in these cases. // We will report counts on these anomalous cases. + // + // Note: If timestamp is enabled, then record will be eligible for + // deletion, only if, along with above conditions (Rule 1 and Rule 2) + // full_history_ts_low_ is specified and timestamp for that key is less + // than *full_history_ts_low_. If it's not eligible for deletion, then we + // will output the SingleDelete. For Optimization 3 also, if + // full_history_ts_low_ is specified and timestamp for the key is less + // than *full_history_ts_low_ then only optimization will be applied. // The easiest way to process a SingleDelete during iteration is to peek // ahead at the next key. + const bool is_timestamp_eligible_for_gc = + (timestamp_size_ == 0 || + (full_history_ts_low_ && cmp_with_history_ts_low_ < 0)); + ParsedInternalKey next_ikey; - input_->Next(); + AdvanceInputIter(); // Check whether the next key exists, is not corrupt, and is the same key // as the single delete. - if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { - // Check whether the next key belongs to the same snapshot as the - // SingleDelete. - if (prev_snapshot == 0 || - DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) { - if (next_ikey.type == kTypeSingleDeletion) { - // We encountered two SingleDeletes in a row. 
This could be due to - // unexpected user input. - // Skip the first SingleDelete and let the next iteration decide how - // to handle the second SingleDelete + if (input_.Valid() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { +#ifndef NDEBUG + const Compaction* c = + compaction_ ? compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:1", + const_cast(c)); + if (last_key_seq_zeroed_) { + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); + AdvanceInputIter(); + } else if (prev_snapshot == 0 || + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) { + // Check whether the next key belongs to the same snapshot as the + // SingleDelete. + + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:2", nullptr); + if (next_ikey.type == kTypeSingleDeletion || + next_ikey.type == kTypeDeletion) { + // We encountered two SingleDeletes for same key in a row. This + // could be due to unexpected user input. If write-(un)prepared + // transaction is used, this could also be due to releasing an old + // snapshot between a Put and its matching SingleDelete. + // Furthermore, if write-(un)prepared transaction is rolled back + // after prepare, we will write a Delete to cancel a prior Put. If + // old snapshot is released between a later Put and its matching + // SingleDelete, we will end up with a Delete followed by + // SingleDelete. + // Skip the first SingleDelete and let the next iteration decide + // how to handle the second SingleDelete or Delete. // First SingleDelete has been skipped since we already called - // input_->Next(). + // input_.Next(). 
++iter_stats_.num_record_drop_obsolete; ++iter_stats_.num_single_del_mismatch; + } else if (!is_timestamp_eligible_for_gc) { + // We cannot drop the SingleDelete as timestamp is enabled, and + // timestamp of this key is greater than or equal to + // *full_history_ts_low_. We will output the SingleDelete. + valid_ = true; } else if (has_outputted_key_ || - DEFINITELY_IN_SNAPSHOT( - ikey_.sequence, earliest_write_conflict_snapshot_)) { + DefinitelyInSnapshot(ikey_.sequence, + earliest_write_conflict_snapshot_) || + (earliest_snapshot_ < earliest_write_conflict_snapshot_ && + DefinitelyInSnapshot(ikey_.sequence, + earliest_snapshot_))) { // Found a matching value, we can drop the single delete and the // value. It is safe to drop both records since we've already // outputted a key in this snapshot, or there is no earlier @@ -439,9 +669,9 @@ ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_obsolete; - // Already called input_->Next() once. Call it a second time to + // Already called input_.Next() once. Call it a second time to // skip past the second key. - input_->Next(); + AdvanceInputIter(); } else { // Found a matching value, but we cannot drop both keys since // there is an earlier snapshot and we need to leave behind a record @@ -455,11 +685,17 @@ // Set up the Put to be outputted in the next iteration. // (Optimization 3). clear_and_output_next_key_ = true; + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:KeepSDForWW", + /*arg=*/nullptr); } } else { // We hit the next snapshot without hitting a put, so the iterator // returns the single delete. valid_ = true; + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:SingleDelete:3", + const_cast(c)); } } else { // We are at the end of the input, could not parse the next key, or hit @@ -470,9 +706,11 @@ // iteration. If the next key is corrupt, we return before the // comparison, so the value of has_current_user_key does not matter. 
has_current_user_key_ = false; - if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && + if (compaction_ != nullptr && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, - &level_ptrs_)) { + &level_ptrs_) && + is_timestamp_eligible_for_gc) { // Key doesn't exist outside of this range. // Can compact out this SingleDelete. ++iter_stats_.num_record_drop_obsolete; @@ -480,6 +718,11 @@ if (!bottommost_level_) { ++iter_stats_.num_optimized_del_drop_obsolete; } + } else if (last_key_seq_zeroed_) { + // Skip. + ++iter_stats_.num_record_drop_hidden; + ++iter_stats_.num_record_drop_obsolete; + assert(bottommost_level_); } else { // Output SingleDelete valid_ = true; @@ -508,10 +751,13 @@ last_sequence, current_user_key_sequence_); } - ++iter_stats_.num_record_drop_hidden; // (A) - input_->Next(); - } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion && - IN_EARLIEST_SNAPSHOT(ikey_.sequence) && + ++iter_stats_.num_record_drop_hidden; // rule (A) + AdvanceInputIter(); + } else if (compaction_ != nullptr && + (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && ikeyNotNeededForIncrementalSnapshot() && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_)) { @@ -534,30 +780,54 @@ // given that: // (1) The deletion is earlier than earliest_write_conflict_snapshot, and // (2) No value exist earlier than the deletion. + // + // Note also that a deletion marker of type kTypeDeletionWithTimestamp + // will be treated as a different user key unless the timestamp is older + // than *full_history_ts_low_. 
++iter_stats_.num_record_drop_obsolete; if (!bottommost_level_) { ++iter_stats_.num_optimized_del_drop_obsolete; } - input_->Next(); - } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && - ikeyNotNeededForIncrementalSnapshot()) { + AdvanceInputIter(); + } else if ((ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + bottommost_level_ && ikeyNotNeededForIncrementalSnapshot()) { // Handle the case where we have a delete key at the bottom most level // We can skip outputting the key iff there are no subsequent puts for this // key + assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( + ikey_.user_key, &level_ptrs_)); ParsedInternalKey next_ikey; - input_->Next(); - // Skip over all versions of this key that happen to occur in the same snapshot - // range as the delete - while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + AdvanceInputIter(); +#ifndef NDEBUG + const Compaction* c = + compaction_ ? compaction_->real_compaction() : nullptr; +#endif + TEST_SYNC_POINT_CALLBACK( + "CompactionIterator::NextFromInput:BottommostDelete:1", + const_cast(c)); + // Skip over all versions of this key that happen to occur in the same + // snapshot range as the delete. + // + // Note that a deletion marker of type kTypeDeletionWithTimestamp will be + // considered to have a different user key unless the timestamp is older + // than *full_history_ts_low_. 
+ while (!IsPausingManualCompaction() && !IsShuttingDown() && + input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) && (prev_snapshot == 0 || - DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) { - input_->Next(); + DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) { + AdvanceInputIter(); } // If you find you still need to output a row with this key, we need to output the // delete too - if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + if (input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { valid_ = true; at_next_ = true; } @@ -569,12 +839,15 @@ } pinned_iters_mgr_.StartPinning(); + // We know the merge type entry is not hidden, otherwise we would // have hit (A) // We encapsulate the merge related state machine in a different // object to minimize change to the existing flow. - Status s = merge_helper_->MergeUntil(input_, range_del_agg_, - prev_snapshot, bottommost_level_); + Status s = merge_helper_->MergeUntil( + &input_, range_del_agg_, prev_snapshot, bottommost_level_, + allow_data_in_errors_, blob_fetcher_.get(), prefetch_buffers_.get(), + &iter_stats_); merge_out_iter_.SeekToFirst(); if (!s.ok() && !s.IsMergeInProgress()) { @@ -585,14 +858,13 @@ // These will be correctly set below. key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - bool valid_key __attribute__((__unused__)); - valid_key = ParseInternalKey(key_, &ikey_); + pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. 
- assert(valid_key); - if (!valid_key) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + assert(pik_status.ok()); + if (!pik_status.ok()) { + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. %s", + pik_status.getState()); } // Keep current_key_ in sync. current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); @@ -618,14 +890,14 @@ if (should_delete) { ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_range_del; - input_->Next(); + AdvanceInputIter(); } else { valid_ = true; } } if (need_skip) { - input_->Seek(skip_until); + SkipUntil(skip_until); } } @@ -638,25 +910,144 @@ } } -void CompactionIterator::PrepareOutput() { - if (valid_) { - if (compaction_filter_ && ikey_.type == kTypeBlobIndex) { - const auto blob_decision = compaction_filter_->PrepareBlobOutput( - user_key(), value_, &compaction_filter_value_); - - if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { - status_ = Status::Corruption( - "Corrupted blob reference encountered during GC"); +bool CompactionIterator::ExtractLargeValueIfNeededImpl() { + if (!blob_file_builder_) { + return false; + } + + blob_index_.clear(); + const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return false; + } + + if (blob_index_.empty()) { + return false; + } + + value_ = blob_index_; + + return true; +} + +void CompactionIterator::ExtractLargeValueIfNeeded() { + assert(ikey_.type == kTypeValue); + + if (!ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); +} + +void CompactionIterator::GarbageCollectBlobIfNeeded() { + assert(ikey_.type == kTypeBlobIndex); + + if (!compaction_) { + return; + } + + // GC for integrated BlobDB + if (compaction_->enable_blob_garbage_collection()) { + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value_); + + if 
(!s.ok()) { + status_ = s; valid_ = false; - } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) { - status_ = Status::IOError("Could not relocate blob during GC"); + + return; + } + } + + if (blob_index.file_number() >= + blob_garbage_collection_cutoff_file_number_) { + return; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + { + assert(blob_fetcher_); + + const Status s = blob_fetcher_->FetchBlob( + user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read); + + if (!s.ok()) { + status_ = s; valid_ = false; - } else if (blob_decision == - CompactionFilter::BlobDecision::kChangeValue) { - value_ = compaction_filter_value_; + + return; } } + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + ++iter_stats_.num_blobs_relocated; + iter_stats_.total_blob_bytes_relocated += blob_index.size(); + + value_ = blob_value_; + + if (ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + + return; + } + + // GC for stacked BlobDB + if (compaction_filter_ && + compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, &compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = + Status::Corruption("Corrupted blob reference encountered during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + + return; + } + } +} + +void CompactionIterator::PrepareOutput() { + if (valid_) { + if 
(ikey_.type == kTypeValue) { + ExtractLargeValueIfNeeded(); + } else if (ikey_.type == kTypeBlobIndex) { + GarbageCollectBlobIfNeeded(); + } + // Zeroing out the sequence number leads to better compression. // If this is the bottommost level (no files in lower levels) // and the earliest snapshot is larger than this seqno @@ -671,15 +1062,34 @@ if (valid_ && compaction_ != nullptr && !compaction_->allow_ingest_behind() && ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ && - IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) { - assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); - if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && + ikey_.type != kTypeMerge) { + assert(ikey_.type != kTypeDeletion); + assert(ikey_.type != kTypeSingleDeletion || + (timestamp_size_ || full_history_ts_low_)); + if (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeSingleDeletion && + (!timestamp_size_ || !full_history_ts_low_))) { ROCKS_LOG_FATAL(info_log_, "Unexpected key type %d for seq-zero optimization", ikey_.type); } ikey_.sequence = 0; - current_key_.UpdateInternalKey(0, ikey_.type); + last_key_seq_zeroed_ = true; + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq", + &ikey_); + if (!timestamp_size_) { + current_key_.UpdateInternalKey(0, ikey_.type); + } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) { + // We can also zero out timestamp for better compression. + // For the same user key (excluding timestamp), the timestamp-based + // history can be collapsed to save some space if the timestamp is + // older than *full_history_ts_low_. 
+ const std::string kTsMin(timestamp_size_, static_cast(0)); + const Slice ts_slice = kTsMin; + ikey_.SetTimestamp(ts_slice); + current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice); + } } } } @@ -736,39 +1146,68 @@ (ikey_.sequence < preserve_deletes_seqnum_); } -bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { - assert(snapshot_checker_ != nullptr); - bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber || - (earliest_snapshot_iter_ != snapshots_->end() && - *earliest_snapshot_iter_ == earliest_snapshot_)); - assert(pre_condition); - if (!pre_condition) { - ROCKS_LOG_FATAL(info_log_, - "Pre-Condition is not hold in IsInEarliestSnapshot"); +uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction) { + if (!compaction) { + return 0; } - auto in_snapshot = - snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); - while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) { - // Avoid the the current earliest_snapshot_ being return as - // earliest visible snapshot for the next value. So if a value's sequence - // is zero-ed out by PrepareOutput(), the next value will be compact out. 
- released_snapshots_.insert(earliest_snapshot_); - earliest_snapshot_iter_++; - if (earliest_snapshot_iter_ == snapshots_->end()) { - earliest_snapshot_ = kMaxSequenceNumber; - } else { - earliest_snapshot_ = *earliest_snapshot_iter_; - } - in_snapshot = - snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); + if (!compaction->enable_blob_garbage_collection()) { + return 0; } - assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased); - if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) { - ROCKS_LOG_FATAL(info_log_, - "Unexpected released snapshot in IsInEarliestSnapshot"); + + const Version* const version = compaction->input_version(); + assert(version); + + const VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + auto it = blob_files.begin(); + std::advance( + it, compaction->blob_garbage_collection_age_cutoff() * blob_files.size()); + + return it != blob_files.end() ? 
it->first + : std::numeric_limits::max(); +} + +std::unique_ptr CompactionIterator::CreateBlobFetcherIfNeeded( + const CompactionProxy* compaction) { + if (!compaction) { + return nullptr; + } + + const Version* const version = compaction->input_version(); + if (!version) { + return nullptr; } - return in_snapshot == SnapshotCheckerResult::kInSnapshot; + + return std::unique_ptr(new BlobFetcher(version, ReadOptions())); +} + +std::unique_ptr +CompactionIterator::CreatePrefetchBufferCollectionIfNeeded( + const CompactionProxy* compaction) { + if (!compaction) { + return nullptr; + } + + if (!compaction->input_version()) { + return nullptr; + } + + if (compaction->allow_mmap_reads()) { + return nullptr; + } + + const uint64_t readahead_size = compaction->blob_compaction_readahead_size(); + if (!readahead_size) { + return nullptr; + } + + return std::unique_ptr( + new PrefetchBufferCollection(readahead_size)); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -21,39 +22,153 @@ namespace ROCKSDB_NAMESPACE { +class BlobFileBuilder; +class BlobFetcher; +class PrefetchBufferCollection; + +// A wrapper of internal iterator whose purpose is to count how +// many entries there are in the iterator. 
+class SequenceIterWrapper : public InternalIterator { + public: + SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp, + bool need_count_entries) + : icmp_(cmp, /*named=*/false), + inner_iter_(iter), + need_count_entries_(need_count_entries) {} + bool Valid() const override { return inner_iter_->Valid(); } + Status status() const override { return inner_iter_->status(); } + void Next() override { + num_itered_++; + inner_iter_->Next(); + } + void Seek(const Slice& target) override { + if (!need_count_entries_) { + inner_iter_->Seek(target); + } else { + // For flush cases, we need to count total number of entries, so we + // do Next() rather than Seek(). + while (inner_iter_->Valid() && + icmp_.Compare(inner_iter_->key(), target) < 0) { + Next(); + } + } + } + Slice key() const override { return inner_iter_->key(); } + Slice value() const override { return inner_iter_->value(); } + + // Unused InternalIterator methods + void SeekToFirst() override { assert(false); } + void Prev() override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + uint64_t num_itered() const { return num_itered_; } + + private: + InternalKeyComparator icmp_; + InternalIterator* inner_iter_; // not owned + uint64_t num_itered_ = 0; + bool need_count_entries_; +}; + class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what // CompactionIterator uses. Tests can override it. 
class CompactionProxy { public: - explicit CompactionProxy(const Compaction* compaction) - : compaction_(compaction) {} - virtual ~CompactionProxy() = default; - virtual int level(size_t /*compaction_input_level*/ = 0) const { - return compaction_->level(); - } + + virtual int level() const = 0; + virtual bool KeyNotExistsBeyondOutputLevel( - const Slice& user_key, std::vector* level_ptrs) const { + const Slice& user_key, std::vector* level_ptrs) const = 0; + + virtual bool bottommost_level() const = 0; + + virtual int number_levels() const = 0; + + virtual Slice GetLargestUserKey() const = 0; + + virtual bool allow_ingest_behind() const = 0; + + virtual bool preserve_deletes() const = 0; + + virtual bool allow_mmap_reads() const = 0; + + virtual bool enable_blob_garbage_collection() const = 0; + + virtual double blob_garbage_collection_age_cutoff() const = 0; + + virtual uint64_t blob_compaction_readahead_size() const = 0; + + virtual const Version* input_version() const = 0; + + virtual bool DoesInputReferenceBlobFiles() const = 0; + + virtual const Compaction* real_compaction() const = 0; + }; + + class RealCompaction : public CompactionProxy { + public: + explicit RealCompaction(const Compaction* compaction) + : compaction_(compaction) { + assert(compaction_); + assert(compaction_->immutable_options()); + assert(compaction_->mutable_cf_options()); + } + + int level() const override { return compaction_->level(); } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector* level_ptrs) const override { return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs); } - virtual bool bottommost_level() const { + + bool bottommost_level() const override { return compaction_->bottommost_level(); } - virtual int number_levels() const { return compaction_->number_levels(); } - virtual Slice GetLargestUserKey() const { + + int number_levels() const override { return compaction_->number_levels(); } + + Slice GetLargestUserKey() const override 
{ return compaction_->GetLargestUserKey(); } - virtual bool allow_ingest_behind() const { - return compaction_->immutable_cf_options()->allow_ingest_behind; + + bool allow_ingest_behind() const override { + return compaction_->immutable_options()->allow_ingest_behind; + } + + bool preserve_deletes() const override { + return compaction_->immutable_options()->preserve_deletes; + } + + bool allow_mmap_reads() const override { + return compaction_->immutable_options()->allow_mmap_reads; + } + + bool enable_blob_garbage_collection() const override { + return compaction_->mutable_cf_options()->enable_blob_garbage_collection; } - virtual bool preserve_deletes() const { - return compaction_->immutable_cf_options()->preserve_deletes; + + double blob_garbage_collection_age_cutoff() const override { + return compaction_->mutable_cf_options() + ->blob_garbage_collection_age_cutoff; } - protected: - CompactionProxy() = default; + uint64_t blob_compaction_readahead_size() const override { + return compaction_->mutable_cf_options()->blob_compaction_readahead_size; + } + + const Version* input_version() const override { + return compaction_->input_version(); + } + + bool DoesInputReferenceBlobFiles() const override { + return compaction_->DoesInputReferenceBlobFiles(); + } + + const Compaction* real_compaction() const override { return compaction_; } private: const Compaction* compaction_; @@ -66,12 +181,15 @@ const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + const std::atomic* manual_compaction_paused = nullptr, + const 
std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); // Constructor with custom CompactionProxy, used for tests. CompactionIterator( @@ -81,12 +199,15 @@ const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); ~CompactionIterator(); @@ -110,18 +231,39 @@ bool Valid() const { return valid_; } const Slice& user_key() const { return current_user_key_; } const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint64_t num_input_entry_scanned() const { return input_.num_itered(); } private: // Processes the input stream to find the next output void NextFromInput(); - // Do last preparations before presenting the output to the callee. At this - // point this only zeroes out the sequence number if possible for better - // compression. + // Do final preparations before presenting the output to the callee. void PrepareOutput(); + // Passes the output value to the blob file builder (if any), and replaces it + // with the corresponding blob reference if it has been actually written to a + // blob file (i.e. if it passed the value size check). Returns true if the + // value got extracted to a blob file, false otherwise. 
+ bool ExtractLargeValueIfNeededImpl(); + + // Extracts large values as described above, and updates the internal key's + // type to kTypeBlobIndex if the value got extracted. Should only be called + // for regular values (kTypeValue). + void ExtractLargeValueIfNeeded(); + + // Relocates valid blobs residing in the oldest blob files if garbage + // collection is enabled. Relocated blobs are written to new blob files or + // inlined in the LSM tree depending on the current settings (i.e. + // enable_blob_files and min_blob_size). Should only be called for blob + // references (kTypeBlobIndex). + // + // Note: the stacked BlobDB implementation's compaction filter based GC + // algorithm is also called from here. + void GarbageCollectBlobIfNeeded(); + // Invoke compaction filter if needed. - void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); + // Return true on success, false on failures (e.g.: kIOError). + bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); // Given a sequence number, return the sequence number of the // earliest snapshot that this sequence number is visible in. @@ -143,9 +285,32 @@ SnapshotCheckerResult::kInSnapshot; } - bool IsInEarliestSnapshot(SequenceNumber sequence); + bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot); + + // Extract user-defined timestamp from user key if possible and compare it + // with *full_history_ts_low_ if applicable. 
+ inline void UpdateTimestampAndCompareWithFullHistoryLow() { + if (!timestamp_size_) { + return; + } + Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_); + curr_ts_.assign(ts.data(), ts.size()); + if (full_history_ts_low_) { + cmp_with_history_ts_low_ = + cmp_->CompareTimestamp(ts, *full_history_ts_low_); + } + } - InternalIterator* input_; + static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction); + static std::unique_ptr CreateBlobFetcherIfNeeded( + const CompactionProxy* compaction); + static std::unique_ptr + CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction); + + SequenceIterWrapper input_; const Comparator* cmp_; MergeHelper* merge_helper_; const std::vector* snapshots_; @@ -159,13 +324,16 @@ const SequenceNumber earliest_write_conflict_snapshot_; const SnapshotChecker* const snapshot_checker_; Env* env_; + SystemClock* clock_; bool report_detailed_time_; bool expect_valid_internal_key_; CompactionRangeDelAggregator* range_del_agg_; + BlobFileBuilder* blob_file_builder_; std::unique_ptr compaction_; const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; - const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; bool valid_ = false; @@ -173,6 +341,20 @@ SequenceNumber earliest_snapshot_; SequenceNumber latest_snapshot_; + std::shared_ptr info_log_; + + bool allow_data_in_errors_; + + // Comes from comparator. + const size_t timestamp_size_; + + // Lower bound timestamp to retain full history in terms of user-defined + // timestamp. If a key's timestamp is older than full_history_ts_low_, then + // the key *may* be eligible for garbage collection (GC). The skipping logic + // is in `NextFromInput()` and `PrepareOutput()`. 
+ // If nullptr, NO GC will be performed and all history will be preserved. + const std::string* const full_history_ts_low_; + // State // // Points to a copy of the current compaction iterator output (current_key_) @@ -191,11 +373,13 @@ // Stores whether ikey_.user_key is valid. If set to false, the user key is // not compared against the current key in the underlying iterator. bool has_current_user_key_ = false; - bool at_next_ = false; // If false, the iterator - // Holds a copy of the current compaction iterator output (or current key in - // the underlying iterator during NextFromInput()). + // If false, the iterator holds a copy of the current compaction iterator + // output (or current key in the underlying iterator during NextFromInput()). + bool at_next_ = false; + IterKey current_key_; Slice current_user_key_; + std::string curr_ts_; SequenceNumber current_user_key_sequence_; SequenceNumber current_user_key_snapshot_; @@ -210,6 +394,14 @@ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading // merge operands and then releasing them after consuming them. PinnedIteratorsManager pinned_iters_mgr_; + + uint64_t blob_garbage_collection_cutoff_file_number_; + + std::unique_ptr blob_fetcher_; + std::unique_ptr prefetch_buffers_; + + std::string blob_index_; + PinnableSlice blob_value_; std::string compaction_filter_value_; InternalKey compaction_filter_skip_until_; // "level_ptrs" holds indices that remember which file of an associated @@ -224,7 +416,19 @@ // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - std::shared_ptr info_log_; + + // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_) + int cmp_with_history_ts_low_; + + const int level_; + + // True if the previous internal key (same user key)'s sequence number has + // just been zeroed out during bottommost compaction. 
+ bool last_key_seq_zeroed_{false}; + + void AdvanceInputIter() { input_.Next(); } + + void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); } bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. @@ -233,8 +437,27 @@ bool IsPausingManualCompaction() { // This is a best-effort facility, so memory_order_relaxed is sufficient. - return manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed); + return (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)); } }; + +inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq, + SequenceNumber snapshot) { + return ((seq) <= (snapshot) && + (snapshot_checker_ == nullptr || + LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kInSnapshot))); +} + +inline bool CompactionIterator::DefinitelyNotInSnapshot( + SequenceNumber seq, SequenceNumber snapshot) { + return ((seq) > (snapshot) || + (snapshot_checker_ != nullptr && + UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == + SnapshotCheckerResult::kNotInSnapshot))); +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,15 +3,17 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "db/compaction/compaction_iterator.h" #include #include -#include "db/compaction/compaction_iterator.h" +#include "db/dbformat.h" #include "port/port.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" +#include "util/vector_iterator.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -38,7 +40,7 @@ // Compaction filter that gets stuck when it sees a particular key, // then gets unstuck when told to. -// Always returns Decition::kRemove. +// Always returns Decision::kRemove. class StallingFilter : public CompactionFilter { public: Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, @@ -86,7 +88,7 @@ const char* Name() const override { return "AllKeysCompactionFilter"; } }; -class LoggingForwardVectorIterator : public InternalIterator { +class LoggingForwardVectorIterator : public VectorIterator { public: struct Action { enum class Type { @@ -108,22 +110,19 @@ LoggingForwardVectorIterator(const std::vector& keys, const std::vector& values) - : keys_(keys), values_(values), current_(keys.size()) { - assert(keys_.size() == values_.size()); + : VectorIterator(keys, values) { + current_ = keys_.size(); } - bool Valid() const override { return current_ < keys_.size(); } - void SeekToFirst() override { log.emplace_back(Action::Type::SEEK_TO_FIRST); - current_ = 0; + VectorIterator::SeekToFirst(); } void SeekToLast() override { assert(false); } void Seek(const Slice& target) override { log.emplace_back(Action::Type::SEEK, target.ToString()); - current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - - keys_.begin(); + VectorIterator::Seek(target); } void SeekForPrev(const Slice& /*target*/) override { assert(false); } @@ -131,54 +130,66 @@ void Next() override { assert(Valid()); log.emplace_back(Action::Type::NEXT); - current_++; + VectorIterator::Next(); } void Prev() override { assert(false); } Slice key() const override { assert(Valid()); - return 
Slice(keys_[current_]); + return VectorIterator::key(); } Slice value() const override { assert(Valid()); - return Slice(values_[current_]); + return VectorIterator::value(); } - Status status() const override { return Status::OK(); } - std::vector log; - - private: - std::vector keys_; - std::vector values_; - size_t current_; }; class FakeCompaction : public CompactionIterator::CompactionProxy { public: - FakeCompaction() = default; + int level() const override { return 0; } - int level(size_t /*compaction_input_level*/) const override { return 0; } bool KeyNotExistsBeyondOutputLevel( const Slice& /*user_key*/, std::vector* /*level_ptrs*/) const override { return is_bottommost_level || key_not_exists_beyond_output_level; } + bool bottommost_level() const override { return is_bottommost_level; } + int number_levels() const override { return 1; } + Slice GetLargestUserKey() const override { return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; } - bool allow_ingest_behind() const override { return false; } + + bool allow_ingest_behind() const override { return is_allow_ingest_behind; } bool preserve_deletes() const override { return false; } + bool allow_mmap_reads() const override { return false; } + + bool enable_blob_garbage_collection() const override { return false; } + + double blob_garbage_collection_age_cutoff() const override { return 0.0; } + + uint64_t blob_compaction_readahead_size() const override { return 0; } + + const Version* input_version() const override { return nullptr; } + + bool DoesInputReferenceBlobFiles() const override { return false; } + + const Compaction* real_compaction() const override { return nullptr; } + bool key_not_exists_beyond_output_level = false; bool is_bottommost_level = false; + + bool is_allow_ingest_behind = false; }; -// A simplifed snapshot checker which assumes each snapshot has a global +// A simplified snapshot checker which assumes each snapshot has a global // last visible sequence. 
class TestSnapshotChecker : public SnapshotChecker { public: @@ -214,6 +225,9 @@ CompactionIteratorTest() : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + explicit CompactionIteratorTest(const Comparator* ucmp) + : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {} + void InitIterators( const std::vector& ks, const std::vector& vs, const std::vector& range_del_ks, @@ -222,9 +236,11 @@ SequenceNumber last_committed_sequence = kMaxSequenceNumber, MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { std::unique_ptr unfragmented_range_del_iter( - new test::VectorIterator(range_del_ks, range_del_vs)); + new VectorIterator(range_del_ks, range_del_vs, &icmp_)); auto tombstone_list = std::make_shared( std::move(unfragmented_range_del_iter), icmp_); std::unique_ptr range_del_iter( @@ -234,9 +250,12 @@ range_del_agg_->AddTombstones(std::move(range_del_iter)); std::unique_ptr compaction; - if (filter || bottommost_level) { + if (filter || bottommost_level || key_not_exists_beyond_output_level) { compaction_proxy_ = new FakeCompaction(); compaction_proxy_->is_bottommost_level = bottommost_level; + compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind(); + compaction_proxy_->key_not_exists_beyond_output_level = + key_not_exists_beyond_output_level; compaction.reset(compaction_proxy_); } bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); @@ -249,13 +268,23 @@ 0 /*latest_snapshot*/, snapshot_checker_.get(), 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + if (c_iter_) { + // Since iter_ is still used in ~CompactionIterator(), we call + // ~CompactionIterator() first. 
+ c_iter_.reset(); + } iter_.reset(new LoggingForwardVectorIterator(ks, vs)); iter_->SeekToFirst(); c_iter_.reset(new CompactionIterator( iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, earliest_write_conflict_snapshot, snapshot_checker_.get(), Env::Default(), false /* report_detailed_time */, false, - range_del_agg_.get(), std::move(compaction), filter, &shutting_down_)); + range_del_agg_.get(), nullptr /* blob_file_builder */, + true /*allow_data_in_errors*/, std::move(compaction), filter, + &shutting_down_, /*preserve_deletes_seqnum=*/0, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*info_log=*/nullptr, + full_history_ts_low)); } void AddSnapshot(SequenceNumber snapshot, @@ -266,6 +295,8 @@ virtual bool UseSnapshotChecker() const { return false; } + virtual bool AllowIngestBehind() const { return false; } + void RunTest( const std::vector& input_keys, const std::vector& input_values, @@ -275,10 +306,13 @@ MergeOperator* merge_operator = nullptr, CompactionFilter* compaction_filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, last_committed_seq, merge_operator, compaction_filter, - bottommost_level, earliest_write_conflict_snapshot); + bottommost_level, earliest_write_conflict_snapshot, + key_not_exists_beyond_output_level, full_history_ts_low); c_iter_->SeekToFirst(); for (size_t i = 0; i < expected_keys.size(); i++) { std::string info = "i = " + ToString(i); @@ -288,9 +322,15 @@ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info; c_iter_->Next(); } + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } + void ClearSnapshots() { + snapshots_.clear(); + 
snapshot_map_.clear(); + } + const Comparator* cmp_; const InternalKeyComparator icmp_; std::vector snapshots_; @@ -312,6 +352,7 @@ test::KeyStr("a", 3, kTypeValue)}, {"", "val"}, {}, {}, 5); c_iter_->SeekToFirst(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -333,6 +374,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -349,6 +391,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -370,6 +413,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); } @@ -463,6 +507,7 @@ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString()); ASSERT_EQ("hv91", c_iter_->value().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); // Check that the compaction iterator did the correct sequence of calls on @@ -656,6 +701,7 @@ ASSERT_TRUE(c_iter_->Valid()); ASSERT_EQ("bv1bv2", c_iter_->value().ToString()); c_iter_->Next(); + ASSERT_OK(c_iter_->status()); ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); } @@ -666,7 +712,7 @@ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, {"v1", "v2"}, {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, - {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -675,15 +721,14 @@ // permanently. 
TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { AddSnapshot(1); - RunTest({test::KeyStr("a", 1, kTypeDeletion), - test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 1, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 0, kTypeValue)}, - {"", ""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, - nullptr /*compaction_filter*/, true /*bottommost_level*/); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } // In bottommost level, single deletions earlier than earliest snapshot can be @@ -693,10 +738,22 @@ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), test::KeyStr("b", 2, kTypeSingleDeletion)}, {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } +TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge), + test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)}, + {"a4", "a3", "a2", "b1"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)}, + {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/, + merge_op.get(), nullptr /*compaction_filter*/, + true /*bottomost_level*/); +} + INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest, testing::Values(true, false)); @@ -838,7 +895,7 @@ {"v1", "v2", "v3"}, {test::KeyStr("a", 0, kTypeValue), 
test::KeyStr("b", 2, kTypeValue), test::KeyStr("c", 3, kTypeValue)}, - {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -849,9 +906,7 @@ RunTest( {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), test::KeyStr("c", 3, kTypeDeletion)}, - {"", "", ""}, - {}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -859,15 +914,14 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, NotRemoveDeletionIfValuePresentToEarlierSnapshot) { AddSnapshot(2,1); - RunTest( - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, - nullptr /*merge_operator*/, nullptr /*compaction_filter*/, - true /*bottommost_level*/); + RunTest({test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } TEST_F(CompactionIteratorWithSnapshotCheckerTest, @@ -879,7 +933,7 @@ {"", "", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion), test::KeyStr("c", 3, kTypeSingleDeletion)}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -913,9 +967,24 @@ 2 
/*earliest_write_conflict_snapshot*/); } +// Same as above but with a blob index. In addition to the value getting +// trimmed, the type of the KV is changed to kTypeValue. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking_BlobIndex) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeBlobIndex)}, + {"", "fake_blob_index"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + // Compaction filter should keep uncommitted key as-is, and -// * Convert the latest velue to deletion, and/or -// * if latest value is a merge, apply filter to all suequent merges. +// * Convert the latest value to deletion, and/or +// * if latest value is a merge, apply filter to all subsequent merges. TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { std::unique_ptr compaction_filter( @@ -968,6 +1037,323 @@ compaction_filter.get()); } +// Tests how CompactionIterator work together with AllowIngestBehind. +class CompactionIteratorWithAllowIngestBehindTest + : public CompactionIteratorTest { + public: + bool AllowIngestBehind() const override { return true; } +}; + +// When allow_ingest_behind is set, compaction iterator is not targeting +// the bottommost level since there is no guarantee there won't be further +// data ingested under the compaction output in future. 
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge), + test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)}, + {"a4", "a3", "a2", "b1"}, + {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)}, + {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/, + merge_op.get(), nullptr /*compaction_filter*/, + true /*bottomost_level*/); +} + +TEST_P(CompactionIteratorWithAllowIngestBehindTest, + MergeToPutIfEncounteredPutAtBottom) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge), + test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)}, + {"a4", "a3", "a2", "b1"}, + {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)}, + {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/, + merge_op.get(), nullptr /*compaction_filter*/, + true /*bottomost_level*/); +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance, + CompactionIteratorWithAllowIngestBehindTest, + testing::Values(true, false)); + +class CompactionIteratorTsGcTest : public CompactionIteratorTest { + public: + CompactionIteratorTsGcTest() + : CompactionIteratorTest(test::ComparatorWithU64Ts()) {} +}; + +TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"a3", "", "b2"}; + std::string full_history_ts_low; + // All keys' timestamps are newer than or equal to 102, thus none of them + // will be 
eligible for GC. + PutFixed64(&full_history_ts_low, 102); + const std::vector& expected_keys = input_keys; + const std::vector& expected_values = input_values; + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. 
+ AddSnapshot(3); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[3]}; + const std::vector expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot, the deletion marker should be preserved because the user + // key may appear beyond output level. + const std::vector expected_keys = {input_keys[0], + input_keys[3]}; + const std::vector expected_values = {"", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } + { + // No snapshot, the deletion marker can be dropped because the user key + // does not appear in higher levels. 
+ const std::vector expected_keys = {input_keys[3]}; + const std::vector expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "a0"}; + { + std::string full_history_ts_low; + // Keys whose timestamps larger than or equal to 102 will be preserved. 
+ PutFixed64(&full_history_ts_low, 102); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2]}; + const std::vector expected_values = {"", input_values[1], + input_values[2]}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, DropTombstones) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = {input_keys[0], input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + + { + // Non-bottommost level, but key does not exist beyond output level. 
+ std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_sequence=*/kMaxSequenceNumber, + /*merge_op=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } + { + // Bottommost level + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, RewriteTs) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2], + test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)}; + const std::vector expected_values = {"", "a2", "", "a0"}; + + AddSnapshot(1); + AddSnapshot(2); + + { + // Bottommost level and need to rewrite both ts and seq. 
+ std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)}; + const std::vector input_values = {"", "a3", "b2"}; + std::string full_history_ts_low; + // All keys' timestamps are newer than or equal to 102, thus none of them + // will be eligible for GC. + PutFixed64(&full_history_ts_low, 102); + const std::vector& expected_keys = input_keys; + const std::vector& expected_values = input_values; + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, 
kTypeSingleDeletion), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = {input_keys[0], input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + { + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } + } +} + +TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"", "a2", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. 
+ AddSnapshot(3); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2]}; + const std::vector expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot. + const std::vector expected_keys = {input_keys[2]}; + const std::vector expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance, + CompactionIteratorTsGcTest, + testing::Values(true, false)); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db/compaction/compaction_job.h" + #include #include #include @@ -18,8 +20,12 @@ #include #include +#include "db/blob/blob_counting_iterator.h" +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_builder.h" +#include "db/blob/blob_garbage_meter.h" #include "db/builder.h" -#include "db/compaction/compaction_job.h" +#include "db/compaction/clipping_iterator.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -31,6 +37,7 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "db/output_validator.h" #include "db/range_del_aggregator.h" #include "db/version_set.h" #include "file/filename.h" @@ -42,18 +49,23 @@ #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/sst_partitioner.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "test_util/sync_point.h" #include "util/coding.h" +#include "util/hash.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" @@ -95,6 +107,10 @@ return "ExternalSstIngestion"; case CompactionReason::kPeriodicCompaction: return "PeriodicCompaction"; + case CompactionReason::kChangeTemperature: + return "ChangeTemperature"; + case CompactionReason::kForcedBlobGC: + return "ForcedBlobGC"; case CompactionReason::kNumOfReasons: // fall through default: @@ -116,23 +132,37 @@ // The return status of this subcompaction Status status; + // The return IO Status of this subcompaction + IOStatus io_status; + // Files produced by 
this subcompaction struct Output { + Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, + bool _enable_order_check, bool _enable_hash, bool _finished = false, + uint64_t precalculated_hash = 0) + : meta(std::move(_meta)), + validator(_icmp, _enable_order_check, _enable_hash, + precalculated_hash), + finished(_finished) {} FileMetaData meta; + OutputValidator validator; bool finished; std::shared_ptr table_properties; }; // State kept for output being generated std::vector outputs; + std::vector blob_file_additions; + std::unique_ptr blob_garbage_meter; std::unique_ptr outfile; std::unique_ptr builder; + Output* current_output() { if (outputs.empty()) { - // This subcompaction's outptut could be empty if compaction was aborted + // This subcompaction's output could be empty if compaction was aborted // before this subcompaction had a chance to generate any output files. // When subcompactions are executed sequentially this is more likely and - // will be particulalry likely for the later subcompactions to be empty. + // will be particularly likely for the later subcompactions to be empty. // Once they are run in parallel however it should be much rarer. return nullptr; } else { @@ -140,13 +170,20 @@ } } - uint64_t current_output_file_size; + // Some identified files with old oldest ancester time and the range should be + // isolated out so that the output file(s) in that range can be merged down + // for TTL and clear the timestamps for the range. + std::vector files_to_cut_for_ttl; + int cur_files_to_cut_for_ttl = -1; + int next_files_to_cut_for_ttl = 0; + + uint64_t current_output_file_size = 0; // State during the subcompaction - uint64_t total_bytes; - uint64_t num_output_records; + uint64_t total_bytes = 0; + uint64_t num_output_records = 0; CompactionJobStats compaction_job_stats; - uint64_t approx_size; + uint64_t approx_size = 0; // An index that used to speed up ShouldStopBefore(). 
size_t grandparent_index = 0; // The number of bytes overlapping between the current output and @@ -154,49 +191,35 @@ uint64_t overlapped_bytes = 0; // A flag determine whether the key has been seen in ShouldStopBefore() bool seen_key = false; + // sub compaction job id, which is used to identify different sub-compaction + // within the same compaction job. + const uint32_t sub_job_id; - SubcompactionState(Compaction* c, Slice* _start, Slice* _end, - uint64_t size = 0) + SubcompactionState(Compaction* c, Slice* _start, Slice* _end, uint64_t size, + uint32_t _sub_job_id) : compaction(c), start(_start), end(_end), - outfile(nullptr), - builder(nullptr), - current_output_file_size(0), - total_bytes(0), - num_output_records(0), approx_size(size), - grandparent_index(0), - overlapped_bytes(0), - seen_key(false) { + sub_job_id(_sub_job_id) { assert(compaction != nullptr); } - SubcompactionState(SubcompactionState&& o) { *this = std::move(o); } - - SubcompactionState& operator=(SubcompactionState&& o) { - compaction = std::move(o.compaction); - start = std::move(o.start); - end = std::move(o.end); - status = std::move(o.status); - outputs = std::move(o.outputs); - outfile = std::move(o.outfile); - builder = std::move(o.builder); - current_output_file_size = std::move(o.current_output_file_size); - total_bytes = std::move(o.total_bytes); - num_output_records = std::move(o.num_output_records); - compaction_job_stats = std::move(o.compaction_job_stats); - approx_size = std::move(o.approx_size); - grandparent_index = std::move(o.grandparent_index); - overlapped_bytes = std::move(o.overlapped_bytes); - seen_key = std::move(o.seen_key); - return *this; + // Adds the key and value to the builder + // If paranoid is true, adds the key-value to the paranoid hash + Status AddToBuilder(const Slice& key, const Slice& value) { + auto curr = current_output(); + assert(builder != nullptr); + assert(curr != nullptr); + Status s = curr->validator.Add(key, value); + if (!s.ok()) { + return 
s; + } + builder->Add(key, value); + return Status::OK(); } - // Because member std::unique_ptrs do not have these. - SubcompactionState(const SubcompactionState&) = delete; - - SubcompactionState& operator=(const SubcompactionState&) = delete; + void FillFilesToCutForTtl(); // Returns true iff we should stop building the current output // before processing "internal_key". @@ -205,6 +228,7 @@ &compaction->column_family_data()->internal_comparator(); const std::vector& grandparents = compaction->grandparents(); + bool grandparant_file_switched = false; // Scan to find earliest grandparent file that contains key. while (grandparent_index < grandparents.size() && icmp->Compare(internal_key, @@ -212,6 +236,7 @@ 0) { if (seen_key) { overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize(); + grandparant_file_switched = true; } assert(grandparent_index + 1 >= grandparents.size() || icmp->Compare( @@ -221,17 +246,99 @@ } seen_key = true; - if (overlapped_bytes + curr_file_size > - compaction->max_compaction_bytes()) { + if (grandparant_file_switched && overlapped_bytes + curr_file_size > + compaction->max_compaction_bytes()) { // Too much overlap for current output; start new output overlapped_bytes = 0; return true; } + if (!files_to_cut_for_ttl.empty()) { + if (cur_files_to_cut_for_ttl != -1) { + // Previous key is inside the range of a file + if (icmp->Compare(internal_key, + files_to_cut_for_ttl[cur_files_to_cut_for_ttl] + ->largest.Encode()) > 0) { + next_files_to_cut_for_ttl = cur_files_to_cut_for_ttl + 1; + cur_files_to_cut_for_ttl = -1; + return true; + } + } else { + // Look for the key position + while (next_files_to_cut_for_ttl < + static_cast(files_to_cut_for_ttl.size())) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl[next_files_to_cut_for_ttl] + ->smallest.Encode()) >= 0) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl[next_files_to_cut_for_ttl] + ->largest.Encode()) <= 0) { + // With in the current file + 
cur_files_to_cut_for_ttl = next_files_to_cut_for_ttl; + return true; + } + // Beyond the current file + next_files_to_cut_for_ttl++; + } else { + // Still fall into the gap + break; + } + } + } + } + return false; } + + Status ProcessOutFlowIfNeeded(const Slice& key, const Slice& value) { + if (!blob_garbage_meter) { + return Status::OK(); + } + + return blob_garbage_meter->ProcessOutFlow(key, value); + } }; +void CompactionJob::SubcompactionState::FillFilesToCutForTtl() { + if (compaction->immutable_options()->compaction_style != + CompactionStyle::kCompactionStyleLevel || + compaction->immutable_options()->compaction_pri != + CompactionPri::kMinOverlappingRatio || + compaction->mutable_cf_options()->ttl == 0 || + compaction->num_input_levels() < 2 || compaction->bottommost_level()) { + return; + } + + // We define new file with oldest ancestor time to be younger than 1/4 TTL, + // and an old one to be older than 1/2 TTL time. + int64_t temp_current_time; + auto get_time_status = compaction->immutable_options()->clock->GetCurrentTime( + &temp_current_time); + if (!get_time_status.ok()) { + return; + } + uint64_t current_time = static_cast(temp_current_time); + if (current_time < compaction->mutable_cf_options()->ttl) { + return; + } + uint64_t old_age_thres = + current_time - compaction->mutable_cf_options()->ttl / 2; + + const std::vector& olevel = + *(compaction->inputs(compaction->num_input_levels() - 1)); + for (FileMetaData* file : olevel) { + // Worth filtering out by start and end? + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + // We put old files if they are not too small to prevent a flood + // of small files. 
+ if (oldest_ancester_time < old_age_thres && + file->fd.GetFileSize() > + compaction->mutable_cf_options()->target_file_size_base / 2) { + files_to_cut_for_ttl.push_back(file); + } + } +} + // Maintains state for the entire compaction struct CompactionJob::CompactionState { Compaction* const compaction; @@ -241,21 +348,13 @@ std::vector sub_compact_states; Status status; - uint64_t total_bytes; - uint64_t num_output_records; - - explicit CompactionState(Compaction* c) - : compaction(c), - total_bytes(0), - num_output_records(0) {} + size_t num_output_files = 0; + uint64_t total_bytes = 0; + size_t num_blob_output_files = 0; + uint64_t total_blob_bytes = 0; + uint64_t num_output_records = 0; - size_t NumOutputFiles() { - size_t total = 0; - for (auto& s : sub_compact_states) { - total += s.outputs.size(); - } - return total; - } + explicit CompactionState(Compaction* c) : compaction(c) {} Slice SmallestUserKey() { for (const auto& sub_compact_state : sub_compact_states) { @@ -282,49 +381,78 @@ }; void CompactionJob::AggregateStatistics() { + assert(compact_); + for (SubcompactionState& sc : compact_->sub_compact_states) { + auto& outputs = sc.outputs; + + if (!outputs.empty() && !outputs.back().meta.fd.file_size) { + // An error occurred, so ignore the last output. 
+ outputs.pop_back(); + } + + compact_->num_output_files += outputs.size(); compact_->total_bytes += sc.total_bytes; - compact_->num_output_records += sc.num_output_records; - } - if (compaction_job_stats_) { - for (SubcompactionState& sc : compact_->sub_compact_states) { - compaction_job_stats_->Add(sc.compaction_job_stats); + + const auto& blobs = sc.blob_file_additions; + + compact_->num_blob_output_files += blobs.size(); + + for (const auto& blob : blobs) { + compact_->total_blob_bytes += blob.GetTotalBlobBytes(); } + + compact_->num_output_records += sc.num_output_records; + + compaction_job_stats_->Add(sc.compaction_job_stats); } } CompactionJob::CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const FileOptions& file_options, VersionSet* versions, - const std::atomic* shutting_down, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - Directory* db_directory, Directory* output_directory, Statistics* stats, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, const std::atomic* manual_compaction_paused) - : job_id_(job_id), - compact_(new CompactionState(compaction)), - compaction_job_stats_(compaction_job_stats), + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::string& db_id, const std::string& 
db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) + : compact_(new CompactionState(compaction)), compaction_stats_(compaction->compaction_reason(), 1), - dbname_(dbname), db_options_(db_options), + mutable_db_options_copy_(mutable_db_options), + log_buffer_(log_buffer), + output_directory_(output_directory), + stats_(stats), + bottommost_level_(false), + write_hint_(Env::WLTH_NOT_SET), + job_id_(job_id), + compaction_job_stats_(compaction_job_stats), + dbname_(dbname), + db_id_(db_id), + db_session_id_(db_session_id), file_options_(file_options), env_(db_options.env), - fs_(db_options.fs.get()), + io_tracer_(io_tracer), + fs_(db_options.fs, io_tracer), file_options_for_read_( fs_->OptimizeForCompactionTableRead(file_options, db_options_)), versions_(versions), shutting_down_(shutting_down), manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), - log_buffer_(log_buffer), db_directory_(db_directory), - output_directory_(output_directory), - stats_(stats), + blob_output_directory_(blob_output_directory), db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), @@ -332,11 +460,12 @@ snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), event_logger_(event_logger), - bottommost_level_(false), paranoid_file_checks_(paranoid_file_checks), measure_io_stats_(measure_io_stats), - write_hint_(Env::WLTH_NOT_SET), - thread_pri_(thread_pri) { + thread_pri_(thread_pri), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { + assert(compaction_job_stats_ != nullptr); assert(log_buffer_ != nullptr); const auto* cfd = compact_->compaction->column_family_data(); ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, @@ -388,17 +517,16 @@ // to ensure GetThreadList() can always show them all together. 
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); - if (compaction_job_stats_) { - compaction_job_stats_->is_manual_compaction = - compaction->is_manual_compaction(); - } + compaction_job_stats_->is_manual_compaction = + compaction->is_manual_compaction(); + compaction_job_stats_->is_full_compaction = compaction->is_full_compaction(); } void CompactionJob::Prepare() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PREPARE); - // Generate file_levels_ for compaction berfore making Iterator + // Generate file_levels_ for compaction before making Iterator auto* c = compact_->compaction; assert(c->column_family_data() != nullptr); assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( @@ -410,7 +538,7 @@ if (c->ShouldFormSubcompactions()) { { - StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME); + StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); GenSubcompactionBoundaries(); } assert(sizes_.size() == boundaries_.size() + 1); @@ -418,12 +546,18 @@ for (size_t i = 0; i <= boundaries_.size(); i++) { Slice* start = i == 0 ? nullptr : &boundaries_[i - 1]; Slice* end = i == boundaries_.size() ? 
nullptr : &boundaries_[i]; - compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]); + compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i], + static_cast(i)); } RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, compact_->sub_compact_states.size()); } else { - compact_->sub_compact_states.emplace_back(c, nullptr, nullptr); + constexpr Slice* start = nullptr; + constexpr Slice* end = nullptr; + constexpr uint64_t size = 0; + + compact_->sub_compact_states.emplace_back(c, start, end, size, + /*sub_job_id*/ 0); } } @@ -529,9 +663,10 @@ int base_level = v->storage_info()->base_level(); uint64_t max_output_files = static_cast(std::ceil( sum / min_file_fill_percent / - MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl, - c->immutable_cf_options()->compaction_style, base_level, - c->immutable_cf_options()->level_compaction_dynamic_level_bytes))); + MaxFileSizeForLevel( + *(c->mutable_cf_options()), out_lvl, + c->immutable_options()->compaction_style, base_level, + c->immutable_options()->level_compaction_dynamic_level_bytes))); uint64_t subcompactions = std::min({static_cast(ranges.size()), static_cast(c->max_subcompactions()), @@ -542,7 +677,7 @@ // Greedily add ranges to the subcompaction until the sum of the ranges' // sizes becomes >= the expected mean size of a subcompaction sum = 0; - for (size_t i = 0; i < ranges.size() - 1; i++) { + for (size_t i = 0; i + 1 < ranges.size(); i++) { sum += ranges[i].size; if (subcompactions == 1) { // If there's only one left to schedule then it goes to the end so no @@ -572,7 +707,7 @@ const size_t num_threads = compact_->sub_compact_states.size(); assert(num_threads > 0); - const uint64_t start_micros = env_->NowMicros(); + const uint64_t start_micros = db_options_.clock->NowMicros(); // Launch a thread for each of subcompactions 1...num_threads-1 std::vector thread_pool; @@ -591,7 +726,7 @@ thread.join(); } - compaction_stats_.micros = env_->NowMicros() - start_micros; + 
compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; compaction_stats_.cpu_micros = 0; for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { compaction_stats_.cpu_micros += @@ -606,33 +741,62 @@ // Check if any thread encountered an error during execution Status status; + IOStatus io_s; + bool wrote_new_blob_files = false; + for (const auto& state : compact_->sub_compact_states) { if (!state.status.ok()) { status = state.status; + io_s = state.io_status; break; } + + if (!state.blob_file_additions.empty()) { + wrote_new_blob_files = true; + } } - if (status.ok() && output_directory_) { - status = output_directory_->Fsync(); + if (io_status_.ok()) { + io_status_ = io_s; } + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && + blob_output_directory_ != output_directory_) { + io_s = blob_output_directory_->FsyncWithDirOptions( + IOOptions(), dbg, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } if (status.ok()) { thread_pool.clear(); - std::vector files_meta; + std::vector files_output; for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { - files_meta.emplace_back(&output.meta); + files_output.emplace_back(&output); } } ColumnFamilyData* cfd = compact_->compaction->column_family_data(); - auto prefix_extractor = - compact_->compaction->mutable_cf_options()->prefix_extractor.get(); - std::atomic next_file_meta_idx(0); + auto& prefix_extractor = + compact_->compaction->mutable_cf_options()->prefix_extractor; + std::atomic next_file_idx(0); auto verify_table = [&](Status& output_status) { while (true) { - size_t file_idx = 
next_file_meta_idx.fetch_add(1); - if (file_idx >= files_meta.size()) { + size_t file_idx = next_file_idx.fetch_add(1); + if (file_idx >= files_output.size()) { break; } // Verify that the table is usable @@ -641,21 +805,40 @@ // No matter whether use_direct_io_for_flush_and_compaction is true, // we will regard this verification as user reads since the goal is // to cache it here for further user reads + ReadOptions read_options; InternalIterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), file_options_, cfd->internal_comparator(), - *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + read_options, file_options_, cfd->internal_comparator(), + files_output[file_idx]->meta, /*range_del_agg=*/nullptr, + prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, /*skip_filters=*/false, compact_->compaction->output_level(), + MaxFileSizeForL0MetaPin( + *compact_->compaction->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} - s = iter->status(); + OutputValidator validator(cfd->internal_comparator(), + /*_enable_order_check=*/true, + /*_enable_hash=*/true); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = validator.Add(iter->key(), iter->value()); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + s = iter->status(); + } + if (s.ok() && + !validator.CompareValidator(files_output[file_idx]->validator)) { + s = Status::Corruption("Paranoid checksums do not match"); + } } delete iter; @@ -686,7 +869,7 @@ for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { auto fn = - 
TableFileName(state.compaction->immutable_cf_options()->cf_paths, + TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); tp[fn] = output.table_properties; } @@ -696,6 +879,7 @@ // Finish up all book-keeping to unify the subcompaction results AggregateStatistics(); UpdateCompactionStats(); + RecordCompactionIOStats(); LogFlush(db_options_.info_log); TEST_SYNC_POINT("CompactionJob::Run():End"); @@ -705,17 +889,26 @@ } Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + assert(compact_); + AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_INSTALL); db_mutex_->AssertHeld(); Status status = compact_->status; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd); + cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), thread_pri_, compaction_stats_); if (status.ok()) { status = InstallCompactionResults(mutable_cf_options); } + if (!versions_->io_status().ok()) { + io_status_ = versions_->io_status(); + } + VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); const auto& stats = compaction_stats_; @@ -725,63 +918,86 @@ double bytes_read_per_sec = 0; double bytes_written_per_sec = 0; - if (stats.bytes_read_non_output_levels > 0) { - read_write_amp = (stats.bytes_written + stats.bytes_read_output_level + - stats.bytes_read_non_output_levels) / - static_cast(stats.bytes_read_non_output_levels); - write_amp = stats.bytes_written / - static_cast(stats.bytes_read_non_output_levels); + const uint64_t bytes_read_non_output_and_blob = + stats.bytes_read_non_output_levels + stats.bytes_read_blob; + const uint64_t bytes_read_all = + stats.bytes_read_output_level + bytes_read_non_output_and_blob; + const uint64_t bytes_written_all = + stats.bytes_written + stats.bytes_written_blob; + + if (bytes_read_non_output_and_blob > 0) { + read_write_amp = (bytes_written_all + 
bytes_read_all) / + static_cast(bytes_read_non_output_and_blob); + write_amp = + bytes_written_all / static_cast(bytes_read_non_output_and_blob); } if (stats.micros > 0) { - bytes_read_per_sec = - (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / - static_cast(stats.micros); + bytes_read_per_sec = bytes_read_all / static_cast(stats.micros); bytes_written_per_sec = - stats.bytes_written / static_cast(stats.micros); + bytes_written_all / static_cast(stats.micros); } + const std::string& column_family_name = cfd->GetName(); + + constexpr double kMB = 1048576.0; + ROCKS_LOG_BUFFER( log_buffer_, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " - "files in(%d, %d) out(%d) " - "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s, records in: %" PRIu64 + "files in(%d, %d) out(%d +%d blob) " + "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), " + "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64 ", records dropped: %" PRIu64 " output_compression: %s\n", - cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, - bytes_written_per_sec, compact_->compaction->output_level(), + column_family_name.c_str(), vstorage->LevelSummary(&tmp), + bytes_read_per_sec, bytes_written_per_sec, + compact_->compaction->output_level(), stats.num_input_files_in_non_output_levels, stats.num_input_files_in_output_level, stats.num_output_files, - stats.bytes_read_non_output_levels / 1048576.0, - stats.bytes_read_output_level / 1048576.0, - stats.bytes_written / 1048576.0, read_write_amp, write_amp, - status.ToString().c_str(), stats.num_input_records, + stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB, + stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB, + stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp, + write_amp, status.ToString().c_str(), stats.num_input_records, stats.num_dropped_records, 
CompressionTypeToString(compact_->compaction->output_compression()) .c_str()); + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 + "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } + UpdateCompactionJobStats(stats); - auto stream = event_logger_->LogToBuffer(log_buffer_); + auto stream = event_logger_->LogToBuffer(log_buffer_, 8192); stream << "job" << job_id_ << "event" << "compaction_finished" << "compaction_time_micros" << stats.micros << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" << compact_->compaction->output_level() << "num_output_files" - << compact_->NumOutputFiles() << "total_output_size" - << compact_->total_bytes << "num_input_records" - << stats.num_input_records << "num_output_records" - << compact_->num_output_records << "num_subcompactions" - << compact_->sub_compact_states.size() << "output_compression" - << CompressionTypeToString(compact_->compaction->output_compression()); + << compact_->num_output_files << "total_output_size" + << compact_->total_bytes; - if (compaction_job_stats_ != nullptr) { - stream << "num_single_delete_mismatches" - << compaction_job_stats_->num_single_del_mismatch; - stream << "num_single_delete_fallthrough" - << compaction_job_stats_->num_single_del_fallthru; + if (compact_->num_blob_output_files > 0) { + stream << "num_blob_output_files" << compact_->num_blob_output_files + << "total_blob_output_size" << compact_->total_blob_bytes; } - if (measure_io_stats_ && compaction_job_stats_ != nullptr) { + stream << "num_input_records" << stats.num_input_records + << "num_output_records" << compact_->num_output_records + << "num_subcompactions" << compact_->sub_compact_states.size() + << "output_compression" + << CompressionTypeToString(compact_->compaction->output_compression()); + + stream << "num_single_delete_mismatches" + << 
compaction_job_stats_->num_single_del_mismatch; + stream << "num_single_delete_fallthrough" + << compaction_job_stats_->num_single_del_fallthru; + + if (measure_io_stats_) { stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos; stream << "file_range_sync_nanos" << compaction_job_stats_->file_range_sync_nanos; @@ -797,14 +1013,222 @@ } stream.EndArray(); + if (!blob_files.empty()) { + stream << "blob_file_head" << blob_files.begin()->first; + stream << "blob_file_tail" << blob_files.rbegin()->first; + } + CleanupCompaction(); return status; } +#ifndef ROCKSDB_LITE +CompactionServiceJobStatus +CompactionJob::ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact) { + assert(sub_compact); + assert(sub_compact->compaction); + assert(db_options_.compaction_service); + + const Compaction* compaction = sub_compact->compaction; + CompactionServiceInput compaction_input; + compaction_input.output_level = compaction->output_level(); + + const std::vector& inputs = + *(compact_->compaction->inputs()); + for (const auto& files_per_level : inputs) { + for (const auto& file : files_per_level.files) { + compaction_input.input_files.emplace_back( + MakeTableFileName(file->fd.GetNumber())); + } + } + compaction_input.column_family.name = + compaction->column_family_data()->GetName(); + compaction_input.column_family.options = + compaction->column_family_data()->GetLatestCFOptions(); + compaction_input.db_options = + BuildDBOptions(db_options_, mutable_db_options_copy_); + compaction_input.snapshots = existing_snapshots_; + compaction_input.has_begin = sub_compact->start; + compaction_input.begin = + compaction_input.has_begin ? sub_compact->start->ToString() : ""; + compaction_input.has_end = sub_compact->end; + compaction_input.end = + compaction_input.has_end ? 
sub_compact->end->ToString() : ""; + compaction_input.approx_size = sub_compact->approx_size; + + std::string compaction_input_binary; + Status s = compaction_input.Write(&compaction_input_binary); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + std::ostringstream input_files_oss; + bool is_first_one = true; + for (const auto& file : compaction_input.input_files) { + input_files_oss << (is_first_one ? "" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, + GetCompactionId(sub_compact), thread_pri_); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->StartV2(info, compaction_input_binary); + switch (compaction_status) { + case CompactionServiceJobStatus::kSuccess: + break; + case CompactionServiceJobStatus::kFailure: + sub_compact->status = Status::Incomplete( + "CompactionService failed to start compaction job."); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed to start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + case CompactionServiceJobStatus::kUseLocal: + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API Start.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + default: + assert(false); // unknown status + break; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Waiting for remote compaction...", + compaction_input.column_family.name.c_str(), job_id_); + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForCompleteV2( + info, &compaction_result_binary); + + if 
(compaction_status == CompactionServiceJobStatus::kUseLocal) { + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Remote compaction fallback to local by API " + "WaitForComplete.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + + if (compaction_status == CompactionServiceJobStatus::kFailure) { + if (s.ok()) { + if (compaction_result.status.ok()) { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (even though " + "the internal status is okay)."); + } else { + // set the current sub compaction status with the status returned from + // remote + sub_compact->status = compaction_result.status; + } + } else { + sub_compact->status = Status::Incomplete( + "CompactionService failed to run the compaction job (and no valid " + "result is returned)."); + compaction_result.status.PermitUncheckedError(); + } + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed.", + compaction_input.column_family.name.c_str(), job_id_); + return compaction_status; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return CompactionServiceJobStatus::kFailure; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? 
"" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return CompactionServiceJobStatus::kFailure; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.marked_for_compaction = file.marked_for_compaction; + + auto cfd = compaction->column_family_data(); + sub_compact->outputs.emplace_back(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->num_output_records = compaction_result.num_output_records; + sub_compact->approx_size = compaction_input.approx_size; // is this used? 
+ sub_compact->total_bytes = compaction_result.total_bytes; + RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); + RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, + compaction_result.bytes_written); + return CompactionServiceJobStatus::kSuccess; +} +#endif // !ROCKSDB_LITE + void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { - assert(sub_compact != nullptr); + assert(sub_compact); + assert(sub_compact->compaction); - uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000; +#ifndef ROCKSDB_LITE + if (db_options_.compaction_service) { + CompactionServiceJobStatus comp_status = + ProcessKeyValueCompactionWithCompactionService(sub_compact); + if (comp_status == CompactionServiceJobStatus::kSuccess || + comp_status == CompactionServiceJobStatus::kFailure) { + return; + } + // fallback to local compaction + assert(comp_status == CompactionServiceJobStatus::kUseLocal); + } +#endif // !ROCKSDB_LITE + + uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); @@ -828,10 +1252,63 @@ CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), existing_snapshots_); + const Slice* const start = sub_compact->start; + const Slice* const end = sub_compact->end; + + ReadOptions read_options; + read_options.verify_checksums = true; + read_options.fill_cache = false; + // Compaction iterators shouldn't be confined to a single prefix. + // Compactions use Seek() for + // (a) concurrent compactions, + // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. + read_options.total_order_seek = true; + + // Note: if we're going to support subcompactions for user-defined timestamps, + // the timestamp part will have to be stripped from the bounds here. 
+ assert((!start && !end) || cfd->user_comparator()->timestamp_size() == 0); + read_options.iterate_lower_bound = start; + read_options.iterate_upper_bound = end; + // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. - std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, &range_del_agg, file_options_for_read_)); + std::unique_ptr raw_input( + versions_->MakeInputIterator(read_options, sub_compact->compaction, + &range_del_agg, file_options_for_read_)); + InternalIterator* input = raw_input.get(); + + IterKey start_ikey; + IterKey end_ikey; + Slice start_slice; + Slice end_slice; + + if (start) { + start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); + start_slice = start_ikey.GetInternalKey(); + } + if (end) { + end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek); + end_slice = end_ikey.GetInternalKey(); + } + + std::unique_ptr clip; + if (start || end) { + clip.reset(new ClippingIterator( + raw_input.get(), start ? &start_slice : nullptr, + end ? &end_slice : nullptr, &cfd->internal_comparator())); + input = clip.get(); + } + + std::unique_ptr blob_counter; + + if (sub_compact->compaction->DoesInputReferenceBlobFiles()) { + sub_compact->blob_garbage_meter.reset(new BlobGarbageMeter); + blob_counter.reset( + new BlobCountingIterator(input, sub_compact->blob_garbage_meter.get())); + input = blob_counter.get(); + } + + input->SeekToFirst(); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -857,40 +1334,51 @@ } MergeHelper merge( - env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(), compaction_filter, db_options_.info_log.get(), false /* internal key corruption is expected */, existing_snapshots_.empty() ? 
0 : existing_snapshots_.back(), - snapshot_checker_, compact_->compaction->level(), - db_options_.statistics.get()); + snapshot_checker_, compact_->compaction->level(), db_options_.stats); + + const MutableCFOptions* mutable_cf_options = + sub_compact->compaction->mutable_cf_options(); + assert(mutable_cf_options); + + std::vector blob_file_paths; + + std::unique_ptr blob_file_builder( + mutable_cf_options->enable_blob_files + ? new BlobFileBuilder( + versions_, fs_.get(), + sub_compact->compaction->immutable_options(), + mutable_cf_options, &file_options_, job_id_, cfd->GetID(), + cfd->GetName(), Env::IOPriority::IO_LOW, write_hint_, + io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction, + &blob_file_paths, &sub_compact->blob_file_additions) + : nullptr); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); TEST_SYNC_POINT_CALLBACK( "CompactionJob::Run():PausingManualCompaction:1", reinterpret_cast( - const_cast*>(manual_compaction_paused_))); - - Slice* start = sub_compact->start; - Slice* end = sub_compact->end; - if (start != nullptr) { - IterKey start_iter; - start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); - input->Seek(start_iter.GetInternalKey()); - } else { - input->SeekToFirst(); - } + const_cast*>(manual_compaction_paused_))); Status status; + const std::string* const full_history_ts_low = + full_history_ts_low_.empty() ? 
nullptr : &full_history_ts_low_; sub_compact->c_iter.reset(new CompactionIterator( - input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), + input, cfd->user_comparator(), &merge, versions_->LastSequence(), &existing_snapshots_, earliest_write_conflict_snapshot_, - snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, - &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, - db_options_.info_log)); + snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), + /*expect_valid_internal_key=*/true, &range_del_agg, + blob_file_builder.get(), db_options_.allow_data_in_errors, + sub_compact->compaction, compaction_filter, shutting_down_, + preserve_deletes_seqnum_, manual_compaction_paused_, + manual_compaction_canceled_, db_options_.info_log, full_history_ts_low)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { + sub_compact->FillFilesToCutForTtl(); // ShouldStopBefore() maintains state based on keys processed so far. The // compaction loop always calls it on the "next" key, thus won't tell it the // first key. So we do that here. @@ -899,18 +1387,21 @@ } const auto& c_iter_stats = c_iter->iter_stats(); + std::unique_ptr partitioner = + sub_compact->compaction->output_level() == 0 + ? nullptr + : sub_compact->compaction->CreateSstPartitioner(); + std::string last_key_for_partitioner; + while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() // returns true. 
const Slice& key = c_iter->key(); const Slice& value = c_iter->value(); - // If an end key (exclusive) is specified, check if the current key is - // >= than it and exit if it is because the iterator is out of its range - if (end != nullptr && - cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) { - break; - } + assert(!end || + cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0); + if (c_iter_stats.num_input_records % kRecordStatsEvery == kRecordStatsEvery - 1) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); @@ -925,10 +1416,18 @@ break; } } - assert(sub_compact->builder != nullptr); - assert(sub_compact->current_output() != nullptr); - sub_compact->builder->Add(key, value); - sub_compact->current_output_file_size = sub_compact->builder->FileSize(); + status = sub_compact->AddToBuilder(key, value); + if (!status.ok()) { + break; + } + + status = sub_compact->ProcessOutFlowIfNeeded(key, value); + if (!status.ok()) { + break; + } + + sub_compact->current_output_file_size = + sub_compact->builder->EstimatedFileSize(); const ParsedInternalKey& ikey = c_iter->ikey(); sub_compact->current_output()->meta.UpdateBoundaries( key, value, ikey.sequence, ikey.type); @@ -943,33 +1442,39 @@ // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB // and 0.6MB instead of 1MB and 0.2MB) bool output_file_ended = false; - Status input_status; if (sub_compact->compaction->output_level() != 0 && sub_compact->current_output_file_size >= sub_compact->compaction->max_output_file_size()) { // (1) this key terminates the file. For historical reasons, the iterator // status before advancing will be given to FinishCompactionOutputFile(). 
- input_status = input->status(); output_file_ended = true; } TEST_SYNC_POINT_CALLBACK( "CompactionJob::Run():PausingManualCompaction:2", reinterpret_cast( - const_cast*>(manual_compaction_paused_))); + const_cast*>(manual_compaction_paused_))); + if (partitioner.get()) { + last_key_for_partitioner.assign(c_iter->user_key().data_, + c_iter->user_key().size_); + } c_iter->Next(); if (c_iter->status().IsManualCompactionPaused()) { break; } - if (!output_file_ended && c_iter->Valid() && - sub_compact->compaction->output_level() != 0 && - sub_compact->ShouldStopBefore(c_iter->key(), - sub_compact->current_output_file_size) && - sub_compact->builder != nullptr) { - // (2) this key belongs to the next file. For historical reasons, the - // iterator status after advancing will be given to - // FinishCompactionOutputFile(). - input_status = input->status(); - output_file_ended = true; + if (!output_file_ended && c_iter->Valid()) { + if (((partitioner.get() && + partitioner->ShouldPartition(PartitionerRequest( + last_key_for_partitioner, c_iter->user_key(), + sub_compact->current_output_file_size)) == kRequired) || + (sub_compact->compaction->output_level() != 0 && + sub_compact->ShouldStopBefore( + c_iter->key(), sub_compact->current_output_file_size))) && + sub_compact->builder != nullptr) { + // (2) this key belongs to the next file. For historical reasons, the + // iterator status after advancing will be given to + // FinishCompactionOutputFile(). 
+ output_file_ended = true; + } } if (output_file_ended) { const Slice* next_key = nullptr; @@ -977,14 +1482,18 @@ next_key = &c_iter->key(); } CompactionIterationStats range_del_out_stats; - status = - FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg, - &range_del_out_stats, next_key); + status = FinishCompactionOutputFile(input->status(), sub_compact, + &range_del_agg, &range_del_out_stats, + next_key); RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); } } + sub_compact->compaction_job_stats.num_blobs_read = + c_iter_stats.num_blobs_read; + sub_compact->compaction_job_stats.total_blob_bytes_read = + c_iter_stats.total_blob_bytes_read; sub_compact->compaction_job_stats.num_input_deletion_records = c_iter_stats.num_input_deletion_records; sub_compact->compaction_job_stats.num_corrupt_keys = @@ -1000,6 +1509,16 @@ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, c_iter_stats.total_filter_time); + + if (c_iter_stats.num_blobs_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED, + c_iter_stats.num_blobs_relocated); + } + if (c_iter_stats.total_blob_bytes_relocated > 0) { + RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED, + c_iter_stats.total_blob_bytes_relocated); + } + RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); RecordCompactionIOStats(); @@ -1012,8 +1531,10 @@ status = Status::ShutdownInProgress("Database shutdown"); } if ((status.ok() || status.IsColumnFamilyDropped()) && - (manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed))) { + ((manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)))) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } if (status.ok()) { @@ -1035,14 +1556,23 @@ CompactionIterationStats range_del_out_stats; Status s = FinishCompactionOutputFile(status, 
sub_compact, &range_del_agg, &range_del_out_stats); - if (status.ok()) { + if (!s.ok() && status.ok()) { status = s; } RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); } + if (blob_file_builder) { + if (status.ok()) { + status = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(status); + } + blob_file_builder.reset(); + } + sub_compact->compaction_job_stats.cpu_micros = - env_->NowCPUNanos() / 1000 - prev_cpu_micros; + db_options_.clock->CPUMicros() - prev_cpu_micros; if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += @@ -1061,12 +1591,28 @@ SetPerfLevel(prev_perf_level); } } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!status.ok()) { + if (sub_compact->c_iter) { + sub_compact->c_iter->status().PermitUncheckedError(); + } + if (input) { + input->status().PermitUncheckedError(); + } + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED sub_compact->c_iter.reset(); - input.reset(); + blob_counter.reset(); + clip.reset(); + raw_input.reset(); sub_compact->status = status; } +uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) { + return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id; +} + void CompactionJob::RecordDroppedKeys( const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats) { @@ -1121,6 +1667,8 @@ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); const Comparator* ucmp = cfd->user_comparator(); + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; // Check for iterator errors Status s = input_status; @@ -1194,6 +1742,7 @@ } else { it->SeekToFirst(); } + TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1"); for (; it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); if (upper_bound != nullptr) { @@ -1221,6 +1770,7 @@ auto kv = tombstone.Serialize(); assert(lower_bound == nullptr || ucmp->Compare(*lower_bound, kv.second) < 0); + // Range 
tombstone is not supported by output validator yet. sub_compact->builder->Add(kv.first.Encode(), kv.second); InternalKey smallest_candidate = std::move(kv.first); if (lower_bound != nullptr && @@ -1277,7 +1827,6 @@ meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, tombstone.seq_, cfd->internal_comparator()); - // The smallest key in a file is used for range tombstone truncation, so // it cannot have a seqnum of 0 (unless the smallest data key in a file // has a seqnum of 0). Otherwise, the truncated tombstone may expose @@ -1286,7 +1835,6 @@ ExtractInternalKeyFooter(meta->smallest.Encode()) != PackSequenceAndType(0, kTypeRangeDeletion)); } - meta->marked_for_compaction = sub_compact->builder->NeedCompact(); } const uint64_t current_entries = sub_compact->builder->NumEntries(); if (s.ok()) { @@ -1294,25 +1842,59 @@ } else { sub_compact->builder->Abandon(); } + IOStatus io_s = sub_compact->builder->io_status(); + if (s.ok()) { + s = io_s; + } const uint64_t current_bytes = sub_compact->builder->FileSize(); if (s.ok()) { - // Add the checksum information to file metadata. - meta->file_checksum = sub_compact->builder->GetFileChecksum(); - meta->file_checksum_func_name = - sub_compact->builder->GetFileChecksumFuncName(); - meta->fd.file_size = current_bytes; + meta->marked_for_compaction = sub_compact->builder->NeedCompact(); + // With accurate smallest and largest key, we can get a slightly more + // accurate oldest ancester time. + // This makes oldest ancester time in manifest more accurate than in + // table properties. Not sure how to resolve it. 
+ if (meta->smallest.size() > 0 && meta->largest.size() > 0) { + uint64_t refined_oldest_ancester_time; + Slice new_smallest = meta->smallest.user_key(); + Slice new_largest = meta->largest.user_key(); + if (!new_largest.empty() && !new_smallest.empty()) { + refined_oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime( + &(meta->smallest), &(meta->largest)); + if (refined_oldest_ancester_time != port::kMaxUint64) { + meta->oldest_ancester_time = refined_oldest_ancester_time; + } + } + } } sub_compact->current_output()->finished = true; sub_compact->total_bytes += current_bytes; // Finish and check for file errors if (s.ok()) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = sub_compact->outfile->Sync(db_options_.use_fsync); + StopWatch sw(db_options_.clock, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + io_s = sub_compact->outfile->Sync(db_options_.use_fsync); + } + if (s.ok() && io_s.ok()) { + io_s = sub_compact->outfile->Close(); + } + if (s.ok() && io_s.ok()) { + // Add the checksum information to file metadata. + meta->file_checksum = sub_compact->outfile->GetFileChecksum(); + meta->file_checksum_func_name = + sub_compact->outfile->GetFileChecksumFuncName(); + file_checksum = meta->file_checksum; + file_checksum_func_name = meta->file_checksum_func_name; } if (s.ok()) { - s = sub_compact->outfile->Close(); + s = io_s; + } + if (sub_compact->io_status.ok()) { + sub_compact->io_status = io_s; + // Since this error is really a copy of the + // "normal" status, it does not also need to be checked + sub_compact->io_status.PermitUncheckedError(); } sub_compact->outfile.reset(); @@ -1326,9 +1908,20 @@ // This happens when the output level is bottom level, at the same time // the sub_compact output nothing. 
std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + TableFileName(sub_compact->compaction->immutable_options()->cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); - env_->DeleteFile(fname); + + // TODO(AR) it is not clear if there are any larger implications if + // DeleteFile fails here + Status ds = env_->DeleteFile(fname); + if (!ds.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64 + " at bottom level%s", + cfd->GetName().c_str(), job_id_, output_number, + meta->marked_for_compaction ? " (need compaction)" : ""); + } // Also need to remove the file from outputs, or it will be added to the // VersionEdit. @@ -1352,9 +1945,7 @@ FileDescriptor output_fd; uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; if (meta != nullptr) { - fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - meta->fd.GetNumber(), meta->fd.GetPathId()); + fname = GetTableFileName(meta->fd.GetNumber()); output_fd = meta->fd; oldest_blob_file_number = meta->oldest_blob_file_number; } else { @@ -1363,14 +1954,18 @@ EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, output_fd, oldest_blob_file_number, tp, - TableFileCreationReason::kCompaction, s); + TableFileCreationReason::kCompaction, s, file_checksum, + file_checksum_func_name); #ifndef ROCKSDB_LITE // Report new file to SstFileManagerImpl auto sfm = static_cast(db_options_.sst_file_manager.get()); if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) { - sfm->OnAddFile(fname); + Status add_s = sfm->OnAddFile(fname); + if (!add_s.ok() && s.ok()) { + s = add_s; + } if (sfm->IsMaxAllowedSpaceReached()) { // TODO(ajkr): should we return OK() if max space was reached by the final // compaction output file (similarly to how flush works when full)? 
@@ -1391,49 +1986,86 @@ Status CompactionJob::InstallCompactionResults( const MutableCFOptions& mutable_cf_options) { + assert(compact_); + db_mutex_->AssertHeld(); auto* compaction = compact_->compaction; - // paranoia: verify that the files that we started with - // still exist in the current version and in the same original level. - // This ensures that a concurrent compaction did not erroneously - // pick the same files to compact_. - if (!versions_->VerifyCompactionFileConsistency(compaction)) { - Compaction::InputLevelSummaryBuffer inputs_summary; - - ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted", - compaction->column_family_data()->GetName().c_str(), - job_id_, compaction->InputLevelSummary(&inputs_summary)); - return Status::Corruption("Compaction input files inconsistent"); - } + assert(compaction); { Compaction::InputLevelSummaryBuffer inputs_summary; - ROCKS_LOG_INFO( - db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", - compaction->column_family_data()->GetName().c_str(), job_id_, - compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), + compact_->total_bytes + compact_->total_blob_bytes); } + VersionEdit* const edit = compaction->edit(); + assert(edit); + // Add compaction inputs - compaction->AddInputDeletions(compact_->compaction->edit()); + compaction->AddInputDeletions(edit); + + std::unordered_map blob_total_garbage; for (const auto& sub_compact : compact_->sub_compact_states) { for (const auto& out : sub_compact.outputs) { - compaction->edit()->AddFile(compaction->output_level(), out.meta); + edit->AddFile(compaction->output_level(), out.meta); + } + + for (const auto& blob : sub_compact.blob_file_additions) { + edit->AddBlobFile(blob); } + + if 
(sub_compact.blob_garbage_meter) { + const auto& flows = sub_compact.blob_garbage_meter->flows(); + + for (const auto& pair : flows) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobInOutFlow& flow = pair.second; + + assert(flow.IsValid()); + if (flow.HasGarbage()) { + blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(), + flow.GetGarbageBytes()); + } + } + } + } + + for (const auto& pair : blob_total_garbage) { + const uint64_t blob_file_number = pair.first; + const BlobGarbageMeter::BlobStats& stats = pair.second; + + edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(), + stats.GetBytes()); } + return versions_->LogAndApply(compaction->column_family_data(), - mutable_cf_options, compaction->edit(), - db_mutex_, db_directory_); + mutable_cf_options, edit, db_mutex_, + db_directory_); } void CompactionJob::RecordCompactionIOStats() { RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + CompactionReason compaction_reason = + compact_->compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kPeriodicCompaction) { + RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written)); + } else if (compaction_reason == CompactionReason::kTtl) { + RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read)); + RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written)); + } ThreadStatusUtil::IncreaseThreadOperationProperty( ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read)); IOSTATS_RESET(bytes_read); - RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); 
ThreadStatusUtil::IncreaseThreadOperationProperty( ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); @@ -1445,9 +2077,7 @@ assert(sub_compact->builder == nullptr); // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); - std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - file_number, sub_compact->compaction->output_path_id()); + std::string fname = GetTableFileName(file_number); // Fire events. ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); #ifndef ROCKSDB_LITE @@ -1462,7 +2092,25 @@ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", &syncpoint_arg); #endif - Status s = NewWritableFile(fs_, fname, &writable_file, file_options_); + + // Pass temperature of botommost files to FileSystem. + FileOptions fo_copy = file_options_; + Temperature temperature = sub_compact->compaction->output_temperature(); + if (temperature == Temperature::kUnknown && bottommost_level_) { + temperature = + sub_compact->compaction->mutable_cf_options()->bottommost_temperature; + } + fo_copy.temperature = temperature; + + Status s; + IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); + s = io_s; + if (sub_compact->io_status.ok()) { + sub_compact->io_status = io_s; + // Since this error is really a copy of the io_s that is checked below as s, + // it does not also need to be checked. 
+ sub_compact->io_status.PermitUncheckedError(); + } if (!s.ok()) { ROCKS_LOG_ERROR( db_options_.info_log, @@ -1474,13 +2122,14 @@ EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, - TableProperties(), TableFileCreationReason::kCompaction, s); + TableProperties(), TableFileCreationReason::kCompaction, s, + kUnknownFileChecksum, kUnknownFileChecksumFuncName); return s; } // Try to figure out the output file's oldest ancester time. int64_t temp_current_time = 0; - auto get_time_status = env_->GetCurrentTime(&temp_current_time); + auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed. if (!get_time_status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -1488,50 +2137,62 @@ get_time_status.ToString().c_str()); } uint64_t current_time = static_cast(temp_current_time); + InternalKey tmp_start, tmp_end; + if (sub_compact->start != nullptr) { + tmp_start.SetMinPossibleForUserKey(*(sub_compact->start)); + } + if (sub_compact->end != nullptr) { + tmp_end.SetMinPossibleForUserKey(*(sub_compact->end)); + } uint64_t oldest_ancester_time = - sub_compact->compaction->MinInputFileOldestAncesterTime(); + sub_compact->compaction->MinInputFileOldestAncesterTime( + (sub_compact->start != nullptr) ? &tmp_start : nullptr, + (sub_compact->end != nullptr) ? 
&tmp_end : nullptr); if (oldest_ancester_time == port::kMaxUint64) { oldest_ancester_time = current_time; } // Initialize a SubcompactionState::Output and add it to sub_compact->outputs { - SubcompactionState::Output out; - out.meta.fd = FileDescriptor(file_number, - sub_compact->compaction->output_path_id(), 0); - out.meta.oldest_ancester_time = oldest_ancester_time; - out.meta.file_creation_time = current_time; - out.finished = false; - sub_compact->outputs.push_back(out); + FileMetaData meta; + meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + meta.oldest_ancester_time = oldest_ancester_time; + meta.file_creation_time = current_time; + meta.temperature = temperature; + sub_compact->outputs.emplace_back( + std::move(meta), cfd->internal_comparator(), + /*enable_order_check=*/ + sub_compact->compaction->mutable_cf_options() + ->check_flush_compaction_key_order, + /*enable_hash=*/paranoid_file_checks_); } writable_file->SetIOPriority(Env::IOPriority::IO_LOW); writable_file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = db_options_.checksum_handoff_file_types; writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); const auto& listeners = - sub_compact->compaction->immutable_cf_options()->listeners; - sub_compact->outfile.reset( - new WritableFileWriter(std::move(writable_file), fname, file_options_, - env_, db_options_.statistics.get(), listeners, - db_options_.sst_file_checksum_func.get())); - - // If the Column family flag is to only optimize filters for hits, - // we can skip creating filters if this is the bottommost_level where - // data is going to be found - bool skip_filters = - cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; + sub_compact->compaction->immutable_options()->listeners; + sub_compact->outfile.reset(new WritableFileWriter( + std::move(writable_file), fname, file_options_, db_options_.clock, + io_tracer_, 
db_options_.stats, listeners, + db_options_.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); - sub_compact->builder.reset(NewTableBuilder( + TableBuilderOptions tboptions( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(), sub_compact->compaction->output_compression(), - 0 /*sample_for_compression */, - sub_compact->compaction->output_compression_opts(), - sub_compact->compaction->output_level(), skip_filters, - oldest_ancester_time, 0 /* oldest_key_time */, - sub_compact->compaction->max_output_file_size(), current_time)); + sub_compact->compaction->output_compression_opts(), cfd->GetID(), + cfd->GetName(), sub_compact->compaction->output_level(), + bottommost_level_, TableFileCreationReason::kCompaction, + oldest_ancester_time, 0 /* oldest_key_time */, current_time, db_id_, + db_session_id_, sub_compact->compaction->max_output_file_size(), + file_number); + sub_compact->builder.reset( + NewTableBuilder(tboptions, sub_compact->outfile.get())); LogFlush(db_options_.info_log); return s; } @@ -1554,6 +2215,9 @@ TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber()); } } + // TODO: sub_compact.io_status is not checked like status. Not sure if thats + // intentional. So ignoring the io_status as of now. 
+ sub_compact.io_status.PermitUncheckedError(); } delete compact_; compact_ = nullptr; @@ -1571,6 +2235,8 @@ #endif // !ROCKSDB_LITE void CompactionJob::UpdateCompactionStats() { + assert(compact_); + Compaction* compaction = compact_->compaction; compaction_stats_.num_input_files_in_non_output_levels = 0; compaction_stats_.num_input_files_in_output_level = 0; @@ -1588,27 +2254,20 @@ } } - uint64_t num_output_records = 0; - - for (const auto& sub_compact : compact_->sub_compact_states) { - size_t num_output_files = sub_compact.outputs.size(); - if (sub_compact.builder != nullptr) { - // An error occurred so ignore the last output. - assert(num_output_files > 0); - --num_output_files; - } - compaction_stats_.num_output_files += static_cast(num_output_files); - - num_output_records += sub_compact.num_output_records; - - for (const auto& out : sub_compact.outputs) { - compaction_stats_.bytes_written += out.meta.fd.file_size; - } - } + assert(compaction_job_stats_); + compaction_stats_.bytes_read_blob = + compaction_job_stats_->total_blob_bytes_read; + + compaction_stats_.num_output_files = + static_cast(compact_->num_output_files); + compaction_stats_.num_output_files_blob = + static_cast(compact_->num_blob_output_files); + compaction_stats_.bytes_written = compact_->total_bytes; + compaction_stats_.bytes_written_blob = compact_->total_blob_bytes; - if (compaction_stats_.num_input_records > num_output_records) { + if (compaction_stats_.num_input_records > compact_->num_output_records) { compaction_stats_.num_dropped_records = - compaction_stats_.num_input_records - num_output_records; + compaction_stats_.num_input_records - compact_->num_output_records; } } @@ -1630,32 +2289,31 @@ void CompactionJob::UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const { #ifndef ROCKSDB_LITE - if (compaction_job_stats_) { - compaction_job_stats_->elapsed_micros = stats.micros; + compaction_job_stats_->elapsed_micros = stats.micros; - // input information - 
compaction_job_stats_->total_input_bytes = - stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - compaction_job_stats_->num_input_records = stats.num_input_records; - compaction_job_stats_->num_input_files = - stats.num_input_files_in_non_output_levels + - stats.num_input_files_in_output_level; - compaction_job_stats_->num_input_files_at_output_level = - stats.num_input_files_in_output_level; - - // output information - compaction_job_stats_->total_output_bytes = stats.bytes_written; - compaction_job_stats_->num_output_records = compact_->num_output_records; - compaction_job_stats_->num_output_files = stats.num_output_files; - - if (compact_->NumOutputFiles() > 0U) { - CopyPrefix(compact_->SmallestUserKey(), - CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->smallest_output_key_prefix); - CopyPrefix(compact_->LargestUserKey(), - CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->largest_output_key_prefix); - } + // input information + compaction_job_stats_->total_input_bytes = + stats.bytes_read_non_output_levels + stats.bytes_read_output_level; + compaction_job_stats_->num_input_records = stats.num_input_records; + compaction_job_stats_->num_input_files = + stats.num_input_files_in_non_output_levels + + stats.num_input_files_in_output_level; + compaction_job_stats_->num_input_files_at_output_level = + stats.num_input_files_in_output_level; + + // output information + compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; + compaction_job_stats_->num_output_records = compact_->num_output_records; + compaction_job_stats_->num_output_files = stats.num_output_files; + compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; + + if (stats.num_output_files > 0) { + CopyPrefix(compact_->SmallestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->smallest_output_key_prefix); + 
CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->largest_output_key_prefix); } #else (void)stats; @@ -1697,4 +2355,629 @@ } } +std::string CompactionJob::GetTableFileName(uint64_t file_number) { + return TableFileName(compact_->compaction->immutable_options()->cf_paths, + file_number, compact_->compaction->output_path_id()); +} + +#ifndef ROCKSDB_LITE +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +void CompactionServiceCompactionJob::RecordCompactionIOStats() { + compaction_result_->bytes_read += IOSTATS(bytes_read); + compaction_result_->bytes_written += IOSTATS(bytes_written); + CompactionJob::RecordCompactionIOStats(); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, 0, log_buffer, nullptr, output_directory, + nullptr, stats, db_mutex, db_error_handler, existing_snapshots, + kMaxSequenceNumber, nullptr, table_cache, event_logger, + compaction->mutable_cf_options()->paranoid_file_checks, + compaction->mutable_cf_options()->report_bg_io_stats, dbname, + 
&(compaction_service_result->stats), Env::Priority::USER, io_tracer, + nullptr, nullptr, db_id, db_session_id, + compaction->column_family_data()->GetFullHistoryTsLow()), + output_path_(output_path), + compaction_input_(compaction_service_input), + compaction_result_(compaction_service_result) {} + +Status CompactionServiceCompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + + auto* c = compact_->compaction; + assert(c->column_family_data() != nullptr); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + Slice begin = compaction_input_.begin; + Slice end = compaction_input_.end; + compact_->sub_compact_states.emplace_back( + c, compaction_input_.has_begin ? &begin : nullptr, + compaction_input_.has_end ? &end : nullptr, compaction_input_.approx_size, + /*sub_job_id*/ 0); + + log_buffer_->FlushBufferToLog(); + LogCompaction(); + const uint64_t start_micros = db_options_.clock->NowMicros(); + // Pick the only sub-compaction we should have + assert(compact_->sub_compact_states.size() == 1); + SubcompactionState* sub_compact = compact_->sub_compact_states.data(); + + ProcessKeyValueCompaction(sub_compact); + + compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; + compaction_stats_.cpu_micros = sub_compact->compaction_job_stats.cpu_micros; + + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.cpu_micros); + + Status status = sub_compact->status; + IOStatus io_s = sub_compact->io_status; + + if (io_status_.ok()) { + io_status_ = io_s; + } + + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg, + 
DirFsyncOptions()); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } + if (status.ok()) { + // TODO: Add verify_table() + } + + // Finish up all book-keeping to unify the subcompaction results + AggregateStatistics(); + UpdateCompactionStats(); + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + compact_->status = status; + compact_->status.PermitUncheckedError(); + + // Build compaction result + compaction_result_->output_level = compact_->compaction->output_level(); + compaction_result_->output_path = output_path_; + for (const auto& output_file : sub_compact->outputs) { + auto& meta = output_file.meta; + compaction_result_->output_files.emplace_back( + MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.smallest.Encode().ToString(), + meta.largest.Encode().ToString(), meta.oldest_ancester_time, + meta.file_creation_time, output_file.validator.GetHash(), + meta.marked_for_compaction); + } + compaction_result_->num_output_records = sub_compact->num_output_records; + compaction_result_->total_bytes = sub_compact->total_bytes; + + return status; +} + +void CompactionServiceCompactionJob::CleanupCompaction() { + CompactionJob::CleanupCompaction(); +} + +// Internal binary format for the input and result data +enum BinaryFormatVersion : uint32_t { + kOptionsString = 1, // Use string format similar to Option string format +}; + +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyDescriptor::options) +// This call will return the offset of options in ColumnFamilyDescriptor class +// +// This is the same as offsetof() but allow us to work with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +static ColumnFamilyDescriptor dummy_cfd("", ColumnFamilyOptions()); +template +int offset_of(T1 
ColumnFamilyDescriptor::*member) { + return int(size_t(&(dummy_cfd.*member)) - size_t(&dummy_cfd)); +} + +static CompactionServiceInput dummy_cs_input; +template +int offset_of(T1 CompactionServiceInput::*member) { + return int(size_t(&(dummy_cs_input.*member)) - size_t(&dummy_cs_input)); +} + +static std::unordered_map cfd_type_info = { + {"name", + {offset_of(&ColumnFamilyDescriptor::name), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"options", + {offset_of(&ColumnFamilyDescriptor::options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto cf_options = static_cast(addr); + return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(), + value, cf_options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto cf_options = static_cast(addr); + std::string result; + auto status = + GetStringFromColumnFamilyOptions(opts, *cf_options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = CFOptionsAsConfigurable(*this_one); + auto that_conf = CFOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, +}; + +static std::unordered_map cs_input_type_info = { + {"column_family", + OptionTypeInfo::Struct("column_family", &cfd_type_info, + offset_of(&CompactionServiceInput::column_family), + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, + {"db_options", + {offset_of(&CompactionServiceInput::db_options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto options = static_cast(addr); + return GetDBOptionsFromString(opts, DBOptions(), value, options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto options = static_cast(addr); + std::string result; + auto status = GetStringFromDBOptions(opts, *options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = DBOptionsAsConfigurable(*this_one); + auto that_conf = DBOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, + {"snapshots", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::snapshots), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, + {"input_files", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::input_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kEncodedString})}, + {"output_level", + {offset_of(&CompactionServiceInput::output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_begin", + {offset_of(&CompactionServiceInput::has_begin), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"begin", + {offset_of(&CompactionServiceInput::begin), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_end", + {offset_of(&CompactionServiceInput::has_end), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"end", + {offset_of(&CompactionServiceInput::end), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"approx_size", + {offset_of(&CompactionServiceInput::approx_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + cs_output_file_type_info = { + {"file_name", + {offsetof(struct CompactionServiceOutputFile, file_name), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_seqno", + {offsetof(struct CompactionServiceOutputFile, smallest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_seqno", + {offsetof(struct CompactionServiceOutputFile, largest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_internal_key", + {offsetof(struct CompactionServiceOutputFile, smallest_internal_key), + 
OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_internal_key", + {offsetof(struct CompactionServiceOutputFile, largest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"oldest_ancester_time", + {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_creation_time", + {offsetof(struct CompactionServiceOutputFile, file_creation_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_hash", + {offsetof(struct CompactionServiceOutputFile, paranoid_hash), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"marked_for_compaction", + {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + compaction_job_stats_type_info = { + {"elapsed_micros", + {offsetof(struct CompactionJobStats, elapsed_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cpu_micros", + {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_input_records", + {offsetof(struct CompactionJobStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_blobs_read", + {offsetof(struct CompactionJobStats, num_blobs_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files", + {offsetof(struct CompactionJobStats, num_input_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_at_output_level", + {offsetof(struct CompactionJobStats, num_input_files_at_output_level), + 
OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionJobStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct CompactionJobStats, num_output_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct CompactionJobStats, num_output_files_blob), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_full_compaction", + {offsetof(struct CompactionJobStats, is_full_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_manual_compaction", + {offsetof(struct CompactionJobStats, is_manual_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_bytes", + {offsetof(struct CompactionJobStats, total_input_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_blob_bytes_read", + {offsetof(struct CompactionJobStats, total_blob_bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes", + {offsetof(struct CompactionJobStats, total_output_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes_blob", + {offsetof(struct CompactionJobStats, total_output_bytes_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_records_replaced", + {offsetof(struct CompactionJobStats, num_records_replaced), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_key_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"total_input_raw_value_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_deletion_records", + {offsetof(struct CompactionJobStats, num_input_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_expired_deletion_records", + {offsetof(struct CompactionJobStats, num_expired_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_corrupt_keys", + {offsetof(struct CompactionJobStats, num_corrupt_keys), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_write_nanos", + {offsetof(struct CompactionJobStats, file_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_range_sync_nanos", + {offsetof(struct CompactionJobStats, file_range_sync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_fsync_nanos", + {offsetof(struct CompactionJobStats, file_fsync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_prepare_write_nanos", + {offsetof(struct CompactionJobStats, file_prepare_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_output_key_prefix", + {offsetof(struct CompactionJobStats, smallest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_output_key_prefix", + {offsetof(struct CompactionJobStats, largest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_fallthru", + {offsetof(struct CompactionJobStats, num_single_del_fallthru), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"num_single_del_mismatch", + {offsetof(struct CompactionJobStats, num_single_del_mismatch), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +namespace { +// this is a helper struct to serialize and deserialize class Status, because +// Status's members are not public. +struct StatusSerializationAdapter { + uint8_t code; + uint8_t subcode; + uint8_t severity; + std::string message; + + StatusSerializationAdapter() {} + explicit StatusSerializationAdapter(const Status& s) { + code = s.code(); + subcode = s.subcode(); + severity = s.severity(); + auto msg = s.getState(); + message = msg ? msg : ""; + } + + Status GetStatus() { + return Status(static_cast(code), + static_cast(subcode), + static_cast(severity), message); + } +}; +} // namespace + +static std::unordered_map + status_adapter_type_info = { + {"code", + {offsetof(struct StatusSerializationAdapter, code), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"subcode", + {offsetof(struct StatusSerializationAdapter, subcode), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"severity", + {offsetof(struct StatusSerializationAdapter, severity), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"message", + {offsetof(struct StatusSerializationAdapter, message), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map cs_result_type_info = { + {"status", + {offsetof(struct CompactionServiceResult, status), + OptionType::kCustomizable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter; + Status s = OptionTypeInfo::ParseType( + opts, value, status_adapter_type_info, &adapter); + *status_obj = adapter.GetStatus(); + 
return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter(*status_obj); + std::string result; + Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info, + &adapter, &result); + *value = "{" + result + "}"; + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr1, const void* addr2, std::string* mismatch) { + const auto status1 = static_cast(addr1); + const auto status2 = static_cast(addr2); + StatusSerializationAdapter adatper1(*status1); + StatusSerializationAdapter adapter2(*status2); + return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info, + &adatper1, &adapter2, mismatch); + }}}, + {"output_files", + OptionTypeInfo::Vector( + offsetof(struct CompactionServiceResult, output_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone))}, + {"output_level", + {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"output_path", + {offsetof(struct CompactionServiceResult, output_path), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionServiceResult, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_bytes", + {offsetof(struct CompactionServiceResult, total_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read", + {offsetof(struct CompactionServiceResult, bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_written", + {offsetof(struct CompactionServiceResult, bytes_written), 
+ OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"stats", OptionTypeInfo::Struct( + "stats", &compaction_job_stats_type_info, + offsetof(struct CompactionServiceResult, stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +}; + +Status CompactionServiceInput::Read(const std::string& data_str, + CompactionServiceInput* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceInput string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Input data version not supported: " + + ToString(format_version)); + } +} + +Status CompactionServiceInput::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output); +} + +Status CompactionServiceResult::Read(const std::string& data_str, + CompactionServiceResult* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceResult string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Result data version not supported: " + + ToString(format_version)); + } +} + +Status 
CompactionServiceResult::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output); +} + +#ifndef NDEBUG +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other, + mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other, + mismatch); +} +#endif // NDEBUG +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,9 +17,9 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" #include "db/compaction/compaction_iterator.h" -#include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" #include "db/job_context.h" @@ -50,6 +50,7 @@ class ErrorHandler; class MemTable; class SnapshotChecker; +class SystemClock; class TableCache; class 
Version; class VersionEdit; @@ -62,25 +63,29 @@ // if needed. class CompactionJob { public: - CompactionJob(int job_id, Compaction* compaction, - const ImmutableDBOptions& db_options, - const FileOptions& file_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, - CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, - const std::atomic* manual_compaction_paused = nullptr); + CompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* 
blob_callback = nullptr); - ~CompactionJob(); + virtual ~CompactionJob(); // no copy/move CompactionJob(CompactionJob&& job) = delete; @@ -100,11 +105,39 @@ // Add compaction input/output to the current version Status Install(const MutableCFOptions& mutable_cf_options); - private: + // Return the IO status + IOStatus io_status() const { return io_status_; } + + protected: struct SubcompactionState; + // CompactionJob state + struct CompactionState; void AggregateStatistics(); + void UpdateCompactionStats(); + void LogCompaction(); + virtual void RecordCompactionIOStats(); + void CleanupCompaction(); + + // Call compaction filter. Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + CompactionState* compact_; + InternalStats::CompactionStats compaction_stats_; + const ImmutableDBOptions& db_options_; + const MutableDBOptions mutable_db_options_copy_; + LogBuffer* log_buffer_; + FSDirectory* output_directory_; + Statistics* stats_; + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + + Env::WriteLifeTimeHint write_hint_; + + IOStatus io_status_; + + private: // Generates a histogram representing potential divisions of key ranges from // the input. It adds the starting and/or ending keys of certain input files // to the working set and then finds the approximate size of data in between @@ -112,12 +145,12 @@ // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); + CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact); + // update the thread status for starting a compaction. void ReportStartedCompaction(Compaction* compaction); void AllocateCompactionOutputFileNumbers(); - // Call compaction filter. 
Then iterate through input and compact the - // kv-pairs - void ProcessKeyValueCompaction(SubcompactionState* sub_compact); Status FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, @@ -125,45 +158,37 @@ CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key = nullptr); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); - void RecordCompactionIOStats(); Status OpenCompactionOutputFile(SubcompactionState* sub_compact); - void CleanupCompaction(); void UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats = nullptr); - void UpdateCompactionStats(); void UpdateCompactionInputStatsHelper( int* num_files, uint64_t* bytes_read, int input_level); - void LogCompaction(); - - int job_id_; + uint32_t job_id_; - // CompactionJob state - struct CompactionState; - CompactionState* compact_; CompactionJobStats* compaction_job_stats_; - InternalStats::CompactionStats compaction_stats_; // DBImpl state const std::string& dbname_; - const ImmutableDBOptions& db_options_; + const std::string db_id_; + const std::string db_session_id_; const FileOptions file_options_; Env* env_; - FileSystem* fs_; + std::shared_ptr io_tracer_; + FileSystemPtr fs_; // env_option optimized for compaction table reads FileOptions file_options_for_read_; VersionSet* versions_; const std::atomic* shutting_down_; - const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; - LogBuffer* log_buffer_; - Directory* db_directory_; - Directory* output_directory_; - Statistics* stats_; + FSDirectory* db_directory_; + FSDirectory* blob_output_directory_; InstrumentedMutex* db_mutex_; ErrorHandler* db_error_handler_; // If there were two snapshots with seq numbers s1 
and @@ -183,16 +208,158 @@ EventLogger* event_logger_; - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; // Stores the Slices that designate the boundaries for each subcompaction std::vector boundaries_; // Stores the approx size of keys covered in the range of each subcompaction std::vector sizes_; - Env::WriteLifeTimeHint write_hint_; Env::Priority thread_pri_; + std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; + + uint64_t GetCompactionId(SubcompactionState* sub_compact); + + // Get table file name in where it's outputting to, which should also be in + // `output_directory_`. + virtual std::string GetTableFileName(uint64_t file_number); +}; + +// CompactionServiceInput is used the pass compaction information between two +// db instances. It contains the information needed to do a compaction. It +// doesn't contain the LSM tree information, which is passed though MANIFEST +// file. +struct CompactionServiceInput { + ColumnFamilyDescriptor column_family; + + DBOptions db_options; + + std::vector snapshots; + + // SST files for compaction, it should already be expended to include all the + // files needed for this compaction, for both input level files and output + // level files. 
+ std::vector input_files; + int output_level; + + // information for subcompaction + bool has_begin = false; + std::string begin; + bool has_end = false; + std::string end; + uint64_t approx_size = 0; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceInput* obj); + Status Write(std::string* output); + + // Initialize a dummy ColumnFamilyDescriptor + CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {} + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceInput* other); + bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceOutputFile is the metadata for the output SST file +struct CompactionServiceOutputFile { + std::string file_name; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; + std::string smallest_internal_key; + std::string largest_internal_key; + uint64_t oldest_ancester_time; + uint64_t file_creation_time; + uint64_t paranoid_hash; + bool marked_for_compaction; + + CompactionServiceOutputFile() = default; + CompactionServiceOutputFile( + const std::string& name, SequenceNumber smallest, SequenceNumber largest, + std::string _smallest_internal_key, std::string _largest_internal_key, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + uint64_t _paranoid_hash, bool _marked_for_compaction) + : file_name(name), + smallest_seqno(smallest), + largest_seqno(largest), + smallest_internal_key(std::move(_smallest_internal_key)), + largest_internal_key(std::move(_largest_internal_key)), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + paranoid_hash(_paranoid_hash), + marked_for_compaction(_marked_for_compaction) {} +}; + +// CompactionServiceResult contains the compaction result from a different db +// instance, with these information, the primary db instance with write +// permission is able to install the result to the DB. 
+struct CompactionServiceResult { + Status status; + std::vector output_files; + int output_level; + + // location of the output files + std::string output_path; + + // some statistics about the compaction + uint64_t num_output_records = 0; + uint64_t total_bytes = 0; + uint64_t bytes_read = 0; + uint64_t bytes_written = 0; + CompactionJobStats stats; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceResult* obj); + Status Write(std::string* output); + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceResult* other); + bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceCompactionJob is an read-only compaction job, it takes +// input information from `compaction_service_input` and put result information +// in `compaction_service_result`, the SST files are generated to `output_path`. +class CompactionServiceCompactionJob : private CompactionJob { + public: + CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, LogBuffer* log_buffer, + FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result); + + // Run the compaction in current thread and return the result + Status Run(); + + void CleanupCompaction(); + + IOStatus io_status() const { return CompactionJob::io_status(); } + + protected: + void RecordCompactionIOStats() 
override; + + private: + // Get table file name in output_path + std::string GetTableFileName(uint64_t file_number) override; + // Specific the compaction output path, otherwise it uses default DB path + const std::string output_path_; + + // Compaction job input + const CompactionServiceInput& compaction_input_; + + // Compaction job result + CompactionServiceResult* compaction_result_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_stats_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -24,8 +24,6 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" -#include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" #include "port/stack_trace.h" @@ -52,6 +50,7 @@ #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/compression.h" #include "util/hash.h" #include "util/mutexlock.h" @@ -126,9 +125,7 @@ static void SetUpTestCase() {} static void TearDownTestCase() {} - DBImpl* dbfull() { - return reinterpret_cast(db_); - } + DBImpl* dbfull() { return static_cast_with_check(db_); } void CreateColumnFamilies(const std::vector& cfs, const Options& options) { @@ -271,10 +268,10 @@ if (cf == 0) { // default cfd EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + "rocksdb.num-files-at-level" + ToString(level), &property)); } else { EXPECT_TRUE(db_->GetProperty( - handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), + handles_[cf], 
"rocksdb.num-files-at-level" + ToString(level), &property)); } return atoi(property.c_str()); @@ -299,15 +296,14 @@ return result; } - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { + Status Size(uint64_t* size, const Slice& start, const Slice& limit, + int cf = 0) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void Compact(int cf, const Slice& start, const Slice& limit, @@ -460,6 +456,7 @@ ASSERT_EQ(current_stats.num_output_files, stats.num_output_files); + ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction); ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction); @@ -572,7 +569,7 @@ uint64_t num_input_records, size_t key_size, size_t value_size, size_t num_output_files, uint64_t num_output_records, double compression_ratio, uint64_t num_records_replaced, - bool is_manual = true) { + bool is_full = false, bool is_manual = true) { CompactionJobStats stats; stats.Reset(); @@ -596,6 +593,7 @@ stats.total_input_raw_value_bytes = num_input_records * value_size; + stats.is_full_compaction = is_full; stats.is_manual_compaction = is_manual; stats.num_records_replaced = num_records_replaced; @@ -797,7 +795,7 @@ } ASSERT_OK(Flush(1)); - reinterpret_cast(db_)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); stats_checker->set_verify_next_comp_io_stats(true); std::atomic first_prepare_write(true); @@ -895,7 +893,7 @@ CompactRangeOptions cr_options; cr_options.change_level = true; cr_options.target_level = 2; - db_->CompactRange(cr_options, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr)); ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); // Stage 2: Generate files including keys from the 
entire key range @@ -982,26 +980,21 @@ if (num_input_units == 0) { continue; } + // A full compaction only happens when the number of flushes equals to + // the number of compaction input runs. + bool is_full = num_flushes == num_input_units; // The following statement determines the expected smallest key - // based on whether it is a full compaction. A full compaction only - // happens when the number of flushes equals to the number of compaction - // input runs. - uint64_t smallest_key = - (num_flushes == num_input_units) ? - key_base : key_base * (num_flushes - 1); + // based on whether it is a full compaction. + uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1); - stats_checker->AddExpectedStats( - NewManualCompactionJobStats( - Key(smallest_key, 10), - Key(smallest_key + key_base * num_input_units - key_interval, 10), - num_input_units, - num_input_units > 2 ? num_input_units / 2 : 0, - num_keys_per_table * num_input_units, - kKeySize, kValueSize, - num_input_units, - num_keys_per_table * num_input_units, - 1.0, 0, false)); - dbfull()->TEST_WaitForCompact(); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + Key(smallest_key, 10), + Key(smallest_key + key_base * num_input_units - key_interval, 10), + num_input_units, num_input_units > 2 ? 
num_input_units / 2 : 0, + num_keys_per_table * num_input_units, kKeySize, kValueSize, + num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full, + false)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U); @@ -1012,7 +1005,7 @@ &rnd, start_key, start_key + key_base - 1, kKeySize, kValueSize, key_interval, compression_ratio, 1); - reinterpret_cast(db_)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db_)->TEST_WaitForCompact()); } ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_job_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ #ifndef ROCKSDB_LITE +#include "db/compaction/compaction_job.h" + #include #include #include @@ -12,15 +14,16 @@ #include #include -#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/column_family.h" -#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" @@ -67,30 +70,42 @@ } // namespace -// TODO(icanadi) Make it simpler once we mock out VersionSet -class CompactionJobTest : public testing::Test { - public: - CompactionJobTest() - : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("compaction_job_test")), +class CompactionJobTestBase : public testing::Test { + protected: + 
CompactionJobTestBase(std::string dbname, const Comparator* ucmp, + std::function encode_u64_ts) + : dbname_(std::move(dbname)), + ucmp_(ucmp), db_options_(), mutable_cf_options_(cf_options_), + mutable_db_options_(), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr)), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), - error_handler_(nullptr, db_options_, &mutex_) { + error_handler_(nullptr, db_options_, &mutex_), + encode_u64_ts_(std::move(encode_u64_ts)) { + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + env_ = base_env; + fs_ = env_->GetFileSystem(); + } + + void SetUp() override { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.env = env_; db_options_.fs = fs_; db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); + cf_options_.comparator = ucmp_; + cf_options_.table_factory = mock_table_factory_; } std::string GenerateFileName(uint64_t file_number) { @@ -101,9 +116,10 @@ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); } - static std::string KeyStr(const std::string& user_key, - const SequenceNumber seq_num, const ValueType t) { - return InternalKey(user_key, seq_num, t).Encode().ToString(); + std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num, + const ValueType t, uint64_t ts = 0) { + std::string user_key_with_ts = user_key + encode_u64_ts_(ts); + return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString(); } static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, @@ -129,7 +145,7 @@ return blob_index; } - void AddMockFile(const stl_wrappers::KVMap& 
contents, int level = 0) { + void AddMockFile(const mock::KVVector& contents, int level = 0) { assert(contents.size() > 0); bool first_key = true; @@ -143,7 +159,8 @@ std::string skey; std::string value; std::tie(skey, value) = kv; - bool parsed = ParseInternalKey(skey, &key); + const Status pik_status = + ParseInternalKey(skey, &key, true /* log_err_key */); smallest_seqno = std::min(smallest_seqno, key.sequence); largest_seqno = std::max(largest_seqno, key.sequence); @@ -161,7 +178,7 @@ first_key = false; - if (parsed && key.type == kTypeBlobIndex) { + if (pik_status.ok() && key.type == kTypeBlobIndex) { BlobIndex blob_index; const Status s = blob_index.DecodeFrom(value); if (!s.ok()) { @@ -186,13 +203,16 @@ VersionEdit edit; edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key, - smallest_seqno, largest_seqno, false, oldest_blob_file_number, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + smallest_seqno, largest_seqno, false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); mutex_.Lock(); - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_); + EXPECT_OK( + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_)); mutex_.Unlock(); } @@ -203,11 +223,11 @@ } // returns expected result after compaction - stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) { - auto expected_results = mock::MakeMockFile(); - const int kKeysPerFile = 10000; - const int kCorruptKeysPerFile = 200; - const int kMatchingKeys = kKeysPerFile / 2; + mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) { + stl_wrappers::KVMap expected_results; + constexpr int kKeysPerFile = 10000; + constexpr int kCorruptKeysPerFile = 200; + constexpr int 
kMatchingKeys = kKeysPerFile / 2; SequenceNumber sequence_number = 0; auto corrupt_id = [&](int id) { @@ -230,49 +250,51 @@ test::CorruptKeyType(&internal_key); test::CorruptKeyType(&bottommost_internal_key); } - contents.insert({ internal_key.Encode().ToString(), value }); + contents.push_back({internal_key.Encode().ToString(), value}); if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) { expected_results.insert( - { bottommost_internal_key.Encode().ToString(), value }); + {bottommost_internal_key.Encode().ToString(), value}); } } + mock::SortKVVector(&contents, ucmp_); AddMockFile(contents); } SetLastSequence(sequence_number); - return expected_results; + mock::KVVector expected_results_kvvector; + for (auto& kv : expected_results) { + expected_results_kvvector.push_back({kv.first, kv.second}); + } + + return expected_results_kvvector; } void NewDB() { - DestroyDB(dbname_, Options()); + EXPECT_OK(DestroyDB(dbname_, Options())); EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - /*block_cache_tracer=*/nullptr)); + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); compaction_job_stats_.Reset(); - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); - } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + 
std::unique_ptr file_writer; + const auto& fs = env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); + ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; @@ -281,21 +303,22 @@ } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1, nullptr); + s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + + ASSERT_OK(s); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; cf_options_.merge_operator = merge_op_; cf_options_.compaction_filter = compaction_filter_.get(); + std::vector column_families; column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); - EXPECT_OK(versions_->Recover(column_families, false)); + ASSERT_OK(versions_->Recover(column_families, false)); cfd_ = versions_->GetColumnFamilySet()->GetDefault(); } void RunCompaction( const std::vector>& input_files, - const stl_wrappers::KVMap& expected_results, + const mock::KVVector& expected_results, const std::vector& snapshots = {}, SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, int output_level = 1, bool verify = true, @@ -314,11 +337,12 @@ num_input_files += level_files.size(); } - Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), - *cfd->GetLatestMutableCFOptions(), - compaction_input_files, output_level, 1024 * 1024, - 10 * 1024 * 1024, 0, kNoCompression, - cfd->ioptions()->compression_opts, 0, {}, true); + Compaction compaction( + cfd->current()->storage_info(), *cfd->ioptions(), + *cfd->GetLatestMutableCFOptions(), mutable_db_options_, + compaction_input_files, output_level, 1024 * 1024, 10 * 1024 * 1024, 0, + kNoCompression, cfd->GetLatestMutableCFOptions()->compression_opts, + Temperature::kUnknown, 0, 
{}, true); compaction.SetInputVersion(cfd->current()); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); @@ -326,22 +350,28 @@ EventLogger event_logger(db_options_.info_log.get()); // TODO(yiwu) add a mock snapshot checker and add test for it. SnapshotChecker* snapshot_checker = nullptr; + ASSERT_TRUE(full_history_ts_low_.empty() || + ucmp_->timestamp_size() == full_history_ts_low_.size()); CompactionJob compaction_job( - 0, &compaction, db_options_, env_options_, versions_.get(), - &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr, - nullptr, nullptr, &mutex_, &error_handler_, snapshots, + 0, &compaction, db_options_, mutable_db_options_, env_options_, + versions_.get(), &shutting_down_, preserve_deletes_seqnum_, &log_buffer, + nullptr, nullptr, nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER); + Env::Priority::USER, nullptr /* IOTracer */, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); mutex_.Unlock(); - Status s; - s = compaction_job.Run(); + Status s = compaction_job.Run(); ASSERT_OK(s); + ASSERT_OK(compaction_job.io_status()); mutex_.Lock(); ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); + ASSERT_OK(compaction_job.io_status()); mutex_.Unlock(); if (verify) { @@ -363,13 +393,16 @@ } } + std::shared_ptr env_guard_; Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; MutableCFOptions mutable_cf_options_; + MutableDBOptions mutable_db_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager 
write_buffer_manager_; @@ -383,6 +416,17 @@ std::unique_ptr compaction_filter_; std::shared_ptr merge_op_; ErrorHandler error_handler_; + std::string full_history_ts_low_; + const std::function encode_u64_ts_; +}; + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest : public CompactionJobTestBase { + public: + CompactionJobTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_test"), + BytewiseComparator(), + [](uint64_t /*ts*/) { return ""; }) {} }; TEST_F(CompactionJobTest, Simple) { @@ -395,7 +439,7 @@ RunCompaction({ files }, expected_results); } -TEST_F(CompactionJobTest, SimpleCorrupted) { +TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) { NewDB(); auto expected_results = CreateTwoFiles(true); @@ -636,7 +680,7 @@ SetLastSequence(11U); auto files = cfd_->current()->storage_info()->LevelFiles(0); - stl_wrappers::KVMap empty_map; + mock::KVVector empty_map; RunCompaction({files}, empty_map); } @@ -989,7 +1033,7 @@ // single deletion and the (single) deletion gets removed while the corrupt key // gets written out. TODO(noetzli): We probably want a better way to treat // corrupt keys. 
-TEST_F(CompactionJobTest, CorruptionAfterDeletion) { +TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) { NewDB(); auto file1 = @@ -1063,10 +1107,312 @@ /* expected_oldest_blob_file_number */ 19); } +TEST_F(CompactionJobTest, InputSerialization) { + // Setup a random CompactionServiceInput + CompactionServiceInput input; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + input.column_family.options.comparator = ReverseBytewiseComparator(); + input.column_family.options.max_bytes_for_level_base = + rnd64.Uniform(UINT64_MAX); + input.column_family.options.disable_auto_compactions = rnd.OneIn(2); + input.column_family.options.compression = kZSTD; + input.column_family.options.compression_opts.level = 4; + input.db_options.max_background_flushes = 10; + input.db_options.paranoid_checks = rnd.OneIn(2); + input.db_options.statistics = CreateDBStatistics(); + input.db_options.env = env_; + while (!rnd.OneIn(10)) { + input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); + } + while (!rnd.OneIn(10)) { + input.input_files.emplace_back(rnd.RandomString( + rnd.Uniform(kStrMaxLen - 1) + + 1)); // input file name should have at least one character + } + input.output_level = 4; + input.has_begin = rnd.OneIn(2); + if (input.has_begin) { + input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.has_end = rnd.OneIn(2); + if (input.has_end) { + input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.approx_size = rnd64.Uniform(UINT64_MAX); + + std::string output; + ASSERT_OK(input.Write(&output)); + + // Test deserialization + CompactionServiceInput deserialized1; + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&input)); + + // Test mismatch + deserialized1.db_options.max_background_flushes += 10; + std::string mismatch; + 
ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "db_options.max_background_flushes"); + + // Test unknown field + CompactionServiceInput deserialized2; + output.clear(); + ASSERT_OK(input.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&input)); + + // Test missing field + CompactionServiceInput deserialized3; + deserialized3.output_level = 0; + std::string to_remove = "output_level=4;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "output_level"); + + // manually set the value back, should match the original structure + deserialized3.output_level = 4; + ASSERT_TRUE(deserialized3.TEST_Equals(&input)); + + // Test invalid version + output.clear(); + ASSERT_OK(input.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceInput::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_F(CompactionJobTest, ResultSerialization) { + // Setup a random CompactionServiceResult + CompactionServiceResult result; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + std::vector status_list = { + Status::OK(), + Status::InvalidArgument("invalid option"), + Status::Aborted("failed to run"), + Status::NotSupported("not supported option"), + }; + result.status = + 
status_list.at(rnd.Uniform(static_cast(status_list.size()))); + while (!rnd.OneIn(10)) { + result.output_files.emplace_back( + rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), rnd.OneIn(2)); + } + result.output_level = rnd.Uniform(10); + result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + result.num_output_records = rnd64.Uniform(UINT64_MAX); + result.total_bytes = rnd64.Uniform(UINT64_MAX); + result.bytes_read = 123; + result.bytes_written = rnd64.Uniform(UINT64_MAX); + result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX); + result.stats.num_output_files = rnd.Uniform(1000); + result.stats.is_full_compaction = rnd.OneIn(2); + result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX); + result.stats.num_input_files = 9; + + std::string output; + ASSERT_OK(result.Write(&output)); + + // Test deserialization + CompactionServiceResult deserialized1; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&result)); + + // Test mismatch + deserialized1.stats.num_input_files += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "stats.num_input_files"); + + // Test unknown field + CompactionServiceResult deserialized2; + output.clear(); + ASSERT_OK(result.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&result)); + + // Test missing field + CompactionServiceResult deserialized3; + deserialized3.bytes_read = 0; + std::string to_remove = "bytes_read=123;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + 
ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "bytes_read"); + + deserialized3.bytes_read = 123; + ASSERT_TRUE(deserialized3.TEST_Equals(&result)); + + // Test invalid version + output.clear(); + ASSERT_OK(result.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceResult::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); + for (const auto& item : status_list) { + item.PermitUncheckedError(); + } +} + +class CompactionJobTimestampTest : public CompactionJobTestBase { + public: + CompactionJobTimestampTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"), + test::ComparatorWithU64Ts(), test::EncodeInt) {} +}; + +TEST_F(CompactionJobTimestampTest, GCDisabled) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}}); + + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + AddMockFile(file2); + + SetLastSequence(10); + + auto expected_results = mock::MakeMockFile( + {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, 
ValueType::kTypeValue, 98), "b8"}, + {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"}, + {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}, + {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, NoKeyExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + AddMockFile(file2); + + SetLastSequence(101); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, AllKeysExpired) { + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""}, + {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""}, + {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"}, + {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"}, + {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""}, + {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}}); + 
AddMockFile(file2); + + SetLastSequence(7); + + auto expected_results = + mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(std::numeric_limits::max()); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, SomeKeysExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"}, + {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"}, + {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(49); + RunCompaction({files}, expected_results); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,9 +15,11 @@ #include #include #include + #include "db/column_family.h" #include "file/filename.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/statistics.h" #include 
"test_util/sync_point.h" #include "util/random.h" @@ -110,9 +112,9 @@ // If bottommost_compression is set and we are compacting to the // bottommost level then we should use it. - if (ioptions.bottommost_compression != kDisableCompressionOption && + if (mutable_cf_options.bottommost_compression != kDisableCompressionOption && level >= (vstorage->num_non_empty_levels() - 1)) { - return ioptions.bottommost_compression; + return mutable_cf_options.bottommost_compression; } // If the user has specified a different compression level for each level, // then pick the compression for that level. @@ -132,25 +134,23 @@ } } -CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions, +CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options, const VersionStorageInfo* vstorage, int level, const bool enable_compression) { if (!enable_compression) { - return ioptions.compression_opts; + return cf_options.compression_opts; } - // If bottommost_compression is set and we are compacting to the - // bottommost level then we should use the specified compression options - // for the bottmomost_compression. - if (ioptions.bottommost_compression != kDisableCompressionOption && - level >= (vstorage->num_non_empty_levels() - 1) && - ioptions.bottommost_compression_opts.enabled) { - return ioptions.bottommost_compression_opts; + // If bottommost_compression_opts is enabled and we are compacting to the + // bottommost level then we should use the specified compression options. 
+ if (level >= (vstorage->num_non_empty_levels() - 1) && + cf_options.bottommost_compression_opts.enabled) { + return cf_options.bottommost_compression_opts; } - return ioptions.compression_opts; + return cf_options.compression_opts; } -CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, +CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : ioptions_(ioptions), icmp_(icmp) {} @@ -332,7 +332,7 @@ const CompactionOptions& compact_options, const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, - uint32_t output_path_id) { + const MutableDBOptions& mutable_db_options, uint32_t output_path_id) { assert(input_files.size()); // This compaction output should not overlap with a running compaction as // `SanitizeCompactionInputFiles` should've checked earlier and db mutex @@ -356,11 +356,11 @@ compression_type = compact_options.compression; } auto c = new Compaction( - vstorage, ioptions_, mutable_cf_options, input_files, output_level, - compact_options.output_file_size_limit, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files, + output_level, compact_options.output_file_size_limit, mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, - GetCompressionOptions(ioptions_, vstorage, output_level), - compact_options.max_subcompactions, + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_options.max_subcompactions, /* grandparents */ {}, true); RegisterCompaction(c); return c; @@ -532,7 +532,7 @@ } } if (expand_inputs) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", @@ -554,16 +554,21 @@ InternalKey start, limit; GetRange(inputs, output_level_inputs, 
&start, &limit); // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (output_level_inputs.level + 1 < NumberLevels()) { - vstorage->GetOverlappingInputs(output_level_inputs.level + 1, &start, - &limit, grandparents); + // (parent == level+1; grandparent == level+2 or the first + // level after that has overlapping files) + for (int level = output_level_inputs.level + 1; level < NumberLevels(); + level++) { + vstorage->GetOverlappingInputs(level, &start, &limit, grandparents); + if (!grandparents->empty()) { + break; + } } } Compaction* CompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, uint64_t max_file_num_to_ignore) { @@ -626,18 +631,20 @@ } Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), - output_level, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options, output_level, ioptions_.compaction_style), /* max_compaction_bytes */ LLONG_MAX, compact_range_options.target_path_id, GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, 1), - GetCompressionOptions(ioptions_, vstorage, output_level), - compact_range_options.max_subcompactions, /* grandparents */ {}, + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + /* grandparents */ {}, /* is manual */ true); RegisterCompaction(c); + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); return c; } @@ -670,17 +677,41 @@ // two 
files overlap. if (input_level > 0) { const uint64_t limit = mutable_cf_options.max_compaction_bytes; - uint64_t total = 0; + uint64_t input_level_total = 0; + int hint_index = -1; + InternalKey* smallest = nullptr; + InternalKey* largest = nullptr; for (size_t i = 0; i + 1 < inputs.size(); ++i) { + if (!smallest) { + smallest = &inputs[i]->smallest; + } + largest = &inputs[i]->largest; + uint64_t s = inputs[i]->compensated_file_size; - total += s; - if (total >= limit) { + uint64_t output_level_total = 0; + if (output_level < vstorage->num_non_empty_levels()) { + std::vector files; + vstorage->GetOverlappingInputsRangeBinarySearch( + output_level, smallest, largest, &files, hint_index, &hint_index); + for (const auto& file : files) { + output_level_total += file->compensated_file_size; + } + } + + input_level_total += s; + + if (input_level_total + output_level_total >= limit) { covering_the_whole_range = false; + // still include the current file, so the compaction could be larger + // than max_compaction_bytes, which is also to make sure the compaction + // can make progress even `max_compaction_bytes` is small (e.g. smaller + // than an SST file). 
inputs.files.resize(i + 1); break; } } } + assert(compact_range_options.target_path_id < static_cast(ioptions_.cf_paths.size())); @@ -778,8 +809,8 @@ std::vector grandparents; GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents); Compaction* compaction = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs), - output_level, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(compaction_inputs), output_level, MaxFileSizeForLevel(mutable_cf_options, output_level, ioptions_.compaction_style, vstorage->base_level(), ioptions_.level_compaction_dynamic_level_bytes), @@ -787,8 +818,9 @@ compact_range_options.target_path_id, GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, vstorage->base_level()), - GetCompressionOptions(ioptions_, vstorage, output_level), - compact_range_options.max_subcompactions, std::move(grandparents), + GetCompressionOptions(mutable_cf_options, vstorage, output_level), + Temperature::kUnknown, compact_range_options.max_subcompactions, + std::move(grandparents), /* is manual compaction */ true); TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction); @@ -1004,6 +1036,7 @@ // any currently-existing files. 
for (auto file_num : *input_files) { bool found = false; + int input_file_level = -1; for (const auto& level_meta : cf_meta.levels) { for (const auto& file_meta : level_meta.files) { if (file_num == TableFileNameToNumber(file_meta.name)) { @@ -1013,6 +1046,7 @@ " is already being compacted."); } found = true; + input_file_level = level_meta.level; break; } } @@ -1025,6 +1059,13 @@ "Specified compaction input file " + MakeTableFileName("", file_num) + " does not exist in column family " + cf_meta.name + "."); } + if (input_file_level > output_level) { + return Status::InvalidArgument( + "Cannot compact file to up level, input file: " + + MakeTableFileName("", file_num) + " level " + + ToString(input_file_level) + " > output level " + + ToString(output_level)); + } } return Status::OK(); @@ -1043,6 +1084,8 @@ level0_compactions_in_progress_.insert(c); } compactions_in_progress_.insert(c); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered", + c); } void CompactionPicker::UnregisterCompaction(Compaction* c) { @@ -1085,6 +1128,8 @@ Random64 rnd(/* seed */ reinterpret_cast(vstorage)); size_t random_file_index = static_cast(rnd.Uniform( static_cast(vstorage->FilesMarkedForCompaction().size()))); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction", + &random_file_index); if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) { // found the compaction! diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker.h 2025-05-19 16:14:27.000000000 +0000 @@ -46,7 +46,7 @@ // compaction style specific logic for them. 
class CompactionPicker { public: - CompactionPicker(const ImmutableCFOptions& ioptions, + CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp); virtual ~CompactionPicker(); @@ -56,7 +56,8 @@ // describes the compaction. Caller should delete the result. virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; // Return a compaction object for compacting the range [begin,end] in @@ -72,7 +73,8 @@ // *compaction_end should point to valid InternalKey! virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, @@ -113,6 +115,7 @@ const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, uint32_t output_path_id); // Converts a set of compaction input file numbers into @@ -215,7 +218,7 @@ } protected: - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; // A helper function to SanitizeCompactionInputFiles() that // sanitizes "input_files" by adding necessary files. @@ -241,7 +244,7 @@ // compaction. 
class NullCompactionPicker : public CompactionPicker { public: - NullCompactionPicker(const ImmutableCFOptions& ioptions, + NullCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual ~NullCompactionPicker() {} @@ -250,6 +253,7 @@ Compaction* PickCompaction( const std::string& /*cf_name*/, const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, SequenceNumber /* earliest_memtable_seqno */) override { return nullptr; @@ -258,6 +262,7 @@ // Always return "nullptr" Compaction* CompactRange(const std::string& /*cf_name*/, const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, VersionStorageInfo* /*vstorage*/, int /*input_level*/, int /*output_level*/, const CompactRangeOptions& /*compact_range_options*/, @@ -305,9 +310,9 @@ int level, int base_level, const bool enable_compression = true); -CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions, - const VersionStorageInfo* vstorage, - int level, - const bool enable_compression = true); +CompressionOptions GetCompressionOptions( + const MutableCFOptions& mutable_cf_options, + const VersionStorageInfo* vstorage, int level, + const bool enable_compression = true); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,8 +13,10 @@ #include #include #include + #include "db/column_family.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "util/string_util.h" namespace 
ROCKSDB_NAMESPACE { @@ -36,7 +38,8 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { assert(mutable_cf_options.ttl > 0); const int kLevel0 = 0; @@ -44,7 +47,7 @@ uint64_t total_size = GetTotalFilesSize(level_files); int64_t _current_time; - auto status = ioptions_.env->GetCurrentTime(&_current_time); + auto status = ioptions_.clock->GetCurrentTime(&_current_time); if (!status.ok()) { ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: Couldn't get current time: %s. " @@ -70,18 +73,18 @@ // avoid underflow if (current_time > mutable_cf_options.ttl) { for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { - auto f = *ritr; - if (f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = + FileMetaData* f = *ritr; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + uint64_t creation_time = f->fd.table_reader->GetTableProperties()->creation_time; if (creation_time == 0 || creation_time >= (current_time - mutable_cf_options.ttl)) { break; } - total_size -= f->compensated_file_size; - inputs[0].files.push_back(f); } + total_size -= f->compensated_file_size; + inputs[0].files.push_back(f); } } @@ -96,24 +99,31 @@ } for (const auto& f : inputs[0].files) { + uint64_t creation_time = 0; + assert(f); + if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { + creation_time = f->fd.table_reader->GetTableProperties()->creation_time; + } ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with creation time %" PRIu64 " for deletion", - cf_name.c_str(), f->fd.GetNumber(), - f->fd.table_reader->GetTableProperties()->creation_time); + cf_name.c_str(), f->fd.GetNumber(), creation_time); } Compaction* c 
= new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, - kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0, - {}, /* is manual */ false, vstorage->CompactionScore(0), + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0, 0, 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + vstorage->CompactionScore(0), /* is deletion compaction */ true, CompactionReason::kFIFOTtl); return c; } Compaction* FIFOCompactionPicker::PickSizeCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { const int kLevel0 = 0; const std::vector& level_files = vstorage->LevelFiles(kLevel0); uint64_t total_size = GetTotalFilesSize(level_files); @@ -142,11 +152,12 @@ max_compact_bytes_per_del_file, mutable_cf_options.max_compaction_bytes, &comp_inputs)) { Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, - 16 * 1024 * 1024 /* output file size limit */, + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, mutable_cf_options.compression, - ioptions_.compression_opts, 0 /* max_subcompactions */, {}, + mutable_cf_options.compression_opts, Temperature::kUnknown, + 0 /* max_subcompactions */, {}, /* is manual */ false, vstorage->CompactionScore(0), /* is deletion compaction */ false, CompactionReason::kFIFOReduceNumFiles); @@ -193,25 +204,139 @@ } Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, - kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0, - {}, /* is manual */ false, 
vstorage->CompactionScore(0), + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0, 0, 0, kNoCompression, + mutable_cf_options.compression_opts, Temperature::kUnknown, + /* max_subcompactions */ 0, {}, /* is manual */ false, + vstorage->CompactionScore(0), /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize); return c; } +Compaction* FIFOCompactionPicker::PickCompactionToWarm( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer) { + if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) { + return nullptr; + } + + const int kLevel0 = 0; + const std::vector& level_files = vstorage->LevelFiles(kLevel0); + + int64_t _current_time; + auto status = ioptions_.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: Couldn't get current time: %s. " + "Not doing compactions based on warm threshold. ", + cf_name.c_str(), status.ToString().c_str()); + return nullptr; + } + const uint64_t current_time = static_cast(_current_time); + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. Parallel " + "compactions are not supported", + cf_name.c_str()); + return nullptr; + } + + std::vector inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + // avoid underflow + if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) { + uint64_t create_time_threshold = + current_time - mutable_cf_options.compaction_options_fifo.age_for_warm; + uint64_t compaction_size = 0; + // We will ideally identify a file qualifying for warm tier by knowing + // the timestamp for the youngest entry in the file. However, right now + // we don't have the information. 
We infer it by looking at timestamp + // of the next file's (which is just younger) oldest entry's timestamp. + FileMetaData* prev_file = nullptr; + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + FileMetaData* f = *ritr; + assert(f); + if (f->being_compacted) { + // Right now this probably won't happen as we never try to schedule + // two compactions in parallel, so here we just simply don't schedule + // anything. + return nullptr; + } + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time == kUnknownOldestAncesterTime) { + // Older files might not have enough information. It is possible to + // handle these files by looking at newer files, but maintaining the + // logic isn't worth it. + break; + } + if (oldest_ancester_time > create_time_threshold) { + // The previous file (which has slightly older data) doesn't qualify + // for warm tier. + break; + } + if (prev_file != nullptr) { + compaction_size += prev_file->fd.GetFileSize(); + if (compaction_size > mutable_cf_options.max_compaction_bytes) { + break; + } + inputs[0].files.push_back(prev_file); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with next file's oldest time %" PRIu64 " for warm", + cf_name.c_str(), prev_file->fd.GetNumber(), + oldest_ancester_time); + } + if (f->temperature == Temperature::kUnknown || + f->temperature == Temperature::kHot) { + prev_file = f; + } else if (!inputs[0].files.empty()) { + // A warm file newer than files picked. 
+ break; + } else { + assert(prev_file == nullptr); + } + } + } + + if (inputs[0].files.empty()) { + return nullptr; + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, mutable_db_options, + std::move(inputs), 0, 0 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, + mutable_cf_options.compression, mutable_cf_options.compression_opts, + Temperature::kWarm, + /* max_subcompactions */ 0, {}, /* is manual */ false, + vstorage->CompactionScore(0), + /* is deletion compaction */ false, CompactionReason::kChangeTemperature); + return c; +} + Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, - SequenceNumber /*earliest_memtable_seqno*/) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) { assert(vstorage->num_levels() == 1); Compaction* c = nullptr; if (mutable_cf_options.ttl > 0) { - c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); + } + if (c == nullptr) { + c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); } if (c == nullptr) { - c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options, + vstorage, log_buffer); } RegisterCompaction(c); return c; @@ -219,7 +344,8 @@ Compaction* FIFOCompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& /*compact_range_options*/, 
const InternalKey* /*begin*/, const InternalKey* /*end*/, InternalKey** compaction_end, bool* /*manual_conflict*/, @@ -231,9 +357,9 @@ assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); - Compaction* c = - PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); + Compaction* c = PickCompaction(cf_name, mutable_cf_options, + mutable_db_options, vstorage, &log_buffer); log_buffer.FlushBufferToLog(); return c; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,18 +15,20 @@ namespace ROCKSDB_NAMESPACE { class FIFOCompactionPicker : public CompactionPicker { public: - FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + FIFOCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* version, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, 
const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict, @@ -41,13 +43,21 @@ private: Compaction* PickTTLCompaction(const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, LogBuffer* log_buffer); Compaction* PickSizeCompaction(const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, LogBuffer* log_buffer); + + Compaction* PickCompactionToWarm(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); }; } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc 2025-05-19 16:14:27.000000000 +0000 @@ -31,6 +31,9 @@ if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } + if (!vstorage->FilesMarkedForForcedBlobGC().empty()) { + return true; + } for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { if (vstorage->CompactionScore(i) >= 1) { return true; @@ -49,14 +52,16 @@ CompactionPicker* compaction_picker, LogBuffer* log_buffer, const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions) + const ImmutableOptions& ioptions, + const MutableDBOptions& mutable_db_options) : cf_name_(cf_name), vstorage_(vstorage), earliest_mem_seqno_(earliest_mem_seqno), compaction_picker_(compaction_picker), log_buffer_(log_buffer), mutable_cf_options_(mutable_cf_options), - ioptions_(ioptions) {} + ioptions_(ioptions), + mutable_db_options_(mutable_db_options) {} // 
Pick and return a compaction. Compaction* PickCompaction(); @@ -93,9 +98,13 @@ // otherwise, returns false. bool PickIntraL0Compaction(); - void PickExpiredTtlFiles(); - - void PickFilesMarkedForPeriodicCompaction(); + // Picks a file from level_files to compact. + // level_files is a vector of (level, file metadata) in ascending order of + // level. If compact_to_next_level is true, compact the file to the next + // level, otherwise, compact to the same level as the input file. + void PickFileToCompact( + const autovector>& level_files, + bool compact_to_next_level); const std::string& cf_name_; VersionStorageInfo* vstorage_; @@ -115,7 +124,8 @@ CompactionReason compaction_reason_ = CompactionReason::kUnknown; const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; + const MutableDBOptions& mutable_db_options_; // Pick a path ID to place a newly generated file, with its level static uint32_t GetPathId(const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, @@ -124,72 +134,34 @@ static const int kMinFilesForIntraL0Compaction = 4; }; -void LevelCompactionBuilder::PickExpiredTtlFiles() { - if (vstorage_->ExpiredTtlFiles().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { +void LevelCompactionBuilder::PickFileToCompact( + const autovector>& level_files, + bool compact_to_next_level) { + for (auto& level_file : level_files) { // If it's being compacted it has nothing to do here. // If this assert() fails that means that some function marked some // files as being_compacted, but didn't call ComputeCompactionScore() assert(!level_file.second->being_compacted); start_level_ = level_file.first; - output_level_ = - (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; - - if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + if ((compact_to_next_level && + start_level_ == vstorage_->num_non_empty_levels() - 1) || (start_level_ == 0 && !compaction_picker_->level0_compactions_in_progress()->empty())) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->ExpiredTtlFiles()) { - if (continuation(level_file)) { - // found the compaction! - return; + continue; } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { - if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - output_level_ = start_level_ = level_file.first; - - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - return false; + if (compact_to_next_level) { + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + } else { + output_level_ = start_level_; } - start_level_inputs_.files = {level_file.second}; start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { - if (continuation(level_file)) { - // found the compaction! 
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { return; } } - start_level_inputs_.files.clear(); } @@ -238,64 +210,53 @@ } } } + } else { + // Compaction scores are sorted in descending order, no further scores + // will be >= 1. + break; } } + if (!start_level_inputs_.empty()) { + return; + } // if we didn't find a compaction, check if there are any files marked for // compaction - if (start_level_inputs_.empty()) { - parent_index_ = base_index_ = -1; + parent_index_ = base_index_ = -1; - compaction_picker_->PickFilesMarkedForCompaction( - cf_name_, vstorage_, &start_level_, &output_level_, - &start_level_inputs_); - if (!start_level_inputs_.empty()) { - is_manual_ = true; - compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; - return; - } + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; } // Bottommost Files Compaction on deleting tombstones - if (start_level_inputs_.empty()) { - size_t i; - for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); - ++i) { - auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; - assert(!level_and_file.second->being_compacted); - start_level_inputs_.level = output_level_ = start_level_ = - level_and_file.first; - start_level_inputs_.files = {level_and_file.second}; - if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_)) { - break; - } - } - if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { - start_level_inputs_.clear(); - } else { - assert(!start_level_inputs_.empty()); - compaction_reason_ = CompactionReason::kBottommostFiles; - return; - } + PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = 
CompactionReason::kBottommostFiles; + return; } // TTL Compaction - if (start_level_inputs_.empty()) { - PickExpiredTtlFiles(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kTtl; - return; - } + PickFileToCompact(vstorage_->ExpiredTtlFiles(), true); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; } // Periodic Compaction - if (start_level_inputs_.empty()) { - PickFilesMarkedForPeriodicCompaction(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kPeriodicCompaction; - return; - } + PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + + // Forced blob garbage collection + PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kForcedBlobGC; + return; } } @@ -375,8 +336,8 @@ Compaction* LevelCompactionBuilder::GetCompaction() { auto c = new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), - output_level_, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(compaction_inputs_), output_level_, MaxFileSizeForLevel(mutable_cf_options_, output_level_, ioptions_.compaction_style, vstorage_->base_level(), ioptions_.level_compaction_dynamic_level_bytes), @@ -384,7 +345,8 @@ GetPathId(ioptions_, mutable_cf_options_, output_level_), GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, output_level_, vstorage_->base_level()), - GetCompressionOptions(ioptions_, vstorage_, output_level_), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_), + Temperature::kUnknown, /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, start_level_score_, false /* deletion_compaction */, compaction_reason_); @@ -433,7 +395,7 @@ if 
(ioptions.level_compaction_dynamic_level_bytes) { // Currently, level_compaction_dynamic_level_bytes is ignored when // multiple db paths are specified. https://github.com/facebook/ - // rocksdb/blob/master/db/column_family.cc. + // rocksdb/blob/main/db/column_family.cc. // Still, adding this check to avoid accidentally using // max_bytes_for_level_multiplier_additional level_size = static_cast( @@ -549,10 +511,11 @@ Compaction* LevelCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, - SequenceNumber earliest_mem_seqno) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) { LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, - log_buffer, mutable_cf_options, ioptions_); + log_buffer, mutable_cf_options, ioptions_, + mutable_db_options); return builder.PickCompaction(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_level.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,12 +17,13 @@ // for description of Leveled compaction. 
class LevelCompactionPicker : public CompactionPicker { public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, + LevelCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual bool NeedsCompaction( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,16 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
- #include #include #include + #include "db/compaction/compaction.h" #include "db/compaction/compaction_picker_fifo.h" #include "db/compaction/compaction_picker_level.h" #include "db/compaction/compaction_picker_universal.h" - -#include "logging/logging.h" +#include "db/compaction/file_pri.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -31,8 +30,9 @@ const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; + MutableDBOptions mutable_db_options_; LevelCompactionPicker level_compaction_picker; std::string cf_name_; CountingLogger logger_; @@ -52,6 +52,7 @@ icmp_(ucmp_), ioptions_(options_), mutable_cf_options_(options_), + mutable_db_options_(), level_compaction_picker(ioptions_, &icmp_), cf_name_("dummy"), log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), @@ -78,8 +79,17 @@ vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); } + // Create a new VersionStorageInfo object so we can add mode files and then + // merge it with the existing VersionStorageInfo + void AddVersionStorage() { + temp_vstorage_.reset(new VersionStorageInfo( + &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style, + vstorage_.get(), false)); + } + void DeleteVersionStorage() { vstorage_.reset(); + temp_vstorage_.reset(); files_.clear(); file_map_.clear(); input_files_.clear(); @@ -88,18 +98,28 @@ void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 1, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, - size_t compensated_file_size = 0) { - assert(level < vstorage_->num_levels()); + size_t compensated_file_size = 0, bool marked_for_compact = false, + Temperature temperature = Temperature::kUnknown, + uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime) { + VersionStorageInfo* vstorage; + if (temp_vstorage_) { + 
vstorage = temp_vstorage_.get(); + } else { + vstorage = vstorage_.get(); + } + assert(level < vstorage->num_levels()); FileMetaData* f = new FileMetaData( file_number, path_id, file_size, InternalKey(smallest, smallest_seq, kTypeValue), InternalKey(largest, largest_seq, kTypeValue), smallest_seq, - largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; - vstorage_->AddFile(level, f); + f->oldest_ancester_time = oldest_ancestor_time; + vstorage->AddFile(level, f); files_.emplace_back(f); file_map_.insert({file_number, {f, level}}); } @@ -122,8 +142,14 @@ } void UpdateVersionStorageInfo() { + if (temp_vstorage_) { + VersionBuilder builder(FileOptions(), &ioptions_, nullptr, + vstorage_.get(), nullptr); + ASSERT_OK(builder.SaveTo(temp_vstorage_.get())); + vstorage_ = std::move(temp_vstorage_); + } vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); - vstorage_->UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_->UpdateFilesByCompactionPri(ioptions_, mutable_cf_options_); vstorage_->UpdateNumNonEmptyLevels(); vstorage_->GenerateFileIndexer(); vstorage_->GenerateLevelFilesBrief(); @@ -132,13 +158,36 @@ vstorage_->ComputeFilesMarkedForCompaction(); vstorage_->SetFinalized(); } + void AddFileToVersionStorage(int level, uint32_t file_number, + const char* smallest, const char* largest, + uint64_t file_size = 1, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100, + size_t compensated_file_size = 0, + bool marked_for_compact = false) { + VersionStorageInfo* base_vstorage = vstorage_.release(); + vstorage_.reset(new 
VersionStorageInfo(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleUniversal, + base_vstorage, false)); + Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq, + largest_seq, compensated_file_size, marked_for_compact); + + VersionBuilder builder(FileOptions(), &ioptions_, nullptr, base_vstorage, + nullptr); + builder.SaveTo(vstorage_.get()); + UpdateVersionStorageInfo(); + } + + private: + std::unique_ptr temp_vstorage_; }; TEST_F(CompactionPickerTest, Empty) { NewVersionStorage(6, kCompactionStyleLevel); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -149,7 +198,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -162,7 +212,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -175,7 +226,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); @@ -193,7 +245,8 @@ 
UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->num_input_files(1)); @@ -224,7 +277,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); @@ -271,7 +325,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -295,7 +350,8 @@ ASSERT_EQ(vstorage_->base_level(), num_levels - 2); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -320,7 +376,8 @@ ASSERT_EQ(vstorage_->base_level(), num_levels - 3); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); 
ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -349,7 +406,8 @@ ASSERT_EQ(vstorage_->base_level(), num_levels - 3); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -371,8 +429,8 @@ mutable_cf_options_.max_bytes_for_level_multiplier = 10; NewVersionStorage(num_levels, kCompactionStyleLevel); Add(0, 1U, "150", "200"); - Add(num_levels - 1, 3U, "200", "250", 300U); - Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 1, 2U, "200", "250", 300U); + Add(num_levels - 1, 3U, "300", "350", 3000U); Add(num_levels - 1, 4U, "400", "450", 3U); Add(num_levels - 2, 5U, "150", "180", 300U); Add(num_levels - 2, 6U, "181", "350", 500U); @@ -381,7 +439,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); @@ -438,7 +497,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); // output level should be the one above the bottom-most ASSERT_EQ(1, compaction->output_level()); @@ -472,7 +532,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, 
vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(!compaction->is_trivial_move()); } @@ -498,7 +559,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction->is_trivial_move()); } @@ -526,7 +588,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->output_level()); @@ -556,7 +619,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_FALSE(compaction); } @@ -582,14 +646,15 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_FALSE(compaction); } TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { // The case where universal periodic compaction couldn't form - // a compaction that inlcudes any file marked for periodic compaction. + // a compaction that includes any file marked for periodic compaction. // Right now we form the compaction anyway if it is more than one // sorted run. Just put the case here to validate that it doesn't // crash. 
@@ -612,7 +677,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(!compaction || compaction->start_level() != compaction->output_level()); } @@ -632,7 +698,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction); ASSERT_EQ(0, compaction->start_level()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -656,7 +723,8 @@ std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction); ASSERT_EQ(4, compaction->start_level()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -665,6 +733,221 @@ ASSERT_EQ(4, compaction->output_level()); } +TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 555555; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "310", "380", kFileSize, 0, 200, 251); + Add(3, 6U, "410", "880", kFileSize, 0, 200, 251); + Add(3, 7U, "910", "980", 1, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", 
"750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + // Add(4, 15U, "960", "970", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + // ASSERT_EQ(4U, compaction->num_input_files(1)); + ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber()); + ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) { + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 400000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(1, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(2, 5U, "310", "380", kFileSize, 0, 200, 251); + Add(2, 6U, "410", "880", kFileSize, 0, 200, 251); + Add(2, 7U, "910", "980", kFileSize, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", "750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + + 
UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(2, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) { + // Test bottom level files falling between gaps between two upper level + // files + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 300000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "000", "180", kFileSize, 0, 200, 251); + Add(3, 6U, "181", "190", kFileSize, 0, 200, 251); + Add(3, 7U, "710", "810", kFileSize, 0, 200, 251); + Add(3, 8U, "820", "830", kFileSize, 0, 200, 251); + Add(3, 9U, "900", "991", kFileSize, 0, 200, 251); + Add(4, 10U, "201", "250", kFileSize, 0, 101, 150); + Add(4, 11U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 12U, "401", "450", kFileSize, 0, 101, 150); + Add(4, 13U, "501", "750", kFileSize, 0, 101, 150); + Add(4, 14U, "801", "850", kFileSize, 0, 101, 150); + Add(4, 15U, "901", "950", kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, 
compaction->output_level()); + ASSERT_EQ(2, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(0, compaction->num_input_files(2)); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) { + // Test compaction candidates always cover many files. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 3200000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + + // Generate files like following: + // L3: (1101, 1180) (1201, 1280) ... (7901, 7908) + // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... 
(7960, 8010) + for (int i = 11; i < 79; i++) { + Add(3, 100 + i * 3, ToString(i * 100).c_str(), + ToString(i * 100 + 80).c_str(), kFileSize, 0, 200, 251); + // Add a tie breaker + if (i == 66) { + Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251); + } + + Add(4, 100 + i * 3 + 1, ToString(i * 100 + 30).c_str(), + ToString(i * 100 + 50).c_str(), kFileSize, 0, 200, 251); + Add(4, 100 + i * 3 + 2, ToString(i * 100 + 60).c_str(), + ToString(i * 100 + 110).c_str(), kFileSize, 0, 200, 251); + } + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(6U, compaction->num_input_files(0)); + ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber()); + ASSERT_EQ(11, compaction->num_input_files(1)); +} + +TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) { + // Test compaction candidates always cover many files with some single + // files larger than size threshold. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.max_compaction_bytes = 3200000; + mutable_cf_options_.compaction_options_universal.incremental = true; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 30; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + + // Generate files like following: + // L3: (1101, 1180) (1201, 1280) ... (7901, 7908) + // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... 
(7960, 8010) + for (int i = 11; i < 70; i++) { + Add(3, 100 + i * 3, ToString(i * 100).c_str(), + ToString(i * 100 + 80).c_str(), + i % 10 == 9 ? kFileSize * 100 : kFileSize, 0, 200, 251); + + Add(4, 100 + i * 3 + 1, ToString(i * 100 + 30).c_str(), + ToString(i * 100 + 50).c_str(), kFileSize, 0, 200, 251); + Add(4, 100 + i * 3 + 2, ToString(i * 100 + 60).c_str(), + ToString(i * 100 + 110).c_str(), kFileSize, 0, 200, 251); + } + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(3, compaction->start_level()); + ASSERT_EQ(6U, compaction->num_input_files(0)); + ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber()); + ASSERT_EQ(13, compaction->num_input_files(1)); +} + TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { NewVersionStorage(1, kCompactionStyleFIFO); const int kFileCount = @@ -681,18 +964,255 @@ // verify whether compaction is needed based on the current // size of L0 files. 
- uint64_t current_size = 0; for (int i = 1; i <= kFileCount; ++i) { NewVersionStorage(1, kCompactionStyleFIFO); Add(0, i, ToString((i + 100) * 1000).c_str(), - ToString((i + 100) * 1000 + 999).c_str(), - kFileSize, 0, i * 100, i * 100 + 99); - current_size += kFileSize; + ToString((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100, + i * 100 + 99); UpdateVersionStorageInfo(); ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), vstorage_->CompactionScore(0) >= 1); } } + +TEST_F(CompactionPickerTest, FIFOToWarm1) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + 
ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarm2) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) { + NewVersionStorage(1, kCompactionStyleFIFO); + const 
uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 9; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kUnknown, threshold_time - 5000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t 
kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kWarm, threshold_time - 5000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + 
fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kUnknown, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kWarm, threshold_time - 5000); + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + // Stop if a file is being compacted + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) { + NewVersionStorage(1, kCompactionStyleFIFO); + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * 100000; + uint64_t kWarmThreshold = 2000; + + fifo_options_.max_table_files_size = kMaxSize; + fifo_options_.age_for_warm = kWarmThreshold; + mutable_cf_options_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.level0_file_num_compaction_trigger = 
2; + mutable_cf_options_.max_compaction_bytes = kFileSize * 100; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + int64_t current_time = 0; + ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); + uint64_t threshold_time = + static_cast(current_time) - kWarmThreshold; + Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, + Temperature::kUnknown, static_cast(current_time) - 100); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, + Temperature::kUnknown, threshold_time + 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, + Temperature::kUnknown, threshold_time - 2000); + Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, + Temperature::kWarm, threshold_time - 3000); + Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, + Temperature::kUnknown, threshold_time - 4000); + Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, + Temperature::kWarm, threshold_time - 5000); + UpdateVersionStorageInfo(); + + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); + std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + // Stop if a file is being compacted + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); +} + #endif // ROCKSDB_LITE TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { @@ -716,7 +1236,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Pick file 8 because it overlaps with 0 files on level 3. 
@@ -735,11 +1256,11 @@ Add(2, 6U, "150", "175", 60000000U); // Overlaps with file 26, 27, total size 521M Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size - // 520M, the smalelst overlapping + // 520M, the smallest overlapping Add(2, 8U, "201", "300", 60000000U); // Overlaps with file 28, 29, total size 521M - Add(3, 26U, "100", "110", 261000000U); + Add(3, 25U, "100", "110", 261000000U); Add(3, 26U, "150", "170", 261000000U); Add(3, 27U, "171", "179", 260000000U); Add(3, 28U, "191", "220", 260000000U); @@ -748,7 +1269,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 7 because overlapping ratio is the biggest. @@ -775,7 +1297,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 8 because overlapping ratio is the biggest. @@ -804,7 +1327,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); // Picking file 8 because overlapping ratio is the biggest. 
@@ -831,7 +1355,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); } // This test checks ExpandWhileOverlapping() by having overlapping user keys @@ -848,7 +1373,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -867,7 +1393,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(2U, compaction->num_input_files(0)); @@ -894,7 +1421,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -924,7 +1452,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -947,7 +1476,8 @@ 
UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -968,7 +1498,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -988,7 +1519,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_GE(1U, compaction->num_input_files(0)); @@ -1016,7 +1548,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(3U, compaction->num_input_files(0)); @@ -1048,7 +1581,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -1088,7 +1622,8 @@ UpdateVersionStorageInfo(); std::unique_ptr 
compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1126,7 +1661,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1135,6 +1671,66 @@ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); } +TEST_F(CompactionPickerTest, FileTtlBooster) { + // Set TTL to 2048 + // TTL boosting for all levels starts at 1024, + // Whole TTL range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960. + // From second last level (L5), range starts at + // 1024 + 480, 1024 + 240, 1024 + 120 (which is L3). 
+ // Boosting step 124 / 16 = 7.75 -> 7 + // + const uint64_t kCurrentTime = 1000000; + FileMetaData meta; + + { + FileTtlBooster booster(kCurrentTime, 2048, 7, 3); + + // Not triggering if the file is younger than ttl/2 + meta.oldest_ancester_time = kCurrentTime - 1023; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - 1024; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime + 10; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + + // Within one boosting step + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + + // One boosting step + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7); + ASSERT_EQ(2, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8); + ASSERT_EQ(2, booster.GetBoostScore(&meta)); + + // Multiple boosting steps + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30); + ASSERT_EQ(5, booster.GetBoostScore(&meta)); + + // Very high boosting steps + meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700); + ASSERT_EQ(101, booster.GetBoostScore(&meta)); + } + { + // Test second last level + FileTtlBooster booster(kCurrentTime, 2048, 7, 5); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60); + ASSERT_EQ(3, booster.GetBoostScore(&meta)); + } + { + // Test last level + FileTtlBooster booster(kCurrentTime, 2048, 7, 6); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60); + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + meta.oldest_ancester_time = kCurrentTime - 3000; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); + } +} + TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { NewVersionStorage(6, kCompactionStyleLevel); 
mutable_cf_options_.level0_file_num_compaction_trigger = 2; @@ -1148,7 +1744,7 @@ Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. Add(1, 4U, "050", "300", 1000000000U, 0, 0); file_map_[4u].first->being_compacted = true; Add(1, 5U, "301", "350", 1000000000U, 0, 0); @@ -1163,7 +1759,8 @@ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } @@ -1180,7 +1777,7 @@ Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. 
Add(1, 4U, "050", "300", 1000000000U, 0, 0); Add(1, 5U, "301", "350", 1000000000U, 0, 0); @@ -1193,7 +1790,8 @@ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0)); ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1)); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); } @@ -1226,7 +1824,8 @@ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0)); ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1)); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); } @@ -1255,7 +1854,7 @@ // Size ratio L4/L3 is 9.9 // After merge from L3, L4 size is 1000900 Add(4, 11U, "400", "500", 999900); - Add(5, 11U, "400", "500", 8007200); + Add(5, 12U, "400", "500", 8007200); UpdateVersionStorageInfo(); @@ -1520,7 +2119,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1544,7 +2144,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(3U, compaction->num_input_files(0)); @@ -1568,16 +2169,43 @@ Add(3, 5U, "120", "130", 7000U); Add(3, 6U, "170", "180", 
7000U); - Add(3, 5U, "220", "230", 7000U); - Add(3, 5U, "270", "280", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_TRUE(compaction->IsTrivialMove()); } +TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) { + mutable_cf_options_.max_bytes_for_level_base = 10000u; + mutable_cf_options_.max_compaction_bytes = 10001u; + ioptions_.level_compaction_dynamic_level_bytes = false; + ioptions_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(1); + NewVersionStorage(6, kCompactionStyleLevel); + // A compaction should be triggered and pick file 2 + Add(1, 1U, "100", "150", 3000U); + Add(1, 2U, "151", "200", 3001U); + Add(1, 3U, "201", "250", 3000U); + Add(1, 4U, "251", "300", 3000U); + + Add(3, 5U, "120", "130", 7000U); + Add(3, 6U, "170", "180", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + // No trivial move, because partitioning is applied + ASSERT_TRUE(!compaction->IsTrivialMove()); +} + TEST_F(CompactionPickerTest, IsTrivialMoveOff) { mutable_cf_options_.max_bytes_for_level_base = 1000000u; mutable_cf_options_.max_compaction_bytes = 10000u; @@ -1594,7 +2222,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); 
ASSERT_FALSE(compaction->IsTrivialMove()); } @@ -1619,7 +2248,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1628,7 +2258,8 @@ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */)); compaction.reset(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -1637,7 +2268,8 @@ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */)); compaction.reset(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } @@ -1662,7 +2294,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -1692,7 +2325,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); ASSERT_TRUE(compaction.get() != 
nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(4U, compaction->num_input_files(0)); @@ -1724,7 +2358,8 @@ UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107)); + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_, 107)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(4U, compaction->num_input_files(0)); @@ -1733,6 +2368,336 @@ ASSERT_EQ(0, compaction->output_level()); } +#ifndef ROCKSDB_LITE +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a "regular" universal compaction is + // scheduled first, followed by a delete triggered compaction. The latter + // should fail + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300); + Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a compaction to reduce sorted runs + ASSERT_EQ(CompactionReason::kUniversalSortedRunNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + + AddVersionStorage(); + // Simulate a flush and mark the file for compaction 
+ Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); +} + +TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled first, followed by a "regular" compaction. The latter + // should fail + NewVersionStorage(5, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250); + Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(3, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + + AddVersionStorage(); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); +} + +TEST_F(CompactionPickerTest, 
UniversalMarkedCompactionStartOutputOverlap) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + + bool input_level_overlap = false; + bool output_level_overlap = false; + // Let's mark 2 files in 2 different levels for compaction. The + // compaction picker will randomly pick one, so use the sync point to + // ensure a deterministic order. Loop until both cases are covered + size_t random_index = 0; + SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) { + size_t* index = static_cast(arg); + *index = random_index; + }); + SyncPoint::GetInstance()->EnableProcessing(); + while (!input_level_overlap || !output_level_overlap) { + // Ensure that the L0 file gets picked first + random_index = !input_level_overlap ? 0 : 1; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248); + Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249); + Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250); + Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true); + Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150); + Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_TRUE(compaction->start_level() == 0 || + compaction->start_level() == 3); + if (compaction->start_level() == 0) { + // The L0 file was 
picked. The next compaction will detect an + // overlap on its input level + input_level_overlap = true; + ASSERT_EQ(3, compaction->output_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(3U, compaction->num_input_files(1)); + } else { + // The level 3 file was picked. The next compaction will pick + // the L0 file and will detect overlap when adding output + // level inputs + output_level_overlap = true; + ASSERT_EQ(4, compaction->output_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + } + + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + // After recomputing the compaction score, only one marked file will remain + random_index = 0; + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_FALSE(compaction2); + DeleteVersionStorage(); + } +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled and should result in a full compaction + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + 
compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[6].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a file is being compacted, and a + // delete triggered compaction is then scheduled. The latter should stop + // at the first file being compacted + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + file_map_[3].first->being_compacted = true; + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { + const uint64_t kFileSize = 100000; + + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker 
universal_compaction_picker(ioptions_, &icmp_); + + // This test covers the case where a delete triggered compaction is + // scheduled first, followed by a "regular" compaction. The latter + // should fail + NewVersionStorage(1, kCompactionStyleUniversal); + + // Mark file number 4 for compaction + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + + ASSERT_TRUE(compaction); + // Validate that its a delete triggered compaction + ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[5].first->being_compacted); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[6].first->being_compacted); + + AddVersionStorage(); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction2( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction2); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[1].first->being_compacted); + ASSERT_TRUE(file_map_[2].first->being_compacted); + ASSERT_TRUE(file_map_[4].first->being_compacted); +} + +TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) { + const uint64_t kFileSize = 100000; + const int kNumLevels = 7; + + // This test makes sure the `files_marked_for_compaction_` is updated after + // creating manual 
compaction. + ioptions_.compaction_style = kCompactionStyleUniversal; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + + // Add 3 files marked for compaction + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); + UpdateVersionStorageInfo(); + + // All 3 files are marked for compaction + ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size()); + + bool manual_conflict = false; + InternalKey* manual_end = NULL; + std::unique_ptr compaction( + universal_compaction_picker.CompactRange( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(), NULL, + NULL, &manual_end, &manual_conflict, port::kMaxUint64)); + + ASSERT_TRUE(compaction); + + ASSERT_EQ(CompactionReason::kManualCompaction, + compaction->compaction_reason()); + ASSERT_EQ(kNumLevels - 1, compaction->output_level()); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(3U, compaction->num_input_files(0)); + ASSERT_TRUE(file_map_[3].first->being_compacted); + ASSERT_TRUE(file_map_[4].first->being_compacted); + ASSERT_TRUE(file_map_[5].first->being_compacted); + + // After creating the manual compaction, all files should be cleared from + // `FilesMarkedForCompaction`. So they won't be picked by others. 
+ ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size()); +} + +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,9 +15,11 @@ #include #include #include + #include "db/column_family.h" #include "file/filename.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/statistics.h" #include "test_util/sync_point.h" #include "util/random.h" @@ -31,17 +33,16 @@ // PickCompaction(). class UniversalCompactionBuilder { public: - UniversalCompactionBuilder(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp, - const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - UniversalCompactionPicker* picker, - LogBuffer* log_buffer) + UniversalCompactionBuilder( + const ImmutableOptions& ioptions, const InternalKeyComparator* icmp, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, LogBuffer* log_buffer) : ioptions_(ioptions), icmp_(icmp), cf_name_(cf_name), mutable_cf_options_(mutable_cf_options), + mutable_db_options_(mutable_db_options), vstorage_(vstorage), picker_(picker), log_buffer_(log_buffer) {} @@ -88,6 +89,14 @@ // Pick Universal compaction to limit space amplification. Compaction* PickCompactionToReduceSizeAmp(); + // Try to pick incremental compaction to reduce space amplification. + // It will return null if it cannot find a fanout within the threshold. 
+ // Fanout is defined as + // total size of files to compact at output level + // -------------------------------------------------- + // total size of files to compact at other levels + Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold); + Compaction* PickDeleteTriggeredCompaction(); // Form a compaction from the sorted run indicated by start_index to the @@ -103,25 +112,27 @@ // because some files are being compacted. Compaction* PickPeriodicCompaction(); - // Used in universal compaction when the enabled_trivial_move + // Used in universal compaction when the allow_trivial_move // option is set. Checks whether there are any overlapping files // in the input. Returns true if the input files are non // overlapping. bool IsInputFilesNonOverlapping(Compaction* c); - const ImmutableCFOptions& ioptions_; + uint64_t GetMaxOverlappingBytes() const; + + const ImmutableOptions& ioptions_; const InternalKeyComparator* icmp_; double score_; std::vector sorted_runs_; const std::string& cf_name_; const MutableCFOptions& mutable_cf_options_; + const MutableDBOptions& mutable_db_options_; VersionStorageInfo* vstorage_; UniversalCompactionPicker* picker_; LogBuffer* log_buffer_; static std::vector CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options); + const VersionStorageInfo& vstorage); // Pick a path ID to place a newly generated file, with its estimated file // size. 
@@ -158,9 +169,9 @@ const Comparator* ucmp_; }; -typedef std::priority_queue, - SmallestKeyHeapComparator> - SmallestKeyHeap; +using SmallestKeyHeap = + std::priority_queue, + SmallestKeyHeapComparator>; // This function creates the heap that is used to find if the files are // overlapping during universal compaction when the allow_trivial_move @@ -278,11 +289,11 @@ Compaction* UniversalCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, - SequenceNumber /* earliest_memtable_seqno */) { + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) { UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, - mutable_cf_options, vstorage, this, - log_buffer); + mutable_cf_options, mutable_db_options, + vstorage, this, log_buffer); return builder.PickCompaction(); } @@ -325,8 +336,7 @@ std::vector UniversalCompactionBuilder::CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/, - const MutableCFOptions& mutable_cf_options) { + const VersionStorageInfo& vstorage) { std::vector ret; for (FileMetaData* f : vstorage.LevelFiles(0)) { ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, @@ -336,27 +346,16 @@ uint64_t total_compensated_size = 0U; uint64_t total_size = 0U; bool being_compacted = false; - bool is_first = true; for (FileMetaData* f : vstorage.LevelFiles(level)) { total_compensated_size += f->compensated_file_size; total_size += f->fd.GetFileSize(); - if (mutable_cf_options.compaction_options_universal.allow_trivial_move == - true) { - if (f->being_compacted) { - being_compacted = f->being_compacted; - } - } else { - // Compaction always includes all files for a non-zero level, so for a - // non-zero level, all the files should share the same being_compacted - // value. 
- // This assumption is only valid when - // mutable_cf_options.compaction_options_universal.allow_trivial_move - // is false - assert(is_first || f->being_compacted == being_compacted); - } - if (is_first) { + // Size amp, read amp and periodic compactions always include all files + // for a non-zero level. However, a delete triggered compaction and + // a trivial move might pick a subset of files in a sorted run. So + // always check all files in a sorted run and mark the entire run as + // being compacted if one or more files are being compacted + if (f->being_compacted) { being_compacted = f->being_compacted; - is_first = false; } } if (total_compensated_size > 0) { @@ -372,8 +371,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { const int kLevel0 = 0; score_ = vstorage_->CompactionScore(kLevel0); - sorted_runs_ = - CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_); + sorted_runs_ = CalculateSortedRuns(*vstorage_); if (sorted_runs_.size() == 0 || (vstorage_->FilesMarkedForPeriodicCompaction().empty() && @@ -389,7 +387,7 @@ VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER_MAX_SZ( log_buffer_, 3072, - "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n", + "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n", cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp)); Compaction* c = nullptr; @@ -475,7 +473,6 @@ // validate that all the chosen files of L0 are non overlapping in time #ifndef NDEBUG - SequenceNumber prev_smallest_seqno = 0U; bool is_first = true; size_t level_index = 0U; @@ -485,7 +482,6 @@ if (is_first) { is_first = false; } - prev_smallest_seqno = f->fd.smallest_seqno; } level_index = 1U; } @@ -497,22 +493,16 @@ &largest_seqno); if (is_first) { is_first = false; - } else if (prev_smallest_seqno > 0) { - // A level is considered as the bottommost level if there are - // no files in higher levels or if files in higher levels do - // not overlap with the files being compacted. 
Sequence numbers - // of files in bottommost level can be set to 0 to help - // compression. As a result, the following assert may not hold - // if the prev_smallest_seqno is 0. - assert(prev_smallest_seqno > largest_seqno); } - prev_smallest_seqno = smallest_seqno; } } #endif // update statistics - RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); + size_t num_files = 0; + for (auto& each_level : *c->inputs()) { + num_files += each_level.files.size(); + } + RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files); picker_->RegisterCompaction(c); vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); @@ -737,6 +727,19 @@ cf_name_.c_str(), file_num_buf); } + std::vector grandparents; + // Include grandparents for potential file cutting in incremental + // mode. It is for aligning file cutting boundaries across levels, + // so that subsequent compactions can pick files with aligned + // buffer. + // Single files are only picked up in incremental mode, so that + // there is no need for full range. 
+ if (mutable_cf_options_.compaction_options_universal.incremental && + first_index_after < sorted_runs_.size() && + sorted_runs_[first_index_after].level > 1) { + grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level); + } + CompactionReason compaction_reason; if (max_number_of_files_to_compact == UINT_MAX) { compaction_reason = CompactionReason::kUniversalSizeRatio; @@ -744,21 +747,22 @@ compaction_reason = CompactionReason::kUniversalSortedRunNum; } return new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), - output_level, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), - LLONG_MAX, path_id, + GetMaxOverlappingBytes(), path_id, GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, 1, enable_compression), - GetCompressionOptions(ioptions_, vstorage_, start_level, + GetCompressionOptions(mutable_cf_options_, vstorage_, start_level, enable_compression), - /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, - score_, false /* deletion_compaction */, compaction_reason); + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, /* is manual */ false, score_, + false /* deletion_compaction */, compaction_reason); } // Look at overall size amplification. If size amplification -// exceeeds the configured value, then do a compaction +// exceeds the configured value, then do a compaction // of the candidate files all the way upto the earliest // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). 
@@ -779,7 +783,7 @@ } // Skip files that are already being compacted - for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) { + for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { sr = &sorted_runs_[loop]; if (!sr->being_compacted) { start_index = loop; // Consider this as the first candidate. @@ -807,9 +811,11 @@ } // keep adding up all the remaining files - for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) { + for (size_t loop = start_index; loop + 1 < sorted_runs_.size(); loop++) { sr = &sorted_runs_[loop]; if (sr->being_compacted) { + // TODO with incremental compaction is supported, we might want to + // schedule some incremental compactions in parallel if needed. char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER( @@ -843,34 +849,288 @@ " earliest-file-size %" PRIu64, cf_name_.c_str(), candidate_size, earliest_file_size); } + // Since incremental compaction can't include more than second last + // level, it can introduce penalty, compared to full compaction. We + // hard code the pentalty to be 80%. If we end up with a compaction + // fanout higher than 80% of full level compactions, we fall back + // to full level compaction. + // The 80% threshold is arbitrary and can be adjusted or made + // configurable in the future. + // This also prevent the case when compaction falls behind and we + // need to compact more levels for compactions to catch up. + if (mutable_cf_options_.compaction_options_universal.incremental) { + double fanout_threshold = static_cast(earliest_file_size) / + static_cast(candidate_size) * 1.8; + Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold); + if (picked != nullptr) { + // As the feature is still incremental, picking incremental compaction + // might fail and we will fall bck to compacting full level. 
+ return picked; + } + } return PickCompactionToOldest(start_index, CompactionReason::kUniversalSizeAmplification); } +Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( + double fanout_threshold) { + // Try find all potential compactions with total size just over + // options.max_compaction_size / 2, and take the one with the lowest + // fanout (defined in declaration of the function). + // This is done by having a sliding window of the files at the second + // lowest level, and keep expanding while finding overlapping in the + // last level. Once total size exceeds the size threshold, calculate + // the fanout value. And then shrinking from the small side of the + // window. Keep doing it until the end. + // Finally, we try to include upper level files if they fall into + // the range. + // + // Note that it is a similar problem as leveled compaction's + // kMinOverlappingRatio priority, but instead of picking single files + // we expand to a target compaction size. The reason is that in + // leveled compaction, actual fanout value tends to high, e.g. 10, so + // even with single file in down merging level, the extra size + // compacted in boundary files is at a lower ratio. But here users + // often have size of second last level size to be 1/4, 1/3 or even + // 1/2 of the bottommost level, so picking single file in second most + // level will cause significant waste, which is not desirable. + // + // This algorithm has lots of room to improve to pick more efficient + // compactions. + assert(sorted_runs_.size() >= 2); + int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level; + if (second_last_level == 0) { + // Can't split Level 0. 
+ return nullptr; + } + int output_level = sorted_runs_.back().level; + const std::vector& bottom_files = + vstorage_->LevelFiles(output_level); + const std::vector& files = + vstorage_->LevelFiles(second_last_level); + assert(!bottom_files.empty()); + assert(!files.empty()); + + // std::unordered_map file_to_order; + + int picked_start_idx = 0; + int picked_end_idx = 0; + double picked_fanout = fanout_threshold; + + // Use half target compaction bytes as anchor to stop growing second most + // level files, and reserve growing space for more overlapping bottom level, + // clean cut, files from other levels, etc. + uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2; + int start_idx = 0; + int bottom_end_idx = 0; + int bottom_start_idx = 0; + uint64_t non_bottom_size = 0; + uint64_t bottom_size = 0; + bool end_bottom_size_counted = false; + for (int end_idx = 0; end_idx < static_cast(files.size()); end_idx++) { + FileMetaData* end_file = files[end_idx]; + + // Include bottom most level files smaller than the current second + // last level file. + int num_skipped = 0; + while (bottom_end_idx < static_cast(bottom_files.size()) && + icmp_->Compare(bottom_files[bottom_end_idx]->largest, + end_file->smallest) < 0) { + if (!end_bottom_size_counted) { + bottom_size += bottom_files[bottom_end_idx]->fd.file_size; + } + bottom_end_idx++; + end_bottom_size_counted = false; + num_skipped++; + } + + if (num_skipped > 1) { + // At least a file in the bottom most level falls into the file gap. No + // reason to include the file. We cut the range and start a new sliding + // window. + start_idx = end_idx; + } + + if (start_idx == end_idx) { + // new sliding window. + non_bottom_size = 0; + bottom_size = 0; + bottom_start_idx = bottom_end_idx; + end_bottom_size_counted = false; + } + + non_bottom_size += end_file->fd.file_size; + + // Include all overlapping files in bottom level. 
+ while (bottom_end_idx < static_cast(bottom_files.size()) && + icmp_->Compare(bottom_files[bottom_end_idx]->smallest, + end_file->largest) < 0) { + if (!end_bottom_size_counted) { + bottom_size += bottom_files[bottom_end_idx]->fd.file_size; + end_bottom_size_counted = true; + } + if (icmp_->Compare(bottom_files[bottom_end_idx]->largest, + end_file->largest) > 0) { + // next level file cross large boundary of current file. + break; + } + bottom_end_idx++; + end_bottom_size_counted = false; + } + + if ((non_bottom_size + bottom_size > comp_thres_size || + end_idx == static_cast(files.size()) - 1) && + non_bottom_size > 0) { // Do we alow 0 size file at all? + // If it is a better compaction, remember it in picked* variables. + double fanout = static_cast(bottom_size) / + static_cast(non_bottom_size); + if (fanout < picked_fanout) { + picked_start_idx = start_idx; + picked_end_idx = end_idx; + picked_fanout = fanout; + } + // Shrink from the start end to under comp_thres_size + while (non_bottom_size + bottom_size > comp_thres_size && + start_idx <= end_idx) { + non_bottom_size -= files[start_idx]->fd.file_size; + start_idx++; + if (start_idx < static_cast(files.size())) { + while (bottom_start_idx <= bottom_end_idx && + icmp_->Compare(bottom_files[bottom_start_idx]->largest, + files[start_idx]->smallest) < 0) { + bottom_size -= bottom_files[bottom_start_idx]->fd.file_size; + bottom_start_idx++; + } + } + } + } + } + + if (picked_fanout >= fanout_threshold) { + assert(picked_fanout == fanout_threshold); + return nullptr; + } + + std::vector inputs; + CompactionInputFiles bottom_level_inputs; + CompactionInputFiles second_last_level_inputs; + second_last_level_inputs.level = second_last_level; + bottom_level_inputs.level = output_level; + for (int i = picked_start_idx; i <= picked_end_idx; i++) { + if (files[i]->being_compacted) { + return nullptr; + } + second_last_level_inputs.files.push_back(files[i]); + } + assert(!second_last_level_inputs.empty()); + if 
(!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &second_last_level_inputs, + /*next_smallest=*/nullptr)) { + return nullptr; + } + // We might be able to avoid this binary search if we save and expand + // from bottom_start_idx and bottom_end_idx, but for now, we use + // SetupOtherInputs() for simplicity. + int parent_index = -1; // Create and use bottom_start_idx? + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &second_last_level_inputs, + &bottom_level_inputs, &parent_index, + /*base_index=*/-1)) { + return nullptr; + } + + // Try to include files in upper levels if they fall into the range. + // Since we need to go from lower level up and this is in the reverse + // order, compared to level order, we first write to an reversed + // data structure and finally copy them to compaction inputs. + InternalKey smallest, largest; + picker_->GetRange(second_last_level_inputs, &smallest, &largest); + std::vector inputs_reverse; + for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) { + SortedRun& sr = *it; + if (sr.level == 0) { + break; + } + std::vector level_inputs; + vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest, + &level_inputs); + if (!level_inputs.empty()) { + inputs_reverse.push_back({}); + inputs_reverse.back().level = sr.level; + inputs_reverse.back().files = level_inputs; + picker_->GetRange(inputs_reverse.back(), &smallest, &largest); + } + } + for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) { + inputs.push_back(*it); + } + + inputs.push_back(second_last_level_inputs); + inputs.push_back(bottom_level_inputs); + + // TODO support multi paths? 
+ uint32_t path_id = 0; + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + GetMaxOverlappingBytes(), path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1, true /* enable_compression */), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, + true /* enable_compression */), + Temperature::kUnknown, + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score_, false /* deletion_compaction */, + CompactionReason::kUniversalSizeAmplification); +} + // Pick files marked for compaction. Typically, files are marked by // CompactOnDeleteCollector due to the presence of tombstones. Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { CompactionInputFiles start_level_inputs; int output_level; std::vector inputs; + std::vector grandparents; if (vstorage_->num_levels() == 1) { // This is single level universal. Since we're basically trying to reclaim // space by processing files marked for compaction due to high tombstone // density, let's do the same thing as compaction to reduce size amp which // has the same goals. - bool compact = false; + int start_index = -1; start_level_inputs.level = 0; start_level_inputs.files.clear(); output_level = 0; - for (FileMetaData* f : vstorage_->LevelFiles(0)) { - if (f->marked_for_compaction) { - compact = true; + // Find the first file marked for compaction. Ignore the last file + for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { + SortedRun* sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + continue; } - if (compact) { + FileMetaData* f = vstorage_->LevelFiles(0)[loop]; + if (f->marked_for_compaction) { start_level_inputs.files.push_back(f); + start_index = + static_cast(loop); // Consider this as the first candidate. 
+ break; + } + } + if (start_index < 0) { + // Either no file marked, or they're already being compacted + return nullptr; + } + + for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) { + SortedRun* sr = &sorted_runs_[loop]; + if (sr->being_compacted) { + break; } + + FileMetaData* f = vstorage_->LevelFiles(0)[loop]; + start_level_inputs.files.push_back(f); } if (start_level_inputs.size() <= 1) { // If only the last file in L0 is marked for compaction, ignore it @@ -939,6 +1199,9 @@ if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) { return nullptr; } + + picker_->GetGrandparents(vstorage_, start_level_inputs, + output_level_inputs, &grandparents); } else { inputs.push_back(start_level_inputs); } @@ -952,16 +1215,17 @@ uint32_t path_id = GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); return new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), - output_level, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), - /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, + /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id, GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, output_level, 1), - GetCompressionOptions(ioptions_, vstorage_, output_level), - /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true, - score_, false /* deletion_compaction */, + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level), + Temperature::kUnknown, + /* max_subcompactions */ 0, grandparents, /* is manual */ false, score_, + false /* deletion_compaction */, CompactionReason::kFilesMarkedForCompaction); } @@ -1001,6 +1265,9 @@ comp_reason_print_string = "size amp"; } else { assert(false); + comp_reason_print_string = "unknown: "; + comp_reason_print_string.append( + std::to_string(static_cast(compaction_reason))); } 
char file_num_buf[256]; @@ -1022,15 +1289,16 @@ // compaction_options_universal.compression_size_percent, // because we always compact all the files, so always compress. return new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), - output_level, + vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, + std::move(inputs), output_level, MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), - LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, - 1, true /* enable_compression */), - GetCompressionOptions(ioptions_, vstorage_, start_level, + GetMaxOverlappingBytes(), path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1, true /* enable_compression */), + GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, true /* enable_compression */), + Temperature::kUnknown, /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, score_, false /* deletion_compaction */, compaction_reason); } @@ -1100,6 +1368,17 @@ return c; } + +uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const { + if (!mutable_cf_options_.compaction_options_universal.incremental) { + return port::kMaxUint64; + } else { + // Try to align cutting boundary with files at the next level if the + // file isn't end up with 1/2 of target size, or it would overlap + // with two full size files at the next level. 
+ return mutable_cf_options_.target_file_size_base / 2 * 3; + } +} } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,12 +15,13 @@ namespace ROCKSDB_NAMESPACE { class UniversalCompactionPicker : public CompactionPicker { public: - UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer, SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/compaction_service_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,825 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class TestCompactionServiceBase { + public: + virtual int GetCompactionNum() = 0; + + void OverrideStartStatus(CompactionServiceJobStatus s) { + is_override_start_status = true; + override_start_status = s; + } + + void OverrideWaitStatus(CompactionServiceJobStatus s) { + is_override_wait_status = true; + override_wait_status = s; + } + + void OverrideWaitResult(std::string str) { + is_override_wait_result = true; + override_wait_result = std::move(str); + } + + void ResetOverride() { + is_override_wait_result = false; + is_override_start_status = false; + is_override_wait_status = false; + } + + virtual ~TestCompactionServiceBase() = default; + + protected: + bool is_override_start_status = false; + CompactionServiceJobStatus override_start_status = + CompactionServiceJobStatus::kFailure; + bool is_override_wait_status = false; + CompactionServiceJobStatus override_wait_status = + CompactionServiceJobStatus::kFailure; + bool is_override_wait_result = false; + std::string override_wait_result; +}; + +class MyTestCompactionServiceLegacy : public CompactionService, + public TestCompactionServiceBase { + public: + MyTestCompactionServiceLegacy(std::string db_path, Options& options, + std::shared_ptr& statistics) + : db_path_(std::move(db_path)), + options_(options), + statistics_(statistics) {} + + static const char* kClassName() { return "MyTestCompactionServiceLegacy"; } + + const char* Name() const override { return kClassName(); } + + CompactionServiceJobStatus Start(const std::string& compaction_service_input, + uint64_t job_id) override { + InstrumentedMutexLock l(&mutex_); + jobs_.emplace(job_id, compaction_service_input); + CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; + if (is_override_start_status) { + return override_start_status; + } + return s; + } + + CompactionServiceJobStatus WaitForComplete( + uint64_t 
job_id, std::string* compaction_service_result) override { + std::string compaction_input; + { + InstrumentedMutexLock l(&mutex_); + auto i = jobs_.find(job_id); + if (i == jobs_.end()) { + return CompactionServiceJobStatus::kFailure; + } + compaction_input = std::move(i->second); + jobs_.erase(i); + } + + if (is_override_wait_status) { + return override_wait_status; + } + + CompactionServiceOptionsOverride options_override; + options_override.env = options_.env; + options_override.file_checksum_gen_factory = + options_.file_checksum_gen_factory; + options_override.comparator = options_.comparator; + options_override.merge_operator = options_.merge_operator; + options_override.compaction_filter = options_.compaction_filter; + options_override.compaction_filter_factory = + options_.compaction_filter_factory; + options_override.prefix_extractor = options_.prefix_extractor; + options_override.table_factory = options_.table_factory; + options_override.sst_partitioner_factory = options_.sst_partitioner_factory; + options_override.statistics = statistics_; + + Status s = DB::OpenAndCompact( + db_path_, db_path_ + "/" + ROCKSDB_NAMESPACE::ToString(job_id), + compaction_input, compaction_service_result, options_override); + if (is_override_wait_result) { + *compaction_service_result = override_wait_result; + } + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + int GetCompactionNum() override { return compaction_num_.load(); } + + private: + InstrumentedMutex mutex_; + std::atomic_int compaction_num_{0}; + std::map jobs_; + const std::string db_path_; + Options options_; + std::shared_ptr statistics_; +}; + +class MyTestCompactionService : public CompactionService, + public TestCompactionServiceBase { + public: + MyTestCompactionService(std::string db_path, Options& options, + std::shared_ptr& statistics) + : db_path_(std::move(db_path)), + options_(options), + 
statistics_(statistics), + start_info_("na", "na", "na", 0, Env::TOTAL), + wait_info_("na", "na", "na", 0, Env::TOTAL) {} + + static const char* kClassName() { return "MyTestCompactionService"; } + + const char* Name() const override { return kClassName(); } + + CompactionServiceJobStatus StartV2( + const CompactionServiceJobInfo& info, + const std::string& compaction_service_input) override { + InstrumentedMutexLock l(&mutex_); + start_info_ = info; + assert(info.db_name == db_path_); + jobs_.emplace(info.job_id, compaction_service_input); + CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; + if (is_override_start_status) { + return override_start_status; + } + return s; + } + + CompactionServiceJobStatus WaitForCompleteV2( + const CompactionServiceJobInfo& info, + std::string* compaction_service_result) override { + std::string compaction_input; + assert(info.db_name == db_path_); + { + InstrumentedMutexLock l(&mutex_); + wait_info_ = info; + auto i = jobs_.find(info.job_id); + if (i == jobs_.end()) { + return CompactionServiceJobStatus::kFailure; + } + compaction_input = std::move(i->second); + jobs_.erase(i); + } + + if (is_override_wait_status) { + return override_wait_status; + } + + CompactionServiceOptionsOverride options_override; + options_override.env = options_.env; + options_override.file_checksum_gen_factory = + options_.file_checksum_gen_factory; + options_override.comparator = options_.comparator; + options_override.merge_operator = options_.merge_operator; + options_override.compaction_filter = options_.compaction_filter; + options_override.compaction_filter_factory = + options_.compaction_filter_factory; + options_override.prefix_extractor = options_.prefix_extractor; + options_override.table_factory = options_.table_factory; + options_override.sst_partitioner_factory = options_.sst_partitioner_factory; + options_override.statistics = statistics_; + + Status s = DB::OpenAndCompact( + db_path_, db_path_ + "/" + 
ROCKSDB_NAMESPACE::ToString(info.job_id), + compaction_input, compaction_service_result, options_override); + if (is_override_wait_result) { + *compaction_service_result = override_wait_result; + } + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + int GetCompactionNum() override { return compaction_num_.load(); } + + CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; } + CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; } + + private: + InstrumentedMutex mutex_; + std::atomic_int compaction_num_{0}; + std::map jobs_; + const std::string db_path_; + Options options_; + std::shared_ptr statistics_; + CompactionServiceJobInfo start_info_; + CompactionServiceJobInfo wait_info_; +}; + +// This is only for listing test classes +enum TestCompactionServiceType { + MyTestCompactionServiceType, + MyTestCompactionServiceLegacyType, +}; + +class CompactionServiceTest + : public DBTestBase, + public testing::WithParamInterface { + public: + explicit CompactionServiceTest() + : DBTestBase("compaction_service_test", true) {} + + protected: + void ReopenWithCompactionService(Options* options) { + options->env = env_; + primary_statistics_ = CreateDBStatistics(); + options->statistics = primary_statistics_; + compactor_statistics_ = CreateDBStatistics(); + TestCompactionServiceType cs_type = GetParam(); + switch (cs_type) { + case MyTestCompactionServiceType: + compaction_service_ = std::make_shared( + dbname_, *options, compactor_statistics_); + break; + case MyTestCompactionServiceLegacyType: + compaction_service_ = std::make_shared( + dbname_, *options, compactor_statistics_); + break; + default: + assert(false); + } + options->compaction_service = compaction_service_; + DestroyAndReopen(*options); + } + + Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); } + + Statistics* GetPrimaryStatistics() 
{ return primary_statistics_.get(); } + + TestCompactionServiceBase* GetCompactionService() { + CompactionService* cs = compaction_service_.get(); + return dynamic_cast(cs); + } + + void GenerateTestData() { + // Generate 20 files @ L2 + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + // Generate 10 files @ L1 overlap with all 20 files @ L2 + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + ASSERT_EQ(FilesPerLevel(), "0,10,20"); + } + + void VerifyTestData() { + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + } + + private: + std::shared_ptr compactor_statistics_; + std::shared_ptr primary_statistics_; + std::shared_ptr compaction_service_; +}; + +TEST_P(CompactionServiceTest, BasicCompactions) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + Statistics* primary_statistics = GetPrimaryStatistics(); + Statistics* compactor_statistics = GetCompactorStatistics(); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + 
} + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + + // make sure the compaction statistics is only recorded on the remote side + ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1); + ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1); + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0); + // even with remote compaction, primary host still needs to read SST files to + // `verify_table()`. + ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1); + // all the compaction write happens on the remote side + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES)); + ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES), + primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES)); + // compactor is already the remote side, which doesn't have remote + ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0); + ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + 0); + + // Test failed compaction + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) { + // override job status + auto s = static_cast(status); + *s = Status::Aborted("MyTestCompactionService failed to compact!"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s; + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + s = Put(Key(key_id), "value_new" + ToString(key_id)); + if (s.IsAborted()) { + break; + } + } + if (s.IsAborted()) { + break; + } + s = Flush(); + if (s.IsAborted()) { + break; + } + s = dbfull()->TEST_WaitForCompact(); + if (s.IsAborted()) { + break; + } + } + ASSERT_TRUE(s.IsAborted()); +} + +TEST_P(CompactionServiceTest, ManualCompaction) { + 
Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + start_str = Key(120); + start = start_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + end_str = Key(92); + end = end_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); +} + +TEST_P(CompactionServiceTest, FailedToStart) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + + auto my_cs = GetCompactionService(); + my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_TRUE(s.IsIncomplete()); +} + +TEST_P(CompactionServiceTest, InvalidResult) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + + auto my_cs = GetCompactionService(); + my_cs->OverrideWaitResult("Invalid Str"); + + std::string start_str = Key(15); + 
std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_FALSE(s.ok()); +} + +TEST_P(CompactionServiceTest, SubCompaction) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + VerifyTestData(); + + auto my_cs = GetCompactionService(); + int compaction_num_before = my_cs->GetCompactionNum(); + + auto cro = CompactRangeOptions(); + cro.max_subcompactions = 10; + Status s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(s); + VerifyTestData(); + int compaction_num = my_cs->GetCompactionNum() - compaction_num_before; + // make sure there's sub-compaction by checking the compaction number + ASSERT_GE(compaction_num, 2); +} + +class PartialDeleteCompactionFilter : public CompactionFilter { + public: + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int i = std::stoi(key.ToString().substr(3)); + if (i > 5 && i <= 105) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + const char* Name() const override { return "PartialDeleteCompactionFilter"; } +}; + +TEST_P(CompactionServiceTest, CompactionFilter) { + Options options = CurrentOptions(); + std::unique_ptr delete_comp_filter( + new PartialDeleteCompactionFilter()); + options.compaction_filter = delete_comp_filter.get(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 
20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i > 5 && i <= 105) { + ASSERT_EQ(result, "NOT_FOUND"); + } else if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); +} + +TEST_P(CompactionServiceTest, Snapshot) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), "value2")); + ASSERT_OK(Put(Key(3), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + auto my_cs = GetCompactionService(); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + ASSERT_EQ("value1", Get(Key(1), s1)); + ASSERT_EQ("value2", Get(Key(1))); + db_->ReleaseSnapshot(s1); +} + +TEST_P(CompactionServiceTest, ConcurrentCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; + options.max_background_jobs = 20; + ReopenWithCompactionService(&options); + GenerateTestData(); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + std::vector threads; + for (const auto& file : meta.levels[1].files) { + threads.push_back(std::thread([&]() { + std::string fname = file.db_path + "/" + file.name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2)); + })); + } + + for (auto& thread : threads) { + thread.join(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, 
"value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = GetCompactionService(); + ASSERT_EQ(my_cs->GetCompactionNum(), 10); + ASSERT_EQ(FilesPerLevel(), "0,0,10"); +} + +TEST_P(CompactionServiceTest, CompactionInfo) { + // only test compaction info for new compaction service interface + if (GetParam() != MyTestCompactionServiceType) { + return; + } + + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + auto my_cs = + static_cast_with_check(GetCompactionService()); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_GE(comp_num, 1); + + CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(dbname_, info.db_name); + std::string db_id, db_session_id; + ASSERT_OK(db_->GetDbIdentity(db_id)); + ASSERT_EQ(db_id, info.db_id); + ASSERT_OK(db_->GetDbSessionId(db_session_id)); + ASSERT_EQ(db_session_id, info.db_session_id); + ASSERT_EQ(Env::LOW, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(dbname_, info.db_name); + ASSERT_EQ(db_id, info.db_id); + ASSERT_EQ(db_session_id, info.db_session_id); + ASSERT_EQ(Env::LOW, info.priority); + + // Test priority USER + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + SstFileMetaData file = meta.levels[1].files[0]; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), + {file.db_path + "/" + file.name}, 2)); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(Env::USER, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(Env::USER, info.priority); + + 
// Test priority BOTTOM + env_->SetBackgroundThreads(1, Env::BOTTOM); + options.num_levels = 2; + ReopenWithCompactionService(&options); + my_cs = + static_cast_with_check(GetCompactionService()); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(Env::BOTTOM, info.priority); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(Env::BOTTOM, info.priority); +} + +TEST_P(CompactionServiceTest, FallbackLocalAuto) { + Options options = CurrentOptions(); + ReopenWithCompactionService(&options); + + auto my_cs = GetCompactionService(); + Statistics* compactor_statistics = GetCompactorStatistics(); + Statistics* primary_statistics = GetPrimaryStatistics(); + uint64_t compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + uint64_t primary_write_bytes = + primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + + 
ASSERT_EQ(my_cs->GetCompactionNum(), 0); + + // make sure the compaction statistics is only recorded on the local side + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0); +} + +TEST_P(CompactionServiceTest, FallbackLocalManual) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + + GenerateTestData(); + VerifyTestData(); + + auto my_cs = GetCompactionService(); + Statistics* compactor_statistics = GetCompactorStatistics(); + Statistics* primary_statistics = GetPrimaryStatistics(); + uint64_t compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + uint64_t primary_write_bytes = + primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + // re-enable remote compaction + my_cs->ResetOverride(); + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + // make sure the compaction statistics is only recorded on the remote side + ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES)); + ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + + // return run local again with API WaitForComplete + my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal); + start_str = Key(120); + start = start_str; + comp_num = 
my_cs->GetCompactionNum(); + compactor_write_bytes = + compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES); + primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(my_cs->GetCompactionNum(), + comp_num); // no remote compaction is run + // make sure the compaction statistics is only recorded on the local side + ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), + compactor_write_bytes); + ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), + primary_write_bytes); + ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), + compactor_write_bytes); + + // verify result after 2 manual compactions + VerifyTestData(); +} + +INSTANTIATE_TEST_CASE_P( + CompactionServiceTest, CompactionServiceTest, + ::testing::Values( + TestCompactionServiceType::MyTestCompactionServiceType, + TestCompactionServiceType::MyTestCompactionServiceLegacyType)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/file_pri.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/file_pri.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/file_pri.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/file_pri.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once +#include + +#include "db/version_edit.h" + +namespace ROCKSDB_NAMESPACE { +// We boost files that are closer to TTL limit. This boosting could be +// through FileMetaData.compensated_file_size but this compensated size +// is widely used as something similar to file size so dramatically boost +// the value might cause unintended consequences. +// +// This boosting algorithm can go very fancy, but here we use a simple +// formula which can satisify: +// (1) Different levels are triggered slightly differently to avoid +// too many cascading cases +// (2) Files in the same level get boosting more when TTL gets closer. +// +// Don't do any boosting before TTL has past by half. This is to make +// sure lower write amp for most of the case. And all levels should be +// fully boosted when total TTL compaction threshold triggers. +// Differientiate boosting ranges of each level by 1/2. This will make +// range for each level exponentially increasing. We could do it by +// having them to be equal, or go even fancier. We can adjust it after +// we observe the behavior in production. +// The threshold starting boosting: +// +------------------------------------------------------------------ + +// ^ ^ ^ ^ ^ ^ +// Age 0 ... | | second last level thresold +// | | +// | third last level +// | +// forth last level +// +// We arbitrarily set with 0 when a file is aged boost_age_start and +// grow linearly. The ratio is arbitrarily set so that when the next level +// starts to boost, the previous level's boosting amount is 16. 
+class FileTtlBooster { + public: + FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels, + int level) + : current_time_(current_time) { + if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) { + enabled_ = false; + boost_age_start_ = 0; + boost_step_ = 1; + } else { + enabled_ = true; + uint64_t all_boost_start_age = ttl / 2; + uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age; + uint64_t boost_age_range = + all_boost_age_range >> (num_non_empty_levels - level - 1); + boost_age_start_ = all_boost_start_age + boost_age_range; + const uint64_t kBoostRatio = 16; + // prevent 0 value to avoid divide 0 error. + boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1}); + } + } + + uint64_t GetBoostScore(FileMetaData* f) { + if (!enabled_) { + return 1; + } + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time >= current_time_) { + return 1; + } + uint64_t age = current_time_ - oldest_ancester_time; + if (age > boost_age_start_) { + // Use integer just for convenience. + // We could make all file_to_order double if we want. + // Technically this can overflow if users override timing and + // give a very high current time. Ignore the case for simplicity. + // Boosting is addition to current value, so +1. This will effectively + // make boosting to kick in after the first boost_step_ is reached. 
+ return (age - boost_age_start_) / boost_step_ + 1; + } + return 1; + } + + private: + bool enabled_; + uint64_t current_time_; + uint64_t boost_age_start_; + uint64_t boost_step_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/compaction/sst_partitioner.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "rocksdb/sst_partitioner.h" + +#include + +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +static std::unordered_map + sst_fixed_prefix_type_info = { +#ifndef ROCKSDB_LITE + {"length", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len) + : len_(len) { + RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info); +} + +PartitionerResult SstPartitionerFixedPrefix::ShouldPartition( + const PartitionerRequest& request) { + Slice last_key_fixed(*request.prev_user_key); + if (last_key_fixed.size() > len_) { + last_key_fixed.size_ = len_; + } + Slice current_key_fixed(*request.current_user_key); + if (current_key_fixed.size() > len_) { + current_key_fixed.size_ = len_; + } + return last_key_fixed.compare(current_key_fixed) != 0 ? 
kRequired + : kNotRequired; +} + +bool SstPartitionerFixedPrefix::CanDoTrivialMove( + const Slice& smallest_user_key, const Slice& largest_user_key) { + return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key, + 0)) == kNotRequired; +} + +std::unique_ptr +SstPartitionerFixedPrefixFactory::CreatePartitioner( + const SstPartitioner::Context& /* context */) const { + return std::unique_ptr(new SstPartitionerFixedPrefix(len_)); +} + +std::shared_ptr NewSstPartitionerFixedPrefixFactory( + size_t prefix_len) { + return std::make_shared(prefix_len); +} + +#ifndef ROCKSDB_LITE +namespace { +static int RegisterSstPartitionerFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + SstPartitionerFixedPrefixFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new SstPartitionerFixedPrefixFactory(0)); + return guard->get(); + }); + return 1; +} +} // namespace +#endif // ROCKSDB_LITE + +Status SstPartitionerFactory::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(options, value, nullptr, + result); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/comparator_db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/comparator_db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/comparator_db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/comparator_db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,10 +13,10 @@ #include "test_util/testutil.h" #include "util/hash.h" #include "util/kv_map.h" +#include "util/random.h" #include "util/string_util.h" #include "utilities/merge_operators.h" -using 
std::unique_ptr; namespace ROCKSDB_NAMESPACE { namespace { @@ -317,7 +317,7 @@ INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest, testing::Values(test::kDefaultFormatVersion)); INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest, - testing::Values(test::kLatestFormatVersion)); + testing::Values(kLatestFormatVersion)); TEST_P(ComparatorDBTest, Bytewise) { for (int rand_seed = 301; rand_seed < 306; rand_seed++) { @@ -342,12 +342,12 @@ std::vector source_prefixes; // Randomly generate 5 prefixes for (int i = 0; i < 5; i++) { - source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8)); + source_prefixes.push_back(rnd.HumanReadableString(8)); } for (int j = 0; j < 20; j++) { int prefix_index = rnd.Uniform(static_cast(source_prefixes.size())); std::string key = source_prefixes[prefix_index] + - test::RandomHumanReadableString(&rnd, rnd.Uniform(8)); + rnd.HumanReadableString(rnd.Uniform(8)); source_strings.push_back(key); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/convenience.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/convenience.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/convenience.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/convenience.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { void CancelAllBackgroundWork(DB* db, bool wait) { - (static_cast_with_check(db->GetRootDB())) + (static_cast_with_check(db->GetRootDB())) ->CancelAllBackgroundWork(wait); } @@ -28,7 +28,7 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - return (static_cast_with_check(db->GetRootDB())) + return (static_cast_with_check(db->GetRootDB())) ->DeleteFilesInRanges(column_family, ranges, n, include_end); } @@ -44,7 +44,7 @@ std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); - ImmutableCFOptions ioptions(options); + ImmutableOptions 
ioptions(options); Status s = ioptions.fs->NewRandomAccessFile(file_path, FileOptions(env_options), @@ -59,9 +59,10 @@ new RandomAccessFileReader(std::move(file), file_path)); const bool kImmortal = true; s = ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, options.prefix_extractor.get(), env_options, + TableReaderOptions(ioptions, options.prefix_extractor, env_options, internal_comparator, false /* skip_filters */, - !kImmortal, -1 /* level */), + !kImmortal, false /* force_direct_prefetch */, + -1 /* level */), std::move(file_reader), file_size, &table_reader, false /* prefetch_index_and_filter_in_cache */); if (!s.ok()) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/corruption_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/corruption_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/corruption_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/corruption_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,37 +9,65 @@ #ifndef ROCKSDB_LITE -#include "rocksdb/db.h" - -#include #include #include #include + #include + #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/log_format.h" #include "db/version_set.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" +#include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" +#include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -static const int kValueSize = 1000; +static constexpr int kValueSize = 1000; +namespace { +// A wrapper that allows injection of errors. 
+class ErrorEnv : public EnvWrapper { + public: + bool writable_file_error_; + int num_writable_file_errors_; + explicit ErrorEnv(Env* _target) + : EnvWrapper(_target), + writable_file_error_(false), + num_writable_file_errors_(0) {} + const char* Name() const override { return "ErrorEnv"; } + + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override { + result->reset(); + if (writable_file_error_) { + ++num_writable_file_errors_; + return Status::IOError(fname, "fake error"); + } + return target()->NewWritableFile(fname, result, soptions); + } +}; +} // namespace class CorruptionTest : public testing::Test { public: - test::ErrorEnv env_; + std::shared_ptr env_guard_; + ErrorEnv* env_; std::string dbname_; std::shared_ptr tiny_cache_; Options options_; @@ -50,10 +78,16 @@ // set it to 0), test SequenceNumberRecovery will fail, likely because of a // bug in recovery code. Keep it 4 for now to make the test passes. tiny_cache_ = NewLRUCache(100, 4); + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(base_env, nullptr); + env_ = new ErrorEnv(base_env); options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; - options_.env = &env_; - dbname_ = test::PerThreadDBPath("corruption_test"); - DestroyDB(dbname_, options_); + options_.env = env_; + dbname_ = test::PerThreadDBPath(env_, "corruption_test"); + Status s = DestroyDB(dbname_, options_); + EXPECT_OK(s); db_ = nullptr; options_.create_if_missing = true; @@ -65,8 +99,19 @@ } ~CorruptionTest() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); delete db_; - DestroyDB(dbname_, Options()); + db_ = nullptr; + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still at %s\n", dbname_.c_str()); + } else { + Options opts; + opts.env = env_->target(); + 
EXPECT_OK(DestroyDB(dbname_, opts)); + } + delete env_; } void CloseDb() { @@ -81,7 +126,7 @@ if (opt.env == Options().env) { // If env is not overridden, replace it with ErrorEnv. // Otherwise, the test already uses a non-default Env. - opt.env = &env_; + opt.env = env_; } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; @@ -101,22 +146,24 @@ ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_)); } - void Build(int n, int flush_every = 0) { + void Build(int n, int start, int flush_every) { std::string key_space, value_space; WriteBatch batch; for (int i = 0; i < n; i++) { if (flush_every != 0 && i != 0 && i % flush_every == 0) { - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); } //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); - Slice key = Key(i, &key_space); + Slice key = Key(i + start, &key_space); batch.Clear(); - batch.Put(key, Value(i, &value_space)); + ASSERT_OK(batch.Put(key, Value(i + start, &value_space))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); } } + void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); } + void Check(int min_expected, int max_expected) { uint64_t next_expected = 0; uint64_t missed = 0; @@ -131,6 +178,7 @@ // occurred. 
Iterator* iter = db_->NewIterator(ReadOptions(false, true)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); uint64_t key; Slice in(iter->key()); if (!ConsumeDecimalNumber(&in, &key) || @@ -147,6 +195,7 @@ correct++; } } + iter->status().PermitUncheckedError(); delete iter; fprintf(stderr, @@ -157,47 +206,10 @@ ASSERT_GE(max_expected, correct); } - void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - const char* msg = strerror(errno); - FAIL() << fname << ": " << msg; - } - - if (offset < 0) { - // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { - offset = 0; - } else { - offset = static_cast(sbuf.st_size + offset); - } - } - if (offset > sbuf.st_size) { - offset = static_cast(sbuf.st_size); - } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = static_cast(sbuf.st_size - offset); - } - - // Do it - std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - ASSERT_TRUE(s.ok()) << s.ToString(); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; - } - s = WriteStringToFile(Env::Default(), contents, fname); - ASSERT_TRUE(s.ok()) << s.ToString(); - Options options; - EnvOptions env_options; - options.file_system.reset(new LegacyFileSystemWrapper(options.env)); - ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname)); - } - void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { // Pick file to corrupt std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); uint64_t number; FileType type; std::string fname; @@ -212,7 +224,7 @@ } ASSERT_TRUE(!fname.empty()) << filetype; - CorruptFile(fname, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); } // corrupts exactly one file at level `level`. 
if no file found at level, @@ -222,7 +234,8 @@ db_->GetLiveFilesMetaData(&metadata); for (const auto& m : metadata) { if (m.level == level) { - CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset, + bytes_to_corrupt)); return; } } @@ -256,11 +269,11 @@ // preserves the implementation that was in place when all of the // magic values in this file were picked. *storage = std::string(kValueSize, ' '); - return Slice(*storage); } else { Random r(k); - return test::RandomString(&r, kValueSize, storage); + *storage = r.RandomString(kValueSize); } + return Slice(*storage); } }; @@ -277,8 +290,8 @@ // is not available for WAL though. CloseDb(); #endif - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record + Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block ASSERT_TRUE(!TryReopen().ok()); options_.paranoid_checks = false; Reopen(&options_); @@ -288,14 +301,14 @@ } TEST_F(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; Status s = TryReopen(); ASSERT_TRUE(!s.ok()); } TEST_F(CorruptionTest, NewFileErrorDuringWrite) { // Do enough writing to force minor compaction - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; const int num = static_cast(3 + (Options().write_buffer_size / kValueSize)); std::string value_storage; @@ -303,7 +316,7 @@ bool failed = false; for (int i = 0; i < num; i++) { WriteBatch batch; - batch.Put("a", Value(100, &value_storage)); + ASSERT_OK(batch.Put("a", Value(100, &value_storage))); s = db_->Write(WriteOptions(), &batch); if (!s.ok()) { failed = true; @@ -311,17 +324,17 @@ ASSERT_TRUE(!failed || !s.ok()); } ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; + 
ASSERT_GE(env_->num_writable_file_errors_, 1); + env_->writable_file_error_ = false; Reopen(); } TEST_F(CorruptionTest, TableFile) { Build(100); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); - dbi->TEST_CompactRange(1, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); Corrupt(kTableFile, 100, 1); Check(99, 99); @@ -330,7 +343,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) { Options options; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_->target()); options.env = &senv; // Disable block cache as we are going to check checksum for // the same file twice and measure number of reads. @@ -341,10 +354,10 @@ Reopen(&options); Build(10000); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); - dbi->TEST_CompactRange(1, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); senv.count_random_reads_ = true; senv.random_read_counter_.Reset(); @@ -388,14 +401,14 @@ Reopen(&options); // build 2 tables, flush at 5000 Build(10000, 5000); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); // corrupt an index block of an entire file Corrupt(kTableFile, -2000, 500); options.paranoid_checks = false; Reopen(&options); - dbi = reinterpret_cast(db_); + dbi = static_cast_with_check(db_); // one full file may be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted Check(0, 5000); @@ -435,9 +448,9 @@ TEST_F(CorruptionTest, CorruptedDescriptor) { 
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); Corrupt(kDescriptorFile, 0, 1000); Status s = TryReopen(); @@ -452,12 +465,13 @@ TEST_F(CorruptionTest, CompactionInputError) { Options options; + options.env = env_; Reopen(&options); Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); - dbi->TEST_CompactRange(0, nullptr, nullptr); - dbi->TEST_CompactRange(1, nullptr, nullptr); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr)); ASSERT_EQ(1, Property("rocksdb.num-files-at-level2")); Corrupt(kTableFile, 100, 1); @@ -472,29 +486,30 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { Options options; + options.env = env_; options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; Reopen(&options); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); // Fill levels >= 1 for (int level = 1; level < dbi->NumberLevels(); level++) { - dbi->Put(WriteOptions(), "", "begin"); - dbi->Put(WriteOptions(), "~", "end"); - dbi->TEST_FlushMemTable(); + ASSERT_OK(dbi->Put(WriteOptions(), "", "begin")); + ASSERT_OK(dbi->Put(WriteOptions(), "~", "end")); + ASSERT_OK(dbi->TEST_FlushMemTable()); for (int comp_level = 0; comp_level < dbi->NumberLevels() - level; ++comp_level) { - dbi->TEST_CompactRange(comp_level, nullptr, nullptr); + ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr)); } } Reopen(&options); - dbi = reinterpret_cast(db_); + dbi = static_cast_with_check(db_); Build(10); - dbi->TEST_FlushMemTable(); - dbi->TEST_WaitForCompact(); + 
ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForCompact()); ASSERT_EQ(1, Property("rocksdb.num-files-at-level0")); CorruptTableFileAtLevel(0, 100, 1); @@ -518,8 +533,8 @@ TEST_F(CorruptionTest, UnrelatedKeys) { Build(10); - DBImpl* dbi = reinterpret_cast(db_); - dbi->TEST_FlushMemTable(); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); Corrupt(kTableFile, 100, 1); ASSERT_NOK(dbi->VerifyChecksum()); @@ -528,7 +543,7 @@ std::string v; ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); ASSERT_EQ(Value(1000, &tmp2).ToString(), v); - dbi->TEST_FlushMemTable(); + ASSERT_OK(dbi->TEST_FlushMemTable()); ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); ASSERT_EQ(Value(1000, &tmp2).ToString(), v); } @@ -542,37 +557,40 @@ ASSERT_EQ(static_cast(1), metadata.size()); std::string filename = dbname_ + metadata[0].name; - std::unique_ptr file; - ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions())); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - filename)); + FileOptions file_opts; + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts, + &file_reader, nullptr)); uint64_t file_size; - ASSERT_OK(options_.env->GetFileSize(filename, &file_size)); + ASSERT_OK( + fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); BlockHandle range_del_handle; - ASSERT_OK(FindMetaBlock( + ASSERT_OK(FindMetaBlockInFile( file_reader.get(), file_size, kBlockBasedTableMagicNumber, - ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle)); + ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle)); ASSERT_OK(TryReopen()); - CorruptFile(filename, static_cast(range_del_handle.offset()), 1); + ASSERT_OK(test::CorruptFile(env_, filename, + static_cast(range_del_handle.offset()), 1)); ASSERT_TRUE(TryReopen().IsCorruption()); } 
TEST_F(CorruptionTest, FileSystemStateCorrupted) { for (int iter = 0; iter < 2; ++iter) { Options options; + options.env = env_; options.paranoid_checks = true; options.create_if_missing = true; Reopen(&options); Build(10); ASSERT_OK(db_->Flush(FlushOptions())); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); std::vector metadata; dbi->GetLiveFilesMetaData(&metadata); - ASSERT_GT(metadata.size(), size_t(0)); + ASSERT_GT(metadata.size(), 0); std::string filename = dbname_ + metadata[0].name; delete db_; @@ -580,25 +598,326 @@ if (iter == 0) { // corrupt file size std::unique_ptr file; - env_.NewWritableFile(filename, &file, EnvOptions()); - file->Append(Slice("corrupted sst")); + ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions())); + ASSERT_OK(file->Append(Slice("corrupted sst"))); file.reset(); Status x = TryReopen(&options); ASSERT_TRUE(x.IsCorruption()); } else { // delete the file - env_.DeleteFile(filename); + ASSERT_OK(env_->DeleteFile(filename)); Status x = TryReopen(&options); - ASSERT_TRUE(x.IsPathNotFound()); + ASSERT_TRUE(x.IsCorruption()); + } + + ASSERT_OK(DestroyDB(dbname_, options_)); + } +} + +static const auto& corruption_modes = { + mock::MockTableFactory::kCorruptNone, mock::MockTableFactory::kCorruptKey, + mock::MockTableFactory::kCorruptValue, + mock::MockTableFactory::kCorruptReorderKey}; + +TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + db_ = nullptr; + s = DestroyDB(dbname_, options); + ASSERT_OK(s); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + mock->SetCorruptionMode(mode); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10); + s = 
db_->Flush(FlushOptions()); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { + Options options; + options.env = env_; + options.paranoid_file_checks = true; + options.create_if_missing = true; + options.check_flush_compaction_key_order = false; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + db_ = nullptr; + s = DestroyDB(dbname_, options); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(100, 2); + // ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + mock->SetCorruptionMode(mode); + s = dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string start, end; + assert(db_ != nullptr); // suppress false clang-analyze report + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + Build(10); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = 
static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); } + db_->ReleaseSnapshot(snap); + } +} - DestroyDB(dbname_, options_); +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10, 0, 0); + std::string start, end; + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(5, &start), Key(15, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(12, &start), Key(17, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(4, &end))); + Build(10, 10, 0); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); } } +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + std::string start, end; + Build(10); + 
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(6, &start), Key(8, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { + Options options; + options.env = env_; + options.create_if_missing = true; + options.allow_data_in_errors = true; + auto mode = mock::MockTableFactory::kCorruptKey; + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + + std::shared_ptr mock = + std::make_shared(); + mock->SetCorruptionMode(mode); + options.table_factory = mock; + + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(100, 2); + + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + Status s = dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(CorruptionTest, CompactionKeyOrderCheck) { + Options options; + options.env = env_; + options.paranoid_file_checks = false; + options.create_if_missing = true; + options.check_flush_compaction_key_order = false; + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey); + Build(100, 2); + 
DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone); + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); + ASSERT_NOK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); +} + +TEST_F(CorruptionTest, FlushKeyOrderCheck) { + Options options; + options.env = env_; + options.paranoid_file_checks = false; + options.create_if_missing = true; + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); + + ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1")); + + int cnt = 0; + // Generate some out of order keys from the memtable + SyncPoint::GetInstance()->SetCallBack( + "MemTableIterator::Next:0", [&](void* arg) { + MemTableRep::Iterator* mem_iter = + static_cast(arg); + if (++cnt == 3) { + mem_iter->Prev(); + mem_iter->Prev(); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Status s = static_cast_with_check(db_)->TEST_FlushMemTable(); + ASSERT_NOK(s); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(CorruptionTest, DisableKeyOrderCheck) { + ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}})); + DBImpl* dbi = static_cast_with_check(db_); + + SyncPoint::GetInstance()->SetCallBack( + "OutputValidator::Add:order_check", + [&](void* /*arg*/) { ASSERT_TRUE(false); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1")); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1")); + ASSERT_OK(dbi->TEST_FlushMemTable()); + 
ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(CorruptionTest, VerifyWholeTableChecksum) { + CloseDb(); + Options options; + options.env = env_; + ASSERT_OK(DestroyDB(dbname_, options)); + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(&options); + + Build(10, 5); + + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + CloseDb(); + + // Corrupt the first byte of each table file, this must be data block. + Corrupt(kTableFile, 0, 1); + + ASSERT_OK(TryReopen(&options)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + auto* s = reinterpret_cast(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/cuckoo_table_db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "table/cuckoo/cuckoo_table_factory.h" @@ -13,6 +14,7 @@ 
#include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -46,9 +48,7 @@ return options; } - DBImpl* dbfull() { - return reinterpret_cast(db_); - } + DBImpl* dbfull() { return static_cast_with_check(db_); } // The following util methods are copied from plain_table_db_test. void Reopen(Options* options = nullptr) { @@ -64,6 +64,15 @@ ASSERT_OK(DB::Open(opts, dbname_, &db_)); } + void DestroyAndReopen(Options* options) { + assert(options); + ASSERT_OK(db_->Close()); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + Reopen(options); + } + Status Put(const Slice& k, const Slice& v) { return db_->Put(WriteOptions(), k, v); } @@ -86,8 +95,8 @@ int NumTableFilesAtLevel(int level) { std::string property; - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + EXPECT_TRUE(db_->GetProperty("rocksdb.num-files-at-level" + ToString(level), + &property)); return atoi(property.c_str()); } @@ -121,10 +130,11 @@ ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(3U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); @@ -138,9 +148,10 @@ ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key5", "v5")); ASSERT_OK(Put("key6", "v6")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(2U, ptc.size()); auto row = ptc.begin(); 
ASSERT_EQ(3U, row->second->num_entries); @@ -156,8 +167,9 @@ ASSERT_OK(Delete("key6")); ASSERT_OK(Delete("key5")); ASSERT_OK(Delete("key4")); - dbfull()->TEST_FlushMemTable(); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(3U, ptc.size()); row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -178,10 +190,11 @@ ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key1", "v3")); // Duplicate - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); + VerifySstUniqueIds(ptc); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(2U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); @@ -206,12 +219,12 @@ TEST_F(CuckooTableDBTest, Uint64Comparator) { Options options = CurrentOptions(); options.comparator = test::Uint64Comparator(); - Reopen(&options); + DestroyAndReopen(&options); ASSERT_OK(Put(Uint64Key(1), "v1")); ASSERT_OK(Put(Uint64Key(2), "v2")); ASSERT_OK(Put(Uint64Key(3), "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("v2", Get(Uint64Key(2))); @@ -220,10 +233,10 @@ // Add more keys. ASSERT_OK(Delete(Uint64Key(2))); // Delete. - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. 
ASSERT_OK(Put(Uint64Key(4), "v4")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2))); ASSERT_EQ("v0", Get(Uint64Key(3))); @@ -243,11 +256,11 @@ for (int idx = 0; idx < 28; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */)); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 28; ++idx) { ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); @@ -266,15 +279,15 @@ for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a'))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); // Generate one more file in level-0, and should trigger level-0 compaction for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel()); for (int idx = 0; idx < 11; ++idx) { @@ -295,7 +308,7 @@ ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using plain table. 
std::shared_ptr block_based_factory( @@ -311,7 +324,7 @@ Reopen(&options); ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key1", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using block based table. options.table_factory.reset(NewAdaptiveTableFactory( @@ -320,7 +333,7 @@ Reopen(&options); ASSERT_OK(Put("key5", "v6")); ASSERT_OK(Put("key2", "v7")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("key1")); ASSERT_EQ("v7", Get("key2")); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_basic_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,30 +6,44 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + #include "db/db_test_util.h" +#include "options/options_helper.h" #include "port/stack_trace.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/perf_context.h" +#include "rocksdb/table.h" #include "rocksdb/utilities/debug.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" #endif +#include "util/file_checksum_helper.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" namespace ROCKSDB_NAMESPACE { class DBBasicTest : public DBTestBase { public: - DBBasicTest() : DBTestBase("/db_basic_test") {} + DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {} }; TEST_F(DBBasicTest, OpenWhenOpen) { Options options = CurrentOptions(); options.env = env_; - ROCKSDB_NAMESPACE::DB* db2 = nullptr; - ROCKSDB_NAMESPACE::Status s = DB::Open(options, dbname_, &db2); - + DB* db2 = nullptr; + Status s = DB::Open(options, dbname_, &db2); + ASSERT_NOK(s) << [db2]() { + delete db2; + return "db2 open: ok"; + }(); ASSERT_EQ(Status::Code::kIOError, s.code()); ASSERT_EQ(Status::SubCode::kNone, s.subcode()); ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr); @@ -37,6 +51,62 @@ delete db2; } +TEST_F(DBBasicTest, UniqueSession) { + Options options = CurrentOptions(); + std::string sid1, sid2, sid3, sid4; + + ASSERT_OK(db_->GetDbSessionId(sid1)); + Reopen(options); + ASSERT_OK(db_->GetDbSessionId(sid2)); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(db_->GetDbSessionId(sid4)); + Reopen(options); + ASSERT_OK(db_->GetDbSessionId(sid3)); + + ASSERT_NE(sid1, sid2); + ASSERT_NE(sid1, sid3); + ASSERT_NE(sid2, sid3); + + ASSERT_EQ(sid2, sid4); + + // Expected compact format for session ids (see notes in implementation) + TestRegex 
expected("[0-9A-Z]{20}"); + EXPECT_MATCHES_REGEX(sid1, expected); + EXPECT_MATCHES_REGEX(sid2, expected); + EXPECT_MATCHES_REGEX(sid3, expected); + +#ifndef ROCKSDB_LITE + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(db_->GetDbSessionId(sid1)); + // Test uniqueness between readonly open (sid1) and regular open (sid3) + ASSERT_NE(sid1, sid3); + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_OK(db_->GetDbSessionId(sid2)); + ASSERT_EQ("v1", Get("foo")); + ASSERT_OK(db_->GetDbSessionId(sid3)); + + ASSERT_NE(sid1, sid2); + + ASSERT_EQ(sid2, sid3); +#endif // ROCKSDB_LITE + + CreateAndReopenWithCF({"goku"}, options); + ASSERT_OK(db_->GetDbSessionId(sid1)); + ASSERT_OK(Put("bar", "e1")); + ASSERT_OK(db_->GetDbSessionId(sid2)); + ASSERT_EQ("e1", Get("bar")); + ASSERT_OK(db_->GetDbSessionId(sid3)); + ReopenWithColumnFamilies({"default", "goku"}, options); + ASSERT_OK(db_->GetDbSessionId(sid4)); + + ASSERT_EQ(sid1, sid2); + ASSERT_EQ(sid2, sid3); + + ASSERT_NE(sid1, sid4); +} + #ifndef ROCKSDB_LITE TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); @@ -44,29 +114,46 @@ ASSERT_OK(Put("foo", "v3")); Close(); + auto verify_one_iter = [&](Iterator* iter) { + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } + // Always expect two keys: "foo" and "bar" + ASSERT_EQ(count, 2); + }; + + auto verify_all_iters = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + verify_one_iter(iter); + delete iter; + + std::vector iters; + ASSERT_OK(db_->NewIterators(ReadOptions(), + {dbfull()->DefaultColumnFamily()}, &iters)); + ASSERT_EQ(static_cast(1), iters.size()); + verify_one_iter(iters[0]); + delete iters[0]; + }; + auto options = CurrentOptions(); assert(options.env == env_); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); - Iterator* iter = db_->NewIterator(ReadOptions()); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); 
iter->Next()) { - ASSERT_OK(iter->status()); - ++count; - } - ASSERT_EQ(count, 2); - delete iter; + verify_all_iters(); Close(); // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); + verify_all_iters(); ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); } @@ -81,7 +168,7 @@ assert(options.env == env_); ASSERT_OK(ReadOnlyReopen(options)); std::string db_id1; - db_->GetDbIdentity(db_id1); + ASSERT_OK(db_->GetDbIdentity(db_id1)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); Iterator* iter = db_->NewIterator(ReadOptions()); @@ -96,7 +183,7 @@ // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -104,7 +191,7 @@ ASSERT_EQ("v2", Get("bar")); ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); std::string db_id2; - db_->GetDbIdentity(db_id2); + ASSERT_OK(db_->GetDbIdentity(db_id2)); ASSERT_EQ(db_id1, db_id2); } @@ -119,7 +206,7 @@ Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); Status s = Put("new", "value"); @@ -137,12 +224,12 @@ Reopen(options); // Add more L0 files ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); @@ -159,7 +246,7 @@ ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h'))); ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i'))); ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j'))); - 
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(3, NumTableFilesAtLevel(1)); Close(); @@ -217,8 +304,8 @@ int i = 0; while (NumTableFilesAtLevel(2, 1) == 0) { ASSERT_OK(Put(1, Key(i++), value)); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } options.num_levels = 1; @@ -272,8 +359,8 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "a", Slice()); - SingleDelete(1, "a"); + ASSERT_OK(Put(1, "a", Slice())); + ASSERT_OK(SingleDelete(1, "a")); ASSERT_OK(Flush(1)); ASSERT_EQ("[ ]", AllEntriesFor("a", 1)); @@ -319,12 +406,19 @@ TEST_F(DBBasicTest, CheckLock) { do { - DB* localdb; + DB* localdb = nullptr; Options options = CurrentOptions(); ASSERT_OK(TryReopen(options)); // second open should fail - ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); + Status s = DB::Open(options, dbname_, &localdb); + ASSERT_NOK(s) << [localdb]() { + delete localdb; + return "localdb open: ok"; + }(); +#ifdef OS_LINUX + ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos); +#endif // OS_LINUX } while (ChangeCompactOptions()); } @@ -392,7 +486,7 @@ sleeping_task_low.WaitUntilDone(); } -TEST_F(DBBasicTest, FLUSH) { +TEST_F(DBBasicTest, Flush) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); WriteOptions writeOpt = WriteOptions(); @@ -513,29 +607,30 @@ #ifndef ROCKSDB_LITE TEST_F(DBBasicTest, Snapshot) { + env_->SetMockSleep(); anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - Put(0, "foo", "0v1"); - Put(1, "foo", "1v1"); + ASSERT_OK(Put(0, "foo", "0v1")); + ASSERT_OK(Put(1, "foo", "1v1")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_EQ(1U, GetNumSnapshots()); uint64_t time_snap1 = 
GetTimeOldestSnapshots(); ASSERT_GT(time_snap1, 0U); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v2"); - Put(1, "foo", "1v2"); + ASSERT_OK(Put(0, "foo", "0v2")); + ASSERT_OK(Put(1, "foo", "1v2")); - env_->addon_time_.fetch_add(1); + env_->MockSleepForSeconds(1); const Snapshot* s2 = db_->GetSnapshot(); ASSERT_EQ(2U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v3"); - Put(1, "foo", "1v3"); + ASSERT_OK(Put(0, "foo", "0v3")); + ASSERT_OK(Put(1, "foo", "1v3")); { ManagedSnapshot s3(db_); @@ -543,8 +638,8 @@ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v4"); - Put(1, "foo", "1v4"); + ASSERT_OK(Put(0, "foo", "0v4")); + ASSERT_OK(Put(1, "foo", "1v4")); ASSERT_EQ("0v1", Get(0, "foo", s1)); ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); @@ -584,60 +679,79 @@ #endif // ROCKSDB_LITE -TEST_F(DBBasicTest, CompactBetweenSnapshots) { +class DBBasicMultiConfigs : public DBBasicTest, + public ::testing::WithParamInterface { + public: + DBBasicMultiConfigs() { option_config_ = GetParam(); } + + static std::vector GenerateOptionConfigs() { + std::vector option_configs; + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) { + option_configs.push_back(option_config); + } + } + return option_configs; + } +}; + +TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; - do { - Options options = CurrentOptions(options_override); - options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, options); - Random rnd(301); - FillLevels("a", "z", 1); + Options options = CurrentOptions(options_override); + options.disable_auto_compactions = true; + 
DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + FillLevels("a", "z", 1); - Put(1, "foo", "first"); - const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "second"); - Put(1, "foo", "third"); - Put(1, "foo", "fourth"); - const Snapshot* snapshot2 = db_->GetSnapshot(); - Put(1, "foo", "fifth"); - Put(1, "foo", "sixth"); - - // All entries (including duplicates) exist - // before any compaction or flush is triggered. - ASSERT_EQ(AllEntriesFor("foo", 1), - "[ sixth, fifth, fourth, third, second, first ]"); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ("first", Get(1, "foo", snapshot1)); + ASSERT_OK(Put(1, "foo", "first")); + const Snapshot* snapshot1 = db_->GetSnapshot(); + ASSERT_OK(Put(1, "foo", "second")); + ASSERT_OK(Put(1, "foo", "third")); + ASSERT_OK(Put(1, "foo", "fourth")); + const Snapshot* snapshot2 = db_->GetSnapshot(); + ASSERT_OK(Put(1, "foo", "fifth")); + ASSERT_OK(Put(1, "foo", "sixth")); + + // All entries (including duplicates) exist + // before any compaction or flush is triggered. + ASSERT_EQ(AllEntriesFor("foo", 1), + "[ sixth, fifth, fourth, third, second, first ]"); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ("first", Get(1, "foo", snapshot1)); - // After a flush, "second", "third" and "fifth" should - // be removed - ASSERT_OK(Flush(1)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); + // After a flush, "second", "third" and "fifth" should + // be removed + ASSERT_OK(Flush(1)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); - // after we release the snapshot1, only two values left - db_->ReleaseSnapshot(snapshot1); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - - // We have only one valid snapshot snapshot2. 
Since snapshot1 is - // not valid anymore, "first" should be removed by a compaction. - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); - - // after we release the snapshot2, only one value should be left - db_->ReleaseSnapshot(snapshot2); - FillLevels("a", "z", 1); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - ASSERT_EQ("sixth", Get(1, "foo")); - ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); - } while (ChangeOptions(kSkipFIFOCompaction)); + // after we release the snapshot1, only two values left + db_->ReleaseSnapshot(snapshot1); + FillLevels("a", "z", 1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + + // We have only one valid snapshot snapshot2. Since snapshot1 is + // not valid anymore, "first" should be removed by a compaction. + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); + + // after we release the snapshot2, only one value should be left + db_->ReleaseSnapshot(snapshot2); + FillLevels("a", "z", 1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + ASSERT_EQ("sixth", Get(1, "foo")); + ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); } +INSTANTIATE_TEST_CASE_P( + DBBasicMultiConfigs, DBBasicMultiConfigs, + ::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs())); + TEST_F(DBBasicTest, DBOpen_Options) { Options options = CurrentOptions(); Close(); @@ -685,18 +799,18 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); + ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); // Write two new keys - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + 
ASSERT_OK(Flush(1)); // Case1: Delete followed by a put - Delete(1, "foo"); - Put(1, "foo", "v2"); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); // After the current memtable is flushed, the DEL should @@ -704,66 +818,66 @@ ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); // Case 2: Delete followed by another delete - Delete(1, "foo"); - Delete(1, "foo"); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 3: Put followed by a delete - Put(1, "foo", "v3"); - Delete(1, "foo"); + ASSERT_OK(Put(1, "foo", "v3")); + ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 4: Put followed by another Put - Put(1, "foo", "v4"); - Put(1, "foo", "v5"); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_OK(Put(1, "foo", "v5")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, 
nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: Put followed by snapshot followed by another Put // Both puts should remain. - Put(1, "foo", "v6"); + ASSERT_OK(Put(1, "foo", "v6")); const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "v7"); + ASSERT_OK(Put(1, "foo", "v7")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); db_->ReleaseSnapshot(snapshot); // clear database - Delete(1, "foo"); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: snapshot followed by a put followed by another Put // Only the last put should remain. 
const Snapshot* snapshot1 = db_->GetSnapshot(); - Put(1, "foo", "v8"); - Put(1, "foo", "v9"); + ASSERT_OK(Put(1, "foo", "v8")); + ASSERT_OK(Put(1, "foo", "v9")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); db_->ReleaseSnapshot(snapshot1); @@ -786,7 +900,7 @@ ASSERT_OK(Put(7, "popovich", "popovich")); for (int i = 0; i < 8; ++i) { - Flush(i); + ASSERT_OK(Flush(i)); auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), i + 1U); } @@ -859,16 +973,24 @@ } while (ChangeCompactOptions()); } -TEST_F(DBBasicTest, ChecksumTest) { +class DBBlockChecksumTest : public DBBasicTest, + public testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); + +TEST_P(DBBlockChecksumTest, BlockChecksumTest) { BlockBasedTableOptions table_options; + table_options.format_version = GetParam(); Options options = CurrentOptions(); - // change when new checksum type added - int max_checksum = static_cast(kxxHash64); const int kNumPerFile = 2; + const auto algs = GetSupportedChecksums(); + const int algs_size = static_cast(algs.size()); + // generate one table with each type of checksum - for (int i = 0; i <= max_checksum; ++i) { - table_options.checksum = static_cast(i); + for (int i = 0; i < algs_size; ++i) { + table_options.checksum = algs[i]; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); for (int j = 0; j < kNumPerFile; ++j) { @@ -878,15 +1000,20 @@ } // with each valid checksum type setting... 
- for (int i = 0; i <= max_checksum; ++i) { - table_options.checksum = static_cast(i); + for (int i = 0; i < algs_size; ++i) { + table_options.checksum = algs[i]; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); // verify every type of checksum (should be regardless of that setting) - for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) { + for (int j = 0; j < algs_size * kNumPerFile; ++j) { ASSERT_EQ(Key(j), Get(Key(j))); } } + + // Now test invalid checksum type + table_options.checksum = static_cast(123); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); } // On Windows you can have either memory mapped file or a file @@ -919,44 +1046,46 @@ #endif class TestEnv : public EnvWrapper { - public: - explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {} + public: + explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {} + static const char* kClassName() { return "TestEnv"; } + const char* Name() const override { return kClassName(); } - class TestLogger : public Logger { - public: - using Logger::Logv; - explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } - ~TestLogger() override { - if (!closed_) { - CloseHelper(); - } - } - void Logv(const char* /*format*/, va_list /*ap*/) override {} - - protected: - Status CloseImpl() override { return CloseHelper(); } - - private: - Status CloseHelper() { - env->CloseCountInc(); - ; - return Status::IOError(); - } - TestEnv* env; - }; - - void CloseCountInc() { close_count++; } - - int GetCloseCount() { return close_count; } - - Status NewLogger(const std::string& /*fname*/, - std::shared_ptr* result) override { - result->reset(new TestLogger(this)); - return Status::OK(); + class TestLogger : public Logger { + public: + using Logger::Logv; + explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + 
CloseHelper().PermitUncheckedError(); + } } + void Logv(const char* /*format*/, va_list /*ap*/) override {} + + protected: + Status CloseImpl() override { return CloseHelper(); } private: - int close_count; + Status CloseHelper() { + env->CloseCountInc(); + ; + return Status::IOError(); + } + TestEnv* env; + }; + + void CloseCountInc() { close_count++; } + + int GetCloseCount() { return close_count; } + + Status NewLogger(const std::string& /*fname*/, + std::shared_ptr* result) override { + result->reset(new TestLogger(this)); + return Status::OK(); + } + + private: + int close_count; }; TEST_F(DBBasicTest, DBClose) { @@ -1008,7 +1137,7 @@ Options options = GetDefaultOptions(); options.create_if_missing = true; options.manual_wal_flush = true; - options.write_buffer_size=100; + options.write_buffer_size = 100; options.env = fault_injection_env.get(); Reopen(options); @@ -1018,9 +1147,15 @@ ASSERT_OK(Put("key3", "value3")); fault_injection_env->SetFilesystemActive(false); Status s = dbfull()->Close(); + ASSERT_NE(s, Status::OK()); + // retry should return the same error + s = dbfull()->Close(); + ASSERT_NE(s, Status::OK()); fault_injection_env->SetFilesystemActive(true); + // retry close() is no-op even the system is back. 
Could be improved if + // Close() is retry-able: #9029 + s = dbfull()->Close(); ASSERT_NE(s, Status::OK()); - Destroy(options); } @@ -1048,7 +1183,7 @@ } int get_sv_count = 0; - ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast(db_); + ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { @@ -1066,7 +1201,7 @@ } if (get_sv_count == 11) { for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( + auto* cfd = static_cast_with_check( db->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); @@ -1117,9 +1252,10 @@ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2"); for (int cf = 0; cf < 8; ++cf) { - auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(cf)) - ->cfd(); + auto* cfd = + static_cast_with_check( + static_cast_with_check(db_)->GetColumnFamilyHandle(cf)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); } @@ -1179,9 +1315,10 @@ "cf" + std::to_string(j) + "_val" + std::to_string(retries)); } for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(i)) - ->cfd(); + auto* cfd = + static_cast_with_check( + static_cast_with_check(db_)->GetColumnFamilyHandle(i)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } } @@ -1198,7 +1335,7 @@ } int get_sv_count = 0; - ROCKSDB_NAMESPACE::DBImpl* db = reinterpret_cast(db_); + ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { @@ -1210,7 +1347,7 @@ } if (get_sv_count == 8) { for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( + auto* cfd = static_cast_with_check( 
db->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_TRUE( @@ -1238,13 +1375,36 @@ ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val"); } for (int i = 0; i < 8; ++i) { - auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(i)) - ->cfd(); + auto* cfd = + static_cast_with_check( + static_cast_with_check(db_)->GetColumnFamilyHandle(i)) + ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } } +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"one", "two"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(2, "baz", "xyz")); + ASSERT_OK(Put(1, "abc", "def")); + + // Note: keys for the same CF do not form a consecutive range + std::vector cfs{1, 2, 1}; + std::vector keys{"foo", "baz", "abc"}; + std::vector values; + + values = + MultiGet(cfs, keys, /* snapshot */ nullptr, /* batched */ GetParam()); + + ASSERT_EQ(values.size(), 3); + ASSERT_EQ(values[0], "bar"); + ASSERT_EQ(values[1], "xyz"); + ASSERT_EQ(values[2], "def"); +} + INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam, testing::Bool()); @@ -1289,14 +1449,18 @@ } while (ChangeCompactOptions()); } -TEST_F(DBBasicTest, MultiGetBatchedSimpleSorted) { +TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); SetPerfLevel(kEnableCount); + // To expand the power of this test, generate > 1 table file and + // mix with memtable ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k5", "v5")); ASSERT_OK(Delete(1, "no_key")); @@ -1327,7 +1491,58 @@ ASSERT_TRUE(s[5].IsNotFound()); SetPerfLevel(kDisable); - } while (ChangeCompactOptions()); + } while (ChangeOptions()); +} + +TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) { + Options opts = CurrentOptions(); 
+ opts.merge_operator = MergeOperators::CreateStringAppendOperator(); + CreateAndReopenWithCF({"pikachu"}, opts); + SetPerfLevel(kEnableCount); + // To expand the power of this test, generate > 1 table file and + // mix with memtable + ASSERT_OK(Merge(1, "k1", "v1")); + ASSERT_OK(Merge(1, "k2", "v2")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k3", "v3")); + ASSERT_OK(Merge(1, "k4", "v4")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k4", "v4_2")); + ASSERT_OK(Merge(1, "k6", "v6")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k7", "v7")); + ASSERT_OK(Merge(1, "k8", "v8")); + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + get_perf_context()->Reset(); + + std::vector keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(), + values.data(), s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8"); + ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1"); + ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3"); + ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes); + + for (Status& status : s) { + ASSERT_OK(status); + } + + SetPerfLevel(kDisable); } TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { @@ -1340,12 +1555,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys 
= 0; } MoveFilesToLevel(2); @@ -1354,12 +1569,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1368,12 +1583,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1419,12 +1634,12 @@ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1433,12 +1648,12 @@ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1447,18 +1662,19 @@ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); for (int i = 0; i < 128; i += 9) { - ASSERT_OK(Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + ASSERT_OK( + Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); } std::vector keys; @@ -1490,6 +1706,310 @@ } } +TEST_F(DBBasicTest, MultiGetBatchedValueSizeInMemory) { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); + ASSERT_OK(Put(1, "k1", "v_1")); + ASSERT_OK(Put(1, "k2", "v_2")); + ASSERT_OK(Put(1, "k3", "v_3")); + ASSERT_OK(Put(1, "k4", "v_4")); + ASSERT_OK(Put(1, "k5", "v_5")); + ASSERT_OK(Put(1, "k6", "v_6")); + 
std::vector keys = {"k1", "k2", "k3", "k4", "k5", "k6"}; + std::vector values(keys.size()); + std::vector s(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + + get_perf_context()->Reset(); + ReadOptions ro; + ro.value_size_soft_limit = 11; + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + for (unsigned int i = 0; i < 4; i++) { + ASSERT_EQ(std::string(values[i].data(), values[i].size()), + "v_" + std::to_string(i + 1)); + } + + for (unsigned int i = 4; i < 6; i++) { + ASSERT_TRUE(s[i].IsAborted()); + } + + ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes); + SetPerfLevel(kDisable); +} + +TEST_F(DBBasicTest, MultiGetBatchedValueSize) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); + + ASSERT_OK(Put(1, "k6", "v6")); + ASSERT_OK(Put(1, "k7", "v7_")); + ASSERT_OK(Put(1, "k3", "v3_")); + ASSERT_OK(Put(1, "k4", "v4")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Delete(1, "k4")); + ASSERT_OK(Put(1, "k11", "v11")); + ASSERT_OK(Delete(1, "no_key")); + ASSERT_OK(Put(1, "k8", "v8_")); + ASSERT_OK(Put(1, "k13", "v13")); + ASSERT_OK(Put(1, "k14", "v14")); + ASSERT_OK(Put(1, "k15", "v15")); + ASSERT_OK(Put(1, "k16", "v16")); + ASSERT_OK(Put(1, "k17", "v17")); + ASSERT_OK(Flush(1)); + + ASSERT_OK(Put(1, "k1", "v1_")); + ASSERT_OK(Put(1, "k2", "v2_")); + ASSERT_OK(Put(1, "k5", "v5_")); + ASSERT_OK(Put(1, "k9", "v9_")); + ASSERT_OK(Put(1, "k10", "v10")); + ASSERT_OK(Delete(1, "k2")); + ASSERT_OK(Delete(1, "k6")); + + get_perf_context()->Reset(); + + std::vector keys({"k1", "k10", "k11", "k12", "k13", "k14", "k15", + "k16", "k17", "k2", "k3", "k4", "k5", "k6", "k7", + "k8", "k9", "no_key"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + ro.value_size_soft_limit = 20; + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), 
false); + + ASSERT_EQ(values.size(), keys.size()); + + // In memory keys + ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10"); + ASSERT_TRUE(s[9].IsNotFound()); // k2 + ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_"); + ASSERT_TRUE(s[13].IsNotFound()); // k6 + ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_"); + + // In sst files + ASSERT_EQ(std::string(values[2].data(), values[1].size()), "v11"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13"); + ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14"); + + // Remaining aborted after value_size exceeds. + ASSERT_TRUE(s[3].IsAborted()); + ASSERT_TRUE(s[6].IsAborted()); + ASSERT_TRUE(s[7].IsAborted()); + ASSERT_TRUE(s[8].IsAborted()); + ASSERT_TRUE(s[10].IsAborted()); + ASSERT_TRUE(s[11].IsAborted()); + ASSERT_TRUE(s[14].IsAborted()); + ASSERT_TRUE(s[15].IsAborted()); + ASSERT_TRUE(s[17].IsAborted()); + + // 6 kv pairs * 3 bytes per value (i.e. 
18) + ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes); + SetPerfLevel(kDisable); + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + int num_keys = 0; + + for (int i = 0; i < 64; ++i) { + ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(2); + + for (int i = 0; i < 64; i += 3) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + MoveFilesToLevel(1); + + for (int i = 0; i < 64; i += 5) { + ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); + num_keys++; + if (num_keys == 8) { + ASSERT_OK(Flush()); + num_keys = 0; + } + } + if (num_keys > 0) { + ASSERT_OK(Flush()); + num_keys = 0; + } + ASSERT_EQ(0, num_keys); + + for (int i = 0; i < 64; i += 9) { + ASSERT_OK( + Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); + } + + std::vector keys_str; + for (int i = 10; i < 50; ++i) { + keys_str.push_back("key_" + std::to_string(i)); + } + + std::vector keys(keys_str.size()); + for (int i = 0; i < 40; i++) { + keys[i] = Slice(keys_str[i]); + } + + std::vector values(keys_str.size()); + std::vector statuses(keys_str.size()); + ReadOptions read_options; + read_options.verify_checksums = true; + read_options.value_size_soft_limit = 380; + db_->MultiGet(read_options, 
dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + + ASSERT_EQ(values.size(), keys.size()); + + for (unsigned int j = 0; j < 26; ++j) { + int key = j + 10; + std::string value; + value.append("val_l2_" + std::to_string(key)); + if (key % 3 == 0) { + value.append(","); + value.append("val_l1_" + std::to_string(key)); + } + if (key % 5 == 0) { + value.append(","); + value.append("val_l0_" + std::to_string(key)); + } + if (key % 9 == 0) { + value.append(","); + value.append("val_mem_" + std::to_string(key)); + } + ASSERT_EQ(values[j], value); + ASSERT_OK(statuses[j]); + } + + // All remaning keys status is set Status::Abort + for (unsigned int j = 26; j < 40; j++) { + ASSERT_TRUE(statuses[j].IsAborted()); + } +} + +TEST_F(DBBasicTest, MultiGetStats) { + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_size = 1; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + int total_keys = 2000; + std::vector keys_str(total_keys); + std::vector keys(total_keys); + static size_t kMultiGetBatchSize = 100; + std::vector values(kMultiGetBatchSize); + std::vector s(kMultiGetBatchSize); + ReadOptions read_opts; + + Random rnd(309); + // Create Multiple SST files at multiple levels. 
+ for (int i = 0; i < 500; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + ASSERT_OK(Flush(1)); + } + } + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + for (int i = 501; i < 1000; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + ASSERT_OK(Flush(1)); + } + } + + ASSERT_OK(Flush(1)); + MoveFilesToLevel(2, 1); + + for (int i = 1001; i < total_keys; ++i) { + keys_str[i] = "k" + std::to_string(i); + keys[i] = Slice(keys_str[i]); + ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); + if (i % 100 == 0) { + ASSERT_OK(Flush(1)); + } + } + ASSERT_OK(Flush(1)); + MoveFilesToLevel(1, 1); + Close(); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(options.statistics->Reset()); + + db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250], + values.data(), s.data(), false); + + ASSERT_EQ(values.size(), kMultiGetBatchSize); + HistogramData hist_data_blocks; + HistogramData hist_index_and_filter_blocks; + HistogramData hist_sst; + + options.statistics->histogramData(NUM_DATA_BLOCKS_READ_PER_LEVEL, + &hist_data_blocks); + options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + &hist_index_and_filter_blocks); + options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst); + + // Maximum number of blocks read from a file system in a level. + ASSERT_EQ(hist_data_blocks.max, 32); + ASSERT_GT(hist_index_and_filter_blocks.max, 0); + // Maximum number of sst files read from file system in a level. + ASSERT_EQ(hist_sst.max, 2); + + // Minimun number of blocks read in a level. + ASSERT_EQ(hist_data_blocks.min, 4); + ASSERT_GT(hist_index_and_filter_blocks.min, 0); + // Minimun number of sst files read in a level. 
+ ASSERT_EQ(hist_sst.min, 1); +} + // Test class for batched MultiGet with prefix extractor // Param bool - If true, use partitioned filters // If false, use full filter block @@ -1565,11 +2085,11 @@ ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k5", "v5")); const Snapshot* snap1 = dbfull()->GetSnapshot(); ASSERT_OK(Delete(1, "k4")); - Flush(1); + ASSERT_OK(Flush(1)); const Snapshot* snap2 = dbfull()->GetSnapshot(); get_perf_context()->Reset(); @@ -1674,13 +2194,13 @@ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); // Check non-default column family - for (size_t i = 0; i != kNumInserts - 1; ++i) { + for (size_t i = 0; i + 1 != kNumInserts; ++i) { ASSERT_OK(Put(1, std::to_string(i), "value")); } - for (size_t i = 0; i != kNumUpdates - 1; ++i) { + for (size_t i = 0; i + 1 != kNumUpdates; ++i) { ASSERT_OK(Put(1, std::to_string(i), "value1")); } - for (size_t i = 0; i != kNumDeletes - 1; ++i) { + for (size_t i = 0; i + 1 != kNumDeletes; ++i) { ASSERT_OK(Delete(1, std::to_string(i))); } ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions( @@ -1696,19 +2216,19 @@ BlockBasedTableOptions table_options; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.block_size = 16 * 1024; - assert(table_options.block_size > - BlockBasedTable::kMultiGetReadStackBufSize); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + ASSERT_TRUE(table_options.block_size > + BlockBasedTable::kMultiGetReadStackBufSize); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string zero_str(128, '\0'); for (int i = 0; i < 100; ++i) { // Make the value compressible. 
A purely random string doesn't compress // and the resultant data block will not be compressed - std::string value(RandomString(&rnd, 128) + zero_str); + std::string value(rnd.RandomString(128) + zero_str); assert(Put(Key(i), value) == Status::OK()); } - Flush(); + ASSERT_OK(Flush()); std::vector key_data(10); std::vector keys; @@ -1729,15 +2249,451 @@ keys.data(), values.data(), statuses.data(), true); } -class DBBasicTestWithParallelIO - : public DBTestBase, - public testing::WithParamInterface> { +TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t cf = 0; cf != num_cfs; ++cf) { + for (size_t i = 0; i != 10000; ++i) { + std::string key_str = Key(static_cast(i)); + std::string value_str = std::to_string(cf) + "_" + std::to_string(i); + + ASSERT_OK(Put(static_cast(cf), key_str, value_str)); + if (0 == (i % 1000)) { + ASSERT_OK(Flush(static_cast(cf))); + } + } + } + for (size_t cf = 0; cf != num_cfs; ++cf) { + ASSERT_OK(Flush(static_cast(cf))); + } + Close(); + options.best_efforts_recovery = true; + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + for (size_t cf = 0; cf != num_cfs; ++cf) { + for (int i = 0; i != 10000; ++i) { + std::string key_str = Key(static_cast(i)); + std::string expected_value_str = + std::to_string(cf) + "_" + std::to_string(i); + ASSERT_EQ(expected_value_str, Get(static_cast(cf), key_str)); + } + } +} + +TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + 
SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + options.best_efforts_recovery = true; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +#ifndef ROCKSDB_LITE +namespace { +class TableFileListener : public EventListener { public: - DBBasicTestWithParallelIO() : DBTestBase("/db_basic_test_with_parallel_io") { - bool compressed_cache = std::get<0>(GetParam()); - bool uncompressed_cache = std::get<1>(GetParam()); - compression_enabled_ = std::get<2>(GetParam()); - fill_cache_ = std::get<3>(GetParam()); + void OnTableFileCreated(const TableFileCreationInfo& info) override { + InstrumentedMutexLock lock(&mutex_); + cf_to_paths_[info.cf_name].push_back(info.file_path); + } + std::vector& GetFiles(const std::string& cf_name) { + InstrumentedMutexLock lock(&mutex_); + return cf_to_paths_[cf_name]; + } + + private: + InstrumentedMutex mutex_; + std::unordered_map> cf_to_paths_; +}; +} // namespace + +TEST_F(DBBasicTest, LastSstFileNotInManifest) { + // If the last sst file is not tracked in MANIFEST, + // or the VersionEdit for the last sst file is not synced, + // on recovery, the last sst file should be deleted, + // and new sst files shouldn't reuse its file number. + Options options = CurrentOptions(); + DestroyAndReopen(options); + Close(); + + // Manually add a sst file. 
+ constexpr uint64_t kSstFileNumber = 100; + const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); + ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + /* fname = */ kSstFile, + /* should_sync = */ true)); + ASSERT_OK(env_->FileExists(kSstFile)); + + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + Reopen(options); + // kSstFile should already be deleted. + ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound()); + + ASSERT_OK(Put("k", "v")); + ASSERT_OK(Flush()); + // New sst file should have file number > kSstFileNumber. + std::vector& files = + listener->GetFiles(kDefaultColumnFamilyName); + ASSERT_EQ(files.size(), 1); + const std::string fname = files[0].erase(0, (dbname_ + "/").size()); + uint64_t number = 0; + FileType type = kTableFile; + ASSERT_TRUE(ParseFileName(fname, &number, &type)); + ASSERT_EQ(type, kTableFile); + ASSERT_GT(number, kSstFileNumber); +} + +TEST_F(DBBasicTest, RecoverWithMissingFiles) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + TableFileListener* listener = new TableFileListener(); + // Disable auto compaction to simplify SST file name tracking. 
+ options.disable_auto_compactions = true; + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + std::vector all_cf_names = {kDefaultColumnFamilyName, "pikachu", + "eevee"}; + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + for (size_t cf = 0; cf != num_cfs; ++cf) { + ASSERT_OK(Put(static_cast(cf), "a", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + ASSERT_OK(Put(static_cast(cf), "b", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + ASSERT_OK(Put(static_cast(cf), "c", "0_value")); + ASSERT_OK(Flush(static_cast(cf))); + } + + // Delete and corrupt files + for (size_t i = 0; i < all_cf_names.size(); ++i) { + std::vector& files = listener->GetFiles(all_cf_names[i]); + ASSERT_EQ(3, files.size()); + std::string corrupted_data; + ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data)); + ASSERT_OK(WriteStringToFile( + env_, corrupted_data.substr(0, corrupted_data.size() - 2), + files[files.size() - 1], /*should_sync=*/true)); + for (int j = static_cast(files.size() - 2); j >= static_cast(i); + --j) { + ASSERT_OK(env_->DeleteFile(files[j])); + } + } + options.best_efforts_recovery = true; + ReopenWithColumnFamilies(all_cf_names, options); + // Verify data + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[1])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[2])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("b", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } +} + 
+TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value0")); + ASSERT_OK(Flush()); + Close(); + { + // Hack by adding a new MANIFEST with high file number + std::string garbage(10, '\0'); + ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000", + /*should_sync=*/true)); + } + { + // Hack by adding a corrupted SST not referenced by any MANIFEST + std::string garbage(10, '\0'); + ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst", + /*should_sync=*/true)); + } + + options.best_efforts_recovery = true; + + Reopen(options); + ASSERT_OK(Put("bar", "value")); +} + +TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + options.best_efforts_recovery = true; + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put(1, "bar", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(Flush(1)); + Close(); + ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_))); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + std::vector cf_names; + ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names)); + ASSERT_EQ(2, cf_names.size()); + for (const auto& name : cf_names) { + ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu"); + } +} + +TEST_F(DBBasicTest, RecoverWithNoManifest) { + Options options = CurrentOptions(); + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + Close(); + { + // Delete all MANIFEST. 
+ std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kWalFile; + if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { + ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file)); + } + } + } + options.best_efforts_recovery = true; + options.create_if_missing = false; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsInvalidArgument()); + options.create_if_missing = true; + Reopen(options); + // Since no MANIFEST exists, best-efforts recovery creates a new, empty db. + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + CreateAndReopenWithCF({"pikachu"}, options); + std::vector kAllCfNames = {kDefaultColumnFamilyName, "pikachu"}; + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + for (int cf = 0; cf < static_cast(kAllCfNames.size()); ++cf) { + ASSERT_OK(Put(cf, "a", "0_value")); + ASSERT_OK(Flush(cf)); + ASSERT_OK(Put(cf, "b", "0_value")); + } + // Delete files + for (size_t i = 0; i < kAllCfNames.size(); ++i) { + std::vector& files = listener->GetFiles(kAllCfNames[i]); + ASSERT_EQ(1, files.size()); + for (int j = static_cast(files.size() - 1); j >= static_cast(i); + --j) { + ASSERT_OK(env_->DeleteFile(files[j])); + } + } + options.best_efforts_recovery = true; + ReopenWithColumnFamilies(kAllCfNames, options); + // Verify WAL is not applied + ReadOptions read_opts; + read_opts.total_order_seek = true; + std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + iter.reset(db_->NewIterator(read_opts, handles_[1])); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + 
ASSERT_OK(iter->status()); +} + +TEST_F(DBBasicTest, DisableTrackWal) { + // If WAL tracking was enabled, and then disabled during reopen, + // the previously tracked WALs should be removed from MANIFEST. + + Options options = CurrentOptions(); + options.track_and_verify_wals_in_manifest = true; + // extremely small write buffer size, + // so that new WALs are created more frequently. + options.write_buffer_size = 100; + options.env = env_; + DestroyAndReopen(options); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i))); + } + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->SyncWAL()); + // Some WALs are tracked. + ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Disable WAL tracking. + options.track_and_verify_wals_in_manifest = false; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + // Previously tracked WALs are cleared. + ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Re-enable WAL tracking again. 
+ options.track_and_verify_wals_in_manifest = true; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBasicTest, ManifestChecksumMismatch) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) { + auto* crc = reinterpret_cast(arg); + *crc = *crc + 1; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions write_opts; + write_opts.disableWAL = true; + Status s = db_->Put(write_opts, "foo", "value"); + ASSERT_OK(s); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(Put("foo", "value1")); + ASSERT_OK(Flush()); + s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBBasicTest, ConcurrentlyCloseDB) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + std::vector workers; + for (int i = 0; i < 10; i++) { + workers.push_back(std::thread([&]() { + auto s = db_->Close(); + ASSERT_OK(s); + })); + } + for (auto& w : workers) { + w.join(); + } +} + +#ifndef ROCKSDB_LITE +class DBBasicTestTrackWal : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestTrackWal() + : DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {} + + int CountWalFiles() { + VectorLogPtr log_files; + EXPECT_OK(dbfull()->GetSortedWalFiles(log_files)); + return static_cast(log_files.size()); + }; +}; + +TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) { + // If a WAL becomes obsolete after flushing, but is not deleted from disk yet, + // then if SyncWAL is called afterwards, the obsolete WAL should not be + // tracked in 
MANIFEST. + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.track_and_verify_wals_in_manifest = true; + options.atomic_flush = GetParam(); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf"}, options); + ASSERT_EQ(handles_.size(), 2); // default, cf + // Do not delete WALs. + ASSERT_OK(db_->DisableFileDeletions()); + constexpr int n = 10; + std::vector> wals(n); + for (size_t i = 0; i < n; i++) { + // Generate a new WAL for each key-value. + const int cf = i % 2; + ASSERT_OK(db_->GetCurrentWalFile(&wals[i])); + ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i))); + ASSERT_OK(Flush({0, 1})); + } + ASSERT_EQ(CountWalFiles(), n); + // Since all WALs are obsolete, no WAL should be tracked in MANIFEST. + ASSERT_OK(db_->SyncWAL()); + + // Manually delete all WALs. + Close(); + for (const auto& wal : wals) { + ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber()))); + } + + // If SyncWAL tracks the obsolete WALs in MANIFEST, + // reopen will fail because the WALs are missing from disk. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options)); + Destroy(options); +} + +INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal, + testing::Bool()); +#endif // ROCKSDB_LITE + +class DBBasicTestMultiGet : public DBTestBase { + public: + DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache, + bool uncompressed_cache, bool _compression_enabled, + bool _fill_cache, uint32_t compression_parallel_threads) + : DBTestBase(test_dir, /*env_do_fsync=*/false) { + compression_enabled_ = _compression_enabled; + fill_cache_ = _fill_cache; if (compressed_cache) { std::shared_ptr cache = NewLRUCache(1048576); @@ -1760,10 +2716,17 @@ compression_types = GetSupportedCompressions(); // Not every platform may have compression libraries available, so // dynamically pick based on what's available - if (compression_types.size() == 0) { - compression_enabled_ = false; + CompressionType tmp_type = kNoCompression; + for (auto c_type : compression_types) { + if (c_type != kNoCompression) { + tmp_type = c_type; + break; + } + } + if (tmp_type != kNoCompression) { + options.compression = tmp_type; } else { - options.compression = compression_types[0]; + compression_enabled_ = false; } } #else @@ -1771,7 +2734,7 @@ if (!Snappy_Supported()) { compression_enabled_ = false; } -#endif //ROCKSDB_LITE +#endif // ROCKSDB_LITE table_options.block_cache = uncompressed_cache_; if (table_options.block_cache == nullptr) { @@ -1782,28 +2745,57 @@ table_options.block_cache_compressed = compressed_cache_; table_options.flush_block_policy_factory.reset( new MyFlushBlockPolicyFactory()); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); if (!compression_enabled_) { options.compression = kNoCompression; + } else { + options.compression_opts.parallel_threads = compression_parallel_threads; } + options_ = options; Reopen(options); + if (num_cfs > 1) { + for (int 
cf = 0; cf < num_cfs; ++cf) { + cf_names_.emplace_back("cf" + std::to_string(cf)); + } + CreateColumnFamilies(cf_names_, options); + cf_names_.emplace_back("default"); + } + std::string zero_str(128, '\0'); - for (int i = 0; i < 100; ++i) { - // Make the value compressible. A purely random string doesn't compress - // and the resultant data block will not be compressed - values_.emplace_back(RandomString(&rnd, 128) + zero_str); - assert(Put(Key(i), values_[i]) == Status::OK()); - } - Flush(); - - for (int i = 0; i < 100; ++i) { - // block cannot gain space by compression - uncompressable_values_.emplace_back(RandomString(&rnd, 256) + '\0'); - std::string tmp_key = "a" + Key(i); - assert(Put(tmp_key, uncompressable_values_[i]) == Status::OK()); + for (int cf = 0; cf < num_cfs; ++cf) { + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + values_.emplace_back(rnd.RandomString(128) + zero_str); + assert(((num_cfs == 1) ? Put(Key(i), values_[i]) + : Put(cf, Key(i), values_[i])) == Status::OK()); + } + if (num_cfs == 1) { + EXPECT_OK(Flush()); + } else { + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); + } + + for (int i = 0; i < 100; ++i) { + // block cannot gain space by compression + uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0'); + std::string tmp_key = "a" + Key(i); + assert(((num_cfs == 1) ? 
Put(tmp_key, uncompressable_values_[i]) + : Put(cf, tmp_key, uncompressable_values_[i])) == + Status::OK()); + } + if (num_cfs == 1) { + EXPECT_OK(Flush()); + } else { + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); + } + } + // Clear compressed cache, which is always pre-populated + if (compressed_cache_) { + compressed_cache_->SetCapacity(0); + compressed_cache_->SetCapacity(1048576); } - Flush(); } bool CheckValue(int i, const std::string& value) { @@ -1820,6 +2812,8 @@ return false; } + const std::vector& GetCFNames() const { return cf_names_; } + int num_lookups() { return uncompressed_cache_->num_lookups(); } int num_found() { return uncompressed_cache_->num_found(); } int num_inserts() { return uncompressed_cache_->num_inserts(); } @@ -1832,11 +2826,12 @@ bool compression_enabled() { return compression_enabled_; } bool has_compressed_cache() { return compressed_cache_ != nullptr; } bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } + Options get_options() { return options_; } static void SetUpTestCase() {} static void TearDownTestCase() {} - private: + protected: class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { public: MyFlushBlockPolicyFactory() {} @@ -1877,23 +2872,27 @@ const BlockBuilder& data_block_builder_; }; - class MyBlockCache : public Cache { + class MyBlockCache : public CacheWrapper { public: - explicit MyBlockCache(std::shared_ptr& target) - : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} - - virtual const char* Name() const override { return "MyBlockCache"; } - - virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { + explicit MyBlockCache(std::shared_ptr target) + : CacheWrapper(target), + num_lookups_(0), + num_found_(0), + num_inserts_(0) {} + + const char* Name() const override { return "MyBlockCache"; } + + using Cache::Insert; 
+ Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { num_inserts_++; return target_->Insert(key, value, charge, deleter, handle, priority); } - virtual Handle* Lookup(const Slice& key, - Statistics* stats = nullptr) override { + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { num_lookups_++; Handle* handle = target_->Lookup(key, stats); if (handle != nullptr) { @@ -1901,57 +2900,6 @@ } return handle; } - - virtual bool Ref(Handle* handle) override { return target_->Ref(handle); } - - virtual bool Release(Handle* handle, bool force_erase = false) override { - return target_->Release(handle, force_erase); - } - - virtual void* Value(Handle* handle) override { - return target_->Value(handle); - } - - virtual void Erase(const Slice& key) override { target_->Erase(key); } - virtual uint64_t NewId() override { return target_->NewId(); } - - virtual void SetCapacity(size_t capacity) override { - target_->SetCapacity(capacity); - } - - virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { - target_->SetStrictCapacityLimit(strict_capacity_limit); - } - - virtual bool HasStrictCapacityLimit() const override { - return target_->HasStrictCapacityLimit(); - } - - virtual size_t GetCapacity() const override { - return target_->GetCapacity(); - } - - virtual size_t GetUsage() const override { return target_->GetUsage(); } - - virtual size_t GetUsage(Handle* handle) const override { - return target_->GetUsage(handle); - } - - virtual size_t GetPinnedUsage() const override { - return target_->GetPinnedUsage(); - } - - virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; } - - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override { - return target_->ApplyToAllCacheEntries(callback, thread_safe); - } - - virtual void 
EraseUnRefEntries() override { - return target_->EraseUnRefEntries(); - } - int num_lookups() { return num_lookups_; } int num_found() { return num_found_; } @@ -1959,7 +2907,6 @@ int num_inserts() { return num_inserts_; } private: - std::shared_ptr target_; int num_lookups_; int num_found_; int num_inserts_; @@ -1967,10 +2914,24 @@ std::shared_ptr compressed_cache_; std::shared_ptr uncompressed_cache_; + Options options_; bool compression_enabled_; std::vector values_; std::vector uncompressable_values_; bool fill_cache_; + std::vector cf_names_; +}; + +class DBBasicTestWithParallelIO + : public DBBasicTestMultiGet, + public testing::WithParamInterface< + std::tuple> { + public: + DBBasicTestWithParallelIO() + : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1, + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()), std::get<3>(GetParam()), + std::get<4>(GetParam())) {} }; TEST_P(DBBasicTestWithParallelIO, MultiGet) { @@ -2096,6 +3057,125 @@ } } +#ifndef ROCKSDB_LITE +TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { + class FakeDirectIOEnv : public EnvWrapper { + class FakeDirectIOSequentialFile; + class FakeDirectIORandomAccessFile; + + public: + FakeDirectIOEnv(Env* env) : EnvWrapper(env) {} + static const char* kClassName() { return "FakeDirectIOEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { + std::unique_ptr file; + assert(options.use_direct_reads); + EnvOptions opts = options; + opts.use_direct_reads = false; + Status s = target()->NewRandomAccessFile(fname, &file, opts); + if (!s.ok()) { + return s; + } + result->reset(new FakeDirectIORandomAccessFile(std::move(file))); + return s; + } + + private: + class FakeDirectIOSequentialFile : public SequentialFileWrapper { + public: + FakeDirectIOSequentialFile(std::unique_ptr&& file) + : SequentialFileWrapper(file.get()), 
file_(std::move(file)) {} + ~FakeDirectIOSequentialFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + + class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper { + public: + FakeDirectIORandomAccessFile(std::unique_ptr&& file) + : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {} + ~FakeDirectIORandomAccessFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + }; + + std::unique_ptr env(new FakeDirectIOEnv(env_)); + Options opts = get_options(); + opts.env = env.get(); + opts.use_direct_reads = true; + Reopen(opts); + + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + if (uncompressed_cache_) { + uncompressed_cache_->SetCapacity(0); + uncompressed_cache_->SetCapacity(1048576); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, 
values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + bool read_from_cache = false; + if (fill_cache()) { + if (has_uncompressed_cache()) { + read_from_cache = true; + } else if (has_compressed_cache() && compression_enabled()) { + read_from_cache = true; + } + } + + int expected_reads = random_reads; + if (!compression_enabled() || !has_compressed_cache()) { + expected_reads += 2; + } else { + expected_reads += (read_from_cache ? 0 : 2); + } + if (env_->random_read_counter_.Read() != expected_reads) { + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + } + Close(); +} +#endif // ROCKSDB_LITE + TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { std::vector key_data(10); std::vector keys; @@ -2108,13 +3188,13 @@ ro.fill_cache = fill_cache(); SyncPoint::GetInstance()->SetCallBack( - "RetrieveMultipleBlocks:VerifyChecksum", [&](void *status) { - Status* s = static_cast(status); - read_count++; - if (read_count == 2) { - *s = Status::Corruption(); - } - }); + "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) { + Status* s = static_cast(status); + read_count++; + if (read_count == 2) { + *s = Status::Corruption(); + } + }); SyncPoint::GetInstance()->EnableProcessing(); // Warm up the cache first @@ -2127,7 +3207,7 @@ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(0, values[0].ToString())); - //ASSERT_TRUE(CheckValue(50, values[1].ToString())); + // ASSERT_TRUE(CheckValue(50, values[1].ToString())); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::Corruption()); @@ -2145,10 +3225,10 @@ ro.fill_cache = fill_cache(); SyncPoint::GetInstance()->SetCallBack( - "TableCache::MultiGet:FindTable", [&](void *status) { - Status* s = static_cast(status); - *s = Status::IOError(); - }); + "TableCache::MultiGet:FindTable", [&](void* status) { + Status* s = static_cast(status); + *s = Status::IOError(); + 
}); // DB open will create table readers unless we reduce the table cache // capacity. // SanitizeOptions will set max_open_files to minimum of 20. Table cache @@ -2157,10 +3237,10 @@ // prevent file open during DB open and force the file to be opened // during MultiGet SyncPoint::GetInstance()->SetCallBack( - "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void *arg) { - int* max_open_files = (int*)arg; - *max_open_files = 11; - }); + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); SyncPoint::GetInstance()->EnableProcessing(); Reopen(CurrentOptions()); @@ -2180,362 +3260,645 @@ SyncPoint::GetInstance()->DisableProcessing(); } -INSTANTIATE_TEST_CASE_P( - ParallelIO, DBBasicTestWithParallelIO, - // Params are as follows - - // Param 0 - Compressed cache enabled - // Param 1 - Uncompressed cache enabled - // Param 2 - Data compression enabled - // Param 3 - ReadOptions::fill_cache - ::testing::Combine(::testing::Bool(), ::testing::Bool(), - ::testing::Bool(), ::testing::Bool())); - -class DBBasicTestWithTimestampBase : public DBTestBase { - public: - explicit DBBasicTestWithTimestampBase(const std::string& dbname) - : DBTestBase(dbname) {} +INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO, + // Params are as follows - + // Param 0 - Compressed cache enabled + // Param 1 - Uncompressed cache enabled + // Param 2 - Data compression enabled + // Param 3 - ReadOptions::fill_cache + // Param 4 - CompressionOptions::parallel_threads + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool(), + ::testing::Values(1, 4))); - protected: - class TestComparatorBase : public Comparator { - public: - explicit TestComparatorBase(size_t ts_sz) : Comparator(ts_sz) {} +// Forward declaration +class DeadlineFS; - const char* Name() const override { return "TestComparator"; } +class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: 
+ DeadlineRandomAccessFile(DeadlineFS& fs, + std::unique_ptr& file) + : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {} + + IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - void FindShortSuccessor(std::string*) const override {} + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; - void FindShortestSeparator(std::string*, const Slice&) const override {} + private: + DeadlineFS& fs_; + std::unique_ptr file_; +}; - int Compare(const Slice& a, const Slice& b) const override { - int r = CompareWithoutTimestamp(a, b); - if (r != 0 || 0 == timestamp_size()) { - return r; +class DeadlineFS : public FileSystemWrapper { + public: + // The error_on_delay parameter specifies whether a IOStatus::TimedOut() + // status should be returned after delaying the IO to exceed the timeout, + // or to simply delay but return success anyway. The latter mimics the + // behavior of PosixFileSystem, which does not enforce any timeout + explicit DeadlineFS(SpecialEnv* env, bool error_on_delay) + : FileSystemWrapper(env->GetFileSystem()), + deadline_(std::chrono::microseconds::zero()), + io_timeout_(std::chrono::microseconds::zero()), + env_(env), + timedout_(false), + ignore_deadline_(false), + error_on_delay_(error_on_delay) {} + + static const char* kClassName() { return "DeadlineFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + EXPECT_OK(s); + result->reset(new DeadlineRandomAccessFile(*this, file)); + + const std::chrono::microseconds deadline = GetDeadline(); + const std::chrono::microseconds io_timeout = GetIOTimeout(); + if (deadline.count() || 
io_timeout.count()) { + AssertDeadline(deadline, io_timeout, opts.io_options); + } + return ShouldDelay(opts.io_options); + } + + // Set a vector of {IO counter, delay in microseconds, return status} tuples + // that control when to inject a delay and duration of the delay + void SetDelayTrigger(const std::chrono::microseconds deadline, + const std::chrono::microseconds io_timeout, + const int trigger) { + delay_trigger_ = trigger; + io_count_ = 0; + deadline_ = deadline; + io_timeout_ = io_timeout; + timedout_ = false; + } + + // Increment the IO counter and return a delay in microseconds + IOStatus ShouldDelay(const IOOptions& opts) { + if (timedout_) { + return IOStatus::TimedOut(); + } else if (!deadline_.count() && !io_timeout_.count()) { + return IOStatus::OK(); + } + if (!ignore_deadline_ && delay_trigger_ == io_count_++) { + env_->SleepForMicroseconds(static_cast(opts.timeout.count() + 1)); + timedout_ = true; + if (error_on_delay_) { + return IOStatus::TimedOut(); } - return CompareTimestamp( - Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), - Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); } + return IOStatus::OK(); + } - virtual int CompareImpl(const Slice& a, const Slice& b) const = 0; + const std::chrono::microseconds GetDeadline() { + return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_; + } - int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { - assert(a.size() >= timestamp_size()); - assert(b.size() >= timestamp_size()); - Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); - Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + const std::chrono::microseconds GetIOTimeout() { + return ignore_deadline_ ? 
std::chrono::microseconds::zero() : io_timeout_; + } - return CompareImpl(k1, k2); - } + bool TimedOut() { return timedout_; } - int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { - if (!ts1.data() && !ts2.data()) { - return 0; - } else if (ts1.data() && !ts2.data()) { - return 1; - } else if (!ts1.data() && ts2.data()) { - return -1; - } - assert(ts1.size() == ts2.size()); - uint64_t low1 = 0; - uint64_t low2 = 0; - uint64_t high1 = 0; - uint64_t high2 = 0; - auto* ptr1 = const_cast(&ts1); - auto* ptr2 = const_cast(&ts2); - if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || - !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { - assert(false); - } - if (high1 < high2) { - return 1; - } else if (high1 > high2) { - return -1; - } - if (low1 < low2) { - return 1; - } else if (low1 > low2) { - return -1; + void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; } + + void AssertDeadline(const std::chrono::microseconds deadline, + const std::chrono::microseconds io_timeout, + const IOOptions& opts) const { + // Give a leeway of +- 10us as it can take some time for the Get/ + // MultiGet call to reach here, in order to avoid false alarms + std::chrono::microseconds now = + std::chrono::microseconds(env_->NowMicros()); + std::chrono::microseconds timeout; + if (deadline.count()) { + timeout = deadline - now; + if (io_timeout.count()) { + timeout = std::min(timeout, io_timeout); } - return 0; + } else { + timeout = io_timeout; + } + if (opts.timeout != timeout) { + ASSERT_EQ(timeout, opts.timeout); } - }; - - Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { - assert(nullptr != ts); - ts->clear(); - PutFixed64(ts, low); - PutFixed64(ts, high); - assert(ts->size() == sizeof(low) + sizeof(high)); - return Slice(*ts); } + + private: + // The number of IOs to trigger the delay after + int delay_trigger_; + // Current IO count + int io_count_; + // ReadOptions deadline for the Get/MultiGet/Iterator + 
std::chrono::microseconds deadline_; + // ReadOptions io_timeout for the Get/MultiGet/Iterator + std::chrono::microseconds io_timeout_; + SpecialEnv* env_; + // Flag to indicate whether we injected a delay + bool timedout_; + // Temporarily ignore deadlines/timeouts + bool ignore_deadline_; + // Return IOStatus::TimedOut() or IOStatus::OK() + bool error_on_delay_; }; -class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { +IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len, + const IOOptions& opts, Slice* result, + char* scratch, + IODebugContext* dbg) const { + const std::chrono::microseconds deadline = fs_.GetDeadline(); + const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); + IOStatus s; + if (deadline.count() || io_timeout.count()) { + fs_.AssertDeadline(deadline, io_timeout, opts); + } + if (s.ok()) { + s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch, + dbg); + } + if (s.ok()) { + s = fs_.ShouldDelay(opts); + } + return s; +} + +IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs, + size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + const std::chrono::microseconds deadline = fs_.GetDeadline(); + const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); + IOStatus s; + if (deadline.count() || io_timeout.count()) { + fs_.AssertDeadline(deadline, io_timeout, options); + } + if (s.ok()) { + s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg); + } + if (s.ok()) { + s = fs_.ShouldDelay(options); + } + return s; +} + +// A test class for intercepting random reads and injecting artificial +// delays. 
Used for testing the MultiGet deadline feature +class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet { public: - DBBasicTestWithTimestamp() - : DBBasicTestWithTimestampBase("/db_basic_test_with_timestamp") {} - - protected: - class TestComparator : public TestComparatorBase { - public: - const int kKeyPrefixLength = - 3; // 3: length of "key" in generated keys ("key" + std::to_string(j)) - explicit TestComparator(size_t ts_sz) : TestComparatorBase(ts_sz) {} - - int CompareImpl(const Slice& a, const Slice& b) const override { - int n1 = atoi( - std::string(a.data() + kKeyPrefixLength, a.size() - kKeyPrefixLength) - .c_str()); - int n2 = atoi( - std::string(b.data() + kKeyPrefixLength, b.size() - kKeyPrefixLength) - .c_str()); - return (n1 < n2) ? -1 : (n1 > n2) ? 1 : 0; + DBBasicTestMultiGetDeadline() + : DBBasicTestMultiGet( + "db_basic_test_multiget_deadline" /*Test dir*/, + 10 /*# of column families*/, false /*compressed cache enabled*/, + true /*uncompressed cache enabled*/, true /*compression enabled*/, + true /*ReadOptions.fill_cache*/, + 1 /*# of parallel compression threads*/) {} + + inline void CheckStatus(std::vector& statuses, size_t num_ok) { + for (size_t i = 0; i < statuses.size(); ++i) { + if (i < num_ok) { + EXPECT_OK(statuses[i]); + } else { + if (statuses[i] != Status::TimedOut()) { + EXPECT_EQ(statuses[i], Status::TimedOut()); + } + } } - }; + } }; -#ifndef ROCKSDB_LITE -// A class which remembers the name of each flushed file. 
-class FlushedFileCollector : public EventListener { - public: - FlushedFileCollector() {} - ~FlushedFileCollector() override {} +TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { + std::shared_ptr fs = std::make_shared(env_, false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + Options options = CurrentOptions(); - void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { - InstrumentedMutexLock lock(&mutex_); - flushed_files_.push_back(info.file_path); + std::shared_ptr cache = NewLRUCache(1048576); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = env.get(); + SetTimeElapseOnlySleepOnReopen(&options); + ReopenWithColumnFamilies(GetCFNames(), options); + + // Test the non-batched version of MultiGet with multiple column + // families + std::vector key_str; + size_t i; + for (i = 0; i < 5; ++i) { + key_str.emplace_back(Key(static_cast(i))); + } + std::vector cfs(key_str.size()); + ; + std::vector keys(key_str.size()); + std::vector values(key_str.size()); + for (i = 0; i < key_str.size(); ++i) { + cfs[i] = handles_[i]; + keys[i] = Slice(key_str[i].data(), key_str[i].size()); } - std::vector GetFlushedFiles() { - std::vector result; - { - InstrumentedMutexLock lock(&mutex_); - result = flushed_files_; - } - return result; + ReadOptions ro; + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + // Delay the first IO + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0); + + std::vector statuses = dbfull()->MultiGet(ro, cfs, keys, &values); + // The first key is successful because we check after the lookup, but + // subsequent keys fail due to deadline exceeded + CheckStatus(statuses, 1); + + // Clear the cache + cache->SetCapacity(0); + cache->SetCapacity(1048576); + // Test non-batched Multiget with multiple column families and + // introducing an IO delay in one of the middle CFs + 
key_str.clear(); + for (i = 0; i < 10; ++i) { + key_str.emplace_back(Key(static_cast(i))); + } + cfs.resize(key_str.size()); + keys.resize(key_str.size()); + values.resize(key_str.size()); + for (i = 0; i < key_str.size(); ++i) { + // 2 keys per CF + cfs[i] = handles_[i / 2]; + keys[i] = Slice(key_str[i].data(), key_str[i].size()); + } + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1); + statuses = dbfull()->MultiGet(ro, cfs, keys, &values); + CheckStatus(statuses, 3); + + // Test batched MultiGet with an IO delay in the first data block read. + // Both keys in the first CF should succeed as they're in the same data + // block and would form one batch, and we check for deadline between + // batches. + std::vector pin_values(keys.size()); + cache->SetCapacity(0); + cache->SetCapacity(1048576); + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0); + dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 2); + + // Similar to the previous one, but an IO delay in the third CF data block + // read + for (PinnableSlice& value : pin_values) { + value.Reset(); + } + cache->SetCapacity(0); + cache->SetCapacity(1048576); + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2); + dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 6); + + // Similar to the previous one, but an IO delay in the last but one CF + for (PinnableSlice& value : pin_values) { + value.Reset(); + } + cache->SetCapacity(0); + cache->SetCapacity(1048576); + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = 
std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3); + dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 8); + + // Test batched MultiGet with single CF and lots of keys. Inject delay + // into the second batch of keys. As each batch is 32, the first 64 keys, + // i.e first two batches, should succeed and the rest should time out + for (PinnableSlice& value : pin_values) { + value.Reset(); + } + cache->SetCapacity(0); + cache->SetCapacity(1048576); + key_str.clear(); + for (i = 0; i < 100; ++i) { + key_str.emplace_back(Key(static_cast(i))); + } + keys.resize(key_str.size()); + pin_values.clear(); + pin_values.resize(key_str.size()); + for (i = 0; i < key_str.size(); ++i) { + keys[i] = Slice(key_str[i].data(), key_str[i].size()); } + statuses.clear(); + statuses.resize(keys.size()); + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1); + dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(), + pin_values.data(), statuses.data()); + CheckStatus(statuses, 64); + Close(); +} - void ClearFlushedFiles() { - InstrumentedMutexLock lock(&mutex_); - flushed_files_.clear(); +TEST_F(DBBasicTest, ManifestWriteFailure) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto* s = reinterpret_cast(arg); + ASSERT_OK(*s); + // Manually overwrite return status + *s = Status::IOError(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("key", "value")); + 
ASSERT_NOK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); +} + +TEST_F(DBBasicTest, DestroyDefaultCfHandle) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + for (const auto* h : handles_) { + ASSERT_NE(db_->DefaultColumnFamily(), h); } - private: - std::vector flushed_files_; - InstrumentedMutex mutex_; -}; + // We have two handles to the default column family. The two handles point to + // different ColumnFamilyHandle objects. + assert(db_->DefaultColumnFamily()); + ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID()); + assert(handles_[0]); + ASSERT_EQ(0U, handles_[0]->GetID()); + + // You can destroy handles_[...]. + for (auto* h : handles_) { + ASSERT_OK(db_->DestroyColumnFamilyHandle(h)); + } + handles_.clear(); + + // But you should not destroy db_->DefaultColumnFamily(), since it's going to + // be deleted in `DBImpl::CloseHelper()`. Before that, it may be used + // elsewhere internally too. 
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument()); +} -TEST_F(DBBasicTestWithTimestamp, PutAndGetWithCompaction) { - const int kNumKeysPerFile = 8192; - const size_t kNumTimestamps = 2; - const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps; - const size_t kSplitPosBase = kNumKeysPerTimestamp / 2; - Options options = CurrentOptions(); +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, VerifyFileChecksums) { + Options options = GetDefaultOptions(); options.create_if_missing = true; options.env = env_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); - FlushedFileCollector* collector = new FlushedFileCollector(); - options.listeners.emplace_back(collector); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + Reopen(options); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); - std::string tmp; - size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); - TestComparator test_cmp(ts_sz); - options.comparator = &test_cmp; - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy( - 10 /*bits_per_key*/, false /*use_block_based_builder*/)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - size_t num_cfs = handles_.size(); - ASSERT_EQ(2, num_cfs); - std::vector write_ts_strs(kNumTimestamps); - std::vector read_ts_strs(kNumTimestamps); - std::vector write_ts_list; - std::vector read_ts_list; - - for (size_t i = 0; i != kNumTimestamps; ++i) { - write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); - read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); - const Slice& write_ts = 
write_ts_list.back(); - WriteOptions wopts; - wopts.timestamp = &write_ts; - for (int cf = 0; cf != static_cast(num_cfs); ++cf) { - for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { - ASSERT_OK(Put(cf, "key" + std::to_string(j), - "value_" + std::to_string(j) + "_" + std::to_string(i), - wopts)); - if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) { - // flush all keys with the same timestamp to two sst files, split at - // incremental positions such that lowerlevel[1].smallest.userkey == - // higherlevel[0].largest.userkey - ASSERT_OK(Flush(cf)); - - // compact files (2 at each level) to a lower level such that all keys - // with the same timestamp is at one level, with newer versions at - // higher levels. - CompactionOptions compact_opt; - compact_opt.compression = kNoCompression; - db_->CompactFiles(compact_opt, handles_[cf], - collector->GetFlushedFiles(), - static_cast(kNumTimestamps - i)); - collector->ClearFlushedFiles(); - } - } - } - } - const auto& verify_db_func = [&]() { - for (size_t i = 0; i != kNumTimestamps; ++i) { - ReadOptions ropts; - ropts.timestamp = &read_ts_list[i]; - for (int cf = 0; cf != static_cast(num_cfs); ++cf) { - ColumnFamilyHandle* cfh = handles_[cf]; - for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { - std::string value; - ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); - ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), - value); - } - } + // Write an L0 with checksum computed. + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + + // Does the right thing but with the wrong name -- using it should lead to an + // error. 
+ class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c { + public: + MisnamedFileChecksumGenerator(const FileChecksumGenContext& context) + : FileChecksumGenCrc32c(context) {} + + const char* Name() const override { return "sha1"; } + }; + + class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + return std::unique_ptr( + new MisnamedFileChecksumGenerator(context)); } }; - verify_db_func(); + + options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory()); + Reopen(options); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); } #endif // !ROCKSDB_LITE -class DBBasicTestWithTimestampWithParam - : public DBBasicTestWithTimestampBase, - public testing::WithParamInterface { - public: - DBBasicTestWithTimestampWithParam() - : DBBasicTestWithTimestampBase( - "/db_basic_test_with_timestamp_with_param") {} +// A test class for intercepting random reads and injecting artificial +// delays. 
Used for testing the deadline/timeout feature +class DBBasicTestDeadline + : public DBBasicTest, + public testing::WithParamInterface> {}; + +TEST_P(DBBasicTestDeadline, PointLookupDeadline) { + std::shared_ptr fs = std::make_shared(env_, true); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + bool set_deadline = std::get<0>(GetParam()); + bool set_timeout = std::get<1>(GetParam()); + + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) { + continue; + } + option_config_ = option_config; + Options options = CurrentOptions(); + if (options.use_direct_reads) { + continue; + } + options.env = env.get(); + options.disable_auto_compactions = true; + Cache* block_cache = nullptr; + // Fileter block reads currently don't cause the request to get + // aborted on a read timeout, so its possible those block reads + // may get issued even if the deadline is past + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Get:BeforeFilterMatch", + [&](void* /*arg*/) { fs->IgnoreDeadline(true); }); + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Get:AfterFilterMatch", + [&](void* /*arg*/) { fs->IgnoreDeadline(false); }); + // DB open will create table readers unless we reduce the table cache + // capacity. + // SanitizeOptions will set max_open_files to minimum of 20. Table cache + // is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 11 so table cache capacity will become 1. 
This will + // prevent file open during DB open and force the file to be opened + // during MultiGet + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + SyncPoint::GetInstance()->EnableProcessing(); - protected: - class TestComparator : public TestComparatorBase { - private: - const Comparator* cmp_without_ts_; + SetTimeElapseOnlySleepOnReopen(&options); + Reopen(options); - public: - explicit TestComparator(size_t ts_sz) - : TestComparatorBase(ts_sz), cmp_without_ts_(nullptr) { - cmp_without_ts_ = BytewiseComparator(); + if (options.table_factory) { + block_cache = options.table_factory->GetOptions( + TableFactory::kBlockCacheOpts()); } - int CompareImpl(const Slice& a, const Slice& b) const override { - return cmp_without_ts_->Compare(a, b); + Random rnd(301); + for (int i = 0; i < 400; ++i) { + std::string key = "k" + ToString(i); + ASSERT_OK(Put(key, rnd.RandomString(100))); } - }; -}; + ASSERT_OK(Flush()); -TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { - const int kNumKeysPerFile = 8192; - const size_t kNumTimestamps = 6; - bool memtable_only = GetParam(); - Options options = CurrentOptions(); - options.create_if_missing = true; - options.env = env_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); - std::string tmp; - size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); - TestComparator test_cmp(ts_sz); - options.comparator = &test_cmp; - BlockBasedTableOptions bbto; - bbto.filter_policy.reset(NewBloomFilterPolicy( - 10 /*bits_per_key*/, false /*use_block_based_builder*/)); - bbto.whole_key_filtering = true; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + bool timedout = true; + // A timeout will be forced when the IO counter reaches this value + int io_deadline_trigger = 0; + // Keep incrementing io_deadline_trigger and call Get() until there is an + // iteration that doesn't cause a 
timeout. This ensures that we cover + // all file reads in the point lookup path that can potentially timeout + // and cause the Get() to fail. + while (timedout) { + ReadOptions ro; + if (set_deadline) { + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + } + if (set_timeout) { + ro.io_timeout = std::chrono::microseconds{5000}; + } + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger); - std::vector compression_types; - compression_types.push_back(kNoCompression); - if (Zlib_Supported()) { - compression_types.push_back(kZlibCompression); - } -#if LZ4_VERSION_NUMBER >= 10400 // r124+ - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 - if (ZSTD_Supported()) { - compression_types.push_back(kZSTD); - } - - // Switch compression dictionary on/off to check key extraction - // correctness in kBuffered state - std::vector max_dict_bytes_list = {0, 1 << 14}; // 0 or 16KB - - for (auto compression_type : compression_types) { - for (uint32_t max_dict_bytes : max_dict_bytes_list) { - options.compression = compression_type; - options.compression_opts.max_dict_bytes = max_dict_bytes; - if (compression_type == kZSTD) { - options.compression_opts.zstd_max_train_bytes = max_dict_bytes; - } - options.target_file_size_base = 1 << 26; // 64MB - - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - size_t num_cfs = handles_.size(); - ASSERT_EQ(2, num_cfs); - std::vector write_ts_strs(kNumTimestamps); - std::vector read_ts_strs(kNumTimestamps); - std::vector write_ts_list; - std::vector read_ts_list; - - for (size_t i = 0; i != kNumTimestamps; ++i) { - write_ts_list.emplace_back( - EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); - read_ts_list.emplace_back( - EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); - const Slice& write_ts = write_ts_list.back(); - WriteOptions wopts; - wopts.timestamp = &write_ts; - for (int cf = 0; cf != 
static_cast(num_cfs); ++cf) { - for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { - ASSERT_OK(Put( - cf, "key" + std::to_string(j), - "value_" + std::to_string(j) + "_" + std::to_string(i), wopts)); - } - if (!memtable_only) { - ASSERT_OK(Flush(cf)); - } - } + block_cache->SetCapacity(0); + block_cache->SetCapacity(1048576); + + std::string value; + Status s = dbfull()->Get(ro, "k50", &value); + if (fs->TimedOut()) { + ASSERT_EQ(s, Status::TimedOut()); + } else { + timedout = false; + ASSERT_OK(s); } - const auto& verify_db_func = [&]() { - for (size_t i = 0; i != kNumTimestamps; ++i) { - ReadOptions ropts; - ropts.timestamp = &read_ts_list[i]; - for (int cf = 0; cf != static_cast(num_cfs); ++cf) { - ColumnFamilyHandle* cfh = handles_[cf]; - for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; - ++j) { - std::string value; - ASSERT_OK( - db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); - ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), - value); - } - } - } - }; - verify_db_func(); + io_deadline_trigger++; } + // Reset the delay sequence in order to avoid false alarms during Reopen + fs->SetDelayTrigger(std::chrono::microseconds::zero(), + std::chrono::microseconds::zero(), 0); } + Close(); } -INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, - ::testing::Bool()); +TEST_P(DBBasicTestDeadline, IteratorDeadline) { + std::shared_ptr fs = std::make_shared(env_, true); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + bool set_deadline = std::get<0>(GetParam()); + bool set_timeout = std::get<1>(GetParam()); + + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) { + continue; + } + Options options = CurrentOptions(); + if (options.use_direct_reads) { + continue; + } + options.env = env.get(); + options.disable_auto_compactions = true; + Cache* block_cache = nullptr; + // DB open will 
create table readers unless we reduce the table cache + // capacity. + // SanitizeOptions will set max_open_files to minimum of 20. Table cache + // is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 11 so table cache capacity will become 1. This will + // prevent file open during DB open and force the file to be opened + // during MultiGet + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + SyncPoint::GetInstance()->EnableProcessing(); -} // namespace ROCKSDB_NAMESPACE + SetTimeElapseOnlySleepOnReopen(&options); + Reopen(options); + + if (options.table_factory) { + block_cache = options.table_factory->GetOptions( + TableFactory::kBlockCacheOpts()); + } + + Random rnd(301); + for (int i = 0; i < 400; ++i) { + std::string key = "k" + ToString(i); + ASSERT_OK(Put(key, rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + bool timedout = true; + // A timeout will be forced when the IO counter reaches this value + int io_deadline_trigger = 0; + // Keep incrementing io_deadline_trigger and call Get() until there is an + // iteration that doesn't cause a timeout. 
This ensures that we cover + // all file reads in the point lookup path that can potentially timeout + while (timedout) { + ReadOptions ro; + if (set_deadline) { + ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; + } + if (set_timeout) { + ro.io_timeout = std::chrono::microseconds{5000}; + } + fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger); -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); + block_cache->SetCapacity(0); + block_cache->SetCapacity(1048576); + + Iterator* iter = dbfull()->NewIterator(ro); + int count = 0; + iter->Seek("k50"); + while (iter->Valid() && count++ < 100) { + iter->Next(); + } + if (fs->TimedOut()) { + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(iter->status(), Status::TimedOut()); + } else { + timedout = false; + ASSERT_OK(iter->status()); + } + delete iter; + io_deadline_trigger++; + } + // Reset the delay sequence in order to avoid false alarms during Reopen + fs->SetDelayTrigger(std::chrono::microseconds::zero(), + std::chrono::microseconds::zero(), 0); + } + Close(); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +// Param 0: If true, set read_options.deadline +// Param 1: If true, set read_options.io_timeout +INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline, + ::testing::Values(std::make_tuple(true, false), + std::make_tuple(false, true), + std::make_tuple(true, true))); +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_blob_index_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_blob_index_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_blob_index_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_blob_index_test.cc 1970-01-01 
00:00:00.000000000 +0000 @@ -1,436 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include -#include -#include -#include - -#include "db/arena_wrapped_db_iter.h" -#include "db/column_family.h" -#include "db/db_iter.h" -#include "db/db_test_util.h" -#include "db/dbformat.h" -#include "db/write_batch_internal.h" -#include "port/port.h" -#include "port/stack_trace.h" -#include "util/string_util.h" -#include "utilities/merge_operators.h" - -namespace ROCKSDB_NAMESPACE { - -// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb -// should accept the value type on write, and report not supported value -// for reads, unless caller request for it explicitly. The base rocksdb -// doesn't understand format of actual blob index (the value). 
-class DBBlobIndexTest : public DBTestBase { - public: - enum Tier { - kMemtable = 0, - kImmutableMemtables = 1, - kL0SstFile = 2, - kLnSstFile = 3, - }; - const std::vector kAllTiers = {Tier::kMemtable, - Tier::kImmutableMemtables, - Tier::kL0SstFile, Tier::kLnSstFile}; - - DBBlobIndexTest() : DBTestBase("/db_blob_index_test") {} - - ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } - - ColumnFamilyData* cfd() { - return reinterpret_cast(cfh())->cfd(); - } - - Status PutBlobIndex(WriteBatch* batch, const Slice& key, - const Slice& blob_index) { - return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key, - blob_index); - } - - Status Write(WriteBatch* batch) { - return dbfull()->Write(WriteOptions(), batch); - } - - std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr, - const Snapshot* snapshot = nullptr) { - ReadOptions read_options; - read_options.snapshot = snapshot; - PinnableSlice value; - DBImpl::GetImplOptions get_impl_options; - get_impl_options.column_family = cfh(); - get_impl_options.value = &value; - get_impl_options.is_blob_index = is_blob_index; - auto s = dbfull()->GetImpl(read_options, key, get_impl_options); - if (s.IsNotFound()) { - return "NOT_FOUND"; - } - if (s.IsNotSupported()) { - return "NOT_SUPPORTED"; - } - if (!s.ok()) { - return s.ToString(); - } - return value.ToString(); - } - - std::string GetBlobIndex(const Slice& key, - const Snapshot* snapshot = nullptr) { - bool is_blob_index = false; - std::string value = GetImpl(key, &is_blob_index, snapshot); - if (!is_blob_index) { - return "NOT_BLOB"; - } - return value; - } - - ArenaWrappedDBIter* GetBlobIterator() { - return dbfull()->NewIteratorImpl( - ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), - nullptr /*read_callback*/, true /*allow_blob*/); - } - - Options GetTestOptions() { - Options options; - options.create_if_missing = true; - options.num_levels = 2; - options.disable_auto_compactions = true; - // Disable auto flushes. 
- options.max_write_buffer_number = 10; - options.min_write_buffer_number_to_merge = 10; - options.merge_operator = MergeOperators::CreateStringAppendOperator(); - return options; - } - - void MoveDataTo(Tier tier) { - switch (tier) { - case Tier::kMemtable: - break; - case Tier::kImmutableMemtables: - ASSERT_OK(dbfull()->TEST_SwitchMemtable()); - break; - case Tier::kL0SstFile: - ASSERT_OK(Flush()); - break; - case Tier::kLnSstFile: - ASSERT_OK(Flush()); - ASSERT_OK(Put("a", "dummy")); - ASSERT_OK(Put("z", "dummy")); - ASSERT_OK(Flush()); - ASSERT_OK( - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); -#ifndef ROCKSDB_LITE - ASSERT_EQ("0,1", FilesPerLevel()); -#endif // !ROCKSDB_LITE - break; - } - } -}; - -// Should be able to write kTypeBlobIndex to memtables and SST files. -TEST_F(DBBlobIndexTest, Write) { - for (auto tier : kAllTiers) { - DestroyAndReopen(GetTestOptions()); - for (int i = 1; i <= 5; i++) { - std::string index = ToString(i); - WriteBatch batch; - ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index)); - ASSERT_OK(Write(&batch)); - } - MoveDataTo(tier); - for (int i = 1; i <= 5; i++) { - std::string index = ToString(i); - ASSERT_EQ("blob" + index, GetBlobIndex("key" + index)); - } - } -} - -// Get should be able to return blob index if is_blob_index is provided, -// otherwise return Status::NotSupported status. 
-TEST_F(DBBlobIndexTest, Get) { - for (auto tier : kAllTiers) { - DestroyAndReopen(GetTestOptions()); - WriteBatch batch; - ASSERT_OK(batch.Put("key", "value")); - ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index")); - ASSERT_OK(Write(&batch)); - MoveDataTo(tier); - // Verify normal value - bool is_blob_index = false; - PinnableSlice value; - ASSERT_EQ("value", Get("key")); - ASSERT_EQ("value", GetImpl("key")); - ASSERT_EQ("value", GetImpl("key", &is_blob_index)); - ASSERT_FALSE(is_blob_index); - // Verify blob index - ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); - ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); - ASSERT_TRUE(is_blob_index); - } -} - -// Get should NOT return Status::NotSupported if blob index is updated with -// a normal value. -TEST_F(DBBlobIndexTest, Updated) { - for (auto tier : kAllTiers) { - DestroyAndReopen(GetTestOptions()); - WriteBatch batch; - for (int i = 0; i < 10; i++) { - ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index")); - } - ASSERT_OK(Write(&batch)); - // Avoid blob values from being purged. 
- const Snapshot* snapshot = dbfull()->GetSnapshot(); - ASSERT_OK(Put("key1", "new_value")); - ASSERT_OK(Merge("key2", "a")); - ASSERT_OK(Merge("key2", "b")); - ASSERT_OK(Merge("key2", "c")); - ASSERT_OK(Delete("key3")); - ASSERT_OK(SingleDelete("key4")); - ASSERT_OK(Delete("key5")); - ASSERT_OK(Merge("key5", "a")); - ASSERT_OK(Merge("key5", "b")); - ASSERT_OK(Merge("key5", "c")); - ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9")); - MoveDataTo(tier); - for (int i = 0; i < 10; i++) { - ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); - } - ASSERT_EQ("new_value", Get("key1")); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); - ASSERT_EQ("NOT_FOUND", Get("key3")); - ASSERT_EQ("NOT_FOUND", Get("key4")); - ASSERT_EQ("a,b,c", GetImpl("key5")); - for (int i = 6; i < 9; i++) { - ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i))); - } - ASSERT_EQ("blob_index", GetBlobIndex("key9")); - dbfull()->ReleaseSnapshot(snapshot); - } -} - -// Iterator should get blob value if allow_blob flag is set, -// otherwise return Status::NotSupported status. 
-TEST_F(DBBlobIndexTest, Iterate) { - const std::vector> data = { - /*00*/ {kTypeValue}, - /*01*/ {kTypeBlobIndex}, - /*02*/ {kTypeValue}, - /*03*/ {kTypeBlobIndex, kTypeValue}, - /*04*/ {kTypeValue}, - /*05*/ {kTypeValue, kTypeBlobIndex}, - /*06*/ {kTypeValue}, - /*07*/ {kTypeDeletion, kTypeBlobIndex}, - /*08*/ {kTypeValue}, - /*09*/ {kTypeSingleDeletion, kTypeBlobIndex}, - /*10*/ {kTypeValue}, - /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex}, - /*12*/ {kTypeValue}, - /*13*/ - {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex}, - /*14*/ {kTypeValue}, - /*15*/ {kTypeBlobIndex}, - /*16*/ {kTypeValue}, - }; - - auto get_key = [](int index) { - char buf[20]; - snprintf(buf, sizeof(buf), "%02d", index); - return "key" + std::string(buf); - }; - - auto get_value = [&](int index, int version) { - return get_key(index) + "_value" + ToString(version); - }; - - auto check_iterator = [&](Iterator* iterator, Status::Code expected_status, - const Slice& expected_value) { - ASSERT_EQ(expected_status, iterator->status().code()); - if (expected_status == Status::kOk) { - ASSERT_TRUE(iterator->Valid()); - ASSERT_EQ(expected_value, iterator->value()); - } else { - ASSERT_FALSE(iterator->Valid()); - } - }; - - auto create_normal_iterator = [&]() -> Iterator* { - return dbfull()->NewIterator(ReadOptions()); - }; - - auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); }; - - auto check_is_blob = [&](bool is_blob) { - return [is_blob](Iterator* iterator) { - ASSERT_EQ(is_blob, - reinterpret_cast(iterator)->IsBlob()); - }; - }; - - auto verify = [&](int index, Status::Code expected_status, - const Slice& forward_value, const Slice& backward_value, - std::function create_iterator, - std::function extra_check = nullptr) { - // Seek - auto* iterator = create_iterator(); - ASSERT_OK(iterator->Refresh()); - iterator->Seek(get_key(index)); - check_iterator(iterator, expected_status, forward_value); - if (extra_check) { - 
extra_check(iterator); - } - delete iterator; - - // Next - iterator = create_iterator(); - ASSERT_OK(iterator->Refresh()); - iterator->Seek(get_key(index - 1)); - ASSERT_TRUE(iterator->Valid()); - iterator->Next(); - check_iterator(iterator, expected_status, forward_value); - if (extra_check) { - extra_check(iterator); - } - delete iterator; - - // SeekForPrev - iterator = create_iterator(); - ASSERT_OK(iterator->Refresh()); - iterator->SeekForPrev(get_key(index)); - check_iterator(iterator, expected_status, backward_value); - if (extra_check) { - extra_check(iterator); - } - delete iterator; - - // Prev - iterator = create_iterator(); - iterator->Seek(get_key(index + 1)); - ASSERT_TRUE(iterator->Valid()); - iterator->Prev(); - check_iterator(iterator, expected_status, backward_value); - if (extra_check) { - extra_check(iterator); - } - delete iterator; - }; - - for (auto tier : {Tier::kMemtable} /*kAllTiers*/) { - // Avoid values from being purged. - std::vector snapshots; - DestroyAndReopen(GetTestOptions()); - - // fill data - for (int i = 0; i < static_cast(data.size()); i++) { - for (int j = static_cast(data[i].size()) - 1; j >= 0; j--) { - std::string key = get_key(i); - std::string value = get_value(i, j); - WriteBatch batch; - switch (data[i][j]) { - case kTypeValue: - ASSERT_OK(Put(key, value)); - break; - case kTypeDeletion: - ASSERT_OK(Delete(key)); - break; - case kTypeSingleDeletion: - ASSERT_OK(SingleDelete(key)); - break; - case kTypeMerge: - ASSERT_OK(Merge(key, value)); - break; - case kTypeBlobIndex: - ASSERT_OK(PutBlobIndex(&batch, key, value)); - ASSERT_OK(Write(&batch)); - break; - default: - assert(false); - }; - } - snapshots.push_back(dbfull()->GetSnapshot()); - } - ASSERT_OK( - dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16))); - snapshots.push_back(dbfull()->GetSnapshot()); - MoveDataTo(tier); - - // Normal iterator - verify(1, Status::kNotSupported, "", "", create_normal_iterator); - verify(3, Status::kNotSupported, 
"", "", create_normal_iterator); - verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), - create_normal_iterator); - verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), - create_normal_iterator); - verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), - create_normal_iterator); - verify(11, Status::kNotSupported, "", "", create_normal_iterator); - verify(13, Status::kOk, - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - create_normal_iterator); - verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), - create_normal_iterator); - - // Iterator with blob support - verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), - create_blob_iterator, check_is_blob(true)); - verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), - create_blob_iterator, check_is_blob(true)); - verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), - create_blob_iterator, check_is_blob(false)); - verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), - create_blob_iterator, check_is_blob(false)); - verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), - create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); - verify(13, Status::kOk, - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - create_blob_iterator, check_is_blob(false)); - verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), - create_blob_iterator, check_is_blob(false)); - -#ifndef ROCKSDB_LITE - // Iterator with blob support and using seek. 
- ASSERT_OK(dbfull()->SetOptions( - cfh(), {{"max_sequential_skip_in_iterations", "0"}})); - verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), - create_blob_iterator, check_is_blob(true)); - verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), - create_blob_iterator, check_is_blob(true)); - verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), - create_blob_iterator, check_is_blob(false)); - verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), - create_blob_iterator, check_is_blob(false)); - verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), - create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); - verify(13, Status::kOk, - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), - create_blob_iterator, check_is_blob(false)); - verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), - create_blob_iterator, check_is_blob(false)); -#endif // !ROCKSDB_LITE - - for (auto* snapshot : snapshots) { - dbfull()->ReleaseSnapshot(snapshot); - } - } -} - -} // namespace ROCKSDB_NAMESPACE - -int main(int argc, char** argv) { - ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_block_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_block_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_block_cache_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_block_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,21 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include +#include +#include + +#include "cache/cache_entry_roles.h" #include "cache/lru_cache.h" +#include "db/column_family.h" #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" #include "util/compression.h" +#include "util/defer.h" +#include "util/random.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -32,7 +43,8 @@ const size_t kNumBlocks = 10; const size_t kValueSize = 100; - DBBlockCacheTest() : DBTestBase("/db_block_cache_test") {} + DBBlockCacheTest() + : DBTestBase("db_block_cache_test", /*env_do_fsync=*/true) {} BlockBasedTableOptions GetTableOptions() { BlockBasedTableOptions table_options; @@ -47,7 +59,7 @@ options.avoid_flush_during_recovery = false; // options.compression = kNoCompression; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); return options; } @@ -144,6 +156,19 @@ compressed_insert_count_ = new_insert_count; compressed_failure_count_ = new_failure_count; } + +#ifndef ROCKSDB_LITE + const std::array GetCacheEntryRoleCountsBg() { + // Verify in cache entry role stats + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats(); + InternalStats::CacheEntryRoleStats stats; + internal_stats_ptr->TEST_GetCacheEntryRoleStats(&stats, + /*foreground=*/false); + return stats.entry_counts; + } +#endif // ROCKSDB_LITE }; TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) { @@ -153,9 +178,15 @@ auto options = GetOptions(table_options); InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + // Needed not to count entry stats collector + 
co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); @@ -177,9 +208,15 @@ auto options = GetOptions(table_options); InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + // Needed not to count entry stats collector + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); @@ -187,7 +224,7 @@ Iterator* iter = nullptr; // Load blocks into cache. - for (size_t i = 0; i < kNumBlocks - 1; i++) { + for (size_t i = 0; i + 1 < kNumBlocks; i++) { iter = db_->NewIterator(read_options); iter->Seek(ToString(i)); ASSERT_OK(iter->status()); @@ -209,12 +246,12 @@ iter = nullptr; // Release iterators and access cache again. 
- for (size_t i = 0; i < kNumBlocks - 1; i++) { + for (size_t i = 0; i + 1 < kNumBlocks; i++) { iterators[i].reset(); CheckCacheCounters(options, 0, 0, 0, 0); } ASSERT_EQ(0, cache->GetPinnedUsage()); - for (size_t i = 0; i < kNumBlocks - 1; i++) { + for (size_t i = 0; i + 1 < kNumBlocks; i++) { iter = db_->NewIterator(read_options); iter->Seek(ToString(i)); ASSERT_OK(iter->status()); @@ -225,34 +262,54 @@ #ifdef SNAPPY TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { - ReadOptions read_options; - auto table_options = GetTableOptions(); - auto options = GetOptions(table_options); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.block_cache_compressed = nullptr; + table_options.block_size = 1; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = CompressionType::kSnappyCompression; - InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + } + + ReadOptions read_options; std::shared_ptr compressed_cache = NewLRUCache(1 << 25, 0, false); + LRUCacheOptions co; + co.capacity = 0; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + // Needed not to count entry stats collector + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; + table_options.no_block_cache = false; table_options.block_cache_compressed = compressed_cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + table_options.max_auto_readahead_size = 0; + 
table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); - std::vector> iterators(kNumBlocks - 1); - Iterator* iter = nullptr; - // Load blocks into cache. for (size_t i = 0; i < kNumBlocks - 1; i++) { - iter = db_->NewIterator(read_options); - iter->Seek(ToString(i)); - ASSERT_OK(iter->status()); + ASSERT_EQ(value, Get(ToString(i))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - iterators[i].reset(iter); } + size_t usage = cache->GetUsage(); - ASSERT_LT(0, usage); + ASSERT_EQ(0, usage); ASSERT_EQ(usage, cache->GetPinnedUsage()); size_t compressed_usage = compressed_cache->GetUsage(); ASSERT_LT(0, compressed_usage); @@ -264,24 +321,158 @@ cache->SetCapacity(usage); cache->SetStrictCapacityLimit(true); ASSERT_EQ(usage, cache->GetPinnedUsage()); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_TRUE(iter->status().IsIncomplete()); + + // Load last key block. + ASSERT_EQ("Result incomplete: Insert failed due to LRU cache being full.", + Get(ToString(kNumBlocks - 1))); + // Failure will also record the miss counter. CheckCacheCounters(options, 1, 0, 0, 1); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - delete iter; - iter = nullptr; // Clear strict capacity limit flag. This time we shall hit compressed block - // cache. + // cache and load into block cache. cache->SetStrictCapacityLimit(false); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_OK(iter->status()); + // Load last key block. 
+ ASSERT_EQ(value, Get(ToString(kNumBlocks - 1))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 0, 1, 0, 0); - delete iter; - iter = nullptr; +} + +namespace { +class PersistentCacheFromCache : public PersistentCache { + public: + PersistentCacheFromCache(std::shared_ptr cache, bool read_only) + : cache_(cache), read_only_(read_only) {} + + Status Insert(const Slice& key, const char* data, + const size_t size) override { + if (read_only_) { + return Status::NotSupported(); + } + std::unique_ptr copy{new char[size]}; + std::copy_n(data, size, copy.get()); + Status s = cache_->Insert( + key, copy.get(), size, + GetCacheEntryDeleterForRole()); + if (s.ok()) { + copy.release(); + } + return s; + } + + Status Lookup(const Slice& key, std::unique_ptr* data, + size_t* size) override { + auto handle = cache_->Lookup(key); + if (handle) { + char* ptr = static_cast(cache_->Value(handle)); + *size = cache_->GetCharge(handle); + data->reset(new char[*size]); + std::copy_n(ptr, *size, data->get()); + cache_->Release(handle); + return Status::OK(); + } else { + return Status::NotFound(); + } + } + + bool IsCompressed() override { return false; } + + StatsType Stats() override { return StatsType(); } + + std::string GetPrintableOptions() const override { return ""; } + + uint64_t NewId() override { return cache_->NewId(); } + + private: + std::shared_ptr cache_; + bool read_only_; +}; + +class ReadOnlyCacheWrapper : public CacheWrapper { + using CacheWrapper::CacheWrapper; + + using Cache::Insert; + Status Insert(const Slice& /*key*/, void* /*value*/, size_t /*charge*/, + void (*)(const Slice& key, void* value) /*deleter*/, + Handle** /*handle*/, Priority /*priority*/) override { + return Status::NotSupported(); + } +}; + +} // namespace + +TEST_F(DBBlockCacheTest, TestWithSameCompressed) { + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr 
rw_cache{NewLRUCache(1000000)}; + std::shared_ptr rw_pcache{ + new PersistentCacheFromCache(rw_cache, /*read_only*/ false)}; + // Exercise some obscure behavior with read-only wrappers + std::shared_ptr ro_cache{new ReadOnlyCacheWrapper(rw_cache)}; + std::shared_ptr ro_pcache{ + new PersistentCacheFromCache(rw_cache, /*read_only*/ true)}; + + // Simple same pointer + table_options.block_cache = rw_cache; + table_options.block_cache_compressed = rw_cache; + table_options.persistent_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache same as block_cache_compressed not " + "currently supported, and would be bad for performance anyway"); + + // Other cases + table_options.block_cache = ro_cache; + table_options.block_cache_compressed = rw_cache; + table_options.persistent_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache and block_cache_compressed share " + "the same key space, which is not supported"); + + table_options.block_cache = rw_cache; + table_options.block_cache_compressed = ro_cache; + table_options.persistent_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache_compressed and block_cache share " + "the same key space, which is not supported"); + + table_options.block_cache = ro_cache; + table_options.block_cache_compressed.reset(); + table_options.persistent_cache = rw_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache and persistent_cache share the same " + "key space, which is not supported"); + + table_options.block_cache = rw_cache; + table_options.block_cache_compressed.reset(); + table_options.persistent_cache = 
ro_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: persistent_cache and block_cache share the same " + "key space, which is not supported"); + + table_options.block_cache.reset(); + table_options.no_block_cache = true; + table_options.block_cache_compressed = ro_cache; + table_options.persistent_cache = rw_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: block_cache_compressed and persistent_cache " + "share the same key space, which is not supported"); + + table_options.block_cache.reset(); + table_options.no_block_cache = true; + table_options.block_cache_compressed = rw_cache; + table_options.persistent_cache = ro_pcache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_EQ(TryReopen(options).ToString(), + "Invalid argument: persistent_cache and block_cache_compressed " + "share the same key space, which is not supported"); } #endif // SNAPPY @@ -296,7 +487,7 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); @@ -352,7 +543,7 @@ std::shared_ptr cache = NewLRUCache(10, 0, true); table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); ASSERT_OK(Put("key1", "val1")); ASSERT_OK(Put("key2", "val2")); @@ -390,7 +581,7 @@ std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; table_options.filter_policy.reset(NewBloomFilterPolicy(20, true)); - 
options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "longer_key", "val")); @@ -429,6 +620,183 @@ // filter_bytes_insert); } +#if (defined OS_LINUX || defined OS_WIN) +TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + table_options.cache_index_and_filter_blocks = false; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 1; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT)); + } + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); +} + +// This test cache data, index and filter blocks during flush. 
+class DBBlockCacheTest1 : public DBTestBase, + public ::testing::WithParamInterface { + public: + const size_t kNumBlocks = 10; + const size_t kValueSize = 100; + DBBlockCacheTest1() : DBTestBase("db_block_cache_test1", true) {} +}; + +INSTANTIATE_TEST_CASE_P(DBBlockCacheTest1, DBBlockCacheTest1, + ::testing::Values(1, 2, 3)); + +TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + + uint32_t filter_type = GetParam(); + switch (filter_type) { + case 1: // partition_filter + table_options.partition_filters = true; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + break; + case 2: // block-based filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + break; + case 3: // full filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + break; + default: + assert(false); + } + + table_options.cache_index_and_filter_blocks = true; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 1; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + if (filter_type == 1) { + ASSERT_EQ(2 * i, + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(2 * i, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } else { + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(i, 
options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } + ASSERT_EQ(value, Get(ToString(i))); + + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(i * 3, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT)); + if (filter_type == 1) { + ASSERT_EQ(i * 3, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + } else { + ASSERT_EQ(i * 2, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + } + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + } + + // Verify compaction not counted + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + EXPECT_EQ(kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + // Index and filter blocks are automatically warmed when the new table file + // is automatically opened at the end of compaction. This is not easily + // disabled so results in the new index and filter blocks being warmed. 
+ if (filter_type == 1) { + EXPECT_EQ(2 * (1 + kNumBlocks), + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(2 * (1 + kNumBlocks), + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } else { + EXPECT_EQ(1 + kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(1 + kNumBlocks, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + } +} + +TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + table_options.cache_index_and_filter_blocks = false; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + + for (size_t i = 1; i <= 5; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(1, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ( + 0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(1, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + } + + ASSERT_OK(dbfull()->SetOptions( + {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}})); + + for (size_t i = 6; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(1, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ( + 1, 
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(0, + options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + } +} +#endif + namespace { // A mock cache wraps LRUCache, and record how many entries have been @@ -443,15 +811,18 @@ false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) { } - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), Handle** handle, - Priority priority) override { + using ShardedCache::Insert; + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper_cb, size_t charge, + Handle** handle, Priority priority) override { + DeleterFn delete_cb = helper_cb->del_cb; if (priority == Priority::LOW) { low_pri_insert_count++; } else { high_pri_insert_count++; } - return LRUCache::Insert(key, value, charge, deleter, handle, priority); + return LRUCache::Insert(key, value, charge, delete_cb, handle, priority); } }; @@ -471,7 +842,7 @@ table_options.filter_policy.reset(NewBloomFilterPolicy(20)); table_options.cache_index_and_filter_blocks_with_high_priority = priority == Cache::Priority::HIGH ? true : false; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); MockCache::high_pri_insert_count = 0; @@ -517,6 +888,140 @@ } } +namespace { + +// An LRUCache wrapper that can falsely report "not found" on Lookup. +// This allows us to manipulate BlockBasedTableReader into thinking +// another thread inserted the data in between Lookup and Insert, +// while mostly preserving the LRUCache interface/behavior. 
+class LookupLiarCache : public CacheWrapper { + int nth_lookup_not_found_ = 0; + + public: + explicit LookupLiarCache(std::shared_ptr target) + : CacheWrapper(std::move(target)) {} + + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats) override { + if (nth_lookup_not_found_ == 1) { + nth_lookup_not_found_ = 0; + return nullptr; + } + if (nth_lookup_not_found_ > 1) { + --nth_lookup_not_found_; + } + return CacheWrapper::Lookup(key, stats); + } + + // 1 == next lookup, 2 == after next, etc. + void SetNthLookupNotFound(int n) { nth_lookup_not_found_ = n; } +}; + +} // anonymous namespace + +TEST_F(DBBlockCacheTest, AddRedundantStats) { + const size_t capacity = size_t{1} << 25; + const int num_shard_bits = 0; // 1 shard + int iterations_tested = 0; + for (std::shared_ptr base_cache : + {NewLRUCache(capacity, num_shard_bits), + NewClockCache(capacity, num_shard_bits)}) { + if (!base_cache) { + // Skip clock cache when not supported + continue; + } + ++iterations_tested; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + std::shared_ptr cache = + std::make_shared(base_cache); + + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = cache; + table_options.filter_policy.reset(NewBloomFilterPolicy(50)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create a new table. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // Normal access filter+index+data. 
+ ASSERT_EQ("value", Get("foo")); + + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + ASSERT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + + // Againt access filter+index+data, but force redundant load+insert on index + cache->SetNthLookupNotFound(2); + ASSERT_EQ("value", Get("bar")); + + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + ASSERT_EQ(4, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + + // Access just filter (with high probability), and force redundant + // load+insert + cache->SetNthLookupNotFound(1); + ASSERT_EQ("NOT_FOUND", Get("this key was not added")); + + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + EXPECT_EQ(5, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + EXPECT_EQ(1, TestGetTickerCount(options, 
BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + EXPECT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + + // Access just data, forcing redundant load+insert + ReadOptions read_options; + std::unique_ptr iter{db_->NewIterator(read_options)}; + cache->SetNthLookupNotFound(1); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), "bar"); + + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD)); + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD)); + EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD)); + // -------- + EXPECT_EQ(6, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT)); + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT)); + EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT)); + // -------- + EXPECT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT)); + } + EXPECT_GE(iterations_tested, 1); +} + TEST_F(DBBlockCacheTest, ParanoidFileChecks) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -526,7 +1031,7 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = false; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "1_key", "val")); @@ -541,7 +1046,7 @@ // Create a new SST file. This will further trigger a compaction // and generate another file. 
ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -556,7 +1061,7 @@ ASSERT_OK(Put(1, "1_key4", "val4")); ASSERT_OK(Put(1, "9_key4", "val4")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); } @@ -631,7 +1136,7 @@ std::string str; for (int i = 0; i < num_iter; i++) { if (i % 4 == 0) { // high compression ratio - str = RandomString(&rnd, 1000); + str = rnd.RandomString(1000); } values.push_back(str); ASSERT_OK(Put(1, Key(i), values[i])); @@ -701,8 +1206,9 @@ Random rnd(301); for (auto compression_type : compression_types) { Options options = CurrentOptions(); - options.compression = compression_type; - options.compression_opts.max_dict_bytes = 4096; + options.bottommost_compression = compression_type; + options.bottommost_compression_opts.max_dict_bytes = 4096; + options.bottommost_compression_opts.enabled = true; options.create_if_missing = true; options.num_levels = 2; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); @@ -710,7 +1216,7 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.block_cache.reset(new MockCache()); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); RecordCacheCountersForCompressionDict(options); @@ -718,12 +1224,12 @@ for (int i = 0; i < kNumFiles; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); for (int j = 0; j < kNumEntriesPerFile; ++j) { - std::string value = RandomString(&rnd, kNumBytesPerEntry); + std::string value = rnd.RandomString(kNumBytesPerEntry); ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str())); } ASSERT_OK(Flush()); } - 
dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); @@ -750,8 +1256,628 @@ } } +static void ClearCache(Cache* cache) { + auto roles = CopyCacheDeleterRoleMap(); + std::deque keys; + Cache::ApplyToAllEntriesOptions opts; + auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/, + Cache::DeleterFn deleter) { + if (roles.find(deleter) == roles.end()) { + // Keep the stats collector + return; + } + keys.push_back(key.ToString()); + }; + cache->ApplyToAllEntries(callback, opts); + for (auto& k : keys) { + cache->Erase(k); + } +} + +TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { + const size_t capacity = size_t{1} << 25; + int iterations_tested = 0; + for (bool partition : {false, true}) { + for (std::shared_ptr cache : + {NewLRUCache(capacity), NewClockCache(capacity)}) { + if (!cache) { + // Skip clock cache when not supported + continue; + } + ++iterations_tested; + + Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.max_open_files = 13; + options.table_cache_numshardbits = 0; + // If this wakes up, it could interfere with test + options.stats_dump_period_sec = 0; + + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(50)); + if (partition) { + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + table_options.metadata_cache_options.top_level_index_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.partition_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.unpartitioned_pinning = + PinningTier::kNone; + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create a new table. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("zfoo", "value")); + ASSERT_OK(Put("zbar", "value")); + ASSERT_OK(Flush()); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + + // Fresh cache + ClearCache(cache.get()); + + std::array expected{}; + // For CacheEntryStatsCollector + expected[static_cast(CacheEntryRole::kMisc)] = 1; + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + std::array prev_expected = expected; + + // First access only filters + ASSERT_EQ("NOT_FOUND", Get("different from any key added")); + expected[static_cast(CacheEntryRole::kFilterBlock)] += 2; + if (partition) { + expected[static_cast(CacheEntryRole::kFilterMetaBlock)] += 2; + } + // Within some time window, we will get cached entry stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Not enough to force a miss + env_->MockSleepForSeconds(45); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + // Now access index and data block + ASSERT_EQ("value", Get("foo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Enough to force a miss + env_->MockSleepForSeconds(601); + // But inject a simulated long scan so that we need a longer + // interval to force a miss next time. 
+ SyncPoint::GetInstance()->SetCallBack( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", + [this](void*) { + // To spend no more than 0.2% of time scanning, we would need + // interval of at least 10000s + env_->MockSleepForSeconds(20); + }); + SyncPoint::GetInstance()->EnableProcessing(); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // The same for other file + ASSERT_EQ("value", Get("zfoo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Because of the simulated long scan, this is not enough to force + // a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // But this is enough + env_->MockSleepForSeconds(10000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + + // Also check the GetProperty interface + std::map values; + ASSERT_TRUE( + db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values)); + + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kIndexBlock)]), + values["count.index-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kDataBlock)]), + values["count.data-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kFilterBlock)]), + values["count.filter-block"]); + EXPECT_EQ( + ToString( + prev_expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + EXPECT_EQ(ToString(expected[static_cast(CacheEntryRole::kMisc)]), + values["count.misc"]); + + // Add one for kWriteBuffer + { + WriteBufferManager wbm(size_t{1} << 20, cache); + wbm.ReserveMem(1024); + expected[static_cast(CacheEntryRole::kWriteBuffer)]++; + // Now we check that the GetProperty interface is more agressive about + 
// re-scanning stats, but not totally aggressive. + // Within some time window, we will get cached entry stats + env_->MockSleepForSeconds(1); + EXPECT_EQ(ToString(prev_expected[static_cast( + CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + // Not enough for a "background" miss but enough for a "foreground" miss + env_->MockSleepForSeconds(45); + + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, + &values)); + EXPECT_EQ( + ToString( + expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + } + prev_expected = expected; + + // With collector pinned in cache, we should be able to hit + // even if the cache is full + ClearCache(cache.get()); + Cache::Handle* h = nullptr; + ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1, + GetNoopDeleterForRole(), + &h, Cache::Priority::HIGH)); + ASSERT_GT(cache->GetUsage(), cache->GetCapacity()); + expected = {}; + // For CacheEntryStatsCollector + expected[static_cast(CacheEntryRole::kMisc)] = 1; + // For Fill-it-up + expected[static_cast(CacheEntryRole::kMisc)]++; + // Still able to hit on saved stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(1000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + cache->Release(h); + + // Now we test that the DB mutex is not held during scans, for the ways + // we know how to (possibly) trigger them. Without a better good way to + // check this, we simply inject an acquire & release of the DB mutex + // deep in the stat collection code. If we were already holding the + // mutex, that is UB that would at least be found by TSAN. 
+ int scan_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", + [this, &scan_count](void*) { + dbfull()->TEST_LockMutex(); + dbfull()->TEST_UnlockMutex(); + ++scan_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Different things that might trigger a scan, with mock sleeps to + // force a miss. + env_->MockSleepForSeconds(10000); + dbfull()->DumpStats(); + ASSERT_EQ(scan_count, 1); + + env_->MockSleepForSeconds(10000); + ASSERT_TRUE( + db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values)); + ASSERT_EQ(scan_count, 2); + + env_->MockSleepForSeconds(10000); + std::string value_str; + ASSERT_TRUE( + db_->GetProperty(DB::Properties::kBlockCacheEntryStats, &value_str)); + ASSERT_EQ(scan_count, 3); + + env_->MockSleepForSeconds(10000); + ASSERT_TRUE(db_->GetProperty(DB::Properties::kCFStats, &value_str)); + // To match historical speed, querying this property no longer triggers + // a scan, even if results are old. But periodic dump stats should keep + // things reasonably updated. + ASSERT_EQ(scan_count, /*unchanged*/ 3); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + EXPECT_GE(iterations_tested, 1); + } +} + #endif // ROCKSDB_LITE +class DBBlockCacheKeyTest + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBBlockCacheKeyTest() + : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {} + + void SetUp() override { + use_compressed_cache_ = std::get<0>(GetParam()); + exclude_file_numbers_ = std::get<1>(GetParam()); + } + + bool use_compressed_cache_; + bool exclude_file_numbers_; +}; + +// Disable LinkFile so that we can physically copy a DB using Checkpoint. +// Disable file GetUniqueId to enable stable cache keys. 
+class StableCacheKeyTestFS : public FaultInjectionTestFS { + public: + explicit StableCacheKeyTestFS(const std::shared_ptr& base) + : FaultInjectionTestFS(base) { + SetFailGetUniqueId(true); + } + + virtual ~StableCacheKeyTestFS() override {} + + IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&, + IODebugContext*) override { + return IOStatus::NotSupported("Disabled"); + } +}; + +TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { + std::shared_ptr test_fs{ + new StableCacheKeyTestFS(env_->GetFileSystem())}; + std::unique_ptr test_env{ + new CompositeEnvWrapper(env_, test_fs)}; + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.env = test_env.get(); + + BlockBasedTableOptions table_options; + + int key_count = 0; + uint64_t expected_stat = 0; + + std::function verify_stats; + if (use_compressed_cache_) { + if (!Snappy_Supported()) { + ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support"); + return; + } + options.compression = CompressionType::kSnappyCompression; + table_options.no_block_cache = true; + table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false); + verify_stats = [&options, &expected_stat] { + // One for ordinary SST file and one for external SST file + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD)); + }; + } else { + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + verify_stats = [&options, &expected_stat] { + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + }; + } + + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"koko"}, options); + + if (exclude_file_numbers_) { + // Simulate something like old behavior without file numbers in properties. + // This is a "control" side of the test that also ensures safely degraded + // behavior on old files. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", + [&](void* arg) { + TableProperties* props = reinterpret_cast(arg); + props->orig_file_number = 0; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + std::function perform_gets = [&key_count, &expected_stat, this]() { + if (exclude_file_numbers_) { + // No cache key reuse should happen, because we can't rely on current + // file number being stable + expected_stat += key_count; + } else { + // Cache keys should be stable + expected_stat = key_count; + } + for (int i = 0; i < key_count; ++i) { + ASSERT_EQ(Get(1, Key(i)), "abc"); + } + }; + + // Ordinary SST files with same session id + const std::string something_compressible(500U, 'x'); + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put(1, Key(key_count), "abc")); + ASSERT_OK(Put(1, Key(key_count) + "a", something_compressible)); + ASSERT_OK(Flush(1)); + ++key_count; + } + +#ifndef ROCKSDB_LITE + // Save an export of those ordinary SST files for later + std::string export_files_dir = dbname_ + "/exported"; + ExportImportFilesMetaData* metadata_ptr_ = nullptr; + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + checkpoint = nullptr; + + // External SST files with same session id + SstFileWriter sst_file_writer(EnvOptions(), options); + std::vector external; + for (int i = 0; i < 2; ++i) { + std::string f = dbname_ + "/external" + ToString(i) + ".sst"; + 
external.push_back(f); + ASSERT_OK(sst_file_writer.Open(f)); + ASSERT_OK(sst_file_writer.Put(Key(key_count), "abc")); + ASSERT_OK( + sst_file_writer.Put(Key(key_count) + "a", something_compressible)); + ++key_count; + ExternalSstFileInfo external_info; + ASSERT_OK(sst_file_writer.Finish(&external_info)); + IngestExternalFileOptions ingest_opts; + ASSERT_OK(db_->IngestExternalFile(handles_[1], {f}, ingest_opts)); + } + + if (exclude_file_numbers_) { + // FIXME(peterd): figure out where these extra ADDs are coming from + options.statistics->recordTick(BLOCK_CACHE_COMPRESSED_ADD, + uint64_t{0} - uint64_t{2}); + } +#endif + + perform_gets(); + verify_stats(); + + // Make sure we can cache hit after re-open + ReopenWithColumnFamilies({"default", "koko"}, options); + + perform_gets(); + verify_stats(); + + // Make sure we can cache hit even on a full copy of the DB. Using + // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link. + // (Checkpoint not available in LITE mode to test this.) 
+#ifndef ROCKSDB_LITE + auto db_copy_name = dbname_ + "-copy"; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name)); + delete checkpoint; + + Close(); + Destroy(options); + + // Switch to the DB copy + SaveAndRestore save_dbname(&dbname_, db_copy_name); + ReopenWithColumnFamilies({"default", "koko"}, options); + + perform_gets(); + verify_stats(); + + // And ensure that re-importing + ingesting the same files into a + // different DB uses same cache keys + DestroyAndReopen(options); + + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + delete cfh; + cfh = nullptr; + delete metadata_ptr_; + metadata_ptr_ = nullptr; + + DestroyDB(export_files_dir, options); + + ReopenWithColumnFamilies({"default", "yoyo"}, options); + + IngestExternalFileOptions ingest_opts; + ASSERT_OK(db_->IngestExternalFile(handles_[1], {external}, ingest_opts)); + + perform_gets(); + verify_stats(); +#endif // !ROCKSDB_LITE + + Close(); + Destroy(options); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + +class DBBlockCachePinningTest + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBBlockCachePinningTest() + : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {} + + void SetUp() override { + partition_index_and_filters_ = std::get<0>(GetParam()); + top_level_index_pinning_ = std::get<1>(GetParam()); + partition_pinning_ = std::get<2>(GetParam()); + unpartitioned_pinning_ = std::get<3>(GetParam()); + } + + bool partition_index_and_filters_; + PinningTier top_level_index_pinning_; + PinningTier partition_pinning_; + PinningTier unpartitioned_pinning_; +}; + +TEST_P(DBBlockCachePinningTest, 
TwoLevelDB) { + // Creates one file in L0 and one file in L1. Both files have enough data that + // their index and filter blocks are partitioned. The L1 file will also have + // a compression dictionary (those are trained only during compaction), which + // must be unpartitioned. + const int kKeySize = 32; + const int kBlockSize = 128; + const int kNumBlocksPerFile = 128; + const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize; + + Options options = CurrentOptions(); + // `kNoCompression` makes the unit test more portable. But it relies on the + // current behavior of persisting/accessing dictionary even when there's no + // (de)compression happening, which seems fairly likely to change over time. + options.compression = kNoCompression; + options.compression_opts.max_dict_bytes = 4 << 10; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 20 /* capacity */); + table_options.block_size = kBlockSize; + table_options.metadata_block_size = kBlockSize; + table_options.cache_index_and_filter_blocks = true; + table_options.metadata_cache_options.top_level_index_pinning = + top_level_index_pinning_; + table_options.metadata_cache_options.partition_pinning = partition_pinning_; + table_options.metadata_cache_options.unpartitioned_pinning = + unpartitioned_pinning_; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10 /* bits_per_key */)); + if (partition_index_and_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize))); + } + ASSERT_OK(Flush()); + if (i == 0) { + // Prevent trivial move so file will 
be rewritten with dictionary and + // reopened with L1's pinning settings. + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + } + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. + table_options.block_cache->EraseUnRefEntries(); + + // Get base cache values + uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t compression_dict_misses = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + + // Read a key from the L0 file + Get(Key(kNumKeysPerFile)); + uint64_t expected_filter_misses = filter_misses; + uint64_t expected_index_misses = index_misses; + uint64_t expected_compression_dict_misses = compression_dict_misses; + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. 
+ table_options.block_cache->EraseUnRefEntries(); + + // Read a key from the L1 file + Get(Key(0)); + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone || + top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone || + partition_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); +} + +INSTANTIATE_TEST_CASE_P( + DBBlockCachePinningTest, DBBlockCachePinningTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_bloom_filter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,19 @@ // 
Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" #include "db/db_test_util.h" +#include "options/options_helper.h" #include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/perf_context.h" #include "table/block_based/filter_policy_internal.h" +#include "test_util/testutil.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -22,7 +31,8 @@ class DBBloomFilterTest : public DBTestBase { public: - DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {} + DBBloomFilterTest() + : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {} }; class DBBloomFilterTestWithParam : public DBTestBase, @@ -35,7 +45,8 @@ uint32_t format_version_; public: - DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {} + DBBloomFilterTestWithParam() + : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {} ~DBBloomFilterTestWithParam() override {} @@ -80,13 +91,16 @@ options_override.partition_filters = partition_filters_; options_override.metadata_block_size = 32; Options options = CurrentOptions(options_override); - if (partition_filters_ && - static_cast( - options.table_factory->GetOptions()) - ->index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) { - // In the current implementation partitioned filters depend on partitioned - // indexes - continue; + if (partition_filters_) { + auto* table_options = + options.table_factory->GetOptions(); + if (table_options != nullptr && + table_options->index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // In the current implementation partitioned filters depend on + // partitioned indexes + continue; + } } options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); @@ -122,8 +136,8 @@ ASSERT_EQ(cache_added, 
TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); @@ -172,7 +186,7 @@ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -238,7 +252,7 @@ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -291,7 +305,7 @@ // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foo")); @@ -322,7 +336,7 @@ // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Reopen with both of whole key off and prefix extractor enabled. // Still no bloom filter should be used. @@ -345,7 +359,7 @@ // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); options.prefix_extractor.reset(); bbto.whole_key_filtering = true; @@ -358,7 +372,7 @@ // not filtered out by key ranges. 
ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - Flush(); + ASSERT_OK(Flush()); // Now we have two files: // File 1: An older file with prefix bloom. @@ -461,7 +475,7 @@ for (int i = 0; i < N; i += 100) { ASSERT_OK(Put(1, Key(i), Key(i))); } - Flush(1); + ASSERT_OK(Flush(1)); // Prevent auto compactions triggered by seeks env_->delay_sstable_sync_.store(true, std::memory_order_release); @@ -497,36 +511,50 @@ ASSERT_LE(reads, 3 * N / 100); } +#ifndef ROCKSDB_LITE + // Sanity check some table properties + std::map props; + ASSERT_TRUE(db_->GetMapProperty( + handles_[1], DB::Properties::kAggregatedTableProperties, &props)); + uint64_t nkeys = N + N / 100; + uint64_t filter_size = ParseUint64(props["filter_size"]); + EXPECT_LE(filter_size, + (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ 8); + EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8); + + uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); + EXPECT_EQ(num_filter_entries, nkeys); +#endif // ROCKSDB_LITE + env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestDefFormatVersion, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); 
+ std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, ::testing::Values( - std::make_tuple(BFP::kDeprecatedBlock, false, - test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion))); -#endif // ROCKSDB_VALGRIND_RUN + std::make_tuple(BFP::kDeprecatedBlock, false, kLatestFormatVersion), + std::make_tuple(BFP::kAutoBloom, true, kLatestFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, kLatestFormatVersion))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBBloomFilterTest, BloomFilterRate) { while (ChangeFilterOptions()) { @@ -641,6 +669,439 @@ } } +/* + * A cache wrapper that tracks peaks and increments of filter + * construction cache reservation. + * p0 + * / \ p1 + * / \ /\ + * / \/ \ + * a / b \ + * peaks = {p0, p1} + * increments = {p1-a, p2-b} + */ +class FilterConstructResPeakTrackingCache : public CacheWrapper { + public: + explicit FilterConstructResPeakTrackingCache(std::shared_ptr target) + : CacheWrapper(std::move(target)), + cur_cache_res_(0), + cache_res_peak_(0), + cache_res_increment_(0), + last_peak_tracked_(false), + cache_res_increments_sum_(0) {} + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + Status s = target_->Insert(key, value, charge, deleter, handle, priority); + if (deleter == kNoopDeleterForFilterConstruction) { + if (last_peak_tracked_) { + cache_res_peak_ = 0; + cache_res_increment_ = 0; + last_peak_tracked_ = false; + } + cur_cache_res_ += charge; + cache_res_peak_ = std::max(cache_res_peak_, cur_cache_res_); + cache_res_increment_ += charge; + } + return s; + } + + 
using Cache::Release; + bool Release(Handle* handle, bool force_erase = false) override { + auto deleter = GetDeleter(handle); + if (deleter == kNoopDeleterForFilterConstruction) { + if (!last_peak_tracked_) { + cache_res_peaks_.push_back(cache_res_peak_); + cache_res_increments_sum_ += cache_res_increment_; + last_peak_tracked_ = true; + } + cur_cache_res_ -= GetCharge(handle); + } + bool is_successful = target_->Release(handle, force_erase); + return is_successful; + } + + std::deque GetReservedCachePeaks() { return cache_res_peaks_; } + + std::size_t GetReservedCacheIncrementSum() { + return cache_res_increments_sum_; + } + + private: + static const Cache::DeleterFn kNoopDeleterForFilterConstruction; + + std::size_t cur_cache_res_; + std::size_t cache_res_peak_; + std::size_t cache_res_increment_; + bool last_peak_tracked_; + std::deque cache_res_peaks_; + std::size_t cache_res_increments_sum_; +}; + +const Cache::DeleterFn + FilterConstructResPeakTrackingCache::kNoopDeleterForFilterConstruction = + CacheReservationManager::TEST_GetNoopDeleterForRole< + CacheEntryRole::kFilterConstruction>(); + +// To align with the type of hash entry being reserved in implementation. +using FilterConstructionReserveMemoryHash = uint64_t; + +class DBFilterConstructionReserveMemoryTestWithParam + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBFilterConstructionReserveMemoryTestWithParam() + : DBTestBase("db_bloom_filter_tests", + /*env_do_fsync=*/true), + num_key_(0), + reserve_table_builder_memory_(std::get<0>(GetParam())), + policy_(std::get<1>(GetParam())), + partition_filters_(std::get<2>(GetParam())) { + if (!reserve_table_builder_memory_ || + policy_ == BloomFilterPolicy::Mode::kDeprecatedBlock || + policy_ == BloomFilterPolicy::Mode::kLegacyBloom) { + // For these cases, we only interested in whether filter construction + // cache resevation happens instead of its accuracy. Therefore we don't + // need many keys. 
+ num_key_ = 5; + } else if (partition_filters_) { + // For PartitionFilter case, since we set + // table_options.metadata_block_size big enough such that each partition + // trigger at least 1 dummy entry reservation each for hash entries and + // final filter, we need a large number of keys to ensure we have at least + // two partitions. + num_key_ = 18 * CacheReservationManager::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + } else if (policy_ == BloomFilterPolicy::Mode::kFastLocalBloom) { + // For Bloom Filter + FullFilter case, since we design the num_key_ to + // make hash entry cache reservation be a multiple of dummy entries, the + // correct behavior of charging final filter on top of it will trigger at + // least another dummy entry insertion. Therefore we can assert that + // behavior and we don't need a large number of keys to verify we + // indeed charge the final filter for cache reservation, even though final + // filter is a lot smaller than hash entries. + num_key_ = 1 * CacheReservationManager::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + } else { + // For Ribbon Filter + FullFilter case, we need a large enough number of + // keys so that charging final filter after releasing the hash entries + // reservation will trigger at least another dummy entry (or equivalently + // to saying, causing another peak in cache reservation) as banding + // reservation might not be a multiple of dummy entry. + num_key_ = 12 * CacheReservationManager::GetDummyEntrySize() / + sizeof(FilterConstructionReserveMemoryHash); + } + } + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions table_options; + + // We set cache capacity big enough to prevent cache full for convenience in + // calculation. 
+ constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024; + + table_options.reserve_table_builder_memory = reserve_table_builder_memory_; + table_options.filter_policy.reset(new BloomFilterPolicy(10, policy_)); + table_options.partition_filters = partition_filters_; + if (table_options.partition_filters) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + // We set table_options.metadata_block_size big enough so that each + // partition trigger at least 1 dummy entry insertion each for hash + // entries and final filter. + table_options.metadata_block_size = 409000; + } + + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + cache_ = std::make_shared( + (NewLRUCache(lo))); + table_options.block_cache = cache_; + + return table_options; + } + + std::size_t GetNumKey() { return num_key_; } + + bool ReserveTableBuilderMemory() { return reserve_table_builder_memory_; } + + BloomFilterPolicy::Mode GetFilterPolicy() { return policy_; } + + bool PartitionFilters() { return partition_filters_; } + + std::shared_ptr + GetFilterConstructResPeakTrackingCache() { + return cache_; + } + + private: + std::size_t num_key_; + bool reserve_table_builder_memory_; + BloomFilterPolicy::Mode policy_; + bool partition_filters_; + std::shared_ptr cache_; +}; + +INSTANTIATE_TEST_CASE_P( + BlockBasedTableOptions, DBFilterConstructionReserveMemoryTestWithParam, + ::testing::Values( + std::make_tuple(false, BloomFilterPolicy::Mode::kFastLocalBloom, false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, false), + std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, true), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + false), + std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon, + true), + std::make_tuple(true, BloomFilterPolicy::Mode::kDeprecatedBlock, false), + std::make_tuple(true, 
BloomFilterPolicy::Mode::kLegacyBloom, false))); + +// TODO: Speed up this test. +// The current test inserts many keys (on the scale of dummy entry size) +// in order to make small memory user (e.g, final filter, partitioned hash +// entries/filter/banding) , which is proportional to the number of +// keys, big enough so that its cache reservation triggers dummy entry insertion +// and becomes observable in the test. +// +// However, inserting that many keys slows down this test and leaves future +// developers an opportunity to speed it up. +// +// Possible approaches & challenges: +// 1. Use sync point during cache reservation of filter construction +// +// Benefit: It does not rely on triggering dummy entry insertion +// but the sync point to verify small memory user is charged correctly. +// +// Challenge: this approach is intrusive. +// +// 2. Make dummy entry size configurable and set it small in the test +// +// Benefit: It increases the precision of cache reservation and therefore +// small memory usage can still trigger insertion of dummy entry. +// +// Challenge: change CacheReservationManager related APIs and a hack +// might be needed to control the size of dummmy entry of +// CacheReservationManager used in filter construction for testing +// since CacheReservationManager is not exposed at the high level. 
+// +TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) { + Options options = CurrentOptions(); + // We set write_buffer_size big enough so that in the case where there is + // filter construction cache reservation, flush won't be triggered before we + // manually trigger it for clean testing + options.write_buffer_size = 640 << 20; + options.table_factory.reset( + NewBlockBasedTableFactory(GetBlockBasedTableOptions())); + std::shared_ptr cache = + GetFilterConstructResPeakTrackingCache(); + options.create_if_missing = true; + // Disable auto compaction to prevent its unexpected side effect + // to the number of keys per partition designed by us in the test + options.disable_auto_compactions = true; + DestroyAndReopen(options); + int num_key = static_cast(GetNumKey()); + for (int i = 0; i < num_key; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + + ASSERT_EQ(cache->GetReservedCacheIncrementSum(), 0) + << "Flush was triggered too early in the test case with filter " + "construction cache reservation - please make sure no flush triggered " + "during the key insertions above"; + + ASSERT_OK(Flush()); + + bool reserve_table_builder_memory = ReserveTableBuilderMemory(); + BloomFilterPolicy::Mode policy = GetFilterPolicy(); + bool partition_filters = PartitionFilters(); + + std::deque filter_construction_cache_res_peaks = + cache->GetReservedCachePeaks(); + std::size_t filter_construction_cache_res_increments_sum = + cache->GetReservedCacheIncrementSum(); + + if (!reserve_table_builder_memory) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0); + return; + } + + if (policy == BloomFilterPolicy::Mode::kDeprecatedBlock || + policy == BloomFilterPolicy::Mode::kLegacyBloom) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0) + << "There shouldn't be filter construction cache reservation as this " + "feature does not support BloomFilterPolicy::Mode::kDeprecatedBlock " + "nor BloomFilterPolicy::Mode::kLegacyBloom"; + return; + } + + const 
std::size_t kDummyEntrySize = + CacheReservationManager::GetDummyEntrySize(); + + const std::size_t predicted_hash_entries_cache_res = + num_key * sizeof(FilterConstructionReserveMemoryHash); + ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0) + << "It's by this test's design that predicted_hash_entries_cache_res is " + "a multipe of dummy entry"; + + const std::size_t predicted_hash_entries_cache_res_dummy_entry_num = + predicted_hash_entries_cache_res / kDummyEntrySize; + const std::size_t predicted_final_filter_cache_res = + static_cast(std::ceil( + 1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 * + (policy == BloomFilterPolicy::Mode::kStandard128Ribbon ? 0.7 : 1))) * + kDummyEntrySize; + const std::size_t predicted_banding_cache_res = + static_cast( + std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) * + kDummyEntrySize; + + if (policy == BloomFilterPolicy::Mode::kFastLocalBloom) { + /* BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter + * p0 + * / \ + * b / \ + * / \ + * / \ + * 0/ \ + * hash entries = b - 0, final filter = p0 - b + * p0 = hash entries + final filter + * + * The test is designed in a way such that the reservation for b is a + * multiple of dummy entries so that reservation for (p0 - b) + * will trigger at least another dummy entry insertion. 
+ * + * BloomFilterPolicy::Mode::kFastLocalBloom + PartitionedFilter + * p1 + * / \ + * p0 b'/ \ + * / \ / \ + * b / \ / \ + * / \ / \ + * / a \ + * 0/ \ + * partitioned hash entries1 = b - 0, partitioned hash entries1 = b' - a + * parittioned final filter1 = p0 - b, parittioned final filter2 = p1 - b' + * + * (increment p0 - 0) + (increment p1 - a) + * = partitioned hash entries1 + partitioned hash entries2 + * + parittioned final filter1 + parittioned final filter2 + * = hash entries + final filter + * + */ + if (!partition_filters) { + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1) + << "Filter construction cache reservation should have only 1 peak in " + "case: BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter"; + std::size_t filter_construction_cache_res_peak = + filter_construction_cache_res_peaks[0]; + EXPECT_GT(filter_construction_cache_res_peak, + predicted_hash_entries_cache_res) + << "The testing number of hash entries is designed to make hash " + "entries cache reservation be multiples of dummy entries" + " so the correct behavior of charging final filter on top of it" + " should've triggered at least another dummy entry insertion"; + + std::size_t predicted_filter_construction_cache_res_peak = + predicted_hash_entries_cache_res + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 0.9); + EXPECT_LE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 1.1); + return; + } else { + EXPECT_GE(filter_construction_cache_res_peaks.size(), 2) + << "Filter construction cache reservation should have multiple peaks " + "in case: BloomFilterPolicy::Mode::kFastLocalBloom + " + "PartitionedFilter"; + std::size_t predicted_filter_construction_cache_res_increments_sum = + predicted_hash_entries_cache_res + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_increments_sum, + 
predicted_filter_construction_cache_res_increments_sum * 0.9); + EXPECT_LE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 1.1); + return; + } + } + + if (policy == BloomFilterPolicy::Mode::kStandard128Ribbon) { + /* BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter + * p0 + * / \ p1 + * / \/\ + * b / b' \ + * / \ + * 0/ \ + * hash entries = b - 0, banding = p0 - b, final filter = p1 - b' + * p0 = hash entries + banding + * + * The test is designed in a way such that the reservation for (p1 - b') + * will trigger at least another dummy entry insertion + * (or equivelantly to saying, creating another peak). + * + * BloomFilterPolicy::Mode::kStandard128Ribbon + PartitionedFilter + * p3 + * p0 /\ p4 + * / \ p1 / \ /\ + * / \/\ b''/ a' \ + * b / b' \ / \ + * / \ / \ + * 0/ a \ + * partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a + * partitioned banding1 = p0 - b, partitioned banding2 = p3 - b'' + * parittioned final filter1 = p1 - b',parittioned final filter2 = p4 - a' + * + * (increment p0 - 0) + (increment p1 - b') + * + (increment p3 - a) + (increment p4 - a') + * = partitioned hash entries1 + partitioned hash entries2 + * + parittioned banding1 + parittioned banding2 + * + parittioned final filter1 + parittioned final filter2 + * = hash entries + banding + final filter + */ + if (!partition_filters) { + ASSERT_GE(std::floor(1.0 * predicted_final_filter_cache_res / + CacheReservationManager::GetDummyEntrySize()), + 1) + << "Final filter cache reservation too small for this test - please " + "increase the number of keys"; + EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2) + << "Filter construction cache reservation should have 2 peaks in " + "case: BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter. 
" + "The second peak is resulted from charging the final filter after " + "decreasing the hash entry reservation since the testing final " + "filter reservation is designed to be at least 1 dummy entry size"; + + std::size_t filter_construction_cache_res_peak = + filter_construction_cache_res_peaks[0]; + std::size_t predicted_filter_construction_cache_res_peak = + predicted_hash_entries_cache_res + predicted_banding_cache_res; + EXPECT_GE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 0.9); + EXPECT_LE(filter_construction_cache_res_peak, + predicted_filter_construction_cache_res_peak * 1.1); + return; + } else { + EXPECT_GE(filter_construction_cache_res_peaks.size(), 3) + << "Filter construction cache reservation should have more than 3 " + "peaks " + "in case: BloomFilterPolicy::Mode::kStandard128Ribbon + " + "PartitionedFilter"; + std::size_t predicted_filter_construction_cache_res_increments_sum = + predicted_hash_entries_cache_res + predicted_banding_cache_res + + predicted_final_filter_cache_res; + EXPECT_GE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 0.9); + EXPECT_LE(filter_construction_cache_res_increments_sum, + predicted_filter_construction_cache_res_increments_sum * 1.1); + return; + } + } +} + namespace { // A wrapped bloom over block-based FilterPolicy class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy { @@ -765,6 +1226,14 @@ const std::unique_ptr policy_otherwise_; }; +static std::map + table_file_creation_reason_to_string{ + {TableFileCreationReason::kCompaction, "kCompaction"}, + {TableFileCreationReason::kFlush, "kFlush"}, + {TableFileCreationReason::kMisc, "kMisc"}, + {TableFileCreationReason::kRecovery, "kRecovery"}, + }; + class TestingContextCustomFilterPolicy : public LevelAndStyleCustomFilterPolicy { public: @@ -777,11 +1246,17 @@ const FilterBuildingContext& context) const override { test_report_ += "cf="; test_report_ += 
context.column_family_name; - test_report_ += ",cs="; + test_report_ += ",s="; test_report_ += OptionsHelper::compaction_style_to_string[context.compaction_style]; - test_report_ += ",lv="; - test_report_ += std::to_string(context.level_at_creation); + test_report_ += ",n="; + test_report_ += ToString(context.num_levels); + test_report_ += ",l="; + test_report_ += ToString(context.level_at_creation); + test_report_ += ",b="; + test_report_ += ToString(int{context.is_bottommost}); + test_report_ += ",r="; + test_report_ += table_file_creation_reason_to_string[context.reason]; test_report_ += "\n"; return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); @@ -799,18 +1274,21 @@ } // namespace TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { + auto policy = std::make_shared(15, 8, 5); + Options options; for (bool fifo : {true, false}) { - Options options = CurrentOptions(); + options = CurrentOptions(); + options.max_open_files = fifo ? -1 : options.max_open_files; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.compaction_style = fifo ? kCompactionStyleFIFO : kCompactionStyleLevel; BlockBasedTableOptions table_options; - auto policy = std::make_shared(15, 8, 5); table_options.filter_policy = policy; table_options.format_version = 5; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + TryReopen(options); CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); const int maxKey = 10000; @@ -821,16 +1299,16 @@ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); for (int i = maxKey / 2; i < maxKey; i++) { ASSERT_OK(Put(1, Key(i), Key(i))); } Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? 
"cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); // Check that they can be found for (int i = 0; i < maxKey; i++) { @@ -858,7 +1336,7 @@ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); EXPECT_EQ(policy->DumpTestReport(), - "cf=bob,cs=kCompactionStyleLevel,lv=1\n"); + "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n"); // Check that we now have one filter, about 9.2% FP rate (5 bits per key) for (int i = 0; i < maxKey; i++) { @@ -870,11 +1348,25 @@ EXPECT_GE(useful_count, maxKey * 0.90); EXPECT_LE(useful_count, maxKey * 0.91); } + } else { +#ifndef ROCKSDB_LITE + // Also try external SST file + { + std::string file_path = dbname_ + "/external.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("key", "value")); + ASSERT_OK(sst_file_writer.Finish()); + } + // Note: kCompactionStyleLevel is default, ignored if num_levels == -1 + EXPECT_EQ(policy->DumpTestReport(), + "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n"); +#endif } // Destroy ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; } } @@ -1010,6 +1502,63 @@ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); } +TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { + Options options = CurrentOptions(); + options.memtable_prefix_bloom_size_ratio = 0.015; + options.memtable_whole_key_filtering = true; + Reopen(options); + std::string key1("AA"); + std::string key2("BB"); + std::string key3("CC"); + std::string key4("DD"); + std::string key_not("EE"); + std::string value1("Value1"); + std::string value2("Value2"); + std::string value3("Value3"); + 
std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key2, value2, WriteOptions())); + ASSERT_OK(Flush()); + ASSERT_OK(Put(key3, value3, WriteOptions())); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put(key4, value4, WriteOptions())); + + // Delete key2 and key3 + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ")); + + // Read without snapshot + auto results = MultiGet({key_not, key1, key2, key3, key4}); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], "NOT_FOUND"); + ASSERT_EQ(results[3], "NOT_FOUND"); + ASSERT_EQ(results[4], value4); + + // Also check Get + ASSERT_EQ(Get(key1), value1); + ASSERT_EQ(Get(key2), "NOT_FOUND"); + ASSERT_EQ(Get(key3), "NOT_FOUND"); + ASSERT_EQ(Get(key4), value4); + + // Read with snapshot + results = MultiGet({key_not, key1, key2, key3, key4}, snapshot); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], value2); + ASSERT_EQ(results[3], value3); + ASSERT_EQ(results[4], "NOT_FOUND"); + + // Also check Get + ASSERT_EQ(Get(key1, snapshot), value1); + ASSERT_EQ(Get(key2, snapshot), value2); + ASSERT_EQ(Get(key3, snapshot), value3); + ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND"); + + db_->ReleaseSnapshot(snapshot); +} + TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { constexpr size_t kPrefixSize = 8; const std::string kKey = "key"; @@ -1029,6 +1578,215 @@ ASSERT_EQ(kKey, iter->key()); } +class DBBloomFilterTestVaryPrefixAndFormatVer + : public DBTestBase, + public testing::WithParamInterface> { + protected: + bool use_prefix_; + uint32_t format_version_; + + public: + DBBloomFilterTestVaryPrefixAndFormatVer() + : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {} + + ~DBBloomFilterTestVaryPrefixAndFormatVer() override {} + + void SetUp() override { + use_prefix_ = std::get<0>(GetParam()); + format_version_ = std::get<1>(GetParam()); + } 
+ + static std::string UKey(uint32_t i) { return Key(static_cast(i)); } +}; + +TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { + Options options = CurrentOptions(); + if (use_prefix_) { + // Entire key from UKey() + options.prefix_extractor.reset(NewCappedPrefixTransform(9)); + } + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(20)); + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.whole_key_filtering = !use_prefix_; + if (use_prefix_) { // (not related to prefix, just alternating between) + // Make sure code appropriately deals with metadata block size setting + // that is "too small" (smaller than minimum size for filter builder) + bbto.metadata_block_size = 63; + } else { + // Make sure the test will work even on platforms with large minimum + // filter size, due to large cache line size. + // (Largest cache line size + 10+% overhead.) 
+ bbto.metadata_block_size = 290; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + ReadOptions ropts; + + constexpr uint32_t N = 12000; + // Add N/2 evens + for (uint32_t i = 0; i < N; i += 2) { + ASSERT_OK(Put(UKey(i), UKey(i))); + } + ASSERT_OK(Flush()); +#ifndef ROCKSDB_LITE + ASSERT_EQ(TotalTableFiles(), 1); +#endif + + constexpr uint32_t Q = 29; + // MultiGet In + std::array keys; + std::array key_slices; + std::array column_families; + // MultiGet Out + std::array statuses; + std::array values; + + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL); + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED); + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE); + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE); + + // Check that initial clump of keys only loads one partition filter from + // block cache. + // And that spread out keys load many partition filters. + // In both cases, mix present vs. not present keys. + for (uint32_t stride : {uint32_t{1}, (N / Q) | 1}) { + for (uint32_t i = 0; i < Q; ++i) { + keys[i] = UKey(i * stride); + key_slices[i] = Slice(keys[i]); + column_families[i] = db_->DefaultColumnFamily(); + statuses[i] = Status(); + values[i] = PinnableSlice(); + } + + db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], + /*timestamps=*/nullptr, &statuses[0], true); + + // Confirm correct status results + uint32_t number_not_found = 0; + for (uint32_t i = 0; i < Q; ++i) { + if ((i * stride % 2) == 0) { + ASSERT_OK(statuses[i]); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + ++number_not_found; + } + } + + // Confirm correct Bloom stats (no FPs) + uint64_t filter_useful = TestGetAndResetTickerCount( + options, + use_prefix_ ? 
BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); + uint64_t filter_checked = + TestGetAndResetTickerCount(options, use_prefix_ + ? BLOOM_FILTER_PREFIX_CHECKED + : BLOOM_FILTER_FULL_POSITIVE) + + (use_prefix_ ? 0 : filter_useful); + EXPECT_EQ(filter_useful, number_not_found); + EXPECT_EQ(filter_checked, Q); + if (!use_prefix_) { + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + Q - number_not_found); + } + + // Confirm no duplicate loading same filter partition + uint64_t filter_accesses = + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + if (stride == 1) { + EXPECT_EQ(filter_accesses, 1); + } else { + // for large stride + EXPECT_GE(filter_accesses, Q / 2 + 1); + } + } + + // Check that a clump of keys (present and not) works when spanning + // two partitions + int found_spanning = 0; + for (uint32_t start = 0; start < N / 2;) { + for (uint32_t i = 0; i < Q; ++i) { + keys[i] = UKey(start + i); + key_slices[i] = Slice(keys[i]); + column_families[i] = db_->DefaultColumnFamily(); + statuses[i] = Status(); + values[i] = PinnableSlice(); + } + + db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0], + /*timestamps=*/nullptr, &statuses[0], true); + + // Confirm correct status results + uint32_t number_not_found = 0; + for (uint32_t i = 0; i < Q; ++i) { + if (((start + i) % 2) == 0) { + ASSERT_OK(statuses[i]); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + ++number_not_found; + } + } + + // Confirm correct Bloom stats (might see some FPs) + uint64_t filter_useful = TestGetAndResetTickerCount( + options, + use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); + uint64_t filter_checked = + TestGetAndResetTickerCount(options, use_prefix_ + ? BLOOM_FILTER_PREFIX_CHECKED + : BLOOM_FILTER_FULL_POSITIVE) + + (use_prefix_ ? 
0 : filter_useful); + EXPECT_GE(filter_useful, number_not_found - 2); // possible FP + EXPECT_EQ(filter_checked, Q); + if (!use_prefix_) { + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + Q - number_not_found); + } + + // Confirm no duplicate loading of same filter partition + uint64_t filter_accesses = + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + + TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + if (filter_accesses == 2) { + // Spanned across partitions. + ++found_spanning; + if (found_spanning >= 2) { + break; + } else { + // Ensure that at least once we have at least one present and + // one non-present key on both sides of partition boundary. + start += 2; + } + } else { + EXPECT_EQ(filter_accesses, 1); + // See explanation at "start += 2" + start += Q - 4; + } + } + EXPECT_TRUE(found_spanning >= 2); +} + +INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer, + DBBloomFilterTestVaryPrefixAndFormatVer, + ::testing::Values( + // (use_prefix, format_version) + std::make_tuple(false, 2), + std::make_tuple(false, 3), + std::make_tuple(false, 4), + std::make_tuple(false, 5), + std::make_tuple(true, 2), + std::make_tuple(true, 3), + std::make_tuple(true, 4), + std::make_tuple(true, 5))); + #ifndef ROCKSDB_LITE namespace { namespace BFP2 { @@ -1229,9 +1987,9 @@ snprintf(buf, sizeof(buf), "%02d______:end", 10); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, - nullptr); // move to level 1 + ASSERT_OK(dbtest->Flush()); + ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr)); // move to level 1 // GROUP 1 for (int i = 1; i <= small_range_sstfiles; i++) { @@ -1343,27 +2101,26 @@ for (int i = 0; i < numkeys; i += 2) { keys.push_back(i); } - std::random_shuffle(std::begin(keys), std::end(keys)); - + RandomShuffle(std::begin(keys), std::end(keys)); int 
num_inserted = 0; for (int key : keys) { ASSERT_OK(Put(1, Key(key), "val")); if (++num_inserted % 1000 == 0) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0, 1) == 0) { // No Level 0 file. Create one. ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } for (int i = 1; i < numkeys; i += 2) { @@ -1468,7 +2225,8 @@ BottommostLevelCompaction::kSkip; compact_options.change_level = true; compact_options.target_level = 7; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) + .IsNotSupported()); ASSERT_EQ(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); @@ -1500,10 +2258,10 @@ int CountIter(std::unique_ptr& iter, const Slice& key) { int count = 0; - for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); - iter->Next()) { + for (iter->Seek(key); iter->Valid(); iter->Next()) { count++; } + EXPECT_OK(iter->status()); return count; } @@ -1516,6 +2274,7 @@ int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; + options.env = CurrentOptions().env; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); @@ -1532,7 +2291,7 @@ ASSERT_OK(Put("abcdxxx1", "val2")); ASSERT_OK(Put("abcdxxx2", "val3")); ASSERT_OK(Put("abcdxxx3", "val4")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // prefix_extractor has not changed, BF will always be read 
Slice upper_bound("abce"); @@ -1553,8 +2312,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.FixedPrefix.5")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.FixedPrefix.5"); { // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read Slice upper_bound("abce"); @@ -1646,6 +2405,7 @@ for (auto bfp_impl : BFP::kAllFixedImpls) { int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -1672,8 +2432,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 2); @@ -1689,7 +2449,7 @@ ASSERT_OK(Put("foo4", "bar4")); ASSERT_OK(Put("foq5", "bar5")); ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is cappped:3 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -1706,14 +2466,14 @@ } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); // third SST with fixed:2 BF ASSERT_OK(Put("foo6", "bar6")); ASSERT_OK(Put("foo7", "bar7")); ASSERT_OK(Put("foq8", "bar8")); 
ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is fixed:2 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -1754,8 +2514,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); { std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 6); @@ -1795,9 +2555,8 @@ // create a new CF and set prefix_extractor dynamically options.prefix_extractor.reset(NewCappedPrefixTransform(3)); CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); ASSERT_OK(Put(2, "foo3", "bar3")); ASSERT_OK(Put(2, "foo4", "bar4")); ASSERT_OK(Put(2, "foo5", "bar5")); @@ -1813,9 +2572,8 @@ } ASSERT_OK( dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); + ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(), + "rocksdb.FixedPrefix.2"); { std::unique_ptr iter( db_->NewIterator(read_options, handles_[2])); @@ -1824,10 +2582,10 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); - dbfull()->DestroyColumnFamilyHandle(handles_[2]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2])); handles_[2] = nullptr; ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + 
ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; iteration++; } @@ -1838,6 +2596,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { for (auto bfp_impl : BFP::kAllFixedImpls) { Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -1879,8 +2638,8 @@ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); + ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), + "rocksdb.CappedPrefix.3"); { std::unique_ptr iter(db_->NewIterator(read_options)); // "fp*" should be skipped @@ -1899,6 +2658,55 @@ } } +TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { + Options options = CurrentOptions(); + constexpr size_t kNumKeys = 10000; + static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000"); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeys + 10)); + options.create_if_missing = true; + constexpr size_t kPrefixLength = 4; + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(50)); + bbto.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + bbto.block_size = 128; + bbto.metadata_block_size = 128; + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const std::string value(64, '\0'); + + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t i = 0; i < kNumKeys; ++i) { + std::ostringstream oss; + oss << std::setfill('0') << 
std::setw(4) << std::fixed << i; + ASSERT_OK(db_->Put(write_opts, oss.str(), value)); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + // Use legacy, implicit prefix seek + read_opts.total_order_seek = false; + read_opts.auto_prefix_mode = false; + std::unique_ptr it(db_->NewIterator(read_opts)); + for (size_t i = 0; i < kNumKeys; ++i) { + // Seek with a key after each one added but with same prefix. One will + // surely cross a partition boundary. + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a"; + it->SeekForPrev(oss.str()); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + } + it.reset(); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_filter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -21,7 +21,8 @@ class DBTestCompactionFilter : public DBTestBase { public: - DBTestCompactionFilter() : DBTestBase("/db_compaction_filter_test") {} + DBTestCompactionFilter() + : DBTestBase("db_compaction_filter_test", /*env_do_fsync=*/true) {} }; // Param variant of DBTestBase::ChangeCompactOptions @@ -41,11 +42,11 @@ option_config_ == kUniversalSubcompactions) { assert(options.max_subcompactions > 1); } - TryReopen(options); + Reopen(options); } }; -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) INSTANTIATE_TEST_CASE_P( CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam, ::testing::Values(DBTestBase::OptionConfig::kDefault, @@ -54,11 +55,11 @@ DBTestBase::OptionConfig::kLevelSubcompactions, DBTestBase::OptionConfig::kUniversalSubcompactions)); #else -// Run fewer cases in valgrind +// Run fewer cases in 
non-full valgrind to save time. INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam, ::testing::Values(DBTestBase::OptionConfig::kDefault)); -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) class KeepFilter : public CompactionFilter { public: @@ -81,6 +82,11 @@ return true; } + bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const override { + return true; + } + const char* Name() const override { return "DeleteFilter"; } }; @@ -126,22 +132,6 @@ const char* Name() const override { return "DeleteFilter"; } }; -class DelayFilter : public CompactionFilter { - public: - explicit DelayFilter(DBTestBase* d) : db_test(d) {} - bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, - std::string* /*new_value*/, - bool* /*value_changed*/) const override { - db_test->env_->addon_time_.fetch_add(1000); - return true; - } - - const char* Name() const override { return "DelayFilter"; } - - private: - DBTestBase* db_test; -}; - class ConditionalFilter : public CompactionFilter { public: explicit ConditionalFilter(const std::string* filtered_value) @@ -205,18 +195,36 @@ bool compaction_filter_created_; }; +// This filter factory is configured with a `TableFileCreationReason`. Only +// table files created for that reason will undergo filtering. This +// configurability makes it useful to tests for filtering non-compaction table +// files, such as "CompactionFilterFlush" and "CompactionFilterRecovery". 
class DeleteFilterFactory : public CompactionFilterFactory { public: + explicit DeleteFilterFactory(TableFileCreationReason reason) + : reason_(reason) {} + std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override { - if (context.is_manual_compaction) { - return std::unique_ptr(new DeleteFilter()); - } else { + EXPECT_EQ(reason_, context.reason); + if (context.reason == TableFileCreationReason::kCompaction && + !context.is_manual_compaction) { + // Table files created by automatic compaction do not undergo filtering. + // Presumably some tests rely on this. return std::unique_ptr(nullptr); } + return std::unique_ptr(new DeleteFilter()); + } + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + return reason_ == reason; } const char* Name() const override { return "DeleteFilterFactory"; } + + private: + const TableFileCreationReason reason_; }; // Delete Filter Factory which ignores snapshots @@ -248,20 +256,6 @@ const char* Name() const override { return "SkipEvenFilterFactory"; } }; -class DelayFilterFactory : public CompactionFilterFactory { - public: - explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& /*context*/) override { - return std::unique_ptr(new DelayFilter(db_test)); - } - - const char* Name() const override { return "DelayFilterFactory"; } - - private: - DBTestBase* db_test; -}; - class ConditionalFilterFactory : public CompactionFilterFactory { public: explicit ConditionalFilterFactory(const Slice& filtered_value) @@ -305,7 +299,7 @@ for (int i = 0; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } ASSERT_OK(Flush(1)); @@ -313,10 +307,10 @@ // the compaction is each level invokes the filter for // all the keys in that level. 
cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -336,19 +330,21 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); + ReadOptions read_options; ScopedArenaIterator iter(dbfull()->NewInternalIterator( - &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); + read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); total++; if (ikey.sequence != 0) { count++; } iter->Next(); } + ASSERT_OK(iter->status()); } ASSERT_EQ(total, 100000); ASSERT_EQ(count, 0); @@ -365,10 +361,10 @@ // means that all keys should pass at least once // via the compaction filter cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); @@ -376,7 +372,8 @@ // create a new database with the compaction // filter in such a way that it deletes all keys - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + 
TableFileCreationReason::kCompaction); options.create_if_missing = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -397,10 +394,10 @@ // verify that at the end of the compaction process, // nothing is left. cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 0); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); @@ -415,6 +412,7 @@ count++; iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 0); } @@ -426,13 +424,14 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); + ReadOptions read_options; ScopedArenaIterator iter(dbfull()->NewInternalIterator( - &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); + read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_NE(ikey.sequence, (unsigned)0); count++; iter->Next(); @@ -446,7 +445,8 @@ // entries in VersionEdit, but none of the 'AddFile's. 
TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { Options options = CurrentOptions(); - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + TableFileCreationReason::kCompaction); options.disable_auto_compactions = true; options.create_if_missing = true; DestroyAndReopen(options); @@ -454,9 +454,9 @@ // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); } // this will produce empty file (delete compaction filter) @@ -467,6 +467,7 @@ Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); + ASSERT_OK(itr->status()); // empty db ASSERT_TRUE(!itr->Valid()); @@ -474,6 +475,64 @@ } #endif // ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilterFlush) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by flush. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kFlush); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + +TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by recovery. 
+ Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kRecovery); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + TEST_P(DBTestCompactionFilterWithCompactParam, CompactionFilterWithValueChange) { Options options = CurrentOptions(); @@ -490,25 +549,25 @@ for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // re-write all data again for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels. 
This should @@ -516,11 +575,11 @@ ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // verify that all keys now have the new value that @@ -558,7 +617,7 @@ ASSERT_OK(Flush()); std::string newvalue = Get("foo"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foo"); ASSERT_EQ(newvalue, three); @@ -566,12 +625,12 @@ // merge keys. 
ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ("NOT_FOUND", newvalue); ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ(two, two); @@ -582,7 +641,7 @@ ASSERT_OK(Flush()); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); @@ -595,7 +654,7 @@ ASSERT_OK(Flush()); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); } @@ -617,21 +676,21 @@ for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(key, value); + ASSERT_OK(Put(key, value)); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Make sure next file is much smaller so automatic compaction will not // be triggered. 
num_keys_per_file /= 2; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Force a manual compaction cfilter_count = 0; filter->expect_manual_compaction_.store(true); filter->expect_full_compaction_.store(true); filter->expect_cf_id_.store(0); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(cfilter_count, 700); ASSERT_EQ(NumSortedRuns(0), 1); ASSERT_TRUE(filter->compaction_filter_created()); @@ -644,13 +703,14 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* snapshots */); + ReadOptions read_options; ScopedArenaIterator iter(dbfull()->NewInternalIterator( - &arena, &range_del_agg, kMaxSequenceNumber)); + read_options, &arena, &range_del_agg, kMaxSequenceNumber)); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); total++; if (ikey.sequence != 0) { count++; @@ -680,14 +740,14 @@ for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } - Flush(1); + ASSERT_OK(Flush(1)); // Make sure next file is much smaller so automatic compaction will not // be triggered. 
num_keys_per_file /= 2; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(filter->compaction_filter_created()); } @@ -706,9 +766,9 @@ const Snapshot* snapshot = nullptr; for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); if (table == 0) { snapshot = db_->GetSnapshot(); @@ -728,6 +788,7 @@ read_options.snapshot = snapshot; std::unique_ptr iter(db_->NewIterator(read_options)); iter->SeekToFirst(); + ASSERT_OK(iter->status()); int count = 0; while (iter->Valid()) { count++; @@ -736,6 +797,7 @@ ASSERT_EQ(count, 6); read_options.snapshot = nullptr; std::unique_ptr iter1(db_->NewIterator(read_options)); + ASSERT_OK(iter1->status()); iter1->SeekToFirst(); count = 0; while (iter1->Valid()) { @@ -766,9 +828,9 @@ for (int i = table * 6; i < 39 + table * 11; ++i) { char key[100]; snprintf(key, sizeof(key), "%010d", table * 100 + i); - Put(key, std::to_string(table * 1000 + i)); + ASSERT_OK(Put(key, std::to_string(table * 1000 + i))); } - Flush(); + ASSERT_OK(Flush()); } cfilter_skips = 0; @@ -807,10 +869,10 @@ options.create_if_missing = true; DestroyAndReopen(options); - Put("0000000010", "v10"); - Put("0000000020", "v20"); // skipped - Put("0000000050", "v50"); - Flush(); + ASSERT_OK(Put("0000000010", "v10")); + ASSERT_OK(Put("0000000020", "v20")); // skipped + ASSERT_OK(Put("0000000050", "v50")); + ASSERT_OK(Flush()); cfilter_skips = 0; EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -848,13 +910,13 @@ options.compaction_filter = new TestNotSupportedFilter(); DestroyAndReopen(options); - Put("a", "v10"); - Put("z", "v20"); - Flush(); - - Put("a", "v10"); - Put("z", "v20"); - Flush(); + ASSERT_OK(Put("a", "v10")); + ASSERT_OK(Put("z", "v20")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("a", "v10")); + ASSERT_OK(Put("z", "v20")); + ASSERT_OK(Flush()); // 
Comapction should fail because IgnoreSnapshots() = false EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr) @@ -863,6 +925,49 @@ delete options.compaction_filter; } +class TestNotSupportedFilterFactory : public CompactionFilterFactory { + public: + explicit TestNotSupportedFilterFactory(TableFileCreationReason reason) + : reason_(reason) {} + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + return reason_ == reason; + } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /* context */) override { + return std::unique_ptr(new TestNotSupportedFilter()); + } + + const char* Name() const override { return "TestNotSupportedFilterFactory"; } + + private: + const TableFileCreationReason reason_; +}; + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) { + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kFlush); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(Flush().IsNotSupported()); +} + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) { + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kRecovery); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_compaction_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,16 +7,23 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include + +#include "db/blob/blob_index.h" #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" #include "rocksdb/sst_file_writer.h" #include "rocksdb/utilities/convenience.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" +#include "test_util/testutil.h" #include "util/concurrent_task_limiter_impl.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -25,14 +32,16 @@ class DBCompactionTest : public DBTestBase { public: - DBCompactionTest() : DBTestBase("/db_compaction_test") {} + DBCompactionTest() + : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {} }; class DBCompactionTestWithParam : public DBTestBase, public testing::WithParamInterface> { public: - DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") { + DBCompactionTestWithParam() + : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { max_subcompactions_ = std::get<0>(GetParam()); exclusive_manual_compaction_ = std::get<1>(GetParam()); } @@ -45,12 +54,34 @@ bool exclusive_manual_compaction_; }; +class DBCompactionTestWithBottommostParam + : public DBTestBase, + public testing::WithParamInterface { + public: + DBCompactionTestWithBottommostParam() + : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { + bottommost_level_compaction_ = GetParam(); + } + + BottommostLevelCompaction bottommost_level_compaction_; +}; + class DBCompactionDirectIOTest : public DBCompactionTest, public ::testing::WithParamInterface { public: DBCompactionDirectIOTest() : DBCompactionTest() {} }; +// Param = true : target level is non-empty +// Param = false: level between target level and source level +// is not empty. 
+class ChangeLevelConflictsWithAuto + : public DBCompactionTest, + public ::testing::WithParamInterface { + public: + ChangeLevelConflictsWithAuto() : DBCompactionTest() {} +}; + namespace { class FlushedFileCollector : public EventListener { @@ -151,27 +182,28 @@ options.target_file_size_base * options.target_file_size_multiplier; options.max_bytes_for_level_multiplier = 2; options.disable_auto_compactions = false; + options.compaction_options_universal.max_size_amplification_percent = 100; return options; } bool HaveOverlappingKeyRanges( const Comparator* c, const SstFileMetaData& a, const SstFileMetaData& b) { - if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { - if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) { + if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { // b.smallestkey <= a.smallestkey <= b.largestkey return true; } - } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { // a.smallestkey < b.smallestkey <= a.largestkey return true; } - if (c->Compare(a.largestkey, b.largestkey) <= 0) { - if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) { + if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) { // b.smallestkey <= a.largestkey <= b.largestkey return true; } - } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { // a.smallestkey <= b.largestkey < a.largestkey return true; } @@ -226,7 +258,7 @@ const CompactionStatsCollector& collector) { #ifndef NDEBUG InternalStats* internal_stats_ptr = cfd.internal_stats(); - ASSERT_TRUE(internal_stats_ptr != nullptr); + ASSERT_NE(internal_stats_ptr, nullptr); const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); const int num_of_reasons = 
static_cast(CompactionReason::kNumOfReasons); @@ -270,7 +302,7 @@ } } // anonymous namespace -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) // All the TEST_P tests run once with sub_compactions disabled (i.e. // options.max_subcompactions = 1) and once with it enabled TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { @@ -295,25 +327,47 @@ const int kTestSize = kCDTKeysPerBuffer * 1024; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); - // must have much smaller db size. - ASSERT_GT(db_size[0] / 3, db_size[1]); + if (options.compaction_style == kCompactionStyleUniversal) { + // Claim: in universal compaction none of the original data will remain + // once compactions settle. + // + // Proof: The compensated size of the file containing the most tombstones + // is enough on its own to trigger size amp compaction. Size amp + // compaction is a full compaction, so all tombstones meet the obsolete + // keys they cover. + ASSERT_EQ(0, db_size[1]); + } else { + // Claim: in level compaction at most `db_size[0] / 2` of the original + // data will remain once compactions settle. + // + // Proof: Assume the original data is all in the bottom level. 
If it were + // not, it would meet its tombstone sooner. The original data size is + // large enough to require fanout to bottom level to be greater than + // `max_bytes_for_level_multiplier == 2`. In the level just above, + // tombstones must cover less than `db_size[0] / 4` bytes since fanout >= + // 2 and file size is compensated by doubling the size of values we expect + // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in + // levels above must cover less than `db_size[0] / 8` bytes of original + // data, `db_size[0] / 16`, and so on. + ASSERT_GT(db_size[0] / 2, db_size[1]); + } } } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { // For each options type we test following @@ -343,7 +397,7 @@ const int kTestSize = kCDTKeysPerBuffer; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } @@ -357,8 +411,9 @@ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_TRUE( + dbfull()->CompactRange(cro, nullptr, nullptr).IsInvalidArgument()); // check that normal user iterator doesn't see anything Iterator* db_iter = dbfull()->NewIterator(ReadOptions()); @@ -366,6 +421,7 @@ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; } + ASSERT_OK(db_iter->status()); ASSERT_EQ(i, 0); delete db_iter; @@ -373,6 +429,7 @@ ReadOptions ro; ro.iter_start_seqnum=1; db_iter = dbfull()->NewIterator(ro); + ASSERT_OK(db_iter->status()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -382,9 +439,10 @@ // now all deletes should be gone SetPreserveDeletesSequenceNumber(100000000); 
- dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); db_iter = dbfull()->NewIterator(ro); + ASSERT_TRUE(db_iter->status().IsInvalidArgument()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -408,7 +466,7 @@ const int kTestSize = kCDTKeysPerBuffer * 512; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } @@ -446,6 +504,10 @@ options.new_table_reader_for_compaction_inputs = true; options.max_open_files = 20; options.level0_file_num_compaction_trigger = 3; + // Avoid many shards with small max_open_files, where as little as + // two table insertions could lead to an LRU eviction, depending on + // hash values. + options.table_cache_numshardbits = 2; DestroyAndReopen(options); Random rnd(301); @@ -470,8 +532,8 @@ ASSERT_OK(Put(Key(10 - k), "bar")); if (k < options.level0_file_num_compaction_trigger - 1) { num_table_cache_lookup = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // preloading iterator issues one table cache lookup and create // a new table reader, if not preloaded. int old_num_table_cache_lookup = num_table_cache_lookup; @@ -489,8 +551,8 @@ num_table_cache_lookup = 0; num_new_table_reader = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Preloading iterator issues one table cache lookup and creates // a new table reader. One file is created for flush and one for compaction. 
// Compaction inputs make no table cache look-up for data/range deletion @@ -517,7 +579,7 @@ cro.change_level = true; cro.target_level = 2; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // Only verifying compaction outputs issues one table cache lookup // for both data block and range deletion block). // May preload table cache too. @@ -555,12 +617,12 @@ const int kTestSize = kCDTKeysPerBuffer * 512; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -571,11 +633,10 @@ for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. 
@@ -585,14 +646,86 @@ for (int k = 0; k < kTestSize / 10; ++k) { ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); // this time we're expecting significant drop in size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. In addition to that, this + // test inserts `db_size[0] / 10` to push the tombstones into SST files and + // then through automatic compactions. So in total `3 * db_size[0] / 5` of + // the original data may remain. + ASSERT_GT(3 * db_size[0] / 5, db_size[2]); } } +TEST_F(DBCompactionTest, CompactRangeBottomPri) { + ASSERT_OK(Put(Key(50), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(100), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(200), "")); + ASSERT_OK(Flush()); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,3", FilesPerLevel(0)); + + ASSERT_OK(Put(Key(1), "")); + ASSERT_OK(Put(Key(199), "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(2), "")); + ASSERT_OK(Put(Key(199), "")); + ASSERT_OK(Flush()); + ASSERT_EQ("2,0,3", FilesPerLevel(0)); + + // Now we have 2 L0 files, and 3 L2 files, and a manual compaction will + // be triggered. + // Two compaction jobs will run. One compacts 2 L0 files in Low Pri Pool + // and one compact to L2 in bottom pri pool. + int low_pri_count = 0; + int bottom_pri_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) { + Env::Priority* pri = reinterpret_cast(arg); + // First time is low pri pool in the test case. 
+ if (low_pri_count == 0 && bottom_pri_count == 0) { + ASSERT_EQ(Env::Priority::LOW, *pri); + } + if (*pri == Env::Priority::LOW) { + low_pri_count++; + } else { + bottom_pri_count++; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(1, low_pri_count); + ASSERT_EQ(1, bottom_pri_count); + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // Recompact bottom most level uses bottom pool + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ(1, low_pri_count); + ASSERT_EQ(2, bottom_pri_count); + + env_->SetBackgroundThreads(0, Env::Priority::BOTTOM); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + // Low pri pool is used if bottom pool has size 0. + ASSERT_EQ(2, low_pri_count); + ASSERT_EQ(2, bottom_pri_count); + + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { uint64_t db_size[3]; for (int test = 0; test < 2; ++test) { @@ -607,12 +740,19 @@ const int kTestSize = kCDTKeysPerBuffer * 512; std::vector values; for (int k = 0; k < kTestSize; ++k) { - values.push_back(RandomString(&rnd, kCDTValueSize)); + values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // L1 and L2 can fit deletions iff size compensation does not take effect, + // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining + // files at or above L2 down to L3 to ensure obsolete data does not + // accidentally meet its tombstone above L3. 
This makes the final size more + // deterministic and easy to see whether size compensation for deletions + // took effect. + MoveFilesToLevel(3 /* level */); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -625,27 +765,33 @@ for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. options.disable_auto_compactions = false; Reopen(options); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); if (options.skip_stats_update_on_db_open) { // If update stats on DB::Open is disable, we don't expect // deletion entries taking effect. - ASSERT_LT(db_size[0] / 3, db_size[2]); + // + // The deletions are small enough to fit in L1 and L2, and obsolete keys + // were moved to L3+, so none of the original data should have been + // dropped. + ASSERT_LE(db_size[0], db_size[2]); } else { // Otherwise, we should see a significant drop in db size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. 
+ ASSERT_GT(db_size[0] / 2, db_size[2]); } } } @@ -660,7 +806,8 @@ options.num_levels = 3; options.level0_file_num_compaction_trigger = 3; options.max_subcompactions = max_subcompactions_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -670,24 +817,24 @@ std::vector values; // Write 100KB (100 values, each 1K) for (int i = 0; i < kNumKeysPerFile; i++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(1, Key(i), values[i])); } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } // generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < kNumKeysPerFile; i++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(1, Key(i), values[i])); } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); @@ -707,7 +854,8 @@ options.level0_slowdown_writes_trigger = 20; options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large options.max_background_compactions = 3; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); // Block all threads in thread pool. 
const size_t kTotalTasks = 4; @@ -729,7 +877,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -746,7 +894,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(2, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 2)); } @@ -757,7 +905,7 @@ sleeping_tasks[i].WakeUp(); sleeping_tasks[i].WaitUntilDone(); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify number of compactions allowed will come back to 1. @@ -774,7 +922,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -801,14 +949,14 @@ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); + values.push_back(rnd.RandomString(100000)); ASSERT_OK(Put(1, Key(i), values[i])); } // Reopening moves updates to level-0 ReopenWithColumnFamilies({"default", "pikachu"}, options); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); @@ -852,27 +1000,27 @@ DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - Delete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + 
ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } @@ -885,31 +1033,85 @@ DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - SingleDelete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(SingleDelete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } +TEST_F(DBCompactionTest, CompactionSstPartitioner) { + Options options = CurrentOptions(); + 
options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(4)); + options.sst_partitioner_factory = factory; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("aaaa1", "A2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // move both files down to l1 + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A2", Get("aaaa1")); + ASSERT_EQ("B", Get("bbbb1")); +} + +TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 1; + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(4)); + options.sst_partitioner_factory = factory; + + DestroyAndReopen(options); + + // create first file and flush to l0 + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A", Get("aaaa1")); + ASSERT_EQ("B", Get("bbbb1")); +} + TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleLevel; @@ -931,22 +1133,23 @@ // create first file and flush to l0 for (auto& key : {"1", "2", "3", "3", "3", "3"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + 
ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // create second file and flush to l0 for (auto& key : {"3", "4", "5", "6", "7", "8"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // move both files down to l1 - dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1); + ASSERT_OK( + dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1)); // release snap so that first instance of key(3) can have seqId=0 for (auto snap : snaps) { @@ -955,12 +1158,12 @@ // create 3 files in l0 so to trigger compaction for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { - Put("2", std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(Put("", "")); } @@ -975,12 +1178,12 @@ for (int i = 0; i < 2; ++i) { for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) { // make l0 files' ranges overlap to avoid trivial move - Put(std::to_string(2 * i), std::string(1, 'A')); - Put(std::to_string(2 * i + 1), std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A'))); + ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1); } @@ -996,7 +1199,7 @@ // note CompactionOptions::output_file_size_limit is unset. 
CompactionOptions compact_opt; compact_opt.compression = kNoCompression; - dbfull()->CompactFiles(compact_opt, input_filenames, 1); + ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1)); } // Check that writes done during a memtable compaction are recovered @@ -1039,7 +1242,7 @@ Random rnd(301); std::vector values; for (int i = 0; i < num_keys; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(Key(i), values[i])); } @@ -1057,7 +1260,7 @@ cro.exclusive_manual_compaction = exclusive_manual_compaction_; // Compaction will initiate a trivial move from L0 to L1 - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); // File moved From L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0 @@ -1111,7 +1314,7 @@ std::map values; for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } ASSERT_OK(Flush()); @@ -1126,7 +1329,7 @@ // Since data is non-overlapping we expect compaction to initiate // a trivial move - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // We expect that all the files were trivially moved from L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files); @@ -1157,13 +1360,13 @@ }; for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } ASSERT_OK(Flush()); } - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= 
ranges[i].second; j++) { @@ -1202,14 +1405,14 @@ // file 1 [0 => 300] for (int32_t i = 0; i <= 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [600 => 700] for (int32_t i = 600; i <= 700; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1283,14 +1486,14 @@ // file 1 [0 => 100] for (int32_t i = 0; i < 100; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [100 => 300] for (int32_t i = 100; i < 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1311,7 +1514,7 @@ // file 3 [ 0 => 200] for (int32_t i = 0; i < 200; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1343,21 +1546,21 @@ TEST_SYNC_POINT("DBCompaction::ManualPartial:1"); // file 4 [300 => 400) for (int32_t i = 300; i <= 400; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 5 [400 => 500) for (int32_t i = 400; i <= 500; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 6 [500 => 600) for (int32_t i = 500; i <= 600; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } // Second non-trivial compaction is triggered @@ -1367,8 +1570,8 @@ ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0)); TEST_SYNC_POINT("DBCompaction::ManualPartial:5"); - 
dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // After two non-trivial compactions are installed, there is 1 file in L6, and // 1 file in L1 ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0)); @@ -1425,14 +1628,14 @@ // file 1 [0 => 100] for (int32_t i = 0; i < 100; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [100 => 300] for (int32_t i = 100; i < 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1451,7 +1654,7 @@ // file 3 [ 0 => 200] for (int32_t i = 0; i < 200; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1481,9 +1684,9 @@ for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } } @@ -1497,8 +1700,8 @@ } TEST_SYNC_POINT("DBCompaction::PartialFill:2"); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); threads.join(); for (int32_t i = 0; i < 4300; i++) { @@ -1516,12 +1719,12 @@ Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Flush()); - Put("bar", "v1"); + ASSERT_OK(Put("bar", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("foo", "v2"); }); + port::Thread writer([&]() { 
ASSERT_OK(Put("foo", "v2")); }); TEST_SYNC_POINT( "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"); @@ -1554,14 +1757,14 @@ // file 1 [0 => 100] for (int32_t i = 0; i < 100; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); // file 2 [100 => 300] for (int32_t i = 100; i < 300; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1577,7 +1780,7 @@ // file 3 [ 0 => 200] for (int32_t i = 0; i < 200; i++) { - values[i] = RandomString(&rnd, value_size); + values[i] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1587,15 +1790,15 @@ for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } } ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify level sizes uint64_t target_size = 4 * options.max_bytes_for_level_base; @@ -1605,7 +1808,7 @@ options.max_bytes_for_level_multiplier); } - size_t old_num_files = CountFiles(); + const size_t old_num_files = CountFiles(); std::string begin_string = Key(1000); std::string end_string = Key(2000); Slice begin(begin_string); @@ -1640,7 +1843,7 @@ compact_options.change_level = true; compact_options.target_level = 1; ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK( DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); @@ -1649,12 +1852,11 @@ for (int32_t i = 0; 
i < 4300; i++) { ReadOptions roptions; std::string result; - Status s = db_->Get(roptions, Key(i), &result); - ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound()); deleted_count2++; } ASSERT_GT(deleted_count2, deleted_count); - size_t new_num_files = CountFiles(); + const size_t new_num_files = CountFiles(); ASSERT_GT(old_num_files, new_num_files); } @@ -1676,7 +1878,7 @@ for (auto i = 0; i < 10; i++) { for (auto j = 0; j < 100; j++) { auto k = i * 100 + j; - values[k] = RandomString(&rnd, value_size); + values[k] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(k), values[k])); } ASSERT_OK(Flush()); @@ -1808,15 +2010,15 @@ // would cause `1 -> vals[0]` (an older key) to reappear. std::string vals[kNumL0Files]; for (int i = 0; i < kNumL0Files; ++i) { - vals[i] = RandomString(&rnd, kValSize); - Put(Key(i), vals[i]); - Put(Key(i + 1), vals[i]); - Flush(); + vals[i] = rnd.RandomString(kValSize); + ASSERT_OK(Put(Key(i), vals[i])); + ASSERT_OK(Put(Key(i + 1), vals[i])); + ASSERT_OK(Flush()); if (i == 0) { snapshot = db_->GetSnapshot(); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify `DeleteFilesInRange` can't drop only file 0 which would cause // "1 -> vals[0]" to reappear. 
@@ -1850,7 +2052,7 @@ std::vector values; // File with keys [ 0 => 99 ] for (int i = 0; i < 100; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1868,7 +2070,7 @@ // File with keys [ 100 => 199 ] for (int i = 100; i < 200; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Flush()); @@ -1895,7 +2097,7 @@ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -1903,16 +2105,8 @@ options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. 
- for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2012,7 +2206,7 @@ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -2020,16 +2214,8 @@ options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2130,7 +2316,7 @@ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -2149,7 +2335,7 @@ option_vector.emplace_back(DBOptions(options), cf_opt1); CreateColumnFamilies({"one"},option_vector[1]); - // Configura CF2 specific paths. + // Configure CF2 specific paths. 
cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024); cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024); cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024); @@ -2204,13 +2390,16 @@ // Check that default column family uses db_paths. // And Column family "one" uses cf_paths. - // First three 110KB files are not going to second path. - // After that, (100K, 200K) + // The compaction in level0 outputs the sst files in level1. + // The first path cannot hold level1's data(400KB+400KB > 500KB), + // so every compaction move a sst file to second path. Please + // refer to LevelCompactionBuilder::GetPathId. for (int num = 0; num < 3; num++) { generate_file(); } + check_sstfilecount(0, 1); + check_sstfilecount(1, 2); - // Another 110KB triggers a compaction to 400K file to fill up first path generate_file(); check_sstfilecount(1, 3); @@ -2263,10 +2452,10 @@ for (int i = 0; i <= max_key_level_insert; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(TotalTableFiles(1, 4), 1); int non_level0_num_files = 0; @@ -2302,7 +2491,8 @@ compact_options.bottommost_level_compaction = BottommostLevelCompaction::kForce; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); // Only 1 file in L0 ASSERT_EQ("1", FilesPerLevel(1)); @@ -2321,11 +2511,11 @@ ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } - dbfull()->Flush(FlushOptions()); + 
ASSERT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 1; i < options.num_levels; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -2335,6 +2525,7 @@ // compaction style std::string keys_in_db; Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { keys_in_db.append(iter->key().ToString()); keys_in_db.push_back(','); @@ -2372,24 +2563,24 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "e"); - Put(1, "", ""); + ASSERT_OK(Delete(1, "e")); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "c", "cv"); + ASSERT_OK(Put(1, "c", "cv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "d", "dv"); + ASSERT_OK(Put(1, "d", "dv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "d"); - Delete(1, "b"); + ASSERT_OK(Delete(1, "d")); + ASSERT_OK(Delete(1, "b")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(->)(c->cv)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish @@ -2406,34 +2597,35 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put(1, "foo", ""); - Put(1, "bar", ""); 
- Flush(1); - Put(1, "foo", ""); - Put(1, "bar", ""); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); // Generate four files in CF 0, which should trigger an auto compaction - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); // The auto compaction is scheduled but waited until here TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); // The auto compaction will wait until the manual compaction is registerd // before processing so that it will be cancelled. - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); ASSERT_EQ("0,1", FilesPerLevel(1)); // Eventually the cancelled compaction will be rescheduled and executed. 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -2459,7 +2651,7 @@ ASSERT_EQ("1,1,1", FilesPerLevel(1)); // Compaction range overlaps files - Compact(1, "p1", "p9"); + Compact(1, "p", "q"); ASSERT_EQ("0,0,1", FilesPerLevel(1)); // Populate a different range @@ -2478,7 +2670,7 @@ options.statistics->getTickerCount(BLOCK_CACHE_ADD); CompactRangeOptions cro; cro.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(cro, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); // Verify manual compaction doesn't fill block cache ASSERT_EQ(prev_block_cache_add, options.statistics->getTickerCount(BLOCK_CACHE_ADD)); @@ -2526,7 +2718,7 @@ ASSERT_EQ("3", FilesPerLevel(1)); // Compaction range overlaps files - Compact(1, "p1", "p9", 1); + Compact(1, "p", "q", 1); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -2559,7 +2751,8 @@ CompactRangeOptions compact_options; compact_options.target_path_id = 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); @@ -2616,10 +2809,10 @@ Random rnd(301); for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(1, ToString(key), rnd.RandomString(kTestValueSize))); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyMetaData cf_meta; 
dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); @@ -2692,13 +2885,13 @@ std::vector keys; std::vector values; for (int k = 0; k < kNumInsertedKeys; ++k) { - keys.emplace_back(RandomString(&rnd, kKeySize)); - values.emplace_back(RandomString(&rnd, kKvSize - kKeySize)); + keys.emplace_back(rnd.RandomString(kKeySize)); + values.emplace_back(rnd.RandomString(kKvSize - kKeySize)); ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // Make sure the number of L0 files can trigger compaction. ASSERT_GE(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); @@ -2759,12 +2952,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute L0->L1 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); // block compactions @@ -2781,7 +2974,7 @@ sleeping_task.WaitUntilDone(); // this should execute L1->L2 (move) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); @@ -2794,12 +2987,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->L2 (merge with previous file) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,2", FilesPerLevel(0)); @@ -2807,6 +3000,7 @@ ASSERT_OK(env_->FileExists(dbname_ + moved_file_name)); listener->SetExpectedFileName(dbname_ + 
moved_file_name); + ASSERT_OK(iterator->status()); iterator.reset(); // this file should have been compacted away @@ -2821,7 +3015,7 @@ } Options options = CurrentOptions(); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 110 << 10; // 110KB options.arena_block_size = 4 << 10; @@ -2969,7 +3163,7 @@ for (int num = 0; num < 10; num++) { GenerateNewRandomFile(&rnd); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"CompactionJob::Run():Start", @@ -2990,7 +3184,7 @@ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"); GenerateNewRandomFile(&rnd, /* nowait */ true); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; num++) { @@ -3000,7 +3194,7 @@ TEST_SYNC_POINT( "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } static std::string ShortKey(int i) { @@ -3052,7 +3246,7 @@ std::vector values; // File with keys [ 0 => 99 ] for (int i = 0; i < 100; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -3069,7 +3263,7 @@ // File with keys [ 100 => 199 ] for (int i = 100; i < 200; i++) { - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -3087,7 +3281,7 @@ // File with keys [ 200 => 299 ] for (int i = 200; i < 300; i++) 
{ - values.push_back(RandomString(&rnd, value_size)); + values.push_back(rnd.RandomString(value_size)); ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -3118,14 +3312,28 @@ options.level0_file_num_compaction_trigger = 5; options.max_background_compactions = 2; options.max_subcompactions = max_subcompactions_; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.write_buffer_size = 2 << 20; // 2MB + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(64 << 20); // 64MB + table_options.cache_index_and_filter_blocks = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); const size_t kValueSize = 1 << 20; Random rnd(301); - std::string value(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + // The L0->L1 must be picked before we begin flushing files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompactionBySize:0", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::IntraL0Compaction:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -3143,13 +3351,14 @@ for (int i = 0; i < 10; ++i) { ASSERT_OK(Put(Key(0), "")); // prevents trivial move if (i == 5) { + TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready"); ASSERT_OK(Put(Key(i + 1), value + value)); } else { ASSERT_OK(Put(Key(i + 1), value)); } ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3162,6 +3371,16 @@ for (int i = 0; i < 2; ++i) { ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21); } + + // The index/filter in the file produced by intra-L0 should not be pinned. + // That means clearing unref'd entries in block cache and re-accessing the + // file produced by intra-L0 should bump the index block miss count. + uint64_t prev_index_misses = + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + table_options.block_cache->EraseUnRefEntries(); + ASSERT_EQ("", Get(Key(0))); + ASSERT_EQ(prev_index_misses + 1, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); } TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { @@ -3176,10 +3395,16 @@ const size_t kValueSize = 1 << 20; Random rnd(301); - std::string value(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + // The L0->L1 must be picked before we begin flushing files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompactionBySize:0", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:" + "L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -3203,10 +3428,15 @@ } else { ASSERT_OK(Delete(Key(0))); } + if (i == 5) { + TEST_SYNC_POINT( + "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:" + "L0ToL1Ready"); + } ASSERT_OK(Put(Key(i + 1), value)); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3254,7 +3484,7 @@ int key_idx = 0; GenerateNewFile(&rnd, &key_idx); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, num_bottom_pri_compactions); @@ -3278,8 +3508,8 @@ // So key 0, 2, and 4+ fall outside these levels' key-ranges. for (int level = 2; level >= 1; --level) { for (int i = 0; i < 2; ++i) { - Put(Key(2 * i + 1), "val"); - Flush(); + ASSERT_OK(Put(Key(2 * i + 1), "val")); + ASSERT_OK(Flush()); } MoveFilesToLevel(level); ASSERT_EQ(2, NumTableFilesAtLevel(level)); @@ -3289,11 +3519,11 @@ // - Tombstones for keys 2 and 4 can be dropped early. // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges. 
for (int i = 0; i < kNumL0Files; ++i) { - Put(Key(0), "val"); // sentinel to prevent trivial move - Delete(Key(i + 1)); - Flush(); + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Delete(Key(i + 1))); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNumL0Files; ++i) { std::string value; @@ -3357,10 +3587,10 @@ TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { // Regression test for bug of not pulling in L0 files that overlap the user- // specified input files in time- and key-ranges. - Put(Key(0), "old_val"); - Flush(); - Put(Key(0), "new_val"); - Flush(); + ASSERT_OK(Put(Key(0), "old_val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(0), "new_val")); + ASSERT_OK(Flush()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); @@ -3376,6 +3606,41 @@ ASSERT_EQ("new_val", Get(Key(0))); } +TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + const Snapshot* snapshot = nullptr; + const int kMaxKey = 10; + + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + ASSERT_OK(Delete(Key(i))); + if (!snapshot) { + snapshot = db_->GetSnapshot(); + } + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey))); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // test DeleteFilesInRange() deletes the files already picked for compaction + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply:WriteManifestStart", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Finish", + "VersionSet::LogAndApply:WriteManifestDone"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // release snapshot which mark bottommost file for compaction + db_->ReleaseSnapshot(snapshot); + std::string begin_string = Key(0); + std::string end_string = Key(kMaxKey + 1); + Slice 
begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { // bottom-level files may contain deletions due to snapshots protecting the // deleted keys. Once the snapshot is released, we should see files with many @@ -3395,7 +3660,7 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } if (i == kNumLevelFiles - 1) { snapshot = db_->GetSnapshot(); @@ -3406,12 +3671,12 @@ ASSERT_OK(Delete(Key(j))); } } - Flush(); + ASSERT_OK(Flush()); if (i < kNumLevelFiles - 1) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); std::vector pre_release_metadata, post_release_metadata; @@ -3432,7 +3697,7 @@ CompactionReason::kBottommostFiles); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetLiveFilesMetaData(&post_release_metadata); ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); @@ -3448,6 +3713,76 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) { + // bottom-level files may contain deletions due to snapshots protecting the + // deleted keys. Once the snapshot is released, we should see files with many + // such deletions undergo single-file compactions. But when disabling auto + // compactions, it shouldn't be triggered which may causing too many + // background jobs. 
+ const int kNumKeysPerFile = 1024; + const int kNumLevelFiles = 4; + const int kValueSize = 128; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.level0_file_num_compaction_trigger = kNumLevelFiles; + // inflate it a bit to account for key/metadata overhead + options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100; + Reopen(options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + if (i == kNumLevelFiles - 1) { + snapshot = db_->GetSnapshot(); + // delete every other key after grabbing a snapshot, so these deletions + // and the keys they cover can't be dropped until after the snapshot is + // released. + for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) { + ASSERT_OK(Delete(Key(j))); + } + } + ASSERT_OK(Flush()); + if (i < kNumLevelFiles - 1) { + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } + } + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr)); + ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); + + std::vector pre_release_metadata, post_release_metadata; + db_->GetLiveFilesMetaData(&pre_release_metadata); + // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST + // files does not need to be preserved in case of a future snapshot. + ASSERT_OK(Put(Key(0), "val")); + + // release snapshot and no compaction should be triggered. 
+ std::atomic num_compactions{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void* /*arg*/) { num_compactions.fetch_add(1); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + db_->ReleaseSnapshot(snapshot); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, num_compactions); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + db_->GetLiveFilesMetaData(&post_release_metadata); + ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); + for (size_t i = 0; i < pre_release_metadata.size(); ++i) { + const auto& pre_file = pre_release_metadata[i]; + const auto& post_file = post_release_metadata[i]; + ASSERT_EQ(1, pre_file.level); + ASSERT_EQ(1, post_file.level); + // each file is same as before with deletion markers/deleted keys. + ASSERT_EQ(post_file.size, pre_file.size); + } +} + TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { const int kNumKeysPerFile = 32; const int kNumLevelFiles = 2; @@ -3457,21 +3792,22 @@ options.compression = kNoCompression; options.ttl = 24 * 60 * 60; // 24 hours options.max_open_files = -1; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); Random rnd(301); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3480,44 +3816,45 @@ for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - 
dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); - env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours + env_->MockSleepForSeconds(36 * 60 * 60); // 36 hours ASSERT_EQ("0,2,0,2", FilesPerLevel()); // Just do a simple write + flush so that the Ttl expired files get // compacted. ASSERT_OK(Put("a", "1")); - Flush(); + ASSERT_OK(Flush()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { Compaction* compaction = reinterpret_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); // Test dynamically changing ttl. 
- env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3526,19 +3863,19 @@ for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); // Move time forward by 12 hours, and make sure that compaction still doesn't // trigger as ttl is set to 24 hours. - env_->addon_time_.fetch_add(12 * 60 * 60); + env_->MockSleepForSeconds(12 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,2,0,2", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -3551,13 +3888,14 @@ // Dynamically change ttl to 10 hours. // This should trigger a ttl compaction, as 12 hours have already passed. ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. 
ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { + env_->SetMockSleep(); const int kValueSize = 100; for (bool if_restart : {false, true}) { @@ -3588,10 +3926,10 @@ } }); - env_->time_elapse_only_sleep_ = false; options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int ttl_compactions = 0; @@ -3608,9 +3946,9 @@ // Add two L6 files with key ranges: [1 .. 100], [101 .. 200]. Random rnd(301); for (int i = 1; i <= 100; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Get the first file's creation time. This will be the oldest file in the // DB. Compactions inolving this file's descendents should keep getting // this time. @@ -3619,35 +3957,35 @@ &level_to_files); uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time; // Add 1 hour and do another flush. - env_->addon_time_.fetch_add(1 * 60 * 60); + env_->MockSleepForSeconds(1 * 60 * 60); for (int i = 101; i <= 200; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(6); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); - env_->addon_time_.fetch_add(1 * 60 * 60); + env_->MockSleepForSeconds(1 * 60 * 60); // Add two L4 files with key ranges: [1 .. 50], [51 .. 150]. 
for (int i = 1; i <= 50; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); - env_->addon_time_.fetch_add(1 * 60 * 60); + ASSERT_OK(Flush()); + env_->MockSleepForSeconds(1 * 60 * 60); for (int i = 51; i <= 150; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(4); ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel()); - env_->addon_time_.fetch_add(1 * 60 * 60); + env_->MockSleepForSeconds(1 * 60 * 60); // Add one L1 file with key range: [26, 75]. for (int i = 26; i <= 75; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(1); ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel()); @@ -3671,15 +4009,15 @@ // 4. A TTL compaction happens between L5 and L6 files. Ouptut in L6. 
// Add 25 hours and do a write - env_->addon_time_.fetch_add(25 * 60 * 60); + env_->MockSleepForSeconds(25 * 60 * 60); ASSERT_OK(Put(Key(1), "1")); if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(5, ttl_compactions); @@ -3687,14 +4025,14 @@ &level_to_files); ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time); - env_->addon_time_.fetch_add(25 * 60 * 60); + env_->MockSleepForSeconds(25 * 60 * 60); ASSERT_OK(Put(Key(2), "1")); if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_GE(ttl_compactions, 6); @@ -3704,6 +4042,7 @@ } TEST_F(DBCompactionTest, LevelPeriodicCompaction) { + env_->SetMockSleep(); const int kNumKeysPerFile = 32; const int kNumLevelFiles = 2; const int kValueSize = 100; @@ -3735,10 +4074,10 @@ } }); - env_->time_elapse_only_sleep_ = false; options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int periodic_compactions = 0; @@ -3755,21 +4094,21 @@ Random rnd(301); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { - ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), - RandomString(&rnd, kValueSize))); + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); // Add 50 hours and do a write - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -3779,24 +4118,24 @@ ASSERT_EQ("0,3", FilesPerLevel()); // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("b", "2")); if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,3", FilesPerLevel()); // The three old files now go through the periodic compaction process. 2 // + 3. ASSERT_EQ(5, periodic_compactions); // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "3")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,3", FilesPerLevel()); // The four old files now go through the periodic compaction process. 5 // + 4. @@ -3817,10 +4156,11 @@ const int kValueSize = 100; Options options = CurrentOptions(); - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int periodic_compactions = 0; @@ -3850,9 +4190,9 @@ for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Move the first two files to L2. if (i == 1) { MoveFilesToLevel(2); @@ -3868,7 +4208,7 @@ set_file_creation_time_to_zero = false; // Forward the clock by 2 days. 
- env_->addon_time_.fetch_add(2 * 24 * 60 * 60); + env_->MockSleepForSeconds(2 * 24 * 60 * 60); options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day Reopen(options); @@ -3889,10 +4229,11 @@ options.ttl = 10 * 60 * 60; // 10 hours options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days options.max_open_files = -1; // needed for both periodic and ttl compactions - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); int periodic_compactions = 0; @@ -3913,11 +4254,11 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); @@ -3926,20 +4267,20 @@ ASSERT_EQ(0, ttl_compactions); // Add some time greater than periodic_compaction_time. - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Files in the bottom level go through periodic compactions. ASSERT_EQ("1,0,0,2", FilesPerLevel()); ASSERT_EQ(2, periodic_compactions); ASSERT_EQ(0, ttl_compactions); // Add a little more time than ttl - env_->addon_time_.fetch_add(11 * 60 * 60); + env_->MockSleepForSeconds(11 * 60 * 60); ASSERT_OK(Put("b", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Notice that the previous file in level 1 falls down to the bottom level // due to ttl compactions, one level at a time. // And bottom level files don't get picked up for ttl compactions. 
@@ -3948,10 +4289,10 @@ ASSERT_EQ(3, ttl_compactions); // Add some time greater than periodic_compaction_time. - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Previous L0 file falls one level at a time to bottom level due to ttl. // And all 4 bottom files go through periodic compactions. ASSERT_EQ("1,0,0,4", FilesPerLevel()); @@ -3961,6 +4302,67 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, LevelTtlBooster) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 3; + const int kValueSize = 1000; + + Options options = CurrentOptions(); + options.ttl = 10 * 60 * 60; // 10 hours + options.periodic_compaction_seconds = 480 * 60 * 60; // very long + options.level0_file_num_compaction_trigger = 2; + options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize}; + options.max_open_files = -1; // needed for both periodic and ttl compactions + options.compaction_pri = CompactionPri::kMinOverlappingRatio; + env_->SetMockSleep(); + options.env = env_; + + // NOTE: Presumed unnecessary and removed: resetting mock time in env + + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + MoveFilesToLevel(2); + + ASSERT_EQ("0,0,3", FilesPerLevel()); + + // Create some files for L1 + for (int i = 0; i < 2; i++) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + ASSERT_EQ("0,1,3", FilesPerLevel()); + + // Make the new L0 files qualify TTL boosting and 
generate one more to trigger + // L1 -> L2 compaction. Old files will be picked even if their priority is + // lower without boosting. + env_->MockSleepForSeconds(8 * 60 * 60); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i), + rnd.RandomString(kValueSize * 2))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + ASSERT_EQ("0,1,2", FilesPerLevel()); + + ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize); +} + TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { class TestCompactionFilter : public CompactionFilter { const char* Name() const override { return "TestCompactionFilter"; } @@ -3981,9 +4383,10 @@ Options options = CurrentOptions(); TestCompactionFilter test_compaction_filter; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env enum CompactionFilterType { kUseCompactionFilter, @@ -4024,20 +4427,20 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); // Add 31 days and do a write - env_->addon_time_.fetch_add(31 * 24 * 60 * 60); + env_->MockSleepForSeconds(31 * 24 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -4084,18 +4487,18 @@ Random rnd(301); for (int j = 0; j < 
kNumL0FilesLimit - 1; ++j) { for (int k = 0; k < 2; ++k) { - ASSERT_OK(Put(Key(k), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(k), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4138,21 +4541,21 @@ Random rnd(301); for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) { - ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(0), rnd.RandomString(1024))); FlushOptions flush_opts; flush_opts.wait = false; flush_opts.allow_write_stall = true; - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4186,14 +4589,13 @@ Random rnd(301); for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { for (int k = 0; k < 2; ++k) { - ASSERT_OK(Put(1, Key(k), RandomString(&rnd, 1024))); + ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024))); } - Flush(1); + ASSERT_OK(Flush(1)); } auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); if (i == 0) { 
ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) .IsColumnFamilyDropped()); @@ -4213,7 +4615,7 @@ manual_compaction_thread.join(); TEST_SYNC_POINT( "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } @@ -4246,27 +4648,28 @@ flush_opts.allow_write_stall = true; for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); - Put(ToString(0), RandomString(&rnd, 1024)); - dbfull()->Flush(flush_opts); - Put(ToString(0), RandomString(&rnd, 1024)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); + ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); manual_compaction_thread.join(); // If CompactRange's flush was skipped, the final Put above will still be // in the active memtable. 
std::string num_keys_in_memtable; - db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_keys_in_memtable); + ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, + &num_keys_in_memtable)); ASSERT_EQ(ToString(1), num_keys_in_memtable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4324,7 +4727,7 @@ } else { ASSERT_EQ(2, num_memtable_entries); // flush anyways to prepare for next iteration - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } } } @@ -4339,12 +4742,12 @@ for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); ColumnFamilyData* cfd = cfh->cfd(); @@ -4429,7 +4832,7 @@ ASSERT_OK(Delete("b")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 0); @@ -4476,7 +4879,8 @@ options.level0_slowdown_writes_trigger = 64; options.level0_stop_writes_trigger = 64; options.max_background_jobs = kMaxBackgroundThreads; // Enough threads - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); options.max_write_buffer_number = 10; // Enough memtables DestroyAndReopen(options); @@ -4562,7 +4966,7 @@ } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } } @@ -4580,7 +4984,7 @@ } // put extra key to trigger flush ASSERT_OK(Put(0, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + 
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 0)); } @@ -4595,7 +4999,7 @@ } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -4617,7 +5021,7 @@ // put extra key to trigger flush ASSERT_OK(Put(cf_test, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test])); ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); Compact(cf_test, Key(0), Key(keyIndex)); @@ -4636,7 +5040,7 @@ options.create_if_missing = true; options.disable_auto_compactions = true; options.use_direct_io_for_flush_and_compaction = GetParam(); - options.env = new MockEnv(Env::Default()); + options.env = MockEnv::Create(Env::Default()); Reopen(options); bool readahead = false; SyncPoint::GetInstance()->SetCallBack( @@ -4655,7 +5059,7 @@ CreateAndReopenWithCF({"pikachu"}, options); MakeTables(3, "p", "q", 1); ASSERT_EQ("1,1,1", FilesPerLevel(1)); - Compact(1, "p1", "p9"); + Compact(1, "p", "q"); ASSERT_EQ(readahead, options.use_direct_reads); ASSERT_EQ("0,0,1", FilesPerLevel(1)); Destroy(options); @@ -4668,7 +5072,8 @@ class CompactionPriTest : public DBTestBase, public testing::WithParamInterface { public: - CompactionPriTest() : DBTestBase("/compaction_pri_test") { + CompactionPriTest() + : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) { compaction_pri_ = GetParam(); } @@ -4696,13 +5101,13 @@ for (int i = 0; i < kNKeys; i++) { keys[i] = i; } - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); for (int i = 0; i < kNKeys; i++) { - ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102))); } - dbfull()->TEST_WaitForCompact(); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNKeys; i++) { ASSERT_NE("NOT_FOUND", Get(Key(i))); } @@ -4741,9 +5146,9 @@ Random rnd(301); for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { - Merge("foo", RandomString(&rnd, 1024)); + ASSERT_OK(Merge("foo", rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -4756,7 +5161,7 @@ CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); } TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { @@ -4764,7 +5169,7 @@ // is in read-only mode. Verify it now at least returns, despite failing. const int kNumL0Files = 4; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options opts = CurrentOptions(); opts.disable_auto_compactions = true; opts.env = mock_env.get(); @@ -4773,9 +5178,9 @@ Random rnd(301); for (int i = 0; i < kNumL0Files; ++i) { // Make sure files are overlapping in key-range to prevent trivial move. - Put("key1", RandomString(&rnd, 1024)); - Put("key2", RandomString(&rnd, 1024)); - Flush(); + ASSERT_OK(Put("key1", rnd.RandomString(1024))); + ASSERT_OK(Put("key2", rnd.RandomString(1024))); + ASSERT_OK(Flush()); } ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0)); @@ -4783,7 +5188,7 @@ mock_env->SetFilesystemActive(false); // Make sure this is outside `CompactRange`'s range so that it doesn't fail // early trying to flush memtable. 
- ASSERT_NOK(Put("key3", RandomString(&rnd, 1024))); + ASSERT_NOK(Put("key3", rnd.RandomString(1024))); // In the bug scenario, the first manual compaction would fail and forget to // unregister itself, causing the second one to hang forever due to conflict @@ -4822,9 +5227,9 @@ for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { ASSERT_OK( - Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -4832,9 +5237,9 @@ for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { ASSERT_OK( - Put("bar" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); @@ -4843,7 +5248,7 @@ CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); const std::vector& comp_stats2 = internal_stats_ptr->TEST_GetCompactionStats(); @@ -4851,6 +5256,97 @@ ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, ManualCompactionMax) { + uint64_t l1_avg_size = 0, l2_avg_size = 0; + auto generate_sst_func = [&]() { + Random rnd(301); + for (auto i = 0; i < 100; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + + uint64_t total = 0; + for (const auto& file : level_to_files[1]) { + total += 
file.compensated_file_size; + } + l1_avg_size = total / level_to_files[1].size(); + + total = 0; + for (const auto& file : level_to_files[2]) { + total += file.compensated_file_size; + } + l2_avg_size = total / level_to_files[2].size(); + }; + + std::atomic_int num_compactions(0); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + + // with default setting (1.6G by default), it should cover all files in 1 + // compaction + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == 1); + + // split the compaction to 5 + int num_split = 5; + DestroyAndReopen(opts); + generate_sst_func(); + uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + opts.max_compaction_bytes = total_size / num_split; + opts.target_file_size_base = total_size / num_split; + Reopen(opts); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); + + // very small max_compaction_bytes, it should still move forward + opts.max_compaction_bytes = l1_avg_size / 2; + opts.target_file_size_base = l1_avg_size / 2; + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() > 10); + + // dynamically set the option + num_split = 2; + opts.max_compaction_bytes = 0; + DestroyAndReopen(opts); + generate_sst_func(); + total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + Status s = db_->SetOptions( + {{"max_compaction_bytes", std::to_string(total_size / num_split)}, + {"target_file_size_base", std::to_string(total_size / num_split)}}); + ASSERT_OK(s); + + num_compactions.store(0); + 
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); +} + TEST_F(DBCompactionTest, CompactionDuringShutdown) { Options opts = CurrentOptions(); opts.level0_file_num_compaction_trigger = 2; @@ -4866,16 +5362,17 @@ for (auto i = 0; i < 2; ++i) { for (auto j = 0; j < 10; ++j) { ASSERT_OK( - Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsShutdownInProgress()); ASSERT_OK(dbfull()->error_handler_.GetBGError()); } @@ -4889,7 +5386,7 @@ // Generate an external SST file containing a single key, i.e. 
99 std::string sst_files_dir = dbname_ + "/sst_files/"; - test::DestroyDir(env_, sst_files_dir); + ASSERT_OK(DestroyDir(env_, sst_files_dir)); ASSERT_OK(env_->CreateDir(sst_files_dir)); SstFileWriter sst_writer(EnvOptions(), options); const std::string sst_file_path = sst_files_dir + "test.sst"; @@ -4909,14 +5406,15 @@ options.level0_file_num_compaction_trigger = options.level0_stop_writes_trigger; options.max_subcompactions = max_subcompactions_; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); DestroyAndReopen(options); Random rnd(301); // Generate level0_stop_writes_trigger L0 files to trigger write stop for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { for (int j = 0; j != kNumKeysPerFile; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(j), rnd.RandomString(990))); } if (0 == i) { // When we reach here, the memtables have kNumKeysPerFile keys. Note that @@ -4928,7 +5426,7 @@ // extra key to trigger flush. 
ASSERT_OK(Put("", "")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1); } // When we reach this point, there will be level0_stop_writes_trigger L0 @@ -4958,10 +5456,11 @@ TEST_F(DBCompactionTest, ConsistencyFailTest) { Options options = CurrentOptions(); + options.force_consistency_checks = true; DestroyAndReopen(options); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "VersionBuilder::CheckConsistency", [&](void* arg) { + "VersionBuilder::CheckConsistency0", [&](void* arg) { auto p = reinterpret_cast*>(arg); // just swap the two FileMetaData so that we hit error @@ -4975,11 +5474,59 @@ for (int k = 0; k < 2; ++k) { ASSERT_OK(Put("foo", "bar")); - Flush(); + Status s = Flush(); + if (k < 1) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsCorruption()); + } } ASSERT_NOK(Put("foo", "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBCompactionTest, ConsistencyFailTest2) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.target_file_size_base = 1000; + options.level0_file_num_compaction_trigger = 2; + BlockBasedTableOptions bbto; + bbto.block_size = 400; // small block size + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistency1", [&](void* arg) { + auto p = + reinterpret_cast*>(arg); + // just swap the two FileMetaData so that we hit error + // in CheckConsistency funcion + FileMetaData* temp = *(p->first); + *(p->first) = *(p->second); + *(p->second) = temp; + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + std::string value = rnd.RandomString(1000); + + ASSERT_OK(Put("foo1", value)); + ASSERT_OK(Put("z", "")); + ASSERT_OK(Flush()); + 
ASSERT_OK(Put("foo2", value)); + ASSERT_OK(Put("z", "")); + Status s = Flush(); + ASSERT_TRUE(s.ok() || s.IsCorruption()); + + // This probably returns non-OK, but we rely on the next Put() + // to determine the DB is frozen. + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); + ASSERT_NOK(Put("foo", "bar")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); } void IngestOneKeyValue(DBImpl* db, const std::string& key, @@ -5012,10 +5559,16 @@ const size_t kValueSize = 1 << 20; Random rnd(301); std::atomic pick_intra_l0_count(0); - std::string value(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + // The L0->L1 must be picked before we begin ingesting files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBCompactionTestWithParam::FlushAfterIntraL0:1", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTestWithParam::" + "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "FindIntraL0Compaction", @@ -5043,19 +5596,20 @@ ASSERT_OK(Put(Key(0), "a")); ASSERT_EQ(5, NumTableFilesAtLevel(0)); + TEST_SYNC_POINT( + "DBCompactionTestWithParam::" + "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"); // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction. for (int i = 5; i < 10; i++) { + ASSERT_EQ(i, NumTableFilesAtLevel(0)); IngestOneKeyValue(dbfull(), Key(i), value, options); - ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - TEST_SYNC_POINT("DBCompactionTestWithParam::FlushAfterIntraL0:1"); // Put one key, to make biggest log sequence number in this memtable is bigger // than sst which would be ingested in next step. 
ASSERT_OK(Put(Key(2), "b")); - ASSERT_EQ(10, NumTableFilesAtLevel(0)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), @@ -5080,8 +5634,8 @@ const size_t kValueSize = 1 << 20; Random rnd(301); - std::string value(RandomString(&rnd, kValueSize)); - std::string value2(RandomString(&rnd, kValueSize)); + std::string value(rnd.RandomString(kValueSize)); + std::string value2(rnd.RandomString(kValueSize)); std::string bigvalue = value + value; // prevents trivial move @@ -5093,8 +5647,14 @@ ASSERT_EQ(0, NumTableFilesAtLevel(0)); std::atomic pick_intra_l0_count(0); + // The L0->L1 must be picked before we begin ingesting files to trigger + // intra-L0 compaction, and must not finish until after an intra-L0 + // compaction has been picked. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1", + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTestWithParam::" + "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"}, + {"LevelCompactionPicker::PickCompactionBySize:0", "CompactionJob::Run():Start"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "FindIntraL0Compaction", @@ -5125,18 +5685,19 @@ } ASSERT_EQ(6, NumTableFilesAtLevel(0)); + TEST_SYNC_POINT( + "DBCompactionTestWithParam::" + "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"); // ingest file to trigger IntraL0Compaction for (int i = 6; i < 10; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0)); IngestOneKeyValue(dbfull(), Key(i), value2, options); } - ASSERT_EQ(10, NumTableFilesAtLevel(0)); // Wake up flush job sleeping_tasks.WakeUp(); sleeping_tasks.WaitUntilDone(); - TEST_SYNC_POINT("DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1"); - dbfull()->TEST_WaitForCompact(); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); uint64_t error_count = 0; @@ -5151,7 +5712,1668 @@ } } -#endif // !defined(ROCKSDB_LITE) +TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { + constexpr int kSstNum = 10; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Generate some sst files on level 0 with sequence keys (no overlap) + for (int i = 0; i < kSstNum; i++) { + for (int j = 1; j < UCHAR_MAX; j++) { + auto key = std::string(kSstNum, '\0'); + key[kSstNum - i] += static_cast(j); + ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_EQ(ToString(kSstNum), FilesPerLevel(0)); + + auto cro = CompactRangeOptions(); + cro.bottommost_level_compaction = bottommost_level_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce || + bottommost_level_compaction_ == + BottommostLevelCompaction::kForceOptimized) { + // Real compaction to compact all sst files from level 0 to 1 file on level + // 1 + ASSERT_EQ("0,1", FilesPerLevel(0)); + } else { + // Just trivial move from level 0 -> 1 + ASSERT_EQ("0," + ToString(kSstNum), FilesPerLevel(0)); + } +} + +INSTANTIATE_TEST_CASE_P( + DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam, + ::testing::Values(BottommostLevelCompaction::kSkip, + BottommostLevelCompaction::kIfHaveCompactionFilter, + BottommostLevelCompaction::kForce, + BottommostLevelCompaction::kForceOptimized)); + +TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + bool has_compaction = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + 
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 10); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10); + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); + + has_compaction = false; + ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}})); + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 2); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); +} + +TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { + Options options = CurrentOptions(); + options.max_subcompactions = 10; + options.compaction_style = kCompactionStyleUniversal; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + bool has_compaction = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 10); + 
has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); + has_compaction = false; + + ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}})); + ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->max_subcompactions() == 2); + has_compaction = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Trigger compaction + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(has_compaction); +} + +TEST_P(ChangeLevelConflictsWithAuto, TestConflict) { + // A `CompactRange()` may race with an automatic compaction, we'll need + // to make sure it doesn't corrupte the data. + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // Run a qury to refitting to level 1 while another thread writing to + // the same level. 
+ SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. + { + "DBImpl::CompactRange:BeforeRefit:1", + "AutoCompactionFinished1", + }, + { + "AutoCompactionFinished2", + "DBImpl::CompactRange:BeforeRefit:2", + }, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread auto_comp([&] { + TEST_SYNC_POINT("AutoCompactionFinished1"); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bar", "v3")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + TEST_SYNC_POINT("AutoCompactionFinished2"); + }); + + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = GetParam() ? 1 : 0; + // This should return non-OK, but it's more important for the test to + // make sure that the DB is not corrupted. + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + auto_comp.join(); + // Refitting didn't happen. + SyncPoint::GetInstance()->DisableProcessing(); + + // Write something to DB just make sure that consistency check didn't + // fail and make the DB readable. +} + +INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto, + ChangeLevelConflictsWithAuto, testing::Bool()); + +TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) { + // A `CompactRange()` with `change_level == true` needs to execute its final + // step, `ReFitLevel()`, in isolation. Previously there was a bug where + // refitting could target the same level as an ongoing manual compaction, + // leading to overlapping files in that level. + // + // This test ensures that case is not possible by verifying any manual + // compaction issued during the `ReFitLevel()` phase fails with + // `Status::Incomplete`. 
+ Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with three levels populated. + Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the + // foreground thread will try to simultaneously compact L0->L1. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // The first two dependencies ensure the foreground creates an L0 file + // between the background compaction's L0->L1 and its L1->L2. + { + "DBImpl::RunManualCompaction()::1", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "PutFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "FlushedFG", + "DBImpl::RunManualCompaction()::2", + }, + // The next two dependencies ensure the foreground invokes + // `CompactRange()` while the background is refitting. The + // foreground's `CompactRange()` is guaranteed to attempt an L0->L1 + // as we set it up with an empty memtable and a new L0 file. 
+ { + "DBImpl::CompactRange:PreRefitLevel", + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactFG", + }, + { + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG", + "DBImpl::CompactRange:PostRefitLevel", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG"); + // Make sure we have something new to compact in the foreground. + // Note key 1 is carefully chosen as it ensures the file we create here + // overlaps with one of the files being refitted L2->L1 in the background. + // If we chose key 0, the file created here would not overlap. + ASSERT_OK(Put(Key(1), "val")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG"); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG"); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:" + "CompactedFG"); + refit_level_thread.join(); +} + +TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { + // This test is added to ensure that RefitLevel() error paths are clearing + // internal flags and to test that subsequent valid RefitLevel() calls + // succeeds + Options options = CurrentOptions(); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + Reopen(options); + + ASSERT_EQ("", FilesPerLevel(0)); + + // Setup an LSM with three levels populated. 
+ Random rnd(301); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1", FilesPerLevel(0)); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + auto start_idx = key_idx; + GenerateNewFile(&rnd, &key_idx); + GenerateNewFile(&rnd, &key_idx); + auto end_idx = key_idx - 1; + ASSERT_EQ("1,1,2", FilesPerLevel(0)); + + // Next two CompactRange() calls are used to test exercise error paths within + // RefitLevel() before triggering a valid RefitLevel() call + + // Trigger a refit to L1 first + { + std::string begin_string = Key(start_idx); + std::string end_string = Key(end_idx); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a refit from L2->L1 - this should fail and exercise error paths in + // RefitLevel() + { + // Select key range that matches the bottom most level (L2) + std::string begin_string = Key(0); + std::string end_string = Key(start_idx - 1); + Slice begin(begin_string); + Slice end(end_string); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end)); + } + ASSERT_EQ("0,3,2", FilesPerLevel(0)); + + // Try a valid Refit request to ensure, the path is still working + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,5", FilesPerLevel(0)); +} + +TEST_F(DBCompactionTest, CompactionWithBlob) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = 
"first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), third_value); + ASSERT_EQ(Get(second_key), third_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_EQ(l1_files.size(), 1); + + const FileMetaData* const table_file = l1_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), first_key); + ASSERT_EQ(table_file->largest.user_key(), second_key); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 2); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + 
ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1); +} + +class DBCompactionTestBlobError + : public DBCompactionTest, + public testing::WithParamInterface { + public: + DBCompactionTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBCompactionTestBlobError, CompactionError) { + Options options; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + VersionSet* const versions = 
dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_TRUE(l1_files.empty()); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_TRUE(blob_files.empty()); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 0); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } +} + +class DBCompactionTestBlobGC + : public DBCompactionTest, + public testing::WithParamInterface> { + public: + DBCompactionTestBlobGC() + : blob_gc_age_cutoff_(std::get<0>(GetParam())), + updated_enable_blob_files_(std::get<1>(GetParam())) {} + + double blob_gc_age_cutoff_; + bool updated_enable_blob_files_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC, + ::testing::Combine(::testing::Values(0.0, 0.5, 1.0), + ::testing::Bool())); + +TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) { + Options 
options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.blob_file_size = 32; // one blob per file + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 4); + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. 
+ size_t expected_number_of_files = original_blob_files.size(); + + if (!updated_enable_blob_files_) { + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + + expected_number_of_files -= cutoff_index; + } + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (blob_gc_age_cutoff_ > 0.0) { + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + + if (updated_enable_blob_files_) { + // GC relocated some blobs to new blob files + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, + compaction_stats[1].bytes_written_blob); + } else { + // GC moved some blobs back to the LSM, no new blob files + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } + } else { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } +} + +TEST_F(DBCompactionTest, 
CompactionWithBlobGCError_CorruptIndex) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char corrupt_blob_index[] = "foobar"; + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, + corrupt_blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, 
third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + + // Fake a blob index referencing a non-existent blob file. 
+ std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
+ ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // options is not set, the checksum handoff will not be triggered + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + 
{{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr 
fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + 
new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, FIFOWarm) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleFIFO; + options.num_levels = 1; + options.max_open_files = -1; + 
options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + CompactionOptionsFIFO fifo_options; + fifo_options.age_for_warm = 1000; + fifo_options.max_table_files_size = 100000000; + options.compaction_options_fifo = fifo_options; + env_->SetMockSleep(); + Reopen(options); + + int total_warm = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile::FileOptions.temperature", [&](void* arg) { + Temperature temperature = *(static_cast(arg)); + if (temperature == Temperature::kWarm) { + total_warm++; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(Put(Key(0), "value1")); + env_->MockSleepForSeconds(800); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(4, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature); + ASSERT_EQ(2, total_warm); + + Destroy(options); +} + +TEST_F(DBCompactionTest, DisableMultiManualCompaction) { + const int kNumL0Files = 10; + + Options options = 
CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Generate 2 levels of file to make sure the manual compaction is not skipped + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(2); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(1); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + port::Thread compact_thread1([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(0); + std::string end_str = Key(3); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + port::Thread compact_thread2([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(4); + std::string end_str = Key(7); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + // Disable manual compaction should cancel both manual compactions and both + // compaction should return incomplete. 
+ db_->DisableManualCompaction(); + + compact_thread1.join(); + compact_thread2.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); +} + +TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) { + const int kNumL0Files = 4; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + // make sure the manual compaction background is started but not yet set the + // status to in_progress, then cancel the manual compaction, which should not + // result in segfault + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", + "DBCompactionTest::DisableJustStartedManualCompaction:" + "PreDisableManualCompaction"}, + {"DBImpl::RunManualCompaction:Unscheduled", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + TEST_SYNC_POINT( + "DBCompactionTest::DisableJustStartedManualCompaction:" + "PreDisableManualCompaction"); + db_->DisableManualCompaction(); + + compact_thread.join(); +} + +TEST_F(DBCompactionTest, DisableInProgressManualCompaction) { + const int kNumL0Files = 4; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction:InProgress", + "DBCompactionTest::DisableInProgressManualCompaction:" + "PreDisableManualCompaction"}, + {"DBImpl::RunManualCompaction:Unscheduled", + "CompactionJob::Run():Start"}}); + 
SyncPoint::GetInstance()->EnableProcessing(); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableInProgressManualCompaction:" + "PreDisableManualCompaction"); + db_->DisableManualCompaction(); + + compact_thread.join(); +} + +TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. 
Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + db_->DisableManualCompaction(); + + // CompactRange should return before the compaction has the chance to run + compact_thread.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_EQ("0,1", FilesPerLevel(0)); +} + +TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. 
Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + db_->DisableManualCompaction(); + + // CompactRange should return before the compaction has the chance to run + compact_thread.join(); + + // Try close DB while manual compaction is canceled but still in the queue. + // And an auto-triggered compaction is also in the queue. + auto s = db_->Close(); + ASSERT_OK(s); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST_F(DBCompactionTest, DBCloseWithManualCompaction) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. 
Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + // Close DB with manual compaction and auto triggered compaction in the queue. + auto s = db_->Close(); + ASSERT_OK(s); + + // manual compaction thread should return with Incomplete(). + compact_thread.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + +TEST_F(DBCompactionTest, + DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) { + // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait + // for automatic compactions to drain before starting the manual compaction. + // This test verifies `DisableManualCompaction()` can cancel such a compaction + // without waiting for the drain to complete. + const int kNumL0Files = 4; + + // Enforces manual compaction enters wait loop due to pending automatic + // compaction. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"}, + {"DBImpl::RunManualCompaction:WaitScheduled", + "BackgroundCallCompaction:0"}}); + // The automatic compaction will cancel the waiting manual compaction. + // Completing this implies the cancellation did not wait on automatic + // compactions to finish. 
+ bool callback_completed = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void* /*arg*/) { + db_->DisableManualCompaction(); + callback_completed = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(callback_completed); +} + +TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) { + Options options = CurrentOptions(); + options.num_levels = 3; + Reopen(options); + + // Setup an LSM with L2 populated. + Random rnd(301); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + } + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // The background thread will refit L2->L1 while the foreground thread will + // attempt to run a compaction on new data. The following dependencies + // ensure the background manual compaction's refitting phase disables manual + // compaction immediately before the foreground manual compaction can register + // itself. Manual compaction is kept disabled until the foreground manual + // checks for the failure once. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + // Only do Put()s for foreground CompactRange() once the background + // CompactRange() has reached the refitting phase. 
+ { + "DBImpl::CompactRange:BeforeRefit:1", + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange", + }, + // Right before we register the manual compaction, proceed with + // the refitting phase so manual compactions are disabled. Stay in + // the refitting phase with manual compactions disabled until it is + // noticed. + { + "DBImpl::RunManualCompaction:0", + "DBImpl::CompactRange:BeforeRefit:2", + }, + { + "DBImpl::CompactRange:PreRefitLevel", + "DBImpl::RunManualCompaction:1", + }, + { + "DBImpl::RunManualCompaction:PausedAtStart", + "DBImpl::CompactRange:PostRefitLevel", + }, + // If compaction somehow were scheduled, let's let it run after reenabling + // manual compactions. This dependency is not expected to be hit but is + // here for speculatively coercing future bugs. + { + "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled", + "BackgroundCallCompaction:0", + }, + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 1; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::ChangeLevelConflictsWithManual:" + "PreForegroundCompactRange"); + ASSERT_OK(Put(Key(0), rnd.RandomString(990))); + ASSERT_OK(Put(Key(1), rnd.RandomString(990))); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsIncomplete()); + + refit_level_thread.join(); +} + +TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) { + // Flushes several files to trigger compaction while lock is released during + // a bottom-pri compaction. Verifies it does not get scheduled to thread pool + // because per-DB limit for compaction parallelism is one (default). 
+ const int kNumL0Files = 4; + const int kNumLevels = 3; + + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.num_levels = kNumLevels; + DestroyAndReopen(options); + + // Setup last level to be non-empty since it's a bit unclear whether + // compaction to an empty level would be considered "bottommost". + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(kNumLevels - 1); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkBottomCompaction", + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"}, + {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction", + "BackgroundCallCompaction:0"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_range_thread([&] { + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + cro.exclusive_manual_compaction = false; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + }); + + // Sleep in the low-pri thread so any newly scheduled compaction will be + // queued. Otherwise it might finish before we check its existence. 
+ test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PreTriggerCompaction"); + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + TEST_SYNC_POINT( + "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:" + "PostTriggerCompaction"); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + compact_range_thread.join(); +} + +#endif // !defined(ROCKSDB_LITE) + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_dynamic_level_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,11 +15,14 @@ #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/env.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBTestDynamicLevel : public DBTestBase { public: - DBTestDynamicLevel() : DBTestBase("/db_dynamic_level_test") {} + DBTestDynamicLevel() + : DBTestBase("db_dynamic_level_test", /*env_do_fsync=*/true) {} }; TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) { @@ -27,7 +30,7 @@ return; } // Use InMemoryEnv, or it would be too slow. 
- std::unique_ptr env(new MockEnv(env_)); + std::unique_ptr env(NewMemEnv(env_)); const int kNKeys = 1000; int keys[kNKeys]; @@ -50,7 +53,7 @@ keys[i] = i; } if (ordered_insert == 0) { - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); } for (int max_background_compactions = 1; max_background_compactions < 4; max_background_compactions += 2) { @@ -80,9 +83,9 @@ for (int i = 0; i < kNKeys; i++) { int key = keys[i]; - ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(key), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(kNKeys + key), rnd.RandomString(102))); + ASSERT_OK(Put(Key(key), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kNKeys * 2 + key), rnd.RandomString(102))); ASSERT_OK(Delete(Key(kNKeys + keys[i / 10]))); env_->SleepForMicroseconds(5000); } @@ -100,7 +103,8 @@ } // Test compact range works - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All data should be in the last level. 
ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); @@ -139,6 +143,7 @@ options.max_background_compactions = 2; options.num_levels = 5; options.max_compaction_bytes = 0; // Force not expanding in compactions + options.db_host_id = ""; // Setting this messes up the file size calculation BlockBasedTableOptions table_options; table_options.block_size = 1024; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -158,13 +163,13 @@ // Put about 28K to L0 for (int i = 0; i < 70; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(4U, int_prop); @@ -175,14 +180,14 @@ })); for (int i = 0; i < 70; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop)); @@ -197,13 +202,13 @@ // Write about 40K more for (int i = 0; i < 100; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); @@ -216,7 +221,7 @@ // Each file is about 11KB, with 9KB of data. 
for (int i = 0; i < 1300; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } // Make sure that the compaction starts before the last bit of data is @@ -231,8 +236,8 @@ })); TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(2U, int_prop); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -257,11 +262,11 @@ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1"); for (int i = 0; i < 2; i++) { ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 380))); + rnd.RandomString(380))); } TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2"); - Flush(); + ASSERT_OK(Flush()); thread.join(); @@ -299,7 +304,7 @@ DestroyAndReopen(options); // Compact against empty DB - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); uint64_t int_prop; std::string str_prop; @@ -310,16 +315,16 @@ // Put about 7K to L0 for (int i = 0; i < 140; i++) { - ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); + ASSERT_OK( + Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { // Make sure level 0 is not empty - ASSERT_OK(Put(Key(static_cast(rnd.Uniform(kMaxKey))), - RandomString(&rnd, 80))); - Flush(); + ASSERT_OK( + Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); + ASSERT_OK(Flush()); } ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); @@ -340,7 +345,7 @@ }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + 
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(output_levels.size(), 2); ASSERT_TRUE(output_levels.find(3) != output_levels.end()); ASSERT_TRUE(output_levels.find(4) != output_levels.end()); @@ -382,12 +387,12 @@ const int total_keys = 3000; const int random_part_size = 100; for (int i = 0; i < total_keys; i++) { - std::string value = RandomString(&rnd, random_part_size); + std::string value = rnd.RandomString(random_part_size); PutFixed32(&value, static_cast(i)); ASSERT_OK(Put(Key(i), value)); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(non_trivial, 0); @@ -441,12 +446,12 @@ int total_keys = 1000; for (int i = 0; i < total_keys; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(i), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102))); ASSERT_OK(Delete(Key(i / 10))); } verify_func(total_keys, false); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); options.level_compaction_dynamic_level_bytes = true; options.disable_auto_compactions = true; @@ -461,7 +466,7 @@ CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = options.num_levels - 1; - dbfull()->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); compaction_finished.store(true); }); do { @@ -475,13 +480,13 @@ int total_keys2 = 2000; for (int i = total_keys; i < total_keys2; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(i), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102))); ASSERT_OK(Delete(Key(i / 10))); } verify_func(total_keys2, false); 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(total_keys2, false); // Base level is not level 1 diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_encryption_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_encryption_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_encryption_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_encryption_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,7 +16,15 @@ class DBEncryptionTest : public DBTestBase { public: - DBEncryptionTest() : DBTestBase("/db_encryption_test") {} + DBEncryptionTest() + : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {} + Env* GetTargetEnv() { + if (encrypted_env_ != nullptr) { + return (static_cast(encrypted_env_))->target(); + } else { + return env_; + } + } }; #ifndef ROCKSDB_LITE @@ -33,20 +41,20 @@ auto status = env_->GetChildren(dbname_, &fileNames); ASSERT_OK(status); - auto defaultEnv = Env::Default(); + Env* target = GetTargetEnv(); int hits = 0; for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) { - if ((*it == "..") || (*it == ".")) { + if (*it == "LOCK") { continue; } auto filePath = dbname_ + "/" + *it; std::unique_ptr seqFile; auto envOptions = EnvOptions(CurrentOptions()); - status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + status = target->NewSequentialFile(filePath, &seqFile, envOptions); ASSERT_OK(status); uint64_t fileSize; - status = defaultEnv->GetFileSize(filePath, &fileSize); + status = target->GetFileSize(filePath, &fileSize); ASSERT_OK(status); std::string scratch; @@ -84,7 +92,7 @@ } TEST_F(DBEncryptionTest, ReadEmptyFile) { - auto defaultEnv = Env::Default(); + auto defaultEnv = GetTargetEnv(); // create empty file for reading it back in later auto envOptions = EnvOptions(CurrentOptions()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_filesnapshot.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_filesnapshot.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_filesnapshot.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_filesnapshot.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,72 +6,62 @@ #ifndef ROCKSDB_LITE -#include #include -#include +#include +#include #include +#include + #include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" #include "file/file_util.h" #include "file/filename.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/metadata.h" +#include "rocksdb/types.h" #include "test_util/sync_point.h" +#include "util/file_checksum_helper.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -Status DBImpl::DisableFileDeletions() { - InstrumentedMutexLock l(&mutex_); - ++disable_delete_obsolete_files_; - if (disable_delete_obsolete_files_ == 1) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Disabled, but already disabled. 
Counter: %d", - disable_delete_obsolete_files_); - } - return Status::OK(); -} +Status DBImpl::FlushForGetLiveFiles() { + mutex_.AssertHeld(); -Status DBImpl::EnableFileDeletions(bool force) { - // Job id == 0 means that this is not our background process, but rather - // user thread - JobContext job_context(0); - bool file_deletion_enabled = false; - { - InstrumentedMutexLock l(&mutex_); - if (force) { - // if force, we need to enable file deletions right away - disable_delete_obsolete_files_ = 0; - } else if (disable_delete_obsolete_files_ > 0) { - --disable_delete_obsolete_files_; - } - if (disable_delete_obsolete_files_ == 0) { - file_deletion_enabled = true; - FindObsoleteFiles(&job_context, true); - bg_cv_.SignalAll(); - } - } - if (file_deletion_enabled) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); - if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); + // flush all dirty data to disk. + Status status; + if (immutable_db_options_.atomic_flush) { + autovector cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + status = + AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles); + if (status.IsColumnFamilyDropped()) { + status = Status::OK(); } + mutex_.Lock(); } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Enable, but not really enabled. 
Counter: %d", - disable_delete_obsolete_files_); + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + mutex_.Unlock(); + status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); + mutex_.Lock(); + if (!status.ok() && !status.IsColumnFamilyDropped()) { + break; + } else if (status.IsColumnFamilyDropped()) { + status = Status::OK(); + } + } } - job_context.Clean(); - LogFlush(immutable_db_options_.info_log); - return Status::OK(); -} - -int DBImpl::IsFileDeletionsEnabled() const { - return !disable_delete_obsolete_files_; + return status; } Status DBImpl::GetLiveFiles(std::vector& ret, @@ -82,34 +72,7 @@ mutex_.Lock(); if (flush_memtable) { - // flush all dirty data to disk. - Status status; - if (immutable_db_options_.atomic_flush) { - autovector cfds; - SelectColumnFamiliesForAtomicFlush(&cfds); - mutex_.Unlock(); - status = AtomicFlushMemTables(cfds, FlushOptions(), - FlushReason::kGetLiveFiles); - mutex_.Lock(); - } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - cfd->Ref(); - mutex_.Unlock(); - status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); - if (!status.ok()) { - break; - } - } - } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - + Status status = FlushForGetLiveFiles(); if (!status.ok()) { mutex_.Unlock(); ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n", @@ -118,27 +81,40 @@ } } - // Make a set of all of the live *.sst files - std::vector live; + // Make a set of all of the live table and blob files + std::vector live_table_files; + std::vector live_blob_files; for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - 
cfd->current()->AddLiveFiles(&live); + cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files); } ret.clear(); - ret.reserve(live.size() + 3); // *.sst + CURRENT + MANIFEST + OPTIONS + ret.reserve(live_table_files.size() + live_blob_files.size() + + 3); // for CURRENT + MANIFEST + OPTIONS // create names of the live files. The names are not absolute // paths, instead they are relative to dbname_; - for (const auto& live_file : live) { - ret.push_back(MakeTableFileName("", live_file.GetNumber())); + for (const auto& table_file_number : live_table_files) { + ret.emplace_back(MakeTableFileName("", table_file_number)); } - ret.push_back(CurrentFileName("")); - ret.push_back(DescriptorFileName("", versions_->manifest_file_number())); - ret.push_back(OptionsFileName("", versions_->options_file_number())); + for (const auto& blob_file_number : live_blob_files) { + ret.emplace_back(BlobFileName("", blob_file_number)); + } + + ret.emplace_back(CurrentFileName("")); + ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number())); + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. + if (versions_->options_file_number() != 0) { + ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + } // find length of manifest file while holding the mutex lock *manifest_file_size = versions_->manifest_file_size(); @@ -148,19 +124,33 @@ } Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { + // If caller disabled deletions, this function should return files that are + // guaranteed not to be deleted until deletions are re-enabled. We need to + // wait for pending purges to finish since WalManager doesn't know which + // files are going to be purged. 
Additional purges won't be scheduled as + // long as deletions are disabled (so the below loop must terminate). + // Also note that we disable deletions anyway to avoid the case where a + // file is deleted in the middle of the scan, causing IO error. + Status deletions_disabled = DisableFileDeletions(); { - // If caller disabled deletions, this function should return files that are - // guaranteed not to be deleted until deletions are re-enabled. We need to - // wait for pending purges to finish since WalManager doesn't know which - // files are going to be purged. Additional purges won't be scheduled as - // long as deletions are disabled (so the below loop must terminate). InstrumentedMutexLock l(&mutex_); - while (disable_delete_obsolete_files_ > 0 && - pending_purge_obsolete_files_ > 0) { + while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) { bg_cv_.Wait(); } } - return wal_manager_.GetSortedWalFiles(files); + + Status s = wal_manager_.GetSortedWalFiles(files); + + // DisableFileDeletions / EnableFileDeletions not supported in read-only DB + if (deletions_disabled.ok()) { + Status s2 = EnableFileDeletions(/*force*/ false); + assert(s2.ok()); + s2.PermitUncheckedError(); + } else { + assert(deletions_disabled.IsNotSupported()); + } + + return s; } Status DBImpl::GetCurrentWalFile(std::unique_ptr* current_log_file) { @@ -172,6 +162,245 @@ return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file); } + +Status DBImpl::GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) { + // To avoid returning partial results, only move to ouput on success + assert(files); + files->clear(); + std::vector results; + + // NOTE: This implementation was largely migrated from Checkpoint. 
+ + Status s; + VectorLogPtr live_wal_files; + bool flush_memtable = true; + if (!immutable_db_options_.allow_2pc) { + if (opts.wal_size_for_flush == port::kMaxUint64) { + flush_memtable = false; + } else if (opts.wal_size_for_flush > 0) { + // If out standing log files are small, we skip the flush. + s = GetSortedWalFiles(live_wal_files); + + if (!s.ok()) { + return s; + } + + // Don't flush column families if total log size is smaller than + // log_size_for_flush. We copy the log files instead. + // We may be able to cover 2PC case too. + uint64_t total_wal_size = 0; + for (auto& wal : live_wal_files) { + total_wal_size += wal->SizeFileBytes(); + } + if (total_wal_size < opts.wal_size_for_flush) { + flush_memtable = false; + } + live_wal_files.clear(); + } + } + + // This is a modified version of GetLiveFiles, to get access to more + // metadata. + mutex_.Lock(); + if (flush_memtable) { + Status status = FlushForGetLiveFiles(); + if (!status.ok()) { + mutex_.Unlock(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n", + status.ToString().c_str()); + return status; + } + } + + // Make a set of all of the live table and blob files + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + VersionStorageInfo& vsi = *cfd->current()->storage_info(); + auto& cf_paths = cfd->ioptions()->cf_paths; + + auto GetDir = [&](size_t path_id) { + // Matching TableFileName() behavior + if (path_id >= cf_paths.size()) { + assert(false); + return cf_paths.back().path; + } else { + return cf_paths[path_id].path; + } + }; + + for (int level = 0; level < vsi.num_levels(); ++level) { + const auto& level_files = vsi.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = MakeTableFileName(meta->fd.GetNumber()); + info.directory = GetDir(meta->fd.GetPathId()); + info.file_number = meta->fd.GetNumber(); + 
info.file_type = kTableFile; + info.size = meta->fd.GetFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->file_checksum_func_name; + info.file_checksum = meta->file_checksum; + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + info.temperature = meta->temperature; + } + } + const auto& blob_files = vsi.GetBlobFiles(); + for (const auto& pair : blob_files) { + const auto& meta = pair.second; + assert(meta); + + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = BlobFileName(meta->GetBlobFileNumber()); + info.directory = GetName(); // TODO?: support db_paths/cf_paths + info.file_number = meta->GetBlobFileNumber(); + info.file_type = kBlobFile; + info.size = meta->GetBlobFileSize(); + if (opts.include_checksum_info) { + info.file_checksum_func_name = meta->GetChecksumMethod(); + info.file_checksum = meta->GetChecksumValue(); + if (info.file_checksum_func_name.empty()) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + // TODO?: info.temperature + } + } + + // Capture some final info before releasing mutex + const uint64_t manifest_number = versions_->manifest_file_number(); + const uint64_t manifest_size = versions_->manifest_file_size(); + const uint64_t options_number = versions_->options_file_number(); + const uint64_t options_size = versions_->options_file_size_; + const uint64_t min_log_num = MinLogNumberToKeep(); + + mutex_.Unlock(); + + std::string manifest_fname = DescriptorFileName(manifest_number); + { // MANIFEST + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = manifest_fname; + info.directory = GetName(); + info.file_number = manifest_number; + info.file_type = kDescriptorFile; + info.size = manifest_size; + info.trim_to_size = true; + if 
(opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + { // CURRENT + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = kCurrentFileName; + info.directory = GetName(); + info.file_type = kCurrentFile; + // CURRENT could be replaced so we have to record the contents we want + // for it + info.replacement_contents = manifest_fname + "\n"; + info.size = manifest_fname.size() + 1; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. 
+ if (options_number != 0) { + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + + info.relative_filename = OptionsFileName(options_number); + info.directory = GetName(); + info.file_number = options_number; + info.file_type = kOptionsFile; + info.size = options_size; + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + + // Some legacy testing stuff TODO: carefully clean up obsolete parts + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone"); + + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); + + if (s.ok()) { + s = FlushWAL(false /* sync */); + } + + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"); + + // if we have more than one column family, we need to also get WAL files + if (s.ok()) { + s = GetSortedWalFiles(live_wal_files); + } + if (!s.ok()) { + return s; + } + + size_t wal_size = live_wal_files.size(); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size()); + + // Link WAL files. Copy exact size of last one because it is the only one + // that has changes after the last flush. 
+ auto wal_dir = immutable_db_options_.GetWalDir(); + for (size_t i = 0; s.ok() && i < wal_size; ++i) { + if ((live_wal_files[i]->Type() == kAliveLogFile) && + (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) { + results.emplace_back(); + LiveFileStorageInfo& info = results.back(); + auto f = live_wal_files[i]->PathName(); + assert(!f.empty() && f[0] == '/'); + info.relative_filename = f.substr(1); + info.directory = wal_dir; + info.file_number = live_wal_files[i]->LogNumber(); + info.file_type = kWalFile; + info.size = live_wal_files[i]->SizeFileBytes(); + // Only last should need to be trimmed + info.trim_to_size = (i + 1 == wal_size); + if (opts.include_checksum_info) { + info.file_checksum_func_name = kUnknownFileChecksumFuncName; + info.file_checksum = kUnknownFileChecksum; + } + } + } + + if (s.ok()) { + // Only move output on success + *files = std::move(results); + } + return s; +} + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_flush_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_flush_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_flush_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_flush_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,21 +8,31 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include +#include #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "env/mock_env.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" +#include "rocksdb/utilities/transaction_db.h" #include "test_util/sync_point.h" +#include "test_util/testutil.h" #include "util/cast_util.h" #include "util/mutexlock.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { +// This is a static filter used for filtering +// kvs during the compaction process. +static std::string NEW_VALUE = "NewValue"; + class DBFlushTest : public DBTestBase { public: - DBFlushTest() : DBTestBase("/db_flush_test") {} + DBFlushTest() : DBTestBase("db_flush_test", /*env_do_fsync=*/true) {} }; class DBFlushDirectIOTest : public DBFlushTest, @@ -62,7 +72,7 @@ ASSERT_OK(Put("bar", "v")); ASSERT_OK(dbfull()->Flush(no_wait)); // If the issue is hit we will wait here forever. 
- dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ(2, TotalTableFiles()); #endif // ROCKSDB_LITE @@ -78,41 +88,26 @@ options.env = fault_injection_env.get(); SyncPoint::GetInstance()->LoadDependency( - {{"DBFlushTest::SyncFail:GetVersionRefCount:1", - "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"}, - {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", - "DBFlushTest::SyncFail:GetVersionRefCount:2"}, - {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, + {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}}); SyncPoint::GetInstance()->EnableProcessing(); CreateAndReopenWithCF({"pikachu"}, options); - Put("key", "value"); - auto* cfd = - reinterpret_cast(db_->DefaultColumnFamily()) - ->cfd(); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; ASSERT_OK(dbfull()->Flush(flush_options)); // Flush installs a new super-version. Get the ref count after that. - auto current_before = cfd->current(); - int refs_before = cfd->current()->TEST_refs(); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1"); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2"); - int refs_after_picking_memtables = cfd->current()->TEST_refs(); - ASSERT_EQ(refs_before + 1, refs_after_picking_memtables); fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBFlushTest::SyncFail:1"); TEST_SYNC_POINT("DBFlushTest::SyncFail:2"); fault_injection_env->SetFilesystemActive(true); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + // Returns the IO error happend during flush. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ("", FilesPerLevel()); // flush failed. #endif // ROCKSDB_LITE - // Backgroun flush job should release ref count to current version. 
- ASSERT_EQ(current_before, cfd->current()); - ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); Destroy(options); } @@ -125,7 +120,7 @@ SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; @@ -135,7 +130,7 @@ TEST_SYNC_POINT("DBFlushTest::SyncSkip:2"); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); Destroy(options); } @@ -145,7 +140,7 @@ // scheduled in the low-pri (compaction) thread pool. Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(1)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); Reopen(options); env_->SetBackgroundThreads(0, Env::HIGH); @@ -170,13 +165,73 @@ ASSERT_OK(Put("key", "val")); for (int i = 0; i < 4; ++i) { ASSERT_OK(Put("key", "val")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(4, num_flushes); ASSERT_EQ(1, num_compactions); } +// Test when flush job is submitted to low priority thread pool and when DB is +// closed in the meanwhile, CloseHelper doesn't hang. 
+TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) { + Options options = CurrentOptions(); + options.max_background_flushes = 1; + options.max_total_wal_size = 8192; + + DestroyAndReopen(options); + CreateColumnFamilies({"cf1", "cf2"}, options); + + env_->SetBackgroundThreads(0, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + int num_flushes = 0; + + SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush", + [&](void* /*arg*/) { ++num_flushes; }); + + int num_low_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) { + num_low_flush_unscheduled++; + // There should be one flush job in low pool that needs to be + // unscheduled + ASSERT_EQ(num_low_flush_unscheduled, 1); + }); + + int num_high_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) { + num_high_flush_unscheduled++; + // There should be no flush job in high pool + ASSERT_EQ(num_high_flush_unscheduled, 0); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(0, "key1", DummyString(8192))); + // Block thread so that flush cannot be run and can be removed from the queue + // when called Unschedule. + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + + // Trigger flush and flush job will be scheduled to LOW priority thread. + ASSERT_OK(Put(0, "key2", DummyString(8192))); + + // Close DB and flush job in low priority queue will be removed without + // running. 
+ Close(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_EQ(0, num_flushes); + + TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options); + ASSERT_OK(Put(0, "key3", DummyString(8192))); + ASSERT_OK(Flush(0)); + ASSERT_EQ(1, num_flushes); +} + TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) { Options options = CurrentOptions(); options.write_buffer_size = 100; @@ -236,13 +291,1096 @@ SyncPoint::GetInstance()->ClearAllCallBacks(); } +// The following 3 tests are designed for testing garbage statistics at flush +// time. +// +// ======= General Information ======= (from GitHub Wiki). +// There are three scenarios where memtable flush can be triggered: +// +// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size +// after a write. +// 2 - Total memtable size across all column families exceeds +// DBOptions::db_write_buffer_size, +// or DBOptions::write_buffer_manager signals a flush. In this scenario +// the largest memtable will be flushed. +// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size. +// In this scenario the memtable with the oldest data will be flushed, +// in order to allow the WAL file with data from this memtable to be +// purged. +// +// As a result, a memtable can be flushed before it is full. This is one +// reason the generated SST file can be smaller than the corresponding +// memtable. Compression is another factor to make SST file smaller than +// corresponding memtable, since data in memtable is uncompressed. + +TEST_F(DBFlushTest, StatisticsGarbageBasic) { + Options options = CurrentOptions(); + + // The following options are used to enforce several values that + // may already exist as default values to make this test resilient + // to default value updates in the future. + options.statistics = CreateDBStatistics(); + + // Record all statistics. 
+ options.statistics->set_stats_level(StatsLevel::kAll); + + // create the DB if it's not already present + options.create_if_missing = true; + + // Useful for now as we are trying to compare uncompressed data savings on + // flush(). + options.compression = kNoCompression; + + // Prevent memtable in place updates. Should already be disabled + // (from Wiki: + // In place updates can be enabled by toggling on the bool + // inplace_update_support flag. However, this flag is by default set to + // false + // because this thread-safe in-place update support is not compatible + // with concurrent memtable writes. Note that the bool + // allow_concurrent_memtable_write is set to true by default ) + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 64 << 20; + + ASSERT_OK(TryReopen(options)); + + // Put multiple times the same key-values. + // The encoded length of a db entry in the memtable is + // defined in db/memtable.cc (MemTable::Add) as the variable: + // encoded_len= VarintLength(internal_key_size) --> = + // log_256(internal_key). + // Min # of bytes + // necessary to + // store + // internal_key_size. + // + internal_key_size --> = actual key string, + // (size key_size: w/o term null char) + // + 8 bytes for + // fixed uint64 "seq + // number + // + + // insertion type" + // + VarintLength(val_size) --> = min # of bytes to + // store val_size + // + val_size --> = actual value + // string + // For example, in our situation, "key1" : size 4, "value1" : size 6 + // (the terminating null characters are not copied over to the memtable). + // And therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry. 
+ // However in terms of raw data contained in the memtable, and written + // over to the SSTable, we only count internal_key_size and val_size, + // because this is the only raw chunk of bytes that contains everything + // necessary to reconstruct a user entry: sequence number, insertion type, + // key, and value. + + // To test the relevance of our Memtable garbage statistics, + // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + // we insert K-V pairs with 3 distinct keys (of length 4), + // and random values of arbitrary length RAND_VALUES_LENGTH, + // and we repeat this step NUM_REPEAT times total. + // At the end, we insert 3 final K-V pairs with the same 3 keys + // and known values (these will be the final values, of length 6). + // I chose NUM_REPEAT=2,000 such that no automatic flush is + // triggered (the number of bytes in the memtable is therefore + // well below any meaningful heuristic for a memtable of size 64MB). + // As a result, since each K-V pair is inserted as a payload + // of N meaningful bytes (sequence number, insertion type, + // key, and value = 8 + 4 + RAND_VALUE_LENGTH), + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to 2,000 * N bytes + // and MEMTABLE_PAYLAOD_BYTES_AT_FLUSH = MEMTABLE_GARBAGE_BYTES_AT_FLUSH + + // (3*(8 + 4 + 6)) bytes. For RAND_VALUE_LENGTH = 172 (arbitrary value), we + // expect: + // N = 8 + 4 + 172 = 184 bytes + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 2,000 * 184 = 368,000 bytes. + // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 368,000 + 3*18 = 368,054 bytes. 
+ + const size_t NUM_REPEAT = 2000; + const size_t RAND_VALUES_LENGTH = 172; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string VALUE1 = "value1"; + const std::string VALUE2 = "value2"; + const std::string VALUE3 = "value3"; + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + ASSERT_OK(Put(KEY1, VALUE1)); + ASSERT_OK(Put(KEY2, VALUE2)); + ASSERT_OK(Put(KEY3, VALUE3)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() + + VALUE3.size() + 3 * sizeof(uint64_t); + + // We assert that the last K-V pairs have been successfully inserted, + // and that the valid values are VALUE1, VALUE2, VALUE3. 
+ PinnableSlice value; + ASSERT_OK(Get(KEY1, &value)); + ASSERT_EQ(value.ToString(), VALUE1); + ASSERT_OK(Get(KEY2, &value)); + ASSERT_EQ(value.ToString(), VALUE2); + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value.ToString(), VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. + uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) { + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + options.write_buffer_size = 67108864; + + ASSERT_OK(TryReopen(options)); + + const size_t NUM_REPEAT = 2000; + const size_t RAND_VALUES_LENGTH = 37; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string KEY4 = "key4"; + const std::string KEY5 = "key5"; + const std::string KEY6 = "key6"; + + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + WriteBatch batch; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + ASSERT_OK(Delete(KEY1)); + ASSERT_OK(Delete(KEY2)); + ASSERT_OK(Delete(KEY3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + // Note : one set of delete for KEY1, KEY2, KEY3 is written to + // SSTable to propagate the delete operations to K-V pairs + // that could have been inserted into the database during past Flush + // opeartions. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= + KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t); + + // Additional useful paylaod. + ASSERT_OK(Delete(KEY4)); + ASSERT_OK(Delete(KEY5)); + ASSERT_OK(Delete(KEY6)); + + // // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t); + + // We assert that the K-V pairs have been successfully deleted. + PinnableSlice value; + ASSERT_NOK(Get(KEY1, &value)); + ASSERT_NOK(Get(KEY2, &value)); + ASSERT_NOK(Get(KEY3, &value)); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. 
+ uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) { + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + options.write_buffer_size = 67108864; + + ASSERT_OK(TryReopen(options)); + + const size_t NUM_REPEAT = 1000; + const size_t RAND_VALUES_LENGTH = 42; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string KEY4 = "key4"; + const std::string KEY5 = "key5"; + const std::string KEY6 = "key6"; + const std::string VALUE3 = "value3"; + + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + // Also insert DeleteRange + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1, + KEY2)); + // Note: DeleteRange have an exclusive upper bound, e.g. here: [KEY2,KEY3) + // is deleted. + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2, + KEY3)); + // Delete ranges are stored as a regular K-V pair, with key=STARTKEY, + // value=ENDKEY. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written + // to SSTable to propagate the deleteRange operations to K-V pairs that could + // have been inserted into the database during past Flush opeartions. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + + // Overwrite KEY3 with known value (VALUE3) + // Note that during the whole time KEY3 has never been deleted + // by the RangeDeletes. + ASSERT_OK(Put(KEY3, VALUE3)); + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY3.size() + VALUE3.size() + sizeof(uint64_t); + + // Additional useful paylaod. 
+ ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5)); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + (KEY4.size() + KEY5.size() + sizeof(uint64_t)) + + (KEY5.size() + KEY6.size() + sizeof(uint64_t)); + + // We assert that the K-V pairs have been successfully deleted. + PinnableSlice value; + ASSERT_NOK(Get(KEY1, &value)); + ASSERT_NOK(Get(KEY2, &value)); + // And that KEY3's value is correct. + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value, VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. + uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +#ifndef ROCKSDB_LITE +// This simple Listener can only handle one flush at a time. +class TestFlushListener : public EventListener { + public: + TestFlushListener(Env* env, DBFlushTest* test) + : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { + db_closed = false; + } + + ~TestFlushListener() override { + prev_fc_info_.status.PermitUncheckedError(); // Ignore the status + } + + void OnTableFileCreated(const TableFileCreationInfo& info) override { + // remember the info for later checking the FlushJobInfo. 
+ prev_fc_info_ = info; + ASSERT_GT(info.db_name.size(), 0U); + ASSERT_GT(info.cf_name.size(), 0U); + ASSERT_GT(info.file_path.size(), 0U); + ASSERT_GT(info.job_id, 0); + ASSERT_GT(info.table_properties.data_size, 0U); + ASSERT_GT(info.table_properties.raw_key_size, 0U); + ASSERT_GT(info.table_properties.raw_value_size, 0U); + ASSERT_GT(info.table_properties.num_data_blocks, 0U); + ASSERT_GT(info.table_properties.num_entries, 0U); + ASSERT_EQ(info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); + } + + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + flushed_dbs_.push_back(db); + flushed_column_family_names_.push_back(info.cf_name); + if (info.triggered_writes_slowdown) { + slowdown_count++; + } + if (info.triggered_writes_stop) { + stop_count++; + } + // verify whether the previously created file matches the flushed file. + ASSERT_EQ(prev_fc_info_.db_name, db->GetName()); + ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name); + ASSERT_EQ(prev_fc_info_.job_id, info.job_id); + ASSERT_EQ(prev_fc_info_.file_path, info.file_path); + ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number); + + // Note: the following chunk relies on the notification pertaining to the + // database pointed to by DBTestBase::db_, and is thus bypassed when + // that assumption does not hold (see the test case MultiDBMultiListeners + // below). 
+ ASSERT_TRUE(test_); + if (db == test_->db_) { + std::vector> files_by_level; + test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(), + &files_by_level); + + ASSERT_FALSE(files_by_level.empty()); + auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(), + [&](const FileMetaData& meta) { + return meta.fd.GetNumber() == info.file_number; + }); + ASSERT_NE(it, files_by_level[0].end()); + ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number); + } + + ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id); + ASSERT_GT(info.thread_id, 0U); + } + + std::vector flushed_column_family_names_; + std::vector flushed_dbs_; + int slowdown_count; + int stop_count; + bool db_closing; + std::atomic_bool db_closed; + TableFileCreationInfo prev_fc_info_; + + protected: + Env* env_; + DBFlushTest* test_; +}; +#endif // !ROCKSDB_LITE + +TEST_F(DBFlushTest, MemPurgeBasic) { + Options options = CurrentOptions(); + + // The following options are used to enforce several values that + // may already exist as default values to make this test resilient + // to default value updates in the future. + options.statistics = CreateDBStatistics(); + + // Record all statistics. + options.statistics->set_stats_level(StatsLevel::kAll); + + // create the DB if it's not already present + options.create_if_missing = true; + + // Useful for now as we are trying to compare uncompressed data savings on + // flush(). + options.compression = kNoCompression; + + // Prevent memtable in place updates. Should already be disabled + // (from Wiki: + // In place updates can be enabled by toggling on the bool + // inplace_update_support flag. However, this flag is by default set to + // false + // because this thread-safe in-place update support is not compatible + // with concurrent memtable writes. 
Note that the bool + // allow_concurrent_memtable_write is set to true by default ) + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 1 << 20; + // Activate the MemPurge prototype. + options.experimental_mempurge_threshold = 1.0; +#ifndef ROCKSDB_LITE + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); +#endif // !ROCKSDB_LITE + ASSERT_OK(TryReopen(options)); + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::string KEY1 = "IamKey1"; + std::string KEY2 = "IamKey2"; + std::string KEY3 = "IamKey3"; + std::string KEY4 = "IamKey4"; + std::string KEY5 = "IamKey5"; + std::string KEY6 = "IamKey6"; + std::string KEY7 = "IamKey7"; + std::string KEY8 = "IamKey8"; + std::string KEY9 = "IamKey9"; + std::string RNDKEY1, RNDKEY2, RNDKEY3; + const std::string NOT_FOUND = "NOT_FOUND"; + + // Heavy overwrite workload, + // more than would fit in maximum allowed memtables. + Random rnd(719); + const size_t NUM_REPEAT = 100; + const size_t RAND_KEYS_LENGTH = 57; + const size_t RAND_VALUES_LENGTH = 10240; + std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9, p_rv1, + p_rv2, p_rv3; + + // Insert a very first set of keys that will be + // mempurged at least once. 
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v4 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + ASSERT_OK(Put(KEY4, p_v4)); + ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + + // Insertion of of K-V pairs, multiple times (overwrites). + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v6 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v7 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v8 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v9 = rnd.RandomString(RAND_VALUES_LENGTH); + + ASSERT_OK(Put(KEY5, p_v5)); + ASSERT_OK(Put(KEY6, p_v6)); + ASSERT_OK(Put(KEY7, p_v7)); + ASSERT_OK(Put(KEY8, p_v8)); + ASSERT_OK(Put(KEY9, p_v9)); + + ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + ASSERT_EQ(Get(KEY6), p_v6); + ASSERT_EQ(Get(KEY7), p_v7); + ASSERT_EQ(Get(KEY8), p_v8); + ASSERT_EQ(Get(KEY9), p_v9); + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. + const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + + // Insertion of of K-V pairs, no overwrites. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ RNDKEY1 = rnd.RandomString(RAND_KEYS_LENGTH); + RNDKEY2 = rnd.RandomString(RAND_KEYS_LENGTH); + RNDKEY3 = rnd.RandomString(RAND_KEYS_LENGTH); + p_rv1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_rv2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_rv3 = rnd.RandomString(RAND_VALUES_LENGTH); + + ASSERT_OK(Put(RNDKEY1, p_rv1)); + ASSERT_OK(Put(RNDKEY2, p_rv2)); + ASSERT_OK(Put(RNDKEY3, p_rv3)); + + ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + ASSERT_EQ(Get(KEY6), p_v6); + ASSERT_EQ(Get(KEY7), p_v7); + ASSERT_EQ(Get(KEY8), p_v8); + ASSERT_EQ(Get(KEY9), p_v9); + ASSERT_EQ(Get(RNDKEY1), p_rv1); + ASSERT_EQ(Get(RNDKEY2), p_rv2); + ASSERT_EQ(Get(RNDKEY3), p_rv3); + } + + // Assert that at least one flush to storage has been performed + EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT); + // (which will consequently increase the number of mempurges recorded too). + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + + // Assert that there is no data corruption, even with + // a flush to storage. 
+ ASSERT_EQ(Get(KEY1), p_v1); + ASSERT_EQ(Get(KEY2), p_v2); + ASSERT_EQ(Get(KEY3), p_v3); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + ASSERT_EQ(Get(KEY6), p_v6); + ASSERT_EQ(Get(KEY7), p_v7); + ASSERT_EQ(Get(KEY8), p_v8); + ASSERT_EQ(Get(KEY9), p_v9); + ASSERT_EQ(Get(RNDKEY1), p_rv1); + ASSERT_EQ(Get(RNDKEY2), p_rv2); + ASSERT_EQ(Get(RNDKEY3), p_rv3); + + Close(); +} + +TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) { + Options options = CurrentOptions(); + + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; +#ifndef ROCKSDB_LITE + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); +#endif // !ROCKSDB_LITE + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 1 << 20; + // Activate the MemPurge prototype. 
+ options.experimental_mempurge_threshold = 1.0; + + ASSERT_OK(TryReopen(options)); + + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::string KEY1 = "ThisIsKey1"; + std::string KEY2 = "ThisIsKey2"; + std::string KEY3 = "ThisIsKey3"; + std::string KEY4 = "ThisIsKey4"; + std::string KEY5 = "ThisIsKey5"; + const std::string NOT_FOUND = "NOT_FOUND"; + + Random rnd(117); + const size_t NUM_REPEAT = 100; + const size_t RAND_VALUES_LENGTH = 10240; + + std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5; + int count = 0; + const int EXPECTED_COUNT_FORLOOP = 3; + const int EXPECTED_COUNT_END = 4; + + ReadOptions ropt; + ropt.pin_data = true; + ropt.total_order_seek = true; + Iterator* iter = nullptr; + + // Insertion of of K-V pairs, multiple times. + // Also insert DeleteRange + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. 
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3b = rnd.RandomString(RAND_VALUES_LENGTH); + p_v4 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + ASSERT_OK(Put(KEY4, p_v4)); + ASSERT_OK(Put(KEY5, p_v5)); + ASSERT_OK(Delete(KEY2)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2, + KEY4)); + ASSERT_OK(Put(KEY3, p_v3b)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1, + KEY3)); + ASSERT_OK(Delete(KEY1)); + + ASSERT_EQ(Get(KEY1), NOT_FOUND); + ASSERT_EQ(Get(KEY2), NOT_FOUND); + ASSERT_EQ(Get(KEY3), p_v3b); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); + + iter = db_->NewIterator(ropt); + iter->SeekToFirst(); + count = 0; + for (; iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + key = (iter->key()).ToString(false); + value = (iter->value()).ToString(false); + if (key.compare(KEY3) == 0) + ASSERT_EQ(value, p_v3b); + else if (key.compare(KEY4) == 0) + ASSERT_EQ(value, p_v4); + else if (key.compare(KEY5) == 0) + ASSERT_EQ(value, p_v5); + else + ASSERT_EQ(value, NOT_FOUND); + count++; + } + + // Expected count here is 3: KEY3, KEY4, KEY5. + ASSERT_EQ(count, EXPECTED_COUNT_FORLOOP); + if (iter) { + delete iter; + } + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. + const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + + // Additional test for the iterator+memPurge. 
+ ASSERT_OK(Put(KEY2, p_v2)); + iter = db_->NewIterator(ropt); + iter->SeekToFirst(); + ASSERT_OK(Put(KEY4, p_v4)); + count = 0; + for (; iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + key = (iter->key()).ToString(false); + value = (iter->value()).ToString(false); + if (key.compare(KEY2) == 0) + ASSERT_EQ(value, p_v2); + else if (key.compare(KEY3) == 0) + ASSERT_EQ(value, p_v3b); + else if (key.compare(KEY4) == 0) + ASSERT_EQ(value, p_v4); + else if (key.compare(KEY5) == 0) + ASSERT_EQ(value, p_v5); + else + ASSERT_EQ(value, NOT_FOUND); + count++; + } + + // Expected count here is 4: KEY2, KEY3, KEY4, KEY5. + ASSERT_EQ(count, EXPECTED_COUNT_END); + if (iter) delete iter; + + Close(); +} + +// Create a Compaction Fitler that will be invoked +// at flush time and will update the value of a KV pair +// if the key string is "lower" than the filter_key_ string. +class ConditionalUpdateFilter : public CompactionFilter { + public: + explicit ConditionalUpdateFilter(const std::string* filtered_key) + : filtered_key_(filtered_key) {} + bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/, + std::string* new_value, bool* value_changed) const override { + // If key CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr( + new ConditionalUpdateFilter(&filtered_key_)); + } + + const char* Name() const override { return "ConditionalUpdateFilterFactory"; } + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + // This compaction filter will be invoked + // at flush time (and therefore at MemPurge time). 
+ return (reason == TableFileCreationReason::kFlush); + } + + private: + std::string filtered_key_; +}; + +TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) { + Options options = CurrentOptions(); + + std::string KEY1 = "ThisIsKey1"; + std::string KEY2 = "ThisIsKey2"; + std::string KEY3 = "ThisIsKey3"; + std::string KEY4 = "ThisIsKey4"; + std::string KEY5 = "ThisIsKey5"; + std::string KEY6 = "ThisIsKey6"; + std::string KEY7 = "ThisIsKey7"; + std::string KEY8 = "ThisIsKey8"; + std::string KEY9 = "ThisIsKey9"; + const std::string NOT_FOUND = "NOT_FOUND"; + + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; +#ifndef ROCKSDB_LITE + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); +#endif // !ROCKSDB_LITE + // Create a ConditionalUpdate compaction filter + // that will update all the values of the KV pairs + // where the keys are "lower" than KEY4. + options.compaction_filter_factory = + std::make_shared(KEY4); + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 1 << 20; + // Activate the MemPurge prototype. 
+ options.experimental_mempurge_threshold = 1.0; + + ASSERT_OK(TryReopen(options)); + + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(53); + const size_t NUM_REPEAT = 1000; + const size_t RAND_VALUES_LENGTH = 10240; + std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9; + + p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v4 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v5 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + ASSERT_OK(Put(KEY4, p_v4)); + ASSERT_OK(Put(KEY5, p_v5)); + ASSERT_OK(Delete(KEY1)); + + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary + // length RAND_VALUES_LENGTH bytes. + p_v6 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v7 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v8 = rnd.RandomString(RAND_VALUES_LENGTH); + p_v9 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY6, p_v6)); + ASSERT_OK(Put(KEY7, p_v7)); + ASSERT_OK(Put(KEY8, p_v8)); + ASSERT_OK(Put(KEY9, p_v9)); + + ASSERT_OK(Delete(KEY7)); + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. 
+ const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + + // Verify that the ConditionalUpdateCompactionFilter + // updated the values of KEY2 and KEY3, and not KEY4 and KEY5. + ASSERT_EQ(Get(KEY1), NOT_FOUND); + ASSERT_EQ(Get(KEY2), NEW_VALUE); + ASSERT_EQ(Get(KEY3), NEW_VALUE); + ASSERT_EQ(Get(KEY4), p_v4); + ASSERT_EQ(Get(KEY5), p_v5); +} + +TEST_F(DBFlushTest, DISABLED_MemPurgeWALSupport) { + Options options = CurrentOptions(); + + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 128KB. + options.write_buffer_size = 128 << 10; + // Activate the MemPurge prototype. + options.experimental_mempurge_threshold = 1.0; + + ASSERT_OK(TryReopen(options)); + + const size_t KVSIZE = 10; + + do { + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_OK(Put(1, "baz", "v5")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "baz")); + ASSERT_OK(Put(0, "bar", "v2")); + ASSERT_OK(Put(1, "bar", "v2")); + ASSERT_OK(Put(1, "foo", "v3")); + std::atomic mempurge_count{0}; + std::atomic sst_count{0}; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:MemPurgeSuccessful", + [&](void* /*arg*/) { mempurge_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector keys; + for (size_t k = 0; k < KVSIZE; k++) { + keys.push_back("IamKey" + std::to_string(k)); + } + + std::string 
RNDKEY, RNDVALUE; + const std::string NOT_FOUND = "NOT_FOUND"; + + // Heavy overwrite workload, + // more than would fit in maximum allowed memtables. + Random rnd(719); + const size_t NUM_REPEAT = 100; + const size_t RAND_KEY_LENGTH = 4096; + const size_t RAND_VALUES_LENGTH = 1024; + std::vector values_default(KVSIZE), values_pikachu(KVSIZE); + + // Insert a very first set of keys that will be + // mempurged at least once. + for (size_t k = 0; k < KVSIZE / 2; k++) { + values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH); + values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH); + } + + // Insert keys[0:KVSIZE/2] to + // both 'default' and 'pikachu' CFs. + for (size_t k = 0; k < KVSIZE / 2; k++) { + ASSERT_OK(Put(0, keys[k], values_default[k])); + ASSERT_OK(Put(1, keys[k], values_pikachu[k])); + } + + // Check that the insertion was seamless. + for (size_t k = 0; k < KVSIZE / 2; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + + // Insertion of of K-V pairs, multiple times (overwrites) + // into 'default' CF. Will trigger mempurge. + for (size_t j = 0; j < NUM_REPEAT; j++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH); + } + + // Insert K-V into default CF. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + ASSERT_OK(Put(0, keys[k], values_default[k])); + } + + // Check key validity, for all keys, both in + // default and pikachu CFs. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + } + // Note that at this point, only keys[0:KVSIZE/2] + // have been inserted into Pikachu. + for (size_t k = 0; k < KVSIZE / 2; k++) { + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + } + + // Insertion of of K-V pairs, multiple times (overwrites) + // into 'pikachu' CF. Will trigger mempurge. 
+ // Check that we keep the older logs for 'default' imm(). + for (size_t j = 0; j < NUM_REPEAT; j++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH); + } + + // Insert K-V into pikachu CF. + for (size_t k = KVSIZE / 2; k < KVSIZE; k++) { + ASSERT_OK(Put(1, keys[k], values_pikachu[k])); + } + + // Check key validity, for all keys, + // both in default and pikachu. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + } + + // Check that there was at least one mempurge + const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1; + // Check that there was no SST files created during flush. + const uint32_t EXPECTED_SST_COUNT = 0; + + EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT); + if (options.experimental_mempurge_threshold == + std::numeric_limits::max()) { + EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT); + } + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Check that there was no data corruption anywhere, + // not in 'default' nor in 'Pikachu' CFs. + ASSERT_EQ("v3", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v4")); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + // Check keys in 'Default' and 'Pikachu'. + // keys[0:KVSIZE/2] were for sure contained + // in the imm() at Reopen/recovery time. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + // Insertion of random K-V pairs to trigger + // a flush in the Pikachu CF. + for (size_t j = 0; j < NUM_REPEAT; j++) { + RNDKEY = rnd.RandomString(RAND_KEY_LENGTH); + RNDVALUE = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(1, RNDKEY, RNDVALUE)); + } + // ASsert than there was at least one flush to storage. 
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v4", Get(1, "foo")); + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v5", Get(1, "baz")); + // Since values in default are held in mutable mem() + // and imm(), check if the flush in pikachu didn't + // affect these values. + for (size_t k = 0; k < KVSIZE; k++) { + ASSERT_EQ(Get(0, keys[k]), values_default[k]); + ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]); + } + ASSERT_EQ(Get(1, RNDKEY), RNDVALUE); + } while (ChangeWalOptions()); +} + TEST_P(DBFlushDirectIOTest, DirectIO) { Options options; options.create_if_missing = true; options.disable_auto_compactions = true; options.max_background_flushes = 2; options.use_direct_io_for_flush_and_compaction = GetParam(); - options.env = new MockEnv(Env::Default()); + options.env = MockEnv::Create(Env::Default()); SyncPoint::GetInstance()->SetCallBack( "BuildTable:create_file", [&](void* arg) { bool* use_direct_writes = static_cast(arg); @@ -305,7 +1443,8 @@ // mode. fault_injection_env->SetFilesystemActive(false); ASSERT_OK(db_->ContinueBackgroundWork()); - dbfull()->TEST_WaitForFlushMemTable(); + // We ingested the error to env, so the returned status is not OK. 
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE uint64_t num_bg_errors; ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors, @@ -379,9 +1518,9 @@ DBImpl* db_impl = static_cast_with_check(db); InstrumentedMutex* mutex = db_impl->mutex(); mutex->Lock(); - auto* cfd = - reinterpret_cast(db->DefaultColumnFamily()) - ->cfd(); + auto* cfd = static_cast_with_check( + db->DefaultColumnFamily()) + ->cfd(); ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber()); mutex->Unlock(); } @@ -394,7 +1533,7 @@ std::shared_ptr listener = std::make_shared(); SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BackgroundCallFlush:start", + {{"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"}, {"DBImpl::FlushMemTableToOutputFile:Finish", "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}}); @@ -443,6 +1582,568 @@ } #endif // !ROCKSDB_LITE +TEST_F(DBFlushTest, FlushWithBlob) { + constexpr uint64_t min_blob_size = 10; + + Options options; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + constexpr char short_value[] = "short"; + static_assert(sizeof(short_value) - 1 < min_blob_size, + "short_value too long"); + + constexpr char long_value[] = "long_value"; + static_assert(sizeof(long_value) - 1 >= min_blob_size, + "long_value too short"); + + ASSERT_OK(Put("key1", short_value)); + ASSERT_OK(Put("key2", long_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("key1"), short_value); + ASSERT_EQ(Get("key2"), long_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + 
assert(storage_info); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_EQ(l0_files.size(), 1); + + const FileMetaData* const table_file = l0_files[0]; + assert(table_file); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + assert(blob_file); + + ASSERT_EQ(table_file->smallest.user_key(), "key1"); + ASSERT_EQ(table_file->largest.user_key(), "key2"); + ASSERT_EQ(table_file->fd.smallest_seqno, 1); + ASSERT_EQ(table_file->fd.largest_seqno, 2); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 1); + +#ifndef ROCKSDB_LITE + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_FALSE(compaction_stats.empty()); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); + + const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); +#endif // ROCKSDB_LITE +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = 
fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
+ ASSERT_OK(Put("key3", "value3")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, PickRightMemtables) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + options.create_if_missing = true; + + const std::string test_cf_name = "test_cf"; + options.max_write_buffer_number = 128; + CreateColumnFamilies({test_cf_name}, options); + + Close(); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, test_cf_name}, options); + + ASSERT_OK(db_->Put(WriteOptions(), "key", "value")); + + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "key", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:BeforeReLock", [&](void* /*arg*/) { + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "what", "v")); + auto* cfhi = + static_cast_with_check(handles_[1]); + assert(cfhi); + ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfhi->cfd())); + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", [&](void* arg) { + auto* job = reinterpret_cast(arg); + assert(job); + const auto& mems = job->GetMemTables(); + assert(mems.size() == 1); + assert(mems[0]); + ASSERT_EQ(1, mems[0]->GetID()); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class DBFlushTestBlobError : public DBFlushTest, + public 
testing::WithParamInterface { + public: + DBFlushTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBFlushTestBlobError, FlushError) { + Options options; + options.enable_blob_files = true; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + ASSERT_OK(Put("key", "blob")); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_NOK(Flush()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_TRUE(l0_files.empty()); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_TRUE(blob_files.empty()); + + // Make sure the files generated by the failed job have been deleted + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kTableFile; + + if (!ParseFileName(file, &number, &type)) { + continue; + } + + ASSERT_NE(type, kTableFile); + ASSERT_NE(type, kBlobFile); + } + +#ifndef ROCKSDB_LITE + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + 
ASSERT_FALSE(compaction_stats.empty()); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 0); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_GT(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } + + const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) { + class SimpleTestFlushListener : public EventListener { + public: + explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {} + ~SimpleTestFlushListener() override {} + + void OnFlushBegin(DB* db, const FlushJobInfo& info) override { + ASSERT_EQ(static_cast(0), info.cf_id); + + ASSERT_OK(db->Delete(WriteOptions(), "foo")); + snapshot_ = db->GetSnapshot(); + ASSERT_OK(db->Put(WriteOptions(), "foo", "value")); + + auto* dbimpl = static_cast_with_check(db); + assert(dbimpl); + + ColumnFamilyHandle* cfh = db->DefaultColumnFamily(); + auto* cfhi = static_cast_with_check(cfh); + assert(cfhi); + ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd())); + } + + DBFlushTest* test_ = nullptr; + const Snapshot* snapshot_ = nullptr; + }; + + Options options = CurrentOptions(); + options.create_if_missing = true; + auto* listener = new SimpleTestFlushListener(this); + options.listeners.emplace_back(listener); + DestroyAndReopen(options); + + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0")); + + ManagedSnapshot 
snapshot_guard(db_); + + ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + ASSERT_OK(db_->Flush(FlushOptions(), default_cf)); + + const Snapshot* snapshot = listener->snapshot_; + assert(snapshot); + + ReadOptions read_opts; + read_opts.snapshot = snapshot; + + // Using snapshot should not see "foo". + { + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + } + + db_->ReleaseSnapshot(snapshot); +} + +TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.allow_2pc = true; + options.atomic_flush = GetParam(); + // 64MB so that memtable flush won't be trigger by the small writes. + options.write_buffer_size = (static_cast(64) << 20); + + // Destroy the DB to recreate as a TransactionDB. + Close(); + Destroy(options, true); + + // Create a TransactionDB. + TransactionDB* txn_db = nullptr; + TransactionDBOptions txn_db_opts; + txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); + ASSERT_NE(txn_db, nullptr); + db_ = txn_db; + + // Create two more columns other than default CF. + std::vector cfs = {"puppy", "kitty"}; + CreateColumnFamilies(cfs, options); + ASSERT_EQ(handles_.size(), 2); + ASSERT_EQ(handles_[0]->GetName(), cfs[0]); + ASSERT_EQ(handles_[1]->GetName(), cfs[1]); + const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1; + + WriteOptions wopts; + TransactionOptions txn_opts; + // txn1 only prepare, but does not commit. + // The WAL containing the prepared but uncommitted data must be kept. + Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + // txn2 not only prepare, but also commit. 
+ Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + ASSERT_NE(txn1, nullptr); + ASSERT_NE(txn2, nullptr); + for (size_t i = 0; i < kNumCfToFlush; i++) { + ASSERT_OK(txn1->Put(handles_[i], "k1", "v1")); + ASSERT_OK(txn2->Put(handles_[i], "k2", "v2")); + } + // A txn must be named before prepare. + ASSERT_OK(txn1->SetName("txn1")); + ASSERT_OK(txn2->SetName("txn2")); + // Prepare writes to WAL, but not to memtable. (WriteCommitted) + ASSERT_OK(txn1->Prepare()); + ASSERT_OK(txn2->Prepare()); + // Commit writes to memtable. + ASSERT_OK(txn2->Commit()); + delete txn1; + delete txn2; + + // There are still data in memtable not flushed. + // But since data is small enough to reside in the active memtable, + // there are no immutable memtable. + for (size_t i = 0; i < kNumCfToFlush; i++) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + + // Atomic flush memtables, + // the min log with prepared data should be written to MANIFEST. + std::vector cfs_to_flush(kNumCfToFlush); + for (size_t i = 0; i < kNumCfToFlush; i++) { + cfs_to_flush[i] = handles_[i]; + } + ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush)); + + // There are no remaining data in memtable after flush. + for (size_t i = 0; i < kNumCfToFlush; i++) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush); + } + + // The recovered min log number with prepared data should be non-zero. + // In 2pc mode, MinLogNumberToKeep returns the + // VersionSet::min_log_number_to_keep_2pc recovered from MANIFEST, if it's 0, + // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST. 
+ cfs.push_back(kDefaultColumnFamilyName); + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + DBImpl* db_impl = reinterpret_cast(db_); + ASSERT_TRUE(db_impl->allow_2pc()); + ASSERT_NE(db_impl->MinLogNumberToKeep(), 0); +} +#endif // ROCKSDB_LITE + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -457,18 +2158,84 @@ for (size_t i = 0; i != num_cfs; ++i) { ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); } + + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + std::vector cf_ids; for (size_t i = 0; i != num_cfs; ++i) { cf_ids.emplace_back(static_cast(i)); } ASSERT_OK(Flush(cf_ids)); + for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); + ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); } } +TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + options.write_buffer_size = (static_cast(64) << 20); + CreateAndReopenWithCF({"pikachu"}, options); + + const size_t num_cfs = handles_.size(); + ASSERT_EQ(num_cfs, 2); + WriteOptions wopts; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); + } + + { + // Flush the default CF only. 
+ std::vector cf_ids{0}; + ASSERT_OK(Flush(cf_ids)); + + autovector flushed_cfds; + autovector> flush_edits; + auto flushed_cfh = static_cast(handles_[0]); + flushed_cfds.push_back(flushed_cfh->cfd()); + flush_edits.push_back({}); + auto unflushed_cfh = static_cast(handles_[1]); + + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(), + flushed_cfds, flush_edits), + unflushed_cfh->cfd()->GetLogNumber()); + } + + { + // Flush all CFs. + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + cf_ids.emplace_back(static_cast(i)); + } + ASSERT_OK(Flush(cf_ids)); + uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber(); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + autovector flushed_cfds; + autovector> flush_edits; + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + flushed_cfds.push_back(cfh->cfd()); + flush_edits.push_back({}); + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber()); + } + ASSERT_EQ(min_log_number_to_keep, log_num_after_flush); + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(), + flushed_cfds, flush_edits), + min_log_number_to_keep); + } +} + TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -499,13 +2266,13 @@ TEST_SYNC_POINT( "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"); if (options.atomic_flush) { - for (size_t i = 0; i != num_cfs - 1; ++i) { + for (size_t i = 0; i + 1 != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); } } else { - for (size_t i = 0; i != num_cfs - 1; ++i) { + for (size_t i = 0; i + 1 != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); @@ -549,7 +2316,8 @@ 
fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2"); for (auto* cfh : handles_) { - dbfull()->TEST_WaitForFlushMemTable(cfh); + // Returns the IO error happend during flush. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh)); } for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); @@ -651,7 +2419,7 @@ options.create_if_missing = true; options.atomic_flush = atomic_flush; options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysTriggerFlush)); + test::NewSpecialSkipListFactory(kNumKeysTriggerFlush)); CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i != kNumKeysTriggerFlush; ++i) { @@ -770,6 +2538,122 @@ SyncPoint::GetInstance()->ClearAllCallBacks(); } +// In atomic flush, concurrent bg flush threads commit to the MANIFEST in +// serial, in the order of their picked memtables for each column family. +// Only when a bg flush thread finds out that its memtables are the earliest +// unflushed ones for all the included column families will this bg flush +// thread continue to commit to MANIFEST. +// This unit test uses sync point to coordinate the execution of two bg threads +// executing the same sequence of functions. The interleaving are as follows. 
+// time bg1 bg2 +// | pick memtables to flush +// | flush memtables cf1_m1, cf2_m1 +// | join MANIFEST write queue +// | pick memtabls to flush +// | flush memtables cf1_(m1+1) +// | join MANIFEST write queue +// | wait to write MANIFEST +// | write MANIFEST +// | IO error +// | detect IO error and stop waiting +// V +TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + auto fault_injection_env = std::make_shared(env_); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.atomic_flush = true; + options.env = fault_injection_env.get(); + // Set a larger value than default so that RocksDB can schedule concurrent + // background flush threads. + options.max_background_jobs = 8; + options.max_write_buffer_number = 8; + CreateAndReopenWithCF({"pikachu"}, options); + + assert(2 == handles_.size()); + + WriteOptions write_opts; + write_opts.disableWAL = true; + + ASSERT_OK(Put(0, "a", "v_0_a", write_opts)); + ASSERT_OK(Put(1, "a", "v_1_a", write_opts)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + SyncPoint::GetInstance()->LoadDependency({ + {"BgFlushThr2:WaitToCommit", "BgFlushThr1:BeforeWriteManifest"}, + }); + + std::thread::id bg_flush_thr1, bg_flush_thr2; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCallFlush:start", [&](void*) { + if (bg_flush_thr1 == std::thread::id()) { + bg_flush_thr1 = std::this_thread::get_id(); + } else if (bg_flush_thr2 == std::thread::id()) { + bg_flush_thr2 = std::this_thread::get_id(); + } + }); + + int called = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", [&](void* arg) { + if (std::this_thread::get_id() == bg_flush_thr2) { + const auto* ptr = reinterpret_cast*>(arg); + assert(ptr); + if (0 == called) { + // When bg flush thread 2 reaches here for the first time. 
+ ASSERT_OK(ptr->first); + ASSERT_TRUE(ptr->second); + } else if (1 == called) { + // When bg flush thread 2 reaches here for the second time. + ASSERT_TRUE(ptr->first.IsIOError()); + ASSERT_FALSE(ptr->second); + } + ++called; + TEST_SYNC_POINT("BgFlushThr2:WaitToCommit"); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + [&](void*) { + if (std::this_thread::get_id() == bg_flush_thr1) { + TEST_SYNC_POINT("BgFlushThr1:BeforeWriteManifest"); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (std::this_thread::get_id() != bg_flush_thr1) { + return; + } + ASSERT_OK(db_->Put(write_opts, "b", "v_1_b")); + + FlushOptions flush_opts; + flush_opts.wait = false; + std::vector cfhs(1, db_->DefaultColumnFamily()); + ASSERT_OK(dbfull()->Flush(flush_opts, cfhs)); + }); + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { + auto* ptr = reinterpret_cast(arg); + assert(ptr); + *ptr = IOStatus::IOError("Injected failure"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(dbfull()->Flush(FlushOptions(), handles_).IsIOError()); + + Close(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,173 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE +#include "db/db_impl/compacted_db_impl.h" + +#include "db/db_impl/db_impl.h" +#include "db/version_set.h" +#include "logging/logging.h" +#include "table/get_context.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +extern void MarkKeyMayExist(void* arg); +extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v, bool hit_and_return); + +CompactedDBImpl::CompactedDBImpl(const DBOptions& options, + const std::string& dbname) + : DBImpl(options, dbname, /*seq_per_batch*/ false, +/*batch_per_txn*/ true, + /*read_only*/ true), + cfd_(nullptr), + version_(nullptr), + user_comparator_(nullptr) {} + +CompactedDBImpl::~CompactedDBImpl() { +} + +size_t CompactedDBImpl::FindFile(const Slice& key) { + size_t right = files_.num_files - 1; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0; + }; + return static_cast(std::lower_bound(files_.files, + files_.files + right, key, cmp) - files_.files); +} + +Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, + const Slice& key, PinnableSlice* value) { + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, key, value, nullptr, nullptr, + nullptr, true, nullptr, nullptr); + LookupKey lkey(key, kMaxSequenceNumber); + Status s = files_.files[FindFile(key)].fd.table_reader->Get( + options, lkey.internal_key(), &get_context, nullptr); + if (!s.ok() && !s.IsNotFound()) { + return s; + } + if (get_context.State() == GetContext::kFound) { + return Status::OK(); + } + return Status::NotFound(); +} + +std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* 
values) { + autovector reader_list; + for (const auto& key : keys) { + const FdWithKeyRange& f = files_.files[FindFile(key)]; + if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) { + reader_list.push_back(nullptr); + } else { + LookupKey lkey(key, kMaxSequenceNumber); + f.fd.table_reader->Prepare(lkey.internal_key()); + reader_list.push_back(f.fd.table_reader); + } + } + std::vector statuses(keys.size(), Status::NotFound()); + values->resize(keys.size()); + int idx = 0; + for (auto* r : reader_list) { + if (r != nullptr) { + PinnableSlice pinnable_val; + std::string& value = (*values)[idx]; + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, keys[idx], &pinnable_val, + nullptr, nullptr, nullptr, true, nullptr, nullptr); + LookupKey lkey(keys[idx], kMaxSequenceNumber); + Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr); + assert(static_cast(idx) < statuses.size()); + if (!s.ok() && !s.IsNotFound()) { + statuses[idx] = s; + } else { + value.assign(pinnable_val.data(), pinnable_val.size()); + if (get_context.State() == GetContext::kFound) { + statuses[idx] = Status::OK(); + } + } + } + ++idx; + } + return statuses; +} + +Status CompactedDBImpl::Init(const Options& options) { + SuperVersionContext sv_context(/* create_superversion */ true); + mutex_.Lock(); + ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, + ColumnFamilyOptions(options)); + Status s = Recover({cf}, true /* read only */, false, true); + if (s.ok()) { + cfd_ = static_cast_with_check(DefaultColumnFamily()) + ->cfd(); + cfd_->InstallSuperVersion(&sv_context, &mutex_); + } + mutex_.Unlock(); + sv_context.Clean(); + if (!s.ok()) { + return s; + } + NewThreadStatusCfInfo(cfd_); + version_ = cfd_->GetSuperVersion()->current; + user_comparator_ = cfd_->user_comparator(); + auto* vstorage = version_->storage_info(); + if (vstorage->num_non_empty_levels() == 0) { + return Status::NotSupported("no file exists"); + } + const 
LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0); + // L0 should not have files + if (l0.num_files > 1) { + return Status::NotSupported("L0 contain more than 1 file"); + } + if (l0.num_files == 1) { + if (vstorage->num_non_empty_levels() > 1) { + return Status::NotSupported("Both L0 and other level contain files"); + } + files_ = l0; + return Status::OK(); + } + + for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) { + if (vstorage->LevelFilesBrief(i).num_files > 0) { + return Status::NotSupported("Other levels also contain files"); + } + } + + int level = vstorage->num_non_empty_levels() - 1; + if (vstorage->LevelFilesBrief(level).num_files > 0) { + files_ = vstorage->LevelFilesBrief(level); + return Status::OK(); + } + return Status::NotSupported("no file exists"); +} + +Status CompactedDBImpl::Open(const Options& options, + const std::string& dbname, DB** dbptr) { + *dbptr = nullptr; + + if (options.max_open_files != -1) { + return Status::InvalidArgument("require max_open_files = -1"); + } + if (options.merge_operator.get() != nullptr) { + return Status::InvalidArgument("merge operator is not supported"); + } + DBOptions db_options(options); + std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); + Status s = db->Init(options); + if (s.ok()) { + db->StartPeriodicWorkScheduler(); + ROCKS_LOG_INFO(db->immutable_db_options_.info_log, + "Opened the db as fully compacted mode"); + LogFlush(db->immutable_db_options_.info_log); + *dbptr = db.release(); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,118 @@ +// Copyright (c) 
2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE +#include +#include +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +class CompactedDBImpl : public DBImpl { + public: + CompactedDBImpl(const DBOptions& options, const std::string& dbname); + // No copying allowed + CompactedDBImpl(const CompactedDBImpl&) = delete; + void operator=(const CompactedDBImpl&) = delete; + + ~CompactedDBImpl() override; + + static Status Open(const Options& options, const std::string& dbname, + DB** dbptr); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* values) + override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported in compacted db 
mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); + } + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + private: + friend class DB; + inline size_t FindFile(const Slice& key); + Status Init(const Options& options); + + ColumnFamilyData* cfd_; + Version* version_; + const Comparator* user_comparator_; + LevelFilesBrief 
files_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -18,10 +18,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -45,6 +45,7 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "db/periodic_work_scheduler.h" #include "db/range_tombstone_fragmenter.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" @@ -52,7 +53,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" -#include "env/composite_env_wrapper.h" +#include "env/unique_id_gen.h" #include "file/file_util.h" #include "file/filename.h" #include "file/random_access_file_reader.h" @@ -60,9 +61,8 @@ #include "logging/auto_roll_logger.h" #include "logging/log_buffer.h" #include "logging/logging.h" -#include "memtable/hash_linklist_rep.h" -#include "memtable/hash_skiplist_rep.h" #include "monitoring/in_memory_stats_history.h" +#include "monitoring/instrumented_mutex.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/persistent_stats_history.h" @@ -82,25 +82,29 @@ #include "rocksdb/stats_history.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/merging_iterator.h" #include "table/multiget_context.h" +#include "table/sst_file_dumper.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include 
"table/unique_id_impl.h" #include "test_util/sync_point.h" -#include "tools/sst_dump_tool_imp.h" +#include "trace_replay/trace_replay.h" #include "util/autovector.h" -#include "util/build_version.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/defer.h" #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "utilities/trace/replayer_impl.h" namespace ROCKSDB_NAMESPACE { @@ -146,26 +150,31 @@ } // namespace DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch, const bool batch_per_txn) + const bool seq_per_batch, const bool batch_per_txn, + bool read_only) : dbname_(dbname), own_info_log_(options.info_log == nullptr), - initial_db_options_(SanitizeOptions(dbname, options)), + initial_db_options_(SanitizeOptions(dbname, options, read_only)), env_(initial_db_options_.env), - fs_(initial_db_options_.file_system), + io_tracer_(std::make_shared()), immutable_db_options_(initial_db_options_), + fs_(immutable_db_options_.fs, io_tracer_), mutable_db_options_(initial_db_options_), - stats_(immutable_db_options_.statistics.get()), - mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, + stats_(immutable_db_options_.stats), + mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), default_cf_handle_(nullptr), + error_handler_(this, immutable_db_options_, &mutex_), + event_logger_(immutable_db_options_.info_log.get()), max_total_in_memory_state_(0), file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite( file_options_, immutable_db_options_)), seq_per_batch_(seq_per_batch), batch_per_txn_(batch_per_txn), - db_lock_(nullptr), + next_job_id_(1), shutting_down_(false), + db_lock_(nullptr), manual_compaction_paused_(false), bg_cv_(&mutex_), logfile_number_(0), @@ -190,20 +199,22 @@ 
bg_purge_scheduled_(0), disable_delete_obsolete_files_(0), pending_purge_obsolete_files_(0), - delete_obsolete_files_last_run_(env_->NowMicros()), + delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()), last_stats_dump_time_microsec_(0), - next_job_id_(1), has_unpersisted_data_(false), unable_to_release_oldest_log_(false), num_running_ingest_file_(0), #ifndef ROCKSDB_LITE - wal_manager_(immutable_db_options_, file_options_, seq_per_batch), + wal_manager_(immutable_db_options_, file_options_, io_tracer_, + seq_per_batch), #endif // ROCKSDB_LITE - event_logger_(immutable_db_options_.info_log.get()), bg_work_paused_(0), bg_compaction_paused_(0), refitting_level_(false), opened_successfully_(false), +#ifndef ROCKSDB_LITE + periodic_work_scheduler_(nullptr), +#endif // ROCKSDB_LITE two_write_queues_(options.two_write_queues), manual_wal_flush_(options.manual_wal_flush), // last_sequencee_ is always maintained by the main queue that also writes @@ -225,12 +236,15 @@ own_sfm_(options.sst_file_manager == nullptr), preserve_deletes_(options.preserve_deletes), closed_(false), - error_handler_(this, immutable_db_options_, &mutex_), - atomic_flush_install_cv_(&mutex_) { + atomic_flush_install_cv_(&mutex_), + blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_, + &error_handler_, &event_logger_, + immutable_db_options_.listeners, dbname_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. assert(batch_per_txn_ || seq_per_batch_); - env_->GetAbsolutePath(dbname, &db_absolute_path_); + // TODO: Check for an error here + env_->GetAbsolutePath(dbname, &db_absolute_path_).PermitUncheckedError(); // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. 
@@ -242,15 +256,18 @@ co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; co.metadata_charge_policy = kDontChargeCacheMetadata; table_cache_ = NewLRUCache(co); + SetDbSessionId(); + assert(!db_session_id_.empty()); versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_, table_cache_.get(), write_buffer_manager_, - &write_controller_, &block_cache_tracer_)); + &write_controller_, &block_cache_tracer_, + io_tracer_, db_session_id_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); DumpRocksDBBuildVersion(immutable_db_options_.info_log.get()); - DumpDBFileSummary(immutable_db_options_, dbname_); + DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_); immutable_db_options_.Dump(immutable_db_options_.info_log.get()); mutable_db_options_.Dump(immutable_db_options_.info_log.get()); DumpSupportInfo(immutable_db_options_.info_log.get()); @@ -259,6 +276,10 @@ // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber() // is called by client and this seqnum is advanced. preserve_deletes_seqnum_.store(0); + + if (write_buffer_manager_) { + wbm_stall_.reset(new WBMStallInterface()); + } } Status DBImpl::Resume() { @@ -294,22 +315,59 @@ // 4. Schedule compactions if needed for all the CFs. 
This is needed as the // flush in the prior step might have been a no-op for some CFs, which // means a new super version wouldn't have been installed -Status DBImpl::ResumeImpl() { +Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); WaitForBackgroundWork(); - Status bg_error = error_handler_.GetBGError(); Status s; if (shutdown_initiated_) { // Returning shutdown status to SFM during auto recovery will cause it // to abort the recovery and allow the shutdown to progress s = Status::ShutdownInProgress(); } - if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "DB resume requested but failed due to Fatal/Unrecoverable error"); - s = bg_error; + + if (s.ok()) { + Status bg_error = error_handler_.GetBGError(); + if (bg_error.severity() > Status::Severity::kHardError) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but failed due to Fatal/Unrecoverable error"); + s = bg_error; + } + } + + // Make sure the IO Status stored in version set is set to OK. + bool file_deletion_disabled = !IsFileDeletionsEnabled(); + if (s.ok()) { + IOStatus io_s = versions_->io_status(); + if (io_s.IsIOError()) { + // If resuming from IOError resulted from MANIFEST write, then assert + // that we must have already set the MANIFEST writer to nullptr during + // clean-up phase MANIFEST writing. We must have also disabled file + // deletions. + assert(!versions_->descriptor_log_); + assert(file_deletion_disabled); + // Since we are trying to recover from MANIFEST write error, we need to + // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted. + // Therefore, force writing a dummy version edit because we do not know + // whether there are flush jobs with non-empty data to flush, triggering + // appends to MANIFEST. 
+ VersionEdit edit; + auto cfh = + static_cast_with_check(default_cf_handle_); + assert(cfh); + ColumnFamilyData* cfd = cfh->cfd(); + const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); + s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_, + directories_.GetDbDir()); + if (!s.ok()) { + io_s = versions_->io_status(); + if (!io_s.ok()) { + s = error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWrite); + } + } + } } // We cannot guarantee consistency of the WAL. So force flush Memtables of @@ -322,18 +380,15 @@ autovector cfds; SelectColumnFamiliesForAtomicFlush(&cfds); mutex_.Unlock(); - s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery); + s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason); mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - cfd->Ref(); - mutex_.Unlock(); - s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); + InstrumentedMutexUnlock u(&mutex_); + s = FlushMemTable(cfd, flush_opts, context.flush_reason); if (!s.ok()) { break; } @@ -348,9 +403,6 @@ JobContext job_context(0); FindObsoleteFiles(&job_context, true); - if (s.ok()) { - s = error_handler_.ClearBGError(); - } mutex_.Unlock(); job_context.manifest_file_number = 1; @@ -360,9 +412,42 @@ job_context.Clean(); if (s.ok()) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + assert(versions_->io_status().ok()); + // If we reach here, we should re-enable file deletions if it was disabled + // during previous error handling. 
+ if (file_deletion_disabled) { + // Always return ok + s = EnableFileDeletions(/*force=*/true); + if (!s.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but could not enable file deletions [%s]", + s.ToString().c_str()); + assert(false); + } + } } + mutex_.Lock(); + if (s.ok()) { + // This will notify and unblock threads waiting for error recovery to + // finish. Those previouly waiting threads can now proceed, which may + // include closing the db. + s = error_handler_.ClearBGError(); + } else { + // NOTE: this is needed to pass ASSERT_STATUS_CHECKED + // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test. + // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952 + error_handler_.GetRecoveryError().PermitUncheckedError(); + } + + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]", + s.ToString().c_str()); + } + // Check for shutdown again before scheduling further compactions, // since we released and re-acquired the lock above if (shutdown_initiated_) { @@ -396,14 +481,12 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown: canceling all background work"); - if (thread_dump_stats_ != nullptr) { - thread_dump_stats_->cancel(); - thread_dump_stats_.reset(); - } - if (thread_persist_stats_ != nullptr) { - thread_persist_stats_->cancel(); - thread_persist_stats_.reset(); +#ifndef ROCKSDB_LITE + if (periodic_work_scheduler_ != nullptr) { + periodic_work_scheduler_->Unregister(this); } +#endif // !ROCKSDB_LITE + InstrumentedMutexLock l(&mutex_); if (!shutting_down_.load(std::memory_order_acquire) && has_unpersisted_data_.load(std::memory_order_relaxed) && @@ -412,20 +495,19 @@ autovector cfds; SelectColumnFamiliesForAtomicFlush(&cfds); mutex_.Unlock(); - AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + Status s = + AtomicFlushMemTables(cfds, FlushOptions(), 
FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { - cfd->Ref(); - mutex_.Unlock(); - FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); + InstrumentedMutexUnlock u(&mutex_); + Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? } } } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); } shutting_down_.store(true, std::memory_order_release); @@ -447,19 +529,29 @@ } mutex_.Unlock(); + // Below check is added as recovery_error_ is not checked and it causes crash + // in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when space limit is + // reached. + error_handler_.GetRecoveryError().PermitUncheckedError(); + // CancelAllBackgroundWork called with false means we just set the shutdown // marker. 
After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) CancelAllBackgroundWork(false); - int bottom_compactions_unscheduled = - env_->UnSchedule(this, Env::Priority::BOTTOM); - int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); - int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); - Status ret; + + // Cancel manual compaction if there's any + if (HasPendingManualCompaction()) { + DisableManualCompaction(); + } mutex_.Lock(); - bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled; - bg_compaction_scheduled_ -= compactions_unscheduled; - bg_flush_scheduled_ -= flushes_unscheduled; + // Unschedule all tasks for this DB + for (uint8_t i = 0; i < static_cast(TaskType::kCount); i++) { + env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM); + env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW); + env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH); + } + + Status ret = Status::OK(); // Wait for background work to finish while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || @@ -475,12 +567,45 @@ flush_scheduler_.Clear(); trim_history_scheduler_.Clear(); + // For now, simply trigger a manual flush at close time + // on all the column families. + // TODO(bjlemaire): Check if this is needed. Also, in the + // future we can contemplate doing a more fine-grained + // flushing by first checking if there is a need for + // flushing (but need to implement something + // else than imm()->IsFlushPending() because the output + // memtables added to imm() dont trigger flushes). 
+ if (immutable_db_options_.experimental_mempurge_threshold > 0.0) { + Status flush_ret; + mutex_.Unlock(); + for (ColumnFamilyData* cf : *versions_->GetColumnFamilySet()) { + if (immutable_db_options_.atomic_flush) { + flush_ret = AtomicFlushMemTables({cf}, FlushOptions(), + FlushReason::kManualFlush); + if (!flush_ret.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Atomic flush memtables failed upon closing (mempurge)."); + } + } else { + flush_ret = + FlushMemTable(cf, FlushOptions(), FlushReason::kManualFlush); + if (!flush_ret.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Flush memtables failed upon closing (mempurge)."); + } + } + } + mutex_.Lock(); + } + while (!flush_queue_.empty()) { const FlushRequest& flush_req = PopFirstFromFlushQueue(); for (const auto& iter : flush_req) { iter.first->UnrefAndTryDelete(); } } + while (!compaction_queue_.empty()) { auto cfd = PopFirstFromCompactionQueue(); cfd->UnrefAndTryDelete(); @@ -533,7 +658,7 @@ ROCKS_LOG_WARN( immutable_db_options_.info_log, "Unable to Sync WAL file %s with error -- %s", - LogFileName(immutable_db_options_.wal_dir, log_number).c_str(), + LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(), s.ToString().c_str()); // Retain the first error if (ret.ok()) { @@ -567,7 +692,8 @@ versions_.reset(); mutex_.Unlock(); if (db_lock_ != nullptr) { - env_->UnlockFile(db_lock_); + // TODO: Check for unlock error + env_->UnlockFile(db_lock_).PermitUncheckedError(); } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete"); @@ -586,11 +712,15 @@ if (immutable_db_options_.info_log && own_info_log_) { Status s = immutable_db_options_.info_log->Close(); - if (ret.ok()) { + if (!s.ok() && !s.IsNotSupported() && ret.ok()) { ret = s; } } + if (write_buffer_manager_ && wbm_stall_) { + write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get()); + } + if (ret.IsAborted()) { // Reserve IsAborted() error for those where users didn't release // certain resource and they 
can release them and come back and @@ -603,9 +733,11 @@ Status DBImpl::CloseImpl() { return CloseHelper(); } DBImpl::~DBImpl() { + InstrumentedMutexLock closing_lock_guard(&closing_mutex_); if (!closed_) { closed_ = true; - CloseHelper(); + closing_status_ = CloseHelper(); + closing_status_.PermitUncheckedError(); } } @@ -620,44 +752,48 @@ } const Status DBImpl::CreateArchivalDirectory() { - if (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0) { - std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir); + if (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0) { + std::string archivalPath = + ArchivalDirectory(immutable_db_options_.GetWalDir()); return env_->CreateDirIfMissing(archivalPath); } return Status::OK(); } void DBImpl::PrintStatistics() { - auto dbstats = immutable_db_options_.statistics.get(); + auto dbstats = immutable_db_options_.stats; if (dbstats) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s", dbstats->ToString().c_str()); } } -void DBImpl::StartTimedTasks() { - unsigned int stats_dump_period_sec = 0; - unsigned int stats_persist_period_sec = 0; +void DBImpl::StartPeriodicWorkScheduler() { +#ifndef ROCKSDB_LITE + +#ifndef NDEBUG + // It only used by test to disable scheduler + bool disable_scheduler = false; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", + &disable_scheduler); + if (disable_scheduler) { + return; + } +#endif // !NDEBUG + { InstrumentedMutexLock l(&mutex_); - stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec; - if (stats_dump_period_sec > 0) { - if (!thread_dump_stats_) { - thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - static_cast(stats_dump_period_sec) * kMicrosInSecond)); - } - } - stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; - if 
(stats_persist_period_sec > 0) { - if (!thread_persist_stats_) { - thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - static_cast(stats_persist_period_sec) * kMicrosInSecond)); - } - } + periodic_work_scheduler_ = PeriodicWorkScheduler::Default(); + TEST_SYNC_POINT_CALLBACK("DBImpl::StartPeriodicWorkScheduler:Init", + &periodic_work_scheduler_); } + + periodic_work_scheduler_->Register( + this, mutable_db_options_.stats_dump_period_sec, + mutable_db_options_.stats_persist_period_sec); +#endif // !ROCKSDB_LITE } // esitmate the total size of stats_history_ @@ -683,8 +819,11 @@ if (shutdown_initiated_) { return; } - uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; - Statistics* statistics = immutable_db_options_.statistics.get(); + TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning"); + uint64_t now_seconds = + immutable_db_options_.clock->NowMicros() / kMicrosInSecond; + + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return; } @@ -703,29 +842,34 @@ if (immutable_db_options_.persist_stats_to_disk) { WriteBatch batch; + Status s = Status::OK(); if (stats_slice_initialized_) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Reading %" ROCKSDB_PRIszt " stats from statistics\n", stats_slice_.size()); for (const auto& stat : stats_map) { - char key[100]; - int length = - EncodePersistentStatsKey(now_seconds, stat.first, 100, key); - // calculate the delta from last time - if (stats_slice_.find(stat.first) != stats_slice_.end()) { - uint64_t delta = stat.second - stats_slice_[stat.first]; - batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), - ToString(delta)); + if (s.ok()) { + char key[100]; + int length = + EncodePersistentStatsKey(now_seconds, stat.first, 100, key); + // calculate the delta from last time + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + uint64_t delta = stat.second - stats_slice_[stat.first]; + s = 
batch.Put(persist_stats_cf_handle_, + Slice(key, std::min(100, length)), ToString(delta)); + } } } } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); - WriteOptions wo; - wo.low_pri = true; - wo.no_slowdown = true; - wo.sync = false; - Status s = Write(wo, &batch); + if (s.ok()) { + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } if (!s.ok()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Writing to persistent stats CF failed -- %s", @@ -774,6 +918,7 @@ " bytes, slice count: %" ROCKSDB_PRIszt, stats_history_size, stats_history_.size()); } + TEST_SYNC_POINT("DBImpl::PersistStats:End"); #endif // !ROCKSDB_LITE } @@ -817,31 +962,50 @@ void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:1"); #ifndef ROCKSDB_LITE - const DBPropertyInfo* cf_property_info = - GetPropertyInfo(DB::Properties::kCFStats); - assert(cf_property_info != nullptr); - const DBPropertyInfo* db_property_info = - GetPropertyInfo(DB::Properties::kDBStats); - assert(db_property_info != nullptr); - std::string stats; if (shutdown_initiated_) { return; } + + TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); { InstrumentedMutexLock l(&mutex_); - default_cf_internal_stats_->GetStringProperty( - *db_property_info, DB::Properties::kDBStats, &stats); + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (cfd->initialized()) { + // Release DB mutex for gathering cache entry stats. Pass over all + // column families for this first so that other stats are dumped + // near-atomically. 
+ InstrumentedMutexUnlock u(&mutex_); + cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false); + } + } + + const std::string* property = &DB::Properties::kDBStats; + const DBPropertyInfo* property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + assert(!property_info->need_out_of_mutex); + default_cf_internal_stats_->GetStringProperty(*property_info, *property, + &stats); + + property = &DB::Properties::kCFStatsNoFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + assert(!property_info->need_out_of_mutex); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } + + property = &DB::Properties::kCFFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + assert(!property_info->need_out_of_mutex); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } } @@ -863,12 +1027,18 @@ PrintStatistics(); } +void DBImpl::FlushInfoLog() { + if (shutdown_initiated_) { + return; + } + TEST_SYNC_POINT("DBImpl::FlushInfoLog:StartRunning"); + LogFlush(immutable_db_options_.info_log); +} + Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, int max_entries_to_print, std::string* out_str) { - auto* cfh = - static_cast_with_check( - column_family); + auto* cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); @@ -890,9 +1060,9 @@ } } -Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { 
+FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { assert(cfd); - Directory* ret_dir = cfd->GetDataDir(path_id); + FSDirectory* ret_dir = cfd->GetDataDir(path_id); if (ret_dir == nullptr) { return directories_.GetDataDir(path_id); } @@ -907,7 +1077,8 @@ (void)options_map; return Status::NotSupported("Not supported in ROCKSDB LITE"); #else - auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* cfd = + static_cast_with_check(column_family)->cfd(); if (options_map.empty()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetOptions() on column family [%s], empty input", @@ -918,6 +1089,7 @@ MutableCFOptions new_options; Status s; Status persist_options_status; + persist_options_status.PermitUncheckedError(); // Allow uninitialized access SuperVersionContext sv_context(/* create_superversion */ true); { auto db_options = GetDBOptions(); @@ -927,8 +1099,8 @@ new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. VersionEdit dummy_edit; - versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. 
@@ -978,16 +1150,26 @@ MutableDBOptions new_options; Status s; - Status persist_options_status; + Status persist_options_status = Status::OK(); bool wal_changed = false; WriteContext write_context; { InstrumentedMutexLock l(&mutex_); s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, &new_options); + if (new_options.bytes_per_sync == 0) { new_options.bytes_per_sync = 1024 * 1024; } + + if (MutableDBOptionsAreEqual(mutable_db_options_, new_options)) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "SetDBOptions(), input option value is not changed, " + "skipping updating."); + persist_options_status.PermitUncheckedError(); + return s; + } + DBOptions new_db_options = BuildDBOptions(immutable_db_options_, new_options); if (s.ok()) { @@ -1006,12 +1188,12 @@ } if (s.ok()) { const BGJobLimits current_bg_job_limits = - GetBGJobLimits(immutable_db_options_.max_background_flushes, + GetBGJobLimits(mutable_db_options_.max_background_flushes, mutable_db_options_.max_background_compactions, mutable_db_options_.max_background_jobs, /* parallelize_compactions */ true); const BGJobLimits new_bg_job_limits = GetBGJobLimits( - immutable_db_options_.max_background_flushes, + new_options.max_background_flushes, new_options.max_background_compactions, new_options.max_background_jobs, /* parallelize_compactions */ true); @@ -1036,36 +1218,15 @@ } if (new_options.stats_dump_period_sec != - mutable_db_options_.stats_dump_period_sec) { - if (thread_dump_stats_) { - mutex_.Unlock(); - thread_dump_stats_->cancel(); - mutex_.Lock(); - } - if (new_options.stats_dump_period_sec > 0) { - thread_dump_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - static_cast(new_options.stats_dump_period_sec) * - kMicrosInSecond)); - } else { - thread_dump_stats_.reset(); - } - } - if (new_options.stats_persist_period_sec != - mutable_db_options_.stats_persist_period_sec) { - if (thread_persist_stats_) { - mutex_.Unlock(); - 
thread_persist_stats_->cancel(); - mutex_.Lock(); - } - if (new_options.stats_persist_period_sec > 0) { - thread_persist_stats_.reset(new ROCKSDB_NAMESPACE::RepeatableThread( - [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - static_cast(new_options.stats_persist_period_sec) * - kMicrosInSecond)); - } else { - thread_persist_stats_.reset(); - } + mutable_db_options_.stats_dump_period_sec || + new_options.stats_persist_period_sec != + mutable_db_options_.stats_persist_period_sec) { + mutex_.Unlock(); + periodic_work_scheduler_->Unregister(this); + periodic_work_scheduler_->Register( + this, new_options.stats_dump_period_sec, + new_options.stats_persist_period_sec); + mutex_.Lock(); } write_controller_.set_max_delayed_write_rate( new_options.delayed_write_rate); @@ -1097,6 +1258,10 @@ persist_options_status = WriteOptionsFile( false /*need_mutex_lock*/, false /*need_enter_write_thread*/); write_thread_.ExitUnbatched(&w); + } else { + // To get here, we must have had invalid options and will not attempt to + // persist the options, which means the status is "OK/Uninitialized. 
+ persist_options_status.PermitUncheckedError(); } } ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:"); @@ -1147,25 +1312,25 @@ Status DBImpl::FlushWAL(bool sync) { if (manual_wal_flush_) { - Status s; + IOStatus io_s; { // We need to lock log_write_mutex_ since logs_ might change concurrently InstrumentedMutexLock wl(&log_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; - s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(); } - if (!s.ok()) { + if (!io_s.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", - s.ToString().c_str()); + io_s.ToString().c_str()); // In case there is a fs error we should set it globally to prevent the // future writes - WriteStatusCheck(s); + IOStatusCheck(io_s); // whether sync or not, we should abort the rest of function upon error - return s; + return std::move(io_s); } if (!sync) { ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false"); - return s; + return std::move(io_s); } } if (!sync) { @@ -1217,21 +1382,36 @@ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1"); RecordTick(stats_, WAL_FILE_SYNCED); Status status; + IOStatus io_s; for (log::Writer* log : logs_to_sync) { - status = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); - if (!status.ok()) { + io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync); + if (!io_s.ok()) { + status = io_s; break; } } + if (!io_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL Sync error %s", + io_s.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + IOStatusCheck(io_s); + } if (status.ok() && need_log_dir_sync) { - status = directories_.GetWalDir()->Fsync(); + status = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2"); 
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); { InstrumentedMutexLock l(&mutex_); - MarkLogsSynced(current_log_number, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(current_log_number, need_log_dir_sync); + } else { + MarkLogsNotSynced(current_log_number); + } } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -1249,7 +1429,7 @@ // future writes WriteStatusCheck(status); } - return status; + return std::move(status); } Status DBImpl::UnlockWAL() { @@ -1257,27 +1437,54 @@ return Status::OK(); } -void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, - const Status& status) { +Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { mutex_.AssertHeld(); - if (synced_dir && logfile_number_ == up_to && status.ok()) { + if (synced_dir && logfile_number_ == up_to) { log_dir_synced_ = true; } + VersionEdit synced_wals; for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { - auto& log = *it; - assert(log.getting_synced); - if (status.ok() && logs_.size() > 1) { - logs_to_free_.push_back(log.ReleaseWriter()); + auto& wal = *it; + assert(wal.getting_synced); + if (logs_.size() > 1) { + if (immutable_db_options_.track_and_verify_wals_in_manifest && + wal.writer->file()->GetFileSize() > 0) { + synced_wals.AddWal(wal.number, + WalMetadata(wal.writer->file()->GetFileSize())); + } + logs_to_free_.push_back(wal.ReleaseWriter()); // To modify logs_ both mutex_ and log_write_mutex_ must be held InstrumentedMutexLock l(&log_write_mutex_); it = logs_.erase(it); } else { - log.getting_synced = false; + wal.getting_synced = false; ++it; } } - assert(!status.ok() || logs_.empty() || logs_[0].number > up_to || + assert(logs_.empty() || logs_[0].number > up_to || (logs_.size() == 1 && !logs_[0].getting_synced)); + + Status s; + if (synced_wals.IsWalAddition()) { + // not empty, write to MANIFEST. 
+ s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + } + log_sync_cv_.SignalAll(); + return s; +} + +void DBImpl::MarkLogsNotSynced(uint64_t up_to) { + mutex_.AssertHeld(); + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to; + ++it) { + auto& wal = *it; + assert(wal.getting_synced); + wal.getting_synced = false; + } log_sync_cv_.SignalAll(); } @@ -1298,23 +1505,49 @@ } } -InternalIterator* DBImpl::NewInternalIterator( - Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, - ColumnFamilyHandle* column_family) { +Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) { + if (ts_low == nullptr) { + return Status::InvalidArgument("ts_low is nullptr"); + } + ColumnFamilyData* cfd = nullptr; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); + cfd = cfh->cfd(); + } + assert(cfd != nullptr && cfd->user_comparator() != nullptr); + if (cfd->user_comparator()->timestamp_size() == 0) { + return Status::InvalidArgument( + "Timestamp is not enabled in this column family"); + } + InstrumentedMutexLock l(&mutex_); + *ts_low = cfd->GetFullHistoryTsLow(); + assert(cfd->user_comparator()->timestamp_size() == ts_low->size()); + return Status::OK(); +} + +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence, + ColumnFamilyHandle* column_family, + bool allow_unprepared_value) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } mutex_.Lock(); SuperVersion* 
super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); - ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg, - sequence); + return NewInternalIterator(read_options, cfd, super_version, arena, + range_del_agg, sequence, allow_unprepared_value); } void DBImpl::SchedulePurge() { @@ -1346,6 +1579,8 @@ mutex_.Lock(); } + assert(bg_purge_scheduled_ > 0); + // Can't use iterator to go over purge_files_ because inside the loop we're // unlocking the mutex that protects purge_files_. while (!purge_files_.empty()) { @@ -1413,17 +1648,7 @@ delete state->super_version; } if (job_context.HaveSomethingToDelete()) { - if (state->background_purge) { - // PurgeObsoleteFiles here does not delete files. Instead, it adds the - // files to be deleted to a job queue, and deletes it in a separate - // background thread. - state->db->PurgeObsoleteFiles(job_context, true /* schedule only */); - state->mu->Lock(); - state->db->SchedulePurge(); - state->mu->Unlock(); - } else { - state->db->PurgeObsoleteFiles(job_context); - } + state->db->PurgeObsoleteFiles(job_context, state->background_purge); } job_context.Clean(); } @@ -1437,7 +1662,8 @@ SuperVersion* super_version, Arena* arena, RangeDelAggregator* range_del_agg, - SequenceNumber sequence) { + SequenceNumber sequence, + bool allow_unprepared_value) { InternalIterator* internal_iter; assert(arena != nullptr); assert(range_del_agg != nullptr); @@ -1469,7 +1695,8 @@ // Collect iterators for files in L0 - Ln if (read_options.read_tier != kMemtableTier) { super_version->current->AddIterators(read_options, file_options_, - &merge_iter_builder, range_del_agg); + &merge_iter_builder, range_del_agg, + allow_unprepared_value); } internal_iter = merge_iter_builder.Finish(); IterState* cleanup = @@ -1496,22 +1723,57 @@ Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + return Get(read_options, column_family, 
key, value, /*timestamp=*/nullptr); +} + +Status DBImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) { GetImplOptions get_impl_options; get_impl_options.column_family = column_family; get_impl_options.value = value; - return GetImpl(read_options, key, get_impl_options); + get_impl_options.timestamp = timestamp; + Status s = GetImpl(read_options, key, get_impl_options); + return s; } +namespace { +class GetWithTimestampReadCallback : public ReadCallback { + public: + explicit GetWithTimestampReadCallback(SequenceNumber seq) + : ReadCallback(seq) {} + bool IsVisibleFullCheck(SequenceNumber seq) override { + return seq <= max_visible_seq_; + } +}; +} // namespace + Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, - GetImplOptions get_impl_options) { + GetImplOptions& get_impl_options) { assert(get_impl_options.value != nullptr || get_impl_options.merge_operands != nullptr); - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_GET); + + assert(get_impl_options.column_family); + const Comparator* ucmp = get_impl_options.column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + GetWithTimestampReadCallback read_cb(0); // Will call Refresh + +#ifndef NDEBUG + if (ts_sz > 0) { + assert(read_options.timestamp); + assert(read_options.timestamp->size() == ts_sz); + } else { + assert(!read_options.timestamp); + } +#endif // NDEBUG + + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); - auto cfh = - reinterpret_cast(get_impl_options.column_family); + auto cfh = static_cast_with_check( + get_impl_options.column_family); auto cfd = cfh->cfd(); if (tracer_) { @@ -1519,7 +1781,8 @@ // tracing is enabled. 
InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(get_impl_options.column_family, key); + // TODO: maybe handle the tracing status? + tracer_->Get(get_impl_options.column_family, key).PermitUncheckedError(); } } @@ -1544,9 +1807,11 @@ // data for the snapshot, so the reader would see neither data that was be // visible to the snapshot before compaction nor the newer data inserted // afterwards. - snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + snapshot = versions_->LastSequence(); + } else { + snapshot = versions_->LastPublishedSequence(); + } if (get_impl_options.callback) { // The unprep_seqs are not published for write unprepared, so it could be // that max_visible_seq is larger. Seek to the std::max of the two. @@ -1566,6 +1831,16 @@ snapshot = get_impl_options.callback->max_visible_seq(); } } + // If timestamp is used, we use read callback to ensure is returned + // only if t <= read_opts.timestamp and s <= snapshot. + // HACK: temporarily overwrite input struct field but restore + SaveAndRestore restore_callback(&get_impl_options.callback); + if (ts_sz > 0) { + assert(!get_impl_options + .callback); // timestamp with callback is not supported + read_cb.Refresh(snapshot); + get_impl_options.callback = &read_cb; + } TEST_SYNC_POINT("DBImpl::GetImpl:3"); TEST_SYNC_POINT("DBImpl::GetImpl:4"); @@ -1583,10 +1858,11 @@ bool skip_memtable = (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; + std::string* timestamp = ts_sz > 0 ? 
get_impl_options.timestamp : nullptr; if (!skip_memtable) { // Get value associated with key if (get_impl_options.get_value) { - if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), timestamp, &s, &merge_context, &max_covering_tombstone_seq, read_options, get_impl_options.callback, get_impl_options.is_blob_index)) { @@ -1594,9 +1870,10 @@ get_impl_options.value->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, - &merge_context, &max_covering_tombstone_seq, - read_options, get_impl_options.callback, + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), + timestamp, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + get_impl_options.callback, get_impl_options.is_blob_index)) { done = true; get_impl_options.value->PinSelf(); @@ -1605,9 +1882,9 @@ } else { // Get Merge Operands associated with key, Merge Operands should not be // merged and raw values should be returned to the user. - if (sv->mem->Get(lkey, nullptr, &s, &merge_context, - &max_covering_tombstone_seq, read_options, nullptr, - nullptr, false)) { + if (sv->mem->Get(lkey, /*value*/ nullptr, /*timestamp=*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, + read_options, nullptr, nullptr, false)) { done = true; RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && @@ -1623,11 +1900,12 @@ return s; } } + PinnedIteratorsManager pinned_iters_mgr; if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get( - read_options, lkey, get_impl_options.value, &s, &merge_context, - &max_covering_tombstone_seq, + read_options, lkey, get_impl_options.value, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, get_impl_options.get_value ? get_impl_options.value_found : nullptr, nullptr, nullptr, get_impl_options.get_value ? 
get_impl_options.callback : nullptr, @@ -1675,17 +1953,49 @@ const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + return MultiGet(read_options, column_family, keys, values, + /*timestamps=*/nullptr); +} + +std::vector DBImpl::MultiGet( + const ReadOptions& read_options, + const std::vector& column_family, + const std::vector& keys, std::vector* values, + std::vector* timestamps) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); +#ifndef NDEBUG + for (const auto* cfh : column_family) { + assert(cfh); + const Comparator* const ucmp = cfh->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + assert(read_options.timestamp); + assert(ucmp->timestamp_size() == read_options.timestamp->size()); + } else { + assert(!read_options.timestamp); + } + } +#endif // NDEBUG + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(column_family, keys).PermitUncheckedError(); + } + } + SequenceNumber consistent_seqnum; - ; std::unordered_map multiget_cf_data( column_family.size()); for (auto cf : column_family) { - auto cfh = reinterpret_cast(cf); + auto cfh = static_cast_with_check(cf); auto cfd = cfh->cfd(); if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { multiget_cf_data.emplace(cfd->GetID(), @@ -1704,6 +2014,9 @@ read_options, nullptr, iter_deref_lambda, &multiget_cf_data, &consistent_seqnum); + TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1"); + TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2"); + // Contain a list of merge operations if merge occurs. 
MergeContext merge_context; @@ -1711,6 +2024,9 @@ size_t num_keys = keys.size(); std::vector stat_list(num_keys); values->resize(num_keys); + if (timestamps) { + timestamps->resize(num_keys); + } // Keep track of bytes that we read for statistics-recording later uint64_t bytes_read = 0; @@ -1721,13 +2037,25 @@ // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. size_t num_found = 0; - for (size_t i = 0; i < num_keys; ++i) { - merge_context.Clear(); - Status& s = stat_list[i]; - std::string* value = &(*values)[i]; + size_t keys_read; + uint64_t curr_value_size = 0; - LookupKey lkey(keys[i], consistent_seqnum); - auto cfh = reinterpret_cast(column_family[i]); + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = nullptr; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + for (keys_read = 0; keys_read < num_keys; ++keys_read) { + merge_context.Clear(); + Status& s = stat_list[keys_read]; + std::string* value = &(*values)[keys_read]; + std::string* timestamp = timestamps ? 
&(*timestamps)[keys_read] : nullptr; + + LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp); + auto cfh = + static_cast_with_check(column_family[keys_read]); SequenceNumber max_covering_tombstone_seq = 0; auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); assert(mgd_iter != multiget_cf_data.end()); @@ -1738,13 +2066,15 @@ has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; if (!skip_memtable) { - if (super_version->mem->Get(lkey, value, &s, &merge_context, - &max_covering_tombstone_seq, read_options)) { + if (super_version->mem->Get(lkey, value, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + read_callback)) { done = true; RecordTick(stats_, MEMTABLE_HIT); - } else if (super_version->imm->Get(lkey, value, &s, &merge_context, + } else if (super_version->imm->Get(lkey, value, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, - read_options)) { + read_options, read_callback)) { done = true; RecordTick(stats_, MEMTABLE_HIT); } @@ -1752,8 +2082,13 @@ if (!done) { PinnableSlice pinnable_val; PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(read_options, lkey, &pinnable_val, &s, - &merge_context, &max_covering_tombstone_seq); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, &pinnable_val, timestamp, + &s, &merge_context, + &max_covering_tombstone_seq, + &pinned_iters_mgr, /*value_found=*/nullptr, + /*key_exists=*/nullptr, + /*seq=*/nullptr, read_callback); value->assign(pinnable_val.data(), pinnable_val.size()); RecordTick(stats_, MEMTABLE_MISS); } @@ -1761,6 +2096,28 @@ if (s.ok()) { bytes_read += value->size(); num_found++; + curr_value_size += value->size(); + if (curr_value_size > read_options.value_size_soft_limit) { + while (++keys_read < num_keys) { + stat_list[keys_read] = Status::Aborted(); + } + break; + } + } + if (read_options.deadline.count() && + immutable_db_options_.clock->NowMicros() > + 
static_cast(read_options.deadline.count())) { + break; + } + } + + if (keys_read < num_keys) { + // The only reason to break out of the loop is when the deadline is + // exceeded + assert(immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())); + for (++keys_read; keys_read < num_keys; ++keys_read) { + stat_list[keys_read] = Status::TimedOut(); } } @@ -1827,16 +2184,18 @@ // version because a flush happening in between may compact away data for // the snapshot, but the snapshot is earlier than the data overwriting it, // so users may see wrong results. - *snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + *snapshot = versions_->LastSequence(); + } else { + *snapshot = versions_->LastPublishedSequence(); + } } } else { // If we end up with the same issue of memtable geting sealed during 2 // consecutive retries, it means the write rate is very high. In that case // its probably ok to take the mutex on the 3rd try so we can succeed for // sure - static const int num_retries = 3; + constexpr int num_retries = 3; for (int i = 0; i < num_retries; ++i) { last_try = (i == num_retries - 1); bool retry = false; @@ -1860,12 +2219,15 @@ // acquire the lock so we're sure to succeed mutex_.Lock(); } - *snapshot = last_seq_same_as_publish_seq_ - ? 
versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + *snapshot = versions_->LastSequence(); + } else { + *snapshot = versions_->LastPublishedSequence(); + } } else { - *snapshot = reinterpret_cast(read_options.snapshot) - ->number_; + *snapshot = + static_cast_with_check(read_options.snapshot) + ->number_; } for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); ++cf_iter) { @@ -1915,14 +2277,49 @@ ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + return MultiGet(read_options, num_keys, column_families, keys, values, + /*timestamps=*/nullptr, statuses, sorted_input); +} + +void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input) { if (num_keys == 0) { return; } + +#ifndef NDEBUG + for (size_t i = 0; i < num_keys; ++i) { + ColumnFamilyHandle* cfh = column_families[i]; + assert(cfh); + const Comparator* const ucmp = cfh->GetComparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + assert(read_options.timestamp); + assert(read_options.timestamp->size() == ucmp->timestamp_size()); + } else { + assert(!read_options.timestamp); + } + } +#endif // NDEBUG + + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError(); + } + } + autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { key_context.emplace_back(column_families[i], keys[i], &values[i], + timestamps ? 
×tamps[i] : nullptr, &statuses[i]); } for (size_t i = 0; i < num_keys; ++i) { @@ -1934,20 +2331,18 @@ multiget_cf_data; size_t cf_start = 0; ColumnFamilyHandle* cf = sorted_keys[0]->column_family; + for (size_t i = 0; i < num_keys; ++i) { KeyContext* key_ctx = sorted_keys[i]; if (key_ctx->column_family != cf) { - multiget_cf_data.emplace_back( - MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr)); + multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr); cf_start = i; cf = key_ctx->column_family; } } - { - // multiget_cf_data.emplace_back( - // MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr)); - multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); - } + + multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); + std::function::iterator&)> @@ -1963,14 +2358,38 @@ read_options, nullptr, iter_deref_lambda, &multiget_cf_data, &consistent_seqnum); - for (auto cf_iter = multiget_cf_data.begin(); - cf_iter != multiget_cf_data.end(); ++cf_iter) { - MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys, - cf_iter->super_version, consistent_seqnum, nullptr, nullptr); + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = nullptr; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + Status s; + auto cf_iter = multiget_cf_data.begin(); + for (; cf_iter != multiget_cf_data.end(); ++cf_iter) { + s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, + &sorted_keys, cf_iter->super_version, consistent_seqnum, + read_callback); + if (!s.ok()) { + break; + } + } + if (!s.ok()) { + assert(s.IsTimedOut() || s.IsAborted()); + for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) { + for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys; + ++i) { + *sorted_keys[i]->s = s; + } + } + } + + for 
(const auto& iter : multiget_cf_data) { if (!unref_only) { - ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version); + ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version); } else { - cf_iter->cfd->GetSuperVersion()->Unref(); + iter.cfd->GetSuperVersion()->Unref(); } } } @@ -1983,7 +2402,7 @@ static_cast(lhs->column_family); uint32_t cfd_id1 = cfh->cfd()->GetID(); const Comparator* comparator = cfh->cfd()->user_comparator(); - cfh = static_cast(lhs->column_family); + cfh = static_cast(rhs->column_family); uint32_t cfd_id2 = cfh->cfd()->GetID(); if (cfd_id1 < cfd_id2) { @@ -1993,7 +2412,8 @@ } // Both keys are from the same column family - int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); + int cmp = comparator->CompareWithoutTimestamp( + *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false); if (cmp < 0) { return true; } @@ -2006,48 +2426,47 @@ void DBImpl::PrepareMultiGetKeys( size_t num_keys, bool sorted_input, autovector* sorted_keys) { -#ifndef NDEBUG if (sorted_input) { - for (size_t index = 0; index < sorted_keys->size(); ++index) { - if (index > 0) { - KeyContext* lhs = (*sorted_keys)[index - 1]; - KeyContext* rhs = (*sorted_keys)[index]; - ColumnFamilyHandleImpl* cfh = - reinterpret_cast(lhs->column_family); - uint32_t cfd_id1 = cfh->cfd()->GetID(); - const Comparator* comparator = cfh->cfd()->user_comparator(); - cfh = reinterpret_cast(lhs->column_family); - uint32_t cfd_id2 = cfh->cfd()->GetID(); - - assert(cfd_id1 <= cfd_id2); - if (cfd_id1 < cfd_id2) { - continue; - } - - // Both keys are from the same column family - int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); - assert(cmp <= 0); - } - index++; - } - } +#ifndef NDEBUG + assert(std::is_sorted(sorted_keys->begin(), sorted_keys->end(), + CompareKeyContext())); #endif - if (!sorted_input) { - CompareKeyContext sort_comparator; - std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, - sort_comparator); + return; } + + 
std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, + CompareKeyContext()); } void DBImpl::MultiGet(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + return MultiGet(read_options, column_family, num_keys, keys, values, + /*timestamp=*/nullptr, statuses, sorted_input); +} + +void DBImpl::MultiGet(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + std::string* timestamps, Status* statuses, + const bool sorted_input) { + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError(); + } + } autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { - key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]); + key_context.emplace_back(column_family, keys[i], &values[i], + timestamps ? 
×tamps[i] : nullptr, + &statuses[i]); } for (size_t i = 0; i < num_keys; ++i) { sorted_keys[i] = &key_context[i]; @@ -2100,33 +2519,61 @@ consistent_seqnum = callback->max_visible_seq(); } - MultiGetImpl(read_options, 0, num_keys, sorted_keys, - multiget_cf_data[0].super_version, consistent_seqnum, nullptr, - nullptr); + GetWithTimestampReadCallback timestamp_read_callback(0); + ReadCallback* read_callback = callback; + if (read_options.timestamp && read_options.timestamp->size() > 0) { + assert(!read_callback); // timestamp with callback is not supported + timestamp_read_callback.Refresh(consistent_seqnum); + read_callback = ×tamp_read_callback; + } + + Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys, + multiget_cf_data[0].super_version, consistent_seqnum, + read_callback); + assert(s.ok() || s.IsTimedOut() || s.IsAborted()); ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, multiget_cf_data[0].super_version); } -void DBImpl::MultiGetImpl( +// The actual implementation of batched MultiGet. Parameters - +// start_key - Index in the sorted_keys vector to start processing from +// num_keys - Number of keys to lookup, starting with sorted_keys[start_key] +// sorted_keys - The entire batch of sorted keys for this CF +// +// The per key status is returned in the KeyContext structures pointed to by +// sorted_keys. An overall Status is also returned, with the only possible +// values being Status::OK() and Status::TimedOut(). 
The latter indicates +// that the call exceeded read_options.deadline +Status DBImpl::MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, SuperVersion* super_version, SequenceNumber snapshot, - ReadCallback* callback, bool* is_blob_index) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + ReadCallback* callback) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. size_t keys_left = num_keys; + Status s; + uint64_t curr_value_size = 0; while (keys_left) { + if (read_options.deadline.count() && + immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())) { + s = Status::TimedOut(); + break; + } + size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE) ? 
MultiGetContext::MAX_BATCH_SIZE : keys_left; MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left, - batch_size, snapshot); + batch_size, snapshot, read_options); MultiGetRange range = ctx.GetMultiGetRange(); + range.AddValueSize(curr_value_size); bool lookup_current = false; keys_left -= batch_size; @@ -2140,11 +2587,9 @@ (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); if (!skip_memtable) { - super_version->mem->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->mem->MultiGet(read_options, &range, callback); if (!range.empty()) { - super_version->imm->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->imm->MultiGet(read_options, &range, callback); } if (!range.empty()) { lookup_current = true; @@ -2154,8 +2599,12 @@ } if (lookup_current) { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->current->MultiGet(read_options, &range, callback); + } + curr_value_size = range.GetValueSize(); + if (curr_value_size > read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; } } @@ -2163,13 +2612,21 @@ PERF_TIMER_GUARD(get_post_process_time); size_t num_found = 0; uint64_t bytes_read = 0; - for (size_t i = start_key; i < start_key + num_keys; ++i) { + for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) { KeyContext* key = (*sorted_keys)[i]; if (key->s->ok()) { bytes_read += key->value->size(); num_found++; } } + if (keys_left) { + assert(s.IsTimedOut() || s.IsAborted()); + for (size_t i = start_key + num_keys - keys_left; i < start_key + num_keys; + ++i) { + KeyContext* key = (*sorted_keys)[i]; + *key->s = s; + } + } RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); @@ -2178,6 +2635,8 @@ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); PERF_COUNTER_ADD(multiget_read_bytes, 
bytes_read); PERF_TIMER_STOP(get_post_process_time); + + return s; } Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, @@ -2252,7 +2711,6 @@ const std::string& column_family_name, ColumnFamilyHandle** handle) { Status s; - Status persist_options_status; *handle = nullptr; DBOptions db_options = @@ -2301,7 +2759,7 @@ auto* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); - std::map> dummy_created_dirs; + std::map> dummy_created_dirs; s = cfd->AddDirectories(&dummy_created_dirs); } if (s.ok()) { @@ -2333,7 +2791,7 @@ // this is outside the mutex if (s.ok()) { NewThreadStatusCfInfo( - reinterpret_cast(*handle)->cfd()); + static_cast_with_check(*handle)->cfd()); } return s; } @@ -2370,7 +2828,7 @@ } Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (cfd->GetID() == 0) { return Status::InvalidArgument("Can't drop default column family"); @@ -2436,7 +2894,8 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, bool* value_found) { + std::string* value, std::string* timestamp, + bool* value_found) { assert(value != nullptr); if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value @@ -2449,6 +2908,7 @@ get_impl_options.column_family = column_family; get_impl_options.value = &pinnable_val; get_impl_options.value_found = value_found; + get_impl_options.timestamp = timestamp; auto s = GetImpl(roptions, key, get_impl_options); value->assign(pinnable_val.data(), pinnable_val.size()); @@ -2471,6 +2931,13 @@ } // if iterator wants internal keys, we can only proceed if // we can guarantee the deletes haven't been processed yet + if (read_options.iter_start_seqnum > 0 && + !iter_start_seqnum_deprecation_warned_.exchange(true)) { + ROCKS_LOG_WARN( + 
immutable_db_options_.info_log, + "iter_start_seqnum is deprecated, will be removed in a future release. " + "Please try using user-defined timestamp instead."); + } if (immutable_db_options_.preserve_deletes && read_options.iter_start_seqnum > 0 && read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) { @@ -2478,8 +2945,9 @@ "Iterator requested internal keys which are too old and are not" " guaranteed to be preserved, try larger iter_start_seqnum opt.")); } - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); + auto cfh = static_cast_with_check(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + assert(cfd != nullptr); ReadCallback* read_callback = nullptr; // No read callback provided. if (read_options.tailing) { #ifdef ROCKSDB_LITE @@ -2488,10 +2956,11 @@ #else SuperVersion* sv = cfd->GetReferencedSuperVersion(this); - auto iter = new ForwardIterator(this, read_options, cfd, sv); + auto iter = new ForwardIterator(this, read_options, cfd, sv, + /* allow_unprepared_value */ true); result = NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd); #endif @@ -2499,10 +2968,11 @@ // Note: no need to consider the special case of // last_seq_same_as_publish_seq_==false since NewIterator is overridden in // WritePreparedTxnDB - auto snapshot = read_options.snapshot != nullptr - ? read_options.snapshot->GetSequenceNumber() - : versions_->LastSequence(); - result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + result = NewIteratorImpl(read_options, cfd, + (read_options.snapshot != nullptr) + ? 
read_options.snapshot->GetSequenceNumber() + : kMaxSequenceNumber, + read_callback); } return result; } @@ -2511,10 +2981,28 @@ ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob, + bool expose_blob_index, bool allow_refresh) { SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + TEST_SYNC_POINT("DBImpl::NewIterator:1"); + TEST_SYNC_POINT("DBImpl::NewIterator:2"); + + if (snapshot == kMaxSequenceNumber) { + // Note that the snapshot is assigned AFTER referencing the super + // version because otherwise a flush happening in between may compact away + // data for the snapshot, so the reader would see neither data that was be + // visible to the snapshot before compaction nor the newer data inserted + // afterwards. + // Note that the super version might not contain all the data available + // to this snapshot, but in that case it can see all the data in the + // super version, which is a valid consistent state after the user + // calls NewIterator(). + snapshot = versions_->LastSequence(); + TEST_SYNC_POINT("DBImpl::NewIterator:3"); + TEST_SYNC_POINT("DBImpl::NewIterator:4"); + } + // Try to generate a DB iterator tree in continuous memory area to be // cache friendly. Here is an example of result: // +-------------------------------+ @@ -2558,14 +3046,15 @@ // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, this, cfd, allow_blob, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, this, cfd, expose_blob_index, read_options.snapshot != nullptr ? 
false : allow_refresh); - InternalIterator* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), snapshot); + InternalIterator* internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; @@ -2591,12 +3080,13 @@ "Tailing iterator not supported in RocksDB lite"); #else for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); + auto cfd = static_cast_with_check(cfh)->cfd(); SuperVersion* sv = cfd->GetReferencedSuperVersion(this); - auto iter = new ForwardIterator(this, read_options, cfd, sv); + auto iter = new ForwardIterator(this, read_options, cfd, sv, + /* allow_unprepared_value */ true); iterators->push_back(NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd)); } @@ -2610,7 +3100,8 @@ : versions_->LastSequence(); for (size_t i = 0; i < column_families.size(); ++i) { auto* cfd = - reinterpret_cast(column_families[i])->cfd(); + static_cast_with_check(column_families[i]) + ->cfd(); iterators->push_back( NewIteratorImpl(read_options, cfd, snapshot, read_callback)); } @@ -2630,7 +3121,8 @@ SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, bool lock) { int64_t unix_time = 0; - env_->GetCurrentTime(&unix_time); // Ignore error + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error SnapshotImpl* s = new SnapshotImpl; if (lock) { @@ -2656,7 +3148,7 @@ } namespace { -typedef autovector CfdList; +using CfdList = autovector; bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) { for (const ColumnFamilyData* t : 
list) { if (t == cfd) { @@ -2668,15 +3160,23 @@ } // namespace void DBImpl::ReleaseSnapshot(const Snapshot* s) { + if (s == nullptr) { + // DBImpl::GetSnapshot() can return nullptr when snapshot + // not supported by specifying the condition: + // inplace_update_support enabled. + return; + } const SnapshotImpl* casted_s = reinterpret_cast(s); { InstrumentedMutexLock l(&mutex_); snapshots_.Delete(casted_s); uint64_t oldest_snapshot; if (snapshots_.empty()) { - oldest_snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); + if (last_seq_same_as_publish_seq_) { + oldest_snapshot = versions_->LastSequence(); + } else { + oldest_snapshot = versions_->LastPublishedSequence(); + } } else { oldest_snapshot = snapshots_.oldest()->number_; } @@ -2717,7 +3217,7 @@ #ifndef ROCKSDB_LITE Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); // Increment the ref count @@ -2739,7 +3239,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); // Increment the ref count @@ -2765,17 +3265,37 @@ Env* DBImpl::GetEnv() const { return env_; } FileSystem* DB::GetFileSystem() const { - static LegacyFileSystemWrapper fs_wrap(GetEnv()); - return &fs_wrap; + const auto& fs = GetEnv()->GetFileSystem(); + return fs.get(); } FileSystem* DBImpl::GetFileSystem() const { return immutable_db_options_.fs.get(); } +SystemClock* DBImpl::GetSystemClock() const { + return immutable_db_options_.clock; +} + +#ifndef ROCKSDB_LITE + +Status DBImpl::StartIOTrace(const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + assert(trace_writer != 
nullptr); + return io_tracer_->StartIOTrace(GetSystemClock(), trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndIOTrace() { + io_tracer_->EndIOTrace(); + return Status::OK(); +} + +#endif // ROCKSDB_LITE + Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { InstrumentedMutexLock l(&mutex_); - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), cfh->cfd()->GetLatestCFOptions()); } @@ -2789,7 +3309,8 @@ const Slice& property, std::string* value) { const DBPropertyInfo* property_info = GetPropertyInfo(property); value->clear(); - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); if (property_info == nullptr) { return false; } else if (property_info->handle_int) { @@ -2801,16 +3322,21 @@ } return ret_value; } else if (property_info->handle_string) { - InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetStringProperty(*property_info, property, - value); + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetStringProperty(*property_info, property, + value); + } } else if (property_info->handle_string_dbimpl) { - std::string tmp_value; - bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value); - if (ret_value) { - *value = tmp_value; + if (property_info->need_out_of_mutex) { + return (this->*(property_info->handle_string_dbimpl))(value); + } else { + InstrumentedMutexLock l(&mutex_); + return (this->*(property_info->handle_string_dbimpl))(value); } - return ret_value; } // Shouldn't reach here since exactly one of handle_string and handle_int // should be non-nullptr. 
@@ -2823,13 +3349,19 @@ std::map* value) { const DBPropertyInfo* property_info = GetPropertyInfo(property); value->clear(); - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); if (property_info == nullptr) { return false; } else if (property_info->handle_map) { - InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetMapProperty(*property_info, property, - value); + if (property_info->need_out_of_mutex) { + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } else { + InstrumentedMutexLock l(&mutex_); + return cfd->internal_stats()->GetMapProperty(*property_info, property, + value); + } } // If we reach this point it means that handle_map is not provided for the // requested property @@ -2842,7 +3374,8 @@ if (property_info == nullptr || property_info->handle_int == nullptr) { return false; } - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); return GetIntPropertyInternal(cfd, *property_info, false, value); } @@ -2860,17 +3393,17 @@ } } else { SuperVersion* sv = nullptr; - if (!is_locked) { - sv = GetAndRefSuperVersion(cfd); - } else { - sv = cfd->GetSuperVersion(); + if (is_locked) { + mutex_.Unlock(); } + sv = GetAndRefSuperVersion(cfd); bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( property_info, sv->current, value); - if (!is_locked) { - ReturnAndCleanupSuperVersion(cfd, sv); + ReturnAndCleanupSuperVersion(cfd, sv); + if (is_locked) { + mutex_.Lock(); } return ret; @@ -2879,7 +3412,7 @@ bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { assert(value != nullptr); - Statistics* statistics = immutable_db_options_.statistics.get(); + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return false; } @@ -2907,23 +3440,28 @@ } uint64_t sum = 0; + bool ret = true; { // Needs mutex to protect the list of column families. 
InstrumentedMutexLock l(&mutex_); uint64_t value; - for (auto* cfd : *versions_->GetColumnFamilySet()) { + for (auto* cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->initialized()) { continue; } - if (GetIntPropertyInternal(cfd, *property_info, true, &value)) { + ret = GetIntPropertyInternal(cfd, *property_info, true, &value); + // GetIntPropertyInternal may release db mutex and re-acquire it. + mutex_.AssertHeld(); + if (ret) { sum += value; } else { - return false; + ret = false; + break; } } } *aggregated_value = sum; - return true; + return ret; } SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { @@ -3015,7 +3553,7 @@ uint64_t* const count, uint64_t* const size) { ColumnFamilyHandleImpl* cfh = - reinterpret_cast(column_family); + static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -3039,16 +3577,34 @@ return Status::InvalidArgument("Invalid options"); } + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + Version* v; - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); SuperVersion* sv = GetAndRefSuperVersion(cfd); v = sv->current; for (int i = 0; i < n; i++) { + Slice start = range[i].start; + Slice limit = range[i].limit; + + // Add timestamp if needed + std::string start_with_ts, limit_with_ts; + if (ts_sz > 0) { + // Maximum timestamp means including all key with any timestamp + AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz); + // Append a maximum timestamp as the range limit is exclusive: + // [start, limit) + AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz); + start = start_with_ts; + limit = limit_with_ts; + } // Convert user_key into a corresponding internal key. 
- InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( @@ -3100,14 +3656,13 @@ FileType type; WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || - (type != kTableFile && type != kLogFile)) { + (type != kTableFile && type != kWalFile)) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } - Status status; - if (type == kLogFile) { + if (type == kWalFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, @@ -3115,7 +3670,7 @@ name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = wal_manager_.DeleteFile(name, number); + Status status = wal_manager_.DeleteFile(name, number); if (!status.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), @@ -3124,6 +3679,7 @@ return status; } + Status status; int level; FileMetaData* metadata; ColumnFamilyData* cfd; @@ -3197,8 +3753,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - Status status; - auto cfh = reinterpret_cast(column_family); + Status status = Status::OK(); + auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); VersionEdit edit; std::set deleted_files; @@ -3252,11 +3808,13 @@ deleted_files.insert(level_file); level_file->being_compacted = true; } + vstorage->ComputeCompactionScore(*cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); } } if (edit.GetDeletedFiles().empty()) { job_context.Clean(); - return Status::OK(); + 
return status; } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), @@ -3288,10 +3846,16 @@ versions_->GetLiveFilesMetaData(metadata); } +Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { + InstrumentedMutexLock l(&mutex_); + return versions_->GetLiveFilesChecksumInfo(checksum_list); +} + void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* cf_meta) { assert(column_family); - auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* cfd = + static_cast_with_check(column_family)->cfd(); auto* sv = GetAndRefSuperVersion(cfd); { // Without mutex, Version::GetColumnFamilyMetaData will have data race with @@ -3309,6 +3873,17 @@ ReturnAndCleanupSuperVersion(cfd, sv); } +void DBImpl::GetAllColumnFamilyMetaData( + std::vector* metadata) { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *(versions_->GetColumnFamilySet())) { + { + metadata->emplace_back(); + cfd->current()->GetColumnFamilyMetaData(&metadata->back()); + } + } +} + #endif // ROCKSDB_LITE Status DBImpl::CheckConsistency() { @@ -3400,13 +3975,48 @@ return s; } - // If last character is '\n' remove it from identity + // If last character is '\n' remove it from identity. (Old implementations + // of Env::GenerateUniqueId() would include a trailing '\n'.) if (identity->size() > 0 && identity->back() == '\n') { identity->pop_back(); } return s; } +Status DBImpl::GetDbSessionId(std::string& session_id) const { + session_id.assign(db_session_id_); + return Status::OK(); +} + +namespace { +SemiStructuredUniqueIdGen* DbSessionIdGen() { + static SemiStructuredUniqueIdGen gen; + return &gen; +} +} // namespace + +void DBImpl::TEST_ResetDbSessionIdGen() { DbSessionIdGen()->Reset(); } + +std::string DBImpl::GenerateDbSessionId(Env*) { + // See SemiStructuredUniqueIdGen for its desirable properties. 
+ auto gen = DbSessionIdGen(); + + uint64_t lo, hi; + gen->GenerateNext(&hi, &lo); + if (lo == 0) { + // Avoid emitting session ID with lo==0, so that SST unique + // IDs can be more easily ensured non-zero + gen->GenerateNext(&hi, &lo); + assert(lo != 0); + } + return EncodeSessionId(hi, lo); +} + +void DBImpl::SetDbSessionId() { + db_session_id_ = GenerateDbSessionId(env_); + TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_); +} + // Default implementation -- returns not supported status Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, const std::string& /*column_family_name*/, @@ -3437,6 +4047,10 @@ } Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { + if (DefaultColumnFamily() == column_family) { + return Status::InvalidArgument( + "Cannot destroy the handle returned by DefaultColumnFamily()"); + } delete column_family; return Status::OK(); } @@ -3444,30 +4058,27 @@ DB::~DB() {} Status DBImpl::Close() { - if (!closed_) { - { - InstrumentedMutexLock l(&mutex_); - // If there is unreleased snapshot, fail the close call - if (!snapshots_.empty()) { - return Status::Aborted("Cannot close DB with unreleased snapshot."); - } + InstrumentedMutexLock closing_lock_guard(&closing_mutex_); + if (closed_) { + return closing_status_; + } + { + InstrumentedMutexLock l(&mutex_); + // If there is unreleased snapshot, fail the close call + if (!snapshots_.empty()) { + return Status::Aborted("Cannot close DB with unreleased snapshot."); } - - closed_ = true; - return CloseImpl(); } - return Status::OK(); + closing_status_ = CloseImpl(); + closed_ = true; + return closing_status_; } Status DB::ListColumnFamilies(const DBOptions& db_options, const std::string& name, std::vector* column_families) { - FileSystem* fs = db_options.file_system.get(); - LegacyFileSystemWrapper legacy_fs(db_options.env); - if (!fs) { - fs = &legacy_fs; - } - return VersionSet::ListColumnFamilies(column_families, name, fs); + const 
std::shared_ptr& fs = db_options.env->GetFileSystem(); + return VersionSet::ListColumnFamilies(column_families, name, fs.get()); } Snapshot::~Snapshot() {} @@ -3477,13 +4088,13 @@ ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; - bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions); + bool wal_in_db_path = soptions.IsWalDirSameAsDBPath(); // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal soptions.info_log.reset(); // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames); + env->GetChildren(dbname, &filenames).PermitUncheckedError(); FileLock* lock; const std::string lockname = LockFileName(dbname); @@ -3499,57 +4110,53 @@ std::string path_to_delete = dbname + "/" + fname; if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); - } else if (type == kTableFile || type == kLogFile) { - del = DeleteDBFile(&soptions, path_to_delete, dbname, - /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); + } else if (type == kTableFile || type == kWalFile || + type == kBlobFile) { + del = DeleteDBFile( + &soptions, path_to_delete, dbname, + /*force_bg=*/false, + /*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false); } else { del = env->DeleteFile(path_to_delete); } - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - std::vector paths; - - for (const auto& path : options.db_paths) { - paths.emplace_back(path.path); - } - for (const auto& cf : column_families) { - for (const auto& path : cf.options.cf_paths) { - paths.emplace_back(path.path); + std::set paths; + for (const DbPath& db_path : options.db_paths) { + paths.insert(db_path.path); + } + for (const ColumnFamilyDescriptor& cf : column_families) { + for (const DbPath& cf_path : cf.options.cf_paths) { + paths.insert(cf_path.path); } } - - // Remove duplicate paths. 
- // Note that we compare only the actual paths but not path ids. - // This reason is that same path can appear at different path_ids - // for different column families. - std::sort(paths.begin(), paths.end()); - paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); - for (const auto& path : paths) { if (env->GetChildren(path, &filenames).ok()) { for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && - type == kTableFile) { // Lock file will be deleted at end - std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname, + (type == kTableFile || + type == kBlobFile)) { // Lock file will be deleted at end + std::string file_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, file_path, dbname, /*force_bg=*/false, /*force_fg=*/false); - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - env->DeleteDir(path); + // TODO: Should we return an error if we cannot delete the directory? + env->DeleteDir(path).PermitUncheckedError(); } } std::vector walDirFiles; std::string archivedir = ArchivalDirectory(dbname); bool wal_dir_exists = false; - if (dbname != soptions.wal_dir) { + if (!soptions.IsWalDirSameAsDBPath(dbname)) { wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok(); archivedir = ArchivalDirectory(soptions.wal_dir); } @@ -3561,42 +4168,47 @@ if (env->GetChildren(archivedir, &archiveFiles).ok()) { // Delete archival files. 
for (const auto& file : archiveFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - env->DeleteDir(archivedir); + // Ignore error in case dir contains other files + env->DeleteDir(archivedir).PermitUncheckedError(); } // Delete log files in the WAL dir if (wal_dir_exists) { for (const auto& file : walDirFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), soptions.wal_dir, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); - if (result.ok() && !del.ok()) { + if (!del.ok() && result.ok()) { result = del; } } } - env->DeleteDir(soptions.wal_dir); + // Ignore error in case dir contains other files + env->DeleteDir(soptions.wal_dir).PermitUncheckedError(); } - env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(lockname); + // Ignore error since state is already gone + env->UnlockFile(lock).PermitUncheckedError(); + env->DeleteFile(lockname).PermitUncheckedError(); // sst_file_manager holds a ref to the logger. Make sure the logger is // gone before trying to remove the directory. 
soptions.sst_file_manager.reset(); - env->DeleteDir(dbname); // Ignore error in case dir contains other files + // Ignore error in case dir contains other files + env->DeleteDir(dbname).PermitUncheckedError(); + ; } return result; } @@ -3634,11 +4246,13 @@ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1"); TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2"); + TEST_SYNC_POINT_CALLBACK("DBImpl::WriteOptionsFile:PersistOptions", + &db_options); std::string file_name = TempOptionsFileName(GetName(), versions_->NewFileNumber()); Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name, - GetFileSystem()); + fs_.get()); if (s.ok()) { s = RenameTempFileToOptionsFile(file_name); @@ -3723,15 +4337,29 @@ uint64_t options_file_number = versions_->NewFileNumber(); std::string options_file_name = OptionsFileName(GetName(), options_file_number); - // Retry if the file name happen to conflict with an existing one. - s = GetEnv()->RenameFile(file_name, options_file_name); + uint64_t options_file_size = 0; + s = GetEnv()->GetFileSize(file_name, &options_file_size); + if (s.ok()) { + // Retry if the file name happen to conflict with an existing one. + s = GetEnv()->RenameFile(file_name, options_file_name); + std::unique_ptr dir_obj; + if (s.ok()) { + s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr); + } + if (s.ok()) { + s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + DirFsyncOptions(options_file_name)); + } + } if (s.ok()) { InstrumentedMutexLock l(&mutex_); versions_->options_file_number_ = options_file_number; + versions_->options_file_size_ = options_file_size; } if (0 == disable_delete_obsolete_files_) { - DeleteObsoleteOptionsFiles(); + // TODO: Should we check for errors here? 
+ DeleteObsoleteOptionsFiles().PermitUncheckedError(); } return s; #else @@ -3772,16 +4400,17 @@ // // A global method that can dump out the build version void DumpRocksDBBuildVersion(Logger* log) { -#if !defined(IOS_CROSS_COMPILE) - // if we compile with Xcode, we don't run build_detect_version, so we don't - // generate util/build_version.cc - ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, - ROCKSDB_MINOR, ROCKSDB_PATCH); - ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha); - ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date); -#else - (void)log; // ignore "-Wunused-parameter" -#endif + ROCKS_LOG_HEADER(log, "RocksDB version: %s\n", + GetRocksVersionAsString().c_str()); + const auto& props = GetRocksBuildProperties(); + const auto& sha = props.find("rocksdb_build_git_sha"); + if (sha != props.end()) { + ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str()); + } + const auto date = props.find("rocksdb_build_date"); + if (date != props.end()) { + ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str()); + } } #ifndef ROCKSDB_LITE @@ -3798,29 +4427,41 @@ return earliest_seq; } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE -Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, - SequenceNumber lower_bound_seq, - SequenceNumber* seq, - bool* found_record_for_key, - bool* is_blob_index) { +Status DBImpl::GetLatestSequenceForKey( + SuperVersion* sv, const Slice& key, bool cache_only, + SequenceNumber lower_bound_seq, SequenceNumber* seq, std::string* timestamp, + bool* found_record_for_key, bool* is_blob_index) { Status s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); - LookupKey lkey(key, current_seq); + + ColumnFamilyData* cfd = sv->cfd; + assert(cfd); + const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); 
+ std::string ts_buf; + if (ts_sz > 0) { + assert(timestamp); + ts_buf.assign(ts_sz, '\xff'); + } else { + assert(!timestamp); + } + Slice ts(ts_buf); + + LookupKey lkey(key, current_seq, ts_sz == 0 ? nullptr : &ts); *seq = kMaxSequenceNumber; *found_record_for_key = false; // Check if there is a record for this key in the latest memtable - sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, - seq, read_options, nullptr /*read_callback*/, is_blob_index); + sv->mem->Get(lkey, /*value=*/nullptr, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, seq, read_options, + nullptr /*read_callback*/, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -3830,6 +4471,10 @@ return s; } + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check immutable memtables @@ -3845,8 +4490,9 @@ } // Check if there is a record for this key in the immutable memtables - sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, - seq, read_options, nullptr /*read_callback*/, is_blob_index); + sv->imm->Get(lkey, /*value=*/nullptr, timestamp, &s, &merge_context, + &max_covering_tombstone_seq, seq, read_options, + nullptr /*read_callback*/, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. 
@@ -3857,6 +4503,11 @@ return s; } + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); + if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check memtable history *found_record_for_key = true; @@ -3871,9 +4522,9 @@ } // Check if there is a record for this key in the immutable memtables - sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, - &max_covering_tombstone_seq, seq, read_options, - is_blob_index); + sv->imm->GetFromHistory(lkey, /*value=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, seq, + read_options, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -3885,8 +4536,13 @@ return s; } + assert(!ts_sz || + (*seq != kMaxSequenceNumber && + *timestamp != std::string(ts_sz, '\xff')) || + (*seq == kMaxSequenceNumber && timestamp->empty())); if (*seq != kMaxSequenceNumber) { // Found a sequence number, no need to check SST files + assert(0 == ts_sz || *timestamp != std::string(ts_sz, '\xff')); *found_record_for_key = true; return Status::OK(); } @@ -3899,8 +4555,10 @@ // SST files if cache_only=true? if (!cache_only) { // Check tables - sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, - &max_covering_tombstone_seq, nullptr /* value_found */, + PinnedIteratorsManager pinned_iters_mgr; + sv->current->Get(read_options, lkey, /*value=*/nullptr, timestamp, &s, + &merge_context, &max_covering_tombstone_seq, + &pinned_iters_mgr, nullptr /* value_found */, found_record_for_key, seq, nullptr /*read_callback*/, is_blob_index); @@ -3944,7 +4602,7 @@ } } // Ingest multiple external SST files atomically. 
- size_t num_cfs = args.size(); + const size_t num_cfs = args.size(); for (size_t i = 0; i != num_cfs; ++i) { if (args[i].external_files.empty()) { char err_msg[128] = {0}; @@ -3981,14 +4639,11 @@ std::vector ingestion_jobs; for (const auto& arg : args) { auto* cfd = static_cast(arg.column_family)->cfd(); - ingestion_jobs.emplace_back( - env_, versions_.get(), cfd, immutable_db_options_, file_options_, - &snapshots_, arg.options, &directories_, &event_logger_); - } - std::vector> exec_results; - for (size_t i = 0; i != num_cfs; ++i) { - exec_results.emplace_back(false, Status::OK()); + ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_, + file_options_, &snapshots_, arg.options, + &directories_, &event_logger_, io_tracer_); } + // TODO(yanqin) maybe make jobs run in parallel uint64_t start_file_number = next_file_number; for (size_t i = 1; i != num_cfs; ++i) { @@ -3996,9 +4651,14 @@ auto* cfd = static_cast(args[i].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[i].second = ingestion_jobs[i].Prepare( - args[i].external_files, start_file_number, super_version); - exec_results[i].first = true; + Status es = ingestion_jobs[i].Prepare( + args[i].external_files, args[i].files_checksums, + args[i].files_checksum_func_names, args[i].file_temperature, + start_file_number, super_version); + // capture first error only + if (!es.ok() && status.ok()) { + status = es; + } CleanupSuperVersion(super_version); } TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); @@ -4007,22 +4667,18 @@ auto* cfd = static_cast(args[0].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[0].second = ingestion_jobs[0].Prepare( - args[0].external_files, next_file_number, super_version); - exec_results[0].first = true; - CleanupSuperVersion(super_version); - } - for (const auto& exec_result : exec_results) { - if (!exec_result.second.ok()) { - status = 
exec_result.second; - break; + Status es = ingestion_jobs[0].Prepare( + args[0].external_files, args[0].files_checksums, + args[0].files_checksum_func_names, args[0].file_temperature, + next_file_number, super_version); + if (!es.ok()) { + status = es; } + CleanupSuperVersion(super_version); } if (!status.ok()) { for (size_t i = 0; i != num_cfs; ++i) { - if (exec_results[i].first) { - ingestion_jobs[i].Cleanup(status); - } + ingestion_jobs[i].Cleanup(status); } InstrumentedMutexLock l(&mutex_); ReleaseFileNumberFromPendingOutputs(pending_output_elem); @@ -4122,14 +4778,11 @@ if (status.ok()) { int consumed_seqno_count = ingestion_jobs[0].ConsumedSequenceNumbersCount(); -#ifndef NDEBUG for (size_t i = 1; i != num_cfs; ++i) { - assert(!!consumed_seqno_count == - !!ingestion_jobs[i].ConsumedSequenceNumbersCount()); - consumed_seqno_count += - ingestion_jobs[i].ConsumedSequenceNumbersCount(); + consumed_seqno_count = + std::max(consumed_seqno_count, + ingestion_jobs[i].ConsumedSequenceNumbersCount()); } -#endif if (consumed_seqno_count > 0) { const SequenceNumber last_seqno = versions_->LastSequence(); versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count); @@ -4184,6 +4837,15 @@ #endif // !NDEBUG } } + } else if (versions_->io_status().IsIOError()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + const IOStatus& io_s = versions_->io_status(); + // Should handle return error? + error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); } // Resume writes to the DB @@ -4243,11 +4905,11 @@ } // Import sst files from metadata. 
- auto cfh = reinterpret_cast(*handle); + auto cfh = static_cast_with_check(*handle); auto cfd = cfh->cfd(); - ImportColumnFamilyJob import_job(env_, versions_.get(), cfd, - immutable_db_options_, file_options_, - import_options, metadata.files); + ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_, + file_options_, import_options, + metadata.files, io_tracer_); SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); VersionEdit dummy_edit; @@ -4338,15 +5000,49 @@ import_job.Cleanup(status); if (!status.ok()) { - DropColumnFamily(*handle); - DestroyColumnFamilyHandle(*handle); + Status temp_s = DropColumnFamily(*handle); + if (!temp_s.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "DropColumnFamily failed with error %s", + temp_s.ToString().c_str()); + } + // Always returns Status::OK() + temp_s = DestroyColumnFamilyHandle(*handle); + assert(temp_s.ok()); *handle = nullptr; } return status; } +Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); +} + Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { + return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false); +} + +Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, + bool use_file_checksum) { + // `bytes_read` stat is enabled based on compile-time support and cannot + // be dynamically toggled. So we do not need to worry about `PerfLevel` + // here, unlike many other `IOStatsContext` / `PerfContext` stats. 
+ uint64_t prev_bytes_read = IOSTATS(bytes_read); + Status s; + + if (use_file_checksum) { + FileChecksumGenFactory* const file_checksum_gen_factory = + immutable_db_options_.file_checksum_gen_factory.get(); + if (!file_checksum_gen_factory) { + s = Status::InvalidArgument( + "Cannot verify file checksum if options.file_checksum_gen_factory is " + "null"); + return s; + } + } + + // TODO: simplify using GetRefedColumnFamilySet? std::vector cfd_list; { InstrumentedMutexLock l(&mutex_); @@ -4361,11 +5057,12 @@ for (auto cfd : cfd_list) { sv_list.push_back(cfd->GetReferencedSuperVersion(this)); } + for (auto& sv : sv_list) { VersionStorageInfo* vstorage = sv->current->storage_info(); ColumnFamilyData* cfd = sv->current->cfd(); Options opts; - { + if (!use_file_checksum) { InstrumentedMutexLock l(&mutex_); opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), cfd->GetLatestCFOptions()); @@ -4373,17 +5070,50 @@ for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); j++) { - const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; + const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j]; + const auto& fd = fd_with_krange.fd; + const FileMetaData* fmeta = fd_with_krange.file_metadata; + assert(fmeta); std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); - s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_, - read_options, fname); + if (use_file_checksum) { + s = VerifyFullFileChecksum(fmeta->file_checksum, + fmeta->file_checksum_func_name, fname, + read_options); + } else { + s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_, + read_options, fname); + } + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + prev_bytes_read = IOSTATS(bytes_read); + } + } + + if (s.ok() && use_file_checksum) { + const auto& blob_files = vstorage->GetBlobFiles(); 
+ for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + assert(meta); + const std::string blob_file_name = BlobFileName( + cfd->ioptions()->cf_paths.front().path, blob_file_number); + s = VerifyFullFileChecksum(meta->GetChecksumValue(), + meta->GetChecksumMethod(), blob_file_name, + read_options); + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + prev_bytes_read = IOSTATS(bytes_read); + if (!s.ok()) { + break; + } } } if (!s.ok()) { break; } } + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; { @@ -4405,6 +5135,38 @@ cfd->UnrefAndTryDelete(); } } + RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, + IOSTATS(bytes_read) - prev_bytes_read); + return s; +} + +Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fname, + const ReadOptions& read_options) { + Status s; + if (file_checksum_expected == kUnknownFileChecksum) { + return s; + } + std::string file_checksum; + std::string func_name; + s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum( + fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), + func_name_expected, &file_checksum, &func_name, + read_options.readahead_size, immutable_db_options_.allow_mmap_reads, + io_tracer_, immutable_db_options_.rate_limiter.get()); + if (s.ok()) { + assert(func_name_expected == func_name); + if (file_checksum != file_checksum_expected) { + std::ostringstream oss; + oss << fname << " file checksum mismatch, "; + oss << "expecting " + << Slice(file_checksum_expected).ToString(/*hex=*/true); + oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true); + s = Status::Corruption(oss.str()); + TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s); + } + } return s; } @@ -4437,7 +5199,8 @@ Status DBImpl::StartTrace(const TraceOptions& trace_options, std::unique_ptr&& 
trace_writer) { InstrumentedMutexLock lock(&trace_mutex_); - tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer))); + tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options, + std::move(trace_writer))); return Status::OK(); } @@ -4448,16 +5211,24 @@ s = tracer_->Close(); tracer_.reset(); } else { - return Status::IOError("No trace file to close"); + s = Status::IOError("No trace file to close"); } return s; } +Status DBImpl::NewDefaultReplayer( + const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) { + replayer->reset(new ReplayerImpl(this, handles, std::move(reader))); + return Status::OK(); +} + Status DBImpl::StartBlockCacheTrace( const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { - return block_cache_tracer_.StartTrace(env_, trace_options, - std::move(trace_writer)); + return block_cache_tracer_.StartTrace(immutable_db_options_.clock, + trace_options, std::move(trace_writer)); } Status DBImpl::EndBlockCacheTrace() { @@ -4465,24 +5236,27 @@ return Status::OK(); } -Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeek(cf_id, key); + s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound); } } return s; } -Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, - const Slice& key) { +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeekForPrev(cf_id, key); + s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound); } } return s; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,8 +20,8 @@ #include #include "db/column_family.h" +#include "db/compaction/compaction_iterator.h" #include "db/compaction/compaction_job.h" -#include "db/dbformat.h" #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/external_sst_file_ingestion_job.h" @@ -50,12 +50,16 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/status.h" +#ifndef ROCKSDB_LITE #include "rocksdb/trace_reader_writer.h" +#endif // ROCKSDB_LITE #include "rocksdb/transaction_log.h" +#ifndef ROCKSDB_LITE +#include "rocksdb/utilities/replayer.h" +#endif // ROCKSDB_LITE #include "rocksdb/write_buffer_manager.h" +#include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" -#include "trace_replay/block_cache_tracer.h" -#include "trace_replay/trace_replay.h" #include "util/autovector.h" #include "util/hash.h" #include "util/repeatable_thread.h" @@ -69,6 +73,10 @@ class InMemoryStatsHistoryIterator; class MemTable; class PersistentStatsHistoryIterator; +class PeriodicWorkScheduler; +#ifndef NDEBUG +class PeriodicWorkTestScheduler; +#endif // !NDEBUG class TableCache; class TaskLimiterToken; class Version; @@ -82,13 +90,13 @@ // Class to maintain directories for all database paths other than main one. 
class Directories { public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths); + IOStatus SetDirectories(FileSystem* fs, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); - Directory* GetDataDir(size_t path_id) const { + FSDirectory* GetDataDir(size_t path_id) const { assert(path_id < data_dirs_.size()); - Directory* ret_dir = data_dirs_[path_id].get(); + FSDirectory* ret_dir = data_dirs_[path_id].get(); if (ret_dir == nullptr) { // Should use db_dir_ return db_dir_.get(); @@ -96,19 +104,19 @@ return ret_dir; } - Directory* GetWalDir() { + FSDirectory* GetWalDir() { if (wal_dir_) { return wal_dir_.get(); } return db_dir_.get(); } - Directory* GetDbDir() { return db_dir_.get(); } + FSDirectory* GetDbDir() { return db_dir_.get(); } private: - std::unique_ptr db_dir_; - std::vector> data_dirs_; - std::unique_ptr wal_dir_; + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; }; // While DB is the public interface of RocksDB, and DBImpl is the actual @@ -127,7 +135,8 @@ class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch = false, const bool batch_per_txn = true); + const bool seq_per_batch = false, const bool batch_per_txn = true, + bool read_only = false); // No copying allowed DBImpl(const DBImpl&) = delete; void operator=(const DBImpl&) = delete; @@ -163,6 +172,9 @@ virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) override; using DB::GetMergeOperands; Status GetMergeOperands(const ReadOptions& options, @@ -185,6 +197,11 @@ const std::vector& column_family, const std::vector& keys, std::vector* values) override; + virtual 
std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values, + std::vector* timestamps) override; // This MultiGet is a batched version, which may be faster than calling Get // multiple times, especially if the keys have some spatial locality that @@ -198,11 +215,22 @@ const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input = false) override; + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, + const bool sorted_input = false) override; virtual void MultiGet(const ReadOptions& options, const size_t num_keys, ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input = false) override; + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, + const bool sorted_input = false) override; virtual void MultiGetWithCallback( const ReadOptions& options, ColumnFamilyHandle* column_family, @@ -230,7 +258,7 @@ using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - std::string* value, + std::string* value, std::string* timestamp, bool* value_found = nullptr) override; using DB::NewIterator; @@ -327,16 +355,34 @@ virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; + // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire + // and release db_mutex + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override; + + // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and + // release db_mutex + Status 
GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override; + virtual Status GetDbIdentity(std::string& identity) const override; virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; + virtual Status GetDbSessionId(std::string& session_id) const override; + ColumnFamilyHandle* DefaultColumnFamily() const override; ColumnFamilyHandle* PersistentStatsColumnFamily() const; virtual Status Close() override; + virtual Status DisableFileDeletions() override; + + virtual Status EnableFileDeletions(bool force) override; + + virtual bool IsFileDeletionsEnabled() const; + Status GetStatsHistory( uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; @@ -344,9 +390,6 @@ #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; - virtual Status DisableFileDeletions() override; - virtual Status EnableFileDeletions(bool force) override; - virtual int IsFileDeletionsEnabled() const; // All the returned filenames start with "/" virtual Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, @@ -369,13 +412,21 @@ virtual void GetLiveFilesMetaData( std::vector* metadata) override; + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override; + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override; + // Obtains the meta data of the specified column family of the DB. - // Status::NotFound() will be returned if the current DB does not have - // any column family match the specified name. // TODO(yhchiang): output parameter is placed in the end in this codebase. 
virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) override; + void GetAllColumnFamilyMetaData( + std::vector* metadata) override; + Status SuggestCompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) override; @@ -399,8 +450,29 @@ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_options) override; + using DB::VerifyChecksum; virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; + // Verify the checksums of files in db. Currently only tables are checked. + // + // read_options: controls file I/O behavior, e.g. read ahead size while + // reading all the live table files. + // + // use_file_checksum: if false, verify the block checksums of all live table + // in db. Otherwise, obtain the file checksums and compare + // with the MANIFEST. Currently, file checksums are + // recomputed by reading all table files. + // + // Returns: OK if there is no file whose file or block checksum mismatches. 
+ Status VerifyChecksumInternal(const ReadOptions& read_options, + bool use_file_checksum); + + Status VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fpath, + const ReadOptions& read_options); using DB::StartTrace; virtual Status StartTrace( @@ -410,6 +482,12 @@ using DB::EndTrace; virtual Status EndTrace() override; + using DB::NewDefaultReplayer; + virtual Status NewDefaultReplayer( + const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override; + using DB::StartBlockCacheTrace; Status StartBlockCacheTrace( const TraceOptions& options, @@ -418,6 +496,13 @@ using DB::EndBlockCacheTrace; Status EndBlockCacheTrace() override; + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndIOTrace; + Status EndIOTrace() override; + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( ColumnFamilyHandle* column_family, @@ -429,10 +514,12 @@ #endif // ROCKSDB_LITE // ---- End of implementations of the DB interface ---- + SystemClock* GetSystemClock() const; struct GetImplOptions { ColumnFamilyHandle* column_family = nullptr; PinnableSlice* value = nullptr; + std::string* timestamp = nullptr; bool* value_found = nullptr; ReadCallback* callback = nullptr; bool* is_blob_index = nullptr; @@ -455,13 +542,14 @@ // If get_impl_options.get_value = false get merge operands associated with // get_impl_options.key via get_impl_options.merge_operands Status GetImpl(const ReadOptions& options, const Slice& key, - GetImplOptions get_impl_options); + GetImplOptions& get_impl_options); + // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file. 
ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob = false, + bool expose_blob_index = false, bool allow_refresh = true); virtual SequenceNumber GetLastPublishedSequence() const { @@ -504,9 +592,15 @@ // in the memtables, including memtable history. If cache_only is false, // SST files will also be checked. // + // `key` should NOT have user-defined timestamp appended to user key even if + // timestamp is enabled. + // // If a key is found, *found_record_for_key will be set to true and // *seq will be set to the stored sequence number for the latest - // operation on this key or kMaxSequenceNumber if unknown. + // operation on this key or kMaxSequenceNumber if unknown. If user-defined + // timestamp is enabled for this column family and timestamp is not nullptr, + // then *timestamp will be set to the stored timestamp for the latest + // operation on this key. // If no key is found, *found_record_for_key will be set to false. 
// // Note: If cache_only=false, it is possible for *seq to be set to 0 if @@ -530,12 +624,15 @@ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, bool cache_only, SequenceNumber lower_bound_seq, - SequenceNumber* seq, + SequenceNumber* seq, std::string* timestamp, bool* found_record_for_key, - bool* is_blob_index = nullptr); + bool* is_blob_index); - Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); - Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound); #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -561,9 +658,16 @@ // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. + // If allow_unprepared_value is true, the returned iterator may defer reading + // the value and so will require PrepareValue() to be called before value(); + // allow_unprepared_value = false is convenient when this optimization is not + // useful, e.g. when reading the whole column family. + // @param read_options Must outlive the returned iterator. 
InternalIterator* NewInternalIterator( - Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, - ColumnFamilyHandle* column_family = nullptr); + const ReadOptions& read_options, Arena* arena, + RangeDelAggregator* range_del_agg, SequenceNumber sequence, + ColumnFamilyHandle* column_family = nullptr, + bool allow_unprepared_value = false); LogsWithPrepTracker* logs_with_prep_tracker() { return &logs_with_prep_tracker_; @@ -687,9 +791,14 @@ const WriteController& write_controller() { return write_controller_; } - InternalIterator* NewInternalIterator( - const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version, - Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence); + // @param read_options Must outlive the returned iterator. + InternalIterator* NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence, + bool allow_unprepared_value); // hollow transactions shell used for recovery. // these will then be passed to TransactionDB so that @@ -817,8 +926,8 @@ InstrumentedMutex* mutex() const { return &mutex_; } // Initialize a brand new DB. The DB directory is expected to be empty before - // calling it. - Status NewDB(); + // calling it. Push new manifest file name into `new_filenames`. + Status NewDB(std::vector* new_filenames); // This is to be used only by internal rocksdb classes. 
static Status Open(const DBOptions& db_options, const std::string& name, @@ -826,8 +935,9 @@ std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); - static Status CreateAndNewDirectory(Env* env, const std::string& dirname, - std::unique_ptr* directory); + static IOStatus CreateAndNewDirectory( + FileSystem* fs, const std::string& dirname, + std::unique_ptr* directory); // find stats map from stats_history_ with smallest timestamp in // the range of [start_time, end_time) @@ -842,13 +952,15 @@ int max_entries_to_print, std::string* out_str); + VersionSet* GetVersionSet() const { return versions_.get(); } + #ifndef NDEBUG // Compact any files in the named level that overlap [*begin, *end] Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, ColumnFamilyHandle* column_family = nullptr, bool disallow_trivial_move = false); - void TEST_SwitchWAL(); + Status TEST_SwitchWAL(); bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } @@ -872,6 +984,9 @@ Status TEST_AtomicFlushMemTables(const autovector& cfds, const FlushOptions& flush_opts); + // Wait for background threads to complete scheduled work. + Status TEST_WaitForBackgroundWork(); + // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); @@ -880,9 +995,15 @@ // is only for the special test of CancelledCompactions Status TEST_WaitForCompact(bool waitUnscheduled = false); + // Wait for any background purge + Status TEST_WaitForPurge(); + + // Get the background error status + Status TEST_GetBGError(); + // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes( + uint64_t TEST_MaxNextLevelOverlappingBytes( ColumnFamilyHandle* column_family = nullptr); // Return the current manifest file no. @@ -894,8 +1015,10 @@ // get total level0 file size. Only for testing. 
uint64_t TEST_GetLevel0TotalSize(); - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, - std::vector>* metadata); + void TEST_GetFilesMetaData( + ColumnFamilyHandle* column_family, + std::vector>* metadata, + std::vector>* blob_metadata = nullptr); void TEST_LockMutex(); @@ -938,22 +1061,104 @@ int TEST_BGCompactionsAllowed() const; int TEST_BGFlushesAllowed() const; size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - void TEST_WaitForDumpStatsRun(std::function callback) const; - void TEST_WaitForPersistStatsRun(std::function callback) const; - bool TEST_IsPersistentStatsEnabled() const; + void TEST_WaitForStatsDumpRun(std::function callback) const; size_t TEST_EstimateInMemoryStatsHistorySize() const; + + uint64_t TEST_GetCurrentLogNumber() const { + InstrumentedMutexLock l(mutex()); + assert(!logs_.empty()); + return logs_.back().number; + } + + const std::unordered_set& TEST_GetFilesGrabbedForPurge() const { + return files_grabbed_for_purge_; + } + +#ifndef ROCKSDB_LITE + PeriodicWorkTestScheduler* TEST_GetPeriodicWorkScheduler() const; +#endif // !ROCKSDB_LITE + #endif // NDEBUG + // persist stats to column family "_persistent_stats" + void PersistStats(); + + // dump rocksdb.stats to LOG + void DumpStats(); + + // flush LOG out of application buffer + void FlushInfoLog(); + + // Interface to block and signal the DB in case of stalling writes by + // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. + // When DB needs to be blocked or signalled by WriteBufferManager, + // state_ is changed accordingly. 
+ class WBMStallInterface : public StallInterface { + public: + enum State { + BLOCKED = 0, + RUNNING, + }; + + WBMStallInterface() : state_cv_(&state_mutex_) { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + + void SetState(State state) { + MutexLock lock(&state_mutex_); + state_ = state; + } + + // Change the state_ to State::BLOCKED and wait until its state is + // changed by WriteBufferManager. When stall is cleared, Signal() is + // called to change the state and unblock the DB. + void Block() override { + MutexLock lock(&state_mutex_); + while (state_ == State::BLOCKED) { + TEST_SYNC_POINT("WBMStallInterface::BlockDB"); + state_cv_.Wait(); + } + } + + // Called from WriteBufferManager. This function changes the state_ + // to State::RUNNING indicating the stall is cleared and DB can proceed. + void Signal() override { + { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + state_cv_.Signal(); + } + + private: + // Conditional variable and mutex to block and + // signal the DB during stalling process. + port::Mutex state_mutex_; + port::CondVar state_cv_; + // state represting whether DB is running or blocked because of stall by + // WriteBufferManager. 
+ State state_; + }; + + static void TEST_ResetDbSessionIdGen(); + static std::string GenerateDbSessionId(Env* env); + protected: const std::string dbname_; + // TODO(peterd): unify with VersionSet::db_id_ std::string db_id_; + // db_session_id_ is an identifier that gets reset + // every time the DB is opened + std::string db_session_id_; std::unique_ptr versions_; // Flag to check whether we allocated and own the info log file bool own_info_log_; const DBOptions initial_db_options_; Env* const env_; - std::shared_ptr fs_; + std::shared_ptr io_tracer_; const ImmutableDBOptions immutable_db_options_; + FileSystemPtr fs_; MutableDBOptions mutable_db_options_; Statistics* stats_; std::unordered_map @@ -972,6 +1177,14 @@ ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; + // table_cache_ provides its own synchronization + std::shared_ptr table_cache_; + + ErrorHandler error_handler_; + + // Unified interface for logging events + EventLogger event_logger_; + // only used for dynamically adjusting max_total_wal_size. it is a sum of // [write_buffer_size * max_write_buffer_number] over all column families uint64_t max_total_in_memory_state_; @@ -1002,12 +1215,22 @@ // Default: true const bool batch_per_txn_; + // Each flush or compaction gets its own job id. this counter makes sure + // they're unique + std::atomic next_job_id_; + + std::atomic shutting_down_; + // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. // If need_mutex_lock = false, the method will lock DB mutex. // If need_enter_write_thread = false, the method will enter write thread. Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread); + Status CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); + // The following two functions can only be called when: // 1. WriteThread::Writer::EnterUnbatched() is used. // 2. 
db_mutex is NOT held @@ -1036,6 +1259,8 @@ #ifndef ROCKSDB_LITE void NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job); + + Status FlushForGetLiveFiles(); #endif // !ROCKSDB_LITE void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; @@ -1113,12 +1338,33 @@ // skipped. virtual Status Recover( const std::vector& column_families, - bool read_only = false, bool error_if_log_file_exist = false, - bool error_if_data_exists_in_logs = false, + bool read_only = false, bool error_if_wal_file_exists = false, + bool error_if_data_exists_in_wals = false, uint64_t* recovered_seq = nullptr); virtual bool OwnTablesAndLogs() const { return true; } + // Set DB identity file, and write DB ID to manifest if necessary. + Status SetDBId(bool read_only); + + // REQUIRES: db mutex held when calling this function, but the db mutex can + // be released and re-acquired. Db mutex will be held when the function + // returns. + // After recovery, there may be SST files in db/cf paths that are + // not referenced in the MANIFEST (e.g. + // 1. It's best effort recovery; + // 2. The VersionEdits referencing the SST files are appended to + // MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are + // still not synced to MANIFEST during recovery.) + // We delete these SST files. In the + // meantime, we find out the largest file number present in the paths, and + // bump up the version set's next_file_number_ to be 1 + largest_file_number. 
+ Status DeleteUnreferencedSstFiles(); + + // SetDbSessionId() should be called in the constuctor DBImpl() + // to ensure that db_session_id_ gets updated every time the DB is opened + void SetDbSessionId(); + private: friend class DB; friend class ErrorHandler; @@ -1144,7 +1390,7 @@ friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; - friend class WriteCallbackTest_WriteWithCallbackTest_Test; + friend class WriteCallbackPTest_WriteWithCallbackTest_Test; friend class XFTransactionWriteHandler; friend class DBBlobIndexTest; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; @@ -1171,6 +1417,7 @@ struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + LogFileNumberSize() {} void AddSize(uint64_t new_size) { size += new_size; } uint64_t number; uint64_t size = 0; @@ -1245,21 +1492,34 @@ // Information for a manual compaction struct ManualCompactionState { + ManualCompactionState(ColumnFamilyData* _cfd, int _input_level, + int _output_level, uint32_t _output_path_id, + bool _exclusive, bool _disallow_trivial_move, + std::atomic* _canceled) + : cfd(_cfd), + input_level(_input_level), + output_level(_output_level), + output_path_id(_output_path_id), + exclusive(_exclusive), + disallow_trivial_move(_disallow_trivial_move), + canceled(_canceled) {} + ColumnFamilyData* cfd; int input_level; int output_level; uint32_t output_path_id; Status status; - bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted + bool done = false; + bool in_progress = false; // compaction request being processed? 
+ bool incomplete = false; // only part of requested range compacted bool exclusive; // current behavior of only one manual bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress + const InternalKey* begin = nullptr; // nullptr means beginning of key range + const InternalKey* end = nullptr; // nullptr means end of key range + InternalKey* manual_end = nullptr; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + std::atomic* canceled; // Compaction canceled by the user? }; struct PrepickedCompaction { // background compaction takes ownership of `compaction`. @@ -1276,6 +1536,7 @@ DBImpl* db; // background compaction takes ownership of `prepicked_compaction`. PrepickedCompaction* prepicked_compaction; + Env::Priority compaction_pri_; }; // Initialize the built-in column family for persistent stats. Depending on @@ -1293,7 +1554,7 @@ // Required: DB mutex held Status PersistentStatsProcessFormatVersion(); - Status ResumeImpl(); + Status ResumeImpl(DBRecoverContext context); void MaybeIgnoreError(Status* s) const; @@ -1332,7 +1593,7 @@ void ReleaseFileNumberFromPendingOutputs( std::unique_ptr::iterator>& v); - Status SyncClosedLogs(JobContext* job_context); + IOStatus SyncClosedLogs(JobContext* job_context); // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. 
Then @@ -1370,6 +1631,12 @@ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Get the size of a log file and, if truncate is true, truncate the + // log file to its actual size, thereby freeing preallocated space. + // Return success even if truncate fails + Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log); + // Restore alive_log_files_ and total_log_size_ after recovery. // It needs to run only when there's no flush during recovery // (e.g. avoid_flush_during_recovery=true). May also trigger flush @@ -1380,6 +1647,10 @@ // `num_bytes` going through. Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); + // Begin stalling of writes when memory usage increases beyond a certain + // threshold. + void WriteBufferManagerStallWrites(); + Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch); @@ -1452,6 +1723,25 @@ } } + // TaskType is used to identify tasks in thread-pool, currently only + // differentiate manual compaction, which could be unscheduled from the + // thread-pool. + enum class TaskType : uint8_t { + kDefault = 0, + kManualCompaction = 1, + kCount = 2, + }; + + // Task tag is used to identity tasks in thread-pool, which is + // dbImpl obj address + type + inline void* GetTaskTag(TaskType type) { + return GetTaskTag(static_cast(type)); + } + + inline void* GetTaskTag(uint8_t type) { + return static_cast(static_cast(this)) + type; + } + // REQUIRES: mutex locked and in write thread. void AssignAtomicFlushSeq(const autovector& cfds); @@ -1459,7 +1749,7 @@ Status SwitchWAL(WriteContext* write_context); // REQUIRES: mutex locked and in write thread. 
- Status HandleWriteBufferFull(WriteContext* write_context); + Status HandleWriteBufferManagerFlush(WriteContext* write_context); // REQUIRES: mutex locked Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync, @@ -1469,21 +1759,30 @@ WriteBatch* tmp_batch, size_t* write_with_wal, WriteBatch** to_be_cached_state); - Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, - uint64_t* log_used, uint64_t* log_size); + IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, + uint64_t* log_used, uint64_t* log_size, + bool with_db_mutex = false, bool with_log_mutex = false); + + IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* log_used, + bool need_log_sync, bool need_log_dir_sync, + SequenceNumber sequence); + + IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, + uint64_t* log_used, + SequenceNumber* last_sequence, size_t seq_inc); - Status WriteToWAL(const WriteThread::WriteGroup& write_group, - log::Writer* log_writer, uint64_t* log_used, - bool need_log_sync, bool need_log_dir_sync, - SequenceNumber sequence); - - Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, - uint64_t* log_used, SequenceNumber* last_sequence, - size_t seq_inc); + // Used by WriteImpl to update bg_error_ if paranoid check is enabled. + // Caller must hold mutex_. + void WriteStatusCheckOnLocked(const Status& status); // Used by WriteImpl to update bg_error_ if paranoid check is enabled. void WriteStatusCheck(const Status& status); + // Used by WriteImpl to update bg_error_ when IO error happens, e.g., write + // WAL, sync WAL fails, if paranoid check is enabled. + void IOStatusCheck(const IOStatus& status); + // Used by WriteImpl to update bg_error_ in case of memtable insert error. 
void MemTableInsertStatusCheck(const Status& memtable_insert_status); @@ -1517,7 +1816,7 @@ // specified value, this flush request is considered to have completed its // work of flushing this column family. After completing the work for all // column families in this request, this flush is considered complete. - typedef std::vector> FlushRequest; + using FlushRequest = std::vector>; void GenerateFlushRequest(const autovector& cfds, FlushRequest* req); @@ -1558,18 +1857,12 @@ LogBuffer* log_buffer); // Schedule background tasks - void StartTimedTasks(); + void StartPeriodicWorkScheduler(); void PrintStatistics(); size_t EstimateInMemoryStatsHistorySize() const; - // persist stats to column family "_persistent_stats" - void PersistStats(); - - // dump rocksdb.stats to LOG - void DumpStats(); - // Return the minimum empty level that could hold the total data in the // input level. Return the input level, if such level could not be found. int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, @@ -1591,14 +1884,16 @@ std::unique_ptr* token, LogBuffer* log_buffer); // helper function to call after some of the logs_ were synced - void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); + Status MarkLogsSynced(uint64_t up_to, bool synced_dir); + // WALs with log number up to up_to are not synced successfully. 
+ void MarkLogsNotSynced(uint64_t up_to); SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, bool lock = true); uint64_t GetMaxTotalWalSize() const; - Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; + FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; Status CloseHelper(); @@ -1648,8 +1943,8 @@ size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); + IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); // Validate self-consistency of DB options static Status ValidateOptions(const DBOptions& db_options); @@ -1727,14 +2022,15 @@ // to have acquired the SuperVersion and pass in a snapshot sequence number // in order to construct the LookupKeys. The start_key and num_keys specify // the range of keys in the sorted_keys vector for a single column family. - void MultiGetImpl( + Status MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, - SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, - bool* is_blob_index); + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback); - // table_cache_ provides its own synchronization - std::shared_ptr table_cache_; + Status DisableFileDeletionsWithLock(); + + Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, + std::string ts_low); // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; @@ -1749,8 +2045,13 @@ // mutex_, the order should be first mutex_ and then log_write_mutex_. InstrumentedMutex log_write_mutex_; - std::atomic shutting_down_; - std::atomic manual_compaction_paused_; + // If zero, manual compactions are allowed to proceed. 
If non-zero, manual + // compactions may still be running, but will quickly fail with + // `Status::Incomplete`. The value indicates how many threads have paused + // manual compactions. It is accessed in read mode outside the DB mutex in + // compaction code paths. + std::atomic manual_compaction_paused_; + // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't @@ -1778,7 +2079,7 @@ // accessed from the same write_thread_ without any locks. With // two_write_queues writes, where it can be updated in different threads, // read and writes are protected by log_write_mutex_ instead. This is to avoid - // expesnive mutex_ lock during WAL write, which update log_empty_. + // expensive mutex_ lock during WAL write, which update log_empty_. bool log_empty_; ColumnFamilyHandleImpl* persist_stats_cf_handle_; @@ -1786,12 +2087,15 @@ bool persistent_stats_cfd_exists_ = true; // Without two_write_queues, read and writes to alive_log_files_ are - // protected by mutex_. However since back() is never popped, and push_back() - // is done only from write_thread_, the same thread can access the item - // reffered by back() without mutex_. With two_write_queues_, writes + // protected by mutex_. With two_write_queues_, writes // are protected by locking both mutex_ and log_write_mutex_, and reads must // be under either mutex_ or log_write_mutex_. std::deque alive_log_files_; + // Caching the result of `alive_log_files_.back()` so that we do not have to + // call `alive_log_files_.back()` in the write thread (WriteToWAL()) which + // requires locking db mutex if log_mutex_ is not already held in + // two-write-queues mode. + std::deque::reverse_iterator alive_log_files_tail_; // Log files that aren't fully synced, and the current log file. 
// Synchronization: // - push_back() is done from write_thread_ with locked mutex_ and @@ -1895,7 +2199,7 @@ std::unordered_map purge_files_; // A vector to store the file numbers that have been assigned to certain - // JobContext. Current implementation tracks ssts only. + // JobContext. Current implementation tracks table and blob files only. std::unordered_set files_grabbed_for_purge_; // A queue to store log writers to close @@ -1952,10 +2256,6 @@ // Number of threads intending to write to memtable std::atomic pending_memtable_writes_ = {}; - // Each flush or compaction gets its own job id. this counter makes sure - // they're unique - std::atomic next_job_id_; - // A flag indicating whether the current rocksdb database has any // data that is not yet persisted into either WAL or SST file. // Used when disableWAL is true. @@ -1984,9 +2284,6 @@ WalManager wal_manager_; #endif // ROCKSDB_LITE - // Unified interface for logging events - EventLogger event_logger_; - // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; @@ -2013,15 +2310,15 @@ // Only to be set during initialization std::unique_ptr recoverable_state_pre_release_callback_; - // handle for scheduling stats dumping at fixed intervals - // REQUIRES: mutex locked - std::unique_ptr thread_dump_stats_; - - // handle for scheduling stats snapshoting at fixed intervals - // REQUIRES: mutex locked - std::unique_ptr thread_persist_stats_; +#ifndef ROCKSDB_LITE + // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog(). + // Currently, it always use a global instance from + // PeriodicWorkScheduler::Default(). Only in unittest, it can be overrided by + // PeriodicWorkTestScheduler. + PeriodicWorkScheduler* periodic_work_scheduler_; +#endif - // When set, we use a separate queue for writes that dont write to memtable. + // When set, we use a separate queue for writes that don't write to memtable. // In 2PC these are the writes at Prepare phase. 
const bool two_write_queues_; const bool manual_wal_flush_; @@ -2053,8 +2350,10 @@ // Flag to check whether Close() has been called on this DB bool closed_; - - ErrorHandler error_handler_; + // save the closing status, for re-calling the close() + Status closing_status_; + // mutex for DB::Close() + InstrumentedMutex closing_mutex_; // Conditional variable to coordinate installation of atomic flush results. // With atomic flush, each bg thread installs the result of flushing multiple @@ -2068,11 +2367,22 @@ InstrumentedCondVar atomic_flush_install_cv_; bool wal_in_db_path_; + + BlobFileCompletionCallback blob_callback_; + + // Pointer to WriteBufferManager stalling interface. + std::unique_ptr wbm_stall_; + + // Indicate if deprecation warning message is logged before. Will be removed + // soon with the deprecated feature. + std::atomic_bool iter_start_seqnum_deprecation_warned_{false}; }; -extern Options SanitizeOptions(const std::string& db, const Options& src); +extern Options SanitizeOptions(const std::string& db, const Options& src, + bool read_only = false); -extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); +extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src, + bool read_only = false); extern CompressionType GetCompressionFlush( const ImmutableCFOptions& ioptions, @@ -2084,18 +2394,37 @@ // `memtables_to_flush`) will be flushed and thus will not depend on any WAL // file. // The function is only applicable to 2pc mode. -extern uint64_t PrecomputeMinLogNumberToKeep( +extern uint64_t PrecomputeMinLogNumberToKeep2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, + const autovector& edit_list, const autovector& memtables_to_flush, LogsWithPrepTracker* prep_tracker); +// For atomic flush. 
+extern uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker); + +// In non-2PC mode, WALs with log number < the returned number can be +// deleted after the cfd_to_flush column family is flushed successfully. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list); +// For atomic flush. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists); // `cfd_to_flush` is the column family whose memtable will be flushed and thus // will not depend on any WAL file. nullptr means no memtable is being flushed. // The function is only applicable to 2pc mode. extern uint64_t FindMinPrepLogReferencedByMemTable( - VersionSet* vset, const ColumnFamilyData* cfd_to_flush, - const autovector& memtables_to_flush); + VersionSet* vset, const autovector& memtables_to_flush); +// For atomic flush. +extern uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, + const autovector*>& memtables_to_flush); // Fix user-supplied options to be reasonable template diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,14 +6,15 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl/db_impl.h" - #include +#include #include "db/builder.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" @@ -36,8 +37,10 @@ // Pass the current bg_error_ to SFM so it can decide what checks to // perform. If this DB instance hasn't seen any error yet, the SFM can be // optimistic and not do disk space checks - enough_room = - sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); + Status bg_error = error_handler_.GetBGError(); + enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error); + bg_error.PermitUncheckedError(); // bg_error is just a copy of the Status + // from the error_handler_ if (enough_room) { *sfm_reserved_compact_space = true; } @@ -79,7 +82,7 @@ return false; } -Status DBImpl::SyncClosedLogs(JobContext* job_context) { +IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); mutex_.AssertHeld(); autovector logs_to_sync; @@ -96,42 +99,52 @@ logs_to_sync.push_back(log.writer); } - Status s; + IOStatus io_s; if (!logs_to_sync.empty()) { mutex_.Unlock(); + assert(job_context); + for (log::Writer* log : logs_to_sync) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, log->get_log_number()); - s = log->file()->Sync(immutable_db_options_.use_fsync); - if (!s.ok()) { + io_s = log->file()->Sync(immutable_db_options_.use_fsync); + if (!io_s.ok()) { break; } if (immutable_db_options_.recycle_log_file_num > 0) { - s = log->Close(); - if (!s.ok()) { + io_s = log->Close(); + if (!io_s.ok()) { break; } } } - if (s.ok()) { - s = directories_.GetWalDir()->Fsync(); + if (io_s.ok()) { + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + 
DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } + TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock", + /*arg=*/nullptr); mutex_.Lock(); // "number <= current_log_number - 1" is equivalent to // "number < current_log_number". - MarkLogsSynced(current_log_number - 1, true, s); - if (!s.ok()) { - error_handler_.SetBGError(s, BackgroundErrorReason::kFlush); + if (io_s.ok()) { + io_s = status_to_io_status(MarkLogsSynced(current_log_number - 1, true)); + } else { + MarkLogsNotSynced(current_log_number - 1); + } + if (!io_s.ok()) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); - return s; + return io_s; } } - return s; + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end"); + return io_s; } Status DBImpl::FlushMemTableToOutputFile( @@ -143,44 +156,98 @@ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, Env::Priority thread_pri) { mutex_.AssertHeld(); + assert(cfd); + assert(cfd->imm()); assert(cfd->imm()->NumNotFlushed() != 0); assert(cfd->imm()->IsFlushPending()); + assert(versions_); + assert(versions_->GetColumnFamilySet()); + // If there are more than one column families, we need to make sure that + // all the log files except the most recent one are synced. Otherwise if + // the host crashes after flushing and before WAL is persistent, the + // flushed SST may contain data from write batches whose updates to + // other (unflushed) column families are missing. + const bool needs_to_sync_closed_wals = + logfile_number_ > 0 && + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1; + + // If needs_to_sync_closed_wals is true, we need to record the current + // maximum memtable ID of this column family so that a later PickMemtables() + // call will not pick memtables whose IDs are higher. This is due to the fact + // that SyncClosedLogs() may release the db mutex, and memtable switch can + // happen for this column family in the meantime. 
The newly created memtables + // have their data backed by unsynced WALs, thus they cannot be included in + // this flush job. + // Another reason why we must record the current maximum memtable ID of this + // column family: SyncClosedLogs() may release db mutex, thus it's possible + // for application to continue to insert into memtables increasing db's + // sequence number. The application may take a snapshot, but this snapshot is + // not included in `snapshot_seqs` which will be passed to flush job because + // `snapshot_seqs` has already been computed before this function starts. + // Recording the max memtable ID ensures that the flush job does not flush + // a memtable without knowing such snapshot(s). + uint64_t max_memtable_id = needs_to_sync_closed_wals + ? cfd->imm()->GetLatestMemTableID() + : port::kMaxUint64; + + // If needs_to_sync_closed_wals is false, then the flush job will pick ALL + // existing memtables of the column family when PickMemTable() is called + // later. Although we won't call SyncClosedLogs() in this case, we may still + // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also + // releases and re-acquires the db mutex. In the meantime, the application + // can still insert into the memtables and increase the db's sequence number. + // The application can take a snapshot, hoping that the latest visible state + // to this snapshto is preserved. This is hard to guarantee since db mutex + // not held. This newly-created snapshot is not included in `snapshot_seqs` + // and the flush job is unaware of its presence. Consequently, the flush job + // may drop certain keys when generating the L0, causing incorrect data to be + // returned for snapshot read using this snapshot. + // To address this, we make sure NotifyOnFlushBegin() executes after memtable + // picking so that no new snapshot can be taken between the two functions. 
FlushJob flush_job( - dbname_, cfd, immutable_db_options_, mutable_cf_options, - nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(), - &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, job_context, log_buffer, directories_.GetDbDir(), - GetDataDir(cfd, 0U), + dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, + file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, - true /* sync_output_directory */, true /* write_manifest */, thread_pri); - + true /* sync_output_directory */, true /* write_manifest */, thread_pri, + io_tracer_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(), + &blob_callback_); FileMetaData file_meta; + Status s; + bool need_cancel = false; + IOStatus log_io_s = IOStatus::OK(); + if (needs_to_sync_closed_wals) { + // SyncClosedLogs() may unlock and re-lock the db_mutex. + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); + } + } else { + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); + } + s = log_io_s; + + // If the log sync failed, we do not need to pick memtable. Otherwise, + // num_flush_not_started_ needs to be rollback. 
TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); - flush_job.PickMemTable(); - TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); + if (s.ok()) { + flush_job.PickMemTable(); + need_cancel = true; + } + TEST_SYNC_POINT_CALLBACK( + "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job); #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); #endif // ROCKSDB_LITE - Status s; - if (logfile_number_ > 0 && - versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) { - // If there are more than one column families, we need to make sure that - // all the log files except the most recent one are synced. Otherwise if - // the host crashes after flushing and before WAL is persistent, the - // flushed SST may contain data from write batches whose updates to - // other column families are missing. - // SyncClosedLogs() may unlock and re-lock the db_mutex. - s = SyncClosedLogs(job_context); - } else { - TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); - } - + bool switched_to_mempurge = false; // Within flush_job.Run, rocksdb may call event listener to notify // file creation and deletion. // @@ -188,10 +255,19 @@ // and EventListener callback will be called when the db_mutex // is unlocked by the current thread. 
if (s.ok()) { - s = flush_job.Run(&logs_with_prep_tracker_, &file_meta); - } else { + s = flush_job.Run(&logs_with_prep_tracker_, &file_meta, + &switched_to_mempurge); + need_cancel = false; + } + + if (!s.ok() && need_cancel) { flush_job.Cancel(); } + IOStatus io_s = IOStatus::OK(); + io_s = flush_job.io_status(); + if (s.ok()) { + s = io_s; + } if (s.ok()) { InstallSuperVersionAndScheduleWork(cfd, superversion_context, @@ -199,17 +275,66 @@ if (made_progress) { *made_progress = true; } + + const std::string& column_family_name = cfd->GetName(); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", - cfd->GetName().c_str(), - cfd->current()->storage_info()->LevelSummary(&tmp)); + column_family_name.c_str(), + storage_info->LevelSummary(&tmp)); + + const auto& blob_files = storage_info->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 + "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } } if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { - Status new_bg_error = s; - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); - } - if (s.ok()) { + if (!io_s.ok() && !io_s.IsShutdownInProgress() && + !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. 
+ // TODO: distinguish between MANIFEST write and CURRENT renaming + if (!versions_->io_status().ok()) { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. + // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is + // needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); + } else { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); + } + } else { + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + } + } else { + // If we got here, then we decided not to care about the i_os status (either + // from never needing it or ignoring the flush job status + io_s.PermitUncheckedError(); + } + // If flush ran smoothly and no mempurge happened + // install new SST file path. + if (s.ok() && (!switched_to_mempurge)) { #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. NotifyOnFlushCompleted(cfd, mutable_cf_options, @@ -220,7 +345,10 @@ // Notify sst_file_manager that a new file was added std::string file_path = MakeTableFileName( cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. 
+ sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached()) { Status new_bg_error = Status::SpaceLimit("Max allowed space was reached"); @@ -243,30 +371,22 @@ return AtomicFlushMemTablesToOutputFiles( bg_flush_args, made_progress, job_context, log_buffer, thread_pri); } + assert(bg_flush_args.size() == 1); std::vector snapshot_seqs; SequenceNumber earliest_write_conflict_snapshot; SnapshotChecker* snapshot_checker; GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - Status status; - for (auto& arg : bg_flush_args) { - ColumnFamilyData* cfd = arg.cfd_; - MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - SuperVersionContext* superversion_context = arg.superversion_context_; - Status s = FlushMemTableToOutputFile( - cfd, mutable_cf_options, made_progress, job_context, - superversion_context, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, log_buffer, thread_pri); - if (!s.ok()) { - status = s; - if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { - // At this point, DB is not shutting down, nor is cfd dropped. - // Something is wrong, thus we break out of the loop. 
- break; - } - } - } - return status; + const auto& bg_flush_arg = bg_flush_args[0]; + ColumnFamilyData* cfd = bg_flush_arg.cfd_; + MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + SuperVersionContext* superversion_context = + bg_flush_arg.superversion_context_; + Status s = FlushMemTableToOutputFile( + cfd, mutable_cf_options, made_progress, job_context, superversion_context, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + log_buffer, thread_pri); + return s; } /* @@ -301,7 +421,7 @@ GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - autovector distinct_output_dirs; + autovector distinct_output_dirs; autovector distinct_output_dir_paths; std::vector> jobs; std::vector all_mutable_cf_options; @@ -309,7 +429,7 @@ all_mutable_cf_options.reserve(num_cfs); for (int i = 0; i < num_cfs; ++i) { auto cfd = cfds[i]; - Directory* data_dir = GetDataDir(cfd, 0U); + FSDirectory* data_dir = GetDataDir(cfd, 0U); const std::string& curr_path = cfd->ioptions()->cf_paths[0].path; // Add to distinct output directories if eligible. Use linear search. 
Since @@ -329,7 +449,7 @@ all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); - const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); + uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_; jobs.emplace_back(new FlushJob( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_, @@ -338,12 +458,16 @@ data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri)); - jobs.back()->PickMemTable(); + thread_pri, io_tracer_, db_id_, db_session_id_, + cfd->GetFullHistoryTsLow(), &blob_callback_)); } std::vector file_meta(num_cfs); + // Use of deque because vector + // is specific and doesn't allow &v[i]. + std::deque switched_to_mempurge(num_cfs, false); Status s; + IOStatus log_io_s = IOStatus::OK(); assert(num_cfs == static_cast(jobs.size())); #ifndef ROCKSDB_LITE @@ -358,23 +482,48 @@ if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. - s = SyncClosedLogs(job_context); + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + if (total_log_size_ > 0) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); + } else { + // If the WAL is empty, we use different error reason + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL); + } + } } + s = log_io_s; // exec_status stores the execution status of flush_jobs as // autovector> exec_status; + autovector io_status; + std::vector pick_status; for (int i = 0; i != num_cfs; ++i) { // Initially all jobs are not executed, with status OK. 
exec_status.emplace_back(false, Status::OK()); + io_status.emplace_back(IOStatus::OK()); + pick_status.push_back(false); + } + + if (s.ok()) { + for (int i = 0; i != num_cfs; ++i) { + jobs[i]->PickMemTable(); + pick_status[i] = true; + } } if (s.ok()) { + assert(switched_to_mempurge.size() == + static_cast(num_cfs)); // TODO (yanqin): parallelize jobs with threads. for (int i = 1; i != num_cfs; ++i) { exec_status[i].second = - jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]); + jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i], + &(switched_to_mempurge.at(i))); exec_status[i].first = true; + io_status[i] = jobs[i]->io_status(); } if (num_cfs > 1) { TEST_SYNC_POINT( @@ -384,9 +533,11 @@ } assert(exec_status.size() > 0); assert(!file_meta.empty()); - exec_status[0].second = - jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]); + exec_status[0].second = jobs[0]->Run( + &logs_with_prep_tracker_, file_meta.data() /* &file_meta[0] */, + switched_to_mempurge.empty() ? nullptr : &(switched_to_mempurge.at(0))); exec_status[0].first = true; + io_status[0] = jobs[0]->io_status(); Status error_status; for (const auto& e : exec_status) { @@ -405,6 +556,21 @@ s = error_status.ok() ? s : error_status; } + IOStatus io_s = IOStatus::OK(); + if (io_s.ok()) { + IOStatus io_error = IOStatus::OK(); + for (int i = 0; i != static_cast(io_status.size()); i++) { + if (!io_status[i].ok() && !io_status[i].IsShutdownInProgress() && + !io_status[i].IsColumnFamilyDropped()) { + io_error = io_status[i]; + } + } + io_s = io_error; + if (s.ok() && !io_s.ok()) { + s = io_s; + } + } + if (s.IsColumnFamilyDropped()) { s = Status::OK(); } @@ -413,7 +579,9 @@ // Sync on all distinct output directories. 
for (auto dir : distinct_output_dirs) { if (dir != nullptr) { - Status error_status = dir->Fsync(); + Status error_status = dir->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); if (!error_status.ok()) { s = error_status; break; @@ -426,12 +594,12 @@ // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. for (int i = 0; i != num_cfs; ++i) { - if (!exec_status[i].first) { + if (pick_status[i] && !exec_status[i].first) { jobs[i]->Cancel(); } } for (int i = 0; i != num_cfs; ++i) { - if (exec_status[i].first && exec_status[i].second.ok()) { + if (exec_status[i].second.ok() && exec_status[i].first) { auto& mems = jobs[i]->GetMemTables(); cfds[i]->imm()->RollbackMemtableFlush(mems, file_meta[i].fd.GetNumber()); @@ -440,7 +608,15 @@ } if (s.ok()) { - auto wait_to_install_func = [&]() { + const auto wait_to_install_func = + [&]() -> std::pair { + if (!versions_->io_status().ok()) { + // Something went wrong elsewhere, we cannot count on waiting for our + // turn to write/sync to MANIFEST or CURRENT. Just return. 
+ return std::make_pair(versions_->io_status(), false); + } else if (shutting_down_.load(std::memory_order_acquire)) { + return std::make_pair(Status::ShutdownInProgress(), false); + } bool ready = true; for (size_t i = 0; i != cfds.size(); ++i) { const auto& mems = jobs[i]->GetMemTables(); @@ -464,18 +640,40 @@ break; } } - return ready; + return std::make_pair(Status::OK(), !ready); }; bool resuming_from_bg_err = error_handler_.IsDBStopped(); - while ((!error_handler_.IsDBStopped() || - error_handler_.GetRecoveryError().ok()) && - !wait_to_install_func()) { + while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) { + std::pair res = wait_to_install_func(); + + TEST_SYNC_POINT_CALLBACK( + "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", &res); + + if (!res.first.ok()) { + s = res.first; + break; + } else if (!res.second) { + break; + } atomic_flush_install_cv_.Wait(); + + resuming_from_bg_err = error_handler_.IsDBStopped(); } - s = resuming_from_bg_err ? error_handler_.GetRecoveryError() - : error_handler_.GetBGError(); + if (!resuming_from_bg_err) { + // If not resuming from bg err, then we determine future action based on + // whether we hit background error. + if (s.ok()) { + s = error_handler_.GetBGError(); + } + } else if (s.ok()) { + // If resuming from bg err, we still rely on wait_to_install_func()'s + // result to determine future action. If wait_to_install_func() returns + // non-ok already, then we should not proceed to flush result + // installation. 
+ s = error_handler_.GetRecoveryError(); + } } if (s.ok()) { @@ -483,6 +681,8 @@ autovector*> mems_list; autovector mutable_cf_options_list; autovector tmp_file_meta; + autovector>*> + committed_flush_jobs_info; for (int i = 0; i != num_cfs; ++i) { const auto& mems = jobs[i]->GetMemTables(); if (!cfds[i]->IsDropped() && !mems.empty()) { @@ -490,29 +690,54 @@ mems_list.emplace_back(&mems); mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]); tmp_file_meta.emplace_back(&file_meta[i]); +#ifndef ROCKSDB_LITE + committed_flush_jobs_info.emplace_back( + jobs[i]->GetCommittedFlushJobsInfo()); +#endif //! ROCKSDB_LITE } } s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, tmp_file_meta, - &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer); + versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta, + committed_flush_jobs_info, &job_context->memtables_to_free, + directories_.GetDbDir(), log_buffer); } if (s.ok()) { assert(num_cfs == static_cast(job_context->superversion_contexts.size())); for (int i = 0; i != num_cfs; ++i) { + assert(cfds[i]); + if (cfds[i]->IsDropped()) { continue; } InstallSuperVersionAndScheduleWork(cfds[i], &job_context->superversion_contexts[i], all_mutable_cf_options[i]); + + const std::string& column_family_name = cfds[i]->GetName(); + + Version* const current = cfds[i]->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", - cfds[i]->GetName().c_str(), - cfds[i]->current()->storage_info()->LevelSummary(&tmp)); + column_family_name.c_str(), + storage_info->LevelSummary(&tmp)); + + const auto& blob_files = storage_info->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Blob file summary: head=%" PRIu64 + ", tail=%" 
PRIu64 "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } } if (made_progress) { *made_progress = true; @@ -521,7 +746,12 @@ auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); assert(all_mutable_cf_options.size() == static_cast(num_cfs)); - for (int i = 0; i != num_cfs; ++i) { + for (int i = 0; s.ok() && i != num_cfs; ++i) { + // If mempurge happened instead of Flush, + // no NotifyOnFlushCompleted call (no SST file created). + if (switched_to_mempurge[i]) { + continue; + } if (cfds[i]->IsDropped()) { continue; } @@ -530,7 +760,10 @@ if (sfm) { std::string file_path = MakeTableFileName( cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. + sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached() && error_handler_.GetBGError().ok()) { Status new_bg_error = @@ -543,9 +776,35 @@ #endif // ROCKSDB_LITE } - if (!s.ok() && !s.IsShutdownInProgress()) { - Status new_bg_error = s; - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + // Need to undo atomic flush if something went wrong, i.e. s is not OK and + // it is not because of CF drop. + if (!s.ok() && !s.IsColumnFamilyDropped()) { + if (!io_s.ok() && !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + if (!versions_->io_status().ok()) { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. 
+ // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor + // is needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); + } else { + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); + } + } else { + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + } } return s; @@ -644,29 +903,128 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { - auto cfh = reinterpret_cast(column_family); + const Slice* begin_without_ts, + const Slice* end_without_ts) { + if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return CompactRangeInternal(options, column_family, begin_without_ts, + end_without_ts); + } + + std::string begin_str; + std::string end_str; + + // CompactRange compact all keys: [begin, end] inclusively. Add maximum + // timestamp to include all `begin` keys, and add minimal timestamp to include + // all `end` keys. + if (begin_without_ts != nullptr) { + AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz); + } + if (end_without_ts != nullptr) { + AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz); + } + Slice begin(begin_str); + Slice end(end_str); + + Slice* begin_with_ts = begin_without_ts ? &begin : nullptr; + Slice* end_with_ts = end_without_ts ? 
&end : nullptr; + + return CompactRangeInternal(options, column_family, begin_with_ts, + end_with_ts); +} + +Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) { + ColumnFamilyData* cfd = nullptr; + if (column_family == nullptr) { + cfd = default_cf_handle_->cfd(); + } else { + auto cfh = static_cast_with_check(column_family); + assert(cfh != nullptr); + cfd = cfh->cfd(); + } + assert(cfd != nullptr && cfd->user_comparator() != nullptr); + if (cfd->user_comparator()->timestamp_size() == 0) { + return Status::InvalidArgument( + "Timestamp is not enabled in this column family"); + } + if (cfd->user_comparator()->timestamp_size() != ts_low.size()) { + return Status::InvalidArgument("ts_low size mismatch"); + } + return IncreaseFullHistoryTsLowImpl(cfd, ts_low); +} + +Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, + std::string ts_low) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + edit.SetFullHistoryTsLow(ts_low); + + InstrumentedMutexLock l(&mutex_); + std::string current_ts_low = cfd->GetFullHistoryTsLow(); + const Comparator* ucmp = cfd->user_comparator(); + assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty()); + if (!current_ts_low.empty() && + ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) { + return Status::InvalidArgument( + "Cannot decrease full_history_timestamp_low"); + } + + return versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_); +} + +Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } - bool exclusive = options.exclusive_manual_compaction; - bool flush_needed = true; + + // Update full_history_ts_low if it's set + if 
(options.full_history_ts_low != nullptr && + !options.full_history_ts_low->empty()) { + std::string ts_low = options.full_history_ts_low->ToString(); + if (begin != nullptr || end != nullptr) { + return Status::InvalidArgument( + "Cannot specify compaction range with full_history_ts_low"); + } + Status s = IncreaseFullHistoryTsLowImpl(cfd, ts_low); + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + } + + Status s; if (begin != nullptr && end != nullptr) { // TODO(ajkr): We could also optimize away the flush in certain cases where // one/both sides of the interval are unbounded. But it requires more // changes to RangesOverlapWithMemtables. Range range(*begin, *end); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed); + s = cfd->RangesOverlapWithMemtables( + {range}, super_version, immutable_db_options_.allow_data_in_errors, + &flush_needed); CleanupSuperVersion(super_version); } - Status s; - if (flush_needed) { + if (s.ok() && flush_needed) { FlushOptions fo; fo.allow_write_stall = options.allow_write_stall; if (immutable_db_options_.atomic_flush) { @@ -686,25 +1044,9 @@ } } - int max_level_with_files = 0; - // max_file_num_to_ignore can be used to filter out newly created SST files, - // useful for bottom level compaction in a manual compaction - uint64_t max_file_num_to_ignore = port::kMaxUint64; - uint64_t next_file_number = port::kMaxUint64; - { - InstrumentedMutexLock l(&mutex_); - Version* base = cfd->current(); - for (int level = 1; level < base->storage_info()->num_non_empty_levels(); - level++) { - if (base->storage_info()->OverlapInLevel(level, begin, end)) { - max_level_with_files = level; - } - } - next_file_number = versions_->current_next_file_number(); - } - - int final_output_level = 0; - + constexpr int kInvalidLevel = -1; + int final_output_level = kInvalidLevel; + bool exclusive = options.exclusive_manual_compaction; if 
(cfd->ioptions()->compaction_style == kCompactionStyleUniversal && cfd->NumberLevels() > 1) { // Always compact all files together. @@ -715,70 +1057,132 @@ } s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, final_output_level, options, begin, end, exclusive, - false, max_file_num_to_ignore); + false, port::kMaxUint64); } else { - for (int level = 0; level <= max_level_with_files; level++) { - int output_level; - // in case the compaction is universal or if we're compacting the - // bottom-most level, the output level will be the same as input one. - // level 0 can never be the bottommost level (i.e. if all files are in - // level 0, we will compact to level 1) - if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - output_level = level; - } else if (level == max_level_with_files && level > 0) { - if (options.bottommost_level_compaction == - BottommostLevelCompaction::kSkip) { - // Skip bottommost level compaction - continue; - } else if (options.bottommost_level_compaction == - BottommostLevelCompaction::kIfHaveCompactionFilter && - cfd->ioptions()->compaction_filter == nullptr && - cfd->ioptions()->compaction_filter_factory == nullptr) { - // Skip bottommost level compaction since we don't have a compaction - // filter - continue; + int first_overlapped_level = kInvalidLevel; + int max_overlapped_level = kInvalidLevel; + { + SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + Version* current_version = super_version->current; + ReadOptions ro; + ro.total_order_seek = true; + bool overlap; + for (int level = 0; + level < current_version->storage_info()->num_non_empty_levels(); + level++) { + overlap = true; + if (begin != nullptr && end != nullptr) { + Status status = current_version->OverlapWithLevelIterator( + ro, file_options_, *begin, *end, level, &overlap); + if (!status.ok()) { + overlap = current_version->storage_info()->OverlapInLevel( + 
level, begin, end); + } + } else { + overlap = current_version->storage_info()->OverlapInLevel(level, + begin, end); } - output_level = level; - // update max_file_num_to_ignore only for bottom level compaction - // because data in newly compacted files in middle levels may still need - // to be pushed down - max_file_num_to_ignore = next_file_number; - } else { - output_level = level + 1; - if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && - cfd->ioptions()->level_compaction_dynamic_level_bytes && - level == 0) { - output_level = ColumnFamilyData::kCompactToBaseLevel; + if (overlap) { + if (first_overlapped_level == kInvalidLevel) { + first_overlapped_level = level; + } + max_overlapped_level = level; } } - s = RunManualCompaction(cfd, level, output_level, options, begin, end, - exclusive, false, max_file_num_to_ignore); - if (!s.ok()) { - break; - } - if (output_level == ColumnFamilyData::kCompactToBaseLevel) { - final_output_level = cfd->NumberLevels() - 1; - } else if (output_level > final_output_level) { - final_output_level = output_level; + CleanupSuperVersion(super_version); + } + if (s.ok() && first_overlapped_level != kInvalidLevel) { + // max_file_num_to_ignore can be used to filter out newly created SST + // files, useful for bottom level compaction in a manual compaction + uint64_t max_file_num_to_ignore = port::kMaxUint64; + uint64_t next_file_number = versions_->current_next_file_number(); + final_output_level = max_overlapped_level; + int output_level; + for (int level = first_overlapped_level; level <= max_overlapped_level; + level++) { + bool disallow_trivial_move = false; + // in case the compaction is universal or if we're compacting the + // bottom-most level, the output level will be the same as input one. + // level 0 can never be the bottommost level (i.e. 
if all files are in + // level 0, we will compact to level 1) + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + output_level = level; + } else if (level == max_overlapped_level && level > 0) { + if (options.bottommost_level_compaction == + BottommostLevelCompaction::kSkip) { + // Skip bottommost level compaction + continue; + } else if (options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter && + cfd->ioptions()->compaction_filter == nullptr && + cfd->ioptions()->compaction_filter_factory == nullptr) { + // Skip bottommost level compaction since we don't have a compaction + // filter + continue; + } + output_level = level; + // update max_file_num_to_ignore only for bottom level compaction + // because data in newly compacted files in middle levels may still + // need to be pushed down + max_file_num_to_ignore = next_file_number; + } else { + output_level = level + 1; + if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && + cfd->ioptions()->level_compaction_dynamic_level_bytes && + level == 0) { + output_level = ColumnFamilyData::kCompactToBaseLevel; + } + // if it's a BottommostLevel compaction and `kForce*` compaction is + // set, disallow trivial move + if (level == max_overlapped_level && + (options.bottommost_level_compaction == + BottommostLevelCompaction::kForce || + options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized)) { + disallow_trivial_move = true; + } + } + s = RunManualCompaction(cfd, level, output_level, options, begin, end, + exclusive, disallow_trivial_move, + max_file_num_to_ignore); + if (!s.ok()) { + break; + } + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + final_output_level = cfd->NumberLevels() - 1; + } else if (output_level > final_output_level) { + final_output_level = output_level; + } + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); + 
TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); } - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); } } - if (!s.ok()) { + if (!s.ok() || final_output_level == kInvalidLevel) { LogFlush(immutable_db_options_.info_log); return s; } if (options.change_level) { + TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:1"); + TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:2"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "[RefitLevel] waiting for background threads to stop"); + DisableManualCompaction(); s = PauseBackgroundWork(); if (s.ok()) { + TEST_SYNC_POINT("DBImpl::CompactRange:PreRefitLevel"); s = ReFitLevel(cfd, final_output_level, options.target_level); - } - ContinueBackgroundWork(); + TEST_SYNC_POINT("DBImpl::CompactRange:PostRefitLevel"); + // ContinueBackgroundWork always return Status::OK(). + Status temp_s = ContinueBackgroundWork(); + assert(temp_s.ok()); + } + EnableManualCompaction(); + TEST_SYNC_POINT( + "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled"); } LogFlush(immutable_db_options_.info_log); @@ -813,11 +1217,12 @@ return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); } - auto cfd = reinterpret_cast(column_family)->cfd(); + auto cfd = + static_cast_with_check(column_family)->cfd(); assert(cfd); Status s; - JobContext job_context(0, true); + JobContext job_context(next_job_id_.fetch_add(1), true); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); @@ -884,7 +1289,7 @@ if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } - if (manual_compaction_paused_.load(std::memory_order_acquire)) { + if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) { return Status::Incomplete(Status::SubCode::kManualCompactionPaused); } @@ -946,7 +1351,7 @@ assert(cfd->compaction_picker()); c.reset(cfd->compaction_picker()->CompactFiles( compact_options, input_files, output_level, 
version->storage_info(), - *cfd->GetLatestMutableCFOptions(), output_path_id)); + *cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id)); // we already sanitized the set of input files and checked for conflicts // without releasing the lock, so we're guaranteed a compaction can be formed. assert(c != nullptr); @@ -968,15 +1373,18 @@ assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, + job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, - &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, table_cache_, &event_logger_, + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_); + &compaction_job_stats, Env::Priority::USER, io_tracer_, + &manual_compaction_paused_, nullptr, db_id_, db_session_id_, + c->column_family_data()->GetFullHistoryTsLow(), &blob_callback_); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -990,17 +1398,23 @@ mutex_.Unlock(); TEST_SYNC_POINT("CompactFilesImpl:0"); TEST_SYNC_POINT("CompactFilesImpl:1"); - compaction_job.Run(); + // Ignore the status here, as it will be checked in the Install down below... 
+ compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("CompactFilesImpl:2"); TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); Status status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { + assert(compaction_job.io_status().ok()); InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options()); } + // status above captures any error during compaction_job.Install, so its ok + // not check compaction_job.io_status() explicitly if we're not calling + // SetBGError + compaction_job.io_status().PermitUncheckedError(); c->ReleaseCompactionFiles(s); #ifndef ROCKSDB_LITE // Need to make sure SstFileManager does its bookkeeping @@ -1033,15 +1447,25 @@ "[%s] [JOB %d] Compaction error: %s", c->column_family_data()->GetName().c_str(), job_context->job_id, status.ToString().c_str()); - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + IOStatus io_s = compaction_job.io_status(); + if (!io_s.ok()) { + error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction); + } else { + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + } } if (output_file_names != nullptr) { - for (const auto newf : c->edit()->GetNewFiles()) { - (*output_file_names) - .push_back(TableFileName(c->immutable_cf_options()->cf_paths, - newf.second.fd.GetNumber(), - newf.second.fd.GetPathId())); + for (const auto& newf : c->edit()->GetNewFiles()) { + output_file_names->push_back(TableFileName( + c->immutable_options()->cf_paths, newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); + } + + for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) { + output_file_names->push_back( + BlobFileName(c->immutable_options()->cf_paths.front().path, + blob_file.GetBlobFileNumber())); } } @@ -1099,9 +1523,11 @@ return; } if (c->is_manual_compaction() && - manual_compaction_paused_.load(std::memory_order_acquire)) { + 
manual_compaction_paused_.load(std::memory_order_acquire) > 0) { return; } + + c->SetNotifyOnCompactionCompleted(); Version* current = cfd->current(); current->Ref(); // release lock while notifying events @@ -1109,46 +1535,11 @@ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { CompactionJobInfo info{}; - info.cf_name = cfd->GetName(); - info.status = st; - info.thread_id = env_->GetThreadID(); - info.job_id = job_id; - info.base_input_level = c->start_level(); - info.output_level = c->output_level(); - info.stats = job_stats; - info.table_properties = c->GetOutputTableProperties(); - info.compaction_reason = c->compaction_reason(); - info.compression = c->output_compression(); - for (size_t i = 0; i < c->num_input_levels(); ++i) { - for (const auto fmd : *c->inputs(i)) { - const FileDescriptor& desc = fmd->fd; - const uint64_t file_number = desc.GetNumber(); - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - file_number, desc.GetPathId()); - info.input_files.push_back(fn); - info.input_file_infos.push_back(CompactionFileInfo{ - static_cast(i), file_number, fmd->oldest_blob_file_number}); - if (info.table_properties.count(fn) == 0) { - std::shared_ptr tp; - auto s = current->GetTableProperties(&tp, fmd, &fn); - if (s.ok()) { - info.table_properties[fn] = tp; - } - } - } - } - for (const auto newf : c->edit()->GetNewFiles()) { - const FileMetaData& meta = newf.second; - const FileDescriptor& desc = meta.fd; - const uint64_t file_number = desc.GetNumber(); - info.output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); - info.output_file_infos.push_back(CompactionFileInfo{ - newf.first, file_number, meta.oldest_blob_file_number}); - } + BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info); for (auto listener : immutable_db_options_.listeners) { listener->OnCompactionBegin(this, info); } + info.status.PermitUncheckedError(); } mutex_.Lock(); current->Unref(); @@ 
-1172,10 +1563,11 @@ if (shutting_down_.load(std::memory_order_acquire)) { return; } - if (c->is_manual_compaction() && - manual_compaction_paused_.load(std::memory_order_acquire)) { + + if (c->ShouldNotifyOnCompactionCompleted() == false) { return; } + Version* current = cfd->current(); current->Ref(); // release lock while notifying events @@ -1212,8 +1604,6 @@ SuperVersionContext sv_context(/* create_superversion */ true); - Status status; - InstrumentedMutexLock guard_lock(&mutex_); // only allow one thread refitting @@ -1232,20 +1622,32 @@ } auto* vstorage = cfd->current()->storage_info(); - if (to_level > level) { - if (level == 0) { - return Status::NotSupported( - "Cannot change from level 0 to other levels."); - } - // Check levels are empty for a trivial move - for (int l = level + 1; l <= to_level; l++) { - if (vstorage->NumLevelFiles(l) > 0) { + if (to_level != level) { + if (to_level > level) { + if (level == 0) { + refitting_level_ = false; return Status::NotSupported( - "Levels between source and target are not empty for a move."); + "Cannot change from level 0 to other levels."); + } + // Check levels are empty for a trivial move + for (int l = level + 1; l <= to_level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } + } + } else { + // to_level < level + // Check levels are empty for a trivial move + for (int l = to_level; l < level; l++) { + if (vstorage->NumLevelFiles(l) > 0) { + refitting_level_ = false; + return Status::NotSupported( + "Levels between source and target are not empty for a move."); + } } } - } - if (to_level != level) { ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); @@ -1254,19 +1656,20 @@ edit.SetColumnFamily(cfd->GetID()); for (const auto& f : vstorage->LevelFiles(level)) { edit.DeleteFile(level, 
f->fd.GetNumber()); - edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, - f->file_checksum, f->file_checksum_func_name); + edit.AddFile( + to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), + f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time, f->file_checksum, + f->file_checksum_func_name, f->min_timestamp, f->max_timestamp); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, - directories_.GetDbDir()); + Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, + &mutex_, directories_.GetDbDir()); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", @@ -1277,16 +1680,18 @@ "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } + sv_context.Clean(); + refitting_level_ = false; + + return status; } - sv_context.Clean(); refitting_level_ = false; - - return status; + return Status::OK(); } int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); return cfh->cfd()->NumberLevels(); } @@ -1295,7 +1700,7 @@ } int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); InstrumentedMutexLock l(&mutex_); return cfh->cfd() ->GetSuperVersion() @@ -1304,7 +1709,7 @@ Status DBImpl::Flush(const FlushOptions& 
flush_options, ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", cfh->GetName().c_str()); Status s; @@ -1373,20 +1778,16 @@ input_level >= 0); InternalKey begin_storage, end_storage; - CompactionArg* ca; + CompactionArg* ca = nullptr; bool scheduled = false; + bool unscheduled = false; + Env::Priority thread_pool_priority = Env::Priority::TOTAL; bool manual_conflict = false; - ManualCompactionState manual; - manual.cfd = cfd; - manual.input_level = input_level; - manual.output_level = output_level; - manual.output_path_id = compact_range_options.target_path_id; - manual.done = false; - manual.in_progress = false; - manual.incomplete = false; - manual.exclusive = exclusive; - manual.disallow_trivial_move = disallow_trivial_move; + + ManualCompactionState manual( + cfd, input_level, output_level, compact_range_options.target_path_id, + exclusive, disallow_trivial_move, compact_range_options.canceled); // For universal compaction, we enforce every manual compaction to compact // all files. if (begin == nullptr || @@ -1410,10 +1811,24 @@ TEST_SYNC_POINT("DBImpl::RunManualCompaction:1"); InstrumentedMutexLock l(&mutex_); + if (manual_compaction_paused_ > 0) { + // Does not make sense to `AddManualCompaction()` in this scenario since + // `DisableManualCompaction()` just waited for the manual compaction queue + // to drain. So return immediately. + TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart"); + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + manual.done = true; + return manual.status; + } + // When a manual compaction arrives, temporarily disable scheduling of // non-manual compactions and wait until the number of scheduled compaction - // jobs drops to zero. This is needed to ensure that this manual compaction - // can compact any range of keys/files. 
+ // jobs drops to zero. This used to be needed to ensure that this manual + // compaction can compact any range of keys/files. Now it is optional + // (see `CompactRangeOptions::exclusive_manual_compaction`). The use case for + // `exclusive_manual_compaction=true` (the default) is unclear beyond not + // trusting the new code. // // HasPendingManualCompaction() is true when at least one thread is inside // RunManualCompaction(), i.e. during that time no other compaction will @@ -1427,8 +1842,20 @@ AddManualCompaction(&manual); TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); if (exclusive) { + // Limitation: there's no way to wake up the below loop when user sets + // `*manual.canceled`. So `CompactRangeOptions::exclusive_manual_compaction` + // and `CompactRangeOptions::canceled` might not work well together. while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0) { + if (manual_compaction_paused_ > 0 || + (manual.canceled != nullptr && *manual.canceled == true)) { + // Pretend the error came from compaction so the below cleanup/error + // handling code can process it. 
+ manual.done = true; + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + break; + } TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled"); ROCKS_LOG_INFO( immutable_db_options_.info_log, @@ -1455,9 +1882,9 @@ scheduled || (((manual.manual_end = &manual.tmp_storage1) != nullptr) && ((compaction = manual.cfd->CompactRange( - *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, - manual.output_level, compact_range_options, manual.begin, - manual.end, &manual.manual_end, &manual_conflict, + *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_, + manual.input_level, manual.output_level, compact_range_options, + manual.begin, manual.end, &manual.manual_end, &manual_conflict, max_file_num_to_ignore)) == nullptr && manual_conflict))) { // exclusive manual compactions should not see a conflict during @@ -1465,6 +1892,23 @@ assert(!exclusive || !manual_conflict); // Running either this or some other manual compaction bg_cv_.Wait(); + if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) { + assert(thread_pool_priority != Env::Priority::TOTAL); + // unschedule all manual compactions + auto unscheduled_task_num = env_->UnSchedule( + GetTaskTag(TaskType::kManualCompaction), thread_pool_priority); + if (unscheduled_task_num > 0) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Unscheduled %d number of manual compactions from the " + "thread-pool", + cfd->GetName().c_str(), unscheduled_task_num); + // it may unschedule other manual compactions, notify others. 
+ bg_cv_.SignalAll(); + } + unscheduled = true; + TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled"); + } if (scheduled && manual.incomplete == true) { assert(!manual.in_progress); scheduled = false; @@ -1487,10 +1931,25 @@ assert(false); } manual.incomplete = false; - bg_compaction_scheduled_++; - env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, - &DBImpl::UnscheduleCompactionCallback); + if (compaction->bottommost_level() && + env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + bg_bottom_compaction_scheduled_++; + ca->compaction_pri_ = Env::Priority::BOTTOM; + env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, + Env::Priority::BOTTOM, + GetTaskTag(TaskType::kManualCompaction), + &DBImpl::UnscheduleCompactionCallback); + thread_pool_priority = Env::Priority::BOTTOM; + } else { + bg_compaction_scheduled_++; + ca->compaction_pri_ = Env::Priority::LOW; + env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, + GetTaskTag(TaskType::kManualCompaction), + &DBImpl::UnscheduleCompactionCallback); + thread_pool_priority = Env::Priority::LOW; + } scheduled = true; + TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled"); } } @@ -1498,6 +1957,13 @@ assert(!manual.in_progress); assert(HasPendingManualCompaction()); RemoveManualCompaction(&manual); + // if the manual job is unscheduled, try schedule other jobs in case there's + // any unscheduled compaction job which was blocked by exclusive manual + // compaction. + if (manual.status.IsIncomplete() && + manual.status.subcode() == Status::SubCode::kManualCompactionPaused) { + MaybeScheduleFlushOrCompaction(); + } bg_cv_.SignalAll(); return manual.status; } @@ -1519,8 +1985,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& flush_options, FlushReason flush_reason, bool writes_stopped) { + // This method should not be called if atomic_flush is true. 
+ assert(!immutable_db_options_.atomic_flush); Status s; - uint64_t flush_memtable_id = 0; if (!flush_options.allow_write_stall) { bool flush_needed = true; s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); @@ -1529,7 +1996,9 @@ return s; } } - FlushRequest flush_req; + + autovector flush_reqs; + autovector memtable_ids_to_wait; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); @@ -1544,16 +2013,27 @@ } WaitForPendingWrites(); - if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { + if (flush_reason != FlushReason::kErrorRecoveryRetryFlush && + (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) { + // Note that, when flush reason is kErrorRecoveryRetryFlush, during the + // auto retry resume, we want to avoid creating new small memtables. + // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl + // will iterate through all the CFs and call FlushMemtable during auto + // retry resume, it is possible that in some CFs, + // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will + // be created and scheduled, status::OK() will be returned. 
s = SwitchMemtable(cfd, &context); } + const uint64_t flush_memtable_id = port::kMaxUint64; if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - flush_memtable_id = cfd->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd, flush_memtable_id); + FlushRequest req{{cfd, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID()); } - if (immutable_db_options_.persist_stats_to_disk) { + if (immutable_db_options_.persist_stats_to_disk && + flush_reason != FlushReason::kErrorRecoveryRetryFlush) { ColumnFamilyData* cfd_stats = versions_->GetColumnFamilySet()->GetColumnFamily( kPersistentStatsColumnFamilyName); @@ -1576,16 +2056,19 @@ "to avoid holding old logs", cfd->GetName().c_str()); s = SwitchMemtable(cfd_stats, &context); - flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd_stats, flush_memtable_id); + FlushRequest req{{cfd_stats, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back( + cfd->imm()->GetLatestMemTableID()); } } } } - if (s.ok() && !flush_req.empty()) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + if (s.ok() && !flush_reqs.empty()) { + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->imm()->FlushRequested(); } // If the caller wants to wait for this flush to complete, it indicates @@ -1593,12 +2076,15 @@ // other threads which may drop the column family concurrently. // Therefore, we increase the cfd's ref count. 
if (flush_options.wait) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->Ref(); } } - SchedulePendingFlush(flush_req, flush_reason); + for (const auto& req : flush_reqs) { + SchedulePendingFlush(req, flush_reason); + } MaybeScheduleFlushOrCompaction(); } @@ -1614,12 +2100,16 @@ if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; - for (auto& iter : flush_req) { - cfds.push_back(iter.first); - flush_memtable_ids.push_back(&(iter.second)); - } - s = WaitForFlushMemTables(cfds, flush_memtable_ids, - (flush_reason == FlushReason::kErrorRecovery)); + assert(flush_reqs.size() == memtable_ids_to_wait.size()); + for (size_t i = 0; i < flush_reqs.size(); ++i) { + assert(flush_reqs[i].size() == 1); + cfds.push_back(flush_reqs[i][0].first); + flush_memtable_ids.push_back(&(memtable_ids_to_wait[i])); + } + s = WaitForFlushMemTables( + cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery || + flush_reason == FlushReason::kErrorRecoveryRetryFlush)); InstrumentedMutexLock lock_guard(&mutex_); for (auto* tmp_cfd : cfds) { tmp_cfd->UnrefAndTryDelete(); @@ -1677,7 +2167,8 @@ } } for (auto cfd : cfds) { - if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) { + if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) || + flush_reason == FlushReason::kErrorRecoveryRetryFlush) { continue; } cfd->Ref(); @@ -1720,8 +2211,10 @@ for (auto& iter : flush_req) { flush_memtable_ids.push_back(&(iter.second)); } - s = WaitForFlushMemTables(cfds, flush_memtable_ids, - (flush_reason == FlushReason::kErrorRecovery)); + s = WaitForFlushMemTables( + cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery || + flush_reason == FlushReason::kErrorRecoveryRetryFlush)); InstrumentedMutexLock lock_guard(&mutex_); for (auto* cfd : cfds) { 
cfd->UnrefAndTryDelete(); @@ -1793,12 +2286,12 @@ // check whether one extra immutable memtable or an extra L0 file would // cause write stalling mode to be entered. It could still enter stall // mode due to pending compaction bytes, but that's less common - write_stall_condition = - ColumnFamilyData::GetWriteStallConditionAndCause( - cfd->imm()->NumNotFlushed() + 1, - vstorage->l0_delay_trigger_count() + 1, - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) - .first; + write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), + mutable_cf_options, *cfd->ioptions()) + .first; } while (write_stall_condition != WriteStallCondition::kNormal); } return Status::OK(); @@ -1821,16 +2314,29 @@ int num = static_cast(cfds.size()); // Wait until the compaction completes InstrumentedMutexLock l(&mutex_); + Status s; // If the caller is trying to resume from bg error, then // error_handler_.IsDBStopped() is true. while (resuming_from_bg_err || !error_handler_.IsDBStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { - return Status::ShutdownInProgress(); + s = Status::ShutdownInProgress(); + return s; } // If an error has occurred during resumption, then no need to wait. + // But flush operation may fail because of this error, so need to + // return the status. if (!error_handler_.GetRecoveryError().ok()) { + s = error_handler_.GetRecoveryError(); break; } + // If BGWorkStopped, which indicate that there is a BG error and + // 1) soft error but requires no BG work, 2) no in auto_recovery_ + if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() && + error_handler_.GetBGError().severity() < Status::Severity::kHardError) { + s = error_handler_.GetBGError(); + return s; + } + // Number of column families that have been dropped. int num_dropped = 0; // Number of column families that have finished flush. 
@@ -1846,7 +2352,8 @@ } } if (1 == num_dropped && 1 == num) { - return Status::InvalidArgument("Cannot flush a dropped CF"); + s = Status::ColumnFamilyDropped(); + return s; } // Column families involved in this flush request have either been dropped // or finished flush. Then it's time to finish waiting. @@ -1855,7 +2362,6 @@ } bg_cv_.Wait(); } - Status s; // If not resuming from bg error, and an error has caused the DB to stop, // then report the bg error to caller. if (!resuming_from_bg_err && error_handler_.IsDBStopped()) { @@ -1879,11 +2385,25 @@ } void DBImpl::DisableManualCompaction() { - manual_compaction_paused_.store(true, std::memory_order_release); + InstrumentedMutexLock l(&mutex_); + manual_compaction_paused_.fetch_add(1, std::memory_order_release); + + // Wake up manual compactions waiting to start. + bg_cv_.SignalAll(); + + // Wait for any pending manual compactions to finish (typically through + // failing with `Status::Incomplete`) prior to returning. This way we are + // guaranteed no pending manual compaction will commit while manual + // compactions are "disabled". 
+ while (HasPendingManualCompaction()) { + bg_cv_.Wait(); + } } void DBImpl::EnableManualCompaction() { - manual_compaction_paused_.store(false, std::memory_order_release); + InstrumentedMutexLock l(&mutex_); + assert(manual_compaction_paused_ > 0); + manual_compaction_paused_.fetch_sub(1, std::memory_order_release); } void DBImpl::MaybeScheduleFlushOrCompaction() { @@ -1956,10 +2476,12 @@ return; } - while (bg_compaction_scheduled_ < bg_job_limits.max_compactions && + while (bg_compaction_scheduled_ + bg_bottom_compaction_scheduled_ < + bg_job_limits.max_compactions && unscheduled_compactions_ > 0) { CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::LOW; ca->prepicked_compaction = nullptr; bg_compaction_scheduled_++; unscheduled_compactions_--; @@ -1970,7 +2492,7 @@ DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const { mutex_.AssertHeld(); - return GetBGJobLimits(immutable_db_options_.max_background_flushes, + return GetBGJobLimits(mutable_db_options_.max_background_flushes, mutable_db_options_.max_background_compactions, mutable_db_options_.max_background_jobs, write_controller_.NeedSpeedupCompaction()); @@ -2019,6 +2541,17 @@ assert(!flush_queue_.empty()); FlushRequest flush_req = flush_queue_.front(); flush_queue_.pop_front(); + if (!immutable_db_options_.atomic_flush) { + assert(flush_req.size() == 1); + } + for (const auto& elem : flush_req) { + if (!immutable_db_options_.atomic_flush) { + ColumnFamilyData* cfd = elem.first; + assert(cfd); + assert(cfd->queued_for_flush()); + cfd->set_queued_for_flush(false); + } + } // TODO: need to unset flush reason? 
return flush_req; } @@ -2051,19 +2584,47 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, FlushReason flush_reason) { + mutex_.AssertHeld(); if (flush_req.empty()) { return; } - for (auto& iter : flush_req) { - ColumnFamilyData* cfd = iter.first; - cfd->Ref(); - cfd->SetFlushReason(flush_reason); + if (!immutable_db_options_.atomic_flush) { + // For the non-atomic flush case, we never schedule multiple column + // families in the same flush request. + assert(flush_req.size() == 1); + ColumnFamilyData* cfd = flush_req[0].first; + assert(cfd); + // Note: SchedulePendingFlush is always preceded + // with an imm()->FlushRequested() call. However, + // we want to make this code snipper more resilient to + // future changes. Therefore, we add the following if + // statement - note that calling it twice (or more) + // doesn't break anything. + if (immutable_db_options_.experimental_mempurge_threshold > 0.0) { + // If imm() contains silent memtables, + // requesting a flush will mark the imm_needed as true. 
+ cfd->imm()->FlushRequested(); + } + if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { + cfd->Ref(); + cfd->set_queued_for_flush(true); + cfd->SetFlushReason(flush_reason); + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); + } + } else { + for (auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + cfd->SetFlushReason(flush_reason); + } + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); } - ++unscheduled_flushes_; - flush_queue_.push_back(flush_req); } void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + mutex_.AssertHeld(); if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2083,8 +2644,7 @@ IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush"); - static_cast_with_check(fta.db_)->BackgroundCallFlush( - fta.thread_pri_); + static_cast_with_check(fta.db_)->BackgroundCallFlush(fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); } @@ -2095,7 +2655,7 @@ TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); auto prepicked_compaction = static_cast(ca.prepicked_compaction); - static_cast_with_check(ca.db)->BackgroundCallCompaction( + static_cast_with_check(ca.db)->BackgroundCallCompaction( prepicked_compaction, Env::Priority::LOW); delete prepicked_compaction; } @@ -2106,8 +2666,7 @@ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM); TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction"); auto* prepicked_compaction = ca.prepicked_compaction; - assert(prepicked_compaction && prepicked_compaction->compaction && - !prepicked_compaction->manual_compaction_state); + assert(prepicked_compaction && prepicked_compaction->compaction); ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM); delete prepicked_compaction; } @@ -2120,10 +2679,27 @@ } void DBImpl::UnscheduleCompactionCallback(void* arg) { - CompactionArg ca = *(reinterpret_cast(arg)); + CompactionArg* ca_ptr = 
reinterpret_cast(arg); + Env::Priority compaction_pri = ca_ptr->compaction_pri_; + if (Env::Priority::BOTTOM == compaction_pri) { + // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM + ca_ptr->db->bg_bottom_compaction_scheduled_--; + } else if (Env::Priority::LOW == compaction_pri) { + // Decrement bg_compaction_scheduled_ if priority is LOW + ca_ptr->db->bg_compaction_scheduled_--; + } + CompactionArg ca = *(ca_ptr); delete reinterpret_cast(arg); if (ca.prepicked_compaction != nullptr) { + // if it's a manual compaction, set status to ManualCompactionPaused + if (ca.prepicked_compaction->manual_compaction_state) { + ca.prepicked_compaction->manual_compaction_state->done = true; + ca.prepicked_compaction->manual_compaction_state->status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } if (ca.prepicked_compaction->compaction != nullptr) { + ca.prepicked_compaction->compaction->ReleaseCompactionFiles( + Status::Incomplete(Status::SubCode::kManualCompactionPaused)); delete ca.prepicked_compaction->compaction; } delete ca.prepicked_compaction; @@ -2132,6 +2708,14 @@ } void DBImpl::UnscheduleFlushCallback(void* arg) { + // Decrement bg_flush_scheduled_ in flush callback + reinterpret_cast(arg)->db_->bg_flush_scheduled_--; + Env::Priority flush_pri = reinterpret_cast(arg)->thread_pri_; + if (Env::Priority::LOW == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback"); + } else if (Env::Priority::HIGH == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback"); + } delete reinterpret_cast(arg); TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); } @@ -2169,6 +2753,11 @@ for (const auto& iter : flush_req) { ColumnFamilyData* cfd = iter.first; + if (immutable_db_options_.experimental_mempurge_threshold > 0.0) { + // If imm() contains silent memtables, + // requesting a flush will mark the imm_needed as true. 
+ cfd->imm()->FlushRequested(); + } if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { // can't flush this CF, try next one column_families_not_to_flush.push_back(cfd); @@ -2220,10 +2809,12 @@ bool made_progress = false; JobContext job_context(next_job_id_.fetch_add(1), true); - TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:start"); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCallFlush:start", nullptr); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1"); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2"); { InstrumentedMutexLock l(&mutex_); assert(bg_flush_scheduled_); @@ -2252,7 +2843,7 @@ s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2325,7 +2916,8 @@ if (s.IsBusy()) { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - env_->SleepForMicroseconds(10000); // prevent hot loop + immutable_db_options_.clock->SleepForMicroseconds( + 10000); // prevent hot loop mutex_.Lock(); } else if (!s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { @@ -2343,9 +2935,10 @@ "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } else if (s.IsManualCompactionPaused()) { + assert(prepicked_compaction); ManualCompactionState* m = prepicked_compaction->manual_compaction_state; assert(m); ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused", @@ -2354,12 +2947,13 @@ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); - // If compaction failed, we want to delete all temporary files that we might - // 
have created (they might not be all recorded in job_context in case of a - // failure). Thus, we force full scan in FindObsoleteFiles() + // If compaction failed, we want to delete all temporary files that we + // might have created (they might not be all recorded in job_context in + // case of a failure). Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && - !s.IsColumnFamilyDropped()); + !s.IsColumnFamilyDropped() && + !s.IsBusy()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2382,6 +2976,7 @@ assert(num_running_compactions_ > 0); num_running_compactions_--; + if (bg_thread_pri == Env::Priority::LOW) { bg_compaction_scheduled_--; } else { @@ -2389,10 +2984,17 @@ bg_bottom_compaction_scheduled_--; } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - // See if there's more work to be done MaybeScheduleFlushOrCompaction(); + + if (prepicked_compaction != nullptr && + prepicked_compaction->task_token != nullptr) { + // Releasing task tokens affects (and asserts on) the DB state, so + // must be done before we potentially signal the DB close process to + // proceed below. 
+ prepicked_compaction->task_token.reset(); + } + if (made_progress || (bg_compaction_scheduled_ == 0 && bg_bottom_compaction_scheduled_ == 0) || @@ -2443,7 +3045,10 @@ if (shutting_down_.load(std::memory_order_acquire)) { status = Status::ShutdownInProgress(); } else if (is_manual && - manual_compaction_paused_.load(std::memory_order_acquire)) { + manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } else if (is_manual && manual_compaction->canceled && + manual_compaction->canceled->load(std::memory_order_acquire)) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } } else { @@ -2474,6 +3079,8 @@ manual_compaction->in_progress = true; } + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress"); + std::unique_ptr task_token; // InternalKey manual_end_storage; @@ -2485,12 +3092,13 @@ if (!c) { m->done = true; m->manual_end = nullptr; - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Manual compaction from level-%d from %s .. " - "%s; nothing to do\n", - m->cfd->GetName().c_str(), m->input_level, - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)")); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Manual compaction from level-%d from %s .. " + "%s; nothing to do\n", + m->cfd->GetName().c_str(), m->input_level, + (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"), + (m->end ? m->end->DebugString(true).c_str() : "(end)")); } else { // First check if we have enough room to do the compaction bool enough_room = EnoughRoomForCompaction( @@ -2509,11 +3117,11 @@ "[%s] Manual compaction from level-%d to level-%d from %s .. " "%s; will stop at %s\n", m->cfd->GetName().c_str(), m->input_level, c->output_level(), - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)"), + (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"), + (m->end ? 
m->end->DebugString(true).c_str() : "(end)"), ((m->done || m->manual_end == nullptr) ? "(end)" - : m->manual_end->DebugString().c_str())); + : m->manual_end->DebugString(true).c_str())); } } } else if (!is_prepicked && !compaction_queue_.empty()) { @@ -2557,7 +3165,8 @@ // compaction is not necessary. Need to make sure mutex is held // until we make a copy in the following code TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); - c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); + c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_, + log_buffer)); TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); if (c != nullptr) { @@ -2570,7 +3179,7 @@ c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2581,8 +3190,12 @@ status = Status::CompactionTooLarge(); } else { // update statistics - RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); + size_t num_files = 0; + for (auto& each_level : *c->inputs()) { + num_files += each_level.files.size(); + } + RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, num_files); + // There are three things that can change compaction score: // 1) When flush or compaction finish. 
This case is covered by // InstallSuperVersionAndScheduleWork @@ -2606,6 +3219,7 @@ } } + IOStatus io_s; if (!c) { // Nothing to do ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); @@ -2630,6 +3244,7 @@ status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); + io_s = versions_->io_status(); InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], *c->mutable_cf_options()); @@ -2665,13 +3280,13 @@ for (size_t i = 0; i < c->num_input_files(l); i++) { FileMetaData* f = c->input(l, i); c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); - c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), - f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, - f->largest, f->fd.smallest_seqno, - f->fd.largest_seqno, f->marked_for_compaction, - f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, - f->file_checksum_func_name); + c->edit()->AddFile( + c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, f->file_checksum_func_name, + f->min_timestamp, f->max_timestamp); ROCKS_LOG_BUFFER( log_buffer, @@ -2686,6 +3301,7 @@ status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); + io_s = versions_->io_status(); // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], @@ -2727,6 +3343,7 @@ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::BOTTOM; ca->prepicked_compaction = new PrepickedCompaction; 
ca->prepicked_compaction->compaction = c.release(); ca->prepicked_compaction->manual_compaction_state = nullptr; @@ -2750,28 +3367,34 @@ assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, - file_options_for_compaction_, versions_.get(), &shutting_down_, - preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, - &mutex_, &error_handler_, snapshot_seqs, - earliest_write_conflict_snapshot, snapshot_checker, table_cache_, - &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + mutable_db_options_, file_options_for_compaction_, versions_.get(), + &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, + directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, + &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, table_cache_, &event_logger_, + c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, thread_pri, - is_manual ? &manual_compaction_paused_ : nullptr); + &compaction_job_stats, thread_pri, io_tracer_, + is_manual ? &manual_compaction_paused_ : nullptr, + is_manual ? manual_compaction->canceled : nullptr, db_id_, + db_session_id_, c->column_family_data()->GetFullHistoryTsLow(), + &blob_callback_); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); - mutex_.Unlock(); TEST_SYNC_POINT_CALLBACK( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); - compaction_job.Run(); + // Should handle erorr? 
+ compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); status = compaction_job.Install(*c->mutable_cf_options()); + io_s = compaction_job.io_status(); if (status.ok()) { InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], @@ -2781,6 +3404,13 @@ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); } + + if (status.ok() && !io_s.ok()) { + status = io_s; + } else { + io_s.PermitUncheckedError(); + } + if (c != nullptr) { c->ReleaseCompactionFiles(status); *made_progress = true; @@ -2806,7 +3436,19 @@ } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + if (!io_s.ok()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? 
BackgroundErrorReason::kCompaction + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); + } else { + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + } if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { // Put this cfd back in the compaction queue so we can retry after some // time @@ -2817,7 +3459,7 @@ c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); if (!cfd->queued_for_compaction()) { AddToCompactionQueue(cfd); @@ -2873,6 +3515,7 @@ } void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) { + assert(manual_compaction_paused_ == 0); manual_compaction_dequeue_.push_back(m); } @@ -2958,7 +3601,7 @@ if (m->cfd != m1->cfd) { return false; } - return true; + return false; } #ifndef ROCKSDB_LITE @@ -2982,7 +3625,7 @@ for (const auto fmd : *c->inputs(i)) { const FileDescriptor& desc = fmd->fd; const uint64_t file_number = desc.GetNumber(); - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + auto fn = TableFileName(c->immutable_options()->cf_paths, file_number, desc.GetPathId()); compaction_job_info->input_files.push_back(fn); compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ @@ -3001,10 +3644,34 @@ const FileDescriptor& desc = meta.fd; const uint64_t file_number = desc.GetNumber(); compaction_job_info->output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + c->immutable_options()->cf_paths, file_number, desc.GetPathId())); compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ newf.first, file_number, meta.oldest_blob_file_number}); } + compaction_job_info->blob_compression_type = + c->mutable_cf_options()->blob_compression_type; + + // Update BlobFilesInfo. 
+ for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) { + BlobFileAdditionInfo blob_file_addition_info( + BlobFileName(c->immutable_options()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(), + blob_file.GetTotalBlobBytes()); + compaction_job_info->blob_file_addition_infos.emplace_back( + std::move(blob_file_addition_info)); + } + + // Update BlobFilesGarbageInfo. + for (const auto& blob_file : c->edit()->GetBlobFileGarbages()) { + BlobFileGarbageInfo blob_file_garbage_info( + BlobFileName(c->immutable_options()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetGarbageBlobCount(), + blob_file.GetGarbageBlobBytes()); + compaction_job_info->blob_file_garbage_infos.emplace_back( + std::move(blob_file_garbage_info)); + } } #endif @@ -3037,7 +3704,7 @@ if (UNLIKELY(sv_context->new_superversion == nullptr)) { sv_context->NewSuperVersion(); } - cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options); + cfd->InstallSuperVersion(sv_context, mutable_cf_options); // There may be a small data race here. The snapshot tricking bottommost // compaction may already be released here. 
But assuming there will always be diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/error_handler.h" +#include "db/periodic_work_scheduler.h" #include "monitoring/thread_status_updater.h" #include "util/cast_util.h" @@ -21,12 +22,13 @@ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } -void DBImpl::TEST_SwitchWAL() { +Status DBImpl::TEST_SwitchWAL() { WriteContext write_context; InstrumentedMutexLock l(&mutex_); void* writer = TEST_BeginWrite(); - SwitchWAL(&write_context); + auto s = SwitchWAL(&write_context); TEST_EndWrite(writer); + return s; } bool DBImpl::TEST_WALBufferIsEmpty(bool lock) { @@ -41,13 +43,13 @@ return res; } -int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( +uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } InstrumentedMutexLock l(&mutex_); @@ -56,8 +58,9 @@ void DBImpl::TEST_GetFilesMetaData( ColumnFamilyHandle* column_family, - std::vector>* metadata) { - auto cfh = reinterpret_cast(column_family); + std::vector>* metadata, + std::vector>* blob_metadata) { + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); InstrumentedMutexLock l(&mutex_); metadata->resize(NumberLevels()); @@ -70,6 +73,12 @@ (*metadata)[level].push_back(*f); } } + if (blob_metadata != nullptr) { + blob_metadata->clear(); + for (const auto& blob : 
cfd->current()->storage_info()->GetBlobFiles()) { + blob_metadata->push_back(blob.second); + } + } } uint64_t DBImpl::TEST_Current_Manifest_FileNo() { @@ -88,7 +97,7 @@ if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } int output_level = @@ -131,7 +140,7 @@ if (cfh == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfhi = reinterpret_cast(cfh); + auto cfhi = static_cast_with_check(cfh); cfd = cfhi->cfd(); } return FlushMemTable(cfd, fo, FlushReason::kTest); @@ -147,12 +156,18 @@ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); } +Status DBImpl::TEST_WaitForBackgroundWork() { + InstrumentedMutexLock l(&mutex_); + WaitForBackgroundWork(); + return error_handler_.GetBGError(); +} + Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); } else { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); cfd = cfh->cfd(); } return WaitForFlushMemTable(cfd, nullptr, false); @@ -169,12 +184,25 @@ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || (wait_unscheduled && unscheduled_compactions_)) && - (error_handler_.GetBGError() == Status::OK())) { + (error_handler_.GetBGError().ok())) { + bg_cv_.Wait(); + } + return error_handler_.GetBGError(); +} + +Status DBImpl::TEST_WaitForPurge() { + InstrumentedMutexLock l(&mutex_); + while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) { bg_cv_.Wait(); } return error_handler_.GetBGError(); } +Status DBImpl::TEST_GetBGError() { + InstrumentedMutexLock l(&mutex_); + return error_handler_.GetBGError(); +} + void DBImpl::TEST_LockMutex() { mutex_.Lock(); } void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } @@ -234,15 +262,14 @@ uint64_t 
DBImpl::TEST_FindMinPrepLogReferencedByMemTable() { autovector empty_list; - return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr, - empty_list); + return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list); } Status DBImpl::TEST_GetLatestMutableCFOptions( ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) { InstrumentedMutexLock l(&mutex_); - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions(); return Status::OK(); } @@ -271,21 +298,18 @@ return GetWalPreallocateBlockSize(write_buffer_size); } -void DBImpl::TEST_WaitForDumpStatsRun(std::function callback) const { - if (thread_dump_stats_ != nullptr) { - thread_dump_stats_->TEST_WaitForRun(callback); - } -} - -void DBImpl::TEST_WaitForPersistStatsRun(std::function callback) const { - if (thread_persist_stats_ != nullptr) { - thread_persist_stats_->TEST_WaitForRun(callback); +#ifndef ROCKSDB_LITE +void DBImpl::TEST_WaitForStatsDumpRun(std::function callback) const { + if (periodic_work_scheduler_ != nullptr) { + static_cast(periodic_work_scheduler_) + ->TEST_WaitForRun(callback); } } -bool DBImpl::TEST_IsPersistentStatsEnabled() const { - return thread_persist_stats_ && thread_persist_stats_->IsRunning(); +PeriodicWorkTestScheduler* DBImpl::TEST_GetPeriodicWorkScheduler() const { + return static_cast(periodic_work_scheduler_); } +#endif // !ROCKSDB_LITE size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { return EstimateInMemoryStatsHistorySize(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,22 +7,23 @@ // 
Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl/db_impl.h" - #include #include #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" +#include "logging/logging.h" #include "rocksdb/status.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); InternalKey start_key, end_key; if (begin != nullptr) { @@ -75,7 +76,8 @@ "PromoteL0 FAILED. Target level %d does not exist\n", target_level); job_context.Clean(); - return Status::InvalidArgument("Target level does not exist"); + status = Status::InvalidArgument("Target level does not exist"); + return status; } // Sort L0 files by range. @@ -95,7 +97,9 @@ "PromoteL0 FAILED. File %" PRIu64 " being compacted\n", f->fd.GetNumber()); job_context.Clean(); - return Status::InvalidArgument("PromoteL0 called during L0 compaction"); + status = + Status::InvalidArgument("PromoteL0 called during L0 compaction"); + return status; } if (i == 0) continue; @@ -106,7 +110,8 @@ " have overlapping ranges\n", prev_f->fd.GetNumber(), f->fd.GetNumber()); job_context.Clean(); - return Status::InvalidArgument("L0 has overlapping files"); + status = Status::InvalidArgument("L0 has overlapping files"); + return status; } } @@ -116,21 +121,23 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "PromoteL0 FAILED. 
Level %d not empty\n", level); job_context.Clean(); - return Status::InvalidArgument( + status = Status::InvalidArgument( "All levels up to target_level " "must be empty"); + return status; } } edit.SetColumnFamily(cfd->GetID()); for (const auto& f : l0_files) { edit.DeleteFile(0, f->fd.GetNumber()); - edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, - f->file_checksum, f->file_checksum_func_name); + edit.AddFile( + target_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, f->file_checksum_func_name, + f->min_timestamp, f->max_timestamp); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_files.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,25 +6,24 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl/db_impl.h" - #include #include #include + +#include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "db/memtable_list.h" #include "file/file_util.h" +#include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" +#include "port/port.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { uint64_t DBImpl::MinLogNumberToKeep() { - if (allow_2pc()) { - return versions_->min_log_number_to_keep_2pc(); - } else { - return versions_->MinLogNumberWithUnflushedData(); - } + return versions_->min_log_number_to_keep(); } uint64_t DBImpl::MinObsoleteSstNumberToKeep() { @@ -35,7 +34,71 @@ return std::numeric_limits::max(); } -// * Returns the list of live files in 'sst_live' +Status DBImpl::DisableFileDeletions() { + Status s; + int my_disable_delete_obsolete_files; + { + InstrumentedMutexLock l(&mutex_); + s = DisableFileDeletionsWithLock(); + my_disable_delete_obsolete_files = disable_delete_obsolete_files_; + } + if (my_disable_delete_obsolete_files == 1) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Disabled, but already disabled. 
Counter: %d", + my_disable_delete_obsolete_files); + } + return s; +} + +// FIXME: can be inconsistent with DisableFileDeletions in cases like +// DBImplReadOnly +Status DBImpl::DisableFileDeletionsWithLock() { + mutex_.AssertHeld(); + ++disable_delete_obsolete_files_; + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + int saved_counter; // initialize on all paths + { + InstrumentedMutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + saved_counter = disable_delete_obsolete_files_; + if (saved_counter == 0) { + FindObsoleteFiles(&job_context, true); + bg_cv_.SignalAll(); + } + } + if (saved_counter == 0) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Enable, but not really enabled. Counter: %d", + saved_counter); + } + job_context.Clean(); + LogFlush(immutable_db_options_.info_log); + return Status::OK(); +} + +bool DBImpl::IsFileDeletionsEnabled() const { + return 0 == disable_delete_obsolete_files_; +} + +// * Returns the list of live files in 'sst_live' and 'blob_live'. // If it's doing full scan: // * Returns the list of all files in the filesystem in // 'full_scan_candidate_files'. 
@@ -62,7 +125,7 @@ mutable_db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = immutable_db_options_.clock->NowMicros(); if ((delete_obsolete_files_last_run_ + mutable_db_options_.delete_obsolete_files_period_micros) < now_micros) { @@ -76,26 +139,26 @@ // Since job_context->min_pending_output is set, until file scan finishes, // mutex_ cannot be released. Otherwise, we might see no min_pending_output // here but later find newer generated unfinalized files while scanning. - if (!pending_outputs_.empty()) { - job_context->min_pending_output = *pending_outputs_.begin(); - } else { - // delete all of them - job_context->min_pending_output = std::numeric_limits::max(); - } + job_context->min_pending_output = MinObsoleteSstNumberToKeep(); // Get obsolete files. This function will also update the list of // pending files in VersionSet(). - versions_->GetObsoleteFiles(&job_context->sst_delete_files, - &job_context->manifest_delete_files, - job_context->min_pending_output); - - // Mark the elements in job_context->sst_delete_files as grabbedForPurge - // so that other threads calling FindObsoleteFiles with full_scan=true - // will not add these files to candidate list for purge. + versions_->GetObsoleteFiles( + &job_context->sst_delete_files, &job_context->blob_delete_files, + &job_context->manifest_delete_files, job_context->min_pending_output); + + // Mark the elements in job_context->sst_delete_files and + // job_context->blob_delete_files as "grabbed for purge" so that other threads + // calling FindObsoleteFiles with full_scan=true will not add these files to + // candidate list for purge. 
for (const auto& sst_to_del : job_context->sst_delete_files) { MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber()); } + for (const auto& blob_file : job_context->blob_delete_files) { + MarkAsGrabbedForPurge(blob_file.GetBlobFileNumber()); + } + // store the current filenum, lognum, etc job_context->manifest_file_number = versions_->manifest_file_number(); job_context->pending_manifest_file_number = @@ -103,7 +166,7 @@ job_context->log_number = MinLogNumberToKeep(); job_context->prev_log_number = versions_->prev_log_number(); - versions_->AddLiveFiles(&job_context->sst_live); + versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live); if (doing_the_full_scan) { InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), dbname_); @@ -133,7 +196,8 @@ // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - env_->GetChildren(path, &files); // Ignore errors + Status s = env_->GetChildren(path, &files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& file : files) { uint64_t number; FileType type; @@ -149,27 +213,30 @@ continue; } - // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + // TODO(icanadi) clean up this mess to avoid having one-off "/" + // prefixes job_context->full_scan_candidate_files.emplace_back("/" + file, path); } } // Add log files in wal_dir - if (immutable_db_options_.wal_dir != dbname_) { + if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) { std::vector log_files; - env_->GetChildren(immutable_db_options_.wal_dir, - &log_files); // Ignore errors + Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? 
for (const std::string& log_file : log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.wal_dir); } } + // Add info log files in db_log_dir if (!immutable_db_options_.db_log_dir.empty() && immutable_db_options_.db_log_dir != dbname_) { std::vector info_log_files; - // Ignore errors - env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + Status s = + env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (std::string& log_file : info_log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.db_log_dir); @@ -250,17 +317,22 @@ return (first.file_path > second.file_path); } } -}; // namespace +} // namespace // Delete obsolete files and log status and information of file deletion void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, const std::string& path_to_sync, FileType type, uint64_t number) { + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", + const_cast(&fname)); + Status file_deletion_status; - if (type == kTableFile || type == kLogFile) { - file_deletion_status = - DeleteDBFile(&immutable_db_options_, fname, path_to_sync, - /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); + if (type == kTableFile || type == kBlobFile || type == kWalFile) { + // Rate limit WAL deletion only if its in the DB dir + file_deletion_status = DeleteDBFile( + &immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, + /*force_fg=*/(type == kWalFile) ? 
!wal_in_db_path_ : false); } else { file_deletion_status = env_->DeleteFile(fname); } @@ -289,6 +361,11 @@ &event_logger_, job_id, number, fname, file_deletion_status, GetName(), immutable_db_options_.listeners); } + if (type == kBlobFile) { + EventHelpers::LogAndNotifyBlobFileDeletion( + &event_logger_, immutable_db_options_.listeners, job_id, number, fname, + file_deletion_status, GetName()); + } } // Diffs the files listed in filenames and those that do not @@ -303,19 +380,19 @@ // FindObsoleteFiles() should've populated this so nonzero assert(state.manifest_file_number != 0); - // Now, convert live list to an unordered map, WITHOUT mutex held; - // set is slow. - std::unordered_map sst_live_map; - for (const FileDescriptor& fd : state.sst_live) { - sst_live_map[fd.GetNumber()] = &fd; - } + // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow. + std::unordered_set sst_live_set(state.sst_live.begin(), + state.sst_live.end()); + std::unordered_set blob_live_set(state.blob_live.begin(), + state.blob_live.end()); std::unordered_set log_recycle_files_set( state.log_recycle_files.begin(), state.log_recycle_files.end()); auto candidate_files = state.full_scan_candidate_files; candidate_files.reserve( candidate_files.size() + state.sst_delete_files.size() + - state.log_delete_files.size() + state.manifest_delete_files.size()); + state.blob_delete_files.size() + state.log_delete_files.size() + + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( @@ -326,10 +403,15 @@ file.DeleteMetadata(); } + for (const auto& blob_file : state.blob_delete_files) { + candidate_files.emplace_back(BlobFileName(blob_file.GetBlobFileNumber()), + blob_file.GetPath()); + } + + auto wal_dir = immutable_db_options_.GetWalDir(); for (auto file_num : state.log_delete_files) { if (file_num > 0) { - candidate_files.emplace_back(LogFileName(file_num), - immutable_db_options_.wal_dir); + candidate_files.emplace_back(LogFileName(file_num), wal_dir); } } for (const auto& filename : state.manifest_delete_files) { @@ -382,7 +464,8 @@ // Close WALs before trying to delete them. for (const auto w : state.logs_to_free) { // TODO: maybe check the return value of Close. - w->Close(); + auto s = w->Close(); + s.PermitUncheckedError(); } bool own_files = OwnTablesAndLogs(); @@ -398,7 +481,7 @@ bool keep = true; switch (type) { - case kLogFile: + case kWalFile: keep = ((number >= state.log_number) || (number == state.prev_log_number) || (log_recycle_files_set.find(number) != @@ -412,12 +495,19 @@ case kTableFile: // If the second condition is not there, this makes // DontDeletePendingOutputs fail - keep = (sst_live_map.find(number) != sst_live_map.end()) || + keep = (sst_live_set.find(number) != sst_live_set.end()) || number >= state.min_pending_output; if (!keep) { files_to_del.insert(number); } break; + case kBlobFile: + keep = number >= state.min_pending_output || + (blob_live_set.find(number) != blob_live_set.end()); + if (!keep) { + files_to_del.insert(number); + } + break; case kTempFile: // Any temp files that are currently being written to must // be recorded in pending_outputs_, which is inserted into "live". @@ -427,7 +517,8 @@ // // TODO(yhchiang): carefully modify the third condition to safely // remove the temp options files. 
- keep = (sst_live_map.find(number) != sst_live_map.end()) || + keep = (sst_live_set.find(number) != sst_live_set.end()) || + (blob_live_set.find(number) != blob_live_set.end()) || (number == state.pending_manifest_file_number) || (to_delete.find(kOptionsFileNamePrefix) != std::string::npos); break; @@ -439,18 +530,11 @@ break; case kOptionsFile: keep = (number >= optsfile_num2); - TEST_SYNC_POINT_CALLBACK( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", - reinterpret_cast(&number)); - TEST_SYNC_POINT_CALLBACK( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", - reinterpret_cast(&keep)); break; case kCurrentFile: case kDBLockFile: case kIdentityFile: case kMetaDatabase: - case kBlobFile: keep = true; break; } @@ -466,9 +550,11 @@ TableCache::Evict(table_cache_.get(), number); fname = MakeTableFileName(candidate_file.file_path, number); dir_to_sync = candidate_file.file_path; + } else if (type == kBlobFile) { + fname = BlobFileName(candidate_file.file_path, number); + dir_to_sync = candidate_file.file_path; } else { - dir_to_sync = - (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_; + dir_to_sync = (type == kWalFile) ? 
wal_dir : dbname_; fname = dir_to_sync + ((!dir_to_sync.empty() && dir_to_sync.back() == '/') || (!to_delete.empty() && to_delete.front() == '/') @@ -478,8 +564,8 @@ } #ifndef ROCKSDB_LITE - if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0)) { + if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0)) { wal_manager_.ArchiveWALFile(fname, number); continue; } @@ -491,7 +577,6 @@ if (!own_files) { continue; } - Status file_deletion_status; if (schedule_only) { InstrumentedMutexLock guard_lock(&mutex_); SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id); @@ -555,6 +640,11 @@ InstrumentedMutexLock l(&mutex_); --pending_purge_obsolete_files_; assert(pending_purge_obsolete_files_ >= 0); + if (schedule_only) { + // Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_ + // while holding mutex (for GetSortedWalFiles() etc.) + SchedulePurge(); + } if (pending_purge_obsolete_files_ == 0) { bg_cv_.SignalAll(); } @@ -568,26 +658,28 @@ mutex_.Unlock(); if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); + bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io; + PurgeObsoleteFiles(job_context, defer_purge); } job_context.Clean(); mutex_.Lock(); } uint64_t FindMinPrepLogReferencedByMemTable( - VersionSet* vset, const ColumnFamilyData* cfd_to_flush, - const autovector& memtables_to_flush) { + VersionSet* vset, const autovector& memtables_to_flush) { uint64_t min_log = 0; // we must look through the memtables for two phase transactions // that have been committed but not yet flushed + std::unordered_set memtables_to_flush_set( + memtables_to_flush.begin(), memtables_to_flush.end()); for (auto loop_cfd : *vset->GetColumnFamilySet()) { - if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) { + if (loop_cfd->IsDropped()) { continue; } auto log = 
loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( - memtables_to_flush); + &memtables_to_flush_set); if (log > 0 && (min_log == 0 || log < min_log)) { min_log = log; @@ -603,16 +695,39 @@ return min_log; } -uint64_t PrecomputeMinLogNumberToKeep( +uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, + const autovector*>& memtables_to_flush) { + uint64_t min_log = 0; + + std::unordered_set memtables_to_flush_set; + for (const autovector* memtables : memtables_to_flush) { + memtables_to_flush_set.insert(memtables->begin(), memtables->end()); + } + for (auto loop_cfd : *vset->GetColumnFamilySet()) { + if (loop_cfd->IsDropped()) { + continue; + } + + auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( + &memtables_to_flush_set); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + + log = loop_cfd->mem()->GetMinLogContainingPrepSection(); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, - const autovector& memtables_to_flush, - LogsWithPrepTracker* prep_tracker) { + const autovector& edit_list) { assert(vset != nullptr); - assert(prep_tracker != nullptr); - // Calculate updated min_log_number_to_keep - // Since the function should only be called in 2pc mode, log number in - // the version edit should be sufficient. // Precompute the min log number containing unflushed data for the column // family being flushed (`cfd_to_flush`). 
@@ -636,6 +751,58 @@ min_log_number_to_keep = std::min(cf_min_log_number_to_keep, min_log_number_to_keep); } + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists) { + assert(vset != nullptr); + assert(!cfds_to_flush.empty()); + assert(cfds_to_flush.size() == edit_lists.size()); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + for (const auto& edit_list : edit_lists) { + uint64_t log = 0; + for (const auto& e : edit_list) { + if (e->HasLogNumber()) { + log = std::max(log, e->GetLogNumber()); + } + } + if (log != 0) { + min_log_number_to_keep = std::min(min_log_number_to_keep, log); + } + } + if (min_log_number_to_keep == port::kMaxUint64) { + min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber(); + for (size_t i = 1; i < cfds_to_flush.size(); i++) { + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber()); + } + } + + std::unordered_set flushed_cfds( + cfds_to_flush.begin(), cfds_to_flush.end()); + min_log_number_to_keep = + std::min(min_log_number_to_keep, + vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds)); + + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list, + const autovector& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + // Calculate updated min_log_number_to_keep + // Since the function should only be called in 2pc mode, log number in + // the version edit should be sufficient. + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list); // if are 2pc we must consider logs containing prepared // sections of outstanding transactions. 
@@ -654,14 +821,162 @@ min_log_number_to_keep = min_log_in_prep_heap; } - uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable( - vset, &cfd_to_flush, memtables_to_flush); + uint64_t min_log_refed_by_mem = + FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush); + + if (min_log_refed_by_mem != 0 && + min_log_refed_by_mem < min_log_number_to_keep) { + min_log_number_to_keep = min_log_refed_by_mem; + } + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + assert(cfds_to_flush.size() == edit_lists.size()); + assert(cfds_to_flush.size() == memtables_to_flush.size()); + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists); + + uint64_t min_log_in_prep_heap = + prep_tracker->FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && + min_log_in_prep_heap < min_log_number_to_keep) { + min_log_number_to_keep = min_log_in_prep_heap; + } + + uint64_t min_log_refed_by_mem = + FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush); if (min_log_refed_by_mem != 0 && min_log_refed_by_mem < min_log_number_to_keep) { min_log_number_to_keep = min_log_refed_by_mem; } + return min_log_number_to_keep; } +Status DBImpl::SetDBId(bool read_only) { + Status s; + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. + if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. 
+ if (s.IsNotFound()) { + // Create a new DB ID, saving to file only if allowed + if (read_only) { + db_id_ = env_->GenerateUniqueId(); + return Status::OK(); + } else { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + s = GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + /* new_descriptor_log */ false); + } + } else if (!read_only) { + s = SetIdentityFile(env_, dbname_, db_id_); + } + return s; +} + +Status DBImpl::DeleteUnreferencedSstFiles() { + mutex_.AssertHeld(); + std::vector paths; + paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator))); + for (const auto& db_path : immutable_db_options_.db_paths) { + paths.push_back( + NormalizePath(db_path.path + std::string(1, kFilePathSeparator))); + } + for (const auto* cfd : *versions_->GetColumnFamilySet()) { + for (const auto& cf_path : cfd->ioptions()->cf_paths) { + paths.push_back( + NormalizePath(cf_path.path + std::string(1, kFilePathSeparator))); + } + } + // Dedup paths + std::sort(paths.begin(), paths.end()); + paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + + uint64_t next_file_number = versions_->current_next_file_number(); + uint64_t largest_file_number = next_file_number; + std::set files_to_delete; + Status s; + for (const auto& path : paths) { + std::vector files; + s = env_->GetChildren(path, &files); + if (!s.ok()) { + break; + } + for (const auto& fname : files) { + uint64_t number = 0; + FileType type; + if (!ParseFileName(fname, &number, &type)) { + continue; + } + // path ends with '/' or '\\' + const std::string normalized_fpath = path + fname; + 
largest_file_number = std::max(largest_file_number, number); + if (type == kTableFile && number >= next_file_number && + files_to_delete.find(normalized_fpath) == files_to_delete.end()) { + files_to_delete.insert(normalized_fpath); + } + } + } + if (!s.ok()) { + return s; + } + + if (largest_file_number >= next_file_number) { + versions_->next_file_number_.store(largest_file_number + 1); + } + + VersionEdit edit; + edit.SetNextFile(versions_->next_file_number_.load()); + assert(versions_->GetColumnFamilySet()); + ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault(); + assert(default_cfd); + s = versions_->LogAndApply( + default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_, + directories_.GetDbDir(), /*new_descriptor_log*/ false); + if (!s.ok()) { + return s; + } + + mutex_.Unlock(); + for (const auto& fname : files_to_delete) { + s = env_->DeleteFile(fname); + if (!s.ok()) { + break; + } + } + mutex_.Lock(); + return s; +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_open.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,45 +6,41 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl/db_impl.h" - #include #include "db/builder.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" +#include "db/periodic_work_scheduler.h" #include "env/composite_env_wrapper.h" +#include "file/filename.h" #include "file/read_write_util.h" #include "file/sst_file_manager_impl.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" +#include "rocksdb/table.h" #include "rocksdb/wal_filter.h" -#include "table/block_based/block_based_table_factory.h" #include "test_util/sync_point.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Options SanitizeOptions(const std::string& dbname, const Options& src) { - auto db_options = SanitizeOptions(dbname, DBOptions(src)); +Options SanitizeOptions(const std::string& dbname, const Options& src, + bool read_only) { + auto db_options = SanitizeOptions(dbname, DBOptions(src), read_only); ImmutableDBOptions immutable_db_options(db_options); auto cf_options = SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src)); return Options(db_options, cf_options); } -DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { +DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, + bool read_only) { DBOptions result(src); - if (result.file_system == nullptr) { - if (result.env == Env::Default()) { - result.file_system = FileSystem::Default(); - } else { - result.file_system.reset(new LegacyFileSystemWrapper(result.env)); - } - } else { - if (result.env == nullptr) { - result.env = Env::Default(); - } + if (result.env == nullptr) { + result.env = Env::Default(); } // result.max_open_files means an "infinite" open files. 
@@ -58,7 +54,7 @@ &result.max_open_files); } - if (result.info_log == nullptr) { + if (result.info_log == nullptr && !read_only) { Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); if (!s.ok()) { // No place suitable for logging @@ -98,25 +94,47 @@ } if (result.recycle_log_file_num && - (result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || + (result.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords || + result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery || result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) { - // kPointInTimeRecovery is inconsistent with recycle log file feature since - // we define the "end" of the log as the first corrupt record we encounter. - // kAbsoluteConsistency doesn't make sense because even a clean - // shutdown leaves old junk at the end of the log file. + // - kTolerateCorruptedTailRecords is inconsistent with recycle log file + // feature. WAL recycling expects recovery success upon encountering a + // corrupt record at the point where new data ends and recycled data + // remains at the tail. However, `kTolerateCorruptedTailRecords` must fail + // upon encountering any such corrupt record, as it cannot differentiate + // between this and a real corruption, which would cause committed updates + // to be truncated -- a violation of the recovery guarantee. + // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with + // recycle log file feature temporarily due to a bug found introducing a + // hole in the recovered data + // (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236). + // Besides this bug, we believe the features are fundamentally compatible. 
result.recycle_log_file_num = 0; } - if (result.wal_dir.empty()) { + if (result.db_paths.size() == 0) { + result.db_paths.emplace_back(dbname, std::numeric_limits::max()); + } else if (result.wal_dir.empty()) { // Use dbname as default result.wal_dir = dbname; } - if (result.wal_dir.back() == '/') { - result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); + if (!result.wal_dir.empty()) { + // If there is a wal_dir already set, check to see if the wal_dir is the + // same as the dbname AND the same as the db_path[0] (which must exist from + // a few lines ago). If the wal_dir matches both of these values, then clear + // the wal_dir value, which will make wal_dir == dbname. Most likely this + // condition was the result of reading an old options file where we forced + // wal_dir to be set (to dbname). + auto npath = NormalizePath(dbname + "/"); + if (npath == NormalizePath(result.wal_dir + "/") && + npath == NormalizePath(result.db_paths[0].path + "/")) { + result.wal_dir.clear(); + } } - if (result.db_paths.size() == 0) { - result.db_paths.emplace_back(dbname, std::numeric_limits::max()); + if (!result.wal_dir.empty() && result.wal_dir.back() == '/') { + result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); } if (result.use_direct_reads && result.compaction_readahead_size == 0) { @@ -137,7 +155,7 @@ #ifndef ROCKSDB_LITE ImmutableDBOptions immutable_db_options(result); - if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + if (!immutable_db_options.IsWalDirSameAsDBPath()) { // Either the WAL dir and db_paths[0]/db_name are not the same, or we // cannot tell for sure. 
In either case, assume they're different and // explicitly cleanup the trash log files (bypass DeleteScheduler) @@ -145,13 +163,15 @@ // DeleteScheduler::CleanupDirectory on the same dir later, it will be // safe std::vector filenames; - result.env->GetChildren(result.wal_dir, &filenames); + auto wal_dir = immutable_db_options.GetWalDir(); + Status s = result.env->GetChildren(wal_dir, &filenames); + s.PermitUncheckedError(); //**TODO: What to do on error? for (std::string& filename : filenames) { if (filename.find(".log.trash", filename.length() - std::string(".log.trash").length()) != std::string::npos) { - std::string trash_file = result.wal_dir + "/" + filename; - result.env->DeleteFile(trash_file); + std::string trash_file = wal_dir + "/" + filename; + result.env->DeleteFile(trash_file).PermitUncheckedError(); } } } @@ -161,7 +181,8 @@ // was not used) auto sfm = static_cast(result.sst_file_manager.get()); for (size_t i = 0; i < result.db_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); + DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path) + .PermitUncheckedError(); } // Create a default SstFileManager for purposes of tracking compaction size @@ -171,7 +192,7 @@ NewSstFileManager(result.env, result.info_log)); result.sst_file_manager = sst_file_manager; } -#endif +#endif // !ROCKSDB_LITE if (!result.paranoid_checks) { result.skip_checking_sst_file_sizes_on_db_open = true; @@ -179,16 +200,23 @@ "file size check will be skipped during open."); } + if (result.preserve_deletes) { + ROCKS_LOG_WARN( + result.info_log, + "preserve_deletes is deprecated, will be removed in a future release. 
" + "Please try using user-defined timestamp instead."); + } + return result; } namespace { -Status SanitizeOptionsByTable( +Status ValidateOptionsByTable( const DBOptions& db_opts, const std::vector& column_families) { Status s; for (auto cf : column_families) { - s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); + s = ValidateOptions(db_opts, cf.options); if (!s.ok()) { return s; } @@ -252,10 +280,16 @@ "atomic_flush is incompatible with enable_pipelined_write"); } + // TODO remove this restriction + if (db_options.atomic_flush && db_options.best_efforts_recovery) { + return Status::InvalidArgument( + "atomic_flush is currently incompatible with best-efforts recovery"); + } + return Status::OK(); } -Status DBImpl::NewDB() { +Status DBImpl::NewDB(std::vector* new_filenames) { VersionEdit new_db; Status s = SetIdentityFile(env_, dbname_); if (!s.ok()) { @@ -273,36 +307,47 @@ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); { + if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) { + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); + } std::unique_ptr file; FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); s = NewWritableFile(fs_.get(), manifest, &file, file_options); if (!s.ok()) { return s; } + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; file->SetPreallocationBlockSize( immutable_db_options_.manifest_preallocation_size); std::unique_ptr file_writer(new WritableFileWriter( - std::move(file), manifest, file_options, env_, nullptr /* stats */, - immutable_db_options_.listeners)); + std::move(file), manifest, file_options, immutable_db_options_.clock, + io_tracer_, nullptr /* stats */, immutable_db_options_.listeners, + nullptr, tmp_set.Contains(FileType::kDescriptorFile), + tmp_set.Contains(FileType::kDescriptorFile))); log::Writer log(std::move(file_writer), 0, false); std::string 
record; new_db.EncodeTo(&record); s = log.AddRecord(record); if (s.ok()) { - s = SyncManifest(env_, &immutable_db_options_, log.file()); + s = SyncManifest(&immutable_db_options_, log.file()); } } if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir()); + s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir()); + if (new_filenames) { + new_filenames->emplace_back( + manifest.substr(manifest.find_last_of("/\\") + 1)); + } } else { - fs_->DeleteFile(manifest, IOOptions(), nullptr); + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); } return s; } -Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, - std::unique_ptr* directory) { +IOStatus DBImpl::CreateAndNewDirectory( + FileSystem* fs, const std::string& dirname, + std::unique_ptr* directory) { // We call CreateDirIfMissing() as the directory may already exist (if we // are reopening a DB), when this happens we don't want creating the // directory to cause an error. However, we need to check if creating the @@ -310,24 +355,24 @@ // file not existing. One real-world example of this occurring is if // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. // when dbname_ is "dir/db" but when "dir" doesn't exist. 
- Status s = env->CreateDirIfMissing(dirname); - if (!s.ok()) { - return s; + IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr); + if (!io_s.ok()) { + return io_s; } - return env->NewDirectory(dirname, directory); + return fs->NewDirectory(dirname, IOOptions(), directory, nullptr); } -Status Directories::SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths) { - Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); - if (!s.ok()) { - return s; +IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths) { + IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_); + if (!io_s.ok()) { + return io_s; } if (!wal_dir.empty() && dbname != wal_dir) { - s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_); - if (!s.ok()) { - return s; + io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_); + if (!io_s.ok()) { + return io_s; } } @@ -337,28 +382,29 @@ if (db_path == dbname) { data_dirs_.emplace_back(nullptr); } else { - std::unique_ptr path_directory; - s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory); - if (!s.ok()) { - return s; + std::unique_ptr path_directory; + io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory); + if (!io_s.ok()) { + return io_s; } data_dirs_.emplace_back(path_directory.release()); } } assert(data_dirs_.size() == data_paths.size()); - return Status::OK(); + return IOStatus::OK(); } Status DBImpl::Recover( const std::vector& column_families, bool read_only, - bool error_if_log_file_exist, bool error_if_data_exists_in_logs, + bool error_if_wal_file_exists, bool error_if_data_exists_in_wals, uint64_t* recovered_seq) { mutex_.AssertHeld(); bool is_new_db = false; assert(db_lock_ == nullptr); + std::vector files_in_dbname; if (!read_only) { - Status s = directories_.SetDirectories(env_, dbname_, + Status s = 
directories_.SetDirectories(fs_.get(), dbname_, immutable_db_options_.wal_dir, immutable_db_options_.db_paths); if (!s.ok()) { @@ -371,10 +417,35 @@ } std::string current_fname = CurrentFileName(dbname_); - s = env_->FileExists(current_fname); + // Path to any MANIFEST file in the db dir. It does not matter which one. + // Since best-efforts recovery ignores CURRENT file, existence of a + // MANIFEST indicates the recovery to recover existing db. If no MANIFEST + // can be found, a new db will be created. + std::string manifest_path; + if (!immutable_db_options_.best_efforts_recovery) { + s = env_->FileExists(current_fname); + } else { + s = Status::NotFound(); + Status io_s = env_->GetChildren(dbname_, &files_in_dbname); + if (!io_s.ok()) { + s = io_s; + files_in_dbname.clear(); + } + for (const std::string& file : files_in_dbname) { + uint64_t number = 0; + FileType type = kWalFile; // initialize + if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { + // Found MANIFEST (descriptor log), thus best-efforts recovery does + // not have to treat the db as empty. + s = Status::OK(); + manifest_path = dbname_ + "/" + file; + break; + } + } + } if (s.IsNotFound()) { if (immutable_db_options_.create_if_missing) { - s = NewDB(); + s = NewDB(&files_in_dbname); is_new_db = true; if (!s.ok()) { return s; @@ -399,14 +470,14 @@ FileOptions customized_fs(file_options_); customized_fs.use_direct_reads |= immutable_db_options_.use_direct_io_for_flush_and_compaction; - s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, - nullptr); + const std::string& fname = + manifest_path.empty() ? 
current_fname : manifest_path; + s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr); if (!s.ok()) { std::string error_str = s.ToString(); // Check if unsupported Direct I/O is the root cause customized_fs.use_direct_reads = false; - s = fs_->NewRandomAccessFile(current_fname, customized_fs, &idfile, - nullptr); + s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr); if (s.ok()) { return Status::InvalidArgument( "Direct I/O is not supported by the specified DB."); @@ -416,49 +487,45 @@ } } } + } else if (immutable_db_options_.best_efforts_recovery) { + assert(files_in_dbname.empty()); + Status s = env_->GetChildren(dbname_, &files_in_dbname); + if (s.IsNotFound()) { + return Status::InvalidArgument(dbname_, + "does not exist (open for read only)"); + } else if (s.IsIOError()) { + return s; + } + assert(s.ok()); } assert(db_id_.empty()); - Status s = versions_->Recover(column_families, read_only, &db_id_); + Status s; + bool missing_table_file = false; + if (!immutable_db_options_.best_efforts_recovery) { + s = versions_->Recover(column_families, read_only, &db_id_); + } else { + assert(!files_in_dbname.empty()); + s = versions_->TryRecover(column_families, read_only, files_in_dbname, + &db_id_, &missing_table_file); + if (s.ok()) { + // TryRecover may delete previous column_family_set_. + column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); + } + } if (!s.ok()) { return s; } - // Happens when immutable_db_options_.write_dbid_to_manifest is set to true - // the very first time. - if (db_id_.empty()) { - // Check for the IDENTITY file and create it if not there. - s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); - // Typically Identity file is created in NewDB() and for some reason if - // it is no longer available then at this point DB ID is not in Identity - // file or Manifest. 
- if (s.IsNotFound()) { - s = SetIdentityFile(env_, dbname_); - if (!s.ok()) { - return s; - } - } else if (!s.ok()) { - assert(s.IsIOError()); - return s; - } - s = GetDbIdentityFromIdentityFile(&db_id_); - if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { - VersionEdit edit; - edit.SetDBId(db_id_); - Options options; - MutableCFOptions mutable_cf_options(options); - versions_->db_id_ = db_id_; - s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &edit, &mutex_, nullptr, - false); - } - } else { - s = SetIdentityFile(env_, dbname_, db_id_); + s = SetDBId(read_only); + if (s.ok() && !read_only) { + s = DeleteUnreferencedSstFiles(); } if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } if (s.ok() && !read_only) { - std::map> created_dirs; + std::map> created_dirs; for (auto cfd : *versions_->GetColumnFamilySet()) { s = cfd->AddDirectories(&created_dirs); if (!s.ok()) { @@ -471,8 +538,9 @@ s = InitPersistStatsColumnFamily(); } + std::vector files_in_wal_dir; if (s.ok()) { - // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // Initial max_total_in_memory_state_ before recovery wals. Log recovery // may check this value to decide whether to flush. max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -497,59 +565,86 @@ // Note that prev_log_number() is no longer used, but we pay // attention to it in case we are recovering a database // produced by an older version of rocksdb. 
- std::vector filenames; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + auto wal_dir = immutable_db_options_.GetWalDir(); + if (!immutable_db_options_.best_efforts_recovery) { + s = env_->GetChildren(wal_dir, &files_in_wal_dir); + } if (s.IsNotFound()) { - return Status::InvalidArgument("wal_dir not found", - immutable_db_options_.wal_dir); + return Status::InvalidArgument("wal_dir not found", wal_dir); } else if (!s.ok()) { return s; } - std::vector logs; - for (size_t i = 0; i < filenames.size(); i++) { + std::unordered_map wal_files; + for (const auto& file : files_in_wal_dir) { uint64_t number; FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { if (is_new_db) { return Status::Corruption( "While creating a new Db, wal_dir contains " "existing log file: ", - filenames[i]); + file); } else { - logs.push_back(number); + wal_files[number] = LogFileName(wal_dir, number); } } } - if (logs.size() > 0) { - if (error_if_log_file_exist) { + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + if (!immutable_db_options_.best_efforts_recovery) { + // Verify WALs in MANIFEST. + s = versions_->GetWalSet().CheckWals(env_, wal_files); + } // else since best effort recovery does not recover from WALs, no need + // to check WALs. + } else if (!versions_->GetWalSet().GetWals().empty()) { + // Tracking is disabled, clear previously tracked WALs from MANIFEST, + // otherwise, in the future, if WAL tracking is enabled again, + // since the WALs deleted when WAL tracking is disabled are not persisted + // into MANIFEST, WAL check may fail. 
+ VersionEdit edit; + WalNumber max_wal_number = + versions_->GetWalSet().GetWals().rbegin()->first; + edit.DeleteWalsBefore(max_wal_number + 1); + s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_); + } + if (!s.ok()) { + return s; + } + + if (!wal_files.empty()) { + if (error_if_wal_file_exists) { return Status::Corruption( - "The db was opened in readonly mode with error_if_log_file_exist" - "flag but a log file already exists"); - } else if (error_if_data_exists_in_logs) { - for (auto& log : logs) { - std::string fname = LogFileName(immutable_db_options_.wal_dir, log); + "The db was opened in readonly mode with error_if_wal_file_exists" + "flag but a WAL file already exists"); + } else if (error_if_data_exists_in_wals) { + for (auto& wal_file : wal_files) { uint64_t bytes; - s = env_->GetFileSize(fname, &bytes); + s = env_->GetFileSize(wal_file.second, &bytes); if (s.ok()) { if (bytes > 0) { return Status::Corruption( - "error_if_data_exists_in_logs is set but there are data " - " in log files."); + "error_if_data_exists_in_wals is set but there are data " + " in WAL files."); } } } } } - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - bool corrupted_log_found = false; - s = RecoverLogFiles(logs, &next_sequence, read_only, - &corrupted_log_found); - if (corrupted_log_found && recovered_seq != nullptr) { + if (!wal_files.empty()) { + // Recover in the order in which the wals were generated + std::vector wals; + wals.reserve(wal_files.size()); + for (const auto& wal_file : wal_files) { + wals.push_back(wal_file.first); + } + std::sort(wals.begin(), wals.end()); + + bool corrupted_wal_found = false; + s = RecoverLogFiles(wals, &next_sequence, read_only, + &corrupted_wal_found); + if (corrupted_wal_found && recovered_seq != nullptr) { *recovered_seq = next_sequence; } if (!s.ok()) { @@ -567,23 +662,37 @@ // to reflect the most recent OPTIONS file. 
It does not matter for regular // read-write db instance because options_file_number_ will later be // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile. - std::vector file_names; + std::vector filenames; if (s.ok()) { - s = env_->GetChildren(GetName(), &file_names); + const std::string normalized_dbname = NormalizePath(dbname_); + const std::string normalized_wal_dir = + NormalizePath(immutable_db_options_.GetWalDir()); + if (immutable_db_options_.best_efforts_recovery) { + filenames = std::move(files_in_dbname); + } else if (normalized_dbname == normalized_wal_dir) { + filenames = std::move(files_in_wal_dir); + } else { + s = env_->GetChildren(GetName(), &filenames); + } } if (s.ok()) { uint64_t number = 0; uint64_t options_file_number = 0; FileType type; - for (const auto& fname : file_names) { + for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && type == kOptionsFile) { options_file_number = std::max(number, options_file_number); } } versions_->options_file_number_ = options_file_number; + uint64_t options_file_size = 0; + if (options_file_number > 0) { + s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number), + &options_file_size); + } + versions_->options_file_size_ = options_file_size; } } - return s; } @@ -612,41 +721,56 @@ (kStatsCFCurrentFormatVersion < format_version_recovered && kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { if (!s_format.ok() || !s_compatible.ok()) { - ROCKS_LOG_INFO( + ROCKS_LOG_WARN( immutable_db_options_.info_log, - "Reading persistent stats version key failed. Format key: %s, " - "compatible key: %s", + "Recreating persistent stats column family since reading " + "persistent stats version key failed. 
Format key: %s, compatible " + "key: %s", s_format.ToString().c_str(), s_compatible.ToString().c_str()); } else { - ROCKS_LOG_INFO( + ROCKS_LOG_WARN( immutable_db_options_.info_log, - "Disable persistent stats due to corrupted or incompatible format " - "version\n"); + "Recreating persistent stats column family due to corrupted or " + "incompatible format version. Recovered format: %" PRIu64 + "; recovered format compatible since: %" PRIu64 "\n", + format_version_recovered, compatible_version_recovered); + } + s = DropColumnFamily(persist_stats_cf_handle_); + if (s.ok()) { + s = DestroyColumnFamilyHandle(persist_stats_cf_handle_); } - DropColumnFamily(persist_stats_cf_handle_); - DestroyColumnFamilyHandle(persist_stats_cf_handle_); ColumnFamilyHandle* handle = nullptr; - ColumnFamilyOptions cfo; - OptimizeForPersistentStats(&cfo); - s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); - persist_stats_cf_handle_ = static_cast(handle); - // should also persist version here because old stats CF is discarded - should_persist_format_version = true; + if (s.ok()) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + } + if (s.ok()) { + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } } } - if (s.ok() && should_persist_format_version) { + if (should_persist_format_version) { // Persistent stats CF being created for the first time, need to write // format version key WriteBatch batch; - batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, - ToString(kStatsCFCurrentFormatVersion)); - batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, - ToString(kStatsCFCompatibleFormatVersion)); - WriteOptions wo; - wo.low_pri = true; - wo.no_slowdown = true; - wo.sync = false; - s = Write(wo, &batch); + if (s.ok()) { + s = 
batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + } + if (s.ok()) { + s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + } + if (s.ok()) { + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } } mutex_.Lock(); return s; @@ -679,10 +803,10 @@ return s; } -// REQUIRES: log_numbers are sorted in ascending order -Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, +// REQUIRES: wal_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, SequenceNumber* next_sequence, bool read_only, - bool* corrupted_log_found) { + bool* corrupted_wal_found) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -690,10 +814,10 @@ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false void Corruption(size_t bytes, const Status& s) override { ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", - (this->status == nullptr ? "(ignoring error) " : ""), - fname, static_cast(bytes), s.ToString().c_str()); - if (this->status != nullptr && this->status->ok()) { - *this->status = s; + (status == nullptr ? 
"(ignoring error) " : ""), fname, + static_cast(bytes), s.ToString().c_str()); + if (status != nullptr && status->ok()) { + *status = s; } } }; @@ -712,10 +836,10 @@ auto stream = event_logger_.Log(); stream << "job" << job_id << "event" << "recovery_started"; - stream << "log_files"; + stream << "wal_files"; stream.StartArray(); - for (auto log_number : log_numbers) { - stream << log_number; + for (auto wal_number : wal_numbers) { + stream << wal_number; } stream.EndArray(); } @@ -738,25 +862,31 @@ bool stop_replay_by_wal_filter = false; bool stop_replay_for_corruption = false; bool flushed = false; - uint64_t corrupted_log_number = kMaxSequenceNumber; - uint64_t min_log_number = MinLogNumberToKeep(); - for (auto log_number : log_numbers) { - if (log_number < min_log_number) { + uint64_t corrupted_wal_number = kMaxSequenceNumber; + uint64_t min_wal_number = MinLogNumberToKeep(); + if (!allow_2pc()) { + // In non-2pc mode, we skip WALs that do not back unflushed data. + min_wal_number = + std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData()); + } + for (auto wal_number : wal_numbers) { + if (wal_number < min_wal_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Skipping log #%" PRIu64 " since it is older than min log to keep #%" PRIu64, - log_number, min_log_number); + wal_number, min_wal_number); continue; } // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(log_number); + versions_->MarkFileNumberUsed(wal_number); // Open the log file - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + std::string fname = + LogFileName(immutable_db_options_.GetWalDir(), wal_number); ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Recovering log #%" PRIu64 " mode %d", log_number, + "Recovering log #%" PRIu64 " mode %d", wal_number, static_cast(immutable_db_options_.wal_recovery_mode)); auto logFileDropped = [this, &fname]() { uint64_t bytes; @@ -788,7 +918,8 @@ } } file_reader.reset(new SequentialFileReader( - std::move(file), fname, immutable_db_options_.log_readahead_size)); + std::move(file), fname, immutable_db_options_.log_readahead_size, + io_tracer_)); } // Create the log reader. @@ -808,7 +939,7 @@ // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, log_number); + &reporter, true /*checksum*/, wal_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable @@ -816,6 +947,8 @@ Slice record; WriteBatch batch; + TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal", + /*arg=*/nullptr); while (!stop_replay_by_wal_filter && reader.ReadRecord(&record, &scratch, immutable_db_options_.wal_recovery_mode) && @@ -825,7 +958,11 @@ Status::Corruption("log record too small")); continue; } - WriteBatchInternal::SetContents(&batch, record); + + status = WriteBatchInternal::SetContents(&batch, record); + if (!status.ok()) { + return status; + } SequenceNumber sequence = WriteBatchInternal::Sequence(&batch); if (immutable_db_options_.wal_recovery_mode == @@ -850,7 +987,7 @@ WalFilter::WalProcessingOption wal_processing_option = immutable_db_options_.wal_filter->LogRecordFound( - log_number, fname, batch, &new_batch, &batch_changed); + wal_number, fname, 
batch, &new_batch, &batch_changed); switch (wal_processing_option) { case WalFilter::WalProcessingOption::kContinueProcessing: @@ -902,7 +1039,7 @@ " mode %d log filter %s returned " "more records (%d) than original (%d) which is not allowed. " "Aborting recovery.", - log_number, + wal_number, static_cast(immutable_db_options_.wal_recovery_mode), immutable_db_options_.wal_filter->Name(), new_count, original_count); @@ -929,7 +1066,7 @@ bool has_valid_writes = false; status = WriteBatchInternal::InsertInto( &batch, column_family_memtables_.get(), &flush_scheduler_, - &trim_history_scheduler_, true, log_number, this, + &trim_history_scheduler_, true, wal_number, this, false /* concurrent_memtable_writes */, next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); MaybeIgnoreError(&status); @@ -949,7 +1086,7 @@ cfd->UnrefAndTryDelete(); // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families - assert(cfd->GetLogNumber() <= log_number); + assert(cfd->GetLogNumber() <= wal_number); auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; @@ -980,17 +1117,27 @@ status = Status::OK(); } else if (immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + if (status.IsIOError()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "IOError during point-in-time reading log #%" PRIu64 + " seq #%" PRIu64 + ". %s. 
This likely mean loss of synced WAL, " + "thus recovery fails.", + wal_number, *next_sequence, + status.ToString().c_str()); + return status; + } // We should ignore the error but not continue replaying status = Status::OK(); stop_replay_for_corruption = true; - corrupted_log_number = log_number; - if (corrupted_log_found != nullptr) { - *corrupted_log_found = true; + corrupted_wal_number = wal_number; + if (corrupted_wal_found != nullptr) { + *corrupted_wal_found = true; } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, - log_number, *next_sequence); + wal_number, *next_sequence); } else { assert(immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords || @@ -1016,7 +1163,7 @@ // corruption. This could during PIT recovery when the WAL is corrupted and // some (but not all) CFs are flushed // Exclude the PIT case where no log is dropped after the corruption point. - // This is to cover the case for empty logs after corrupted log, in which we + // This is to cover the case for empty wals after corrupted log, in which we // don't reset stop_replay_for_corruption. if (stop_replay_for_corruption == true && (immutable_db_options_.wal_recovery_mode == @@ -1024,11 +1171,29 @@ immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords)) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->GetLogNumber() > corrupted_log_number) { + // One special case cause cfd->GetLogNumber() > corrupted_wal_number but + // the CF is still consistent: If a new column family is created during + // the flush and the WAL sync fails at the same time, the new CF points to + // the new WAL but the old WAL is curropted. Since the new CF is empty, it + // is still consistent. We add the check of CF sst file size to avoid the + // false positive alert. 
+ + // Note that, the check of (cfd->GetLiveSstFilesSize() > 0) may leads to + // the ignorance of a very rare inconsistency case caused in data + // canclation. One CF is empty due to KV deletion. But those operations + // are in the WAL. If the WAL is corrupted, the status of this CF might + // not be consistent with others. However, the consistency check will be + // bypassed due to empty CF. + // TODO: a better and complete implementation is needed to ensure strict + // consistency check in WAL recovery including hanlding the tailing + // issues. + if (cfd->GetLogNumber() > corrupted_wal_number && + cfd->GetLiveSstFilesSize() > 0) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Column family inconsistency: SST file contains data" " beyond the point of corruption."); - return Status::Corruption("SST file is ahead of WALs"); + return Status::Corruption("SST file is ahead of WALs in CF " + + cfd->GetName()); } } } @@ -1039,16 +1204,16 @@ if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate - auto max_log_number = log_numbers.back(); + const WalNumber max_wal_number = wal_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > max_log_number) { + if (cfd->GetLogNumber() > max_wal_number) { // Column family cfd has already flushed the data - // from all logs. Memtable has to be empty because - // we filter the updates based on log_number + // from all wals. Memtable has to be empty because + // we filter the updates based on wal_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); assert(edit->NumEntries() == 0); @@ -1080,13 +1245,13 @@ // Update the log number info in the version edit corresponding to this // column family. 
Note that the version edits will be written to MANIFEST // together later. - // writing log_number in the manifest means that any log file - // with number strongly less than (log_number + 1) is already + // writing wal_number in the manifest means that any log file + // with number strongly less than (wal_number + 1) is already // recovered and should be ignored on next reincarnation. - // Since we already recovered max_log_number, we want all logs - // with numbers `<= max_log_number` (includes this one) to be ignored + // Since we already recovered max_wal_number, we want all wals + // with numbers `<= max_wal_number` (includes this one) to be ignored if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { - edit->SetLogNumber(max_log_number + 1); + edit->SetLogNumber(max_wal_number + 1); } } if (status.ok()) { @@ -1094,7 +1259,7 @@ // not actually used. that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(max_log_number + 1); + versions_->MarkFileNumberUsed(max_wal_number + 1); autovector cfds; autovector cf_opts; @@ -1106,6 +1271,21 @@ assert(iter != version_edits.end()); edit_lists.push_back({&iter->second}); } + + std::unique_ptr wal_deletion; + if (flushed) { + wal_deletion = std::unique_ptr(new VersionEdit()); + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + wal_deletion->DeleteWalsBefore(max_wal_number + 1); + } + if (!allow_2pc()) { + // In non-2pc mode, flushing the memtables of the column families + // means we can advance min_log_number_to_keep. 
+ wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1); + } + edit_lists.back().push_back(wal_deletion.get()); + } + // write MANIFEST with update status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, directories_.GetDbDir(), @@ -1113,8 +1293,17 @@ } } - if (status.ok() && data_seen && !flushed) { - status = RestoreAliveLogFiles(log_numbers); + if (status.ok()) { + if (data_seen && !flushed) { + status = RestoreAliveLogFiles(wal_numbers); + } else { + // If there's no data in the WAL, or we flushed all the data, still + // truncate the log file. If the process goes into a crash loop before + // the file is deleted, the preallocated space will never get freed. + const bool truncate = !read_only; + GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr) + .PermitUncheckedError(); + } } event_logger_.Log() << "job" << job_id << "event" @@ -1123,8 +1312,43 @@ return status; } -Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { - if (log_numbers.empty()) { +Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log_ptr) { + LogFileNumberSize log(wal_number); + std::string fname = + LogFileName(immutable_db_options_.GetWalDir(), wal_number); + Status s; + // This gets the appear size of the wals, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (s.ok() && truncate) { + std::unique_ptr last_log; + Status truncate_status = fs_->ReopenWritableFile( + fname, + fs_->OptimizeForLogWrite( + file_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_)), + &last_log, nullptr); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(IOOptions(), nullptr); + } + // Not a critical error if fail to truncate. 
+ if (!truncate_status.ok() && !truncate_status.IsNotSupported()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", wal_number, + truncate_status.ToString().c_str()); + } + } + if (log_ptr) { + *log_ptr = log; + } + return s; +} + +Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { + if (wal_numbers.empty()) { return Status::OK(); } Status s; @@ -1137,41 +1361,27 @@ // FindObsoleteFiles() total_log_size_ = 0; log_empty_ = false; - for (auto log_number : log_numbers) { - LogFileNumberSize log(log_number); - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); - // This gets the appear size of the logs, not including preallocated space. - s = env_->GetFileSize(fname, &log.size); + uint64_t min_wal_with_unflushed_data = + versions_->MinLogNumberWithUnflushedData(); + for (auto wal_number : wal_numbers) { + if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) { + // In non-2pc mode, the WAL files not backing unflushed data are not + // alive, thus should not be added to the alive_log_files_. + continue; + } + // We preallocate space for wals, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + LogFileNumberSize log; + s = GetLogSizeAndMaybeTruncate( + wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); if (!s.ok()) { break; } total_log_size_ += log.size; alive_log_files_.push_back(log); - // We preallocate space for logs, but then after a crash and restart, those - // preallocated space are not needed anymore. It is likely only the last - // log has such preallocated space, so we only truncate for the last log. 
- if (log_number == log_numbers.back()) { - std::unique_ptr last_log; - Status truncate_status = fs_->ReopenWritableFile( - fname, - fs_->OptimizeForLogWrite( - file_options_, - BuildDBOptions(immutable_db_options_, mutable_db_options_)), - &last_log, nullptr); - if (truncate_status.ok()) { - truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); - } - if (truncate_status.ok()) { - truncate_status = last_log->Close(IOOptions(), nullptr); - } - // Not a critical error if fail to truncate. - if (!truncate_status.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Failed to truncate log #%" PRIu64 ": %s", log_number, - truncate_status.ToString().c_str()); - } - } } + alive_log_files_tail_ = alive_log_files_.rbegin(); if (two_write_queues_) { log_write_mutex_.Unlock(); } @@ -1181,8 +1391,17 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit) { mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); + assert(cfd); + assert(cfd->imm()); + // The immutable memtable list must be empty. 
+ assert(std::numeric_limits::max() == + cfd->imm()->GetEarliestMemTableID()); + + const uint64_t start_micros = immutable_db_options_.clock->NowMicros(); + FileMetaData meta; + std::vector blob_file_additions; + std::unique_ptr::iterator> pending_outputs_inserted_elem( new std::list::iterator( CaptureCurrentFileNumberInPendingOutputs())); @@ -1206,7 +1425,8 @@ cfd->GetLatestMutableCFOptions()->paranoid_file_checks; int64_t _current_time = 0; - env_->GetCurrentTime(&_current_time); // ignore error + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); meta.oldest_ancester_time = current_time; @@ -1228,18 +1448,26 @@ if (range_del_iter != nullptr) { range_del_iters.emplace_back(range_del_iter); } + + IOStatus io_s; + TableBuilderOptions tboptions( + *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), + mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_, + db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber()); s = BuildTable( - dbname_, env_, fs_.get(), *cfd->ioptions(), mutable_cf_options, + dbname_, versions_.get(), immutable_db_options_, tboptions, file_options_for_compaction_, cfd->table_cache(), iter.get(), - std::move(range_del_iters), &meta, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), + std::move(range_del_iters), &meta, &blob_file_additions, snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, - GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), - mutable_cf_options.sample_for_compression, - cfd->ioptions()->compression_opts, paranoid_file_checks, - cfd->internal_stats(), 
TableFileCreationReason::kRecovery, - &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level */, current_time, write_hint); + paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kRecovery, &event_logger_, job_id, + Env::IO_HIGH, nullptr /* table_properties */, write_hint, + nullptr /*full_history_ts_low*/, &blob_callback_); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" @@ -1247,29 +1475,54 @@ cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); mutex_.Lock(); + + // TODO(AR) is this ok? + if (!io_s.ok() && s.ok()) { + s = io_s; + } } } ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. - int level = 0; - if (s.ok() && meta.fd.GetFileSize() > 0) { - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction, meta.oldest_blob_file_number, - meta.oldest_ancester_time, meta.file_creation_time, - meta.file_checksum, meta.file_checksum_func_name); + const bool has_output = meta.fd.GetFileSize() > 0; + + constexpr int level = 0; + + if (s.ok() && has_output) { + edit->AddFile( + level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.file_checksum, + meta.file_checksum_func_name, meta.min_timestamp, meta.max_timestamp); + + for (const auto& blob : blob_file_additions) { + edit->AddBlobFile(blob); + } } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = env_->NowMicros() - 
start_micros; - stats.bytes_written = meta.fd.GetFileSize(); - stats.num_output_files = 1; + stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; + + if (has_output) { + stats.bytes_written = meta.fd.GetFileSize(); + stats.num_output_files = 1; + } + + const auto& blobs = edit->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); + } + + stats.num_output_files_blob = static_cast(blobs.size()); + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); - cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - meta.fd.GetFileSize()); + cfd->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; } @@ -1311,52 +1564,55 @@ !kSeqPerBatch, kBatchPerTxn); } -Status DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log) { - Status s; +IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, + log::Writer** new_log) { + IOStatus io_s; std::unique_ptr lfile; DBOptions db_options = BuildDBOptions(immutable_db_options_, mutable_db_options_); FileOptions opt_file_options = fs_->OptimizeForLogWrite(file_options_, db_options); - std::string log_fname = - LogFileName(immutable_db_options_.wal_dir, log_file_num); + std::string wal_dir = immutable_db_options_.GetWalDir(); + std::string log_fname = LogFileName(wal_dir, log_file_num); if (recycle_log_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); - std::string old_log_fname = - LogFileName(immutable_db_options_.wal_dir, recycle_log_number); + std::string old_log_fname = LogFileName(wal_dir, recycle_log_number); TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1"); 
TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2"); - s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, - &lfile, /*dbg=*/nullptr); + io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options, + &lfile, /*dbg=*/nullptr); } else { - s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); + io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options); } - if (s.ok()) { + if (io_s.ok()) { lfile->SetWriteLifeTimeHint(CalculateWALWriteHint()); lfile->SetPreallocationBlockSize(preallocate_block_size); const auto& listeners = immutable_db_options_.listeners; - std::unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), log_fname, opt_file_options, - env_, nullptr /* stats */, listeners)); + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; + std::unique_ptr file_writer(new WritableFileWriter( + std::move(lfile), log_fname, opt_file_options, + immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners, + nullptr, tmp_set.Contains(FileType::kWalFile), + tmp_set.Contains(FileType::kWalFile))); *new_log = new log::Writer(std::move(file_writer), log_file_num, immutable_db_options_.recycle_log_file_num > 0, immutable_db_options_.manual_wal_flush); } - return s; + return io_s; } Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn) { - Status s = SanitizeOptionsByTable(db_options, column_families); + Status s = ValidateOptionsByTable(db_options, column_families); if (!s.ok()) { return s; } @@ -1376,7 +1632,7 @@ } DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); - s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir()); if (s.ok()) { std::vector paths; for (auto& db_path : 
impl->immutable_db_options_.db_paths) { @@ -1400,19 +1656,15 @@ impl->error_handler_.EnableAutoRecovery(); } } - - if (!s.ok()) { - delete impl; - return s; + if (s.ok()) { + s = impl->CreateArchivalDirectory(); } - - s = impl->CreateArchivalDirectory(); if (!s.ok()) { delete impl; return s; } - impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); impl->mutex_.Lock(); // Handles create_if_missing, error_if_exists @@ -1429,6 +1681,7 @@ InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; assert(new_log != nullptr); + assert(impl->logs_.empty()); impl->logs_.emplace_back(new_log_number, new_log); } @@ -1454,7 +1707,7 @@ break; } } else { - s = Status::InvalidArgument("Column family not found: ", cf.name); + s = Status::InvalidArgument("Column family not found", cf.name); break; } } @@ -1472,18 +1725,16 @@ } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); + impl->alive_log_files_tail_ = impl->alive_log_files_.rbegin(); if (impl->two_write_queues_) { impl->log_write_mutex_.Unlock(); } - - impl->DeleteObsoleteFiles(); - s = impl->directories_.GetDbDir()->Fsync(); } if (s.ok()) { // In WritePrepared there could be gap in sequence numbers. This breaks // the trick we use in kPointInTimeRecovery which assumes the first seq in // the log right after the corrupted log is one larger than the last seq - // we read from the logs. To let this trick keep working, we add a dummy + // we read from the wals. To let this trick keep working, we add a dummy // entry with the expected sequence to the first log right after recovery. 
// In non-WritePrepared case also the new log after recovery could be // empty, and thus missing the consecutive seq hint to distinguish @@ -1495,7 +1746,8 @@ WriteOptions write_options; uint64_t log_used, log_size; log::Writer* log_writer = impl->logs_.back().writer; - s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size); + s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size, + /*with_db_mutex==*/true); if (s.ok()) { // Need to fsync, otherwise it might get lost after a power reset. s = impl->FlushWAL(false); @@ -1507,7 +1759,7 @@ } } if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { - // try to read format version but no need to fail Open() even if it fails + // try to read format version s = impl->PersistentStatsProcessFormatVersion(); } @@ -1550,7 +1802,11 @@ *dbptr = impl; impl->opened_successfully_ = true; + impl->DeleteObsoleteFiles(); + TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles"); impl->MaybeScheduleFlushOrCompaction(); + } else { + persist_options_status.PermitUncheckedError(); } impl->mutex_.Unlock(); @@ -1558,6 +1814,12 @@ auto sfm = static_cast( impl->immutable_db_options_.sst_file_manager.get()); if (s.ok() && sfm) { + // Set Statistics ptr for SstFileManager to dump the stats of + // DeleteScheduler. + sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics); + ROCKS_LOG_INFO(impl->immutable_db_options_.info_log, + "SstFileManager instance %p", sfm); + // Notify SstFileManager about all sst files that already exist in // db_paths[0] and cf_paths[0] when the DB is opened. @@ -1568,6 +1830,8 @@ std::vector metadata; + // TODO: Once GetLiveFilesMetaData supports blob files, update the logic + // below to get known_file_sizes for blob files. 
impl->mutex_.Lock(); impl->versions_->GetLiveFilesMetaData(&metadata); impl->mutex_.Unlock(); @@ -1593,20 +1857,22 @@ paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); for (auto& path : paths) { std::vector existing_files; - impl->immutable_db_options_.env->GetChildren(path, &existing_files); + impl->immutable_db_options_.env->GetChildren(path, &existing_files) + .PermitUncheckedError(); //**TODO: What do to on error? for (auto& file_name : existing_files) { uint64_t file_number; FileType file_type; std::string file_path = path + "/" + file_name; if (ParseFileName(file_name, &file_number, &file_type) && - file_type == kTableFile) { + (file_type == kTableFile || file_type == kBlobFile)) { + // TODO: Check for errors from OnAddFile? if (known_file_sizes.count(file_name)) { // We're assuming that each sst file name exists in at most one of // the paths. - sfm->OnAddFile(file_path, known_file_sizes.at(file_name), - /* compaction */ false); + sfm->OnAddFile(file_path, known_file_sizes.at(file_name)) + .PermitUncheckedError(); } else { - sfm->OnAddFile(file_path); + sfm->OnAddFile(file_path).PermitUncheckedError(); } } } @@ -1620,6 +1886,7 @@ sfm->ReserveDiskBuffer(max_write_buffer_size, impl->immutable_db_options_.db_paths[0].path); } + #endif // !ROCKSDB_LITE if (s.ok()) { @@ -1634,11 +1901,14 @@ "DB::Open() failed --- Unable to persist Options file", persist_options_status.ToString()); } + } else { + ROCKS_LOG_WARN(impl->immutable_db_options_.info_log, + "Persisting Option File error: %s", + persist_options_status.ToString().c_str()); } if (s.ok()) { - impl->StartTimedTasks(); - } - if (!s.ok()) { + impl->StartPeriodicWorkScheduler(); + } else { for (auto* h : *handles) { delete h; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,13 +4,15 @@ // (found in the LICENSE.Apache file in the root directory). #include "db/db_impl/db_impl_readonly.h" -#include "db/arena_wrapped_db_iter.h" -#include "db/compacted_db_impl.h" +#include "db/arena_wrapped_db_iter.h" +#include "db/db_impl/compacted_db_impl.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -18,7 +20,8 @@ DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) - : DBImpl(db_options, dbname) { + : DBImpl(db_options, dbname, /*seq_per_batch*/ false, + /*batch_per_txn*/ true, /*read_only*/ true) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in read only mode"); LogFlush(immutable_db_options_.info_log); @@ -35,7 +38,7 @@ PERF_TIMER_GUARD(get_snapshot_time); Status s; SequenceNumber snapshot = versions_->LastSequence(); - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); @@ -48,14 +51,17 @@ SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, read_options)) { pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(read_options, lkey, pinnable_val, &s, - &merge_context, &max_covering_tombstone_seq); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, pinnable_val, + /*timestamp=*/nullptr, &s, 
&merge_context, + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } RecordTick(stats_, NUMBER_KEYS_READ); @@ -68,7 +74,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -80,12 +86,13 @@ ReadCallback* read_callback = nullptr; // No read callback provided. auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - read_seq, + super_version->current, read_seq, super_version->mutable_cf_options.max_sequential_skip_in_iterations, super_version->version_number, read_callback); - auto internal_iter = - NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), read_seq); + auto internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } @@ -108,15 +115,17 @@ : latest_snapshot; for (auto cfh : column_families) { - auto* cfd = reinterpret_cast(cfh)->cfd(); + auto* cfd = static_cast_with_check(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); auto* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + sv->current, read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, sv->version_number, read_callback); - auto* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), read_seq); + auto* internal_iter = NewInternalIterator( + 
db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), read_seq, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -124,12 +133,37 @@ return Status::OK(); } +namespace { +// Return OK if dbname exists in the file system or create it if +// create_if_missing +Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, + const std::string& dbname) { + Status s; + if (!db_options.create_if_missing) { + // Attempt to read "CURRENT" file + const std::shared_ptr& fs = db_options.env->GetFileSystem(); + std::string manifest_path; + uint64_t manifest_file_number; + s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path, + &manifest_file_number); + } else { + // Historic behavior that doesn't necessarily make sense + s = db_options.env->CreateDirIfMissing(dbname); + } + return s; +} +} // namespace + Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, - DB** dbptr, bool /*error_if_log_file_exist*/) { + DB** dbptr, bool /*error_if_wal_file_exists*/) { + Status s = OpenForReadOnlyCheckExistence(options, dbname); + if (!s.ok()) { + return s; + } + *dbptr = nullptr; // Try to first open DB as fully compacted DB - Status s; s = CompactedDBImpl::Open(options, dbname, dbptr); if (s.ok()) { return s; @@ -142,7 +176,8 @@ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); std::vector handles; - s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + s = DBImplReadOnly::OpenForReadOnlyWithoutCheck( + db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); // i can delete the handle since DBImpl is always holding a @@ -156,7 +191,23 @@ const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, - bool error_if_log_file_exist) { + bool error_if_wal_file_exists) { + // If dbname does not exist 
in the file system, should not do anything + Status s = OpenForReadOnlyCheckExistence(db_options, dbname); + if (!s.ok()) { + return s; + } + + return DBImplReadOnly::OpenForReadOnlyWithoutCheck( + db_options, dbname, column_families, handles, dbptr, + error_if_wal_file_exists); +} + +Status DBImplReadOnly::OpenForReadOnlyWithoutCheck( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists) { *dbptr = nullptr; handles->clear(); @@ -164,14 +215,14 @@ DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); impl->mutex_.Lock(); Status s = impl->Recover(column_families, true /* read only */, - error_if_log_file_exist); + error_if_wal_file_exists); if (s.ok()) { // set column family handles for (auto cf : column_families) { auto cfd = impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); if (cfd == nullptr) { - s = Status::InvalidArgument("Column family not found: ", cf.name); + s = Status::InvalidArgument("Column family not found", cf.name); break; } handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); @@ -189,7 +240,7 @@ *dbptr = impl; for (auto* h : *handles) { impl->NewThreadStatusCfInfo( - reinterpret_cast(h)->cfd()); + static_cast_with_check(h)->cfd()); } } else { for (auto h : *handles) { @@ -205,7 +256,7 @@ Status DB::OpenForReadOnly(const Options& /*options*/, const std::string& /*dbname*/, DB** /*dbptr*/, - bool /*error_if_log_file_exist*/) { + bool /*error_if_wal_file_exists*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); } @@ -213,7 +264,7 @@ const DBOptions& /*db_options*/, const std::string& /*dbname*/, const std::vector& /*column_families*/, std::vector* /*handles*/, DB** /*dbptr*/, - bool /*error_if_log_file_exist*/) { + bool /*error_if_wal_file_exists*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); } #endif // !ROCKSDB_LITE diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_readonly.h 2025-05-19 16:14:27.000000000 +0000 @@ -130,6 +130,15 @@ } private: + // A "helper" function for DB::OpenForReadOnly without column families + // to reduce unnecessary I/O + // It has the same functionality as DB::OpenForReadOnly with column families + // but does not check the existence of dbname in the file system + static Status OpenForReadOnlyWithoutCheck( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_wal_file_exists = false); friend class DB; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,15 +10,19 @@ #include "db/arena_wrapped_db_iter.h" #include "db/merge_context.h" #include "logging/auto_roll_logger.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/configurable.h" #include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE DBImplSecondary::DBImplSecondary(const DBOptions& db_options, - const std::string& dbname) - : DBImpl(db_options, dbname) { + const std::string& dbname, + std::string secondary_path) + : DBImpl(db_options, dbname, false, true, true), + secondary_path_(std::move(secondary_path)) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in secondary mode"); 
LogFlush(immutable_db_options_.info_log); @@ -28,8 +32,8 @@ Status DBImplSecondary::Recover( const std::vector& column_families, - bool /*readonly*/, bool /*error_if_log_file_exist*/, - bool /*error_if_data_exists_in_logs*/, uint64_t*) { + bool /*readonly*/, bool /*error_if_wal_file_exists*/, + bool /*error_if_data_exists_in_wals*/, uint64_t*) { mutex_.AssertHeld(); JobContext job_context(0); @@ -38,6 +42,9 @@ ->Recover(column_families, &manifest_reader_, &manifest_reporter_, &manifest_reader_status_); if (!s.ok()) { + if (manifest_reader_status_) { + manifest_reader_status_->PermitUncheckedError(); + } return s; } if (immutable_db_options_.paranoid_checks && s.ok()) { @@ -94,10 +101,10 @@ assert(logs != nullptr); std::vector filenames; Status s; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + s = env_->GetChildren(immutable_db_options_.GetWalDir(), &filenames); if (s.IsNotFound()) { return Status::InvalidArgument("Failed to open wal_dir", - immutable_db_options_.wal_dir); + immutable_db_options_.GetWalDir()); } else if (!s.ok()) { return s; } @@ -112,7 +119,7 @@ for (size_t i = 0; i < filenames.size(); i++) { uint64_t number; FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + if (ParseFileName(filenames[i], &number, &type) && type == kWalFile && number >= log_number_min) { logs->push_back(number); } @@ -137,7 +144,8 @@ // initialize log reader from log_number // TODO: min_log_number_to_keep_2pc check needed? 
// Open the log file - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + std::string fname = + LogFileName(immutable_db_options_.GetWalDir(), log_number); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Recovering log #%" PRIu64 " mode %d", log_number, static_cast(immutable_db_options_.wal_recovery_mode)); @@ -153,7 +161,8 @@ return status; } file_reader.reset(new SequentialFileReader( - std::move(file), fname, immutable_db_options_.log_readahead_size)); + std::move(file), fname, immutable_db_options_.log_readahead_size, + io_tracer_)); } // Create the log reader. @@ -191,6 +200,8 @@ auto it = log_readers_.find(log_number); assert(it != log_readers_.end()); log::FragmentBufferedReader* reader = it->second->reader_; + Status* wal_read_status = it->second->status_; + assert(wal_read_status); // Manually update the file number allocation counter in VersionSet. versions_->MarkFileNumberUsed(log_number); @@ -202,13 +213,16 @@ while (reader->ReadRecord(&record, &scratch, immutable_db_options_.wal_recovery_mode) && - status.ok()) { + wal_read_status->ok() && status.ok()) { if (record.size() < WriteBatchInternal::kHeader) { reader->GetReporter()->Corruption( record.size(), Status::Corruption("log record too small")); continue; } - WriteBatchInternal::SetContents(&batch, record); + status = WriteBatchInternal::SetContents(&batch, record); + if (!status.ok()) { + break; + } SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); std::vector column_family_ids; status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); @@ -294,6 +308,9 @@ reader->GetReporter()->Corruption(record.size(), status); } } + if (status.ok() && !wal_read_status->ok()) { + status = *wal_read_status; + } if (!status.ok()) { return status; } @@ -318,8 +335,8 @@ ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { assert(pinnable_val != nullptr); - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, 
DB_GET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); auto cfh = static_cast(column_family); @@ -340,15 +357,16 @@ PERF_TIMER_STOP(get_snapshot_time); bool done = false; - if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), + /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, read_options)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && super_version->imm->Get( - lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options)) { + lkey, pinnable_val->GetSelf(), /*timestamp=*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, read_options)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); @@ -359,8 +377,10 @@ } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(read_options, lkey, pinnable_val, &s, - &merge_context, &max_covering_tombstone_seq); + PinnedIteratorsManager pinned_iters_mgr; + super_version->current->Get(read_options, lkey, pinnable_val, + /*timestamp=*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, &pinned_iters_mgr); RecordTick(stats_, MEMTABLE_MISS); } { @@ -386,7 +406,7 @@ "ReadTier::kPersistedData is not yet supported in iterators.")); } Iterator* result = nullptr; - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); ReadCallback* read_callback = nullptr; // No read callback provided. 
if (read_options.tailing) { @@ -397,7 +417,7 @@ return NewErrorIterator( Status::NotSupported("snapshot not supported in secondary mode")); } else { - auto snapshot = versions_->LastSequence(); + SequenceNumber snapshot(kMaxSequenceNumber); result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); } return result; @@ -405,17 +425,23 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( const ReadOptions& read_options, ColumnFamilyData* cfd, - SequenceNumber snapshot, ReadCallback* read_callback) { + SequenceNumber snapshot, ReadCallback* read_callback, + bool expose_blob_index, bool allow_refresh) { assert(nullptr != cfd); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); + assert(snapshot == kMaxSequenceNumber); + snapshot = versions_->LastSequence(); + assert(snapshot != kMaxSequenceNumber); auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - snapshot, + super_version->current, snapshot, super_version->mutable_cf_options.max_sequential_skip_in_iterations, - super_version->version_number, read_callback); - auto internal_iter = - NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), - db_iter->GetRangeDelAggregator(), snapshot); + super_version->version_number, read_callback, this, cfd, + expose_blob_index, read_options.snapshot ? 
false : allow_refresh); + auto internal_iter = NewInternalIterator( + db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot, + /* allow_unprepared_value */ true); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } @@ -507,7 +533,8 @@ { InstrumentedMutexLock lock_guard(&mutex_); s = static_cast_with_check(versions_.get()) - ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + ->ReadAndApply(&mutex_, &manifest_reader_, + manifest_reader_status_.get(), &cfds_changed); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, static_cast(versions_->LastSequence())); @@ -604,14 +631,14 @@ } handles->clear(); - DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path); impl->versions_.reset(new ReactiveVersionSet( dbname, &impl->immutable_db_options_, impl->file_options_, impl->table_cache_.get(), impl->write_buffer_manager_, - &impl->write_controller_)); + &impl->write_controller_, impl->io_tracer_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); - impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); impl->mutex_.Lock(); s = impl->Recover(column_families, true, false, false); @@ -620,7 +647,7 @@ auto cfd = impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); if (nullptr == cfd) { - s = Status::InvalidArgument("Column family not found: ", cf.name); + s = Status::InvalidArgument("Column family not found", cf.name); break; } handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); @@ -639,7 +666,7 @@ *dbptr = impl; for (auto h : *handles) { impl->NewThreadStatusCfInfo( - reinterpret_cast(h)->cfd()); + static_cast_with_check(h)->cfd()); } } else { for (auto h : *handles) { @@ -650,6 +677,160 @@ } return s; 
} + +Status DBImplSecondary::CompactWithoutInstallation( + ColumnFamilyHandle* cfh, const CompactionServiceInput& input, + CompactionServiceResult* result) { + InstrumentedMutexLock l(&mutex_); + auto cfd = static_cast_with_check(cfh)->cfd(); + if (!cfd) { + return Status::InvalidArgument("Cannot find column family" + + cfh->GetName()); + } + + std::unordered_set input_set; + for (const auto& file_name : input.input_files) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + auto* version = cfd->current(); + + ColumnFamilyMetaData cf_meta; + version->GetColumnFamilyMetaData(&cf_meta); + + const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions(); + VersionStorageInfo* vstorage = version->storage_info(); + + // Use comp_options to reuse some CompactFiles functions + CompactionOptions comp_options; + comp_options.compression = kDisableCompressionOption; + comp_options.output_file_size_limit = MaxFileSizeForLevel( + *mutable_cf_options, input.output_level, cf_options.compaction_style, + vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes); + + std::vector input_files; + Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage, comp_options); + if (!s.ok()) { + return s; + } + + std::unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->CompactFiles( + comp_options, input_files, input.output_level, vstorage, + *mutable_cf_options, mutable_db_options_, 0)); + assert(c != nullptr); + + c->SetInputVersion(version); + + // Create output directory if it's not existed yet + std::unique_ptr output_dir; + s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir); + if (!s.ok()) { + return s; + } + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + const int job_id = next_job_id_.fetch_add(1); + + CompactionServiceCompactionJob 
compaction_job( + job_id, c.get(), immutable_db_options_, mutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_, + input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_, + db_id_, db_session_id_, secondary_path_, input, result); + + mutex_.Unlock(); + s = compaction_job.Run(); + mutex_.Lock(); + + // clean up + compaction_job.io_status().PermitUncheckedError(); + compaction_job.CleanupCompaction(); + c->ReleaseCompactionFiles(s); + c.reset(); + + TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End", + &s); + result->status = s; + return s; +} + +Status DB::OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* result, + const CompactionServiceOptionsOverride& override_options) { + CompactionServiceInput compaction_input; + Status s = CompactionServiceInput::Read(input, &compaction_input); + if (!s.ok()) { + return s; + } + + compaction_input.db_options.max_open_files = -1; + compaction_input.db_options.compaction_service = nullptr; + if (compaction_input.db_options.statistics) { + compaction_input.db_options.statistics.reset(); + } + compaction_input.db_options.env = override_options.env; + compaction_input.db_options.file_checksum_gen_factory = + override_options.file_checksum_gen_factory; + compaction_input.db_options.statistics = override_options.statistics; + compaction_input.column_family.options.comparator = + override_options.comparator; + compaction_input.column_family.options.merge_operator = + override_options.merge_operator; + compaction_input.column_family.options.compaction_filter = + override_options.compaction_filter; + compaction_input.column_family.options.compaction_filter_factory = + override_options.compaction_filter_factory; + compaction_input.column_family.options.prefix_extractor = + override_options.prefix_extractor; + 
compaction_input.column_family.options.table_factory = + override_options.table_factory; + compaction_input.column_family.options.sst_partitioner_factory = + override_options.sst_partitioner_factory; + + std::vector column_families; + column_families.push_back(compaction_input.column_family); + // TODO: we have to open default CF, because of an implementation limitation, + // currently we just use the same CF option from input, which is not collect + // and open may fail. + if (compaction_input.column_family.name != kDefaultColumnFamilyName) { + column_families.emplace_back(kDefaultColumnFamilyName, + compaction_input.column_family.options); + } + + DB* db; + std::vector handles; + + s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory, + column_families, &handles, &db); + if (!s.ok()) { + return s; + } + + CompactionServiceResult compaction_result; + DBImplSecondary* db_secondary = static_cast_with_check(db); + assert(handles.size() > 0); + s = db_secondary->CompactWithoutInstallation(handles[0], compaction_input, + &compaction_result); + + Status serialization_status = compaction_result.Write(result); + + for (auto& handle : handles) { + delete handle; + } + delete db; + if (s.ok()) { + return serialization_status; + } + return s; +} + #else // !ROCKSDB_LITE Status DB::OpenAsSecondary(const Options& /*options*/, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,9 @@ #include #include + #include "db/db_impl/db_impl.h" +#include "logging/logging.h" namespace ROCKSDB_NAMESPACE { @@ -71,14 +73,15 @@ // effort attempts to catch up with the primary. 
class DBImplSecondary : public DBImpl { public: - DBImplSecondary(const DBOptions& options, const std::string& dbname); + DBImplSecondary(const DBOptions& options, const std::string& dbname, + std::string secondary_path); ~DBImplSecondary() override; // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ // and log_readers_ to facilitate future operations. Status Recover(const std::vector& column_families, - bool read_only, bool error_if_log_file_exist, - bool error_if_data_exists_in_logs, + bool read_only, bool error_if_wal_file_exists, + bool error_if_data_exists_in_wals, uint64_t* = nullptr) override; // Implementations of the DB interface @@ -96,7 +99,9 @@ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, ColumnFamilyData* cfd, SequenceNumber snapshot, - ReadCallback* read_callback); + ReadCallback* read_callback, + bool expose_blob_index = false, + bool allow_refresh = true); Status NewIterators(const ReadOptions& options, const std::vector& column_families, @@ -222,6 +227,14 @@ // not flag the missing file as inconsistency. 
Status CheckConsistency() override; +#ifndef NDEBUG + Status TEST_CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result) { + return CompactWithoutInstallation(cfh, input, result); + } +#endif // NDEBUG + protected: // ColumnFamilyCollector is a write batch handler which does nothing // except recording unique column family IDs @@ -269,6 +282,20 @@ return AddColumnFamilyId(column_family_id); } + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkNoop(bool) override { return Status::OK(); } + const std::unordered_set& column_families() const { return column_family_ids_; } @@ -316,6 +343,13 @@ std::unordered_set* cfds_changed, JobContext* job_context); + // Run compaction without installation, the output files will be placed in the + // secondary DB path. The LSM tree won't be changed, the secondary DB is still + // in read-only mode. + Status CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result); + std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; @@ -326,6 +360,8 @@ // Current WAL number replayed for each column family. 
std::unordered_map cfd_to_current_log_; + + const std::string secondary_path_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_impl_write.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,14 +6,16 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl/db_impl.h" - #include + +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" #include "test_util/sync_point.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { // Convenience methods @@ -24,7 +26,7 @@ Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { - auto cfh = reinterpret_cast(column_family); + auto cfh = static_cast_with_check(column_family); if (!cfh->cfd()->ioptions()->merge_operator) { return Status::NotSupported("Provide a merge_operator when opening DB"); } else { @@ -73,10 +75,16 @@ if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); - if (tracer_) { - tracer_->Write(my_batch); + if (tracer_ && !tracer_->IsWriteOrderPreserved()) { + // We don't have to preserve write order so can trace anywhere. It's more + // efficient to trace here than to add latency to a phase of the log/apply + // pipeline. 
+ // TODO: maybe handle the tracing status? + tracer_->Write(my_batch).PermitUncheckedError(); } } if (write_options.sync && write_options.disableWAL) { @@ -100,11 +108,10 @@ assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || disable_memtable); - Status status; if (write_options.low_pri) { - status = ThrottleLowPriWritesIfNeeded(write_options, my_batch); - if (!status.ok()) { - return status; + Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch); + if (!s.ok()) { + return s; } } @@ -124,13 +131,13 @@ ? batch_cnt // every key is a sub-batch consuming a seq : WriteBatchInternal::Count(my_batch); - uint64_t seq; + uint64_t seq = 0; // Use a write thread to i) optimize for WAL write, ii) publish last // sequence in in increasing order, iii) call pre_release_callback serially - status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, - log_used, log_ref, &seq, sub_batch_cnt, - pre_release_callback, kDoAssignOrder, - kDoPublishLastSeq, disable_memtable); + Status status = WriteImplWALOnly( + &write_thread_, write_options, my_batch, callback, log_used, log_ref, + &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL"); if (!status.ok()) { return status; @@ -154,12 +161,7 @@ PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, batch_cnt, pre_release_callback); - - if (!write_options.disableWAL) { - RecordTick(stats_, WRITE_WITH_WAL); - } - - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); write_thread_.JoinBatchGroup(&w); if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { @@ -191,8 +193,6 @@ } assert(w.state == WriteThread::STATE_COMPLETED); // STATE_COMPLETED conditional below handles exit - - status = w.FinalStatus(); } if (w.state 
== WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -206,7 +206,7 @@ } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); - + Status status; // Once reaches this point, the current writer "w" will try to do its write // job. It may also pick up some of the remaining writers in the "writers_" // when it finds suitable, and finish them in the same write batch. @@ -220,7 +220,8 @@ bool need_log_sync = write_options.sync; bool need_log_dir_sync = need_log_sync && !log_dir_synced_; - if (!two_write_queues_ || !disable_memtable) { + assert(!two_write_queues_ || !disable_memtable); + { // With concurrent writes we do preprocess only in the write thread that // also does write to memtable to avoid sync issue on shared data structure // with the other thread @@ -250,7 +251,20 @@ last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + IOStatus io_s; + Status pre_release_cb_status; if (status.ok()) { + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } // Rules for when we can update the memtable concurrently // 1. supported by memtable // 2. 
Puts are not okay if inplace_update_support @@ -322,21 +336,22 @@ if (!two_write_queues_) { if (status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); - status = WriteToWAL(write_group, log_writer, log_used, need_log_sync, - need_log_dir_sync, last_sequence + 1); + io_s = WriteToWAL(write_group, log_writer, log_used, need_log_sync, + need_log_dir_sync, last_sequence + 1); } } else { if (status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL - status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, - seq_inc); + io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, + seq_inc); } else { // Otherwise we inc seq number for memtable writes last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); } } + status = io_s; assert(last_sequence != kMaxSequenceNumber); const SequenceNumber current_sequence = last_sequence + 1; last_sequence += seq_inc; @@ -359,7 +374,7 @@ writer->sequence, disable_memtable, writer->log_used, index++, pre_release_callback_cnt); if (!ws.ok()) { - status = ws; + status = pre_release_cb_status = ws; break; } } @@ -411,12 +426,23 @@ PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - WriteStatusCheck(status); + if (!io_s.ok()) { + assert(pre_release_cb_status.ok()); + IOStatusCheck(io_s); + } else { + WriteStatusCheck(pre_release_cb_status); + } + } else { + assert(io_s.ok() && pre_release_cb_status.ok()); } if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); // Requesting sync with two_write_queues_ is expected to be very rare. We // hence provide a simple implementation that is not necessarily efficient. 
@@ -456,13 +482,14 @@ uint64_t* log_used, uint64_t log_ref, bool disable_memtable, uint64_t* seq_used) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); WriteContext write_context; WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable); write_thread_.JoinBatchGroup(&w); + TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"); if (w.state == WriteThread::STATE_GROUP_LEADER) { WriteThread::WriteGroup wal_write_group; if (w.callback && !w.callback->AllowWriteBatching()) { @@ -487,6 +514,17 @@ size_t total_byte_size = 0; if (w.status.ok()) { + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : wal_write_group) { + // TODO: maybe handle the tracing status? 
+ tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } SequenceNumber next_sequence = current_sequence; for (auto writer : wal_write_group) { if (writer->CheckCallback(this)) { @@ -515,6 +553,9 @@ PERF_TIMER_STOP(write_pre_and_post_process_time); + IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized + if (w.status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); @@ -524,24 +565,38 @@ wal_write_group.size - 1); RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); } - w.status = WriteToWAL(wal_write_group, log_writer, log_used, - need_log_sync, need_log_dir_sync, current_sequence); + io_s = WriteToWAL(wal_write_group, log_writer, log_used, need_log_sync, + need_log_dir_sync, current_sequence); + w.status = io_s; } if (!w.CallbackFailed()) { - WriteStatusCheck(w.status); + if (!io_s.ok()) { + IOStatusCheck(io_s); + } else { + WriteStatusCheck(w.status); + } } if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status); + if (w.status.ok()) { + w.status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } + // NOTE: the memtable_write_group is declared before the following + // `if` statement because its lifetime needs to be longer + // that the inner context of the `if` as a reference to it + // may be used further below within the outer _write_thread WriteThread::WriteGroup memtable_write_group; + if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { PERF_TIMER_GUARD(write_memtable_time); assert(w.ShouldWriteToMemtable()); @@ -558,6 +613,10 @@ versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } + } else { + // NOTE: the memtable_write_group is never really used, + // so we 
need to set its status to pass ASSERT_STATUS_CHECKED + memtable_write_group.status.PermitUncheckedError(); } if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { @@ -590,7 +649,7 @@ SequenceNumber seq, const size_t sub_batch_cnt) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); WriteThread::Writer w(write_options, my_batch, callback, log_ref, false /*disable_memtable*/); @@ -610,8 +669,6 @@ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/, write_options.memtable_insert_hint_per_batch); - - WriteStatusCheck(w.status); if (write_options.disableWAL) { has_unpersisted_data_.store(true, std::memory_order_relaxed); } @@ -626,6 +683,7 @@ std::lock_guard lck(switch_mutex_); switch_cv_.notify_all(); } + WriteStatusCheck(w.status); if (!w.FinalStatus().ok()) { return w.FinalStatus(); @@ -642,12 +700,10 @@ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, const PublishLastSeq publish_last_seq, const bool disable_memtable) { - Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, sub_batch_cnt, pre_release_callback); - RecordTick(stats_, WRITE_WITH_WAL); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); @@ -664,6 +720,8 @@ assert(w.state == WriteThread::STATE_GROUP_LEADER); if (publish_last_seq == kDoPublishLastSeq) { + Status status; + // Currently we only use kDoPublishLastSeq in unordered_write assert(immutable_db_options_.unordered_write); WriteContext write_context; @@ 
-676,7 +734,7 @@ InstrumentedMutexLock l(&mutex_); bool need_log_sync = false; status = PreprocessWrite(write_options, &need_log_sync, &write_context); - WriteStatusCheck(status); + WriteStatusCheckOnLocked(status); } if (!status.ok()) { WriteThread::WriteGroup write_group; @@ -691,6 +749,17 @@ write_thread->EnterAsBatchGroupLeader(&w, &write_group); // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } size_t pre_release_callback_cnt = 0; size_t total_byte_size = 0; @@ -740,9 +809,12 @@ } seq_inc = total_batch_cnt; } + Status status; + IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized if (!write_options.disableWAL) { - status = - ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + status = io_s; } else { // Otherwise we inc seq number to do solely the seq allocation last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); @@ -777,7 +849,11 @@ PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - WriteStatusCheck(status); + if (!io_s.ok()) { + IOStatusCheck(io_s); + } else { + WriteStatusCheck(status); + } } if (status.ok()) { size_t index = 0; @@ -812,17 +888,45 @@ return status; } +void DBImpl::WriteStatusCheckOnLocked(const Status& status) { + // Is setting bg_error_ enough here? This will at least stop + // compaction and fail any further writes. + // Caller must hold mutex_. 
+ assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok()); + mutex_.AssertHeld(); + if (immutable_db_options_.paranoid_checks && !status.ok() && + !status.IsBusy() && !status.IsIncomplete()) { + // Maybe change the return status to void? + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); + } +} + void DBImpl::WriteStatusCheck(const Status& status) { // Is setting bg_error_ enough here? This will at least stop // compaction and fail any further writes. + assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok()); if (immutable_db_options_.paranoid_checks && !status.ok() && !status.IsBusy() && !status.IsIncomplete()) { mutex_.Lock(); + // Maybe change the return status to void? error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } +void DBImpl::IOStatusCheck(const IOStatus& io_status) { + // Is setting bg_error_ enough here? This will at least stop + // compaction and fail any further writes. + if ((immutable_db_options_.paranoid_checks && !io_status.ok() && + !io_status.IsBusy() && !io_status.IsIncomplete()) || + io_status.IsIOFenced()) { + mutex_.Lock(); + // Maybe change the return status to void? + error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback); + mutex_.Unlock(); + } +} + void DBImpl::MemTableInsertStatusCheck(const Status& status) { // A non-OK status here indicates that the state implied by the // WAL has diverged from the in-memory state. This could be @@ -832,7 +936,9 @@ if (!status.ok()) { mutex_.Lock(); assert(!error_handler_.IsBGWorkStopped()); - error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable); + // Maybe change the return status to void? + error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable) + .PermitUncheckedError(); mutex_.Unlock(); } } @@ -865,7 +971,7 @@ // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. 
WaitForPendingWrites(); - status = HandleWriteBufferFull(write_context); + status = HandleWriteBufferManagerFlush(write_context); } if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { @@ -892,6 +998,20 @@ PERF_TIMER_START(write_pre_and_post_process_time); } + // If memory usage exceeded beyond a certain threshold, + // write_buffer_manager_->ShouldStall() returns true to all threads writing to + // all DBs and writers will be stalled. + // It does soft checking because WriteBufferManager::buffer_limit_ has already + // exceeded at this point so no new write (including current one) will go + // through until memory usage is decreased. + if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) { + if (write_options.no_slowdown) { + status = Status::Incomplete("Write stall"); + } else { + WriteBufferManagerStallWrites(); + } + } + if (status.ok() && *need_log_sync) { // Wait until the parallel syncs are finished. Any sync process has to sync // the front log too so it is enough to check the status of front() @@ -946,8 +1066,10 @@ merged_batch = tmp_batch; for (auto writer : write_group) { if (!writer->CallbackFailed()) { - WriteBatchInternal::Append(merged_batch, writer->batch, - /*WAL_only*/ true); + Status s = WriteBatchInternal::Append(merged_batch, writer->batch, + /*WAL_only*/ true); + // Always returns Status::OK. + assert(s.ok()); if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) { // We only need to cache the last of such write batch *to_be_cached_state = writer->batch; @@ -961,10 +1083,20 @@ // When two_write_queues_ is disabled, this function is called from the only // write thread. Otherwise this must be called holding log_write_mutex_. 
-Status DBImpl::WriteToWAL(const WriteBatch& merged_batch, - log::Writer* log_writer, uint64_t* log_used, - uint64_t* log_size) { +IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, + log::Writer* log_writer, uint64_t* log_used, + uint64_t* log_size, + bool with_db_mutex, bool with_log_mutex) { assert(log_size != nullptr); + + // Assert mutex explicitly. + if (with_db_mutex) { + mutex_.AssertHeld(); + } else if (two_write_queues_) { + log_write_mutex_.AssertHeld(); + assert(with_log_mutex); + } + Slice log_entry = WriteBatchInternal::Contents(&merged_batch); *log_size = log_entry.size(); // When two_write_queues_ WriteToWAL has to be protected from concurretn calls @@ -978,7 +1110,8 @@ if (UNLIKELY(needs_locking)) { log_write_mutex_.Lock(); } - Status status = log_writer->AddRecord(log_entry); + IOStatus io_s = log_writer->AddRecord(log_entry); + if (UNLIKELY(needs_locking)) { log_write_mutex_.Unlock(); } @@ -986,19 +1119,22 @@ *log_used = logfile_number_; } total_log_size_ += log_entry.size(); - // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here - // since alive_log_files_ might be modified concurrently - alive_log_files_.back().AddSize(log_entry.size()); + if (with_db_mutex || with_log_mutex) { + assert(alive_log_files_tail_ == alive_log_files_.rbegin()); + assert(alive_log_files_tail_ != alive_log_files_.rend()); + } + LogFileNumberSize& last_alive_log = *alive_log_files_tail_; + last_alive_log.AddSize(*log_size); log_empty_ = false; - return status; + return io_s; } -Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, - log::Writer* log_writer, uint64_t* log_used, - bool need_log_sync, bool need_log_dir_sync, - SequenceNumber sequence) { - Status status; - +IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, + log::Writer* log_writer, uint64_t* log_used, + bool need_log_sync, bool need_log_dir_sync, + SequenceNumber sequence) { + IOStatus io_s; + assert(!two_write_queues_); 
assert(!write_group.leader->disable_wal); // Same holds for all in the batch group size_t write_with_wal = 0; @@ -1016,14 +1152,14 @@ WriteBatchInternal::SetSequence(merged_batch, sequence); uint64_t log_size; - status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; } - if (status.ok() && need_log_sync) { - StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + if (io_s.ok() && need_log_sync) { + StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS); // It's safe to access logs_ with unlocked mutex_ here because: // - we've set getting_synced=true for all logs, // so other threads won't pop from logs_ while we're here, @@ -1031,24 +1167,43 @@ // writer thread, so no one will push to logs_, // - as long as other threads don't modify it, it's safe to read // from std::deque from multiple threads concurrently. + // + // Sync operation should work with locked log_write_mutex_, because: + // when DBOptions.manual_wal_flush_ is set, + // FlushWAL function will be invoked by another thread. + // if without locked log_write_mutex_, the log file may get data + // corruption + + const bool needs_locking = manual_wal_flush_ && !two_write_queues_; + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Lock(); + } + for (auto& log : logs_) { - status = log.writer->file()->Sync(immutable_db_options_.use_fsync); - if (!status.ok()) { + io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync); + if (!io_s.ok()) { break; } } - if (status.ok() && need_log_dir_sync) { + + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Unlock(); + } + + if (io_s.ok() && need_log_dir_sync) { // We only sync WAL directory the first time WAL syncing is // requested, so that in case users never turn on WAL sync, // we can avoid the disk I/O in the write code path. 
- status = directories_.GetWalDir()->Fsync(); + io_s = directories_.GetWalDir()->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } } if (merged_batch == &tmp_batch_) { tmp_batch_.Clear(); } - if (status.ok()) { + if (io_s.ok()) { auto stats = default_cf_internal_stats_; if (need_log_sync) { stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); @@ -1059,15 +1214,15 @@ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } - return status; + return io_s; } -Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, - uint64_t* log_used, - SequenceNumber* last_sequence, - size_t seq_inc) { - Status status; +IOStatus DBImpl::ConcurrentWriteToWAL( + const WriteThread::WriteGroup& write_group, uint64_t* log_used, + SequenceNumber* last_sequence, size_t seq_inc) { + IOStatus io_s; + assert(two_write_queues_ || immutable_db_options_.unordered_write); assert(!write_group.leader->disable_wal); // Same holds for all in the batch group WriteBatch tmp_batch; @@ -1092,14 +1247,15 @@ log::Writer* log_writer = logs_.back().writer; uint64_t log_size; - status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size, + /*with_db_mutex=*/false, /*with_log_mutex=*/true); if (to_be_cached_state) { cached_recoverable_state_ = *to_be_cached_state; cached_recoverable_state_empty_ = false; } log_write_mutex_.Unlock(); - if (status.ok()) { + if (io_s.ok()) { const bool concurrent = true; auto stats = default_cf_internal_stats_; stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, @@ -1109,7 +1265,7 @@ concurrent); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } - return status; + return io_s; } Status DBImpl::WriteRecoverableState() { @@ -1271,16 +1427,23 @@ } for (auto cfd : cfds) { cfd->imm()->FlushRequested(); + if 
(!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); MaybeScheduleFlushOrCompaction(); } return status; } -Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { +Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { mutex_.AssertHeld(); assert(write_context != nullptr); Status status; @@ -1292,7 +1455,7 @@ // suboptimal but still correct. ROCKS_LOG_INFO( immutable_db_options_.info_log, - "Flushing column family with oldest memtable entry. Write buffer is " + "Flushing column family with oldest memtable entry. Write buffers are " "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".", write_buffer_manager_->memory_usage(), write_buffer_manager_->buffer_size()); @@ -1350,10 +1513,17 @@ } for (const auto cfd : cfds) { cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1373,8 +1543,10 @@ uint64_t time_delayed = 0; bool delayed = false; { - StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); - uint64_t delay = write_controller_.GetDelay(env_, num_bytes); + 
StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, + &time_delayed); + uint64_t delay = + write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); if (delay > 0) { if (write_options.no_slowdown) { return Status::Incomplete("Write stall"); @@ -1386,19 +1558,21 @@ write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); - // We will delay the write until we have slept for delay ms or - // we don't need a delay anymore - const uint64_t kDelayInterval = 1000; + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore. We check for cancellation every 1ms + // (slightly longer because WriteController minimum delay is 1ms, in + // case of sleep imprecision, rounding, etc.) + const uint64_t kDelayInterval = 1001; uint64_t stall_end = sw.start_time() + delay; while (write_controller_.NeedsDelay()) { - if (env_->NowMicros() >= stall_end) { + if (immutable_db_options_.clock->NowMicros() >= stall_end) { // We already delayed this write `delay` microseconds break; } delayed = true; // Sleep for 0.001 seconds - env_->SleepForMicroseconds(kDelayInterval); + immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); write_thread_.EndWriteStall(); @@ -1444,6 +1618,29 @@ return s; } +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +void DBImpl::WriteBufferManagerStallWrites() { + mutex_.AssertHeld(); + // First block future writer threads who want to add themselves to the queue + // of WriteThread. + write_thread_.BeginWriteStall(); + mutex_.Unlock(); + + // Change the state to State::Blocked. + static_cast(wbm_stall_.get()) + ->SetState(WBMStallInterface::State::BLOCKED); + // Then WriteBufferManager will add DB instance to its queue + // and block this thread by calling WBMStallInterface::Block(). 
+ write_buffer_manager_->BeginWriteStall(wbm_stall_.get()); + wbm_stall_->Block(); + + mutex_.Lock(); + // Stall has ended. Signal writer threads so that they can add + // themselves to the WriteThread queue for writes. + write_thread_.EndWriteStall(); +} + Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch) { assert(write_options.low_pri); @@ -1517,11 +1714,9 @@ } for (auto& cfd : cfds) { autovector to_delete; - cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage()); - if (!to_delete.empty()) { - for (auto m : to_delete) { - delete m; - } + bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_, + cfd->mem()->MemoryAllocatedBytes()); + if (trimmed) { context->superversion_context.NewSuperVersion(); assert(context->superversion_context.new_superversion.get() != nullptr); cfd->InstallSuperVersion(&context->superversion_context, &mutex_); @@ -1574,10 +1769,16 @@ if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } else { + for (auto* cfd : cfds) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1605,10 +1806,9 @@ // two_write_queues_ is true (This is to simplify the reasoning.) Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); - WriteThread::Writer nonmem_w; - std::unique_ptr lfile; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; + IOStatus io_s; // Recoverable state is persisted in WAL. 
After memtable switch, WAL might // be deleted, so we write the state to memtable to be persisted as well. @@ -1654,8 +1854,11 @@ if (creating_new_log) { // TODO: Write buffer size passed in should be max of all CF's instead // of mutable_cf_options.write_buffer_size. - s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, - &new_log); + io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size, + &new_log); + if (s.ok()) { + s = io_s; + } } if (s.ok()) { SequenceNumber seq = versions_->LastSequence(); @@ -1681,7 +1884,10 @@ if (!logs_.empty()) { // Alway flush the buffer of the last log before switching to a new one log::Writer* cur_log_writer = logs_.back().writer; - s = cur_log_writer->WriteBuffer(); + io_s = cur_log_writer->WriteBuffer(); + if (s.ok()) { + s = io_s; + } if (!s.ok()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64 @@ -1696,6 +1902,7 @@ log_dir_synced_ = false; logs_.emplace_back(logfile_number_, new_log); alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); + alive_log_files_tail_ = alive_log_files_.rbegin(); } log_write_mutex_.Unlock(); } @@ -1703,45 +1910,92 @@ if (!s.ok()) { // how do we fail if we're not creating new log? 
assert(creating_new_log); - if (new_mem) { - delete new_mem; - } - if (new_log) { - delete new_log; - } - SuperVersion* new_superversion = - context->superversion_context.new_superversion.release(); - if (new_superversion != nullptr) { - delete new_superversion; - } + delete new_mem; + delete new_log; + context->superversion_context.new_superversion.reset(); // We may have lost data from the WritableFileBuffer in-memory buffer for // the current log, so treat it as a fatal error and set bg_error - error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); + if (!io_s.ok()) { + error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable); + } else { + error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); + } // Read back bg_error in order to get the right severity s = error_handler_.GetBGError(); return s; } - for (auto loop_cfd : *versions_->GetColumnFamilySet()) { - // all this is just optimization to delete logs that - // are no longer needed -- if CF is empty, that means it - // doesn't need that particular log to stay alive, so we just - // advance the log number. no need to persist this in the manifest - if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && - loop_cfd->imm()->NumNotFlushed() == 0) { - if (creating_new_log) { - loop_cfd->SetLogNumber(logfile_number_); + bool empty_cf_updated = false; + if (immutable_db_options_.track_and_verify_wals_in_manifest && + !immutable_db_options_.allow_2pc && creating_new_log) { + // In non-2pc mode, WALs become obsolete if they do not contain unflushed + // data. Updating the empty CF's log number might cause some WALs to become + // obsolete. So we should track the WAL obsoletion event before actually + // updating the empty CF's log number. + uint64_t min_wal_number_to_keep = + versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_); + if (min_wal_number_to_keep > + versions_->GetWalSet().GetMinWalNumberToKeep()) { + // Get a snapshot of the empty column families. 
+ // LogAndApply may release and reacquire db + // mutex, during that period, column family may become empty (e.g. its + // flush succeeds), then it affects the computed min_log_number_to_keep, + // so we take a snapshot for consistency of column family data + // status. If a column family becomes non-empty afterwards, its active log + // should still be the created new log, so the min_log_number_to_keep is + // not affected. + autovector empty_cfs; + for (auto cf : *versions_->GetColumnFamilySet()) { + if (cf->IsEmpty()) { + empty_cfs.push_back(cf); + } + } + + VersionEdit wal_deletion; + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); + s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + if (!s.ok()) { + return s; + } + + for (auto cf : empty_cfs) { + if (cf->IsEmpty()) { + cf->SetLogNumber(logfile_number_); + // MEMPURGE: No need to change this, because new adds + // should still receive new sequence numbers. + cf->mem()->SetCreationSeq(versions_->LastSequence()); + } // cf may become non-empty. + } + empty_cf_updated = true; + } + } + if (!empty_cf_updated) { + for (auto cf : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. 
no need to persist this in the manifest + if (cf->IsEmpty()) { + if (creating_new_log) { + cf->SetLogNumber(logfile_number_); + } + cf->mem()->SetCreationSeq(versions_->LastSequence()); } - loop_cfd->mem()->SetCreationSeq(versions_->LastSequence()); } } cfd->mem()->SetNextLogNumber(logfile_number_); + assert(new_mem != nullptr); cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_); new_mem->Ref(); cfd->SetMemtable(new_mem); InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, mutable_cf_options); + #ifndef ROCKSDB_LITE mutex_.Unlock(); // Notify client that memtable is sealed, now that we have successfully @@ -1749,6 +2003,10 @@ NotifyOnMemTableSealed(cfd, memtable_info); mutex_.Lock(); #endif // ROCKSDB_LITE + // It is possible that we got here without checking the value of i_os, but + // that is okay. If we did, it most likely means that s was already an error. + // In any case, ignore any unchecked error for i_os here. + io_s.PermitUncheckedError(); return s; } @@ -1792,13 +2050,20 @@ const Slice* ts = opt.timestamp; assert(nullptr != ts); size_t ts_sz = ts->size(); - WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0, - ts_sz); - Status s = batch.Put(column_family, key, value); - if (!s.ok()) { - return s; + assert(column_family->GetComparator()); + assert(ts_sz == column_family->GetComparator()->timestamp_size()); + WriteBatch batch; + Status s; + if (key.data() + key.size() == ts->data()) { + Slice key_with_ts = Slice(key.data(), key.size() + ts_sz); + s = batch.Put(column_family, key_with_ts, value); + } else { + std::array key_with_ts_slices{{key, *ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{value}}; + SliceParts values(value_slices.data(), 1); + s = batch.Put(column_family, key_with_ts, values); } - s = batch.AssignTimestamp(*ts); if (!s.ok()) { return s; } @@ -1807,23 +2072,77 @@ Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, const 
Slice& key) { + if (nullptr == opt.timestamp) { + WriteBatch batch; + Status s = batch.Delete(column_family, key); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); + } + const Slice* ts = opt.timestamp; + assert(ts != nullptr); + size_t ts_sz = ts->size(); + assert(column_family->GetComparator()); + assert(ts_sz == column_family->GetComparator()->timestamp_size()); WriteBatch batch; - batch.Delete(column_family, key); + Status s; + if (key.data() + key.size() == ts->data()) { + Slice key_with_ts = Slice(key.data(), key.size() + ts_sz); + s = batch.Delete(column_family, key_with_ts); + } else { + std::array key_with_ts_slices{{key, *ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + s = batch.Delete(column_family, key_with_ts); + } + if (!s.ok()) { + return s; + } return Write(opt, &batch); } Status DB::SingleDelete(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key) { + Status s; + if (opt.timestamp == nullptr) { + WriteBatch batch; + s = batch.SingleDelete(column_family, key); + if (!s.ok()) { + return s; + } + s = Write(opt, &batch); + return s; + } + + const Slice* ts = opt.timestamp; + assert(ts != nullptr); + size_t ts_sz = ts->size(); + assert(column_family->GetComparator()); + assert(ts_sz == column_family->GetComparator()->timestamp_size()); WriteBatch batch; - batch.SingleDelete(column_family, key); - return Write(opt, &batch); + if (key.data() + key.size() == ts->data()) { + Slice key_with_ts = Slice(key.data(), key.size() + ts_sz); + s = batch.SingleDelete(column_family, key_with_ts); + } else { + std::array key_with_ts_slices{{key, *ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + s = batch.SingleDelete(column_family, key_with_ts); + } + if (!s.ok()) { + return s; + } + s = Write(opt, &batch); + return s; } Status DB::DeleteRange(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& begin_key, const Slice& end_key) { WriteBatch batch; - batch.DeleteRange(column_family, 
begin_key, end_key); + Status s = batch.DeleteRange(column_family, begin_key, end_key); + if (!s.ok()) { + return s; + } return Write(opt, &batch); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_impl/db_secondary_test.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,869 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "db/db_impl/db_impl_secondary.h" -#include "db/db_test_util.h" -#include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" -#include "test_util/sync_point.h" - -namespace ROCKSDB_NAMESPACE { - -#ifndef ROCKSDB_LITE -class DBSecondaryTest : public DBTestBase { - public: - DBSecondaryTest() - : DBTestBase("/db_secondary_test"), - secondary_path_(), - handles_secondary_(), - db_secondary_(nullptr) { - secondary_path_ = - test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); - } - - ~DBSecondaryTest() override { - CloseSecondary(); - if (getenv("KEEP_DB") != nullptr) { - fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); - } else { - Options options; - options.env = env_; - EXPECT_OK(DestroyDB(secondary_path_, options)); - } - } - - protected: - Status ReopenAsSecondary(const Options& options) { - return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); - } - - void OpenSecondary(const Options& options); - - void OpenSecondaryWithColumnFamilies( - const std::vector& column_families, const Options& options); - - void CloseSecondary() { - for (auto h : handles_secondary_) { - db_secondary_->DestroyColumnFamilyHandle(h); - } - handles_secondary_.clear(); - delete db_secondary_; - db_secondary_ = nullptr; - } - - DBImplSecondary* db_secondary_full() { - return static_cast(db_secondary_); - } - - void CheckFileTypeCounts(const std::string& dir, int expected_log, - int expected_sst, int expected_manifest) const; - - std::string secondary_path_; - std::vector handles_secondary_; - DB* db_secondary_; -}; - -void DBSecondaryTest::OpenSecondary(const Options& options) { - Status s = - DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_); - ASSERT_OK(s); -} - -void DBSecondaryTest::OpenSecondaryWithColumnFamilies( - const std::vector& column_families, const Options& options) { - std::vector cf_descs; - cf_descs.emplace_back(kDefaultColumnFamilyName, options); - for (const auto& cf_name : 
column_families) { - cf_descs.emplace_back(cf_name, options); - } - Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, - &handles_secondary_, &db_secondary_); - ASSERT_OK(s); -} - -void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, - int expected_log, int expected_sst, - int expected_manifest) const { - std::vector filenames; - env_->GetChildren(dir, &filenames); - - int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; - for (auto file : filenames) { - uint64_t number; - FileType type; - if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); - sst_cnt += (type == kTableFile); - manifest_cnt += (type == kDescriptorFile); - } - } - ASSERT_EQ(expected_log, log_cnt); - ASSERT_EQ(expected_sst, sst_cnt); - ASSERT_EQ(expected_manifest, manifest_cnt); -} - -TEST_F(DBSecondaryTest, ReopenAsSecondary) { - Options options; - options.env = env_; - Reopen(options); - ASSERT_OK(Put("foo", "foo_value")); - ASSERT_OK(Put("bar", "bar_value")); - ASSERT_OK(dbfull()->Flush(FlushOptions())); - Close(); - - ASSERT_OK(ReopenAsSecondary(options)); - ASSERT_EQ("foo_value", Get("foo")); - ASSERT_EQ("bar_value", Get("bar")); - ReadOptions ropts; - ropts.verify_checksums = true; - auto db1 = static_cast(db_); - ASSERT_NE(nullptr, db1); - Iterator* iter = db1->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - if (0 == count) { - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ("bar_value", iter->value().ToString()); - } else if (1 == count) { - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("foo_value", iter->value().ToString()); - } - ++count; - } - delete iter; - ASSERT_EQ(2, count); -} - -TEST_F(DBSecondaryTest, OpenAsSecondary) { - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - for (int i = 0; i < 3; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - 
ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - ASSERT_OK(Flush()); - } - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - ReadOptions ropts; - ropts.verify_checksums = true; - const auto verify_db_func = [&](const std::string& foo_val, - const std::string& bar_val) { - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ(foo_val, value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ(bar_val, value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ(foo_val, iter->value().ToString()); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ(bar_val, iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; - }; - - verify_db_func("foo_value2", "bar_value2"); - - ASSERT_OK(Put("foo", "new_foo_value")); - ASSERT_OK(Put("bar", "new_bar_value")); - ASSERT_OK(Flush()); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db_func("new_foo_value", "new_bar_value"); -} - -namespace { -class TraceFileEnv : public EnvWrapper { - public: - explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {} - Status NewRandomAccessFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& env_options) override { - class TracedRandomAccessFile : public RandomAccessFile { - public: - TracedRandomAccessFile(std::unique_ptr&& target, - std::atomic& counter) - : target_(std::move(target)), files_closed_(counter) {} - ~TracedRandomAccessFile() override { - files_closed_.fetch_add(1, std::memory_order_relaxed); - } - Status Read(uint64_t offset, 
size_t n, Slice* result, - char* scratch) const override { - return target_->Read(offset, n, result, scratch); - } - - private: - std::unique_ptr target_; - std::atomic& files_closed_; - }; - Status s = target()->NewRandomAccessFile(f, r, env_options); - if (s.ok()) { - r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_)); - } - return s; - } - - int files_closed() const { - return files_closed_.load(std::memory_order_relaxed); - } - - private: - std::atomic files_closed_{0}; -}; -} // namespace - -TEST_F(DBSecondaryTest, SecondaryCloseFiles) { - Options options; - options.env = env_; - options.max_open_files = 1; - options.disable_auto_compactions = true; - Reopen(options); - Options options1; - std::unique_ptr traced_env(new TraceFileEnv(env_)); - options1.env = traced_env.get(); - OpenSecondary(options1); - - static const auto verify_db = [&]() { - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); - std::unique_ptr iter2(db_secondary_->NewIterator(ReadOptions())); - for (iter1->SeekToFirst(), iter2->SeekToFirst(); - iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { - ASSERT_EQ(iter1->key(), iter2->key()); - ASSERT_EQ(iter1->value(), iter2->value()); - } - ASSERT_FALSE(iter1->Valid()); - ASSERT_FALSE(iter2->Valid()); - }; - - ASSERT_OK(Put("a", "value")); - ASSERT_OK(Put("c", "value")); - ASSERT_OK(Flush()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(); - - ASSERT_OK(Put("b", "value")); - ASSERT_OK(Put("d", "value")); - ASSERT_OK(Flush()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(); - - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ASSERT_EQ(2, static_cast(traced_env.get())->files_closed()); - - Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}}); - ASSERT_TRUE(s.IsNotSupported()); - CloseSecondary(); -} - -TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { - Options options; - 
options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - for (int i = 0; i < 3; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - } - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - ReadOptions ropts; - ropts.verify_checksums = true; - const auto verify_db_func = [&](const std::string& foo_val, - const std::string& bar_val) { - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ(foo_val, value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ(bar_val, value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ(foo_val, iter->value().ToString()); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ(bar_val, iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; - }; - - verify_db_func("foo_value2", "bar_value2"); - - ASSERT_OK(Put("foo", "new_foo_value")); - ASSERT_OK(Put("bar", "new_bar_value")); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db_func("new_foo_value", "new_bar_value"); - - ASSERT_OK(Flush()); - ASSERT_OK(Put("foo", "new_foo_value_1")); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db_func("new_foo_value_1", "new_bar_value"); -} - -TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { - Options options; - options.env = env_; - CreateAndReopenWithCF({"pikachu"}, options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - std::vector cf_descs; - cf_descs.emplace_back(kDefaultColumnFamilyName, options1); - cf_descs.emplace_back("pikachu", options1); - cf_descs.emplace_back("eevee", 
options1); - Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs, - &handles_secondary_, &db_secondary_); - ASSERT_NOK(s); -} - -TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { - Options options; - options.env = env_; - CreateAndReopenWithCF({"pikachu"}, options); - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - ASSERT_EQ(0, handles_secondary_.size()); - ASSERT_NE(nullptr, db_secondary_); - - ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); - ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); - ASSERT_OK(Flush(0 /*cf*/)); - ASSERT_OK(Flush(1 /*cf*/)); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ("foo_value", value); -} - -TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { - Options options; - options.env = env_; - Reopen(options); - Close(); - - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->LoadDependency( - {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", - "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, - {"VersionSet::ProcessManifestWrites:AfterNewManifest", - "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" - "1"}}); - SyncPoint::GetInstance()->EnableProcessing(); - - // Make sure db calls RecoverLogFiles so as to trigger a manifest write, - // which causes the db to switch to a new MANIFEST upon start. 
- port::Thread ro_db_thread([&]() { - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - CloseSecondary(); - }); - Reopen(options); - ro_db_thread.join(); -} - -TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - ASSERT_OK(dbfull()->Flush(FlushOptions())); - } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; -} - -TEST_F(DBSecondaryTest, MissingTableFile) { - int table_files_not_exist = 0; - SyncPoint::GetInstance()->DisableProcessing(); - 
SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", - [&](void* arg) { - Status s = *reinterpret_cast(arg); - if (s.IsPathNotFound()) { - ++table_files_not_exist; - } else if (!s.ok()) { - assert(false); // Should not reach here - } - }); - SyncPoint::GetInstance()->EnableProcessing(); - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { - ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); - ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); - ASSERT_OK(dbfull()->Flush(FlushOptions())); - } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - ASSERT_NE(nullptr, db_secondary_full()); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); - ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - value); - Iterator* iter = db_secondary_->NewIterator(ropts); - ASSERT_NE(nullptr, iter); - iter->Seek("bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("bar", iter->key().ToString()); - ASSERT_EQ("bar_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - 
ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("foo_value" + - std::to_string(options.level0_file_num_compaction_trigger - 1), - iter->value().ToString()); - size_t count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ++count; - } - ASSERT_EQ(2, count); - delete iter; -} - -TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { - Options options; - options.env = env_; - const std::string kCfName1 = "pikachu"; - CreateAndReopenWithCF({kCfName1}, options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondaryWithColumnFamilies({kCfName1}, options1); - ASSERT_EQ(2, handles_secondary_.size()); - - ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); - ASSERT_OK(Flush(1 /*cf*/)); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ReadOptions ropts; - ropts.verify_checksums = true; - std::string value; - ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); - ASSERT_EQ("foo_val_1", value); - - ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - Close(); - CheckFileTypeCounts(dbname_, 1, 0, 1); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - value.clear(); - ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); - ASSERT_EQ("foo_val_1", value); -} - -TEST_F(DBSecondaryTest, SwitchManifest) { - Options options; - options.env = env_; - options.level0_file_num_compaction_trigger = 4; - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - const int kNumFiles = options.level0_file_num_compaction_trigger - 1; - // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1, - // ..., 9. 
- const int kNumKeys = 10; - // Create two sst - for (int i = 0; i != kNumFiles; ++i) { - for (int j = 0; j != kNumKeys; ++j) { - ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); - } - ASSERT_OK(Flush()); - } - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - const auto& range_scan_db = [&]() { - ReadOptions tmp_ropts; - tmp_ropts.total_order_seek = true; - tmp_ropts.verify_checksums = true; - std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); - int cnt = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { - ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); - ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), - iter->value().ToString()); - } - }; - - range_scan_db(); - - // While secondary instance still keeps old MANIFEST open, we close primary, - // restart primary, performs full compaction, close again, restart again so - // that next time secondary tries to catch up with primary, the secondary - // will skip the MANIFEST in middle. - Reopen(options); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - Reopen(options); - ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - range_scan_db(); -} - -// Here, "Snapshot" refers to the version edits written by -// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after -// switching from the old one. 
-TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { - Options options; - options.env = env_; - options.disable_auto_compactions = true; - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - ASSERT_OK(Put("0", "value0")); - ASSERT_OK(Flush()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - std::string value; - ReadOptions ropts; - ropts.verify_checksums = true; - ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); - ASSERT_EQ("value0", value); - - Reopen(options); - ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); -} - -TEST_F(DBSecondaryTest, SwitchWAL) { - const int kNumKeysPerMemtable = 1; - Options options; - options.env = env_; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 2; - options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - const auto& verify_db = [](DB* db1, DB* db2) { - ASSERT_NE(nullptr, db1); - ASSERT_NE(nullptr, db2); - ReadOptions read_opts; - read_opts.verify_checksums = true; - std::unique_ptr it1(db1->NewIterator(read_opts)); - std::unique_ptr it2(db2->NewIterator(read_opts)); - it1->SeekToFirst(); - it2->SeekToFirst(); - for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { - ASSERT_EQ(it1->key(), it2->key()); - ASSERT_EQ(it1->value(), it2->value()); - } - ASSERT_FALSE(it1->Valid()); - ASSERT_FALSE(it2->Valid()); - - for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { - std::string value; - ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); - ASSERT_EQ(it1->value(), value); - } - for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { - std::string value; - ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); - ASSERT_EQ(it2->value(), value); - } - }; - for (int k = 0; k != 16; 
++k) { - ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(dbfull(), db_secondary_); - } -} - -TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { - const int kNumKeysPerMemtable = 1; - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", - "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); - SyncPoint::GetInstance()->EnableProcessing(); - const std::string kCFName1 = "pikachu"; - Options options; - options.env = env_; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 2; - options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); - CreateAndReopenWithCF({kCFName1}, options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondaryWithColumnFamilies({kCFName1}, options1); - ASSERT_EQ(2, handles_secondary_.size()); - - const auto& verify_db = [](DB* db1, - const std::vector& handles1, - DB* db2, - const std::vector& handles2) { - ASSERT_NE(nullptr, db1); - ASSERT_NE(nullptr, db2); - ReadOptions read_opts; - read_opts.verify_checksums = true; - ASSERT_EQ(handles1.size(), handles2.size()); - for (size_t i = 0; i != handles1.size(); ++i) { - std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); - std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); - it1->SeekToFirst(); - it2->SeekToFirst(); - for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { - ASSERT_EQ(it1->key(), it2->key()); - ASSERT_EQ(it1->value(), it2->value()); - } - ASSERT_FALSE(it1->Valid()); - ASSERT_FALSE(it2->Valid()); - - for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { - std::string value; - ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); - ASSERT_EQ(it1->value(), value); - } - for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { - std::string value; - 
ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); - ASSERT_EQ(it2->value(), value); - } - } - }; - for (int k = 0; k != 8; ++k) { - ASSERT_OK( - Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); - ASSERT_OK( - Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); - TEST_SYNC_POINT( - "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); - SyncPoint::GetInstance()->ClearTrace(); - } -} - -TEST_F(DBSecondaryTest, CatchUpAfterFlush) { - const int kNumKeysPerMemtable = 16; - Options options; - options.env = env_; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 2; - options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); - Reopen(options); - - Options options1; - options1.env = env_; - options1.max_open_files = -1; - OpenSecondary(options1); - - WriteOptions write_opts; - WriteBatch wb; - wb.Put("key0", "value0"); - wb.Put("key1", "value1"); - ASSERT_OK(dbfull()->Write(write_opts, &wb)); - ReadOptions read_opts; - std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); - iter1->Seek("key0"); - ASSERT_FALSE(iter1->Valid()); - iter1->Seek("key1"); - ASSERT_FALSE(iter1->Valid()); - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - iter1->Seek("key0"); - ASSERT_FALSE(iter1->Valid()); - iter1->Seek("key1"); - ASSERT_FALSE(iter1->Valid()); - std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); - iter2->Seek("key0"); - ASSERT_TRUE(iter2->Valid()); - ASSERT_EQ("value0", iter2->value()); - iter2->Seek("key1"); - ASSERT_TRUE(iter2->Valid()); - ASSERT_EQ("value1", iter2->value()); - - { - WriteBatch wb1; - wb1.Put("key0", "value01"); - wb1.Put("key1", "value11"); - ASSERT_OK(dbfull()->Write(write_opts, &wb1)); - } - - { - WriteBatch wb2; - wb2.Put("key0", "new_value0"); - wb2.Delete("key1"); - 
ASSERT_OK(dbfull()->Write(write_opts, &wb2)); - } - - ASSERT_OK(Flush()); - - ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); - // iter3 should not see value01 and value11 at all. - iter3->Seek("key0"); - ASSERT_TRUE(iter3->Valid()); - ASSERT_EQ("new_value0", iter3->value()); - iter3->Seek("key1"); - ASSERT_FALSE(iter3->Valid()); -} - -TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { - bool called = false; - Options options; - options.env = env_; - options.disable_auto_compactions = true; - Reopen(options); - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack( - "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { - ASSERT_NE(nullptr, arg); - called = true; - auto* s = reinterpret_cast(arg); - ASSERT_NOK(*s); - }); - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", - "BackgroundCallCompaction:0"}, - {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", - "DBImpl::CheckConsistency:BeforeGetFileSize"}}); - SyncPoint::GetInstance()->EnableProcessing(); - - ASSERT_OK(Put("a", "value0")); - ASSERT_OK(Put("c", "value0")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("b", "value1")); - ASSERT_OK(Put("d", "value1")); - ASSERT_OK(Flush()); - port::Thread thread([this]() { - Options opts; - opts.env = env_; - opts.max_open_files = -1; - OpenSecondary(opts); - }); - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - thread.join(); - ASSERT_TRUE(called); -} -#endif //! 
ROCKSDB_LITE - -} // namespace ROCKSDB_NAMESPACE - -int main(int argc, char** argv) { - ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,8 @@ namespace ROCKSDB_NAMESPACE { void DumpDBFileSummary(const ImmutableDBOptions& options, - const std::string& dbname) { + const std::string& dbname, + const std::string& session_id) { if (options.info_log == nullptr) { return; } @@ -32,6 +33,8 @@ std::string file_info, wal_info; Header(options.info_log, "DB SUMMARY\n"); + Header(options.info_log, "DB Session ID: %s\n", session_id.c_str()); + // Get files in dbname dir if (!env->GetChildren(dbname, &files).ok()) { Error(options.info_log, @@ -50,16 +53,25 @@ Header(options.info_log, "IDENTITY file: %s\n", file.c_str()); break; case kDescriptorFile: - env->GetFileSize(dbname + "/" + file, &file_size); - Header(options.info_log, "MANIFEST file: %s size: %" PRIu64 " Bytes\n", - file.c_str(), file_size); - break; - case kLogFile: - env->GetFileSize(dbname + "/" + file, &file_size); - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: "). 
- append(str).append(" ; "); + if (env->GetFileSize(dbname + "/" + file, &file_size).ok()) { + Header(options.info_log, + "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(), + file_size); + } else { + Error(options.info_log, "Error when reading MANIFEST file: %s/%s\n", + dbname.c_str(), file.c_str()); + } + break; + case kWalFile: + if (env->GetFileSize(dbname + "/" + file, &file_size).ok()) { + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); + } else { + Error(options.info_log, "Error when reading LOG file: %s/%s\n", + dbname.c_str(), file.c_str()); + } break; case kTableFile: if (++file_num < 10) { @@ -97,27 +109,30 @@ } // Get wal file in wal_dir - if (dbname.compare(options.wal_dir) != 0) { - if (!env->GetChildren(options.wal_dir, &files).ok()) { - Error(options.info_log, - "Error when reading %s dir\n", - options.wal_dir.c_str()); + const auto& wal_dir = options.GetWalDir(dbname); + if (!options.IsWalDirSameAsDBPath(dbname)) { + if (!env->GetChildren(wal_dir, &files).ok()) { + Error(options.info_log, "Error when reading %s dir\n", wal_dir.c_str()); return; } wal_info.clear(); for (const std::string& file : files) { if (ParseFileName(file, &number, &type)) { - if (type == kLogFile) { - env->GetFileSize(options.wal_dir + "/" + file, &file_size); - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: "). 
- append(str).append(" ; "); + if (type == kWalFile) { + if (env->GetFileSize(wal_dir + "/" + file, &file_size).ok()) { + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); + } else { + Error(options.info_log, "Error when reading LOG file %s/%s\n", + wal_dir.c_str(), file.c_str()); + } } } } } - Header(options.info_log, "Write Ahead Log file in %s: %s\n", - options.wal_dir.c_str(), wal_info.c_str()); + Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(), + wal_info.c_str()); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_info_dumper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_info_dumper.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,5 +10,6 @@ namespace ROCKSDB_NAMESPACE { void DumpDBFileSummary(const ImmutableDBOptions& options, - const std::string& dbname); + const std::string& dbname, + const std::string& session_id = ""); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_inplace_update_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,8 @@ class DBTestInPlaceUpdate : public DBTestBase { public: - DBTestInPlaceUpdate() : DBTestBase("/db_inplace_update_test") {} + DBTestInPlaceUpdate() + : DBTestBase("db_inplace_update_test", /*env_do_fsync=*/true) {} }; TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) { @@ -168,6 +169,36 @@ ASSERT_EQ(Get(1, "key"), "NOT_FOUND"); } while (ChangeCompactOptions()); } + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) { + do { + Options options = 
CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size, and + // run GetSnapshot and ReleaseSnapshot + int numValues = 2; + for (int i = numValues; i > 0; i--) { + const Snapshot* s = db_->GetSnapshot(); + ASSERT_EQ(nullptr, s); + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); + // release s (nullptr) + db_->ReleaseSnapshot(s); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_io_failure_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_io_failure_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_io_failure_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_io_failure_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,14 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBIOFailureTest : public DBTestBase { public: - DBIOFailureTest() : DBTestBase("/db_io_failure_test") {} + DBIOFailureTest() : DBTestBase("db_io_failure_test", /*env_do_fsync=*/true) {} }; #ifndef ROCKSDB_LITE @@ -33,7 +35,7 @@ // Force out-of-space errors env_->drop_writes_.store(true, std::memory_order_release); env_->sleep_counter_.Reset(); - env_->no_slowdown_ = true; + env_->SetMockSleep(); for (int i = 0; i < 5; i++) { if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { @@ -41,11 +43,15 @@ if (level > 0 && level == dbfull()->NumberLevels() - 1) { break; } - 
dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + Status s = + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } else { - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } @@ -54,7 +60,8 @@ ASSERT_EQ("5", property_value); env_->drop_writes_.store(false, std::memory_order_release); - ASSERT_LT(CountFiles(), num_files + 3); + const size_t count = CountFiles(); + ASSERT_LT(count, num_files + 3); // Check that compaction attempts slept after errors // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler @@ -80,7 +87,8 @@ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("0", property_value); - dbfull()->TEST_FlushMemTable(true); + // ASSERT file is too short + ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption()); ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("1", property_value); @@ -164,7 +172,7 @@ ASSERT_EQ("bar", Get("foo")); // Memtable compaction (will succeed) - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo")); const int last = 2; MoveFilesToLevel(2); @@ -172,7 +180,8 @@ // Merging compaction (will fail) error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_NOK( + dbfull()->TEST_CompactRange(last, nullptr, nullptr)); // Should fail ASSERT_EQ("bar", Get("foo")); error_type->store(false, std::memory_order_release); @@ -190,7 +199,13 @@ // Merging compaction (will fail) error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + Status s = + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + if (iter == 0) { 
+ ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsIOError()); + } ASSERT_EQ("bar", Get("foo")); // Recovery: should not lose data @@ -218,18 +233,15 @@ options.paranoid_checks = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should fail, too - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo3", "bar3")); // but we're still able to read ASSERT_EQ("bar", Get(1, "foo")); @@ -242,12 +254,10 @@ ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should NOT fail - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(1, "foo3", "bar3")); } #if !(defined NDEBUG) || !defined(OS_WIN) TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { @@ -260,29 +270,29 @@ options.writable_file_max_buffer_size = 128 * 1024; options.bytes_per_sync = 128 * 1024; options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(10)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(10)); BlockBasedTableOptions table_options; table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( 
"SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Random rnd(301); std::string rnd_str = - RandomString(&rnd, static_cast(options.bytes_per_sync / 2)); - std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024); + rnd.RandomString(static_cast(options.bytes_per_sync / 2)); + std::string rnd_str_512kb = rnd.RandomString(512 * 1024); ASSERT_OK(Put(1, "foo", "bar")); // First 1MB doesn't get range synced @@ -296,7 +306,9 @@ ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -326,12 +338,11 @@ options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; Random rnd(301); std::string rnd_str = - RandomString(&rnd, static_cast(options.bytes_per_sync / 2)); - std::string rnd_str_512kb = RandomString(&rnd, 512 * 1024); + rnd.RandomString(static_cast(options.bytes_per_sync / 2)); + std::string rnd_str_512kb = rnd.RandomString(512 * 1024); ASSERT_OK(Put(1, "foo", "bar")); // First 1MB doesn't get range synced @@ -340,21 +351,22 @@ ASSERT_OK(Put(1, "foo1_1", rnd_str)); ASSERT_OK(Put(1, "foo1_2", rnd_str)); ASSERT_OK(Put(1, "foo1_3", rnd_str)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo3_1", rnd_str)); ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - Flush(1); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -363,7 +375,9 @@ { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -383,17 +397,18 @@ options.error_if_exists = false; options.paranoid_checks = true; options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(2)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -402,7 +417,9 @@ ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -427,25 +444,25 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -454,7 +471,9 @@ { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -474,17 +493,18 @@ options.paranoid_checks = true; options.use_fsync = false; options.level0_file_num_compaction_trigger = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(2)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("sync dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -493,7 +513,9 @@ ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -519,25 +541,25 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -546,7 +568,9 @@ { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -564,5 +588,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,9 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_iter.h" -#include + #include #include +#include #include "db/dbformat.h" #include "db/merge_context.h" @@ -24,6 +25,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "trace_replay/trace_replay.h" @@ -34,21 +36,26 @@ namespace ROCKSDB_NAMESPACE { DBIter::DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, - const Comparator* cmp, InternalIterator* iter, SequenceNumber s, - bool arena_mode, uint64_t max_sequential_skip_in_iterations, + const Comparator* cmp, InternalIterator* iter, + const Version* version, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) + ColumnFamilyData* cfd, bool expose_blob_index) : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), env_(_env), - logger_(cf_options.info_log), + clock_(ioptions.clock), + logger_(ioptions.logger), user_comparator_(cmp), - merge_operator_(cf_options.merge_operator), + merge_operator_(ioptions.merge_operator.get()), iter_(iter), + version_(version), read_callback_(read_callback), sequence_(s), - statistics_(cf_options.statistics), + statistics_(ioptions.stats), + max_skip_(max_sequential_skip_in_iterations), + max_skippable_internal_keys_(read_options.max_skippable_internal_keys), num_internal_keys_skipped_(0), iterate_lower_bound_(read_options.iterate_lower_bound), iterate_upper_bound_(read_options.iterate_upper_bound), @@ -63,22 +70,26 @@ expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || read_options.auto_prefix_mode), - allow_blob_(allow_blob), + read_tier_(read_options.read_tier), + verify_checksums_(read_options.verify_checksums), + 
expose_blob_index_(expose_blob_index), is_blob_(false), arena_mode_(arena_mode), - range_del_agg_(&cf_options.internal_comparator, s), + range_del_agg_(&ioptions.internal_comparator, s), db_impl_(db_impl), cfd_(cfd), - start_seqnum_(read_options.iter_start_seqnum) { + start_seqnum_(read_options.iter_start_seqnum), + timestamp_ub_(read_options.timestamp), + timestamp_lb_(read_options.iter_start_ts), + timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) { RecordTick(statistics_, NO_ITERATOR_CREATED); - max_skip_ = max_sequential_skip_in_iterations; - max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; if (pin_thru_lifetime_) { pinned_iters_mgr_.StartPinning(); } if (iter_.iter()) { iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); } + assert(timestamp_size_ == user_comparator_.timestamp_size()); } Status DBIter::GetProperty(std::string prop_name, std::string* prop) { @@ -103,11 +114,11 @@ } bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_.key(), ikey)) { - status_ = Status::Corruption("corrupted internal key in DBIter"); + Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */); + if (!s.ok()) { + status_ = Status::Corruption("In DBIter: ", s.getState()); valid_ = false; - ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s", - iter_.key().ToString(true).c_str()); + ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState()); return false; } else { return true; @@ -118,7 +129,7 @@ assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); local_stats_.skip_count_ += num_internal_keys_skipped_; @@ -143,13 +154,13 @@ local_stats_.next_count_++; if (ok && iter_.Valid()) { - Slice prefix; if (prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix = prefix_.GetUserKey(); + const Slice prefix = 
prefix_.GetUserKey(); + FindNextUserEntry(true /* skipping the current user key */, &prefix); + } else { + FindNextUserEntry(true /* skipping the current user key */, nullptr); } - FindNextUserEntry(true /* skipping the current user key */, - prefix_same_as_start_ ? &prefix : nullptr); } else { is_key_seqnum_zero_ = false; valid_ = false; @@ -160,6 +171,43 @@ } } +bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, + const Slice& blob_index) { + assert(!is_blob_); + + if (expose_blob_index_) { // Stacked BlobDB implementation + is_blob_ = true; + return true; + } + + if (!version_) { + status_ = Status::Corruption("Encountered unexpected blob index."); + valid_ = false; + return false; + } + + // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to + // avoid having to copy options back and forth. + ReadOptions read_options; + read_options.read_tier = read_tier_; + read_options.verify_checksums = verify_checksums_; + + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr uint64_t* bytes_read = nullptr; + + const Status s = version_->GetBlob(read_options, user_key, blob_index, + prefetch_buffer, &blob_value_, bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + is_blob_ = true; + return true; +} + // PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge @@ -216,19 +264,28 @@ is_key_seqnum_zero_ = false; return false; } + Slice user_key_without_ts = + StripTimestampFromUserKey(ikey_.user_key, timestamp_size_); is_key_seqnum_zero_ = (ikey_.sequence == 0); - assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || - user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); - if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && - user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { + assert(iterate_upper_bound_ == 
nullptr || + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound || + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) < 0); + if (iterate_upper_bound_ != nullptr && + iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound && + user_comparator_.CompareWithoutTimestamp( + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { break; } assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) != + 0) { assert(prefix_same_as_start_); break; } @@ -237,24 +294,37 @@ return false; } - if (IsVisible(ikey_.sequence)) { + assert(ikey_.user_key.size() >= timestamp_size_); + Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_) + : Slice(); + bool more_recent = false; + if (IsVisible(ikey_.sequence, ts, &more_recent)) { // If the previous entry is of seqnum 0, the current entry will not // possibly be skipped. This condition can potentially be relaxed to // prev_key.seq <= ikey_.sequence. We are cautious because it will be more // prone to bugs causing the same user key with the same sequence number. - if (!is_prev_key_seqnum_zero && skipping_saved_key && - user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= - 0) { + // Note that with current timestamp implementation, the same user key can + // have different timestamps and zero sequence number on the bottommost + // level. This may change in the future. 
+ if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) && + skipping_saved_key && + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { assert(!skipping_saved_key || - user_comparator_.Compare(ikey_.user_key, - saved_key_.GetUserKey()) > 0); + CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0); + if (!iter_.PrepareValue()) { + assert(!iter_.status().ok()); + valid_ = false; + return false; + } num_skipped = 0; reseek_done = false; switch (ikey_.type) { case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. @@ -263,7 +333,20 @@ // 2) return ikey only if ikey.seqnum >= start_seqnum_ // note that if deletion seqnum is < start_seqnum_ we // just skip it like in normal iterator. - if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_) { + if (start_seqnum_ > 0) { + if (ikey_.sequence >= start_seqnum_) { + saved_key_.SetInternalKey(ikey_); + valid_ = true; + return true; + } else { + saved_key_.SetUserKey( + ikey_.user_key, + !pin_thru_lifetime_ || + !iter_.iter()->IsKeyPinned() /* copy */); + skipping_saved_key = true; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } + } else if (timestamp_lb_) { saved_key_.SetInternalKey(ikey_); valid_ = true; return true; @@ -278,11 +361,15 @@ case kTypeValue: case kTypeBlobIndex: if (start_seqnum_ > 0) { - // we are taking incremental snapshot here - // incremental snapshots aren't supported on DB with range deletes - assert(ikey_.type != kTypeBlobIndex); if (ikey_.sequence >= start_seqnum_) { saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + valid_ = true; return true; } else { @@ -294,6 +381,17 @@ !iter_.iter()->IsKeyPinned() /* copy */); skipping_saved_key = true; } + } 
else if (timestamp_lb_) { + saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + + valid_ = true; + return true; } else { saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || @@ -306,20 +404,13 @@ num_skipped = 0; reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else if (ikey_.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; - return false; + } else { + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } } - is_blob_ = true; - valid_ = true; - return true; - } else { valid_ = true; return true; } @@ -346,18 +437,23 @@ } break; default: - assert(false); - break; + valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(ikey_.type))); + return false; } } } else { - PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + if (more_recent) { + PERF_COUNTER_ADD(internal_recent_skipped_count, 1); + } - // This key was inserted after our snapshot was taken. - // If this happens too many times in a row for the same user key, we want - // to seek to the target sequence number. - int cmp = - user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()); + // This key was inserted after our snapshot was taken or skipped by + // timestamp range. If this happens too many times in a row for the same + // user key, we want to seek to the target sequence number. 
+ int cmp = user_comparator_.CompareWithoutTimestamp( + ikey_.user_key, saved_key_.GetUserKey()); if (cmp == 0 || (skipping_saved_key && cmp < 0)) { num_skipped++; } else { @@ -388,8 +484,17 @@ // We're looking for the next user-key but all we see are the same // user-key with decreasing sequence numbers. Fast forward to // sequence number 0 and type deletion (the smallest type). - AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), - 0, kTypeDeletion)); + if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); + } else { + const std::string kTsMin(timestamp_size_, '\0'); + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion), + kTsMin); + } // Don't set skipping_saved_key = false because we may still see more // user-keys equal to saved_key_. } else { @@ -398,9 +503,17 @@ // Note that this only covers a case when a higher key was overwritten // many times since our snapshot was taken, not the case when a lot of // different keys were inserted after our snapshot was taken. - AppendInternalKey(&last_key, - ParsedInternalKey(saved_key_.GetUserKey(), sequence_, - kValueTypeForSeek)); + if (timestamp_size_ == 0) { + AppendInternalKey( + &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } } iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); @@ -417,6 +530,7 @@ // Scan from the newer entries to older entries. 
// PRE: iter_.key() points to the first merge type entry // saved_key_ stores the user key +// iter_.PrepareValue() has been called // POST: saved_value_ has the merged value for the user key // iter_ points to the next entry (or invalid) bool DBIter::MergeValuesNewToOld() { @@ -436,7 +550,6 @@ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); ParsedInternalKey ikey; - Status s; for (iter_.Next(); iter_.Valid(); iter_.Next()) { TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); if (!ParseKey(&ikey)) { @@ -446,23 +559,26 @@ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { // hit the next user key, stop right here break; - } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || + } + if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || range_del_agg_.ShouldDelete( ikey, RangeDelPositioningMode::kForwardTraversal)) { // hit a delete with the same user key, stop right here // iter_ is positioned after delete iter_.Next(); break; - } else if (kTypeValue == ikey.type) { + } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (kTypeValue == ikey.type) { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! const Slice val = iter_.value(); - s = MergeHelper::TimedFullMerge( - merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, &pinned_value_, true); + Status s = Merge(&val, ikey.user_key); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } // iter_ is positioned after put @@ -479,19 +595,37 @@ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (kTypeBlobIndex == ikey.type) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; + } + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, ikey.user_key); + if (!s.ok()) { + return false; + } + is_blob_ = false; + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + return true; + } else { valid_ = false; + status_ = Status::Corruption( + "Unrecognized value type: " + + std::to_string(static_cast(ikey.type))); return false; - } else { - assert(false); } } @@ -504,16 +638,10 @@ // a deletion marker. // feed null as the existing value to the merge operator, such that // client can differentiate this scenario and do things accordingly. - s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(), - nullptr, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - assert(status_.ok()); return true; } @@ -522,7 +650,7 @@ assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); bool ok = true; @@ -557,9 +685,16 @@ // If that's the case, seek iter_ to current key. 
if (!expect_total_order_inner_iter() || !iter_.Valid()) { IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); iter_.Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kForward; @@ -610,6 +745,7 @@ iter_.SeekToLast(); } } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kReverse; @@ -624,7 +760,9 @@ assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(saved_key_.GetUserKey()) + prefix_extractor_ + ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(), + timestamp_size_)) .compare(*prefix) != 0) { assert(prefix_same_as_start_); // Current key does not have the same prefix as start @@ -633,11 +771,13 @@ } assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) >= 0); + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, + *iterate_lower_bound_, /*b_has_ts=*/false) >= 0); if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) < 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { // We've iterated earlier than the user-specified lower bound. 
valid_ = false; return; @@ -682,8 +822,8 @@ assert(iter_.Valid()); merge_context_.Clear(); current_entry_is_merged_ = false; - // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or - // kTypeValue) + // last entry before merge (could be kTypeDeletion, + // kTypeDeletionWithTimestamp, kTypeSingleDeletion or kTypeValue) ValueType last_not_merge_type = kTypeDeletion; ValueType last_key_entry_type = kTypeDeletion; @@ -697,10 +837,20 @@ return false; } - if (!IsVisible(ikey.sequence) || - !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + if (!IsVisible(ikey.sequence, ts) || + !user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { break; } + if (!ts.empty()) { + saved_timestamp_.assign(ts.data(), ts.size()); + } if (TooManyInternalKeysSkipped()) { return false; } @@ -712,6 +862,11 @@ return FindValueForCurrentKeyUsingSeek(); } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + last_key_entry_type = ikey.type; switch (last_key_entry_type) { case kTypeValue: @@ -720,14 +875,22 @@ ikey, RangeDelPositioningMode::kBackwardTraversal)) { last_key_entry_type = kTypeRangeDeletion; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else { - assert(iter_.iter()->IsValuePinned()); + } else if (iter_.iter()->IsValuePinned()) { pinned_value_ = iter_.value(); + } else { + valid_ = false; + status_ = Status::NotSupported( + "Backward iteration not supported if underlying iterator's value " + "cannot be pinned."); } merge_context_.Clear(); last_not_merge_type = last_key_entry_type; + if (!status_.ok()) { + return false; + } break; case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: merge_context_.Clear(); last_not_merge_type = last_key_entry_type; @@ -749,7 +912,11 @@ } break; 
default: - assert(false); + valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(last_key_entry_type))); + return false; } PERF_COUNTER_ADD(internal_key_skipped_count, 1); @@ -763,9 +930,11 @@ } Status s; + s.PermitUncheckedError(); is_blob_ = false; switch (last_key_entry_type) { case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: valid_ = false; @@ -775,47 +944,52 @@ if (last_not_merge_type == kTypeDeletion || last_not_merge_type == kTypeSingleDeletion || last_not_merge_type == kTypeRangeDeletion) { - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(nullptr, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } else if (last_not_merge_type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; } else { assert(last_not_merge_type == kTypeValue); - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &pinned_value_, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(&pinned_value_, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } break; case kTypeValue: // do nothing - we've already has value in pinned_value_ break; case kTypeBlobIndex: - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { return false; } - is_blob_ = true; break; default: - assert(false); - break; + valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(last_key_entry_type))); + return false; } if (!s.ok()) { valid_ = false; @@ -835,8 +1009,17 @@ // FindValueForCurrentKeyUsingSeek() assert(pinned_iters_mgr_.PinningEnabled()); std::string last_key; - AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), - sequence_, kValueTypeForSeek)); + if (0 == timestamp_size_) { + AppendInternalKey(&last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); @@ -853,7 +1036,15 @@ if (!ParseKey(&ikey)) { return false; } - if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { // No visible values for this key, even though FindValueForCurrentKey() // has seen some. This is possible if we're using a tailing iterator, and // the entries were discarded in a compaction. 
@@ -861,7 +1052,7 @@ return true; } - if (IsVisible(ikey.sequence)) { + if (IsVisible(ikey.sequence, ts)) { break; } @@ -870,22 +1061,28 @@ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || range_del_agg_.ShouldDelete( - ikey, RangeDelPositioningMode::kBackwardTraversal)) { + ikey, RangeDelPositioningMode::kBackwardTraversal) || + kTypeDeletionWithTimestamp == ikey.type) { valid_ = false; return true; } - if (ikey.type == kTypeBlobIndex && !allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); + if (!iter_.PrepareValue()) { valid_ = false; return false; } + if (timestamp_size_ > 0) { + Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_); + saved_timestamp_.assign(ts.data(), ts.size()); + } if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_.iter()->IsValuePinned()); pinned_value_ = iter_.value(); - is_blob_ = (ikey.type == kTypeBlobIndex); + if (ikey.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { + return false; + } + } + valid_ = true; return true; } @@ -913,52 +1110,56 @@ if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { break; } - if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || range_del_agg_.ShouldDelete( ikey, RangeDelPositioningMode::kForwardTraversal)) { break; - } else if (ikey.type == kTypeValue) { + } + if (!iter_.PrepareValue()) { + valid_ = false; + return false; + } + + if (ikey.type == kTypeValue) { const Slice val = iter_.value(); - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &val, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + Status s = Merge(&val, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - valid_ = 
true; return true; } else if (ikey.type == kTypeMerge) { merge_context_.PushOperand( iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (ikey.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; + } + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; + } else { valid_ = false; + status_ = Status::Corruption( + "Unknown value type: " + + std::to_string(static_cast(ikey.type))); return false; - } else { - assert(false); } } - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } @@ -981,6 +1182,19 @@ return true; } +Status DBIter::Merge(const Slice* val, const Slice& user_key) { + Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key, val, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, clock_, &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return s; + } + valid_ = true; + return s; +} + // Move backwards until the key smaller than saved_key_. // Changes valid_ only if return value is false. 
bool DBIter::FindUserKeyBeforeSavedKey() { @@ -992,7 +1206,8 @@ return false; } - if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) { + if (user_comparator_.CompareWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey()) < 0) { return true; } @@ -1001,7 +1216,13 @@ } assert(ikey.sequence != kMaxSequenceNumber); - if (!IsVisible(ikey.sequence)) { + assert(ikey.user_key.size() >= timestamp_size_); + Slice ts; + if (timestamp_size_ > 0) { + ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_, + timestamp_size_); + } + if (!IsVisible(ikey.sequence, ts)) { PERF_COUNTER_ADD(internal_recent_skipped_count, 1); } else { PERF_COUNTER_ADD(internal_key_skipped_count, 1); @@ -1010,8 +1231,14 @@ if (num_skipped >= max_skip_) { num_skipped = 0; IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); // It would be more efficient to use SeekForPrev() here, but some // iterators may not support it. iter_.Seek(last_key.GetInternalKey()); @@ -1046,26 +1273,40 @@ return false; } -bool DBIter::IsVisible(SequenceNumber sequence) { - if (read_callback_ == nullptr) { - return sequence <= sequence_; - } else { - return read_callback_->IsVisible(sequence); +bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts, + bool* more_recent) { + // Remember that comparator orders preceding timestamp as larger. + // TODO(yanqin): support timestamp in read_callback_. + bool visible_by_seq = (read_callback_ == nullptr) + ? 
sequence <= sequence_ + : read_callback_->IsVisible(sequence); + + bool visible_by_ts = + (timestamp_ub_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) && + (timestamp_lb_ == nullptr || + user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0); + + if (more_recent) { + *more_recent = !visible_by_seq; } + return visible_by_seq && visible_by_ts; } void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { is_key_seqnum_zero_ = false; SequenceNumber seq = sequence_; saved_key_.Clear(); - saved_key_.SetInternalKey(target, seq); + saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_); if (iterate_lower_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < - 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { // Seek key is smaller than the lower bound. saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_lower_bound_, seq); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek, + timestamp_ub_); } } @@ -1074,23 +1315,50 @@ saved_key_.Clear(); // now saved_key is used to store internal key. 
saved_key_.SetInternalKey(target, 0 /* sequence_number */, - kValueTypeForSeekForPrev); + kValueTypeForSeekForPrev, timestamp_ub_); + + if (timestamp_size_ > 0) { + const std::string kTsMin(timestamp_size_, '\0'); + Slice ts = kTsMin; + saved_key_.UpdateInternalKey(/*seq=*/0, kValueTypeForSeekForPrev, &ts); + } if (iterate_upper_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_upper_bound_) >= 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber, + kValueTypeForSeekForPrev, timestamp_ub_); + if (timestamp_size_ > 0) { + const std::string kTsMax(timestamp_size_, '\xff'); + Slice ts = kTsMax; + saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev, + &ts); + } } } void DBIter::Seek(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeek(cfd_->GetID(), target); + // TODO: What do we do if this returns an error? + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -1118,7 +1386,7 @@ // we need to find out the next key that is visible to the user. 
ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1146,12 +1414,27 @@ } void DBIter::SeekForPrev(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + // TODO: What do we do if this returns an error? + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_ + ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound, + upper_bound) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -1178,7 +1461,7 @@ // backward direction. ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1205,7 +1488,7 @@ Seek(*iterate_lower_bound_); return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. 
if (!expect_total_order_inner_iter()) { @@ -1243,7 +1526,8 @@ } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } @@ -1251,14 +1535,16 @@ if (iterate_upper_bound_ != nullptr) { // Seek to last key strictly less than ReadOptions.iterate_upper_bound. SeekForPrev(*iterate_upper_bound_); - if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { + if (Valid() && 0 == user_comparator_.CompareWithoutTimestamp( + *iterate_upper_bound_, /*a_has_ts=*/false, key(), + /*b_has_ts=*/false)) { ReleaseTempPinnedData(); PrevInternal(nullptr); } return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. if (!expect_total_order_inner_iter()) { @@ -1287,23 +1573,25 @@ } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, - InternalIterator* internal_iter, + InternalIterator* internal_iter, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) { - DBIter* db_iter = new DBIter( - env, read_options, cf_options, mutable_cf_options, user_key_comparator, - internal_iter, sequence, false, max_sequential_skip_in_iterations, - 
read_callback, db_impl, cfd, allow_blob); + ColumnFamilyData* cfd, bool expose_blob_index) { + DBIter* db_iter = + new DBIter(env, read_options, ioptions, mutable_cf_options, + user_key_comparator, internal_iter, version, sequence, false, + max_sequential_skip_in_iterations, read_callback, db_impl, cfd, + expose_blob_index); return db_iter; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,10 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include +#include #include + #include "db/db_impl/db_impl.h" -#include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "memory/arena.h" #include "options/cf_options.h" @@ -21,6 +21,7 @@ #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +class Version; // This file declares the factory functions of DBIter, in its original form // or a wrapped form with class ArenaWrappedDBIter, which is defined here. @@ -66,7 +67,7 @@ // this->key(). // (2) When moving backwards, the internal iterator is positioned // just before all entries whose user key == this->key(). 
- enum Direction { kForward, kReverse }; + enum Direction : uint8_t { kForward, kReverse }; // LocalStatistics contain Statistics counters that will be aggregated per // each iterator instance and then will be sent to the global statistics when @@ -112,12 +113,12 @@ }; DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, + InternalIterator* iter, const Version* version, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob); + bool expose_blob_index); // No copying allowed DBIter(const DBIter&) = delete; @@ -140,18 +141,29 @@ } ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; } - bool Valid() const override { return valid_; } + bool Valid() const override { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (valid_) { + status_.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + return valid_; + } Slice key() const override { assert(valid_); - if (start_seqnum_ > 0) { + if (start_seqnum_ > 0 || timestamp_lb_) { return saved_key_.GetInternalKey(); } else { - return saved_key_.GetUserKey(); + const Slice ukey_and_ts = saved_key_.GetUserKey(); + return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_); } } Slice value() const override { assert(valid_); - if (current_entry_is_merged_) { + + if (!expose_blob_index_ && is_blob_) { + return blob_value_; + } else if (current_entry_is_merged_) { // If pinned_value_ is set then the result of merge operator is one of // the merge operands and we should return it. return pinned_value_.data() ? 
pinned_value_ : saved_value_; @@ -169,8 +181,18 @@ return status_; } } + Slice timestamp() const override { + assert(valid_); + assert(timestamp_size_ > 0); + if (direction_ == kReverse) { + return saved_timestamp_; + } + const Slice ukey_and_ts = saved_key_.GetUserKey(); + assert(timestamp_size_ < ukey_and_ts.size()); + return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_); + } bool IsBlob() const { - assert(valid_ && (allow_blob_ || !is_blob_)); + assert(valid_); return is_blob_; } @@ -178,6 +200,8 @@ void Next() final override; void Prev() final override; + // 'target' does not contain timestamp, even if user timestamp feature is + // enabled. void Seek(const Slice& target) final override; void SeekForPrev(const Slice& target) final override; void SeekToFirst() final override; @@ -210,7 +234,7 @@ // If `skipping_saved_key` is true, the function will keep iterating until it // finds a user key that is larger than `saved_key_`. // If `prefix` is not null, the iterator needs to stop when all keys for the - // prefix are exhausted and the interator is set to invalid. + // prefix are exhausted and the iterator is set to invalid. bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); // Internal implementation of FindNextUserEntry(). bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); @@ -221,7 +245,8 @@ // entry can be found within the prefix. void PrevInternal(const Slice* prefix); bool TooManyInternalKeysSkipped(bool increment = true); - bool IsVisible(SequenceNumber sequence); + bool IsVisible(SequenceNumber sequence, const Slice& ts, + bool* more_recent = nullptr); // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() // is called @@ -260,12 +285,29 @@ return expect_total_order_inner_iter_; } + // If lower bound of timestamp is given by ReadOptions.iter_start_ts, we need + // to return versions of the same key. 
We cannot just skip if the key value + // is the same but timestamps are different but fall in timestamp range. + inline int CompareKeyForSkip(const Slice& a, const Slice& b) { + return timestamp_lb_ != nullptr + ? user_comparator_.Compare(a, b) + : user_comparator_.CompareWithoutTimestamp(a, b); + } + + // Retrieves the blob value for the specified user key using the given blob + // index when using the integrated BlobDB implementation. + bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index); + + Status Merge(const Slice* val, const Slice& user_key); + const SliceTransform* prefix_extractor_; Env* const env_; + SystemClock* clock_; Logger* logger_; UserComparatorWrapper user_comparator_; const MergeOperator* const merge_operator_; IteratorWrapper iter_; + const Version* version_; ReadCallback* read_callback_; // Max visible sequence number. It is normally the snapshot seq unless we have // uncommitted data in db as in WriteUnCommitted. @@ -279,6 +321,7 @@ std::string saved_value_; Slice pinned_value_; // for prefix seek mode to support prev() + PinnableSlice blob_value_; Statistics* statistics_; uint64_t max_skip_; uint64_t max_skippable_internal_keys_; @@ -308,7 +351,11 @@ // Expect the inner iterator to maintain a total order. // prefix_extractor_ must be non-NULL if the value is false. const bool expect_total_order_inner_iter_; - bool allow_blob_; + ReadTier read_tier_; + bool verify_checksums_; + // Whether the iterator is allowed to expose blob references. Set to true when + // the stacked BlobDB implementation is used, false otherwise. + bool expose_blob_index_; bool is_blob_; bool arena_mode_; // List of operands for merge operator. 
@@ -327,18 +374,22 @@ // for diff snapshots we want the lower bound on the seqnum; // if this value > 0 iterator will return internal keys SequenceNumber start_seqnum_; + const Slice* const timestamp_ub_; + const Slice* const timestamp_lb_; + const size_t timestamp_size_; + std::string saved_timestamp_; }; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. extern Iterator* NewDBIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, InternalIterator* internal_iter, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false); + const Version* version, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_stress_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -97,7 +97,8 @@ bool MaybeFail() { if (rnd->Next() >= - std::numeric_limits::max() * error_probability) { + static_cast(std::numeric_limits::max()) * + error_probability) { return false; } if (rnd->Next() % 2) { @@ -114,7 +115,8 @@ void MaybeMutate() { if (rnd->Next() >= - std::numeric_limits::max() * mutation_probability) { + static_cast(std::numeric_limits::max()) * + 
mutation_probability) { return; } do { @@ -126,8 +128,9 @@ if (data->hidden.empty()) { hide_probability = 1; } - bool do_hide = - rnd->Next() < std::numeric_limits::max() * hide_probability; + bool do_hide = rnd->Next() < + static_cast(std::numeric_limits::max()) * + hide_probability; if (do_hide) { // Hide a random entry. size_t idx = rnd->Next() % data->entries.size(); @@ -508,9 +511,9 @@ target_hidden_fraction; internal_iter->trace = trace; db_iter.reset(NewDBIterator( - env_, ropt, ImmutableCFOptions(options), + env_, ropt, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), - internal_iter, sequence, + internal_iter, nullptr /* version */, sequence, options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -99,9 +99,11 @@ } for (auto it = data_.begin(); it != data_.end(); ++it) { ParsedInternalKey ikey; - bool ok __attribute__((__unused__)) = ParseInternalKey(it->first, &ikey); - assert(ok); - if (ikey.user_key != _key) { + Status pik_status = + ParseInternalKey(it->first, &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + if (!pik_status.ok() || ikey.user_key != _key) { continue; } if (valid_ && data_.begin() + iter_ > it) { @@ -235,7 +237,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -250,9 +252,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, 
cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -283,9 +286,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -310,9 +314,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -343,9 +348,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -379,12 +385,14 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test case to check SeekToLast with iterate_upper_bound set // (same key put may times - SeekToLast should start with the @@ -409,9 +417,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -447,9 +456,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -473,12 +483,14 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* 
sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test to check the SeekToLast() with the iterate_upper_bound set // (Deletion cases) @@ -496,9 +508,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -532,9 +545,10 @@ ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -562,9 +576,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -583,6 +598,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -605,9 +621,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, 
cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -637,9 +654,10 @@ ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -659,7 +677,7 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -668,11 +686,13 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -680,11 +700,13 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, 
options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -703,9 +725,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -726,6 +749,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u); } @@ -733,7 +757,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -748,9 +772,10 @@ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ 
-769,6 +794,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -784,9 +810,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -800,6 +827,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -813,9 +841,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 202, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 202 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -834,6 +863,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -846,14 +876,17 @@ internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TestIterator* 
internal_iter = new TestIterator(BytewiseComparator()); @@ -863,9 +896,10 @@ internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 200, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 200 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -873,6 +907,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -881,6 +916,7 @@ db_iter->Next(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -898,9 +934,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -919,6 +956,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -933,9 +971,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); @@ -958,13 +997,14 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } } TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -983,9 +1023,10 @@ ro.max_skippable_internal_keys = 0; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1013,7 +1054,7 @@ db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); - ASSERT_TRUE(db_iter->status().ok()); + ASSERT_OK(db_iter->status()); } // Test to make sure that the request will *not* fail as incomplete if @@ -1030,9 +1071,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1075,9 +1117,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, 
nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1114,9 +1157,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1150,9 +1194,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1181,9 +1226,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1219,9 +1265,10 @@ ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, 
mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1257,9 +1304,10 @@ ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1311,9 +1359,10 @@ options.max_sequential_skip_in_iterations = 1000; ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1350,9 +1399,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1378,9 +1428,10 @@ internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 0 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1403,9 +1454,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1428,9 +1480,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1447,7 +1500,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions 
mutable_cf_options = MutableCFOptions(options); { @@ -1462,9 +1515,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1485,9 +1539,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1508,9 +1563,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1531,9 +1587,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + 
internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1554,9 +1611,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1577,9 +1635,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1600,9 +1659,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1621,9 +1681,10 @@ internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, 
cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1637,7 +1698,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1652,9 +1713,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1675,9 +1737,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1698,9 +1761,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, 
BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1721,9 +1785,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1740,9 +1805,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1763,9 +1829,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), 
"a"); @@ -1786,9 +1853,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1802,7 +1870,7 @@ ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1829,9 +1897,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1864,9 +1933,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1905,9 +1975,10 @@ internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1946,9 +2017,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1992,9 +2064,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2039,9 +2112,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2080,9 +2154,10 @@ internal_iter->Finish(); 
std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 9, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 9 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2127,9 +2202,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 13, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 13 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2175,9 +2251,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 14, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 14 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2206,9 +2283,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2237,9 +2315,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2304,9 +2383,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -2344,9 +2424,9 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0 /* force seek */, - nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2373,9 +2453,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, 
ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2400,8 +2481,9 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -2437,8 +2519,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, 3 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), key); @@ -2465,8 +2549,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, 1 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2492,19 +2578,21 @@ internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; std::string user_keys[4] = {"1","2","3","4"}; std::string values[4] = {"1c", "2c", "3c", "4b"}; int i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { - FullKey fkey; - ParseFullKey(db_iter->key(), &fkey); + ParsedInternalKey fkey; + ASSERT_OK( + ParseInternalKey(db_iter->key(), &fkey, true /* log_err_key */)); ASSERT_EQ(user_keys[i], fkey.user_key.ToString()); - ASSERT_EQ(EntryType::kEntryPut, fkey.type); + ASSERT_EQ(kTypeValue, fkey.type); ASSERT_EQ(seqnums[i], fkey.sequence); ASSERT_EQ(values[i], db_iter->value().ToString()); i++; @@ -2527,19 +2615,21 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; - EntryType key_types[4] = {EntryType::kEntryDelete,EntryType::kEntryDelete, - EntryType::kEntryDelete,EntryType::kEntryPut}; + ValueType key_types[4] = {kTypeDeletion, kTypeDeletion, kTypeDeletion, + kTypeValue}; std::string user_keys[4] = {"1","2","3","4"}; std::string values[4] = {"", "", "", "4b"}; int i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); 
db_iter->Next()) { - FullKey fkey; - ParseFullKey(db_iter->key(), &fkey); + ParsedInternalKey fkey; + ASSERT_OK( + ParseInternalKey(db_iter->key(), &fkey, true /* log_err_key */)); ASSERT_EQ(user_keys[i], fkey.user_key.ToString()); ASSERT_EQ(key_types[i], fkey.type); ASSERT_EQ(seqnums[i], fkey.sequence); @@ -2577,10 +2667,10 @@ NewMergingIterator(&icomp_, &child_iters[0], 2u); db_iter_.reset(NewDBIterator( - env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_), - BytewiseComparator(), merge_iter, + env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_), + BytewiseComparator(), merge_iter, nullptr /* version */, 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */, nullptr /*read_callback*/)); + 3 /* max iterators before reseek */, nullptr /* read_callback */)); } Env* env_; @@ -3017,9 +3107,10 @@ ro.prefix_same_as_start = true; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); int skipped_keys = 0; @@ -3053,15 +3144,16 @@ ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); db_iter->SeekToFirst(); if (i == kNumKeys + 1) { // lower bound was beyond the last key ASSERT_FALSE(db_iter->Valid()); + 
ASSERT_OK(db_iter->status()); } else { ASSERT_TRUE(db_iter->Valid()); int expected; @@ -3092,9 +3184,10 @@ ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); for (int i = kNumKeys; i >= kLowerBound; --i) { @@ -3120,9 +3213,10 @@ ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); auto before_lower_bound_str = std::to_string(kLowerBound - 1); Slice before_lower_bound(lower_bound_str); @@ -3145,9 +3239,10 @@ internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ReadOptions(), ImmutableCFOptions(options), - MutableCFOptions(options), BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekForPrev("a"); ASSERT_TRUE(db_iter->Valid()); diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iterator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iterator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_iterator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_iterator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,8 @@ #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" #include "table/block_based/flush_block_policy.h" +#include "util/random.h" +#include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { @@ -33,14 +35,15 @@ class DBIteratorTest : public DBTestBase, public testing::WithParamInterface { public: - DBIteratorTest() : DBTestBase("/db_iterator_test") {} + DBIteratorTest() : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {} Iterator* NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family = nullptr) { if (column_family == nullptr) { column_family = db_->DefaultColumnFamily(); } - auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* cfd = + static_cast_with_check(column_family)->cfd(); SequenceNumber seq = read_options.snapshot != nullptr ? read_options.snapshot->GetSequenceNumber() : db_->GetLatestSequenceNumber(); @@ -65,8 +68,8 @@ // The test needs to be changed if kPersistedTier is supported in iterator. 
Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "1", "2"); - Delete(1, "2"); + ASSERT_OK(Put(1, "1", "2")); + ASSERT_OK(Delete(1, "2")); ReadOptions ropt; ropt.pin_data = false; { @@ -170,10 +173,10 @@ TEST_P(DBIteratorTest, IterSeekBeforePrev) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->Seek(Slice("c")); @@ -193,11 +196,11 @@ options.compression = kNoCompression; Reopen(options); - ASSERT_OK(Put("a", RandomString(&rnd, 400))); - ASSERT_OK(Put("aabb", RandomString(&rnd, 400))); - ASSERT_OK(Put("aaef", RandomString(&rnd, 400))); - ASSERT_OK(Put("b", RandomString(&rnd, 400))); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("a", rnd.RandomString(400))); + ASSERT_OK(Put("aabb", rnd.RandomString(400))); + ASSERT_OK(Put("aaef", rnd.RandomString(400))); + ASSERT_OK(Put("b", rnd.RandomString(400))); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ReadOptions opts; Slice ub = Slice("aa"); opts.iterate_upper_bound = &ub; @@ -213,10 +216,10 @@ TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->SeekForPrev(Slice("0")); @@ -236,7 +239,7 @@ ASSERT_OK(Put(MakeLongKey(20, 0), "0")); ASSERT_OK(Put(MakeLongKey(32, 2), "2")); ASSERT_OK(Put("a", "b")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put(MakeLongKey(50, 1), "1")); ASSERT_OK(Put(MakeLongKey(127, 3), "3")); 
ASSERT_OK(Put(MakeLongKey(64, 4), "4")); @@ -274,7 +277,7 @@ TEST_P(DBIteratorTest, IterNextWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -300,7 +303,7 @@ TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -331,7 +334,7 @@ TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("e", "f")); @@ -375,6 +378,8 @@ iter->SeekForPrev("foo"); ASSERT_EQ(IterStatus(iter), "(invalid)"); + ASSERT_OK(iter->status()); + delete iter; } while (ChangeCompactOptions()); } @@ -615,6 +620,40 @@ delete iter; } +TEST_F(DBIteratorTest, ReseekUponDirectionChange) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.merge_operator.reset( + new StringAppendTESTOperator(/*delim_char=*/' ')); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->SeekToLast(); + it->Prev(); + it->Next(); + } + ASSERT_EQ(1, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + const std::string merge_key("good"); + ASSERT_OK(Put(merge_key, "orig")); + ASSERT_OK(Merge(merge_key, "suffix")); + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek(merge_key); + ASSERT_TRUE(it->Valid()); + const uint64_t prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Prev(); + 
ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION)); + } +} + TEST_P(DBIteratorTest, IterSmallAndLargeMix) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -781,18 +820,18 @@ TEST_P(DBIteratorTest, IteratorPinsRef) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "hello"); + ASSERT_OK(Put(1, "foo", "hello")); // Get iterator that will yield the current contents of the DB. Iterator* iter = NewIterator(ReadOptions(), handles_[1]); // Write to force compactions - Put(1, "foo", "newvalue1"); + ASSERT_OK(Put(1, "foo", "newvalue1")); for (int i = 0; i < 100; i++) { // 100K values ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); } - Put(1, "foo", "newvalue2"); + ASSERT_OK(Put(1, "foo", "newvalue2")); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -807,8 +846,8 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "delete-cf-then-delete-iter"); - Put(1, "hello", "value2"); + ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter")); + ASSERT_OK(Put(1, "hello", "value2")); ColumnFamilyHandle* cf = handles_[1]; ReadOptions ro; @@ -818,7 +857,7 @@ ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter"); // delete CF handle - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is deleted @@ -830,7 +869,7 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "drop-cf-then-delete-iter"); + ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter")); ReadOptions ro; ColumnFamilyHandle* cf = handles_[1]; @@ -840,8 +879,8 @@ ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter"); // drop and delete CF - db_->DropColumnFamily(cf); - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DropColumnFamily(cf)); + 
EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is dropped @@ -1167,32 +1206,62 @@ ropt.tailing = tailing; std::unique_ptr iter(NewIterator(ropt)); + ropt.read_tier = ReadTier::kBlockCacheTier; + std::unique_ptr nonblocking_iter(NewIterator(ropt)); + iter->Seek("b10"); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("b2", iter->key().ToString()); EXPECT_EQ("y2", iter->value().ToString()); EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + // The cache-only iterator should succeed too, using the blocks pulled into + // the cache by the previous iterator. + nonblocking_iter->Seek("b10"); + ASSERT_TRUE(nonblocking_iter->Valid()); + EXPECT_EQ("b2", nonblocking_iter->key().ToString()); + EXPECT_EQ("y2", nonblocking_iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // ... but it shouldn't be able to step forward since the next block is + // not in cache yet. + nonblocking_iter->Next(); + ASSERT_FALSE(nonblocking_iter->Valid()); + ASSERT_TRUE(nonblocking_iter->status().IsIncomplete()); + + // ... nor should a seek to the next key succeed. + nonblocking_iter->Seek("b20"); + ASSERT_FALSE(nonblocking_iter->Valid()); + ASSERT_TRUE(nonblocking_iter->status().IsIncomplete()); + iter->Next(); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("b3", iter->key().ToString()); EXPECT_EQ("y3", iter->value().ToString()); - EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); - EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // After the blocking iterator loaded the next block, the nonblocking + // iterator's seek should succeed. 
+ nonblocking_iter->Seek("b20"); + ASSERT_TRUE(nonblocking_iter->Valid()); + EXPECT_EQ("b3", nonblocking_iter->key().ToString()); + EXPECT_EQ("y3", nonblocking_iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); iter->Seek("c0"); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("c0", iter->key().ToString()); EXPECT_EQ("z1,z2", iter->value().ToString()); - EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(6, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); iter->Next(); ASSERT_TRUE(iter->Valid()); EXPECT_EQ("c3", iter->key().ToString()); EXPECT_EQ("z3", iter->value().ToString()); - EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); iter.reset(); @@ -1207,13 +1276,13 @@ ASSERT_TRUE(iter->Valid()); EXPECT_EQ("b2", iter->key().ToString()); EXPECT_EQ("y2", iter->value().ToString()); - EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); iter->Next(); ASSERT_FALSE(iter->Valid()); - EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); - EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); } } @@ -1275,9 +1344,9 @@ // write three entries with different keys using Merge() WriteOptions wopts; - db_->Merge(wopts, "1", "data1"); - db_->Merge(wopts, "2", "data2"); - db_->Merge(wopts, "3", "data3"); + ASSERT_OK(db_->Merge(wopts, "1", "data1")); + ASSERT_OK(db_->Merge(wopts, "2", "data2")); + 
ASSERT_OK(db_->Merge(wopts, "3", "data3")); std::unique_ptr it(NewIterator(ReadOptions())); @@ -1329,7 +1398,7 @@ std::vector generated_keys(key_pool); for (int i = 0; i < key_pool; i++) { - generated_keys[i] = RandomString(&rnd, key_size); + generated_keys[i] = rnd.RandomString(key_size); } std::map true_data; @@ -1337,7 +1406,7 @@ std::vector deleted_keys; for (int i = 0; i < puts; i++) { auto& k = generated_keys[rnd.Next() % key_pool]; - auto v = RandomString(&rnd, val_size); + auto v = rnd.RandomString(val_size); // Insert data to true_data map and to DB true_data[k] = v; @@ -1361,7 +1430,7 @@ if (run_config == TestConfig::FLUSH_EVERY_1000) { if (i && i % 1000 == 0) { - Flush(); + ASSERT_OK(Flush()); } } } @@ -1370,7 +1439,7 @@ Close(); Reopen(options); } else if (run_config == TestConfig::COMPACT_BEFORE_READ) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } ReadOptions ro; @@ -1467,9 +1536,11 @@ } }; +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) { PinnedDataIteratorRandomized(TestConfig::NORMAL); } +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) { PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN); @@ -1484,6 +1555,10 @@ PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000); } +INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance, + DBIteratorTestForPinnedData, + testing::Values(true, false)); + #ifndef ROCKSDB_LITE TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) { Options options = CurrentOptions(); @@ -1500,7 +1575,7 @@ Random rnd(301); for (int i = 1; i <= 1000; i++) { std::string k = Key(i * 3); - std::string v = RandomString(&rnd, 100); + std::string v = rnd.RandomString(100); ASSERT_OK(Put(k, v)); true_data[k] = v; if (i % 250 == 0) { 
@@ -1514,7 +1589,7 @@ // Generate 4 sst files in L0 for (int i = 1; i <= 1000; i++) { std::string k = Key(i * 2); - std::string v = RandomString(&rnd, 100); + std::string v = rnd.RandomString(100); ASSERT_OK(Put(k, v)); true_data[k] = v; if (i % 250 == 0) { @@ -1526,7 +1601,7 @@ // Add some keys/values in memtables for (int i = 1; i <= 1000; i++) { std::string k = Key(i); - std::string v = RandomString(&rnd, 100); + std::string v = rnd.RandomString(100); ASSERT_OK(Put(k, v)); true_data[k] = v; } @@ -1628,8 +1703,8 @@ std::map true_data; for (int i = 0; i < 1000; i++) { - std::string k = RandomString(&rnd, 10); - std::string v = RandomString(&rnd, 1000); + std::string k = rnd.RandomString(10); + std::string v = rnd.RandomString(1000); ASSERT_OK(Put(k, v)); true_data[k] = v; } @@ -1643,7 +1718,7 @@ if (rnd.OneIn(2)) { ASSERT_OK(Delete(kv.first)); } else { - std::string new_val = RandomString(&rnd, 1000); + std::string new_val = rnd.RandomString(1000); ASSERT_OK(Put(kv.first, new_val)); } } @@ -1736,6 +1811,7 @@ Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1791,6 +1867,7 @@ Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1873,7 +1950,7 @@ DestroyAndReopen(options); const int kNumKeys = 500; - // Small number of merge operands to make sure that DBIter::Prev() dont + // Small number of merge operands to make sure that DBIter::Prev() don't // fall back to Seek() const int kNumMergeOperands = 3; // Use value size that will make sure that every block contain 1 key @@ -1900,7 +1977,7 @@ for (int i = 0; i < kNumKeys; i++) { gen_key = Key(i); - gen_val = RandomString(&rnd, kValSize); + gen_val = rnd.RandomString(kValSize); ASSERT_OK(Put(gen_key, gen_val)); true_data[gen_key] = gen_val; @@ -1908,7 +1985,7 @@ ASSERT_OK(Flush()); // Separate values and merge operands in different file so that we - 
// make sure that we dont merge them while flushing but actually + // make sure that we don't merge them while flushing but actually // merge them in the read path for (int i = 0; i < kNumKeys; i++) { if (rnd.PercentTrue(kNoMergeOpPercentage)) { @@ -1918,7 +1995,7 @@ for (int j = 0; j < kNumMergeOperands; j++) { gen_key = Key(i); - gen_val = RandomString(&rnd, kValSize); + gen_val = rnd.RandomString(kValSize); ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val)); true_data[gen_key] += "," + gen_val; @@ -2018,7 +2095,7 @@ Random rnd(301); for (int i = 0; i < 1000; i++) { // Key 10 bytes / Value 10 bytes - ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } std::atomic total_next(0); @@ -2114,24 +2191,24 @@ BlockBasedTableOptions table_options; table_options.block_size = 1024; table_options.no_block_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string value(1024, 'a'); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(2); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(1); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); #ifndef ROCKSDB_LITE @@ -2238,6 +2315,7 @@ ASSERT_OK(Put("x", "y")); std::unique_ptr iter(NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key().compare(Slice("x")), 0); @@ -2252,7 +2330,8 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2263,7 +2342,7 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - 
dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("m", "n")); @@ -2276,7 +2355,8 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2299,6 +2379,7 @@ ReadOptions options; options.snapshot = snapshot; Iterator* iter = NewIterator(options); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2314,8 +2395,8 @@ iter->Next(); ASSERT_FALSE(iter->Valid()); - Status s; - s = iter->Refresh(); + ASSERT_OK(iter->status()); + Status s = iter->Refresh(); ASSERT_TRUE(s.IsNotSupported()); db_->ReleaseSnapshot(snapshot); delete iter; @@ -2373,14 +2454,14 @@ TEST_P(DBIteratorTest, TableFilter) { ASSERT_OK(Put("a", "1")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("b", "2")); ASSERT_OK(Put("c", "3")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("d", "4")); ASSERT_OK(Put("e", "5")); ASSERT_OK(Put("f", "6")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); // Ensure the table_filter callback is called once for each table. { @@ -2565,13 +2646,13 @@ ReadOptions ropts; ropts.max_skippable_internal_keys = 2; - Put("1", "val_1"); + ASSERT_OK(Put("1", "val_1")); // Add more tombstones than max_skippable_internal_keys so that Next() fails. - Delete("2"); - Delete("3"); - Delete("4"); - Delete("5"); - Put("6", "val_6"); + ASSERT_OK(Delete("2")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Delete("4")); + ASSERT_OK(Delete("5")); + ASSERT_OK(Put("6", "val_6")); std::unique_ptr iter(NewIterator(ropts)); iter->SeekToFirst(); @@ -2613,9 +2694,9 @@ DestroyAndReopen(options); // Two records in sst file, each in its own block. 
- Put("b", ""); - Put("d", ""); - Flush(); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Put("d", "")); + ASSERT_OK(Flush()); // Create a nonblocking iterator before writing to memtable. ReadOptions ropt; @@ -2625,7 +2706,7 @@ // Overwrite a key in memtable many times to hit // max_sequential_skip_in_iterations (which is 8 by default). for (int i = 0; i < 20; ++i) { - Put("c", ""); + ASSERT_OK(Put("c", "")); } // Load the second block in sst file into the block cache. @@ -2642,9 +2723,9 @@ } TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { - Put("a", ""); - Put("b", ""); - Flush(); + ASSERT_OK(Put("a", "")); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Flush()); ReadOptions ropt; Slice ub = "b"; @@ -2674,7 +2755,7 @@ Reopen(options); Random rnd(301); - std::string random_str = RandomString(&rnd, 180); + std::string random_str = rnd.RandomString(180); ASSERT_OK(Put("1", random_str)); ASSERT_OK(Put("2", random_str)); @@ -2851,6 +2932,127 @@ ASSERT_OK(iter->status()); } +TEST_P(DBIteratorTest, Blob) { + Options options = CurrentOptions(); + options.enable_blob_files = true; + options.max_sequential_skip_in_iterations = 2; + options.statistics = CreateDBStatistics(); + + Reopen(options); + + // Note: we have 4 KVs (3 of which are hidden) for key "b" and + // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a reseek + // anytime we move from "b" to "c" or vice versa. 
+ ASSERT_OK(Put("a", "va")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb3")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Flush()); + + std::unique_ptr iter_guard(NewIterator(ReadOptions())); + Iterator* const iter = iter_guard.get(); + + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, 
NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->SeekForPrev("d"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->Seek("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Seek("z"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->SeekForPrev(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + // Switch from forward to reverse + 
iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8); + ASSERT_EQ(IterStatus(iter), "b->vb3"); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); @@ -2881,7 +3083,7 @@ SequenceNumber seq2 = db_->GetLatestSequenceNumber(); auto* cfd = - reinterpret_cast(db_->DefaultColumnFamily()) + static_cast_with_check(db_->DefaultColumnFamily()) ->cfd(); // The iterator are suppose to see data before seq1. Iterator* iter = @@ -2989,6 +3191,44 @@ delete iter; } +TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = false; + options.env = env_; + DestroyAndReopen(options); + constexpr int kNumKeys = 10; + + // Write kNumKeys to WAL. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + int count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ++count; + } + ASSERT_EQ(kNumKeys, count); + } + + // Reopen and rebuild the memtable from WAL. + options.create_if_missing = false; + options.avoid_flush_during_recovery = true; + options.inplace_update_support = true; + options.allow_concurrent_memtable_write = false; + Reopen(options); + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + // Backward iteration not supported due to inplace_update_support = true. 
+ ASSERT_TRUE(iter->status().IsNotSupported()); + ASSERT_FALSE(iter->Valid()); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_kv_checksum_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,197 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +enum class WriteBatchOpType { + kPut = 0, + kDelete, + kSingleDelete, + kDeleteRange, + kMerge, + kBlobIndex, + kNum, +}; + +// Integer addition is needed for `::testing::Range()` to take the enum type. 
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) { + using T = std::underlying_type::type; + return static_cast(static_cast(lhs) + rhs); +} + +class DbKvChecksumTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DbKvChecksumTest() + : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) { + op_type_ = std::get<0>(GetParam()); + corrupt_byte_addend_ = std::get<1>(GetParam()); + } + + std::pair GetWriteBatch(ColumnFamilyHandle* cf_handle) { + Status s; + WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */, + 8 /* protection_bytes_per_entry */); + switch (op_type_) { + case WriteBatchOpType::kPut: + s = wb.Put(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kDelete: + s = wb.Delete(cf_handle, "key"); + break; + case WriteBatchOpType::kSingleDelete: + s = wb.SingleDelete(cf_handle, "key"); + break; + case WriteBatchOpType::kDeleteRange: + s = wb.DeleteRange(cf_handle, "begin", "end"); + break; + case WriteBatchOpType::kMerge: + s = wb.Merge(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kBlobIndex: + // TODO(ajkr): use public API once available. 
+ uint32_t cf_id; + if (cf_handle == nullptr) { + cf_id = 0; + } else { + cf_id = cf_handle->GetID(); + } + s = WriteBatchInternal::PutBlobIndex(&wb, cf_id, "key", "val"); + break; + case WriteBatchOpType::kNum: + assert(false); + } + return {std::move(wb), std::move(s)}; + } + + void CorruptNextByteCallBack(void* arg) { + Slice encoded = *static_cast(arg); + if (entry_len_ == port::kMaxSizet) { + // We learn the entry size on the first attempt + entry_len_ = encoded.size(); + } + // All entries should be the same size + assert(entry_len_ == encoded.size()); + char* buf = const_cast(encoded.data()); + buf[corrupt_byte_offset_] += corrupt_byte_addend_; + ++corrupt_byte_offset_; + } + + bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; } + + protected: + WriteBatchOpType op_type_; + char corrupt_byte_addend_; + size_t corrupt_byte_offset_ = 0; + size_t entry_len_ = port::kMaxSizet; +}; + +std::string GetTestNameSuffix( + ::testing::TestParamInfo> info) { + std::ostringstream oss; + switch (std::get<0>(info.param)) { + case WriteBatchOpType::kPut: + oss << "Put"; + break; + case WriteBatchOpType::kDelete: + oss << "Delete"; + break; + case WriteBatchOpType::kSingleDelete: + oss << "SingleDelete"; + break; + case WriteBatchOpType::kDeleteRange: + oss << "DeleteRange"; + break; + case WriteBatchOpType::kMerge: + oss << "Merge"; + break; + case WriteBatchOpType::kBlobIndex: + oss << "BlobIndex"; + break; + case WriteBatchOpType::kNum: + assert(false); + } + oss << "Add" + << static_cast(static_cast(std::get<1>(info.param))); + return oss.str(); +} + +INSTANTIATE_TEST_CASE_P( + DbKvChecksumTest, DbKvChecksumTest, + ::testing::Combine(::testing::Range(static_cast(0), + WriteBatchOpType::kNum), + ::testing::Values(2, 103, 251)), + GetTestNameSuffix); + +TEST_P(DbKvChecksumTest, MemTableAddCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_`. 
Each attempt has one byte corrupted in its + // memtable entry by adding `corrupt_byte_addend_` to its original value. The + // test repeats until an attempt has been made on each byte in the encoded + // memtable entry. All attempts are expected to fail with `Status::Corruption` + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. + Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + Reopen(options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = GetWriteBatch(nullptr /* cf_handle */); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_` to a non-default column family. Each attempt has + // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_` + // to its original value. The test repeats until an attempt has been made on + // each byte in the encoded memtable entry. All attempts are expected to fail + // with `Status::Corruption`. 
+ Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = GetWriteBatch(handles_[1]); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_log_iter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_log_iter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_log_iter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_log_iter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,13 +13,15 @@ #if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/stack_trace.h" namespace ROCKSDB_NAMESPACE { class DBTestXactLogIterator : public DBTestBase { public: - DBTestXactLogIterator() : DBTestBase("/db_log_iter_test") {} + DBTestXactLogIterator() + : DBTestBase("db_log_iter_test", /*env_do_fsync=*/true) {} std::unique_ptr OpenTransactionLogIter( const SequenceNumber seq) { @@ -32,9 +34,8 @@ }; namespace { -SequenceNumber ReadRecords( - std::unique_ptr& iter, - int& count) { +SequenceNumber ReadRecords(std::unique_ptr& iter, + int& count, bool expect_ok = true) { count = 0; 
SequenceNumber lastSequence = 0; BatchResult res; @@ -46,6 +47,11 @@ EXPECT_OK(iter->status()); iter->Next(); } + if (expect_ok) { + EXPECT_OK(iter->status()); + } else { + EXPECT_NOK(iter->status()); + } return res.sequence; } @@ -63,9 +69,9 @@ Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Put(0, "key1", DummyString(1024)); - Put(1, "key2", DummyString(1024)); - Put(1, "key2", DummyString(1024)); + ASSERT_OK(Put(0, "key1", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U); { auto iter = OpenTransactionLogIter(0); @@ -74,9 +80,9 @@ ReopenWithColumnFamilies({"default", "pikachu"}, options); env_->SleepForMicroseconds(2 * 1000 * 1000); { - Put(0, "key4", DummyString(1024)); - Put(1, "key5", DummyString(1024)); - Put(0, "key6", DummyString(1024)); + ASSERT_OK(Put(0, "key4", DummyString(1024))); + ASSERT_OK(Put(1, "key5", DummyString(1024))); + ASSERT_OK(Put(0, "key6", DummyString(1024))); } { auto iter = OpenTransactionLogIter(0); @@ -108,15 +114,15 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key2", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key3", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key4", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key2", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key3", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key4", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); { auto iter = 
OpenTransactionLogIter(0); @@ -129,11 +135,11 @@ // condition FlushOptions flush_options; flush_options.wait = false; - dbfull()->Flush(flush_options); + ASSERT_OK(dbfull()->Flush(flush_options)); // "key5" would be written in a new memtable and log - Put("key5", DummyString(1024)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key5", DummyString(1024))); + ASSERT_OK(dbfull()->FlushWAL(false)); { // this iter would miss "key4" if not fixed auto iter = OpenTransactionLogIter(0); @@ -148,14 +154,14 @@ do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); auto iter = OpenTransactionLogIter(0); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); iter->Next(); ASSERT_TRUE(!iter->Valid()); ASSERT_OK(iter->status()); - Put("key2", DummyString(1024)); + ASSERT_OK(Put("key2", DummyString(1024))); iter->Next(); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); @@ -166,9 +172,9 @@ do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - Put("key2", DummyString(1023)); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(Put("key2", DummyString(1023))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); Reopen(options); auto iter = OpenTransactionLogIter(0); ExpectRecords(2, iter); @@ -179,31 +185,38 @@ do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); + for (int i = 0; i < 1024; i++) { - Put("key"+ToString(i), DummyString(10)); + ASSERT_OK(Put("key" + ToString(i), DummyString(10))); } - dbfull()->Flush(FlushOptions()); - dbfull()->FlushWAL(false); + + ASSERT_OK(Flush()); + ASSERT_OK(db_->FlushWAL(false)); + // Corrupt this log to create a gap - ROCKSDB_NAMESPACE::VectorLogPtr wal_files; - ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_OK(db_->DisableFileDeletions()); + + VectorLogPtr wal_files; + ASSERT_OK(db_->GetSortedWalFiles(wal_files)); + 
ASSERT_FALSE(wal_files.empty()); + const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName(); - if (mem_env_) { - mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2); - } else { - ASSERT_EQ(0, truncate(logfile_path.c_str(), - wal_files.front()->SizeFileBytes() / 2)); - } + ASSERT_OK(test::TruncateFile(env_, logfile_path, + wal_files.front()->SizeFileBytes() / 2)); + + ASSERT_OK(db_->EnableFileDeletions()); // Insert a new entry to a new log file - Put("key1025", DummyString(10)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key1025", DummyString(10))); + ASSERT_OK(db_->FlushWAL(false)); + // Try to read from the beginning. Should stop before the gap and read less // than 1025 entries auto iter = OpenTransactionLogIter(0); - int count; - SequenceNumber last_sequence_read = ReadRecords(iter, count); + int count = 0; + SequenceNumber last_sequence_read = ReadRecords(iter, count, false); ASSERT_LT(last_sequence_read, 1025U); + // Try to read past the gap, should be able to seek to key1025 auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); ExpectRecords(1, iter2); @@ -216,15 +229,15 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); - Flush(1); - Flush(0); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - Put(1, "key4", DummyString(1024)); + ASSERT_OK(Put(1, "key4", DummyString(1024))); auto iter = 
OpenTransactionLogIter(3); ExpectRecords(2, iter); } while (ChangeCompactOptions()); @@ -236,13 +249,13 @@ CreateAndReopenWithCF({"pikachu"}, options); { WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.PutLogData(Slice("blob1")); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.PutLogData(Slice("blob2")); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); ReopenWithColumnFamilies({"default", "pikachu"}, options); } @@ -267,7 +280,7 @@ return Status::OK(); } } handler; - res.writeBatchPtr->Iterate(&handler); + ASSERT_OK(res.writeBatchPtr->Iterate(&handler)); ASSERT_EQ( "Put(1, key1, 1024)" "Put(0, key2, 1024)" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_logical_block_size_cache_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,513 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "test_util/testharness.h" + +#ifdef OS_LINUX +#include "env/io_posix.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { +class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper { + public: + EnvWithCustomLogicalBlockSizeCache(Env* env, LogicalBlockSizeCache* cache) + : EnvWrapper(env), cache_(cache) {} + + Status RegisterDbPaths(const std::vector& paths) override { + return cache_->RefAndCacheLogicalBlockSize(paths); + } + + Status UnregisterDbPaths(const std::vector& paths) override { + cache_->UnrefAndTryRemoveCachedLogicalBlockSize(paths); + return Status::OK(); + } + + private: + LogicalBlockSizeCache* cache_; +}; + +class DBLogicalBlockSizeCacheTest : public testing::Test { + public: + DBLogicalBlockSizeCacheTest() + : dbname_(test::PerThreadDBPath("logical_block_size_cache_test")), + data_path_0_(dbname_ + "/data_path_0"), + data_path_1_(dbname_ + "/data_path_1"), + cf_path_0_(dbname_ + "/cf_path_0"), + cf_path_1_(dbname_ + "/cf_path_1") { + auto get_fd_block_size = [&](int fd) { return fd; }; + auto get_dir_block_size = [&](const std::string& /*dir*/, size_t* size) { + *size = 1024; + return Status::OK(); + }; + cache_.reset( + new LogicalBlockSizeCache(get_fd_block_size, get_dir_block_size)); + env_.reset( + new EnvWithCustomLogicalBlockSizeCache(Env::Default(), cache_.get())); + } + + protected: + std::string dbname_; + std::string data_path_0_; + std::string data_path_1_; + std::string cf_path_0_; + std::string cf_path_1_; + std::unique_ptr cache_; + std::unique_ptr env_; +}; + +TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) { + // Tests that Open will cache the logical block size for data paths, + // and Close will remove the cached sizes. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}}; + + for (int i = 0; i < 2; i++) { + DB* db; + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open(options, dbname_, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db)); +#endif + } + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + ASSERT_OK(db->Close()); + ASSERT_EQ(0, cache_->Size()); + delete db; + } + ASSERT_OK(DestroyDB(dbname_, options, {})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) { + // Tests that Open will cache the logical block size for data paths, + // and delete the db pointer will remove the cached sizes. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + for (int i = 0; i < 2; i++) { + DB* db; + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open(options, dbname_, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db)); +#endif + } + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + delete db; + ASSERT_EQ(0, cache_->Size()); + } + ASSERT_OK(DestroyDB(dbname_, options, {})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) { + // Tests that CreateColumnFamily will cache the cf_paths, + // drop the column family handle won't drop the cache, + // drop and then delete the column family handle will drop the cache. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}}; + + DB* db; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + ColumnFamilyHandle* cf = nullptr; + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf)); + ASSERT_EQ(3, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + + // Drop column family does not drop cache. + ASSERT_OK(db->DropColumnFamily(cf)); + ASSERT_EQ(3, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + + // Delete handle will drop cache. + ASSERT_OK(db->DestroyColumnFamilyHandle(cf)); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { + // Tests that CreateColumnFamilies will cache the cf_paths, + // drop the column family handle won't drop the cache, + // drop and then delete the column family handle will drop the cache. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + DB* db; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + std::vector cfs; + ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + // Drop column family does not drop cache. + for (ColumnFamilyHandle* cf : cfs) { + ASSERT_OK(db->DropColumnFamily(cf)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + } + + // Delete one handle will not drop cache because another handle is still + // referencing cf_path_0_. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Delete the last handle will drop cache. 
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(dbname_, options, + {{"cf1", cf_options}, {"cf2", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) { + // Tests that Open two column families with the same cf_path will cache the + // cf_path and have 2 references to the cached size, + // drop the column family handle won't drop the cache, + // drop and then delete the column family handle will drop the cache. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + for (int i = 0; i < 2; i++) { + DB* db; + ColumnFamilyHandle* cf1 = nullptr; + ColumnFamilyHandle* cf2 = nullptr; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1)); + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2)); + ASSERT_OK(db->DestroyColumnFamilyHandle(cf1)); + ASSERT_OK(db->DestroyColumnFamilyHandle(cf2)); + delete db; + ASSERT_EQ(0, cache_->Size()); + + std::vector cfs; + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open(options, dbname_, + {{"cf1", cf_options}, + {"cf2", cf_options}, + {"default", ColumnFamilyOptions()}}, + &cfs, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, + {{"cf1", cf_options}, + {"cf2", cf_options}, + {"default", ColumnFamilyOptions()}}, + &cfs, &db)); +#endif + } + + // Logical block sizes of dbname_ and cf_path_0_ are cached during Open. 
+ ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + // Drop handles won't drop the cache. + ASSERT_OK(db->DropColumnFamily(cfs[0])); + ASSERT_OK(db->DropColumnFamily(cfs[1])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + // Delete 1st handle won't drop the cache for cf_path_0_. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Delete 2nd handle will drop the cache for cf_path_0_. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + // Delete the default handle won't affect the cache because db still refers + // to the default CF. + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[2])); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + } + ASSERT_OK(DestroyDB(dbname_, options, + {{"cf1", cf_options}, {"cf2", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) { + // Tests that destroy column family without dropping won't drop the cache, + // because compaction and flush might still need to get logical block size + // when opening new files. 
+ Options options; + options.create_if_missing = true; + options.env = env_.get(); + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + DB* db; + ASSERT_OK(DB::Open(options, dbname_, &db)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ColumnFamilyHandle* cf = nullptr; + ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Delete handle won't drop cache. + ASSERT_OK(db->DestroyColumnFamilyHandle(cf)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + + // Open with column families. + std::vector cfs; + for (int i = 0; i < 2; i++) { + if (!i) { + printf("Open\n"); + ASSERT_OK(DB::Open( + options, dbname_, + {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db)); + } else { +#ifdef ROCKSDB_LITE + break; +#else + printf("OpenForReadOnly\n"); + ASSERT_OK(DB::OpenForReadOnly( + options, dbname_, + {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db)); +#endif + } + // cf_path_0_ and dbname_ are cached. + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + // Deleting handle won't drop cache. 
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0])); + ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1])); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(dbname_)); + ASSERT_EQ(1, cache_->GetRefCount(dbname_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + delete db; + ASSERT_EQ(0, cache_->Size()); + } + ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { + // Tests the cache behavior when there are multiple DBs sharing the same env + // with different db_paths and cf_paths. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + + DB* db0; + ASSERT_OK(DB::Open(options, data_path_0_, &db0)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + + ColumnFamilyOptions cf_options0; + cf_options0.cf_paths = {{cf_path_0_, 1024}}; + ColumnFamilyHandle* cf0; + ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + DB* db1; + ASSERT_OK(DB::Open(options, data_path_1_, &db1)); + ASSERT_EQ(3, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + + ColumnFamilyOptions cf_options1; + cf_options1.cf_paths = {{cf_path_1_, 1024}}; + ColumnFamilyHandle* cf1; + ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1)); + ASSERT_EQ(4, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + 
ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); + delete db0; + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_1_)); + ASSERT_TRUE(cache_->Contains(cf_path_1_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); + ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}})); + + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); + delete db1; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}})); +} + +TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { + // Tests the cache behavior when there are multiple DBs sharing the same env + // with the same db_paths and cf_paths. + Options options; + options.create_if_missing = true; + options.env = env_.get(); + options.db_paths = {{data_path_0_, 1024}}; + ColumnFamilyOptions cf_options; + cf_options.cf_paths = {{cf_path_0_, 1024}}; + + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + + DB* db0; + ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0)); + ASSERT_EQ(1, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + + ColumnFamilyHandle* cf0; + ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + DB* db1; + ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); + 
ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + + ColumnFamilyHandle* cf1; + ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1)); + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); + + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); + delete db0; + ASSERT_EQ(2, cache_->Size()); + ASSERT_TRUE(cache_->Contains(data_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); + ASSERT_TRUE(cache_->Contains(cf_path_0_)); + ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); + ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}})); + + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); + delete db1; + ASSERT_EQ(0, cache_->Size()); + ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}})); +} + +} // namespace ROCKSDB_NAMESPACE +#endif // OS_LINUX + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_memtable_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_memtable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_memtable_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_memtable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ class DBMemTableTest : public DBTestBase { public: - DBMemTableTest() : DBTestBase("/db_memtable_test") {} + DBMemTableTest() : DBTestBase("db_memtable_test", /*env_do_fsync=*/true) {} }; class MockMemTableRep : public MemTableRep { @@ -129,7 +129,6 @@ TEST_F(DBMemTableTest, DuplicateSeq) { SequenceNumber seq = 123; std::string value; - Status s; MergeContext merge_context; Options options; InternalKeyComparator ikey_cmp(options.comparator); @@ -140,28 +139,31 @@ InternalKeyComparator cmp(BytewiseComparator()); auto 
factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Write some keys and make sure it returns false on duplicates - bool res; - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the type should still cause the duplicatae key - res = mem->Add(seq, kTypeMerge, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the seq number will make the key fresh - res = mem->Add(seq + 1, kTypeMerge, "key", "value2"); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2", + nullptr /* kv_prot_info */)); // Test with different types for duplicate keys - res = mem->Add(seq, kTypeDeletion, "key", ""); - ASSERT_FALSE(res); - res = mem->Add(seq, kTypeSingleDeletion, "key", ""); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); + ASSERT_TRUE( + mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); // Test the duplicate keys under stress for (int i = 0; i < 10000; i++) { @@ -169,11 +171,12 @@ if (!insert_dup) { seq++; } - res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq)); + Status s = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq), + nullptr /* kv_prot_info */); if (insert_dup) { - ASSERT_FALSE(res); + ASSERT_TRUE(s.IsTryAgain()); } else { - ASSERT_TRUE(res); + 
ASSERT_OK(s); } } delete mem; @@ -181,26 +184,28 @@ // Test with InsertWithHint options.memtable_insert_with_hint_prefix_extractor.reset( new TestPrefixExtractor()); // which uses _ to extract the prefix - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Insert a duplicate key with _ in it - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */) + .IsTryAgain()); delete mem; // Test when InsertConcurrently will be invoked options.allow_concurrent_memtable_write = true; - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); MemTablePostProcessInfo post_process_info; - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_FALSE(res); + ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info)); + ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info) + .IsTryAgain()); delete mem; } @@ -208,7 +213,6 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { int num_ops = 1000; std::string value; - Status s; MergeContext merge_context; Options options; // A merge operator that is not sensitive to concurrent writes since in this @@ -220,15 +224,14 @@ auto factory = std::make_shared(); options.memtable_factory = factory; options.allow_concurrent_memtable_write = true; - ImmutableCFOptions ioptions(options); 
+ ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Put 0 as the base PutFixed64(&value, static_cast(0)); - bool res = mem->Add(0, kTypeValue, "key", value); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */)); value.clear(); // Write Merge concurrently @@ -237,9 +240,8 @@ std::string v1; for (int seq = 1; seq < num_ops / 2; seq++) { PutFixed64(&v1, seq); - bool res1 = - mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1); - ASSERT_TRUE(res1); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */, + true, &post_process_info1)); v1.clear(); } }); @@ -248,9 +250,8 @@ std::string v2; for (int seq = num_ops / 2; seq < num_ops; seq++) { PutFixed64(&v2, seq); - bool res2 = - mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2); - ASSERT_TRUE(res2); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */, + true, &post_process_info2)); v2.clear(); } }); @@ -261,8 +262,9 @@ ReadOptions roptions; SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey("key", kMaxSequenceNumber); - res = mem->Get(lkey, &value, &status, &merge_context, - &max_covering_tombstone_seq, roptions); + bool res = mem->Get(lkey, &value, /*timestamp=*/nullptr, &status, + &merge_context, &max_covering_tombstone_seq, roptions); + ASSERT_OK(status); ASSERT_TRUE(res); uint64_t ivalue = DecodeFixed64(Slice(value).data()); uint64_t sum = 0; @@ -303,19 +305,20 @@ ASSERT_EQ(hint_bar, rep->last_hint_in()); ASSERT_EQ(hint_bar, rep->last_hint_out()); ASSERT_EQ(5, rep->num_insert_with_hint()); - ASSERT_OK(Put("whitelisted", "vvv")); + ASSERT_OK(Put("NotInPrefixDomain", "vvv")); ASSERT_EQ(5, rep->num_insert_with_hint()); ASSERT_EQ("foo_v1", Get("foo_k1")); ASSERT_EQ("foo_v2", Get("foo_k2")); ASSERT_EQ("foo_v3", Get("foo_k3")); 
ASSERT_EQ("bar_v1", Get("bar_k1")); ASSERT_EQ("bar_v2", Get("bar_k2")); - ASSERT_EQ("vvv", Get("whitelisted")); + ASSERT_EQ("vvv", Get("NotInPrefixDomain")); } TEST_F(DBMemTableTest, ColumnFamilyId) { // Verifies MemTableRepFactory is told the right column family id. Options options; + options.env = CurrentOptions().env; options.allow_concurrent_memtable_write = false; options.create_if_missing = true; options.memtable_factory.reset(new MockMemTableRepFactory()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operand_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,43 +8,85 @@ #include "rocksdb/perf_context.h" #include "rocksdb/utilities/debug.h" #include "table/block_based/block_builder.h" -#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" #endif #include "rocksdb/merge_operator.h" +#include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/sortlist.h" #include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { +namespace { +class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; +}; +} // namespace + class DBMergeOperandTest : public DBTestBase { public: - DBMergeOperandTest() : 
DBTestBase("/db_merge_operand_test") {} + DBMergeOperandTest() + : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {} }; -TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { - class LimitedStringAppendMergeOp : public StringAppendTESTOperator { - public: - LimitedStringAppendMergeOp(int limit, char delim) - : StringAppendTESTOperator(delim), limit_(limit) {} +TEST_F(DBMergeOperandTest, MergeOperandReadAfterFreeBug) { + // There was a bug of reading merge operands after they are mistakely freed + // in DB::GetMergeOperands, which is surfaced by cache full. + // See PR#9507 for more. + Options options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.env = env_; + BlockBasedTableOptions table_options; - const char* Name() const override { - return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; - } + // Small cache to simulate cache full + table_options.block_cache = NewLRUCache(1); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - bool ShouldMerge(const std::vector& operands) const override { - if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { - return true; - } - return false; - } + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; - private: - size_t limit_ = 0; - }; + ASSERT_OK(Merge("k1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v3")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k1", "v4")); + + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(number_of_operands, 4); + ASSERT_EQ(values[0].ToString(), "v1"); + ASSERT_EQ(values[1].ToString(), "v2"); + ASSERT_EQ(values[2].ToString(), "v3"); + 
ASSERT_EQ(values[3].ToString(), "v4"); +} +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { Options options; options.create_if_missing = true; // Use only the latest two merge operands. @@ -58,29 +100,29 @@ merge_operands_info.expected_max_number_of_operands = num_records; // k0 value in memtable - Put("k0", "PutARock"); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(Put("k0", "PutARock")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "PutARock"); // k0.1 value in SST - Put("k0.1", "RockInSST"); + ASSERT_OK(Put("k0.1", "RockInSST")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "RockInSST"); // All k1 values are in memtable. ASSERT_OK(Merge("k1", "a")); - Put("k1", "x"); + ASSERT_OK(Put("k1", "x")); ASSERT_OK(Merge("k1", "b")); ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "x"); ASSERT_EQ(values[1], "b"); ASSERT_EQ(values[2], "c"); @@ -97,13 +139,13 @@ // All k1.1 values are in memtable. 
ASSERT_OK(Merge("k1.1", "r")); - Delete("k1.1"); + ASSERT_OK(Delete("k1.1")); ASSERT_OK(Merge("k1.1", "c")); ASSERT_OK(Merge("k1.1", "k")); ASSERT_OK(Merge("k1.1", "s")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "c"); ASSERT_EQ(values[1], "k"); ASSERT_EQ(values[2], "s"); @@ -114,9 +156,9 @@ ASSERT_OK(Merge("k2", "e")); ASSERT_OK(Merge("k2", "r")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "q"); ASSERT_EQ(values[1], "w"); ASSERT_EQ(values[2], "e"); @@ -124,30 +166,30 @@ // All k2.1 values are flushed to L0 into a single file. ASSERT_OK(Merge("k2.1", "m")); - Put("k2.1", "l"); + ASSERT_OK(Put("k2.1", "l")); ASSERT_OK(Merge("k2.1", "n")); ASSERT_OK(Merge("k2.1", "o")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "l,n,o"); // All k2.2 values are flushed to L0 into a single file. 
ASSERT_OK(Merge("k2.2", "g")); - Delete("k2.2"); + ASSERT_OK(Delete("k2.2")); ASSERT_OK(Merge("k2.2", "o")); ASSERT_OK(Merge("k2.2", "t")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "o,t"); // Do some compaction that will make the following tests more predictable // Slice start("PutARock"); // Slice end("t"); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All k3 values are flushed and are in different files. ASSERT_OK(Merge("k3", "ab")); @@ -157,9 +199,9 @@ ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ab"); ASSERT_EQ(values[1], "bc"); ASSERT_EQ(values[2], "cd"); @@ -168,14 +210,14 @@ // All k3.1 values are flushed and are in different files. 
ASSERT_OK(Merge("k3.1", "ab")); ASSERT_OK(Flush()); - Put("k3.1", "bc"); + ASSERT_OK(Put("k3.1", "bc")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "bc"); ASSERT_EQ(values[1], "cd"); ASSERT_EQ(values[2], "de"); @@ -183,14 +225,14 @@ // All k3.2 values are flushed and are in different files. ASSERT_OK(Merge("k3.2", "ab")); ASSERT_OK(Flush()); - Delete("k3.2"); + ASSERT_OK(Delete("k3.2")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "cd"); ASSERT_EQ(values[1], "de"); @@ -205,32 +247,120 @@ ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "ed")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ba"); ASSERT_EQ(values[1], "cb"); ASSERT_EQ(values[2], "dc"); ASSERT_EQ(values[3], "ed"); - // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + // First 3 k5 values are in SST and next 4 k5 values are in Immutable + // Memtable ASSERT_OK(Merge("k5", "who")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Flush()); - Put("k5", "remember"); + 
ASSERT_OK(Put("k5", "remember")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "rocks")); - dbfull()->TEST_SwitchMemtable(); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k5", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "remember"); ASSERT_EQ(values[1], "i"); ASSERT_EQ(values[2], "am"); } +TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) { + Options options; + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + // Use only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1 values are in memtable. + ASSERT_OK(Put("k1", "x")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. 
+ merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Put("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "q,w,e,r"); + + // Do some compaction that will make the following tests more predictable + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // All k3 values are flushed and are in different files. + ASSERT_OK(Put("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All K4 values are in different levels + ASSERT_OK(Put("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); +} + } // namespace 
ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_merge_operator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #include "db/forward_iterator.h" #include "port/stack_trace.h" #include "rocksdb/merge_operator.h" +#include "util/random.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend2.h" @@ -35,7 +36,8 @@ // Test merge operator functionality. class DBMergeOperatorTest : public DBTestBase { public: - DBMergeOperatorTest() : DBTestBase("/db_merge_operator_test") {} + DBMergeOperatorTest() + : DBTestBase("db_merge_operator_test", /*env_do_fsync=*/false) {} std::string GetWithReadCallback(SnapshotChecker* snapshot_checker, const Slice& key, @@ -92,7 +94,7 @@ ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); std::string value; - ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k1", &value)); // Make sure that only the latest two merge operands are used. If this was // not the case the value would be "a,b,c,d". ASSERT_EQ(value, "c,d"); @@ -103,7 +105,7 @@ ASSERT_OK(Merge("k2", "c")); ASSERT_OK(Merge("k2", "d")); ASSERT_OK(Flush()); - ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k2", &value)); ASSERT_EQ(value, "c,d"); // All K3 values are flushed and are in different files. 
@@ -114,7 +116,7 @@ ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k3", &value)); ASSERT_EQ(value, "cd,de"); // All K4 values are in different levels @@ -128,7 +130,7 @@ ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k4", &value)); ASSERT_EQ(value, "cd,de"); } @@ -242,7 +244,7 @@ std::string key = Key(key_id % 35); key_id++; for (int k = 0; k < kOperandsPerKeyPerFile; k++) { - std::string val = RandomString(&rnd, kOperandSize); + std::string val = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Merge(WriteOptions(), key, val)); if (true_data[key].size() == 0) { true_data[key] = val; @@ -327,7 +329,7 @@ for (int i = 0; i < kNumOperands; i++) { for (int j = 0; j < kNumKeys; j++) { std::string k = Key(j); - std::string v = RandomString(&rnd, kOperandSize); + std::string v = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Merge(WriteOptions(), k, v)); true_data[k] = std::max(true_data[k], v); @@ -342,8 +344,9 @@ // Code executed before merge operation merge_hook->before_merge_ = [&]() { // Evict all tables from cache before every merge operation + auto* table_cache = dbfull()->TEST_table_cache(); for (uint64_t num : file_numbers) { - TableCache::Evict(dbfull()->TEST_table_cache(), num); + TableCache::Evict(table_cache, num); } // Decrease cache capacity to force all unrefed blocks to be evicted if (bbto.block_cache) { @@ -364,7 +367,7 @@ VerifyDBFromMap(true_data, &total_reads); ASSERT_EQ(merge_cnt, total_reads); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &total_reads); } @@ -383,7 +386,7 @@ std::function writer_func = [&]() { int k = 0; for (int i = 0; i < kNumWrites; i++) { - db_->Merge(WriteOptions(), 
Key(k), Key(k)); + ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k))); if (i && i % kNumOperands == 0) { k++; @@ -401,7 +404,7 @@ ReadOptions ro; ro.tailing = true; Iterator* iter = db_->NewIterator(ro); - + ASSERT_OK(iter->status()); iter->SeekToFirst(); for (int i = 0; i < (kNumWrites / kNumOperands); i++) { while (!iter->Valid()) { @@ -414,6 +417,7 @@ iter->Next(); } + ASSERT_OK(iter->status()); delete iter; }; @@ -447,12 +451,13 @@ // ForwardIterator to not pin it in some circumstances. This test // reproduces it. - db_->Merge(WriteOptions(), "key", "sst"); - db_->Flush(FlushOptions()); // Switch to SuperVersion A - db_->Merge(WriteOptions(), "key", "memtable"); + ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst")); + ASSERT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion A + ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable")); // Pin SuperVersion A std::unique_ptr someone_else(db_->NewIterator(ReadOptions())); + ASSERT_OK(someone_else->status()); bool pushed_first_operand = false; bool stepped_to_next_operand = false; @@ -460,7 +465,7 @@ "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) { EXPECT_FALSE(pushed_first_operand); pushed_first_operand = true; - db_->Flush(FlushOptions()); // Switch to SuperVersion B + EXPECT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion B }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) { @@ -475,7 +480,7 @@ std::unique_ptr iter(db_->NewIterator(ro)); iter->Seek("key"); - ASSERT_TRUE(iter->status().ok()); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString()); EXPECT_TRUE(pushed_first_operand); @@ -620,7 +625,7 @@ // kNumPutBefore keys will have base values for (int i = 0; i < kNumPutBefore; i++) { std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); + std::string value = rnd.RandomString(kOperandSize); 
ASSERT_OK(db_->Put(WriteOptions(), key, value)); true_data[key] = value; @@ -629,7 +634,7 @@ // Do kTotalMerges merges for (int i = 0; i < kTotalMerges; i++) { std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); + std::string value = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Merge(WriteOptions(), key, value)); if (true_data[key] < value) { @@ -640,7 +645,7 @@ // Overwrite random kNumPutAfter keys for (int i = 0; i < kNumPutAfter; i++) { std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); + std::string value = rnd.RandomString(kOperandSize); ASSERT_OK(db_->Put(WriteOptions(), key, value)); true_data[key] = value; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_options_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_options_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_options_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_options_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -27,38 +27,33 @@ class DBOptionsTest : public DBTestBase { public: - DBOptionsTest() : DBTestBase("/db_options_test") {} + DBOptionsTest() : DBTestBase("db_options_test", /*env_do_fsync=*/true) {} #ifndef ROCKSDB_LITE std::unordered_map GetMutableDBOptionsMap( const DBOptions& options) { std::string options_str; - GetStringFromDBOptions(&options_str, options); - std::unordered_map options_map; - StringToMap(options_str, &options_map); std::unordered_map mutable_map; - for (const auto opt : db_options_type_info) { - if (opt.second.is_mutable && - opt.second.verification != OptionVerificationType::kDeprecated) { - mutable_map[opt.first] = options_map[opt.first]; - } - } + ConfigOptions config_options(options); + config_options.delimiter = "; "; + + EXPECT_OK(GetStringFromMutableDBOptions( + config_options, MutableDBOptions(options), &options_str)); + EXPECT_OK(StringToMap(options_str, &mutable_map)); + return 
mutable_map; } std::unordered_map GetMutableCFOptionsMap( const ColumnFamilyOptions& options) { std::string options_str; - GetStringFromColumnFamilyOptions(&options_str, options); - std::unordered_map options_map; - StringToMap(options_str, &options_map); + ConfigOptions config_options; + config_options.delimiter = "; "; + std::unordered_map mutable_map; - for (const auto opt : cf_options_type_info) { - if (opt.second.is_mutable && - opt.second.verification != OptionVerificationType::kDeprecated) { - mutable_map[opt.first] = options_map[opt.first]; - } - } + EXPECT_OK(GetStringFromMutableCFOptions( + config_options, MutableCFOptions(options), &options_str)); + EXPECT_OK(StringToMap(options_str, &mutable_map)); return mutable_map; } @@ -84,9 +79,85 @@ #endif // ROCKSDB_LITE }; +TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) { + Options options; + options.env = env_; + options.track_and_verify_wals_in_manifest = true; + + ImmutableDBOptions db_options(options); + ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest); + + Reopen(options); + ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest); + + Status s = + dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}}); + ASSERT_FALSE(s.ok()); +} + // RocksDB lite don't support dynamic options. 
#ifndef ROCKSDB_LITE +TEST_F(DBOptionsTest, AvoidUpdatingOptions) { + Options options; + options.env = env_; + options.max_background_jobs = 4; + options.delayed_write_rate = 1024; + + Reopen(options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + bool is_changed_stats = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::WriteOptionsFile:PersistOptions", [&](void* /*arg*/) { + ASSERT_FALSE(is_changed_stats); // should only save options file once + is_changed_stats = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // helper function to check the status and reset after each check + auto is_changed = [&] { + bool ret = is_changed_stats; + is_changed_stats = false; + return ret; + }; + + // without changing the value, but it's sanitized to a different value + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "0"}})); + ASSERT_TRUE(is_changed()); + + // without changing the value + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_jobs", "4"}})); + ASSERT_FALSE(is_changed()); + + // changing the value + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}})); + ASSERT_TRUE(is_changed()); + + // update again + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}})); + ASSERT_FALSE(is_changed()); + + // without changing a default value + ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "false"}})); + ASSERT_FALSE(is_changed()); + + // now change + ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "true"}})); + ASSERT_TRUE(is_changed()); + + // multiple values without change + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_total_wal_size", "0"}, {"stats_dump_period_sec", "600"}})); + ASSERT_FALSE(is_changed()); + + // multiple values with change + ASSERT_OK(dbfull()->SetDBOptions( + {{"max_open_files", "100"}, {"stats_dump_period_sec", "600"}})); + ASSERT_TRUE(is_changed()); +} + TEST_F(DBOptionsTest, GetLatestDBOptions) { // GetOptions should be able to get 
latest option changed by SetOptions. Options options; @@ -118,6 +189,127 @@ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1]))); } +TEST_F(DBOptionsTest, SetMutableTableOptions) { + Options options; + options.create_if_missing = true; + options.env = env_; + options.blob_file_size = 16384; + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; + bbto.block_size = 8192; + bbto.block_restart_interval = 7; + + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + Options c_opts = dbfull()->GetOptions(cfh); + const auto* c_bbto = + c_opts.table_factory->GetOptions(); + ASSERT_NE(c_bbto, nullptr); + ASSERT_EQ(c_opts.blob_file_size, 16384); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 8192); + ASSERT_EQ(c_bbto->block_restart_interval, 7); + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"table_factory.block_restart_interval", "11"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Now set an option that is not mutable - options should not change + ASSERT_NOK( + dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that are not - options should not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.no_block_cache", "false"}, + {"table_factory.block_size", "8192"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that do not exist - options should not + // change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "8192"}, + {"table_factory.does_not_exist", 
"true"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Trying to change the table factory fails + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory", TableFactory::kPlainTableName()}})); + + // Set some on the table and some on the Column Family + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"blob_file_size", "32768"}, + {"table_factory.block_restart_interval", "13"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_opts.blob_file_size, 32768); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); + // Set some on the table and a bad one on the ColumnFamily - options should + // not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "1024"}, + {"no_such_option", "32768"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); +} + +TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) { + class DummySkipListFactory : public SkipListFactory { + public: + static const char* kClassName() { return "DummySkipListFactory"; } + const char* Name() const override { return kClassName(); } + explicit DummySkipListFactory() : SkipListFactory(2) {} + }; + { + // Verify the DummySkipList cannot be created + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + std::unique_ptr factory; + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, DummySkipListFactory::kClassName(), &factory)); + } + Options options; + options.create_if_missing = true; + // Try with fail_if_options_file_error=false/true to update the options + for (bool on_error : {false, true}) { + options.fail_if_options_file_error = on_error; + options.env = env_; + options.disable_auto_compactions = false; + + 
options.memtable_factory.reset(new DummySkipListFactory()); + Reopen(options); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + ASSERT_OK( + dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}})); + ColumnFamilyDescriptor cfd; + ASSERT_OK(cfh->GetDescriptor(&cfd)); + ASSERT_STREQ(cfd.options.memtable_factory->Name(), + DummySkipListFactory::kClassName()); + ColumnFamilyHandle* test = nullptr; + ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test)); + ASSERT_OK(test->GetDescriptor(&cfd)); + ASSERT_STREQ(cfd.options.memtable_factory->Name(), + DummySkipListFactory::kClassName()); + + ASSERT_OK(dbfull()->DropColumnFamily(test)); + delete test; + } +} + TEST_F(DBOptionsTest, SetBytesPerSync) { const size_t kValueSize = 1024 * 1024; // 1MB Options options; @@ -140,7 +332,7 @@ WriteOptions write_opts; // should sync approximately 40MB/1MB ~= 40 times. for (i = 0; i < 40; i++) { - Put(Key(i), kValue, write_opts); + ASSERT_OK(Put(Key(i), kValue, write_opts)); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -156,7 +348,7 @@ // should sync approximately 40MB*2/8MB ~= 10 times. // data will be 40*2MB because of previous Puts too. 
for (i = 0; i < 40; i++) { - Put(Key(i), kValue, write_opts); + ASSERT_OK(Put(Key(i), kValue, write_opts)); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -179,15 +371,16 @@ options.env = env_; Reopen(options); ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync); - int counter = 0; + std::atomic_int counter{0}; int low_bytes_per_sync = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; }); + "WritableFileWriter::RangeSync:0", + [&](void* /*arg*/) { counter.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); const std::string kValue(kValueSize, 'v'); int i = 0; for (; i < 10; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } // Do not flush. If we flush here, SwitchWAL will reuse old WAL file since its // empty and will not get the new wal_bytes_per_sync value. 
@@ -198,7 +391,7 @@ counter = 0; i = 0; for (; i < 10; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } ASSERT_GT(counter, 0); ASSERT_GT(low_bytes_per_sync, 0); @@ -233,9 +426,9 @@ for (; i < 3; i++) { ASSERT_OK(Put("foo", ToString(i))); ASSERT_OK(Put("bar", ToString(i))); - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(unmatch_cnt, 0); ASSERT_GE(match_cnt, 11); @@ -251,9 +444,9 @@ for (; i < 3; i++) { ASSERT_OK(Put("foo", ToString(i))); ASSERT_OK(Put("bar", ToString(i))); - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(unmatch_cnt, 0); ASSERT_GE(match_cnt, 11); } @@ -289,14 +482,14 @@ DestroyAndReopen(options); int i = 0; for (; i < 1024; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); + ASSERT_OK(Flush()); for (; i < 1024 * 2; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(2, NumTableFilesAtLevel(0)); uint64_t l0_size = SizeAtLevel(0); @@ -318,7 +511,7 @@ break; } Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); @@ -365,7 +558,7 @@ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3"); // Background compaction executed. - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); } @@ -382,12 +575,12 @@ // Need to insert two keys to avoid trivial move. 
ASSERT_OK(Put("foo", ToString(i))); ASSERT_OK(Put("bar", ToString(i))); - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("3", FilesPerLevel()); ASSERT_OK( dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel()); } @@ -404,6 +597,20 @@ ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed()); } +TEST_F(DBOptionsTest, SetBackgroundFlushThreads) { + Options options; + options.create_if_missing = true; + options.max_background_flushes = 1; + options.env = env_; + Reopen(options); + ASSERT_EQ(1, dbfull()->TEST_BGFlushesAllowed()); + ASSERT_EQ(1, env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_OK(dbfull()->SetDBOptions({{"max_background_flushes", "3"}})); + ASSERT_EQ(3, env_->GetBackgroundThreads(Env::Priority::HIGH)); + ASSERT_EQ(3, dbfull()->TEST_BGFlushesAllowed()); +} + + TEST_F(DBOptionsTest, SetBackgroundJobs) { Options options; options.create_if_missing = true; @@ -476,8 +683,7 @@ TEST_F(DBOptionsTest, MaxTotalWalSizeChange) { Random rnd(1044); const auto value_size = size_t(1024); - std::string value; - test::RandomString(&rnd, value_size, &value); + std::string value = rnd.RandomString(value_size); Options options; options.create_if_missing = true; @@ -496,7 +702,7 @@ ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}})); for (size_t cf = 0; cf < handles_.size(); ++cf) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ("1", FilesPerLevel(static_cast(cf))); } } @@ -547,10 +753,9 @@ } TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) { - SpecialEnv env(env_); - env.time_elapse_only_sleep_ = true; Options options; - options.env = &env; + options.env = env_; + SetTimeElapseOnlySleepOnReopen(&options); options.create_if_missing = true; ASSERT_OK(TryReopen(options)); @@ -569,10 +774,10 @@ assert_candidate_files_empty(dbfull(), true); - 
env.addon_time_.store(20); + env_->MockSleepForMicroseconds(20); assert_candidate_files_empty(dbfull(), true); - env.addon_time_.store(21); + env_->MockSleepForMicroseconds(1); assert_candidate_files_empty(dbfull(), false); Close(); @@ -599,6 +804,7 @@ TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { Options options; + options.env = CurrentOptions().env; options.delayed_write_rate = 0; Reopen(options); ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); @@ -610,6 +816,7 @@ TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleUniversal; options.ttl = 0; @@ -639,6 +846,7 @@ TEST_F(DBOptionsTest, SanitizeTtlDefault) { Options options; + options.env = CurrentOptions().env; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -655,6 +863,7 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Options options; options.compaction_style = kCompactionStyleFIFO; + options.env = CurrentOptions().env; options.ttl = 0; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -680,17 +889,19 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.arena_block_size = 4096; options.compression = kNoCompression; options.create_if_missing = true; options.compaction_options_fifo.allow_compaction = false; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; + // NOTE: Presumed unnecessary and removed: resetting mock time in env + // Test dynamically changing ttl. - env_->addon_time_.store(0); options.ttl = 1 * 60 * 60; // 1 hour ASSERT_OK(TryReopen(options)); @@ -698,30 +909,30 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. 
for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); - // Add 61 seconds to the time. - env_->addon_time_.fetch_add(61); + env_->MockSleepForSeconds(61); // No files should be compacted as ttl is set to 1 hour. ASSERT_EQ(dbfull()->GetOptions().ttl, 3600); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Set ttl to 1 minute. So all files should get deleted. ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}})); ASSERT_EQ(dbfull()->GetOptions().ttl, 60); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + // Test dynamically changing compaction_options_fifo.max_table_files_size - env_->addon_time_.store(0); options.compaction_options_fifo.max_table_files_size = 500 << 10; // 00KB options.ttl = 0; DestroyAndReopen(options); @@ -729,9 +940,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -739,7 +950,7 @@ // No files should be compacted as max_table_files_size is set to 500 KB. 
ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, 500 << 10); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Set max_table_files_size to 12 KB. So only 1 file should remain now. @@ -747,7 +958,7 @@ {{"compaction_options_fifo", "{max_table_files_size=12288;}"}})); ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, 12 << 10); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 1); @@ -761,9 +972,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -772,7 +983,7 @@ // allow_compaction is false ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, false); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Set allow_compaction to true. So number of files should be between 1 and 5. 
@@ -780,7 +991,7 @@ {{"compaction_options_fifo", "{allow_compaction=true;}"}})); ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GE(NumTableFilesAtLevel(0), 1); ASSERT_LE(NumTableFilesAtLevel(0), 5); @@ -801,14 +1012,14 @@ ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}})); ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size); for (int i = 0; i < 1024; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); + ASSERT_OK(Flush()); for (int i = 0; i < 1024 * 2; i++) { - Put(Key(i), kValue); + ASSERT_OK(Put(Key(i), kValue)); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(256, env_->compaction_readahead_size_); Close(); } @@ -818,6 +1029,7 @@ options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.create_if_missing = true; + options.env = CurrentOptions().env; ASSERT_OK(TryReopen(options)); @@ -825,9 +1037,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. 
for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -859,8 +1071,129 @@ ASSERT_EQ(dbfull()->GetOptions().ttl, 191); } +TEST_F(DBOptionsTest, ChangeCompression) { + if (!Snappy_Supported() || !LZ4_Supported()) { + return; + } + Options options; + options.write_buffer_size = 10 << 10; // 10KB + options.level0_file_num_compaction_trigger = 2; + options.create_if_missing = true; + options.compression = CompressionType::kLZ4Compression; + options.bottommost_compression = CompressionType::kNoCompression; + options.bottommost_compression_opts.level = 2; + options.bottommost_compression_opts.parallel_threads = 1; + options.env = CurrentOptions().env; + + ASSERT_OK(TryReopen(options)); + + CompressionType compression_used = CompressionType::kLZ4Compression; + CompressionOptions compression_opt_used; + bool compacted = false; + SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* c = reinterpret_cast(arg); + compression_used = c->output_compression(); + compression_opt_used = c->output_compression_opts(); + compacted = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kNoCompression, compression_used); + ASSERT_EQ(options.compression_opts.level, compression_opt_used.level); + ASSERT_EQ(options.compression_opts.parallel_threads, + compression_opt_used.parallel_threads); + + compression_used = CompressionType::kLZ4Compression; + compacted = false; + ASSERT_OK(dbfull()->SetOptions( 
+ {{"bottommost_compression", "kSnappyCompression"}, + {"bottommost_compression_opts", "0:6:0:0:4:true"}})); + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kSnappyCompression, compression_used); + ASSERT_EQ(6, compression_opt_used.level); + // Right now parallel_level is not yet allowed to be changed. + + SyncPoint::GetInstance()->DisableProcessing(); +} + #endif // ROCKSDB_LITE +TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) { + // Verify the bottommost compression options still take effect even when the + // bottommost compression type is left at its default value. Verify for both + // automatic and manual compaction. + if (!Snappy_Supported() || !LZ4_Supported()) { + return; + } + + constexpr int kUpperCompressionLevel = 1; + constexpr int kBottommostCompressionLevel = 2; + constexpr int kNumL0Files = 2; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.compression = CompressionType::kLZ4Compression; + options.compression_opts.level = kUpperCompressionLevel; + options.bottommost_compression_opts.level = kBottommostCompressionLevel; + options.bottommost_compression_opts.enabled = true; + Reopen(options); + + CompressionType compression_used = CompressionType::kDisableCompressionOption; + CompressionOptions compression_opt_used; + bool compacted = false; + SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) { + Compaction* c = static_cast(arg); + compression_used = c->output_compression(); + compression_opt_used = c->output_compression_opts(); + compacted = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // First, verify for automatic compaction. 
+ for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); + + // Second, verify for manual compaction. + compacted = false; + compression_used = CompressionType::kDisableCompressionOption; + compression_opt_used = CompressionOptions(); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_properties_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_properties_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_properties_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_properties_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,7 @@ #include "rocksdb/perf_context.h" #include "rocksdb/perf_level.h" #include "rocksdb/table.h" +#include "test_util/mock_time_env.h" #include "util/random.h" #include "util/string_util.h" @@ -26,7 +27,27 @@ class DBPropertiesTest : public DBTestBase { public: - DBPropertiesTest() : DBTestBase("/db_properties_test") {} + DBPropertiesTest() + : DBTestBase("db_properties_test", /*env_do_fsync=*/false) {} + + void AssertDbStats(const std::map& db_stats, + double expected_uptime, int expected_user_bytes_written, + int expected_wal_bytes_written, + int 
expected_user_writes_by_self, + int expected_user_writes_with_wal) { + ASSERT_EQ(std::to_string(expected_uptime), db_stats.at("db.uptime")); + ASSERT_EQ(std::to_string(expected_wal_bytes_written), + db_stats.at("db.wal_bytes_written")); + ASSERT_EQ("0", db_stats.at("db.wal_syncs")); + ASSERT_EQ(std::to_string(expected_user_bytes_written), + db_stats.at("db.user_bytes_written")); + ASSERT_EQ("0", db_stats.at("db.user_writes_by_other")); + ASSERT_EQ(std::to_string(expected_user_writes_by_self), + db_stats.at("db.user_writes_by_self")); + ASSERT_EQ(std::to_string(expected_user_writes_with_wal), + db_stats.at("db.user_writes_with_wal")); + ASSERT_EQ("0", db_stats.at("db.user_write_stall_micros")); + } }; #ifndef ROCKSDB_LITE @@ -52,12 +73,12 @@ // Block sync calls env_->delay_sstable_sync_.store(true, std::memory_order_release); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable ASSERT_TRUE(dbfull()->GetProperty( handles_[1], "rocksdb.num-entries-active-mem-table", &num)); ASSERT_EQ("2", num); - Put(1, "k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger compaction ASSERT_TRUE(dbfull()->GetProperty( handles_[1], "rocksdb.num-entries-active-mem-table", &num)); ASSERT_EQ("1", num); @@ -97,10 +118,10 @@ uint64_t v1, v2, v3; ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1)); - Put("12345678", ""); + ASSERT_OK(Put("12345678", "")); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3)); @@ -126,8 +147,8 @@ Random rnd(301); for (auto* handle : handles_) { for (int i = 0; i < kKeyNum; ++i) { - db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Put(WriteOptions(), handle, 
rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } } @@ -153,7 +174,7 @@ DB::Properties::kEstimateTableReadersMem, &before_flush_trm)); // Issue flush and expect larger memory usage of table readers. - db_->Flush(FlushOptions(), handle); + ASSERT_OK(db_->Flush(FlushOptions(), handle)); ASSERT_TRUE(db_->GetAggregatedIntProperty( DB::Properties::kEstimateTableReadersMem, &after_flush_trm)); @@ -212,7 +233,7 @@ void VerifyTableProperties( const TableProperties& base_tp, const TableProperties& new_tp, - double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.15 : 0.1, + double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.18 : 0.1, double index_size_bias = 0.1, double data_size_bias = 0.1, double num_data_blocks_bias = 0.05) { VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); @@ -299,9 +320,9 @@ for (int i = 0; i < files; i++) { int rows = files / 10; for (int j = 0; j < rows; j++) { - db_->Put(WriteOptions(), std::to_string(++key), "foo"); + ASSERT_OK(db_->Put(WriteOptions(), std::to_string(++key), "foo")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } } std::string num; @@ -335,7 +356,7 @@ table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -346,23 +367,24 @@ Random rnd(5632); for (int table = 1; table <= kTableCount; ++table) { for (int i = 0; i < kPutsPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kDeletionsPerTable; i++) { - db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize)); + ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize))); } for (int i = 0; i < 
kMergeOperandsPerTable; i++) { - db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kRangeDeletionsPerTable; i++) { - std::string start = RandomString(&rnd, kKeySize); + std::string start = rnd.RandomString(kKeySize); std::string end = start; end.resize(kValueSize); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + start, end)); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } std::string property; db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); @@ -409,11 +431,11 @@ int key_index = 0; Random rnd(301); for (int num = 0; num < 8; num++) { - Put("foo", "bar"); + ASSERT_OK(Put("foo", "bar")); GenerateNewFile(&rnd, &key_index); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); std::string prop; ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); @@ -429,7 +451,7 @@ // Reopen and issue Get(). 
See thee latency tracked ReopenWithColumnFamilies({"default", "pikachu"}, options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int key = 0; key < key_index; key++) { Get(Key(key)); } @@ -457,6 +479,7 @@ std::unique_ptr iter(db_->NewIterator(ReadOptions())); for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { } + ASSERT_OK(iter->status()); } ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop)); ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); @@ -470,9 +493,9 @@ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); // put something and read it back , CF 1 should show histogram. - Put(1, "foo", "bar"); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("bar", Get(1, "foo")); ASSERT_TRUE( @@ -498,7 +521,7 @@ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); // Clear internal stats - dbfull()->ResetStats(); + ASSERT_OK(dbfull()->ResetStats()); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop)); ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram")); ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); @@ -533,7 +556,7 @@ table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -546,24 +569,25 @@ TableProperties tp, sum_tp, expected_tp; for (int table = 1; table <= kTableCount; ++table) { for (int i = 0; i < kPutsPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + 
ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kDeletionsPerTable; i++) { - db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize)); + ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize))); } for (int i = 0; i < kMergeOperandsPerTable; i++) { - db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); + ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize), + rnd.RandomString(kValueSize))); } for (int i = 0; i < kRangeDeletionsPerTable; i++) { - std::string start = RandomString(&rnd, kKeySize); + std::string start = rnd.RandomString(kKeySize); std::string end = start; end.resize(kValueSize); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + start, end)); } - db_->Flush(FlushOptions()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ResetTableProperties(&sum_tp); for (int level = 0; level < kMaxLevel; ++level) { db_->GetProperty( @@ -603,7 +627,8 @@ value_is_delta_encoded); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. - VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); + VerifyTableProperties(expected_tp, tp, CACHE_LINE_SIZE >= 256 ? 0.6 : 0.5, + 0.4, 0.4, 0.25); } } } @@ -828,7 +853,7 @@ // Wait for compaction to be done. This is important because otherwise RocksDB // might schedule a compaction when reopening the database, failing assertion // (A) as a result. 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); options.max_open_files = 10; Reopen(options); // After reopening, no table reader is loaded, so no memory for table readers @@ -856,7 +881,7 @@ std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); ASSERT_EQ(int_num, 2U); @@ -865,7 +890,7 @@ std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); ASSERT_EQ(int_num, 3U); @@ -920,11 +945,12 @@ for (int r = 0; r < kNumRounds; ++r) { for (int f = 0; f < kFlushesPerRound; ++f) { for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + ASSERT_OK( + Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize))); } } // Make sure that there is no flush between getting the two properties. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); // in no iterator case, these two number should be the same. @@ -938,12 +964,13 @@ iters.push_back(db_->NewIterator(ReadOptions())); for (int f = 0; f < kFlushesPerRound; ++f) { for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + ASSERT_OK( + Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize))); } } // Force flush to prevent flush from happening between getting the // properties or after getting the properties and before the new round. - Flush(); + ASSERT_OK(Flush()); // In the second round, add iterators. 
dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); @@ -958,6 +985,7 @@ // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks // whenever we release an iterator. for (auto* iter : iters) { + ASSERT_OK(iter->status()); delete iter; dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); // Expect the size shrinking @@ -1007,19 +1035,19 @@ uint64_t int_num; ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_EQ(int_num, 0U); ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_GT(int_num, 0U); ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); - Flush(); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_GT(int_num, 0U); @@ -1027,7 +1055,7 @@ sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-pending-compaction-bytes", &int_num)); ASSERT_EQ(int_num, 0U); @@ -1057,7 +1085,7 @@ std::string key = ToString(i) + ToString(j) + "key"; ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal)); } - Flush(); + ASSERT_OK(Flush()); } // no compression at L0, so ratio is less than one @@ -1065,7 +1093,7 @@ ASSERT_GT(CompressionRatioAtLevel(0), 0.0); ASSERT_EQ(CompressionRatioAtLevel(1), -1.0); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_EQ(CompressionRatioAtLevel(0), -1.0); // Data at L1 should be highly compressed thanks to Snappy and redundant data @@ -1168,6 +1196,61 @@ } }; +class BlockCountingTablePropertiesCollector : public TablePropertiesCollector { + public: + 
static const std::string kNumSampledBlocksPropertyName; + + const char* Name() const override { + return "BlockCountingTablePropertiesCollector"; + } + + Status Finish(UserCollectedProperties* properties) override { + (*properties)[kNumSampledBlocksPropertyName] = + ToString(num_sampled_blocks_); + return Status::OK(); + } + + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override { + if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) { + num_sampled_blocks_++; + } + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{ + {kNumSampledBlocksPropertyName, ToString(num_sampled_blocks_)}, + }; + } + + private: + uint32_t num_sampled_blocks_ = 0; +}; + +const std::string + BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName = + "NumSampledBlocks"; + +class BlockCountingTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + const char* Name() const override { + return "BlockCountingTablePropertiesCollectorFactory"; + } + + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new BlockCountingTablePropertiesCollector(); + } +}; + #ifndef ROCKSDB_LITE TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { Options options = CurrentOptions(); @@ -1180,9 +1263,9 @@ // Create 4 tables for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), ToString(table * 100 + i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } TablePropertiesCollection props; @@ -1204,7 
+1287,7 @@ ASSERT_GT(collector_factory->num_created_, 0U); collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_GT(collector_factory->num_created_, 0U); } #endif // ROCKSDB_LITE @@ -1220,9 +1303,9 @@ // Create 2 files for (int table = 0; table < 2; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(1, ToString(table * 100 + i), "val"); + ASSERT_OK(Put(1, ToString(table * 100 + i), "val")); } - Flush(1); + ASSERT_OK(Flush(1)); } ASSERT_GT(collector_factory->num_created_, 0U); @@ -1230,15 +1313,15 @@ // Trigger automatic compactions. for (int table = 0; table < 3; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(1, ToString(table * 100 + i), "val"); + ASSERT_OK(Put(1, ToString(table * 100 + i), "val")); } - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_GT(collector_factory->num_created_, 0U); collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_GT(collector_factory->num_created_, 0U); // Come back to write to default column family @@ -1247,9 +1330,9 @@ // Create 4 tables in default column family for (int table = 0; table < 2; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_GT(collector_factory->num_created_, 0U); @@ -1257,15 +1340,15 @@ // Trigger automatic compactions. 
for (int table = 0; table < 3; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_GT(collector_factory->num_created_, 0U); collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_GT(collector_factory->num_created_, 0U); } @@ -1296,18 +1379,18 @@ const int kMaxKey = 1000; for (int i = 0; i < kMaxKey; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(i), rnd.RandomString(102))); + ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 1) { // Clear Level 0 so that when later flush a file with deletions, // we don't trigger an organic compaction. 
ASSERT_OK(Put(Key(0), "")); ASSERT_OK(Put(Key(kMaxKey * 2), "")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -1319,17 +1402,18 @@ iter->Next(); ++c; } + ASSERT_OK(iter->status()); ASSERT_EQ(c, 200); } - Delete(Key(0)); + ASSERT_OK(Delete(Key(0))); for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } - Delete(Key(kMaxKey * 2)); + ASSERT_OK(Delete(Key(kMaxKey * 2))); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); { SetPerfLevel(kEnableCount); @@ -1340,6 +1424,7 @@ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(c, 0); ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u); ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u); @@ -1370,14 +1455,14 @@ for (int i = 0; i < kMaxKey; i++) { ASSERT_OK(Put(Key(i), "")); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); for (int i = 1; i < kMaxKey - 1; i++) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0), 2); // Restart the DB. Although number of files didn't reach @@ -1385,7 +1470,7 @@ // still be triggered because of the need-compaction hint. 
options.disable_auto_compactions = false; Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); { SetPerfLevel(kEnableCount); @@ -1395,6 +1480,7 @@ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { c++; } + ASSERT_OK(iter->status()); ASSERT_EQ(c, 2); ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0); // We iterate every key twice. Is it a bug? @@ -1403,25 +1489,149 @@ } } +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { + // Sampled compression requires at least one of the following four types. + if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() && + !ZSTD_Supported()) { + return; + } + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.table_properties_collector_factories.emplace_back( + std::make_shared()); + + for (bool sample_for_compression : {false, true}) { + // For simplicity/determinism, sample 100% when enabled, or 0% when disabled + options.sample_for_compression = sample_for_compression ? 1 : 0; + + DestroyAndReopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + // A `BlockAdd()` should have been seen for files generated by flush or + // compaction when `sample_for_compression` is enabled. 
+ TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + auto& user_props = file_and_props.second->user_collected_properties; + ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName) != + user_props.end()); + ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName), + ToString(sample_for_compression ? 1 : 0)); + } + } +} + +class CompressionSamplingDBPropertiesTest + : public DBPropertiesTest, + public ::testing::WithParamInterface { + public: + CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {} + + protected: + const bool fast_; +}; + +INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest, + CompressionSamplingDBPropertiesTest, ::testing::Bool()); + +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_P(CompressionSamplingDBPropertiesTest, + EstimateDataSizeWithCompressionSampling) { + Options options = CurrentOptions(); + if (fast_) { + // One of the following light compression libraries must be present. + if (LZ4_Supported()) { + options.compression = kLZ4Compression; + } else if (Snappy_Supported()) { + options.compression = kSnappyCompression; + } else { + return; + } + } else { + // One of the following heavy compression libraries must be present. + if (ZSTD_Supported()) { + options.compression = kZSTD; + } else if (Zlib_Supported()) { + options.compression = kZlibCompression; + } else { + return; + } + } + options.disable_auto_compactions = true; + // For simplicity/determinism, sample 100%. + options.sample_for_compression = 1; + Reopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. 
The value consists of compressible data so the + // data block should be stored compressed. + std::string val(1024, 'a'); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", val)); + ASSERT_OK(Put("b", val)); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + ASSERT_GT(file_and_props.second->data_size, 0); + if (fast_) { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->fast_compression_estimated_data_size); + } else { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->slow_compression_estimated_data_size); + } + } +} + TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) { - Options options; + Options options = CurrentOptions(); Reopen(options); - Put("foo", "bar"); - Delete("foo"); - Delete("foo"); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Delete("foo")); + ASSERT_OK(Delete("foo")); uint64_t num_keys = 0; ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys)); ASSERT_EQ(0, num_keys); } TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { - std::unique_ptr mock_env(new MockTimeEnv(Env::Default())); uint64_t oldest_key_time = 0; - Options options; - options.env = mock_env.get(); + Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); // "rocksdb.estimate-oldest-key-time" only available to fifo compaction. 
- mock_env->set_current_time(100); for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal, kCompactionStyleNone}) { options.compaction_style = compaction; @@ -1432,60 +1642,61 @@ DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); } + int64_t mock_start_time; + ASSERT_OK(env_->GetCurrentTime(&mock_start_time)); + options.compaction_style = kCompactionStyleFIFO; options.ttl = 300; + options.max_open_files = -1; options.compaction_options_fifo.allow_compaction = false; DestroyAndReopen(options); - mock_env->set_current_time(100); + env_->MockSleepForSeconds(100); ASSERT_OK(Put("k1", "v1")); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); ASSERT_OK(Flush()); ASSERT_EQ("1", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); - mock_env->set_current_time(200); + env_->MockSleepForSeconds(100); // -> 200 ASSERT_OK(Put("k2", "v2")); ASSERT_OK(Flush()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); - mock_env->set_current_time(300); + env_->MockSleepForSeconds(100); // -> 300 ASSERT_OK(Put("k3", "v3")); ASSERT_OK(Flush()); ASSERT_EQ("3", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(100, oldest_key_time); + ASSERT_EQ(100, oldest_key_time - mock_start_time); - mock_env->set_current_time(450); + env_->MockSleepForSeconds(150); // -> 450 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("2", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, 
&oldest_key_time)); - ASSERT_EQ(200, oldest_key_time); + ASSERT_EQ(200, oldest_key_time - mock_start_time); - mock_env->set_current_time(550); + env_->MockSleepForSeconds(100); // -> 550 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("1", FilesPerLevel()); ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - ASSERT_EQ(300, oldest_key_time); + ASSERT_EQ(300, oldest_key_time - mock_start_time); - mock_env->set_current_time(650); + env_->MockSleepForSeconds(100); // -> 650 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel()); ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); - - // Close before mock_env destructs. - Close(); } TEST_F(DBPropertiesTest, SstFilesSize) { @@ -1516,6 +1727,7 @@ std::shared_ptr listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.disable_auto_compactions = true; options.listeners.push_back(listener); Reopen(options); @@ -1588,11 +1800,11 @@ for (int i = 0; i < kNumL0Files; ++i) { // Make sure they overlap in keyspace to prevent trivial move - Put("key1", "val"); - Put("key2", "val"); - Flush(); + ASSERT_OK(Put("key1", "val")); + ASSERT_OK(Put("key2", "val")); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(listener->Validated()); } @@ -1600,6 +1812,8 @@ Options options; uint64_t value; + options.env = CurrentOptions().env; + // Block cache properties are not available for tables other than // block-based table. options.table_factory.reset(NewPlainTableFactory()); @@ -1650,7 +1864,8 @@ // Insert unpinned item to the cache and check size. 
constexpr size_t kSize1 = 50; - block_cache->Insert("item1", nullptr /*value*/, kSize1, nullptr /*deleter*/); + ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, kSize1, + nullptr /*deleter*/)); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value)); @@ -1662,8 +1877,8 @@ // Insert pinned item to the cache and check size. constexpr size_t kSize2 = 30; Cache::Handle* item2 = nullptr; - block_cache->Insert("item2", nullptr /*value*/, kSize2, nullptr /*deleter*/, - &item2); + ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, kSize2, + nullptr /*deleter*/, &item2)); ASSERT_NE(nullptr, item2); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -1676,8 +1891,8 @@ // Insert another pinned item to make the cache over-sized. constexpr size_t kSize3 = 80; Cache::Handle* item3 = nullptr; - block_cache->Insert("item3", nullptr /*value*/, kSize3, nullptr /*deleter*/, - &item3); + ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, kSize3, + nullptr /*deleter*/, &item3)); ASSERT_NE(nullptr, item2); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -1701,7 +1916,80 @@ ASSERT_EQ(0, value); } +TEST_F(DBPropertiesTest, GetMapPropertyDbStats) { + auto mock_clock = std::make_shared(env_->GetSystemClock()); + CompositeEnvWrapper env(env_, mock_clock); + + Options opts = CurrentOptions(); + opts.env = &env; + Reopen(opts); + + { + std::map db_stats; + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 0.0 /* expected_uptime */, + 0 /* expected_user_bytes_written */, + 0 /* expected_wal_bytes_written */, + 0 /* expected_user_writes_by_self */, + 0 /* expected_user_writes_with_wal */); + } + + { + mock_clock->SleepForMicroseconds(1500000); + + std::map db_stats; + 
ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 1.5 /* expected_uptime */, + 0 /* expected_user_bytes_written */, + 0 /* expected_wal_bytes_written */, + 0 /* expected_user_writes_by_self */, + 0 /* expected_user_writes_with_wal */); + } + + int expected_user_bytes_written = 0; + { + // Write with WAL disabled. + WriteOptions write_opts; + write_opts.disableWAL = true; + + WriteBatch batch; + ASSERT_OK(batch.Put("key", "val")); + expected_user_bytes_written += static_cast(batch.GetDataSize()); + + ASSERT_OK(db_->Write(write_opts, &batch)); + + std::map db_stats; + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 1.5 /* expected_uptime */, + expected_user_bytes_written, + 0 /* expected_wal_bytes_written */, + 1 /* expected_user_writes_by_self */, + 0 /* expected_user_writes_with_wal */); + } + + int expected_wal_bytes_written = 0; + { + // Write with WAL enabled. + WriteBatch batch; + ASSERT_OK(batch.Delete("key")); + expected_user_bytes_written += static_cast(batch.GetDataSize()); + expected_wal_bytes_written += static_cast(batch.GetDataSize()); + + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::map db_stats; + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats)); + AssertDbStats(db_stats, 1.5 /* expected_uptime */, + expected_user_bytes_written, expected_wal_bytes_written, + 2 /* expected_user_writes_by_self */, + 1 /* expected_user_writes_with_wal */); + } + + Close(); +} + #endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_range_del_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_range_del_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_range_del_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_range_del_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,13 +7,14 @@ #include 
"port/stack_trace.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "test_util/testutil.h" +#include "util/random.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { class DBRangeDelTest : public DBTestBase { public: - DBRangeDelTest() : DBTestBase("/db_range_del_test") {} + DBRangeDelTest() : DBTestBase("db_range_del_test", /*env_do_fsync=*/false) {} std::string GetNumericStr(int key) { uint64_t uint64_key = static_cast(key); @@ -47,6 +48,21 @@ ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported()); } +TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) { + ASSERT_OK(db_->Put(WriteOptions(), "b", "val")); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "b")); + ASSERT_EQ("val", Get("b")); +} + +TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) { + ASSERT_OK(db_->Put(WriteOptions(), "b", "val")); + ASSERT_TRUE( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a") + .IsInvalidArgument()); + ASSERT_EQ("val", Get("b")); +} + TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) { do { DestroyAndReopen(CurrentOptions()); @@ -57,6 +73,15 @@ } while (ChangeOptions(kRangeDelSkipConfigs)); } +TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) { + Options opts = CurrentOptions(); + opts.compression_opts.max_dict_bytes = 16384; + Reopen(opts); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr2")); + ASSERT_OK(db_->Flush(FlushOptions())); +} + TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) { do { Options opts = CurrentOptions(); @@ -66,13 +91,14 @@ // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"); - db_->Flush(FlushOptions()); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(1, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -92,7 +118,7 @@ Options options = CurrentOptions(); options.disable_auto_compactions = true; options.level0_file_num_compaction_trigger = kNumFiles; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); options.num_levels = 2; options.target_file_size_base = kFileBytes; BlockBasedTableOptions table_options; @@ -102,28 +128,29 @@ // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(1))); Random rnd(301); for (int i = 0; i < kNumFiles; ++i) { std::vector values; // Write 12K (4 values, each 3K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, 3 << 10)); + values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); if (j == 0 && i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); db_->ReleaseSnapshot(snapshot); @@ -139,42 +166,61 @@ opts.disable_auto_compactions = true; opts.level0_file_num_compaction_trigger = kNumFiles; opts.max_compaction_bytes = kNumPerFile * kBytesPerVal; - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); // Want max_compaction_bytes to trigger the end of compaction output file, not // target_file_size_base, so make the latter much bigger - opts.target_file_size_base = 100 * opts.max_compaction_bytes; - Reopen(opts); + // opts.target_file_size_base = 100 * opts.max_compaction_bytes; + opts.target_file_size_base = 1; + DestroyAndReopen(opts); // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); + Random rnd(301); + + ASSERT_OK(Put(GetNumericStr(0), rnd.RandomString(kBytesPerVal))); + ASSERT_OK( + Put(GetNumericStr(kNumPerFile - 1), rnd.RandomString(kBytesPerVal))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(GetNumericStr(kNumPerFile), rnd.RandomString(kBytesPerVal))); + ASSERT_OK( + Put(GetNumericStr(kNumPerFile * 2 - 1), rnd.RandomString(kBytesPerVal))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(NumTableFilesAtLevel(2), 2); + + ASSERT_OK(db_->SetOptions( + db_->DefaultColumnFamily(), + {{"target_file_size_base", ToString(100 * opts.max_compaction_bytes)}})); + // It spans the whole key-range, thus will be included in all output files ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), GetNumericStr(0), GetNumericStr(kNumFiles * kNumPerFile - 1))); - Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { std::vector values; // Write 1MB (256 values, each 4K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, kBytesPerVal)); + values.push_back(rnd.RandomString(kBytesPerVal)); ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j])); } // extra entry to trigger SpecialSkipListFactory's flush ASSERT_OK(Put(GetNumericStr(kNumPerFile), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, + /*column_family=*/nullptr, + /*disallow_trivial_move=*/true)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GE(NumTableFilesAtLevel(1), 2); - std::vector> files; dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files); - for (size_t i = 0; i < files[1].size() - 1; ++i) { + for (size_t i = 0; i + 1 < files[1].size(); ++i) { 
ASSERT_TRUE(InternalKeyComparator(opts.comparator) .Compare(files[1][i].largest, files[1][i + 1].smallest) < 0); @@ -205,10 +251,10 @@ } TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); - db_->Put(WriteOptions(), "b2", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b2", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b")); // first iteration verifies query correctness in memtable, second verifies @@ -225,8 +271,9 @@ } TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "unused", "val"); // prevents empty after compaction - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "unused", + "val")); // prevents empty after compaction + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); @@ -238,8 +285,8 @@ for (int i = 0; i < 2; ++i) { if (i > 0) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } @@ -253,7 +300,7 @@ const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - Reopen(opts); + DestroyAndReopen(opts); // Write a third before snapshot, a third between snapshot and tombstone, and // a third after the tombstone. 
Keys older than snapshot or newer than the @@ -263,12 +310,13 @@ if (i == kNum / 3) { snapshot = db_->GetSnapshot(); } else if (i == 2 * kNum / 3) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); for (int i = 0; i < kNum; ++i) { ReadOptions read_opts; @@ -290,32 +338,35 @@ Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.disable_auto_compactions = true; - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); opts.num_levels = 2; opts.statistics = CreateDBStatistics(); - Reopen(opts); + DestroyAndReopen(opts); for (int i = 0; i < kNumFiles; ++i) { if (i > 0) { // range tombstone covers first half of the previous file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr((i - 1) * kNumPerFile), - GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)); + ASSERT_OK(db_->DeleteRange( + WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr((i - 1) * kNumPerFile), + GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2))); } // Make sure a given key appears in each file so compaction won't be able to // use trivial move, which would happen if the ranges were non-overlapping. // Also, we need an extra element since flush is only triggered when the // number of keys is one greater than SpecialSkipListFactory's limit. // We choose a key outside the key-range used by the test to avoid conflict. 
- db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), + "val")); for (int j = 0; j < kNumPerFile; ++j) { - db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"); + ASSERT_OK( + db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2, @@ -345,7 +396,7 @@ options.level0_file_num_compaction_trigger = kNumFiles; options.max_bytes_for_level_base = 2 * kFileBytes; options.max_subcompactions = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); options.num_levels = 3; options.target_file_size_base = kFileBytes; options.target_file_size_multiplier = 1; @@ -357,18 +408,18 @@ if (i > 0) { // delete [95,105) in two files, [295,305) in next two int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) for (int k = 0; k < kNumPerFile; k++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k])); } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kNumFiles - 1) { // background compaction may happen early for kNumFiles'th file 
ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); @@ -384,7 +435,7 @@ // oversized L0 (relative to base_level) causes the compaction to run // earlier. ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"disable_auto_compactions", "true"}})); ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -404,7 +455,7 @@ options.compaction_style = kCompactionStyleUniversal; options.level0_file_num_compaction_trigger = kFilesPerLevel; options.max_subcompactions = 4; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); options.num_levels = kNumLevels; options.target_file_size_base = kNumPerFile << 10; options.target_file_size_multiplier = 1; @@ -417,24 +468,24 @@ // insert range deletions [95,105) in two files, [295,305) in next two // to prepare L1 for later manual compaction. 
int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) for (int k = 0; k < kNumPerFile; k++) { - values.push_back(RandomString(&rnd, 990)); + values.push_back(rnd.RandomString(990)); ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k])); } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kFilesPerLevel - 1) { // background compaction may happen early for kFilesPerLevel'th file ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1); } @@ -444,7 +495,7 @@ // probably means universal compaction + subcompaction + range deletion are // compatible. 
ASSERT_OK(dbfull()->RunManualCompaction( - reinterpret_cast(db_->DefaultColumnFamily()) + static_cast_with_check(db_->DefaultColumnFamily()) ->cfd(), 1 /* input_level */, 2 /* output_level */, CompactRangeOptions(), nullptr /* begin */, nullptr /* end */, true /* exclusive */, @@ -457,7 +508,7 @@ const int kNumPerFile = 3, kNumFiles = 3; Options opts = CurrentOptions(); opts.disable_auto_compactions = true; - opts.memtable_factory.reset(new SpecialSkipListFactory(2 * kNumPerFile)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(2 * kNumPerFile)); opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); opts.num_levels = 2; Reopen(opts); @@ -467,17 +518,17 @@ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) { if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) { // Delete merge operands from all but the last file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); // we need to prevent trivial move using Puts so compaction will actually // process the merge operands. 
- db_->Put(WriteOptions(), "prevent_trivial_move", ""); + ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", "")); if (i > 0 && i % kNumPerFile == 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } @@ -488,7 +539,7 @@ PutFixed64(&expected, 45); // 1+2+...+9 ASSERT_EQ(expected, actual); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); expected.clear(); ASSERT_OK(db_->Get(read_opts, "key", &actual)); @@ -534,19 +585,19 @@ opts.statistics = CreateDBStatistics(); Reopen(opts); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", - "dr10"); // obsolete after compaction - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr10")); // obsolete after compaction + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", - "dr20"); // protected by snapshot - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", + "dr20")); // protected by snapshot + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(2, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -581,34 +632,36 @@ opts.comparator = test::Uint64Comparator(); opts.level0_file_num_compaction_trigger = 4; opts.level0_stop_writes_trigger = 4; - 
opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); opts.num_levels = 2; BlockBasedTableOptions bbto; bbto.cache_index_and_filter_blocks = true; bbto.block_cache = NewLRUCache(8 << 20); opts.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(opts); + DestroyAndReopen(opts); // Hold a snapshot so range deletions can't become obsolete during compaction // to bottommost level (i.e., L1). const Snapshot* snapshot = db_->GetSnapshot(); for (int i = 0; i < kNum; ++i) { - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); if (i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } } // Must be > 1 so the first L1 file can be closed before scan finishes - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(NumTableFilesAtLevel(1), 1); std::vector file_numbers = ListTableFiles(env_, dbname_); ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = kRangeEnd; iter->SeekToFirst(); for (auto file_number : file_numbers) { @@ -626,12 +679,22 @@ ASSERT_EQ(kNum, expected); delete iter; db_->ReleaseSnapshot(snapshot); + + // Also test proper cache handling in GetRangeTombstoneIterator, + // via TablesRangeTombstoneSummary. (This once triggered memory leak + // report with ASAN.) 
+ opts.max_open_files = 1; + Reopen(opts); + + std::string str; + ASSERT_OK(dbfull()->TablesRangeTombstoneSummary(db_->DefaultColumnFamily(), + 100, &str)); } TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -650,13 +713,13 @@ // memtable can hold. It switches the active memtable to immutable (flush is // prevented by the above options) upon inserting an element that would // overflow the memtable. - opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); DestroyAndReopen(opts); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "blah", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "blah", "val")); ReadOptions read_opts; std::string value; @@ -667,7 +730,7 @@ TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -690,11 +753,11 @@ for (int i = 0; i < kNumMergeOps; ++i) { std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); if (i == kNumMergeOps / 2) { // deletes [0, 5] - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } } @@ -715,19 +778,19 @@ Options opts = CurrentOptions(); opts.max_write_buffer_number = 4; opts.min_write_buffer_number_to_merge = 3; - 
opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -744,21 +807,23 @@ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); + DestroyAndReopen(opts); // Write half of the keys before the tombstone and half after the tombstone. // Only covered keys (i.e., within the range and older than the tombstone) // should be deleted. 
for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -777,8 +842,8 @@ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile)); + DestroyAndReopen(opts); const Snapshot* snapshot = nullptr; // Put a snapshot before the range tombstone, verify an iterator using that @@ -786,14 +851,16 @@ for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -809,25 +876,26 @@ Options opts = CurrentOptions(); opts.max_write_buffer_number = 4; opts.min_write_buffer_number_to_merge = 3; - opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + 
opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ReadOptions read_opts; read_opts.ignore_range_deletions = true; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int i = 0; std::string expected[] = {"imm_key", "mem_key", "sst_key"}; for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) { @@ -841,7 +909,7 @@ #ifndef ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -857,6 +925,7 @@ iter->SeekToFirst(); } ASSERT_TRUE(iter->status().IsNotSupported()); + delete iter; if (i == 0) { ASSERT_OK(db_->Flush(FlushOptions())); @@ -866,7 +935,6 @@ } db_->ReleaseSnapshot(snapshot); } - #endif // !ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { @@ -910,8 +978,8 @@ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); - db_->EnableAutoCompaction({db_->DefaultColumnFamily()}); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); + 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->ReleaseSnapshot(snapshot); } @@ -933,7 +1001,7 @@ for (int i = 0; i < kNumKeys; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(kNumKeys))); for (int i = 0; i < kNumKeys; ++i) { @@ -956,7 +1024,7 @@ options.compression = kNoCompression; options.level0_file_num_compaction_trigger = kNumFilesPerLevel; options.memtable_factory.reset( - new SpecialSkipListFactory(2 /* num_entries_flush */)); + test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); options.target_file_size_base = kValueBytes; // i == 0: CompactFiles // i == 1: CompactRange @@ -971,24 +1039,24 @@ // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), - Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(2 * kNumFilesPerLevel))); Random rnd(301); - std::string value = RandomString(&rnd, kValueBytes); + std::string value = rnd.RandomString(kValueBytes); for (int j = 0; j < kNumFilesPerLevel; ++j) { // give files overlapping key-ranges to prevent trivial move ASSERT_OK(Put(Key(j), value)); ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); if (j > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(j, NumTableFilesAtLevel(0)); } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1)); @@ -1006,7 +1074,7 @@ } else if (i == 2) { ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"max_bytes_for_level_base", 
"10000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } ASSERT_GT(NumTableFilesAtLevel(2), 0); @@ -1024,7 +1092,7 @@ options.compression = kNoCompression; options.level0_file_num_compaction_trigger = kNumFilesPerLevel; options.memtable_factory.reset( - new SpecialSkipListFactory(2 /* num_entries_flush */)); + test::NewSpecialSkipListFactory(2 /* num_entries_flush */)); options.target_file_size_base = kValueBytes; options.disable_auto_compactions = true; @@ -1040,15 +1108,15 @@ // A snapshot protects the range tombstone from dropping due to // becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(0), Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(2 * kNumFilesPerLevel))); // Create 2 additional sstables in L0. Note that the first sstable // contains the range tombstone. // [key000000#3,1, key000004#72057594037927935,15] // [key000001#5,1, key000002#6,1] Random rnd(301); - std::string value = RandomString(&rnd, kValueBytes); + std::string value = rnd.RandomString(kValueBytes); for (int j = 0; j < kNumFilesPerLevel; ++j) { // Give files overlapping key-ranges to prevent a trivial move when we // compact from L0 to L1. 
@@ -1080,7 +1148,7 @@ ASSERT_EQ(value, Get(Key(2))); auto begin_str = Key(3); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(2, NumTableFilesAtLevel(2)); ASSERT_EQ(value, Get(Key(2))); @@ -1099,7 +1167,7 @@ // [key000002#6,1, key000004#72057594037927935,15] auto begin_str = Key(0); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, &begin); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_EQ(3, NumTableFilesAtLevel(2)); } @@ -1183,7 +1251,7 @@ const Snapshot* snapshot = nullptr; for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kFileBytes / kValueBytes; ++j) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); ASSERT_OK(db_->Merge(WriteOptions(), "key", value)); } if (i == kNumFiles - 1) { @@ -1200,9 +1268,9 @@ std::string value; ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); - dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Now we have multiple files at L1 all containing a single user key, thus // guaranteeing overlap in the file endpoints. @@ -1213,9 +1281,9 @@ // Compact and verify again. It's worthwhile because now the files have // tighter endpoints, so we can verify that doesn't mess anything up. 
- dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 1 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_GT(NumTableFilesAtLevel(2), 1); ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); @@ -1267,7 +1335,7 @@ const Snapshot* snapshots[] = {nullptr, nullptr}; for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kFileBytes / kValueBytes; ++j) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); std::string key; if (i < kNumFiles / 2) { key = Key(0); @@ -1291,6 +1359,7 @@ auto get_key_count = [this]() -> int { auto* iter = db_->NewIterator(ReadOptions()); + assert(iter->status().ok()); iter->SeekToFirst(); int keys_found = 0; for (; iter->Valid(); iter->Next()) { @@ -1313,7 +1382,7 @@ // Now overwrite a few keys that are in L1 files that definitely don't have // overlapping boundary keys. 
for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value)); } ASSERT_OK(db_->Flush(FlushOptions())); @@ -1360,7 +1429,7 @@ const Snapshot* snapshot = nullptr; for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kFileBytes / kValueBytes; ++j) { - auto value = RandomString(&rnd, kValueBytes); + auto value = rnd.RandomString(kValueBytes); ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value)); if (i == 0 && j == kNumKeys) { // Take snapshot to prevent covered merge operands from being dropped or @@ -1393,6 +1462,7 @@ ASSERT_GT(NumTableFilesAtLevel(1), 1); auto* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); iter->SeekToLast(); int keys_found = 0; for (; iter->Valid(); iter->Prev()) { @@ -1419,11 +1489,12 @@ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(10))); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -1466,6 +1537,7 @@ ReadOptions read_opts; read_opts.snapshot = snapshot.get(); std::unique_ptr iter(db_->NewIterator(read_opts)); + ASSERT_OK(iter->status()); TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator"); @@ -1500,10 +1572,10 @@ for (int i = 0; i < kFileBytes / kValueBytes; ++i) { std::string key(1, first_char); key.append(Key(i)); - std::string value = RandomString(&rnd, kValueBytes); + std::string value = rnd.RandomString(kValueBytes); ASSERT_OK(Put(key, value)); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); MoveFilesToLevel(2); } ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -1522,7 +1594,7 @@ // TODO(ajkr): remove this `Put` after file cutting accounts for range // tombstones (#3977). 
ASSERT_OK(Put("c" + Key(1), "value")); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone // and the range tombstone is only placed in the second SST. @@ -1530,9 +1602,9 @@ Slice begin_key(begin_key_storage); std::string end_key_storage("d"); Slice end_key(end_key_storage); - dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */, - &end_key /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, &begin_key /* begin */, &end_key /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); std::vector all_metadata; @@ -1575,6 +1647,7 @@ const int kNumPerFile = 4, kNumFiles = 2; Options options = CurrentOptions(); options.disable_auto_compactions = true; + options.target_file_size_base = 9 * 1024; options.max_compaction_bytes = 9 * 1024; DestroyAndReopen(options); Random rnd(301); @@ -1582,7 +1655,7 @@ std::vector values; // Write 12K (4 values, each 3K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, 3 << 10)); + values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); } } @@ -1597,15 +1670,15 @@ ASSERT_EQ(1, NumTableFilesAtLevel(0)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); // The tombstone range is not broken up into multiple SSTs which may incur a // large compaction with L2. 
ASSERT_EQ(1, NumTableFilesAtLevel(1)); std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(1, NumTableFilesAtLevel(2)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); } @@ -1614,6 +1687,7 @@ const int kNumPerFile = 4, kNumFiles = 2; Options options = CurrentOptions(); options.disable_auto_compactions = true; + options.target_file_size_base = 9 * 1024; options.max_compaction_bytes = 9 * 1024; DestroyAndReopen(options); Random rnd(301); @@ -1621,7 +1695,7 @@ std::vector values; // Write 12K (4 values, each 3K) for (int j = 0; j < kNumPerFile; j++) { - values.push_back(RandomString(&rnd, 3 << 10)); + values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); } } @@ -1638,17 +1712,46 @@ // The key range is broken up into three SSTs to avoid a future big compaction // with the grandparent - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(3, NumTableFilesAtLevel(1)); - std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); - ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + ASSERT_EQ( + 3, NumTableFilesAtLevel( + 2)); // L1->L2 compaction size is limited to max_compaction_bytes ASSERT_EQ(0, NumTableFilesAtLevel(1)); } +TEST_F(DBRangeDelTest, IteratorRefresh) { + // Refreshing an iterator after a range tombstone is added should cause the + // deleted range of keys to disappear. 
+ for (bool sv_changed : {false, true}) { + ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1")); + ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2")); + + auto* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key2", "key3")); + + if (sv_changed) { + ASSERT_OK(db_->Flush(FlushOptions())); + } + + ASSERT_OK(iter->Refresh()); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + ASSERT_EQ("key1", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + delete iter; + } +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_secondary_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_secondary_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_secondary_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_secondary_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1260 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/db_impl/db_impl_secondary.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/utilities/transaction_db.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class DBSecondaryTest : public DBTestBase { + public: + DBSecondaryTest() + : DBTestBase("db_secondary_test", /*env_do_fsync=*/true), + secondary_path_(), + handles_secondary_(), + db_secondary_(nullptr) { + secondary_path_ = + test::PerThreadDBPath(env_, "/db_secondary_test_secondary"); + } + + ~DBSecondaryTest() override { + CloseSecondary(); + if (getenv("KEEP_DB") != nullptr) { + fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(secondary_path_, options)); + } + } + + protected: + Status ReopenAsSecondary(const Options& options) { + return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_); + } + + void OpenSecondary(const Options& options); + + Status TryOpenSecondary(const Options& options); + + void OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options); + + void CloseSecondary() { + for (auto h : handles_secondary_) { + ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h)); + } + handles_secondary_.clear(); + delete db_secondary_; + db_secondary_ = nullptr; + } + + DBImplSecondary* db_secondary_full() { + return static_cast(db_secondary_); + } + + void CheckFileTypeCounts(const std::string& dir, int expected_log, + int expected_sst, int expected_manifest) const; + + std::string secondary_path_; + std::vector handles_secondary_; + DB* db_secondary_; +}; + +void DBSecondaryTest::OpenSecondary(const Options& options) { + ASSERT_OK(TryOpenSecondary(options)); +} + +Status DBSecondaryTest::TryOpenSecondary(const Options& options) { + Status s = + DB::OpenAsSecondary(options, dbname_, 
secondary_path_, &db_secondary_); + return s; +} + +void DBSecondaryTest::OpenSecondaryWithColumnFamilies( + const std::vector& column_families, const Options& options) { + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + for (const auto& cf_name : column_families) { + cf_descs.emplace_back(cf_name, options); + } + Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_OK(s); +} + +void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, + int expected_log, int expected_sst, + int expected_manifest) const { + std::vector filenames; + ASSERT_OK(env_->GetChildren(dir, &filenames)); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kWalFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(expected_log, log_cnt); + ASSERT_EQ(expected_sst, sst_cnt); + ASSERT_EQ(expected_manifest, manifest_cnt); +} + +TEST_F(DBSecondaryTest, NonExistingDb) { + Destroy(last_options_); + + Options options = GetDefaultOptions(); + options.env = env_; + options.max_open_files = -1; + const std::string dbname = "/doesnt/exist"; + Status s = + DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_); + ASSERT_TRUE(s.IsIOError()); +} + +TEST_F(DBSecondaryTest, ReopenAsSecondary) { + Options options; + options.env = env_; + Reopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Put("bar", "bar_value")); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + Close(); + + ASSERT_OK(ReopenAsSecondary(options)); + ASSERT_EQ("foo_value", Get("foo")); + ASSERT_EQ("bar_value", Get("bar")); + ReadOptions ropts; + ropts.verify_checksums = true; + auto db1 = static_cast(db_); + ASSERT_NE(nullptr, db1); + Iterator* iter = db1->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + size_t count = 
0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + if (0 == count) { + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value", iter->value().ToString()); + } else if (1 == count) { + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value", iter->value().ToString()); + } + ++count; + } + delete iter; + ASSERT_EQ(2, count); +} + +TEST_F(DBSecondaryTest, SimpleInternalCompaction) { + Options options; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + + ASSERT_EQ(result.output_files.size(), 1); + InternalKey smallest, largest; + smallest.DecodeFrom(result.output_files[0].smallest_internal_key); + largest.DecodeFrom(result.output_files[0].largest_internal_key); + ASSERT_EQ(smallest.user_key().ToString(), "bar"); + ASSERT_EQ(largest.user_key().ToString(), "foo"); + ASSERT_EQ(result.output_level, 1); + ASSERT_EQ(result.output_path, this->secondary_path_); + ASSERT_EQ(result.num_output_records, 2); + ASSERT_GT(result.bytes_written, 0); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + const int kRangeL2 = 10; + const int kRangeL1 = 30; + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i * 
kRangeL2), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(Key(i * kRangeL1), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put(Key(i * 30), "value" + ToString(i))); + ASSERT_OK(Put(Key(i * 30 + 50), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + // pick 2 files on level 0 for compaction, which has 3 overlap files on L1 + CompactionServiceInput input1; + input1.input_files.push_back(meta.levels[0].files[2].name); + input1.input_files.push_back(meta.levels[0].files[3].name); + input1.input_files.push_back(meta.levels[1].files[0].name); + input1.input_files.push_back(meta.levels[1].files[1].name); + input1.input_files.push_back(meta.levels[1].files[2].name); + + input1.output_level = 1; + + options.max_open_files = -1; + Close(); + + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input1, + &result)); + ASSERT_OK(result.status); + + // pick 2 files on level 1 for compaction, which has 6 overlap files on L2 + CompactionServiceInput input2; + input2.input_files.push_back(meta.levels[1].files[1].name); + input2.input_files.push_back(meta.levels[1].files[2].name); + for (int i = 3; i < 9; i++) { + input2.input_files.push_back(meta.levels[2].files[i].name); + } + + input2.output_level = 2; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result)); + ASSERT_OK(result.status); + + CloseSecondary(); + + // delete all l2 files, without update manifest + for (auto& file : meta.levels[2].files) { + ASSERT_OK(env_->DeleteFile(dbname_ + file.name)); + } + 
OpenSecondary(options); + cfh = db_secondary_->DefaultColumnFamily(); + Status s = db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + // TODO: L0 -> L1 compaction should success, currently version is not built + // if files is missing. + // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, + // input1, &result)); +} + +TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + // trigger compaction to delete the files for secondary instance compaction + ASSERT_OK(Put("foo", "foo_value" + std::to_string(3))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(3))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + 
ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + Close(); + + ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0])); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + input.input_files.erase(input.input_files.begin()); + + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, OpenAsSecondary) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, 
iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); +} + +namespace { +class TraceFileEnv : public EnvWrapper { + public: + explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {} + static const char* kClassName() { return "TraceFileEnv"; } + const char* Name() const override { return kClassName(); } + + Status NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& env_options) override { + class TracedRandomAccessFile : public RandomAccessFile { + public: + TracedRandomAccessFile(std::unique_ptr&& target, + std::atomic& counter) + : target_(std::move(target)), files_closed_(counter) {} + ~TracedRandomAccessFile() override { + files_closed_.fetch_add(1, std::memory_order_relaxed); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + + private: + std::unique_ptr target_; + std::atomic& files_closed_; + }; + Status s = target()->NewRandomAccessFile(f, r, env_options); + if (s.ok()) { + r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_)); + } + return s; + } + + int files_closed() const { + return files_closed_.load(std::memory_order_relaxed); + } + + private: + std::atomic files_closed_{0}; +}; +} // namespace + +TEST_F(DBSecondaryTest, SecondaryCloseFiles) { + Options options; + options.env = env_; + options.max_open_files = 1; + options.disable_auto_compactions = true; + Reopen(options); + Options options1; + 
std::unique_ptr traced_env(new TraceFileEnv(env_)); + options1.env = traced_env.get(); + OpenSecondary(options1); + + static const auto verify_db = [&]() { + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + std::unique_ptr iter2(db_secondary_->NewIterator(ReadOptions())); + for (iter1->SeekToFirst(), iter2->SeekToFirst(); + iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) { + ASSERT_EQ(iter1->key(), iter2->key()); + ASSERT_EQ(iter1->value(), iter2->value()); + } + ASSERT_FALSE(iter1->Valid()); + ASSERT_FALSE(iter2->Valid()); + }; + + ASSERT_OK(Put("a", "value")); + ASSERT_OK(Put("c", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Put("d", "value")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(); + + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(2, static_cast(traced_env.get())->files_closed()); + + Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}}); + ASSERT_TRUE(s.IsNotSupported()); + CloseSecondary(); +} + +TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + } + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ReadOptions ropts; + ropts.verify_checksums = true; + const auto verify_db_func = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ(foo_val, value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ(bar_val, value); + Iterator* iter = 
db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ(foo_val, iter->value().ToString()); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ(bar_val, iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; + }; + + verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); +} + +TEST_F(DBSecondaryTest, SecondaryTailingBug_ISSUE_8467) { + Options options; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + } + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto verify_db = [&](const std::string& foo_val, + const std::string& bar_val) { + std::string value; + ReadOptions ropts; + Status s = db_secondary_->Get(ropts, "foo", &value); + ASSERT_OK(s); + ASSERT_EQ(foo_val, value); + + s = db_secondary_->Get(ropts, "bar", &value); + ASSERT_OK(s); + ASSERT_EQ(bar_val, value); + }; + + for (int i = 0; i < 2; ++i) { + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db("foo_value2", "bar_value2"); + } +} + +TEST_F(DBSecondaryTest, RefreshIterator) { + Options options; + options.env = env_; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + std::unique_ptr 
it(db_secondary_->NewIterator(ReadOptions())); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + if (0 == i) { + it->Seek("foo"); + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + + ASSERT_OK(it->Refresh()); + + it->Seek("foo"); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("foo", it->key()); + ASSERT_EQ("foo_value0", it->value()); + } else { + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("foo", it->key()); + ASSERT_EQ("foo_value" + std::to_string(i - 1), it->value()); + ASSERT_OK(it->status()); + + ASSERT_OK(it->Refresh()); + + it->Seek("foo"); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("foo", it->key()); + ASSERT_EQ("foo_value" + std::to_string(i), it->value()); + } + } +} + +TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options1); + cf_descs.emplace_back("pikachu", options1); + cf_descs.emplace_back("eevee", options1); + Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs, + &handles_secondary_, &db_secondary_); + ASSERT_NOK(s); +} + +TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) { + Options options; + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_EQ(0, handles_secondary_.size()); + ASSERT_NE(nullptr, db_secondary_); + + ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Flush(0 /*cf*/)); + ASSERT_OK(Flush(1 /*cf*/)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string 
value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value", value); +} + +TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { + Options options; + options.env = env_; + Reopen(options); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", + "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, + {"DBImpl::Open:AfterDeleteFiles", + "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" + "1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread ro_db_thread([&]() { + Options options1; + options1.env = env_; + options1.max_open_files = -1; + Status s = TryOpenSecondary(options1); + ASSERT_TRUE(s.IsTryAgain()); + + // Try again + OpenSecondary(options1); + CloseSecondary(); + }); + Reopen(options); + ro_db_thread.join(); +} + +TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = 
db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, MissingTableFile) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_NE(nullptr, db_secondary_full()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", 
iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { + Options options; + options.env = env_; + const std::string kCfName1 = "pikachu"; + CreateAndReopenWithCF({kCfName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCfName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); + ASSERT_OK(Flush(1 /*cf*/)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); + + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + Close(); + CheckFileTypeCounts(dbname_, 1, 0, 1); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + value.clear(); + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); +} + +TEST_F(DBSecondaryTest, SwitchManifest) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + const std::string cf1_name("test_cf"); + CreateAndReopenWithCF({cf1_name}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, + options1); + + const int kNumFiles = options.level0_file_num_compaction_trigger - 1; + // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted 
as 0, 1, + // ..., 9. + const int kNumKeys = 10; + // Create two sst + for (int i = 0; i != kNumFiles; ++i) { + for (int j = 0; j != kNumKeys; ++j) { + ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + const auto& range_scan_db = [&]() { + ReadOptions tmp_ropts; + tmp_ropts.total_order_seek = true; + tmp_ropts.verify_checksums = true; + std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); + int cnt = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { + ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); + ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), + iter->value().ToString()); + } + }; + + range_scan_db(); + + // While secondary instance still keeps old MANIFEST open, we close primary, + // restart primary, performs full compaction, close again, restart again so + // that next time secondary tries to catch up with primary, the secondary + // will skip the MANIFEST in middle. 
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + range_scan_db(); +} + +TEST_F(DBSecondaryTest, SwitchManifestTwice) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + const std::string cf1_name("test_cf"); + CreateAndReopenWithCF({cf1_name}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, + options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options); + ASSERT_OK(Put("0", "value1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value1", value); +} + +TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + 
ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for (int k = 0; k != 16; ++k) { + ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) { + const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + 
ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + } + TEST_SYNC_POINT( + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + ASSERT_OK(wb.Put("key0", "value0")); + ASSERT_OK(wb.Put("key1", "value1")); + 
ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(iter1->status()); + std::unique_ptr iter2(db_secondary_->NewIterator(read_opts)); + iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter2->status()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + ASSERT_OK(wb1.Put("key0", "value01")); + ASSERT_OK(wb1.Put("key1", "value11")); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + ASSERT_OK(wb2.Put("key0", "new_value0")); + ASSERT_OK(wb2.Delete("key1")); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. 
+ iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); + ASSERT_OK(iter3->status()); +} + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} + +TEST_F(DBSecondaryTest, StartFromInconsistent) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + Options options1; 
+ options1.env = env_; + Status s = TryOpenSecondary(options1); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Flush()); + + Options options1; + options1.env = env_; + OpenSecondary(options1); + + { + std::string value; + ASSERT_OK(db_secondary_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("value", value); + } + + ASSERT_OK(Put("bar", "value1")); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + ASSERT_NE(nullptr, arg); + *(reinterpret_cast(arg)) = + Status::Corruption("Inject corruption"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = db_secondary_->TryCatchUpWithPrimary(); + ASSERT_TRUE(s.IsCorruption()); +} + +TEST_F(DBSecondaryTest, OpenWithTransactionDB) { + Options options = CurrentOptions(); + options.create_if_missing = true; + + // Destroy the DB to recreate as a TransactionDB. + Close(); + Destroy(options, true); + + // Create a TransactionDB. + TransactionDB* txn_db = nullptr; + TransactionDBOptions txn_db_opts; + ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db)); + ASSERT_NE(txn_db, nullptr); + db_ = txn_db; + + std::vector cfs = {"new_CF"}; + CreateColumnFamilies(cfs, options); + ASSERT_EQ(handles_.size(), 1); + + WriteOptions wopts; + TransactionOptions txn_opts; + Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr); + ASSERT_NE(txn1, nullptr); + ASSERT_OK(txn1->Put(handles_[0], "k1", "v1")); + ASSERT_OK(txn1->Commit()); + delete txn1; + + options = CurrentOptions(); + options.max_open_files = -1; + ASSERT_OK(TryOpenSecondary(options)); +} + +#endif //! 
ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_sst_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_sst_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_sst_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_sst_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,12 +12,13 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_manager.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBSSTTest : public DBTestBase { public: - DBSSTTest() : DBTestBase("/db_sst_test") {} + DBSSTTest() : DBTestBase("db_sst_test", /*env_do_fsync=*/true) {} }; #ifndef ROCKSDB_LITE @@ -97,7 +98,7 @@ for (int i = 0; i < 10; ++i) { GenerateNewFile(&rnd, &key_id, false); } - Flush(); + ASSERT_OK(Flush()); Close(); int const num_files = GetSstFileCount(dbname_); ASSERT_GT(num_files, 0); @@ -140,6 +141,7 @@ // Just open the DB with the option set to true and check that we don't crash. 
Options options; + options.env = env_; options.skip_checking_sst_file_sizes_on_db_open = true; Reopen(options); @@ -163,12 +165,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); // If the moved file is actually deleted (the move-safeguard in @@ -211,12 +213,12 @@ for (int i = 0; i < 2; ++i) { // Create 1MB sst file for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024))); } ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); test::SleepingBackgroundTask blocking_thread; @@ -242,7 +244,7 @@ // write_buffer_size. The flush will be blocked with block_first_time // pending_file is protecting all the files created after for (int j = 0; j < 256; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(10 * 1024))); } blocking_thread.WaitUntilSleeping(); @@ -262,9 +264,9 @@ // finish the flush! blocking_thread.WakeUp(); blocking_thread.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // File just flushed is too big for L0 and L1 so gets moved to L2. 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0)); metadata.clear(); @@ -300,14 +302,18 @@ for (int i = 0; i < 25; i++) { GenerateNewRandomFile(&rnd); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that we are tracking all sst files in dbname_ - ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles()); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); } ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - auto files_in_db = GetAllSSTFiles(); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); // Verify that we are tracking all sst files in dbname_ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); // Verify the total files size @@ -341,7 +347,272 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBSSTTest, RateLimitedDelete) { +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + DestroyAndReopen(options); + Random rnd(301); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + std::vector blob_files = GetBlobFileNumbers(); + ASSERT_EQ(files_added, blob_files.size()); + // No blob file is obsoleted. + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + // No files were moved. 
+ ASSERT_EQ(files_moved, 0); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + + // Verify that we are tracking all sst and blob files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + Close(); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened. + Close(); + + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Destroy DB and it will remove all the blob files from sst file manager and + // blob files deletion will go through ScheduleFileDeletion. 
+ ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_EQ(files_deleted, blob_files.size()); + ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + 
+ DestroyAndReopen(options); + Random rnd(301); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + constexpr char fifth_key[] = "fifth_key"; + constexpr char fifth_value[] = "fifth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Put(fifth_key, fifth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 5); + ASSERT_EQ(files_added, 5); + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + ASSERT_EQ(files_moved, 0); + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + size_t expected_number_of_files = original_blob_files.size(); + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. 
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + expected_number_of_files -= cutoff_index; + files_added = 0; + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + ASSERT_EQ(Get(fifth_key), fifth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + // No new file is added. + ASSERT_EQ(files_added, 0); + ASSERT_EQ(files_deleted, cutoff_index); + ASSERT_EQ(files_scheduled_to_delete, cutoff_index); + ASSERT_EQ(files_moved, 0); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + sfm->WaitForEmptyTrash(); + ASSERT_EQ(files_deleted, 5); + ASSERT_EQ(files_scheduled_to_delete, 5); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +class DBSSTTestRateLimit : public DBSSTTest, + public ::testing::WithParamInterface { + public: + DBSSTTestRateLimit() : DBSSTTest() {} + ~DBSSTTestRateLimit() override {} +}; + +TEST_P(DBSSTTestRateLimit, RateLimitedDelete) { Destroy(last_options_); 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"DBSSTTest::RateLimitedDelete:1", @@ -356,38 +627,38 @@ "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { // Turn timed wait into a simulated sleep uint64_t* abs_time_us = static_cast(arg); - int64_t cur_time = 0; - env_->GetCurrentTime(&cur_time); - if (*abs_time_us > static_cast(cur_time)) { - env_->addon_time_.fetch_add(*abs_time_us - - static_cast(cur_time)); + uint64_t cur_time = env_->NowMicros(); + if (*abs_time_us > cur_time) { + env_->MockSleepForMicroseconds(*abs_time_us - cur_time); } - // Randomly sleep shortly - env_->addon_time_.fetch_add( - static_cast(Random::GetTLSInstance()->Uniform(10))); - - // Set wait until time to before current to force not to sleep. - int64_t real_cur_time = 0; - Env::Default()->GetCurrentTime(&real_cur_time); - *abs_time_us = static_cast(real_cur_time); + // Plus an additional short, random amount + env_->MockSleepForMicroseconds(Random::GetTLSInstance()->Uniform(10)); + + // Set wait until time to before (actual) current time to force not + // to sleep + *abs_time_us = Env::Default()->NowMicros(); + }); + + // Disable PeriodicWorkScheduler as it also has TimedWait, which could update + // the simulated sleep time + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", [&](void* arg) { + bool* disable_scheduler = static_cast(arg); + *disable_scheduler = true; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - env_->no_slowdown_ = true; - env_->time_elapse_only_sleep_ = true; + bool different_wal_dir = GetParam(); Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); options.disable_auto_compactions = true; - // Need to disable stats dumping and persisting which also use - // RepeatableThread, one of whose member variables is of type - // InstrumentedCondVar. 
The callback for - // InstrumentedCondVar::TimedWaitInternal can be triggered by stats dumping - // and persisting threads and cause time_spent_deleting measurement to become - // incorrect. - options.stats_dump_period_sec = 0; - options.stats_persist_period_sec = 0; options.env = env_; + options.statistics = CreateDBStatistics(); + if (different_wal_dir) { + options.wal_dir = alternative_wal_dir_; + } int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec Status s; @@ -399,8 +670,10 @@ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1); WriteOptions wo; - wo.disableWAL = true; - ASSERT_OK(TryReopen(options)); + if (!different_wal_dir) { + wo.disableWAL = true; + } + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v <= 'd'; v++) { ASSERT_OK(Put("Key2", DummyString(1024, v), wo)); @@ -437,10 +710,16 @@ } ASSERT_GT(time_spent_deleting, expected_penlty * 0.9); ASSERT_LT(time_spent_deleting, expected_penlty * 1.1); + ASSERT_EQ(4, options.statistics->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ( + 0, options.statistics->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +INSTANTIATE_TEST_CASE_P(RateLimitedDelete, DBSSTTestRateLimit, + ::testing::Bool()); + TEST_F(DBSSTTest, RateLimitedWALDelete) { Destroy(last_options_); @@ -449,8 +728,6 @@ "DeleteScheduler::BackgroundEmptyTrash:Wait", [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); - env_->no_slowdown_ = true; - env_->time_elapse_only_sleep_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = true; options.compression = kNoCompression; @@ -464,6 +741,7 @@ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); auto sfm = static_cast(options.sst_file_manager.get()); sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + SetTimeElapseOnlySleepOnReopen(&options); ASSERT_OK(TryReopen(options)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -494,10 
+772,11 @@ } class DBWALTestWithParam - : public DBSSTTest, + : public DBTestBase, public testing::WithParamInterface> { public: - DBWALTestWithParam() { + explicit DBWALTestWithParam() + : DBTestBase("db_wal_test_with_params", /*env_do_fsync=*/true) { wal_dir_ = std::get<0>(GetParam()); wal_dir_same_as_dbname_ = std::get<1>(GetParam()); } @@ -510,8 +789,8 @@ class MyEnv : public EnvWrapper { public: MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {} - - Status DeleteFile(const std::string& fname) { + const char* Name() const override { return "MyEnv"; } + Status DeleteFile(const std::string& fname) override { if (fname.find(".log.trash") != std::string::npos && fake_log_delete) { return Status::OK(); } @@ -525,7 +804,7 @@ bool fake_log_delete; }; - std::unique_ptr env(new MyEnv(Env::Default())); + std::unique_ptr env(new MyEnv(env_)); Destroy(last_options_); env->set_fake_log_delete(true); @@ -545,10 +824,17 @@ auto sfm = static_cast(options.sst_file_manager.get()); sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); - ASSERT_OK(TryReopen(options)); + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v <= 'd'; v++) { + if (v == 'c') { + // Maximize the change that the last log file will be preserved in trash + // before restarting the DB. + // We have to set this on the 2nd to last file for it to delay deletion + // on the last file. 
(Quirk of DeleteScheduler::BackgroundEmptyTrash()) + options.sst_file_manager->SetDeleteRateBytesPerSecond(1); + } ASSERT_OK(Put("Key2", DummyString(1024, v))); ASSERT_OK(Put("Key3", DummyString(1024, v))); ASSERT_OK(Put("Key4", DummyString(1024, v))); @@ -567,11 +853,11 @@ if (!wal_dir_same_as_dbname_) { // Forcibly create some trash log files std::unique_ptr result; - env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, - EnvOptions()); + ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions())); result.reset(); } - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -580,11 +866,11 @@ ASSERT_GE(trash_log_count, 1); env->set_fake_log_delete(false); - ASSERT_OK(TryReopen(options)); + Reopen(options); filenames.clear(); trash_log_count = 0; - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -608,13 +894,13 @@ Destroy(last_options_); // Add some trash files to the db directory so the DB can clean them up - env_->CreateDirIfMissing(dbname_); + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); // Reopen the DB and verify that it deletes existing trash files - ASSERT_OK(TryReopen(options)); + Reopen(options); sfm->WaitForEmptyTrash(); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash")); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash")); @@ -733,7 +1019,7 @@ int num_sst_files = 0; int num_wal_files = 0; std::vector db_files; - 
env_->GetChildren(dbname_, &db_files); + ASSERT_OK(env_->GetChildren(dbname_, &db_files)); for (std::string f : db_files) { if (f.substr(f.find_last_of(".") + 1) == "sst") { num_sst_files++; @@ -747,7 +1033,9 @@ auto sfm = static_cast(options.sst_file_manager.get()); sfm->SetDeleteRateBytesPerSecond(1024 * 1024); - sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1); + // Set an extra high trash ratio to prevent immediate/non-rate limited + // deletions + sfm->delete_scheduler()->SetMaxTrashDBRatio(1000.0); ASSERT_OK(DestroyDB(dbname_, options)); sfm->WaitForEmptyTrash(); ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files); @@ -766,12 +1054,13 @@ // Generate a file containing 100 keys. for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); uint64_t first_file_size = 0; - auto files_in_db = GetAllSSTFiles(&first_file_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size)); ASSERT_EQ(sfm->GetTotalSize(), first_file_size); // Set the maximum allowed space usage to the current total size @@ -782,6 +1071,68 @@ ASSERT_NOK(Flush()); } +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing keys. + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + + uint64_t files_size = 0; + uint64_t total_files_size = 0; + std::unordered_map files_in_db; + + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size)); + // Make sure blob files are considered by SSTFileManage in size limits. 
+ ASSERT_GT(files_size, 0); + total_files_size = files_size; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size)); + total_files_size += files_size; + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Set the maximum allowed space usage to the current total size. + sfm->SetMaxAllowedSpaceUsage(total_files_size + 1); + + bool max_allowed_space_reached = false; + bool delete_blob_file = false; + // Sync point called after blob file is closed and max allowed space is + // checked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached", + [&](void* /*arg*/) { max_allowed_space_reached = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BuildTable::AfterDeleteFile", + [&](void* /*arg*/) { delete_blob_file = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + { + "BuildTable::AfterDeleteFile", + "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1", + }, + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key1", "val1")); + // This flush will fail + ASSERT_NOK(Flush()); + ASSERT_TRUE(max_allowed_space_reached); + + TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1"); + ASSERT_TRUE(delete_blob_file); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBSSTTest, CancellingCompactionsWorks) { std::shared_ptr sst_file_manager(NewSstFileManager(env_)); auto sfm = static_cast(sst_file_manager.get()); @@ -807,20 +1158,21 @@ // Generate a file containing 10 keys. 
for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); uint64_t total_file_size = 0; - auto files_in_db = GetAllSSTFiles(&total_file_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); // Generate another file to trigger compaction. for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // Because we set a callback in CancelledCompaction, we actually // let the compaction run @@ -828,6 +1180,12 @@ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); // Make sure the stat is bumped ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(COMPACTION_CANCELLED), 0); + ASSERT_EQ(0, + dbfull()->immutable_db_options().statistics.get()->getTickerCount( + FILES_MARKED_TRASH)); + ASSERT_EQ(4, + dbfull()->immutable_db_options().statistics.get()->getTickerCount( + FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -848,25 +1206,28 @@ // Generate a file containing 10 keys. for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); uint64_t total_file_size = 0; - auto files_in_db = GetAllSSTFiles(&total_file_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); // Generate another file to trigger compaction. 
for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); // OK, now trigger a manual compaction - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); // Make sure the stat is bumped @@ -876,10 +1237,13 @@ // Now make sure CompactFiles also gets cancelled auto l0_files = collector->GetFlushedFiles(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); + ASSERT_TRUE( + dbfull() + ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount( COMPACTION_CANCELLED), @@ -894,8 +1258,9 @@ "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), + l0_files, 0)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); ASSERT_GT(completed_compactions, 0); @@ -955,14 +1320,15 @@ // It is easy to detect if the test is stuck in a loop. No need for // complex termination logic. 
while (true) { - auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); + auto s = Put(rnd.RandomString(10), rnd.RandomString(50)); if (!s.ok()) { break; } } ASSERT_TRUE(bg_error_set); uint64_t total_sst_files_size = 0; - GetAllSSTFiles(&total_sst_files_size); + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size)); ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -998,7 +1364,7 @@ CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 2; - db_->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); // Create 12 Files in L0 for (int i = 0; i < 12; i++) { @@ -1033,13 +1399,16 @@ // we encode table properties as varint64. Force time to be 0 to work around // it. Should remove the workaround after we propagate the property on // compaction. - std::unique_ptr mock_env(new MockTimeEnv(Env::Default())); - mock_env->set_current_time(0); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:oldest_ancester_time", [&](void* arg) { + uint64_t* current_time = static_cast(arg); + *current_time = 0; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); options.disable_auto_compactions = true; options.compression = kNoCompression; - options.env = mock_env.get(); DestroyAndReopen(options); // Generate 5 files in L0 for (int i = 0; i < 5; i++) { @@ -1047,7 +1416,7 @@ std::string val = "val_file_" + ToString(i); ASSERT_OK(Put(Key(j), val)); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1071,6 +1440,7 @@ // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compact 5 files into 1 file in L0 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, 
nullptr)); @@ -1094,12 +1464,13 @@ // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 10; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1113,6 +1484,7 @@ // Total SST files = 6 (5 original files + compacted file) ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1120,6 +1492,7 @@ // Total SST files = 1 (compacted file) ASSERT_EQ(total_sst_files_size, 1 * single_file_size); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1127,8 +1500,7 @@ // Total SST files = 0 ASSERT_EQ(total_sst_files_size, 0); - // Close db before mock_env destruct. 
- Close(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { @@ -1139,7 +1511,7 @@ // Generate 5 files in L0 for (int i = 0; i < 5; i++) { ASSERT_OK(Put(Key(i), "val")); - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1164,6 +1536,7 @@ // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compaction will do trivial move from L0 to L1 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1187,12 +1560,13 @@ // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 5; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1206,7 +1580,9 @@ // Total SST files = 5 (used in 2 version) ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", @@ -1216,6 +1592,103 @@ ASSERT_EQ(total_sst_files_size, 0); } +// This test if blob files are recorded by SST File Manager when Compaction job +// creates/delete them and in case of AtomicFlush. 
+TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.min_blob_size = 0; + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + options.atomic_flush = true; + + int files_added = 0; + int files_deleted = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (EndsWith(*file_path, ".blob")) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + Random rnd(301); + + ASSERT_OK(Put("key_1", "value_1")); + ASSERT_OK(Put("key_2", "value_2")); + ASSERT_OK(Put("key_3", "value_3")); + ASSERT_OK(Put("key_4", "value_4")); + ASSERT_OK(Flush()); + + // Overwrite will create the garbage data. 
+ ASSERT_OK(Put("key_3", "new_value_3")); + ASSERT_OK(Put("key_4", "new_value_4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + ASSERT_EQ(files_added, 3); + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + files_added = 0; + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + // Compaction job will create a new file and delete the older files. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(files_added, 1); + ASSERT_EQ(files_scheduled_to_delete, 1); + + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(files_deleted, 1); + + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + + ASSERT_EQ(files_scheduled_to_delete, 4); + + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(files_deleted, 4); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE @@ -1223,5 +1696,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_statistics_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_statistics_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_statistics_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_statistics_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,14 @@ #include "monitoring/thread_status_util.h" #include "port/stack_trace.h" #include "rocksdb/statistics.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class DBStatisticsTest : public DBTestBase { public: - DBStatisticsTest() : DBTestBase("/db_statistics_test") {} + DBStatisticsTest() + : 
DBTestBase("db_statistics_test", /*env_do_fsync=*/true) {} }; TEST_F(DBStatisticsTest, CompressionStatsTest) { @@ -55,7 +57,7 @@ Random rnd(301); for (int i = 0; i < kNumKeysWritten; ++i) { // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); } ASSERT_OK(Flush()); ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0); @@ -75,7 +77,7 @@ // Check that compressions do not occur when turned off for (int i = 0; i < kNumKeysWritten; ++i) { // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); } ASSERT_OK(Flush()); ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) @@ -135,11 +137,73 @@ ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN)); options.statistics->histogramData(DB_WRITE, &histogram_data); ASSERT_GT(histogram_data.max, 0.0); - options.statistics->Reset(); + ASSERT_OK(options.statistics->Reset()); } } } +TEST_F(DBStatisticsTest, ExcludeTickers) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + options.statistics->set_stats_level(StatsLevel::kExceptTickers); + ASSERT_OK(Put("foo", "value")); + ASSERT_EQ(0, options.statistics->getTickerCount(BYTES_WRITTEN)); + options.statistics->set_stats_level(StatsLevel::kExceptHistogramOrTimers); + Reopen(options); + ASSERT_EQ("value", Get("foo")); + ASSERT_GT(options.statistics->getTickerCount(BYTES_READ), 0); +} + +#ifndef ROCKSDB_LITE + +TEST_F(DBStatisticsTest, VerifyChecksumReadStat) { + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Reopen(options); + + // Expected to be populated regardless of `PerfLevel` in user thread + 
SetPerfLevel(kDisable); + + { + // Scenario 0: only WAL data. Not verified so require ticker to be zero. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_EQ(0, + options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES)); + } + + // Create one SST. + ASSERT_OK(Flush()); + std::unordered_map table_files; + uint64_t table_files_size = 0; + GetAllDataFiles(kTableFile, &table_files, &table_files_size); + + { + // Scenario 1: Table verified in `VerifyFileChecksums()`. This should read + // the whole file so we require the ticker stat exactly matches the file + // size. + ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + ASSERT_EQ(table_files_size, + options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES)); + } + + { + // Scenario 2: Table verified in `VerifyChecksum()`. This opens a + // `TableReader` to verify each block. It can involve duplicate reads of the + // same data so we set a lower-bound only. 
+ ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_GE(options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES), + table_files_size); + } +} + +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_table_properties_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_table_properties_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_table_properties_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_table_properties_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,11 +11,16 @@ #include #include "db/db_test_util.h" +#include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/db.h" +#include "rocksdb/types.h" #include "rocksdb/utilities/table_properties_collectors.h" +#include "table/format.h" +#include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" #ifndef ROCKSDB_LITE @@ -42,12 +47,16 @@ ASSERT_EQ(props.size(), unique_entries.size()); ASSERT_EQ(expected_entries_size, sum); + + VerifySstUniqueIds(props); } } // namespace -class DBTablePropertiesTest : public DBTestBase { +class DBTablePropertiesTest : public DBTestBase, + public testing::WithParamInterface { public: - DBTablePropertiesTest() : DBTestBase("/db_table_properties_test") {} + DBTablePropertiesTest() + : DBTestBase("db_table_properties_test", /*env_do_fsync=*/false) {} TablePropertiesCollection TestGetPropertiesOfTablesInRange( std::vector ranges, std::size_t* num_properties = nullptr, std::size_t* num_files = nullptr); @@ -56,21 +65,49 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = 8; + // Part of strategy to prevent pinning table files + options.max_open_files = 42; Reopen(options); + // Create 4 tables for (int table = 0; table < 4; 
++table) { + // Use old meta name for table properties for one file + if (table == 3) { + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) { + *reinterpret_cast(meta) = + &kPropertiesBlockOldName; + }); + SyncPoint::GetInstance()->EnableProcessing(); + } + // Build file for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), ToString(table * 100 + i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } + SyncPoint::GetInstance()->DisableProcessing(); + std::string original_session_id; + ASSERT_OK(db_->GetDbSessionId(original_session_id)); + + // Part of strategy to prevent pinning table files + SyncPoint::GetInstance()->SetCallBack( + "VersionEditHandler::LoadTables:skip_load_table_files", + [&](void* skip_load) { *reinterpret_cast(skip_load) = true; }); + SyncPoint::GetInstance()->EnableProcessing(); // 1. Read table properties directly from file Reopen(options); + // Clear out auto-opened files + dbfull()->TEST_table_cache()->EraseUnRefEntries(); + ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); VerifyTableProperties(db_, 10 + 11 + 12 + 13); // 2. Put two tables to table cache and Reopen(options); + // Clear out auto-opened files + dbfull()->TEST_table_cache()->EraseUnRefEntries(); + ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. for (int i = 0; i < 2; ++i) { @@ -81,12 +118,113 @@ // 3. Put all tables to table cache Reopen(options); - // fetch key from 1st and 2nd table, which will internally place that table to - // the table cache. + // fetch key from all tables, which will place them in table cache. for (int i = 0; i < 4; ++i) { Get(ToString(i * 100 + 0)); } VerifyTableProperties(db_, 10 + 11 + 12 + 13); + + // 4. 
Try to read CORRUPT properties (a) directly from file, and (b) + // through reader on Get + + // It's not practical to prevent table file read on Open, so we + // corrupt after open and after purging table cache. + for (bool direct : {true, false}) { + Reopen(options); + // Clear out auto-opened files + dbfull()->TEST_table_cache()->EraseUnRefEntries(); + ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U); + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + std::string sst_file = props.begin()->first; + + // Corrupt the file's TableProperties using session id + std::string contents; + ASSERT_OK( + ReadFileToString(env_->GetFileSystem().get(), sst_file, &contents)); + size_t pos = contents.find(original_session_id); + ASSERT_NE(pos, std::string::npos); + ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast(pos), 1, + /*verify checksum fails*/ false)); + + // Try to read CORRUPT properties + if (direct) { + ASSERT_TRUE(db_->GetPropertiesOfAllTables(&props).IsCorruption()); + } else { + bool found_corruption = false; + for (int i = 0; i < 4; ++i) { + std::string result = Get(ToString(i * 100 + 0)); + if (result.find_first_of("Corruption: block checksum mismatch") != + std::string::npos) { + found_corruption = true; + } + } + ASSERT_TRUE(found_corruption); + } + + // UN-corrupt file for next iteration + ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast(pos), 1, + /*verify checksum fails*/ false)); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTablePropertiesTest, InvalidIgnored) { + // RocksDB versions 2.5 - 2.7 generate some properties that Block considers + // invalid in some way. This approximates that. 
+ + // Inject properties block data that Block considers invalid + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", + [&](void* block_data) { + *reinterpret_cast(block_data) = Slice("X"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Build file + for (int i = 0; i < 10; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), ToString(i), "val")); + } + ASSERT_OK(db_->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + + // Not crashing is good enough + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); +} + +TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) { + ConfigOptions options; + options.ignore_unsupported_options = false; + + std::shared_ptr factory; + std::string id = CompactOnDeletionCollectorFactory::kClassName(); + ASSERT_OK( + TablePropertiesCollectorFactory::CreateFromString(options, id, &factory)); + auto del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(0U, del_factory->GetWindowSize()); + ASSERT_EQ(0U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.0, del_factory->GetDeletionRatio()); + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + options, "window_size=100; deletion_trigger=90; id=" + id, &factory)); + del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(100U, del_factory->GetWindowSize()); + ASSERT_EQ(90U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.0, del_factory->GetDeletionRatio()); + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + options, + "window_size=100; deletion_trigger=90; deletion_ratio=0.5; id=" + id, + &factory)); + del_factory = factory->CheckedCast(); + ASSERT_NE(del_factory, nullptr); + ASSERT_EQ(100U, del_factory->GetWindowSize()); + ASSERT_EQ(90U, del_factory->GetDeletionTrigger()); + ASSERT_EQ(0.5, del_factory->GetDeletionRatio()); } TablePropertiesCollection @@ -154,16 +292,16 @@ // build a decent 
LSM for (int i = 0; i < 10000; i++) { - ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102))); + ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { - ASSERT_OK(Put(test::RandomKey(&rnd, 5), RandomString(&rnd, 102))); - Flush(); + ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); + ASSERT_OK(Flush()); } - db_->PauseBackgroundWork(); + ASSERT_OK(db_->PauseBackgroundWork()); // Ensure that we have at least L0, L1 and L2 ASSERT_GT(NumTableFilesAtLevel(0), 0); @@ -231,8 +369,8 @@ // Create one table per CF, then verify it was created with the column family // name property. for (uint32_t cf = 0; cf < 2; ++cf) { - Put(cf, "key", "val"); - Flush(cf); + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Flush(cf)); TablePropertiesCollection fname_to_props; ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); @@ -251,7 +389,89 @@ } } -TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { +TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) { + CreateAndReopenWithCF({"goku"}, CurrentOptions()); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + std::string id, sid; + ASSERT_OK(db_->GetDbIdentity(id)); + ASSERT_OK(db_->GetDbSessionId(sid)); + ASSERT_EQ(id, fname_to_props.begin()->second->db_id); + ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id); + } +} + +class DBTableHostnamePropertyTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DBTableHostnamePropertyTest() + : DBTestBase("db_table_hostname_property_test", + /*env_do_fsync=*/false) {} +}; + 
+TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) { + option_config_ = std::get<0>(GetParam()); + Options opts = CurrentOptions(); + std::string expected_host_id = std::get<1>(GetParam()); + ; + if (expected_host_id == kHostnameForDbHostId) { + ASSERT_OK(env_->GetHostNameString(&expected_host_id)); + } else { + opts.db_host_id = expected_host_id; + } + CreateAndReopenWithCF({"goku"}, opts); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id); + } +} + +INSTANTIATE_TEST_CASE_P( + DBTableHostnamePropertyTest, DBTableHostnamePropertyTest, + ::testing::Values( + // OptionConfig, override db_host_location + std::make_tuple(DBTestBase::OptionConfig::kDefault, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kDefault, ""), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + ""))); + +class DeletionTriggeredCompactionTestListener : public EventListener { + public: + void OnCompactionBegin(DB* , const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.compaction_reason, + CompactionReason::kFilesMarkedForCompaction); + } + + void OnCompactionCompleted(DB* , const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.compaction_reason, + CompactionReason::kFilesMarkedForCompaction); + } +}; + +TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { int kNumKeys = 1000; int kWindowSize = 100; int kNumDelsTrigger = 90; @@ -259,28 +479,37 @@ 
NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger); Options opts = CurrentOptions(); + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); opts.table_properties_collector_factories.emplace_back(compact_on_del); + + if(GetParam() == "kCompactionStyleUniversal") { + opts.compaction_style = kCompactionStyleUniversal; + } Reopen(opts); // add an L1 file to prevent tombstones from dropping due to obsolescence // during flush - Put(Key(0), "val"); - Flush(); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); MoveFilesToLevel(1); + DeletionTriggeredCompactionTestListener *listener = + new DeletionTriggeredCompactionTestListener(); + opts.listeners.emplace_back(listener); + Reopen(opts); + for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_GT(NumTableFilesAtLevel(1), 0); // Change the window size and deletion trigger and ensure new values take // effect @@ -293,16 +522,15 @@ for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); - ASSERT_GT(NumTableFilesAtLevel(1), 0); // Change the window size to disable delete triggered compaction kWindowSize = 0; @@ -313,18 +541,75 @@ for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + 
ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED)); + ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED)); +} +TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) { + constexpr int kNumKeys = 1000; + constexpr int kWindowSize = 0; + constexpr int kNumDelsTrigger = 0; + constexpr double kDeletionRatio = 0.1; + std::shared_ptr compact_on_del = + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger, + kDeletionRatio); + + Options opts = CurrentOptions(); + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + opts.table_properties_collector_factories.emplace_back(compact_on_del); + + Reopen(opts); + + // Add an L2 file to prevent tombstones from dropping due to obsolescence + // during flush + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + auto* listener = new DeletionTriggeredCompactionTestListener(); + opts.listeners.emplace_back(listener); + Reopen(opts); + + // Generate one L0 with kNumKeys Put. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "not important")); + } + ASSERT_OK(Flush()); + + // Generate another L0 with kNumKeys Delete. + // This file, due to deletion ratio, will trigger compaction: 2@0 files to L1. + // The resulting L1 file has only one tombstone for user key 'Key(0)'. + // Again, due to deletion ratio, a compaction will be triggered: 1@1 + 1@2 + // files to L2. However, the resulting file is empty because the tombstone + // and value are both dropped. 
+ for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int i = 0; i < 3; ++i) { + ASSERT_EQ(0, NumTableFilesAtLevel(i)); + } } +INSTANTIATE_TEST_CASE_P( + DBTablePropertiesTest, + DBTablePropertiesTest, + ::testing::Values( + "kCompactionStyleLevel", + "kCompactionStyleUniversal" + )); + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_tailing_iter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -20,7 +20,8 @@ class DBTestTailingIterator : public DBTestBase { public: - DBTestTailingIterator() : DBTestBase("/db_tailing_iterator_test") {} + DBTestTailingIterator() + : DBTestBase("db_tailing_iterator_test", /*env_do_fsync=*/true) {} }; TEST_F(DBTestTailingIterator, TailingIteratorSingle) { @@ -30,6 +31,7 @@ std::unique_ptr iter(db_->NewIterator(read_options)); iter->SeekToFirst(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); // add a record and check that iter can see it ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); @@ -47,6 +49,7 @@ read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::string value(1024, 'a'); const int num_records = 10000; @@ -69,7 +72,9 @@ read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::string value(1024, 'a'); const int num_records = 1000; @@ -137,8 +142,11 @@ Slice keyu(bufe, 20); read_options.iterate_upper_bound = &keyu; std::unique_ptr iter(db_->NewIterator(read_options, 
handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::unique_ptr iterh(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iterh->status()); std::string value(1024, 'a'); bool file_iters_deleted = false; bool file_iters_renewed_null = false; @@ -178,7 +186,7 @@ if (i % 100 == 99) { ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (i == 299) { file_iters_deleted = true; } @@ -224,6 +232,7 @@ ReopenWithColumnFamilies({"default", "pikachu"}, options); read_options.read_tier = kBlockCacheTier; std::unique_ptr iteri(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iteri->status()); char buf5[32]; snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2); Slice target1(buf5, 20); @@ -235,6 +244,7 @@ options.table_factory.reset(NewBlockBasedTableFactory()); ReopenWithColumnFamilies({"default", "pikachu"}, options); iter.reset(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); for (int i = 2 * num_records; i > 0; --i) { char buf1[32]; char buf2[32]; @@ -261,6 +271,7 @@ read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); // write a single record, read it using the iterator, then delete it ASSERT_OK(Put(1, "0test", "test")); @@ -308,6 +319,7 @@ CreateAndReopenWithCF({"pikachu"}, options); std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); ASSERT_OK(Put(1, "0101", "test")); ASSERT_OK(Flush(1)); @@ -338,6 +350,7 @@ ASSERT_OK(db_->Put(WriteOptions(), key, value)); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); // we either see the entry or it's not in cache ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); @@ -368,6 +381,7 @@ } std::unique_ptr iter(db_->NewIterator(read_options)); + 
ASSERT_OK(iter->status()); // Seek to 00001. We expect to find 00002. std::string start_key = "00001"; iter->Seek(start_key); @@ -403,6 +417,7 @@ ASSERT_OK(Put(1, "21", "21")); std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(it->status()); it->Seek("12"); ASSERT_TRUE(it->Valid()); ASSERT_EQ("12", it->key().ToString()); @@ -410,7 +425,7 @@ it->Next(); // Not valid since "21" is over the upper bound. ASSERT_FALSE(it->Valid()); - + ASSERT_OK(it->status()); // This keeps track of the number of times NeedToSeekImmutable() was true. int immutable_seeks = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -423,6 +438,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); ASSERT_EQ(0, immutable_seeks); } @@ -477,6 +493,8 @@ it->Next(); ASSERT_TRUE(it->Valid()); ASSERT_EQ("40", it->key().ToString()); + + ASSERT_OK(it->status()); } TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) { @@ -495,6 +513,7 @@ ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->Seek("aa"); ASSERT_TRUE(iter->Valid()); @@ -517,6 +536,7 @@ ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,11 +11,13 @@ // in Release build. 
// which is a pity, it is a good test #include + #include #include #include #include #include + #ifndef OS_WIN #include #endif @@ -24,7 +26,8 @@ #endif #include "cache/lru_cache.h" -#include "db/blob_index.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" @@ -33,7 +36,6 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" -#include "memtable/hash_linklist_rep.h" #include "monitoring/thread_status_util.h" #include "port/port.h" #include "port/stack_trace.h" @@ -52,27 +54,30 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/thread_status.h" +#include "rocksdb/types.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/compression.h" #include "util/mutexlock.h" +#include "util/random.h" #include "util/rate_limiter.h" #include "util/string_util.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { +// Note that whole DBTest and its child classes disable fsync on files +// and directories for speed. +// If fsync needs to be covered in a test, put it in other places. 
class DBTest : public DBTestBase { public: - DBTest() : DBTestBase("/db_test") {} + DBTest() : DBTestBase("db_test", /*env_do_fsync=*/false) {} }; class DBTestWithParam @@ -93,7 +98,7 @@ }; TEST_F(DBTest, MockEnvTest) { - std::unique_ptr env{new MockEnv(Env::Default())}; + std::unique_ptr env{MockEnv::Create(Env::Default())}; Options options; options.create_if_missing = true; options.env = env.get(); @@ -126,7 +131,7 @@ // TEST_FlushMemTable() is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE - DBImpl* dbi = reinterpret_cast(db); + DBImpl* dbi = static_cast_with_check(db); ASSERT_OK(dbi->TEST_FlushMemTable()); for (size_t i = 0; i < 3; ++i) { @@ -174,7 +179,7 @@ ASSERT_TRUE(!iterator->Valid()); delete iterator; - DBImpl* dbi = reinterpret_cast(db); + DBImpl* dbi = static_cast_with_check(db); ASSERT_OK(dbi->TEST_FlushMemTable()); for (size_t i = 0; i < 3; ++i) { @@ -245,17 +250,21 @@ wo.sync = sync; wo.disableWAL = disableWAL; wo.no_slowdown = true; - dbfull()->Put(wo, "foo", "bar"); + // Large enough to exceed allowance for one time interval + std::string large_value(1024, 'x'); + // Perhaps ideally this first write would fail because of delay, but + // the current implementation does not guarantee that. + dbfull()->Put(wo, "foo", large_value).PermitUncheckedError(); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. 
- ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2")); + ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value)); ASSERT_GE(sleep_count.load(), 0); ASSERT_GE(wait_count.load(), 0); token.reset(); - token = dbfull()->TEST_write_controler().GetDelayToken(1000000000); + token = dbfull()->TEST_write_controler().GetDelayToken(1000000); wo.no_slowdown = false; - ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3")); + ASSERT_OK(dbfull()->Put(wo, "foo3", large_value)); ASSERT_GE(sleep_count.load(), 1); token.reset(); } @@ -308,7 +317,7 @@ wo.sync = false; wo.disableWAL = false; wo.no_slowdown = false; - dbfull()->Put(wo, "foo", "bar"); + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); @@ -366,7 +375,7 @@ wo.sync = false; wo.disableWAL = false; wo.no_slowdown = false; - dbfull()->Put(wo, "foo", "bar"); + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); @@ -435,7 +444,7 @@ wo.sync = false; wo.disableWAL = false; wo.no_slowdown = false; - dbfull()->Put(wo, "foo", "bar"); + ASSERT_OK(dbfull()->Put(wo, "foo", "bar")); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2")); @@ -616,24 +625,24 @@ // Put values on second level (so that they will not be in the same // compaction as the other operations. 
- Put(1, "foo", "first"); - Put(1, "bar", "one"); + ASSERT_OK(Put(1, "foo", "first")); + ASSERT_OK(Put(1, "bar", "one")); ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); // (Single) delete hidden by a put - SingleDelete(1, "foo"); - Put(1, "foo", "second"); - Delete(1, "bar"); - Put(1, "bar", "two"); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_OK(Put(1, "foo", "second")); + ASSERT_OK(Delete(1, "bar")); + ASSERT_OK(Put(1, "bar", "two")); ASSERT_OK(Flush(1)); - SingleDelete(1, "foo"); - Delete(1, "bar"); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_OK(Delete(1, "bar")); ASSERT_OK(Flush(1)); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get(1, "bar")); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); @@ -654,9 +663,9 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", Slice()); - Put(1, "a", Slice()); - SingleDelete(1, "a"); + ASSERT_OK(Put(1, "foo", Slice())); + ASSERT_OK(Put(1, "a", Slice())); + ASSERT_OK(SingleDelete(1, "a")); ASSERT_OK(Flush(1)); ASSERT_EQ("[ ]", AllEntriesFor("a", 1)); @@ -764,8 +773,8 @@ // Block sync calls env_->delay_sstable_sync_.store(true, std::memory_order_release); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(0, "foo")); // Release sync calls @@ -843,19 +852,19 @@ // occurring at level 1 (instead of the correct level 0). 
// Step 1: First place sstables in levels 0 and 2 - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); // Step 2: clear level 1 if necessary. - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1); @@ -866,7 +875,7 @@ } // Step 4: Wait for compaction to finish - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); @@ -902,6 +911,9 @@ static_cast(options.write_buffer_size); options.max_write_buffer_number = 2; options.write_buffer_size = 120 * 1024; + auto flush_listener = std::make_shared(); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull; + options.listeners.push_back(flush_listener); CreateAndReopenWithCF({"pikachu"}, options); std::vector threads; @@ -914,7 +926,7 @@ WriteOptions wo; // this should fill up 2 memtables for (int k = 0; k < 5000; ++k) { - ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + ASSERT_OK(db_->Put(wo, handles_[a & 1], rnd.RandomString(13), "")); } }; @@ -973,7 +985,7 @@ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, std::string* /*new_value*/, 
bool* /*value_changed*/) const override { - db_test->env_->addon_time_.fetch_add(1000); + db_test->env_->MockSleepForMicroseconds(1000); return true; } @@ -1018,10 +1030,10 @@ } void CheckColumnFamilyMeta( - const ColumnFamilyMetaData& cf_meta, + const ColumnFamilyMetaData& cf_meta, const std::string& cf_name, const std::vector>& files_by_level, uint64_t start_time, uint64_t end_time) { - ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName); + ASSERT_EQ(cf_meta.name, cf_name); ASSERT_EQ(cf_meta.levels.size(), files_by_level.size()); uint64_t cf_size = 0; @@ -1115,6 +1127,53 @@ } #ifndef ROCKSDB_LITE +void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number, + uint64_t total_blob_count, uint64_t total_blob_bytes, + const std::string& checksum_method, + const std::string& checksum_value, + uint64_t garbage_blob_count = 0, + uint64_t garbage_blob_bytes = 0) { + ColumnFamilyData* cfd = + (static_cast(cfh))->cfd(); + assert(cfd); + + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + // Add a live blob file. 
+ + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + auto meta = BlobFileMetaData::Create(std::move(shared_meta), + BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); +} + +static void CheckBlobMetaData( + const BlobMetaData& bmd, uint64_t blob_file_number, + uint64_t total_blob_count, uint64_t total_blob_bytes, + const std::string& checksum_method, const std::string& checksum_value, + uint64_t garbage_blob_count = 0, uint64_t garbage_blob_bytes = 0) { + ASSERT_EQ(bmd.blob_file_number, blob_file_number); + ASSERT_EQ(bmd.blob_file_name, BlobFileName("", blob_file_number)); + ASSERT_EQ(bmd.blob_file_size, + total_blob_bytes + BlobLogHeader::kSize + BlobLogFooter::kSize); + + ASSERT_EQ(bmd.total_blob_count, total_blob_count); + ASSERT_EQ(bmd.total_blob_bytes, total_blob_bytes); + ASSERT_EQ(bmd.garbage_blob_count, garbage_blob_count); + ASSERT_EQ(bmd.garbage_blob_bytes, garbage_blob_bytes); + ASSERT_EQ(bmd.checksum_method, checksum_method); + ASSERT_EQ(bmd.checksum_value, checksum_value); +} + TEST_F(DBTest, MetaDataTest) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -1144,7 +1203,7 @@ // Fill up the rest of the file with random values. 
GenerateNewFile(&rnd, &key_index, /* nowait */ true); - Flush(); + ASSERT_OK(Flush()); } std::vector> files_by_level; @@ -1155,13 +1214,71 @@ ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); - CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time); - + CheckColumnFamilyMeta(cf_meta, kDefaultColumnFamilyName, files_by_level, + start_time, end_time); std::vector live_file_meta; db_->GetLiveFilesMetaData(&live_file_meta); CheckLiveFilesMeta(live_file_meta, files_by_level); } +TEST_F(DBTest, AllMetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time).PermitUncheckedError(); + uint64_t start_time = static_cast(temp_time); + + Random rnd(301); + dbfull()->TEST_LockMutex(); + for (int cf = 0; cf < 2; cf++) { + AddBlobFile(handles_[cf], blob_file_number * (cf + 1), + total_blob_count * (cf + 1), total_blob_bytes * (cf + 1), + checksum_method, checksum_value); + } + dbfull()->TEST_UnlockMutex(); + + std::vector all_meta; + db_->GetAllColumnFamilyMetaData(&all_meta); + + std::vector> default_files_by_level; + std::vector> pikachu_files_by_level; + dbfull()->TEST_GetFilesMetaData(handles_[0], &default_files_by_level); + dbfull()->TEST_GetFilesMetaData(handles_[1], &pikachu_files_by_level); + + options.env->GetCurrentTime(&temp_time).PermitUncheckedError(); + uint64_t end_time = static_cast(temp_time); + + ASSERT_EQ(all_meta.size(), 2); + for (int cf = 0; cf < 2; cf++) { + const auto& cfmd = all_meta[cf]; + if (cf == 0) { + CheckColumnFamilyMeta(cfmd, "default", default_files_by_level, 
start_time, + end_time); + } else { + CheckColumnFamilyMeta(cfmd, "pikachu", pikachu_files_by_level, start_time, + end_time); + } + ASSERT_EQ(cfmd.blob_files.size(), 1U); + const auto& bmd = cfmd.blob_files[0]; + ASSERT_EQ(cfmd.blob_file_count, 1U); + ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size); + ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_)); + CheckBlobMetaData(bmd, blob_file_number * (cf + 1), + total_blob_count * (cf + 1), total_blob_bytes * (cf + 1), + checksum_method, checksum_value); + } +} + namespace { void MinLevelHelper(DBTest* self, Options& options) { Random rnd(301); @@ -1171,20 +1288,20 @@ std::vector values; // Write 120KB (12 values, each 10K) for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); + values.push_back(rnd.RandomString(10000)); ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); } - self->dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(self->dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1); } // generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < 12; i++) { - values.push_back(DBTestBase::RandomString(&rnd, 10000)); + values.push_back(rnd.RandomString(10000)); ASSERT_OK(self->Put(DBTestBase::Key(i), values[i])); } - self->dbfull()->TEST_WaitForCompact(); + ASSERT_OK(self->dbfull()->TEST_WaitForCompact()); ASSERT_EQ(self->NumTableFilesAtLevel(0), 0); ASSERT_EQ(self->NumTableFilesAtLevel(1), 1); @@ -1294,7 +1411,7 @@ Random rnd(301); std::string value = - RandomString(&rnd, static_cast(2 * options.write_buffer_size)); + rnd.RandomString(static_cast(2 * options.write_buffer_size)); for (int i = 0; i < 5 * kMaxFiles; i++) { ASSERT_OK(Put(1, "key", value)); ASSERT_LE(TotalTableFiles(1), kMaxFiles); @@ -1303,51 +1420,6 @@ } #endif // ROCKSDB_LITE -TEST_F(DBTest, SparseMerge) { - do { - Options options = CurrentOptions(); - options.compression = kNoCompression; - 
CreateAndReopenWithCF({"pikachu"}, options); - - FillLevels("A", "Z", 1); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. - const std::string value(1000, 'x'); - Put(1, "A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); - } - Put(1, "C", "vc"); - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - - // Make sparse update - Put(1, "A", "va2"); - Put(1, "B100", "bvalue2"); - Put(1, "C", "vc2"); - ASSERT_OK(Flush(1)); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - } while (ChangeCompactOptions()); -} - #ifndef ROCKSDB_LITE static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); @@ -1370,7 +1442,7 @@ const int N = 128; Random rnd(301); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); } uint64_t size; @@ -1380,33 +1452,37 @@ SizeApproximationOptions size_approx_options; size_approx_options.include_memtabtles = true; size_approx_options.include_files = true; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, 
default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table - db_->GetApproximateSizes(&r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); ASSERT_EQ(size, 0); start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); } start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); options.max_write_buffer_number = 8; @@ -1421,58 +1497,64 @@ keys[i * 3 + 1] = i * 5 + 1; keys[i * 3 + 2] = i * 5 + 2; } - std::random_shuffle(std::begin(keys), std::end(keys)); + // MemTable entry counting is estimated and can vary greatly depending on + // layout. Thus, using deterministic seed for test stability. 
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next()); for (int i = 0; i < N * 3; i++) { - ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(keys[i] + 1000), rnd.RandomString(1024))); } start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); ASSERT_GT(size_with_mt, 6000); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_EQ(size_without_mt, 0); - Flush(); + ASSERT_OK(Flush()); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i + 1000), rnd.RandomString(1024))); } start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_GT(size_with_mt, size_without_mt); 
ASSERT_GT(size_without_mt, 6000); // Check that include_memtabtles flag works as expected size_approx_options.include_memtabtles = false; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, size_without_mt); // Check that files_size_error_margin works as expected, when the heuristic @@ -1481,63 +1563,92 @@ end = Key(1000 + N - 2); r = Range(start, end); size_approx_options.files_size_error_margin = -1.0; // disabled - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); uint64_t size2; size_approx_options.files_size_error_margin = 0.5; // enabled, but not used - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2)); ASSERT_EQ(size, size2); } TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + // Roughly 4 keys per data block, 1000 keys per file, + // with filter substantially larger than a data block + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(16)); + table_options.block_size = 100; Options options = CurrentOptions(); - options.write_buffer_size = 1024 * 1024; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.write_buffer_size = 24 * 1024; options.compression = kNoCompression; options.create_if_missing = true; - options.target_file_size_base = 1024 * 1024; + options.target_file_size_base = 24 * 1024; DestroyAndReopen(options); const auto default_cf = db_->DefaultColumnFamily(); const int N = 64000; Random rnd(301); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(24))); } // Flush everything to files - Flush(); + ASSERT_OK(Flush()); // Compact the entire key space 
into the next level - db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr); + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr)); // Write more keys for (int i = N; i < (N + N / 4); i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(24))); } // Flush everything to files again - Flush(); + ASSERT_OK(Flush()); // Wait for compaction to finish ASSERT_OK(dbfull()->TEST_WaitForCompact()); - const std::string start = Key(0); - const std::string end = Key(2 * N); - const Range r(start, end); - - SizeApproximationOptions size_approx_options; - size_approx_options.include_memtabtles = false; - size_approx_options.include_files = true; - size_approx_options.files_size_error_margin = -1.0; // disabled - - // Get the precise size without any approximation heuristic - uint64_t size; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); - ASSERT_NE(size, 0); + { + const std::string start = Key(0); + const std::string end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size)); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size2)); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); + } - // Get the size with an approximation heuristic - uint64_t size2; - const double error_margin = 0.2; - size_approx_options.files_size_error_margin = 
error_margin; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); - ASSERT_LT(size2, size * (1 + error_margin)); - ASSERT_GT(size2, size * (1 - error_margin)); + { + // Ensure that metadata is not falsely attributed only to the last data in + // the file. (In some applications, filters can be large portion of data + // size.) + // Perform many queries over small range, enough to ensure crossing file + // boundary, and make sure we never see a spike for large filter. + for (int i = 0; i < 3000; i += 10) { + const std::string start = Key(i); + const std::string end = Key(i + 11); // overlap by 1 key + const Range r(start, end); + uint64_t size; + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_LE(size, 11 * 100); + } + } } TEST_F(DBTest, GetApproximateMemTableStats) { @@ -1550,7 +1661,7 @@ const int N = 128; Random rnd(301); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); } uint64_t count; @@ -1572,7 +1683,7 @@ ASSERT_EQ(count, 0); ASSERT_EQ(size, 0); - Flush(); + ASSERT_OK(Flush()); start = Key(50); end = Key(60); @@ -1582,7 +1693,7 @@ ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); } start = Key(100); @@ -1602,9 +1713,12 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + uint64_t size; + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Write 8MB (80 values, each 100K) ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -1613,11 +1727,12 @@ static const int S2 = 105000; // Allow some expansion from metadata Random rnd(301); for (int i = 0; i < N; i++) { - 
ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1))); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(S1))); } // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { @@ -1625,20 +1740,23 @@ for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); - ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), - S2 * (i + 1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); + ASSERT_OK(Size("", Key(i), 1, &size)); + ASSERT_TRUE(Between(size, S1 * i, S2 * i)); + ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1))); + ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 10, S2 * 10)); } - ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); - ASSERT_TRUE( - Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); std::string cstart_str = Key(compact_start); std::string cend_str = Key(compact_start + 9); Slice cstart = cstart_str; Slice cend = cend_str; - dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1])); } ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -1656,33 +1774,45 @@ CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000))); + std::string big1 = rnd.RandomString(100000); + 
ASSERT_OK(Put(1, Key(0), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(1), rnd.RandomString(10000))); ASSERT_OK(Put(1, Key(2), big1)); - ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(3), rnd.RandomString(10000))); ASSERT_OK(Put(1, Key(4), big1)); - ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(5), rnd.RandomString(10000))); + ASSERT_OK(Put(1, Key(6), rnd.RandomString(300000))); + ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000))); // Check sizes across recovery by reopening a few times + uint64_t size; for (int run = 0; run < 3; run++) { ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000)); + ASSERT_OK(Size("", Key(0), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + ASSERT_OK(Size("", Key(1), 1, &size)); + ASSERT_TRUE(Between(size, 10000, 11000)); + ASSERT_OK(Size("", Key(2), 1, &size)); + ASSERT_TRUE(Between(size, 20000, 21000)); + ASSERT_OK(Size("", Key(3), 1, &size)); + ASSERT_TRUE(Between(size, 120000, 121000)); + ASSERT_OK(Size("", Key(4), 1, &size)); + ASSERT_TRUE(Between(size, 130000, 131000)); + ASSERT_OK(Size("", Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 230000, 232000)); + ASSERT_OK(Size("", Key(6), 1, &size)); + ASSERT_TRUE(Between(size, 240000, 242000)); + // Ensure some overhead is accounted for, even without including all + 
ASSERT_OK(Size("", Key(7), 1, &size)); + ASSERT_TRUE(Between(size, 540500, 545000)); + ASSERT_OK(Size("", Key(8), 1, &size)); + ASSERT_TRUE(Between(size, 550500, 555000)); - ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000)); + ASSERT_OK(Size(Key(3), Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 110100, 111000)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); } // ApproximateOffsetOf() is not yet implemented in plain table format. } while (ChangeOptions(kSkipPlainTable)); @@ -1691,29 +1821,30 @@ #ifndef ROCKSDB_LITE TEST_F(DBTest, Snapshot) { + env_->SetMockSleep(); anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); - Put(0, "foo", "0v1"); - Put(1, "foo", "1v1"); + ASSERT_OK(Put(0, "foo", "0v1")); + ASSERT_OK(Put(1, "foo", "1v1")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_EQ(1U, GetNumSnapshots()); uint64_t time_snap1 = GetTimeOldestSnapshots(); ASSERT_GT(time_snap1, 0U); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v2"); - Put(1, "foo", "1v2"); + ASSERT_OK(Put(0, "foo", "0v2")); + ASSERT_OK(Put(1, "foo", "1v2")); - env_->addon_time_.fetch_add(1); + env_->MockSleepForSeconds(1); const Snapshot* s2 = db_->GetSnapshot(); ASSERT_EQ(2U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v3"); - Put(1, "foo", "1v3"); + ASSERT_OK(Put(0, "foo", "0v3")); + ASSERT_OK(Put(1, "foo", "1v3")); { ManagedSnapshot s3(db_); @@ -1721,8 +1852,8 @@ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); - Put(0, "foo", "0v4"); - Put(1, "foo", "1v4"); + ASSERT_OK(Put(0, "foo", "0v4")); + ASSERT_OK(Put(1, "foo", "1v4")); ASSERT_EQ("0v1", Get(0, "foo", s1)); 
ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); @@ -1763,35 +1894,38 @@ TEST_F(DBTest, HiddenValuesAreRemoved) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; + uint64_t size; do { Options options = CurrentOptions(options_override); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); FillLevels("a", "z", 1); - std::string big = RandomString(&rnd, 50000); - Put(1, "foo", big); - Put(1, "pastfoo", "v"); + std::string big = rnd.RandomString(50000); + ASSERT_OK(Put(1, "foo", big)); + ASSERT_OK(Put(1, "pastfoo", "v")); const Snapshot* snapshot = db_->GetSnapshot(); - Put(1, "foo", "tiny"); - Put(1, "pastfoo2", "v2"); // Advance sequence number one more + ASSERT_OK(Put(1, "foo", "tiny")); + ASSERT_OK(Put(1, "pastfoo2", "v2")); // Advance sequence number one more ASSERT_OK(Flush(1)); ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(big, Get(1, "foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 50000, 60000)); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); Slice x("x"); - dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1])); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_GE(NumTableFilesAtLevel(1, 1), 1); - dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1])); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 0, 1000)); // ApproximateOffsetOf() is not yet implemented in plain table format, // which is used by Size(). 
} while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | @@ -1817,26 +1951,26 @@ options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "first"); + ASSERT_OK(Put(1, "foo", "first")); const Snapshot* snapshot = db_->GetSnapshot(); - SingleDelete(1, "foo"); - Put(1, "foo", "second"); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_OK(Put(1, "foo", "second")); ASSERT_OK(Flush(1)); ASSERT_EQ("first", Get(1, "foo", snapshot)); ASSERT_EQ("second", Get(1, "foo")); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1)); - SingleDelete(1, "foo"); + ASSERT_OK(SingleDelete(1, "foo")); ASSERT_EQ("first", Get(1, "foo", snapshot)); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); ASSERT_EQ("first", Get(1, "foo", snapshot)); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); @@ -1852,7 +1986,7 @@ TEST_F(DBTest, DeletionMarkers1) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); + ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); const int last = 2; MoveFilesToLevel(last, 1); @@ -1860,24 +1994,25 @@ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); MoveFilesToLevel(last - 1, 1); ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - Delete(1, "foo"); - Put(1, "foo", "v2"); + ASSERT_OK(Delete(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, 
DEL, v1 ]"); ASSERT_OK(Flush(1)); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); Slice z("z"); - dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1])); // DEL eliminated, but v1 remains because we aren't compacting that level // (DEL can be eliminated because v2 hides v1). ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1])); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); @@ -1886,7 +2021,7 @@ TEST_F(DBTest, DeletionMarkers2) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "foo", "v1"); + ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); const int last = 2; MoveFilesToLevel(last, 1); @@ -1894,21 +2029,23 @@ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); // Place a table at level last-1 to prevent merging with preceding mutation - Put(1, "a", "begin"); - Put(1, "z", "end"); - Flush(1); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); + ASSERT_OK(Flush(1)); MoveFilesToLevel(last - 1, 1); ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1); ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1); - Delete(1, "foo"); + ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); ASSERT_OK(Flush(1)); // Moves to level last-2 ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1])); // DEL kept: "last" file overlaps ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]); + ASSERT_OK( + dbfull()->TEST_CompactRange(last - 1, nullptr, 
nullptr, handles_[1])); // Merging last-1 w/ last, so we are the base level for "foo", so // DEL is removed. (as is v1). ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); @@ -1923,11 +2060,11 @@ // 0. ASSERT_OK(Put(1, "100", "v100")); ASSERT_OK(Put(1, "999", "v999")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Delete(1, "100")); ASSERT_OK(Delete(1, "999")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(1, 1); ASSERT_EQ("0,1,1", FilesPerLevel(1)); @@ -1937,23 +2074,30 @@ // Note that files are sorted by smallest key. ASSERT_OK(Put(1, "300", "v300")); ASSERT_OK(Put(1, "500", "v500")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "200", "v200")); ASSERT_OK(Put(1, "600", "v600")); ASSERT_OK(Put(1, "900", "v900")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ("2,1,1", FilesPerLevel(1)); + // BEGIN addition to existing test + // Take this opportunity to verify SST unique ids (including Plain table) + TablePropertiesCollection tbc; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc)); + VerifySstUniqueIds(tbc); + // END addition to existing test + // Compact away the placeholder files we created initially - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1])); ASSERT_EQ("2", FilesPerLevel(1)); // Do a memtable compaction. Before bug-fix, the compaction would // not detect the overlap with level-0 files and would incorrectly place // the deletion in a deeper level. 
ASSERT_OK(Delete(1, "600")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ("3", FilesPerLevel(1)); ASSERT_EQ("NOT_FOUND", Get(1, "600")); } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); @@ -2099,7 +2243,7 @@ ASSERT_OK(Put(1, "a", "123")); ASSERT_OK(Put(1, "b", "234")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(3, 1); Close(); @@ -2159,7 +2303,7 @@ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); + values.push_back(rnd.RandomString(100000)); ASSERT_OK(Put((i < 40), Key(i), values[i])); } @@ -2170,8 +2314,8 @@ uint64_t manifest_number = 0; uint64_t manifest_size = 0; std::vector files; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(files, &manifest_size); + ASSERT_OK(dbfull()->DisableFileDeletions()); + ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size)); // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF) ASSERT_EQ(files.size(), 5U); @@ -2181,7 +2325,10 @@ // copy these files to a new snapshot directory std::string snapdir = dbname_ + ".snapdir/"; - ASSERT_OK(env_->CreateDirIfMissing(snapdir)); + if (env_->FileExists(snapdir).ok()) { + ASSERT_OK(DestroyDir(env_, snapdir)); + } + ASSERT_OK(env_->CreateDir(snapdir)); for (size_t i = 0; i < files.size(); i++) { // our clients require that GetLiveFiles returns @@ -2197,22 +2344,21 @@ // latest manifest file if (ParseFileName(files[i].substr(1), &number, &type)) { if (type == kDescriptorFile) { - if (number > manifest_number) { - manifest_number = number; - ASSERT_GE(size, manifest_size); - size = manifest_size; // copy only valid MANIFEST data - } + ASSERT_EQ(manifest_number, 0); + manifest_number = number; + ASSERT_GE(size, manifest_size); + size = manifest_size; // copy only valid MANIFEST data } } CopyFile(src, dest, size); } // release file snapshot - dbfull()->DisableFileDeletions(); + ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false)); // overwrite one 
key, this key should not appear in the snapshot std::vector extras; for (unsigned int i = 0; i < 1; i++) { - extras.push_back(RandomString(&rnd, 100000)); + extras.push_back(rnd.RandomString(100000)); ASSERT_OK(Put(0, Key(i), extras[i])); } @@ -2232,7 +2378,7 @@ ReadOptions roptions; std::string val; for (unsigned int i = 0; i < 80; i++) { - stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val); + ASSERT_OK(snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val)); ASSERT_EQ(values[i].compare(val), 0); } for (auto cfh : cf_handles) { @@ -2245,8 +2391,8 @@ uint64_t new_manifest_number = 0; uint64_t new_manifest_size = 0; std::vector newfiles; - dbfull()->DisableFileDeletions(); - dbfull()->GetLiveFiles(newfiles, &new_manifest_size); + ASSERT_OK(dbfull()->DisableFileDeletions()); + ASSERT_OK(dbfull()->GetLiveFiles(newfiles, &new_manifest_size)); // find the new manifest file. assert that this manifest file is // the same one as in the previous snapshot. But its size should be @@ -2258,20 +2404,41 @@ // latest manifest file if (ParseFileName(newfiles[i].substr(1), &number, &type)) { if (type == kDescriptorFile) { - if (number > new_manifest_number) { - uint64_t size; - new_manifest_number = number; - ASSERT_OK(env_->GetFileSize(src, &size)); - ASSERT_GE(size, new_manifest_size); - } + ASSERT_EQ(new_manifest_number, 0); + uint64_t size; + new_manifest_number = number; + ASSERT_OK(env_->GetFileSize(src, &size)); + ASSERT_GE(size, new_manifest_size); } } } ASSERT_EQ(manifest_number, new_manifest_number); ASSERT_GT(new_manifest_size, manifest_size); - // release file snapshot - dbfull()->DisableFileDeletions(); + // Also test GetLiveFilesStorageInfo + std::vector new_infos; + ASSERT_OK(dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), + &new_infos)); + + // Close DB (while deletions disabled) + Close(); + + // Validate + for (auto& info : new_infos) { + std::string path = info.directory + "/" + info.relative_filename; + uint64_t size; + 
ASSERT_OK(env_->GetFileSize(path, &size)); + if (info.trim_to_size) { + ASSERT_LE(info.size, size); + } else if (!info.replacement_contents.empty()) { + ASSERT_EQ(info.size, info.replacement_contents.size()); + } else { + ASSERT_EQ(info.size, size); + } + if (info.file_type == kDescriptorFile) { + ASSERT_EQ(info.file_number, manifest_number); + } + } } while (ChangeCompactOptions()); } @@ -2292,7 +2459,7 @@ uint64_t manifest_size = 0; std::vector files; - dbfull()->GetLiveFiles(files, &manifest_size); + ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size)); for (const std::string& f : files) { uint64_t number = 0; @@ -2300,7 +2467,7 @@ if (ParseFileName(f.substr(1), &number, &type)) { if (type == kDescriptorFile) { uint64_t size_on_disk; - env_->GetFileSize(dbname_ + "/" + f, &size_on_disk); + ASSERT_OK(env_->GetFileSize(dbname_ + "/" + f, &size_on_disk)); ASSERT_EQ(manifest_size, size_on_disk); break; } @@ -2309,16 +2476,58 @@ Close(); } while (ChangeCompactOptions()); } + +TEST_F(DBTest, GetLiveBlobFiles) { + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) below and the periodic stat dumping thread. + Options options = CurrentOptions(); + options.stats_dump_period_sec = 0; + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + Reopen(options); + + AddBlobFile(db_->DefaultColumnFamily(), blob_file_number, total_blob_count, + total_blob_bytes, checksum_method, checksum_value, + garbage_blob_count, garbage_blob_bytes); + // Make sure it appears in the results returned by GetLiveFiles. 
+ uint64_t manifest_size = 0; + std::vector files; + ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size)); + + ASSERT_FALSE(files.empty()); + ASSERT_EQ(files[0], BlobFileName("", blob_file_number)); + + ColumnFamilyMetaData cfmd; + + db_->GetColumnFamilyMetaData(&cfmd); + ASSERT_EQ(cfmd.blob_files.size(), 1); + const BlobMetaData& bmd = cfmd.blob_files[0]; + + CheckBlobMetaData(bmd, blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value, garbage_blob_count, + garbage_blob_bytes); + ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_)); + ASSERT_EQ(cfmd.blob_file_count, 1U); + ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size); +} #endif TEST_F(DBTest, PurgeInfoLogs) { Options options = CurrentOptions(); options.keep_log_file_num = 5; options.create_if_missing = true; + options.env = env_; for (int mode = 0; mode <= 1; mode++) { if (mode == 1) { options.db_log_dir = dbname_ + "_logs"; - env_->CreateDirIfMissing(options.db_log_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.db_log_dir)); } else { options.db_log_dir = ""; } @@ -2327,8 +2536,8 @@ } std::vector files; - env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir, - &files); + ASSERT_OK(env_->GetChildren( + options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files)); int info_log_count = 0; for (std::string file : files) { if (file.find("LOG") != std::string::npos) { @@ -2340,19 +2549,18 @@ Destroy(options); // For mode (1), test DestroyDB() to delete all the logs under DB dir. // For mode (2), no info log file should have been put under DB dir. 
+ // Since dbname_ has no children, there is no need to loop db_files std::vector db_files; - env_->GetChildren(dbname_, &db_files); - for (std::string file : db_files) { - ASSERT_TRUE(file.find("LOG") == std::string::npos); - } + ASSERT_TRUE(env_->GetChildren(dbname_, &db_files).IsNotFound()); + ASSERT_TRUE(db_files.empty()); if (mode == 1) { // Cleaning up - env_->GetChildren(options.db_log_dir, &files); + ASSERT_OK(env_->GetChildren(options.db_log_dir, &files)); for (std::string file : files) { - env_->DeleteFile(options.db_log_dir + "/" + file); + ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file)); } - env_->DeleteDir(options.db_log_dir); + ASSERT_OK(env_->DeleteDir(options.db_log_dir)); } } } @@ -2368,9 +2576,7 @@ struct MTState { DBTest* test; - std::atomic stop; std::atomic counter[kNumThreads]; - std::atomic thread_done[kNumThreads]; }; struct MTThread { @@ -2384,10 +2590,13 @@ int id = t->id; DB* db = t->state->test->db_; int counter = 0; + std::shared_ptr clock = SystemClock::Default(); + auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U; + fprintf(stderr, "... 
starting thread %d\n", id); Random rnd(1000 + id); char valbuf[1500]; - while (t->state->stop.load(std::memory_order_acquire) == false) { + while (clock->NowMicros() < end_micros) { t->state->counter[id].store(counter, std::memory_order_release); int key = rnd.Uniform(kNumKeys); @@ -2407,7 +2616,8 @@ for (int cf = 0; cf < kColumnFamilies; ++cf) { snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf), + Slice(valbuf))); } ASSERT_OK(db->Write(WriteOptions(), &batch)); } else { @@ -2415,7 +2625,8 @@ for (int cf = 0; cf < kColumnFamilies; ++cf) { snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id, static_cast(counter), cf, unique_id); - batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf)); + ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf), + Slice(valbuf))); } ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch())); } @@ -2482,7 +2693,6 @@ } counter++; } - t->state->thread_done[id].store(true, std::memory_order_release); fprintf(stderr, "... 
stopping thread %d after %d ops\n", id, int(counter)); } @@ -2521,10 +2731,8 @@ // Initialize state MTState mt; mt.test = this; - mt.stop.store(false, std::memory_order_release); for (int id = 0; id < kNumThreads; id++) { mt.counter[id].store(0, std::memory_order_release); - mt.thread_done[id].store(false, std::memory_order_release); } // Start threads @@ -2536,16 +2744,7 @@ env_->StartThread(MTThreadBody, &thread[id]); } - // Let them run for a while - env_->SleepForMicroseconds(kTestSeconds * 1000000); - - // Stop the threads and wait for them to finish - mt.stop.store(true, std::memory_order_release); - for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].load(std::memory_order_acquire) == false) { - env_->SleepForMicroseconds(100000); - } - } + env_->WaitForJoin(); } INSTANTIATE_TEST_CASE_P( @@ -2636,7 +2835,7 @@ #endif // TRAVIS namespace { -typedef std::map KVMap; +using KVMap = std::map; } class ModelDB : public DB { @@ -2657,7 +2856,10 @@ Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, const Slice& v) override { WriteBatch batch; - batch.Put(cf, k, v); + Status s = batch.Put(cf, k, v); + if (!s.ok()) { + return s; + } return Write(o, &batch); } using DB::Close; @@ -2666,21 +2868,30 @@ Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& key) override { WriteBatch batch; - batch.Delete(cf, key); + Status s = batch.Delete(cf, key); + if (!s.ok()) { + return s; + } return Write(o, &batch); } using DB::SingleDelete; Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& key) override { WriteBatch batch; - batch.SingleDelete(cf, key); + Status s = batch.SingleDelete(cf, key); + if (!s.ok()) { + return s; + } return Write(o, &batch); } using DB::Merge; Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k, const Slice& v) override { WriteBatch batch; - batch.Merge(cf, k, v); + Status s = batch.Merge(cf, k, v); + if (!s.ok()) { + return s; + } return 
Write(o, &batch); } using DB::Get; @@ -2929,15 +3140,27 @@ Status SyncWAL() override { return Status::OK(); } -#ifndef ROCKSDB_LITE Status DisableFileDeletions() override { return Status::OK(); } Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } +#ifndef ROCKSDB_LITE + Status GetLiveFiles(std::vector&, uint64_t* /*size*/, bool /*flush_memtable*/ = true) override { return Status::OK(); } + Status GetLiveFilesChecksumInfo( + FileChecksumList* /*checksum_list*/) override { + return Status::OK(); + } + + Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& /*opts*/, + std::vector* /*files*/) override { + return Status::OK(); + } + Status GetSortedWalFiles(VectorLogPtr& /*files*/) override { return Status::OK(); } @@ -2970,12 +3193,26 @@ return Status::OK(); } + Status GetDbSessionId(std::string& /*session_id*/) const override { + return Status::OK(); + } + SequenceNumber GetLatestSequenceNumber() const override { return 0; } bool SetPreserveDeletesSequenceNumber(SequenceNumber /*seqnum*/) override { return true; } + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* /*cf*/, + std::string /*ts_low*/) override { + return Status::OK(); + } + + Status GetFullHistoryTsLow(ColumnFamilyHandle* /*cf*/, + std::string* /*ts_low*/) override { + return Status::OK(); + } + ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; } private: @@ -3025,7 +3262,7 @@ std::string name_ = ""; }; -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) static std::string RandomKey(Random* rnd, int minimum = 0) { int len; do { @@ -3061,7 +3298,7 @@ fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. 
'%s'\n", step, EscapeString(miter->key()).c_str(), EscapeString(miter->value()).c_str(), - EscapeString(miter->value()).c_str()); + EscapeString(dbiter->value()).c_str()); ok = false; } } @@ -3125,8 +3362,8 @@ } if (p < 45) { // Put k = RandomKey(&rnd, minimum); - v = RandomString(&rnd, - rnd.OneIn(20) ? 100 + rnd.Uniform(100) : rnd.Uniform(8)); + v = rnd.RandomString(rnd.OneIn(20) ? 100 + rnd.Uniform(100) + : rnd.Uniform(8)); ASSERT_OK(model.Put(WriteOptions(), k, v)); ASSERT_OK(db_->Put(WriteOptions(), k, v)); } else if (p < 90) { // Delete @@ -3144,10 +3381,10 @@ // we have multiple entries in the write batch for the same key } if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); + v = rnd.RandomString(rnd.Uniform(10)); + ASSERT_OK(b.Put(k, v)); } else { - b.Delete(k); + ASSERT_OK(b.Delete(k)); } } ASSERT_OK(model.Write(WriteOptions(), &b)); @@ -3180,7 +3417,7 @@ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { // create a DB with block prefix index @@ -3192,7 +3429,7 @@ Reopen(options); ASSERT_OK(Put("k1", "v1")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("k2", "v2")); // Reopen it without prefix extractor, make sure everything still works. 
@@ -3205,6 +3442,27 @@ ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); } +TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) { + // create a DB with block prefix index + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewCappedPrefixTransform(2)); + + Reopen(options); + ASSERT_OK(Put("kk1", "v1")); + ASSERT_OK(Put("kk2", "v2")); + ASSERT_OK(Put("kk", "v3")); + ASSERT_OK(Put("k", "v4")); + Flush(); + + ASSERT_EQ("v1", Get("kk1")); + ASSERT_EQ("v2", Get("kk2")); + + ASSERT_EQ("v3", Get("kk")); + ASSERT_EQ("v4", Get("k")); +} TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) { // create a DB with block prefix index @@ -3225,7 +3483,7 @@ Reopen(options); ASSERT_OK(Put("k1", "v1")); - Flush(); + ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; @@ -3314,7 +3572,7 @@ Random rnd(301); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 110; ++j) { - ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 100 + j), rnd.RandomString(980))); } // flush should happen here ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); @@ -3352,9 +3610,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // It should be compacted to 10 files. @@ -3363,9 +3621,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. 
for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j + 2000), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } @@ -3393,10 +3651,10 @@ Random rnd(301); for (int i = 0; i < 3; i++) { // Each file contains a different key which will be dropped later. - ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500))); + ASSERT_OK(Put("a" + ToString(i), rnd.RandomString(500))); ASSERT_OK(Put("key" + ToString(i), "")); - ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500))); - Flush(); + ASSERT_OK(Put("z" + ToString(i), rnd.RandomString(500))); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 1); @@ -3405,10 +3663,10 @@ } for (int i = 0; i < 3; i++) { // Each file contains a different key which will be dropped later. - ASSERT_OK(Put("a" + ToString(i), RandomString(&rnd, 500))); + ASSERT_OK(Put("a" + ToString(i), rnd.RandomString(500))); ASSERT_OK(Delete("key" + ToString(i))); - ASSERT_OK(Put("z" + ToString(i), RandomString(&rnd, 500))); - Flush(); + ASSERT_OK(Put("z" + ToString(i), rnd.RandomString(500))); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 2); @@ -3418,17 +3676,21 @@ } // Check that FIFO-with-TTL is not supported with max_open_files != -1. +// Github issue #8014 TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleFIFO; options.create_if_missing = true; options.ttl = 600; // seconds - // TTL is now supported with max_open_files != -1. + // TTL is not supported with max_open_files != -1. 
+ options.max_open_files = 0; + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + options.max_open_files = 100; - options = CurrentOptions(options); - ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + // TTL is supported with unlimited max_open_files options.max_open_files = -1; ASSERT_OK(TryReopen(options)); } @@ -3460,13 +3722,14 @@ options.arena_block_size = 4096; options.compression = kNoCompression; options.create_if_missing = true; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; // Test to make sure that all files with expired ttl are deleted on next // manual compaction. { - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; options.ttl = 1 * 60 * 60 ; // 1 hour @@ -3477,25 +3740,22 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Sleep for 2 hours -- which is much greater than TTL. - // Note: Couldn't use SleepForMicroseconds because it takes an int instead - // of uint64_t. Hence used addon_time_ directly. - // env_->SleepForMicroseconds(2 * 60 * 60 * 1000 * 1000); - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Since no flushes and compactions have run, the db should still be in // the same state even after considerable time has passed. 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); } @@ -3512,15 +3772,15 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 10); // Sleep for 2 hours -- which is much greater than TTL. - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Just to make sure that we are in the same state even after sleeping. ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -3528,9 +3788,9 @@ // Create 1 more file to trigger TTL compaction. The old files are dropped. for (int i = 0; i < 1; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -3554,24 +3814,24 @@ for (int i = 0; i < 3; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 3); // Sleep for 2 hours -- which is much greater than TTL. - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Just to make sure that we are in the same state even after sleeping. 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 3); for (int i = 0; i < 5; i++) { for (int j = 0; j < 140; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // Size limit is still guaranteed. @@ -3592,9 +3852,9 @@ for (int i = 0; i < 10; i++) { // Generate and flush a file about 10KB. for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1 @@ -3603,7 +3863,7 @@ ASSERT_EQ(NumTableFilesAtLevel(0), 5); // Sleep for 2 hours -- which is much greater than TTL. - env_->addon_time_.fetch_add(2 * 60 * 60); + env_->MockSleepForSeconds(2 * 60 * 60); // Just to make sure that we are in the same state even after sleeping. ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 5); @@ -3611,9 +3871,9 @@ // Create 10 more files. The old 5 files are dropped as their ttl expired. for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(NumTableFilesAtLevel(0), 5); @@ -3636,9 +3896,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // It should be compacted to 10 files. 
@@ -3647,9 +3907,9 @@ for (int i = 0; i < 60; i++) { // Generate and flush a file about 20KB. for (int j = 0; j < 20; j++) { - ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980))); + ASSERT_OK(Put(ToString(i * 20 + j + 2000), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } @@ -3690,8 +3950,7 @@ uint64_t start = env_->NowMicros(); // Write ~96M data for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK( - Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo)); } uint64_t elapsed = env_->NowMicros() - start; double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed; @@ -3709,8 +3968,7 @@ start = env_->NowMicros(); // Write ~96M data for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK( - Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo)); } rate_limiter_drains = TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) - @@ -3735,8 +3993,7 @@ start = env_->NowMicros(); // Write ~96M data for (int64_t i = 0; i < (96 << 10); ++i) { - ASSERT_OK( - Put(RandomString(&rnd, 32), RandomString(&rnd, (1 << 10) + 1), wo)); + ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo)); } elapsed = env_->NowMicros() - start; rate_limiter_drains = @@ -3753,13 +4010,66 @@ ASSERT_LT(ratio, 0.6); } +// This is a mocked customed rate limiter without implementing optional APIs +// (e.g, RateLimiter::GetTotalPendingRequests()) +class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter { + public: + MockedRateLimiterWithNoOptionalAPIImpl() {} + + ~MockedRateLimiterWithNoOptionalAPIImpl() override {} + + const char* Name() const override { + return "MockedRateLimiterWithNoOptionalAPI"; + } + void SetBytesPerSecond(int64_t bytes_per_second) override { + (void)bytes_per_second; + } + + using 
RateLimiter::Request; + void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats) override { + (void)bytes; + (void)pri; + (void)stats; + } + + int64_t GetSingleBurstBytes() const override { return 200; } + + int64_t GetTotalBytesThrough( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + (void)pri; + return 0; + } + + int64_t GetTotalRequests( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + (void)pri; + return 0; + } + + int64_t GetBytesPerSecond() const override { return 0; } +}; + +// To test that customed rate limiter not implementing optional APIs (e.g, +// RateLimiter::GetTotalPendingRequests()) works fine with RocksDB basic +// operations (e.g, Put, Get, Flush) +TEST_F(DBTest, CustomedRateLimiterWithNoOptionalAPIImplTest) { + Options options = CurrentOptions(); + options.rate_limiter.reset(new MockedRateLimiterWithNoOptionalAPIImpl()); + DestroyAndReopen(options); + ASSERT_OK(Put("abc", "def")); + ASSERT_EQ(Get("abc"), "def"); + ASSERT_OK(Flush()); + ASSERT_EQ(Get("abc"), "def"); +} + TEST_F(DBTest, TableOptionsSanitizeTest) { Options options = CurrentOptions(); options.create_if_missing = true; DestroyAndReopen(options); ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); Destroy(options); ASSERT_TRUE(!TryReopen(options).IsNotSupported()); @@ -3840,7 +4150,7 @@ ASSERT_OK(Put("abc", "def")); ASSERT_EQ("def", Get("abc")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("def", Get("abc")); } } @@ -3849,9 +4159,9 @@ std::vector threads; dbfull()->TEST_LockMutex(); auto w = dbfull()->TEST_BeginWrite(); - threads.emplace_back([&] { Put("a", "b"); }); + threads.emplace_back([&] { ASSERT_OK(Put("a", "b")); }); env_->SleepForMicroseconds(10000); - threads.emplace_back([&] { Flush(); }); + threads.emplace_back([&] { ASSERT_OK(Flush()); }); 
env_->SleepForMicroseconds(10000); dbfull()->TEST_UnlockMutex(); dbfull()->TEST_LockMutex(); @@ -3866,6 +4176,7 @@ TEST_F(DBTest, ConcurrentFlushWAL) { const size_t cnt = 100; Options options; + options.env = env_; WriteOptions wopt; ReadOptions ropt; for (bool two_write_queues : {false, true}) { @@ -3878,7 +4189,8 @@ threads.emplace_back([&] { for (size_t i = 0; i < cnt; i++) { auto istr = ToString(i); - db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, "b" + istr); + ASSERT_OK(db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, + "b" + istr)); } }); if (two_write_queues) { @@ -3886,14 +4198,15 @@ for (size_t i = cnt; i < 2 * cnt; i++) { auto istr = ToString(i); WriteBatch batch; - batch.Put("a" + istr, "b" + istr); - dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true); + ASSERT_OK(batch.Put("a" + istr, "b" + istr)); + ASSERT_OK( + dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true)); } }); } threads.emplace_back([&] { for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(false)); } }); for (auto& t : threads) { @@ -3913,6 +4226,39 @@ } } +// This test failure will be caught with a probability +TEST_F(DBTest, ManualFlushWalAndWriteRace) { + Options options; + options.env = env_; + options.manual_wal_flush = true; + options.create_if_missing = true; + + DestroyAndReopen(options); + + WriteOptions wopts; + wopts.sync = true; + + port::Thread writeThread([&]() { + for (int i = 0; i < 100; i++) { + auto istr = ToString(i); + ASSERT_OK(dbfull()->Put(wopts, "key_" + istr, "value_" + istr)); + } + }); + port::Thread flushThread([&]() { + for (int i = 0; i < 100; i++) { + ASSERT_OK(dbfull()->FlushWAL(false)); + } + }); + + writeThread.join(); + flushThread.join(); + ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1")); + ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2")); + Reopen(options); + ASSERT_EQ("value1", Get("foo1")); + ASSERT_EQ("value2", Get("foo2")); +} + #ifndef 
ROCKSDB_LITE TEST_F(DBTest, DynamicMemtableOptions) { const uint64_t k64KB = 1 << 16; @@ -3936,7 +4282,7 @@ const int kNumPutsBeforeWaitForFlush = 64; Random rnd(301); for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); // The following condition prevents a race condition between flush jobs // acquiring work and this thread filling up multiple memtables. Without @@ -3944,10 +4290,10 @@ // multiple memtables are flushed into a single L0 file. This race // condition affects assertion (A). if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); }; // Test write_buffer_size @@ -3957,7 +4303,7 @@ ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2); // Clean up L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); // Increase buffer size @@ -4010,7 +4356,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); while (!sleeping_task_low.WokenUp() && count < 256) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions())); count++; } ASSERT_GT(static_cast(count), 128 * 0.8); @@ -4023,14 +4369,14 @@ {"max_write_buffer_number", "8"}, })); // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); sleeping_task_low.Reset(); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); count = 0; while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + 
ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions())); count++; } // Windows fails this test. Will tune in the future and figure out @@ -4046,7 +4392,7 @@ {"max_write_buffer_number", "4"}, })); // Clean up memtable and L0 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); sleeping_task_low.Reset(); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, @@ -4054,7 +4400,7 @@ count = 0; while (!sleeping_task_low.WokenUp() && count < 1024) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions())); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions())); count++; } // Windows fails this test. Will tune in the future and figure out @@ -4146,7 +4492,7 @@ true); } } - db_->DropColumnFamily(handles_[2]); + ASSERT_OK(db_->DropColumnFamily(handles_[2])); delete handles_[2]; handles_.erase(handles_.begin() + 2); env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, @@ -4188,17 +4534,19 @@ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0); uint64_t num_running_flushes = 0; - db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes, + &num_running_flushes)); ASSERT_EQ(num_running_flushes, 0); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush // The first sync point is to make sure there's one flush job // running when we perform VerifyOperationCount(). 
TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1"); VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1); - db_->GetIntProperty(DB::Properties::kNumRunningFlushes, &num_running_flushes); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes, + &num_running_flushes)); ASSERT_EQ(num_running_flushes, 1); // This second sync point is to ensure the flush job will not // be completed until we already perform VerifyOperationCount(). @@ -4241,15 +4589,15 @@ for (int file = 0; file < kNumL0Files; ++file) { for (int key = 0; key < kEntriesPerBuffer; ++key) { ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer), - RandomString(&rnd, kTestValueSize))); + rnd.RandomString(kTestValueSize))); } - Flush(); + ASSERT_OK(Flush()); } // This makes sure a compaction won't be scheduled until // we have done with the above Put Phase. uint64_t num_running_compactions = 0; - db_->GetIntProperty(DB::Properties::kNumRunningCompactions, - &num_running_compactions); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions)); ASSERT_EQ(num_running_compactions, 0); TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0"); ASSERT_GE(NumTableFilesAtLevel(0), @@ -4265,8 +4613,8 @@ // If thread tracking is not enabled, compaction count should be 0. VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0); } - db_->GetIntProperty(DB::Properties::kNumRunningCompactions, - &num_running_compactions); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions, + &num_running_compactions)); ASSERT_EQ(num_running_compactions, 1); // TODO(yhchiang): adding assert to verify each compaction stage. 
TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2"); @@ -4297,7 +4645,7 @@ ASSERT_EQ("1,1,1", FilesPerLevel(1)); // Compaction range overlaps files - Compact(1, "p1", "p9"); + Compact(1, "p", "q"); ASSERT_EQ("0,0,1", FilesPerLevel(1)); // Populate a different range @@ -4312,7 +4660,9 @@ MakeTables(1, "a", "z", 1); ASSERT_EQ("1,0,2", FilesPerLevel(1)); CancelAllBackgroundWork(db_); - db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr) + .IsShutdownInProgress()); ASSERT_EQ("1,0,2", FilesPerLevel(1)); if (iter == 0) { @@ -4389,10 +4739,10 @@ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; for (int file = 0; file < 16 * kNumL0Files; ++file) { for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(ToString(key++), rnd.RandomString(kTestValueSize))); } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4412,12 +4762,12 @@ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); CancelAllBackgroundWork(db_); TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Record the number of compactions at a time. 
for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { operation_count[i] = 0; } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4476,10 +4826,10 @@ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0}; for (int file = 0; file < 16 * kNumL0Files; ++file) { for (int k = 0; k < kEntriesPerBuffer; ++k) { - ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(ToString(key++), rnd.RandomString(kTestValueSize))); } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4499,12 +4849,12 @@ CancelAllBackgroundWork(db_); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Record the number of compactions at a time. 
for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { operation_count[i] = 0; } - Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(env_->GetThreadList(&thread_list)); for (auto thread : thread_list) { operation_count[thread.operation_type]++; } @@ -4530,10 +4880,11 @@ for (int i = 0; i < kNKeys; i++) { keys[i] = i; } - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys)); Random rnd(301); Options options; + options.env = env_; options.create_if_missing = true; options.db_write_buffer_size = 20480; options.write_buffer_size = 20480; @@ -4563,8 +4914,8 @@ for (int i = 0; i < 20; i++) { ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 0); @@ -4576,10 +4927,11 @@ for (int i = 21; i < 120; i++) { ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 0); + ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U + 50U * 24); // Make sure data in files in L3 is not compacted by removing all files @@ -4613,7 +4965,7 @@ for (int i = 0; i < kNKeys; i++) { keys[i] = i; } - std::random_shuffle(std::begin(keys), std::end(keys)); + RandomShuffle(std::begin(keys), std::end(keys)); Random rnd(301); Options options; @@ -4626,7 +4978,7 @@ options.level0_stop_writes_trigger = 2; options.soft_pending_compaction_bytes_limit = 1024 * 1024; options.target_file_size_base = 20; - + options.env = env_; options.level_compaction_dynamic_level_bytes = true; options.max_bytes_for_level_base = 200; options.max_bytes_for_level_multiplier = 8; @@ -4662,17 +5014,17 @@ 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); for (int i = 0; i < 100; i++) { - std::string value = RandomString(&rnd, 200); + std::string value = rnd.RandomString(200); ASSERT_OK(Put(Key(keys[i]), value)); if (i % 25 == 24) { - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -4707,11 +5059,11 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); for (int i = 101; i < 500; i++) { - std::string value = RandomString(&rnd, 200); + std::string value = rnd.RandomString(200); ASSERT_OK(Put(Key(keys[i]), value)); if (i % 100 == 99) { - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -4759,9 +5111,9 @@ auto gen_l0_kb = [this](int start, int size, int stride) { Random rnd(301); for (int i = 0; i < size; i++) { - ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(start + stride * i), rnd.RandomString(1024))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); }; // Write 3 files that have the same key range. 
@@ -4772,7 +5124,7 @@ gen_l0_kb(0, 64, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 2); gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -4791,7 +5143,7 @@ gen_l0_kb(0, 64, 1); ASSERT_EQ("1,1", FilesPerLevel()); gen_l0_kb(0, 64, 1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,2", FilesPerLevel()); metadata.clear(); db_->GetLiveFilesMetaData(&metadata); @@ -4813,7 +5165,7 @@ for (int i = 0; i < 96; ++i) { gen_l0_kb(i, 64, 96); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(SizeAtLevel(1), k1MB / 2); ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); @@ -4834,7 +5186,7 @@ for (int i = 0; i < 20; ++i) { gen_l0_kb(i, 64, 32); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); ASSERT_TRUE(total_size < k128KB * 7 * 1.5); @@ -4842,8 +5194,8 @@ // Clean up memtable and L0. Block compaction threads. If continue to write // and flush memtables. 
We should see put stop after 8 memtable flushes // since level0_stop_writes_trigger = 8 - dbfull()->TEST_FlushMemTable(true, true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Block compaction test::SleepingBackgroundTask sleeping_task_low; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, @@ -4854,8 +5206,8 @@ Random rnd(301); WriteOptions wo; while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo)); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4871,8 +5223,8 @@ // Block compaction thread again. Perform the put and memtable flushes // until we see the stop after 6 memtable flushes. ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}})); - dbfull()->TEST_FlushMemTable(true); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); // Block compaction again @@ -4882,8 +5234,8 @@ sleeping_task_low.WaitUntilSleeping(); count = 0; while (count < 64) { - ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo)); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4900,29 +5252,29 @@ // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of // L0 files do not change after the call. 
ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}})); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 4); // Enable auto compaction and perform the same test, # of L0 files should be // reduced after compaction. ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 0); for (int i = 0; i < 4; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(i), rnd.RandomString(1024))); // Wait for compaction so that put won't stop - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_LT(NumTableFilesAtLevel(0), 4); } @@ -4936,6 +5288,7 @@ Options options; options.ttl = 0; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -4997,6 +5350,7 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { Options options; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -5075,12 +5429,13 @@ DestroyAndReopen(options); Random rnd(301); - const int kCDTKeysPerBuffer = 4; - const int kTestSize = kCDTKeysPerBuffer * 4096; - const int kTotalIteration = 100; + constexpr int kCDTKeysPerBuffer = 4; + constexpr int 
kTestSize = kCDTKeysPerBuffer * 4096; + constexpr int kTotalIteration = 20; // the second half of the test involves in random failure // of file creation. - const int kRandomFailureTest = kTotalIteration / 2; + constexpr int kRandomFailureTest = kTotalIteration / 2; + std::vector values; for (int i = 0; i < kTestSize; ++i) { values.push_back("NOT_FOUND"); @@ -5091,7 +5446,7 @@ } for (int k = 0; k < kTestSize; ++k) { // here we expect some of the Put fails. - std::string value = RandomString(&rnd, 100); + std::string value = rnd.RandomString(100); Status s = Put(Key(k), Slice(value)); if (s.ok()) { // update the latest successful put @@ -5105,8 +5460,8 @@ } // If rocksdb does not do the correct job, internal assert will fail here. - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(dbfull()->TEST_WaitForFlushMemTable().IsIOError()); + ASSERT_TRUE(dbfull()->TEST_WaitForCompact().IsIOError()); // verify we have the latest successful update for (int k = 0; k < kTestSize; ++k) { @@ -5140,11 +5495,11 @@ int key1 = key_start + 1; int key2 = key_start + 2; Random rnd(301); - ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); + ASSERT_OK(Put(Key(key0), rnd.RandomString(8))); for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + ASSERT_OK(Put(Key(key1), rnd.RandomString(8))); } - ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); + ASSERT_OK(Put(Key(key2), rnd.RandomString(8))); std::unique_ptr iter(db_->NewIterator(ReadOptions())); iter->Seek(Key(key1)); ASSERT_TRUE(iter->Valid()); @@ -5160,14 +5515,14 @@ ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}})); // Clear memtable and make new option effective - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // Trigger reseek assert_reseek_count(200, 1); ASSERT_OK( dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}})); // Clear memtable and make new option effective - 
dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // No reseek assert_reseek_count(300, 1); @@ -5210,45 +5565,56 @@ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], &mutable_cf_options)); ASSERT_TRUE(mutable_cf_options.report_bg_io_stats); + ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order); + + ASSERT_OK(dbfull()->SetOptions( + handles_[1], {{"check_flush_compaction_key_order", "false"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order); } #endif // ROCKSDB_LITE TEST_F(DBTest, L0L1L2AndUpHitCounter) { + const int kNumLevels = 3; + const int kNumKeysPerLevel = 10000; + const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel; + Options options = CurrentOptions(); - options.write_buffer_size = 32 * 1024; - options.target_file_size_base = 32 * 1024; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 64 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); + Reopen(options); - int numkeys = 20000; - for (int i = 0; i < numkeys; i++) { - ASSERT_OK(Put(1, Key(i), "val")); + // After the below loop there will be one file on each of L0, L1, and L2. + int key = 0; + for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) { + for (int i = 0; i < kNumKeysPerLevel; ++i) { + ASSERT_OK(Put(Key(key), "val")); + key++; + } + ASSERT_OK(Flush()); + for (int input_level = 0; input_level < output_level; ++input_level) { + // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to + // `input_level + 1`. 
+ ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr)); + } } + assert(key == kNumKeysPerDb); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - - for (int i = 0; i < numkeys; i++) { - ASSERT_EQ(Get(1, Key(i)), "val"); + for (int i = 0; i < kNumKeysPerDb; i++) { + ASSERT_EQ(Get(Key(i)), "val"); } - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + - TestGetTickerCount(options, GET_HIT_L1) + - TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); } TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { @@ -5284,7 +5650,7 @@ Random rnd(301); for (int i = 0; i < kNumKeysWritten; ++i) { // compressible string - ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); } table_options.format_version = first_table_version == 1 ? 2 : 1; @@ -5319,12 +5685,20 @@ &sleeping_task_high, Env::Priority::HIGH); std::vector filenames; - env_->GetChildren(dbname_, &filenames); + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + // In Windows, LOCK file cannot be deleted because it is locked by db_test + // After closing db_test, the LOCK file is unlocked and can be deleted // Delete archival files. 
+ bool deleteDir = true; for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(dbname_ + "/" + filenames[i]); + Status s = env_->DeleteFile(dbname_ + "/" + filenames[i]); + if (!s.ok()) { + deleteDir = false; + } + } + if (deleteDir) { + ASSERT_OK(env_->DeleteDir(dbname_)); } - env_->DeleteDir(dbname_); DestroyAndReopen(options); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -5360,9 +5734,10 @@ public: explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} - bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + bool FullMergeV2(const MergeOperationInput& merge_in, MergeOperationOutput* merge_out) const override { - db_test_->env_->addon_time_.fetch_add(1000); + db_test_->env_->MockSleepForMicroseconds(1000 * + merge_in.operand_list.size()); merge_out->new_value = ""; return true; } @@ -5378,16 +5753,16 @@ // Enable time profiling SetPerfLevel(kEnableTime); - this->env_->addon_time_.store(0); - this->env_->time_elapse_only_sleep_ = true; - this->env_->no_slowdown_ = true; Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.merge_operator.reset(new DelayedMergeOperator(this)); + SetTimeElapseOnlySleepOnReopen(&options); DestroyAndReopen(options); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); - db_->Put(WriteOptions(), "foo", one); + ASSERT_OK(db_->Put(WriteOptions(), "foo", one)); ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "foo", two)); ASSERT_OK(Flush()); @@ -5398,9 +5773,9 @@ opt.verify_checksums = true; opt.snapshot = nullptr; std::string result; - db_->Get(opt, "foo", &result); + ASSERT_OK(db_->Get(opt, "foo", &result)); - ASSERT_EQ(1000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); + ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); ReadOptions read_options; std::unique_ptr iter(db_->NewIterator(read_options)); 
@@ -5411,11 +5786,10 @@ } ASSERT_EQ(1, count); - ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); + ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); #ifdef ROCKSDB_USING_THREAD_STATUS ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0); #endif // ROCKSDB_USING_THREAD_STATUS - this->env_->time_elapse_only_sleep_ = false; } #ifndef ROCKSDB_LITE @@ -5425,18 +5799,24 @@ options.compaction_filter_factory = std::make_shared(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.merge_operator.reset(new DelayedMergeOperator(this)); - options.compaction_style = kCompactionStyleUniversal; + options.disable_auto_compactions = true; options.max_subcompactions = max_subcompactions_; + SetTimeElapseOnlySleepOnReopen(&options); DestroyAndReopen(options); - for (int i = 0; i < 1000; i++) { + constexpr unsigned n = 1000; + for (unsigned i = 0; i < n; i++) { ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST")); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); + ASSERT_EQ(uint64_t{n} * 1000000U, + TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); } TEST_P(DBTestWithParam, FilterCompactionTimeTest) { @@ -5448,14 +5828,17 @@ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.statistics->set_stats_level(kExceptTimeForMutex); options.max_subcompactions = max_subcompactions_; + SetTimeElapseOnlySleepOnReopen(&options); DestroyAndReopen(options); + unsigned n = 0; // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); + ++n; } - 
Flush(); + ASSERT_OK(Flush()); } CompactRangeOptions cro; @@ -5467,7 +5850,9 @@ Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); - ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); + ASSERT_OK(itr->status()); + ASSERT_EQ(uint64_t{n} * 1000000U, + TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME)); delete itr; } #endif // ROCKSDB_LITE @@ -5480,7 +5865,7 @@ Reopen(options); for (int i = 0; i < 100000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); // only 2 memtables will be alive, so logs_to_free needs to always be below // 2 ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); @@ -5500,7 +5885,7 @@ #endif // ROCKSDB_LITE #ifndef ROCKSDB_LITE -TEST_F(DBTest, SuggestCompactRangeTest) { +TEST_F(DBTest, DISABLED_SuggestCompactRangeTest) { class CompactionFilterFactoryGetContext : public CompactionFilterFactory { public: std::unique_ptr CreateCompactionFilter( @@ -5521,8 +5906,8 @@ }; Options options = CurrentOptions(); - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); options.compaction_style = kCompactionStyleLevel; options.compaction_filter_factory.reset( new CompactionFilterFactoryGetContext()); @@ -5578,7 +5963,7 @@ // compact it three times for (int i = 0; i < 3; ++i) { ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // All files are compacted @@ -5591,7 +5976,7 @@ // nonoverlapping with the file on level 0 Slice start("a"), end("b"); ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // should not compact the level 0 file ASSERT_EQ(1, NumTableFilesAtLevel(0)); @@ -5599,7 +5984,7 @@ start = Slice("j"); end = Slice("m"); 
ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual( options.compaction_filter_factory.get())); @@ -5608,6 +5993,7 @@ ASSERT_EQ(1, NumTableFilesAtLevel(1)); } + TEST_F(DBTest, PromoteL0) { Options options = CurrentOptions(); options.disable_auto_compactions = true; @@ -5624,7 +6010,7 @@ std::map values; for (const auto& range : ranges) { for (int32_t j = range.first; j < range.second; j++) { - values[j] = RandomString(&rnd, value_size); + values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } ASSERT_OK(Flush()); @@ -5685,8 +6071,8 @@ Random rnd(301); for (int i = 0; i < kNumL0Files; ++i) { - ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); - Flush(); + ASSERT_OK(Put(Key(0), rnd.RandomString(1024))); + ASSERT_OK(Flush()); } ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files); ASSERT_EQ(NumTableFilesAtLevel(1), 0); @@ -5724,7 +6110,7 @@ for (int i = 0; i < 2; ++i) { // put two keys to ensure no trivial move for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5738,7 +6124,7 @@ for (int i = 0; i < kNumL0Files; ++i) { // put two keys to ensure no trivial move for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5750,7 +6136,7 @@ } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } #ifndef ROCKSDB_LITE @@ -5767,7 +6153,7 @@ for (int i = 0; i < 2; ++i) { // put two keys to ensure no trivial move for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5790,6 
+6176,7 @@ port::Thread manual_compaction_thread([&]() { auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(), input_files, 0); + ASSERT_OK(s); }); TEST_SYNC_POINT( @@ -5797,7 +6184,7 @@ // generate enough files to trigger compaction for (int i = 0; i < 20; ++i) { for (int j = 0; j < 2; ++j) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } ASSERT_OK(Flush()); } @@ -5808,7 +6195,7 @@ "DBTest::CompactFilesShouldTriggerAutoCompaction:End"); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data); ASSERT_LE(cf_meta_data.levels[0].files.size(), @@ -5833,7 +6220,7 @@ for (;;) { std::string data(3000, j++ % 127 + 20); data += ToString(j); - batch.Put(handles_[0], Slice(data), Slice(data)); + ASSERT_OK(batch.Put(handles_[0], Slice(data), Slice(data))); if (batch.GetDataSize() > write_size) { break; } @@ -5918,7 +6305,6 @@ Options options = CurrentOptions(); env_->SetBackgroundThreads(1, Env::LOW); options.env = env_; - env_->no_slowdown_ = true; options.write_buffer_size = 100000000; options.max_write_buffer_number = 256; options.max_background_compactions = 1; @@ -5927,8 +6313,9 @@ options.level0_stop_writes_trigger = 999999; options.delayed_write_rate = 20000000; // Start with 200MB/s options.memtable_factory.reset( - new SpecialSkipListFactory(kEntriesPerMemTable)); + test::NewSpecialSkipListFactory(kEntriesPerMemTable)); + SetTimeElapseOnlySleepOnReopen(&options); CreateAndReopenWithCF({"pikachu"}, options); // Block compactions @@ -5937,14 +6324,14 @@ Env::Priority::LOW); for (int i = 0; i < 3; i++) { - Put(Key(i), std::string(10000, 'x')); - Flush(); + ASSERT_OK(Put(Key(i), std::string(10000, 'x'))); + ASSERT_OK(Flush()); } // These writes will be slowed down to 1KB/s uint64_t estimated_sleep_time = 0; Random rnd(301); - Put("", ""); + ASSERT_OK(Put("", "")); 
uint64_t cur_rate = options.delayed_write_rate; for (int i = 0; i < kTotalFlushes; i++) { uint64_t size_memtable = 0; @@ -5953,26 +6340,23 @@ // Spread the size range to more. size_t entry_size = rand_num * rand_num * rand_num; WriteOptions wo; - Put(Key(i), std::string(entry_size, 'x'), wo); + ASSERT_OK(Put(Key(i), std::string(entry_size, 'x'), wo)); size_memtable += entry_size + 18; // Occasionally sleep a while if (rnd.Uniform(20) == 6) { env_->SleepForMicroseconds(2666); } } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); estimated_sleep_time += size_memtable * 1000000u / cur_rate; // Slow down twice. One for memtable switch and one for flush finishes. cur_rate = static_cast(static_cast(cur_rate) * kIncSlowdownRatio * kIncSlowdownRatio); } // Estimate the total sleep time fall into the rough range. - ASSERT_GT(env_->addon_time_.load(), - static_cast(estimated_sleep_time / 2)); - ASSERT_LT(env_->addon_time_.load(), - static_cast(estimated_sleep_time * 2)); + ASSERT_GT(env_->NowMicros(), estimated_sleep_time / 2); + ASSERT_LT(env_->NowMicros(), estimated_sleep_time * 2); - env_->no_slowdown_ = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); @@ -5992,7 +6376,7 @@ options.max_bytes_for_level_base = 10000000000u; options.max_background_compactions = 1; options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); env_->SetBackgroundThreads(1, Env::LOW); test::SleepingBackgroundTask sleeping_task_low; @@ -6013,14 +6397,14 @@ int key_idx = 0; for (int num = 0; num < 5; num++) { GenerateNewFile(&rnd, &key_idx, true); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } ASSERT_EQ(0, callback_count.load()); for (int num = 0; num < 5; num++) { GenerateNewFile(&rnd, &key_idx, true); - 
dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } ASSERT_GE(callback_count.load(), 1); @@ -6102,25 +6486,25 @@ // Generating 360KB in Level 3 for (int i = 0; i < 72; i++) { - Put(Key(i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(i), std::string(5000, 'x'))); if (i % 10 == 0) { - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); // Generating 360KB in Level 2 for (int i = 0; i < 72; i++) { - Put(Key(i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(i), std::string(5000, 'x'))); if (i % 10 == 0) { - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(2); - Put(Key(0), ""); + ASSERT_OK(Put(Key(0), "")); test::SleepingBackgroundTask sleeping_task_low; // Block compactions @@ -6130,11 +6514,11 @@ // Create 3 L0 files, making score of L0 to be 3. for (int i = 0; i < 3; i++) { - Put(Key(i), std::string(5000, 'x')); - Put(Key(100 - i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(i), std::string(5000, 'x'))); + ASSERT_OK(Put(Key(100 - i), std::string(5000, 'x'))); // Flush the file. File size is around 30KB. InstallFlushCallback(); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); WaitForFlush(); } ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); @@ -6143,7 +6527,7 @@ sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); sleeping_task_low.Reset(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Now there is one L1 file but doesn't trigger soft_rate_limit // The L1 file size is around 30KB. 
@@ -6165,11 +6549,11 @@ sleeping_task_low.WaitUntilSleeping(); // Create 3 L0 files, making score of L0 to be 3 for (int i = 0; i < 3; i++) { - Put(Key(10 + i), std::string(5000, 'x')); - Put(Key(90 - i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x'))); + ASSERT_OK(Put(Key(90 - i), std::string(5000, 'x'))); // Flush the file. File size is around 30KB. InstallFlushCallback(); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); WaitForFlush(); } @@ -6188,11 +6572,11 @@ // Create 3 L0 files, making score of L0 to be 3, higher than L0. for (int i = 0; i < 3; i++) { - Put(Key(20 + i), std::string(5000, 'x')); - Put(Key(80 - i), std::string(5000, 'x')); + ASSERT_OK(Put(Key(20 + i), std::string(5000, 'x'))); + ASSERT_OK(Put(Key(80 - i), std::string(5000, 'x'))); // Flush the file. File size is around 30KB. InstallFlushCallback(); - dbfull()->TEST_FlushMemTable(true, true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true)); WaitForFlush(); } // Wake up sleep task to enable compaction to run and waits @@ -6220,8 +6604,8 @@ {"max_bytes_for_level_base", "5000"}, })); - Put("", ""); - Flush(); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); @@ -6241,7 +6625,7 @@ options.disable_auto_compactions = true; int kNumKeysPerMemtable = 3; options.memtable_factory.reset( - new SpecialSkipListFactory(kNumKeysPerMemtable)); + test::NewSpecialSkipListFactory(kNumKeysPerMemtable)); Reopen(options); test::SleepingBackgroundTask sleeping_task; @@ -6254,12 +6638,12 @@ for (int i = 0; i < 3; i++) { // Fill one mem table for (int j = 0; j < kNumKeysPerMemtable; j++) { - Put(Key(j), ""); + ASSERT_OK(Put(Key(j), "")); } ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); } // Inserting a new entry would create a new mem table, triggering slow down. 
- Put(Key(0), ""); + ASSERT_OK(Put(Key(0), "")); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); sleeping_task.WakeUp(); @@ -6404,18 +6788,18 @@ std::vector threads; std::atomic done(false); - db_->PauseBackgroundWork(); + ASSERT_OK(db_->PauseBackgroundWork()); threads.emplace_back([&]() { Random rnd(301); for (int i = 0; i < 10000; ++i) { - Put(RandomString(&rnd, 10), RandomString(&rnd, 10)); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } done.store(true); }); env_->SleepForMicroseconds(200000); // make sure the thread is not done ASSERT_FALSE(done.load()); - db_->ContinueBackgroundWork(); + ASSERT_OK(db_->ContinueBackgroundWork()); for (auto& t : threads) { t.join(); } @@ -6450,6 +6834,7 @@ { port::Thread tmp_thread([&] { auto it = db_->NewIterator(ReadOptions()); + ASSERT_OK(it->status()); delete it; }); tmp_thread.join(); @@ -6486,10 +6871,11 @@ Options options = CurrentOptions(); options.max_open_files = -1; - env_->time_elapse_only_sleep_ = false; + env_->SetMockSleep(); options.env = env_; - env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); bool set_file_creation_time_to_zero = true; @@ -6500,7 +6886,7 @@ const uint64_t uint_time_1 = static_cast(time_1); // Add 50 hours - env_->addon_time_.fetch_add(50 * 60 * 60); + env_->MockSleepForSeconds(50 * 60 * 60); int64_t time_2 = 0; env_->GetCurrentTime(&time_2); @@ -6538,9 +6924,9 @@ for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } // At this point there should be 2 files, one with file_creation_time = 0 and @@ -6554,18 +6940,18 @@ set_file_creation_time_to_zero = false; options = CurrentOptions(); options.max_open_files = -1; - env_->time_elapse_only_sleep_ = false; options.env = env_; - 
env_->addon_time_.store(0); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + DestroyAndReopen(options); for (int i = 0; i < kNumLevelFiles; ++i) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } // At this point there should be 2 files with non-zero file creation time. @@ -6585,18 +6971,50 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) { + Options options = CurrentOptions(); + options.max_write_buffer_size_to_maintain = 10000; + options.write_buffer_size = 160000; + Reopen(options); + Random rnd(301); + bool memory_limit_exceeded = false; + + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + + for (int i = 0; i < 1000; i++) { + std::string value = rnd.RandomString(1000); + ASSERT_OK(Put("keykey_" + std::to_string(i), value)); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage(); + const uint64_t size_all_mem_table = + cur_active_mem + cfd->imm()->ApproximateMemoryUsage(); + + // Errors out if memory usage keeps on increasing beyond the limit. + // Once memory limit exceeds, memory_limit_exceeded is set and if + // size_all_mem_table doesn't drop out in the next write then it errors out + // (not expected behaviour). If memory usage drops then + // memory_limit_exceeded is set to false. 
+ if ((size_all_mem_table > cur_active_mem) && + (cur_active_mem >= + static_cast(options.max_write_buffer_size_to_maintain)) && + (size_all_mem_table > + static_cast(options.max_write_buffer_size_to_maintain) + + options.write_buffer_size)) { + ASSERT_FALSE(memory_limit_exceeded); + memory_limit_exceeded = true; + } else { + memory_limit_exceeded = false; + } + } +} + #endif } // namespace ROCKSDB_NAMESPACE -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); -} -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS - int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test2.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test2.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test2.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test2.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,31 +6,212 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ #include #include #include +#include #include "db/db_test_util.h" #include "db/read_callback.h" +#include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/iostats_context.h" #include "rocksdb/persistent_cache.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/trace_record_result.h" +#include "rocksdb/utilities/replayer.h" #include "rocksdb/wal_filter.h" -#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { class DBTest2 : public DBTestBase { public: - DBTest2() : DBTestBase("/db_test2") {} + DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {} + + protected: +#ifndef ROCKSDB_LITE + uint64_t GetSstSizeHelper(Temperature temperature) { + std::string prop; + EXPECT_TRUE( + dbfull()->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature + + ToString(static_cast(temperature)), + &prop)); + return static_cast(std::atoi(prop.c_str())); + } +#endif // ROCKSDB_LITE +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, OpenForReadOnly) { + DB* db_ptr = nullptr; + std::string dbname = test::PerThreadDBPath("db_readonly"); + Options options = CurrentOptions(); + options.create_if_missing = true; + // OpenForReadOnly should fail but will create in the file system + ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr)); + // Since is created, we should be able to delete the dir + // We first get the list files under + // There should not be any subdirectories -- this is not checked here + std::vector files; + ASSERT_OK(env_->GetChildren(dbname, &files)); + for (auto& f : files) { + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); + } + // should be empty now and we should be able to delete it + ASSERT_OK(env_->DeleteDir(dbname)); + options.create_if_missing = false; + // OpenForReadOnly should fail since was successfully deleted + ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr)); + 
// With create_if_missing false, there should not be a dir in the file system + ASSERT_NOK(env_->FileExists(dbname)); +} + +TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { + DB* db_ptr = nullptr; + std::string dbname = test::PerThreadDBPath("db_readonly"); + Options options = CurrentOptions(); + options.create_if_missing = true; + + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + column_families.push_back(ColumnFamilyDescriptor("goku", cf_options)); + std::vector handles; + // OpenForReadOnly should fail but will create in the file system + ASSERT_NOK( + DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr)); + // Since is created, we should be able to delete the dir + // We first get the list files under + // There should not be any subdirectories -- this is not checked here + std::vector files; + ASSERT_OK(env_->GetChildren(dbname, &files)); + for (auto& f : files) { + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); + } + // should be empty now and we should be able to delete it + ASSERT_OK(env_->DeleteDir(dbname)); + options.create_if_missing = false; + // OpenForReadOnly should fail since was successfully deleted + ASSERT_NOK( + DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr)); + // With create_if_missing false, there should not be a dir in the file system + ASSERT_NOK(env_->FileExists(dbname)); +} + +class TestReadOnlyWithCompressedCache + : public DBTestBase, + public testing::WithParamInterface> { + public: + TestReadOnlyWithCompressedCache() + : DBTestBase("test_readonly_with_compressed_cache", + /*env_do_fsync=*/true) { + max_open_files_ = std::get<0>(GetParam()); + use_mmap_ = std::get<1>(GetParam()); + } + int max_open_files_; + bool use_mmap_; +}; + +TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) { + if (use_mmap_ && !IsMemoryMappedAccessSupported()) { + 
ROCKSDB_GTEST_SKIP("Test requires MMAP support"); + return; + } + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar")); + ASSERT_OK(Flush()); + + DB* db_ptr = nullptr; + Options options = CurrentOptions(); + options.allow_mmap_reads = use_mmap_; + options.max_open_files = max_open_files_; + options.compression = kSnappyCompression; + BlockBasedTableOptions table_options; + table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); + table_options.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + + ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr)); + + std::string v; + ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("bar", v); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); + ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v)); + ASSERT_EQ("bar", v); + if (Snappy_Supported()) { + if (use_mmap_) { + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); + } else { + ASSERT_EQ(1, + options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); + } + } + + delete db_ptr; +} + +INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache, + TestReadOnlyWithCompressedCache, + ::testing::Combine(::testing::Values(-1, 100), + ::testing::Bool())); + +class PartitionedIndexTestListener : public EventListener { + public: + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + ASSERT_GT(info.table_properties.index_partitions, 1); + ASSERT_EQ(info.table_properties.index_key_is_user_key, 0); + } }; +TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { + const int kValueSize = 10500; + const int kNumEntriesPerFile = 1000; + const int kNumFiles = 3; + const int kNumDistinctKeys = 30; + + BlockBasedTableOptions table_options; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + table_options.index_type = 
BlockBasedTableOptions::kTwoLevelIndexSearch; + PartitionedIndexTestListener* listener = new PartitionedIndexTestListener(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.listeners.emplace_back(listener); + std::vector snapshots; + Reopen(options); + Random rnd(301); + + for (int i = 0; i < kNumFiles; i++) { + for (int j = 0; j < kNumEntriesPerFile; j++) { + int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys; + std::string value = rnd.RandomString(kValueSize); + ASSERT_OK(Put("keykey_" + std::to_string(key_id), value)); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Flush()); + } + + for (auto s : snapshots) { + db_->ReleaseSnapshot(s); + } +} + +#endif // ROCKSDB_LITE + class PrefixFullBloomWithReverseComparator : public DBTestBase, public ::testing::WithParamInterface { public: PrefixFullBloomWithReverseComparator() - : DBTestBase("/prefix_bloom_reverse") {} + : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {} void SetUp() override { if_cache_filter_ = GetParam(); } bool if_cache_filter_; }; @@ -56,7 +237,7 @@ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2")); ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); if (bbto.block_cache) { bbto.block_cache->EraseUnRefEntries(); @@ -88,18 +269,20 @@ PrefixFullBloomWithReverseComparator, testing::Bool()); TEST_F(DBTest2, IteratorPropertyVersionNumber) { - Put("", ""); + ASSERT_OK(Put("", "")); Iterator* iter1 = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter1->status()); std::string prop_value; ASSERT_OK( iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number1 = static_cast(std::atoi(prop_value.c_str())); - Put("", ""); - Flush(); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); Iterator* iter2 = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter2->status()); ASSERT_OK( 
iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number2 = @@ -107,9 +290,10 @@ ASSERT_GT(version_number2, version_number1); - Put("", ""); + ASSERT_OK(Put("", "")); Iterator* iter3 = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter3->status()); ASSERT_OK( iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number3 = @@ -136,11 +320,11 @@ BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); TryReopenWithColumnFamilies({"default", "pikachu"}, options); @@ -156,10 +340,10 @@ options.merge_operator = MergeOperators::CreatePutOperator(); options.disable_auto_compactions = true; DestroyAndReopen(options); - Put("poi", "Finch"); - db_->Merge(WriteOptions(), "poi", "Reese"); - db_->Merge(WriteOptions(), "poi", "Shaw"); - db_->Merge(WriteOptions(), "poi", "Root"); + ASSERT_OK(Put("poi", "Finch")); + ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese")); + ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw")); + ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root")); options.max_successive_merges = 2; Reopen(options); } @@ -170,7 +354,7 @@ public testing::WithParamInterface> { public: DBTestSharedWriteBufferAcrossCFs() - : DBTestBase("/db_test_shared_write_buffer") {} + : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {} void SetUp() override { use_old_interface_ = std::get<0>(GetParam()); cost_cache_ = std::get<1>(GetParam()); @@ -182,6 +366,10 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { Options options = CurrentOptions(); 
options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. + options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 @@ -217,14 +405,15 @@ wo.disableWAL = true; std::function wait_flush = [&]() { - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); }; // Create some data and flush "default" and "nikitich" so that they // are newer CFs created. + flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); Flush(3); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); @@ -235,6 +424,7 @@ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); @@ -359,6 +549,10 @@ std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2"); Options options = CurrentOptions(); options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. 
+ options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -389,13 +583,14 @@ wo.disableWAL = true; std::function wait_flush = [&]() { - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); - static_cast(db2)->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); }; // Trigger a flush on cf2 + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(2, Key(1), DummyString(70000), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); @@ -407,7 +602,7 @@ ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); - static_cast(db2)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") + GetNumberOfSstFilesForColumnFamily(db_, "cf1") + @@ -438,7 +633,7 @@ wait_flush(); ASSERT_OK(db2->Put(wo, Key(1), DummyString(1))); wait_flush(); - static_cast(db2)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); @@ -562,9 +757,9 @@ for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { - batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } WalFilter::WalProcessingOption wal_processing_option = @@ -583,14 +778,14 @@ 
TryReopenWithColumnFamilies({ "default", "pikachu" }, options); if (wal_processing_option == WalFilter::WalProcessingOption::kCorruptedRecord) { - assert(!status.ok()); + ASSERT_NOK(status); // In case of corruption we can turn off paranoid_checks to reopen // databse options.paranoid_checks = false; ReopenWithColumnFamilies({ "default", "pikachu" }, options); } else { - assert(status.ok()); + ASSERT_OK(status); } // Compute which keys we expect to be found @@ -647,7 +842,7 @@ break; } default: - assert(false); // unhandled case + FAIL(); // unhandled case } bool checked_after_reopen = false; @@ -690,7 +885,7 @@ num_keys_added_(0) {} void Put(const Slice& key, const Slice& value) override { if (num_keys_added_ < num_keys_to_add_in_new_batch_) { - new_write_batch_->Put(key, value); + ASSERT_OK(new_write_batch_->Put(key, value)); ++num_keys_added_; } } @@ -717,8 +912,12 @@ bool* batch_changed) const override { if (current_record_index_ >= change_records_from_index_) { ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_); - batch.Iterate(&handler); - *batch_changed = true; + Status s = batch.Iterate(&handler); + if (s.ok()) { + *batch_changed = true; + } else { + assert(false); + } } // Filter is passed as a const object for RocksDB to not modify the @@ -750,9 +949,9 @@ for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { - batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // Create a test filter that would apply wal_processing_option at the first @@ -811,8 +1010,12 @@ WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) const override { *new_batch = batch; - new_batch->Put("key_extra", "value_extra"); - *batch_changed = true; + Status s = 
new_batch->Put("key_extra", "value_extra"); + if (s.ok()) { + *batch_changed = true; + } else { + assert(false); + } return WalProcessingOption::kContinueProcessing; } @@ -838,9 +1041,9 @@ for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { - batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // Create a test filter that would add extra keys @@ -923,7 +1126,11 @@ } } handler(log_number, cf_log_number_map_, cf_wal_keys_); - batch.Iterate(&handler); + Status s = batch.Iterate(&handler); + if (!s.ok()) { + // TODO(AR) is this ok? + return WalProcessingOption::kCorruptedRecord; + } return WalProcessingOption::kContinueProcessing; } @@ -958,14 +1165,16 @@ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { - batch.Put(handles_[0], batch_keys_pre_flush[i][j], DummyString(1024)); - batch.Put(handles_[1], batch_keys_pre_flush[i][j], DummyString(1024)); + ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j], + DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j], + DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } //Flush default column-family - db_->Flush(FlushOptions(), handles_[0]); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[0])); // Do some more writes std::vector> batch_keys_post_flush(3); @@ -981,10 +1190,12 @@ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { - batch.Put(handles_[0], batch_keys_post_flush[i][j], DummyString(1024)); - batch.Put(handles_[1], batch_keys_post_flush[i][j], DummyString(1024)); + 
ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j], + DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j], + DummyString(1024))); } - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // On Recovery we should only find the second batch applicable to default CF @@ -1011,10 +1222,10 @@ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_post_flush[i][j]); - ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } - ASSERT_TRUE(index == keys_cf.size()); + ASSERT_EQ(index, keys_cf.size()); index = 0; keys_cf = cf_wal_keys[name_id_map["pikachu"]]; @@ -1023,7 +1234,7 @@ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_pre_flush[i][j]); - ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } @@ -1031,10 +1242,10 @@ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_post_flush[i][j]); - ASSERT_TRUE(key_from_the_log.compare(batch_key) == 0); + ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } - ASSERT_TRUE(index == keys_cf.size()); + ASSERT_EQ(index, keys_cf.size()); } TEST_F(DBTest2, PresetCompressionDict) { @@ -1054,7 +1265,7 @@ options.disable_auto_compactions = true; options.level0_file_num_compaction_trigger = kNumL0Files; options.memtable_factory.reset( - new SpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes)); + test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes)); options.num_levels = 2; options.target_file_size_base = kL0FileBytes; options.target_file_size_multiplier = 2; @@ -1121,7 +1332,7 @@ std::string seq_datas[10]; for (int j = 0; j < 10; ++j) { seq_datas[j] = - RandomString(&rnd, kBlockSizeBytes - 
kApproxPerBlockOverheadBytes); + rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes); } ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); @@ -1131,11 +1342,11 @@ ASSERT_OK(Put(1, Key(static_cast(key_num)), seq_datas[(key_num / 10) % 10])); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1)); } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); @@ -1189,14 +1400,14 @@ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); Random rnd(301); for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kNumEntriesPerFile; ++j) { ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j), - RandomString(&rnd, kNumBytesPerEntry))); + rnd.RandomString(kNumBytesPerEntry))); } ASSERT_OK(Flush()); MoveFilesToLevel(1); @@ -1234,6 +1445,236 @@ } } +class PresetCompressionDictTest + : public DBTestBase, + public testing::WithParamInterface> { + public: + PresetCompressionDictTest() + : DBTestBase("db_test2", false /* env_do_fsync */), + compression_type_(std::get<0>(GetParam())), + bottommost_(std::get<1>(GetParam())) {} + + protected: + const CompressionType compression_type_; + const bool bottommost_; +}; + +INSTANTIATE_TEST_CASE_P( + DBTest2, PresetCompressionDictTest, + ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()), + ::testing::Bool())); + +TEST_P(PresetCompressionDictTest, Flush) { + // Verifies that dictionary is generated and written during 
flush only when + // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the + // size of the dictionary is within expectations according to the limit on + // buffering set by `CompressionOptions::max_dict_buffer_bytes`. + const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile)); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t i = 0; i <= kKeysPerFile; ++i) { + ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the flush finishes. + if (bottommost_) { + // Flush is never considered bottommost. This should change in the future + // since flushed files may have nothing underneath them, like the one in + // this test case. 
+ ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactNonBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when `ColumnFamilyOptions::compression` enables + // dictionary. Also verifies the size of the dictionary is within expectations + // according to the limit on buffering set by + // `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + // This L0->L1 compaction merges the two L0 files into L1. The produced L1 + // file is not bottommost due to the existing L2 file covering the same key- + // range. + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the compaction finishes. 
+ if (bottommost_) { + ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when either `ColumnFamilyOptions::compression` or + // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also + // verifies the size of the dictionary is within expectations according to the + // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in ZSTD's + // digested dictionary format. 
+ if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit after + // each block is built. + ASSERT_LE( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } +} + class CompactionCompressionListener : public EventListener { public: explicit CompactionCompressionListener(Options* db_options) @@ -1244,9 +1685,9 @@ int bottommost_level = 0; for (int level = 0; level < db->NumberLevels(); level++) { std::string files_at_level; - ASSERT_TRUE( - db->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), - &files_at_level)); + ASSERT_TRUE(db->GetProperty( + "rocksdb.num-files-at-level" + ROCKSDB_NAMESPACE::ToString(level), + &files_at_level)); if (files_at_level != "0") { bottommost_level = level; } @@ -1268,6 +1709,151 @@ const Options* db_options_; }; +enum CompressionFailureType { + kTestCompressionFail, + kTestDecompressionFail, + kTestDecompressionCorruption +}; + +class CompressionFailuresTest + : public DBTest2, + public testing::WithParamInterface> { + public: + CompressionFailuresTest() { + std::tie(compression_failure_type_, compression_type_, + compression_max_dict_bytes_, compression_parallel_threads_) = + GetParam(); + } + + CompressionFailureType compression_failure_type_ = kTestCompressionFail; + CompressionType compression_type_ = kNoCompression; + uint32_t compression_max_dict_bytes_ = 0; + uint32_t compression_parallel_threads_ = 0; +}; + +INSTANTIATE_TEST_CASE_P( + DBTest2, CompressionFailuresTest, + ::testing::Combine(::testing::Values(kTestCompressionFail, + kTestDecompressionFail, + kTestDecompressionCorruption), + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(0, 10), ::testing::Values(1, 4))); + +TEST_P(CompressionFailuresTest, CompressionFailures) { + if 
(compression_type_ == kNoCompression) { + return; + } + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.max_bytes_for_level_base = 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 7; + options.max_background_compactions = 1; + options.target_file_size_base = 512; + + BlockBasedTableOptions table_options; + table_options.block_size = 512; + table_options.verify_compression = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + options.compression = compression_type_; + options.compression_opts.parallel_threads = compression_parallel_threads_; + options.compression_opts.max_dict_bytes = compression_max_dict_bytes_; + options.bottommost_compression_opts.parallel_threads = + compression_parallel_threads_; + options.bottommost_compression_opts.max_dict_bytes = + compression_max_dict_bytes_; + + if (compression_failure_type_ == kTestCompressionFail) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompressData:TamperWithReturnValue", [](void* arg) { + bool* ret = static_cast(arg); + *ret = false; + }); + } else if (compression_failure_type_ == kTestDecompressionFail) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UncompressBlockContentsForCompressionType:TamperWithReturnValue", + [](void* arg) { + Status* ret = static_cast(arg); + ASSERT_OK(*ret); + *ret = Status::Corruption("kTestDecompressionFail"); + }); + } else if (compression_failure_type_ == kTestDecompressionCorruption) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UncompressBlockContentsForCompressionType:" + "TamperWithDecompressionOutput", + [](void* arg) { + BlockContents* contents = static_cast(arg); + // Ensure uncompressed data != original data + const size_t len = contents->data.size() + 1; + std::unique_ptr fake_data(new char[len]()); + *contents = BlockContents(std::move(fake_data), len); + }); + } + + std::map key_value_written; + + const int 
kKeySize = 5; + const int kValUnitSize = 16; + const int kValSize = 256; + Random rnd(405); + + Status s = Status::OK(); + + DestroyAndReopen(options); + // Write 10 random files + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 5; j++) { + std::string key = rnd.RandomString(kKeySize); + // Ensure good compression ratio + std::string valueUnit = rnd.RandomString(kValUnitSize); + std::string value; + for (int k = 0; k < kValSize; k += kValUnitSize) { + value += valueUnit; + } + s = Put(key, value); + if (compression_failure_type_ == kTestCompressionFail) { + key_value_written[key] = value; + ASSERT_OK(s); + } + } + s = Flush(); + if (compression_failure_type_ == kTestCompressionFail) { + ASSERT_OK(s); + } + s = dbfull()->TEST_WaitForCompact(); + if (compression_failure_type_ == kTestCompressionFail) { + ASSERT_OK(s); + } + if (i == 4) { + // Make compression fail at the mid of table building + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + if (compression_failure_type_ == kTestCompressionFail) { + // Should be kNoCompression, check content consistency + std::unique_ptr db_iter(db_->NewIterator(ReadOptions())); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::string key = db_iter->key().ToString(); + std::string value = db_iter->value().ToString(); + ASSERT_NE(key_value_written.find(key), key_value_written.end()); + ASSERT_EQ(key_value_written[key], value); + key_value_written.erase(key); + } + ASSERT_EQ(0, key_value_written.size()); + } else if (compression_failure_type_ == kTestDecompressionFail) { + ASSERT_EQ(std::string(s.getState()), + "Could not decompress: kTestDecompressionFail"); + } else if (compression_failure_type_ == kTestDecompressionCorruption) { + ASSERT_EQ(std::string(s.getState()), + "Decompressed block did not match raw block"); + } +} + TEST_F(DBTest2, CompressionOptions) { if (!Zlib_Supported() || !Snappy_Supported()) { 
return; @@ -1288,6 +1874,10 @@ const int kValSize = 20; Random rnd(301); + std::vector compression_parallel_threads = {1, 4}; + + std::map key_value_written; + for (int iter = 0; iter <= 2; iter++) { listener->max_level_checked = 0; @@ -1312,19 +1902,38 @@ options.bottommost_compression = kDisableCompressionOption; } - DestroyAndReopen(options); - // Write 10 random files - for (int i = 0; i < 10; i++) { - for (int j = 0; j < 5; j++) { - ASSERT_OK( - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValSize))); + for (auto num_threads : compression_parallel_threads) { + options.compression_opts.parallel_threads = num_threads; + options.bottommost_compression_opts.parallel_threads = num_threads; + + DestroyAndReopen(options); + // Write 10 random files + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 5; j++) { + std::string key = rnd.RandomString(kKeySize); + std::string value = rnd.RandomString(kValSize); + key_value_written[key] = value; + ASSERT_OK(Put(key, value)); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); - } - // Make sure that we wrote enough to check all 7 levels - ASSERT_EQ(listener->max_level_checked, 6); + // Make sure that we wrote enough to check all 7 levels + ASSERT_EQ(listener->max_level_checked, 6); + + // Make sure database content is the same as key_value_written + std::unique_ptr db_iter(db_->NewIterator(ReadOptions())); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::string key = db_iter->key().ToString(); + std::string value = db_iter->value().ToString(); + ASSERT_NE(key_value_written.find(key), key_value_written.end()); + ASSERT_EQ(key_value_written[key], value); + key_value_written.erase(key); + } + ASSERT_OK(db_iter->status()); + ASSERT_EQ(0, key_value_written.size()); + } } } @@ -1375,7 +1984,7 @@ // 4 Files in L0 for (int i = 0; i < 4; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(RandomString(&rnd, 10), 
RandomString(&rnd, 10))); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } ASSERT_OK(Flush()); } @@ -1390,7 +1999,7 @@ // Another 6 L0 files to trigger compaction again for (int i = 0; i < 6; i++) { for (int j = 0; j < 10; j++) { - ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } ASSERT_OK(Flush()); } @@ -1404,7 +2013,7 @@ // Hold NotifyOnCompactionCompleted in the unlock mutex section TEST_SYNC_POINT("DBTest2::CompactionStall:3"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_LT(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); ASSERT_GT(listener->compacted_files_cnt_.load(), @@ -1425,8 +2034,8 @@ // This snapshot will have sequence number 0 what is expected behaviour. const Snapshot* s1 = db_->GetSnapshot(); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - Put(1, "k2", std::string(100000, 'y')); // Trigger flush + ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable + ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush db_->ReleaseSnapshot(s1); } @@ -1436,20 +2045,20 @@ Options options; options = CurrentOptions(options); std::vector snapshots; - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); SequenceNumber oldest_ww_snap, first_ww_snap; - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(db_->GetSnapshot()); snapshots.push_back(db_->GetSnapshot()); - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(db_->GetSnapshot()); snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); first_ww_snap = snapshots.back()->GetSequenceNumber(); - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); snapshots.push_back(db_->GetSnapshot()); - Put("k", "v"); // inc seq + ASSERT_OK(Put("k", "v")); // 
inc seq snapshots.push_back(db_->GetSnapshot()); { @@ -1469,7 +2078,8 @@ : public DBTestBase, public testing::WithParamInterface> { public: - PinL0IndexAndFilterBlocksTest() : DBTestBase("/db_pin_l0_index_bloom_test") {} + PinL0IndexAndFilterBlocksTest() + : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {} void SetUp() override { infinite_max_files_ = std::get<0>(GetParam()); disallow_preload_ = std::get<1>(GetParam()); @@ -1485,22 +2095,22 @@ table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options->table_factory.reset(new BlockBasedTableFactory(table_options)); + options->table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, *options); - Put(1, "a", "begin"); - Put(1, "z", "end"); + ASSERT_OK(Put(1, "a", "begin")); + ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); // move this table to L1 - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); // reset block cache table_options.block_cache = NewLRUCache(64 * 1024); options->table_factory.reset(NewBlockBasedTableFactory(table_options)); TryReopenWithColumnFamilies({"default", "pikachu"}, *options); // create new table at L0 - Put(1, "a2", "begin2"); - Put(1, "z2", "end2"); + ASSERT_OK(Put(1, "a2", "begin2")); + ASSERT_OK(Put(1, "z2", "end2")); ASSERT_OK(Flush(1)); if (close_afterwards) { @@ -1525,7 +2135,7 @@ table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); @@ -1544,7 +2154,7 @@ std::string 
value; // Miss and hit count should remain the same, they're all pinned. - db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); @@ -1672,18 +2282,18 @@ // cache read for both of index and filter. If prefetch doesn't explicitly // happen, it will happen when verifying the file. Compact(1, "a", "zzzzz"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (!disallow_preload_) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } // Bloom and index hit will happen when a Get() happens. 
@@ -1692,12 +2302,12 @@ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } } @@ -1710,8 +2320,8 @@ #ifndef ROCKSDB_LITE TEST_F(DBTest2, MaxCompactionBytesTest) { Options options = CurrentOptions(); - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 200 << 10; options.arena_block_size = 4 << 10; @@ -1743,10 +2353,10 @@ GenerateNewRandomFile(&rnd); // Add three more small files that overlap with the previous file for (int i = 0; i < 3; i++) { - Put("a", "z"); + ASSERT_OK(Put("a", "z")); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Output files to L1 are cut to three pieces, according to // options.max_compaction_bytes @@ -1779,6 +2389,10 @@ return PersistentCache::StatsType(); } + uint64_t NewId() override { + return last_id_.fetch_add(1, std::memory_order_relaxed); + } + Status Insert(const Slice& page_key, const char* data, const size_t size) override { MutexLock _(&lock_); @@ -1819,6 +2433,7 @@ const bool is_compressed_ = true; size_t size_ = 0; const size_t max_size_ = 10 * 1024; // 
10KiB + std::atomic last_id_{1}; }; #ifdef OS_LINUX @@ -1831,6 +2446,9 @@ ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); env_->now_cpu_count_.store(0); + env_->SetMockSleep(); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env // CPU timing is not enabled with kEnableTimeExceptForMutex SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); @@ -1838,19 +2456,20 @@ ASSERT_EQ(0, get_perf_context()->get_cpu_nanos); ASSERT_EQ(0, env_->now_cpu_count_.load()); - uint64_t kDummyAddonTime = uint64_t{1000000000000}; + constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000}; + constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds; // Add time to NowNanos() reading. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TableCache::FindTable:0", - [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); ASSERT_EQ("bar", Get("foo")); ASSERT_GT(env_->now_cpu_count_.load(), 2); - ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime); - ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); SetPerfLevel(PerfLevel::kDisable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -1873,6 +2492,9 @@ std::string last_key = "k" + ToString(kNumEntries - 1); std::string last_value = "v" + ToString(kNumEntries - 1); env_->now_cpu_count_.store(0); + env_->SetMockSleep(); + + // NOTE: Presumed unnecessary and removed: resetting mock time in env // CPU timing is not enabled with kEnableTimeExceptForMutex SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); @@ -1895,17 +2517,19 @@ ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos); iter->Prev(); 
ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos); ASSERT_EQ(0, env_->now_cpu_count_.load()); delete iter; - uint64_t kDummyAddonTime = uint64_t{1000000000000}; + constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000}; + constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds; // Add time to NowNanos() reading. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TableCache::FindTable:0", - [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); @@ -1922,19 +2546,20 @@ ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0); - ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos); iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v1", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0); - ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos); iter->Prev(); ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0); - ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonTime); + ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos); ASSERT_GE(env_->now_cpu_count_.load(), 12); - ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); SetPerfLevel(PerfLevel::kDisable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -1942,10 +2567,7 @@ } #endif // 
OS_LINUX -// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache -// breaks when that function is not implemented and no regular block cache is -// provided. -#if !defined(OS_SOLARIS) && !defined(OS_WIN) +#if !defined OS_SOLARIS TEST_F(DBTest2, PersistentCache) { int num_iter = 80; @@ -1988,7 +2610,7 @@ std::string str; for (int i = 0; i < num_iter; i++) { if (i % 4 == 0) { // high compression ratio - str = RandomString(&rnd, 1000); + str = rnd.RandomString(1000); } values.push_back(str); ASSERT_OK(Put(1, Key(i), values[i])); @@ -2009,7 +2631,7 @@ } } } -#endif // !defined(OS_SOLARIS) && !defined(OS_WIN) +#endif // !defined OS_SOLARIS namespace { void CountSyncPoint() { @@ -2086,7 +2708,7 @@ Random rnd(301); for (size_t i = 0; i < kNumEntries; i++) { - ASSERT_OK(Put(Key(static_cast(i)), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(100))); } ASSERT_OK(Flush()); @@ -2130,6 +2752,7 @@ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString())); } + ASSERT_OK(iter->status()); delete iter; // Read amp is on average 100% since we read all what we loaded in memory @@ -2152,26 +2775,30 @@ { const int kIdBufLen = 100; char id_buf[kIdBufLen]; + Status s = Status::NotSupported(); #ifndef OS_WIN // You can't open a directory on windows using random access file std::unique_ptr file; - ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions())); - if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; - } -#else - std::unique_ptr dir; - ASSERT_OK(env_->NewDirectory(dbname_, &dir)); - if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this 
test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; + s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions()); + if (s.ok()) { + if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } } #endif + if (!s.ok()) { + std::unique_ptr dir; + ASSERT_OK(env_->NewDirectory(dbname_, &dir)); + if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } + } } uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { @@ -2193,14 +2820,13 @@ Random rnd(301); for (int i = 0; i < kNumEntries; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); Close(); Reopen(options); - uint64_t total_useful_bytes = 0; std::set read_keys; std::string value; // Iter1: Read half the DB, Read even keys @@ -2211,8 +2837,6 @@ if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); - total_useful_bytes += - GetEncodedEntrySize(internal_key.size(), value.size()); read_keys.insert(i); } } @@ -2239,8 +2863,6 @@ if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); - total_useful_bytes += - GetEncodedEntrySize(internal_key.size(), value.size()); read_keys.insert(i); } } @@ -2416,22 +3038,22 @@ Random rnd(301); // Generate a file containing 10 keys. 
for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); // Generate another file containing same keys for (int i = 0; i < 10; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); int manual_compactions_paused = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) { - auto paused = reinterpret_cast*>(arg); - ASSERT_FALSE(paused->load(std::memory_order_acquire)); - paused->store(true, std::memory_order_release); + auto paused = static_cast*>(arg); + ASSERT_EQ(0, paused->load(std::memory_order_acquire)); + paused->fetch_add(1, std::memory_order_release); manual_compactions_paused += 1; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -2445,10 +3067,12 @@ } // OK, now trigger a manual compaction - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsManualCompactionPaused()); // Wait for compactions to get scheduled and stopped - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // Get file names after compaction is stopped files_meta.clear(); @@ -2463,10 +3087,12 @@ manual_compactions_paused = 0; // Now make sure CompactFiles also not run - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), - files_before_compact, 0); + ASSERT_TRUE(dbfull() + ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), + files_before_compact, 0) + .IsManualCompactionPaused()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); files_meta.clear(); files_after_compact.clear(); @@ -2495,7 +3121,7 @@ for (int i = 0; i < 2; i++) { // Generate a file containing 10 keys. 
for (int j = 0; j < 100; j++) { - ASSERT_OK(Put(Key(j), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(j), rnd.RandomString(50))); } ASSERT_OK(Flush()); } @@ -2517,9 +3143,9 @@ for (int i = 0; i < options.num_levels; i++) { for (int j = 0; j < options.num_levels - i + 1; j++) { for (int k = 0; k < 1000; k++) { - ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); } - Flush(); + ASSERT_OK(Flush()); } for (int l = 1; l < options.num_levels - i; l++) { @@ -2540,8 +3166,10 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); dbfull()->DisableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // As manual compaction disabled, not even reach sync point ASSERT_EQ(run_manual_compactions, 0); #ifndef ROCKSDB_LITE @@ -2551,8 +3179,8 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:1"); dbfull()->EnableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE @@ -2571,9 +3199,9 @@ for (int i = 0; i < options.num_levels; i++) { for (int j = 0; j < options.num_levels - i + 1; j++) { for (int k = 0; k < 1000; k++) { - ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); } - Flush(); + ASSERT_OK(Flush()); } for (int l = 1; l < options.num_levels - i; l++) { @@ -2590,16 +3218,17 @@ int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( 
"CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) { - auto paused = reinterpret_cast*>(arg); - ASSERT_FALSE(paused->load(std::memory_order_acquire)); - paused->store(true, std::memory_order_release); + auto paused = static_cast*>(arg); + ASSERT_EQ(0, paused->load(std::memory_order_acquire)); + paused->fetch_add(1, std::memory_order_release); run_manual_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->EnableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(run_manual_compactions, 1); #ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); @@ -2608,8 +3237,8 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:2"); dbfull()->EnableManualCompaction(); - dbfull()->CompactRange(compact_options, nullptr, nullptr); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE @@ -2617,6 +3246,360 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, CancelManualCompaction1) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + 
ASSERT_OK(Flush()); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + int run_manual_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", + [&](void* /*arg*/) { run_manual_compactions++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Setup a callback to disable compactions after a couple of levels are + // compacted + int compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", + [&](void* /*arg*/) { ++compactions_run; }); + + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + // Since compactions are disabled, we shouldn't start compacting. + // E.g. we should call the compaction function exactly one time. 
+ ASSERT_EQ(compactions_run, 0); + ASSERT_EQ(run_manual_compactions, 0); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + if (compactions_run == 3) { + compact_options.canceled->store(true, std::memory_order_release); + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_EQ(compactions_run, 3); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. 
+ compact_options.canceled->store(false, std::memory_order_relaxed); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, CancelManualCompaction2) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + compact_options.max_subcompactions = 1; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + int compactions_run = 0; + std::atomic kv_compactions{0}; + int compactions_stopped_at = 0; + int kv_compactions_stopped_at = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator:ProcessKV", [&](void* /*arg*/) { + int kv_compactions_run = + kv_compactions.fetch_add(1, std::memory_order_release); + if (kv_compactions_run == 5) { + compact_options.canceled->store(true, std::memory_order_release); + kv_compactions_stopped_at = kv_compactions_run; + compactions_stopped_at = 
compactions_run; + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + // NOTE: as we set compact_options.max_subcompacitons = 1, and store true to + // the canceled variable from the single compacting thread (via callback), + // this value is deterministically kv_compactions_stopped_at + 1. + ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1); + ASSERT_EQ(compactions_run, compactions_stopped_at); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionIterator::ProcessKV"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. + compact_options.canceled->store(false, std::memory_order_relaxed); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +class CancelCompactionListener : public EventListener { + public: + CancelCompactionListener() + : num_compaction_started_(0), num_compaction_ended_(0) {} + + void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.cf_name, "default"); + ASSERT_EQ(ci.base_input_level, 0); + num_compaction_started_++; + } + + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.cf_name, "default"); + ASSERT_EQ(ci.base_input_level, 0); + ASSERT_EQ(ci.status.code(), code_); + ASSERT_EQ(ci.status.subcode(), subcode_); + num_compaction_ended_++; + } + + std::atomic num_compaction_started_; + std::atomic 
num_compaction_ended_; + Status::Code code_; + Status::SubCode subcode_; +}; + +TEST_F(DBTest2, CancelManualCompactionWithListener) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + compact_options.max_subcompactions = 1; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + CancelCompactionListener* listener = new CancelCompactionListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator:ProcessKV", [&](void* /*arg*/) { + compact_options.canceled->store(true, std::memory_order_release); + }); + + int running_compaction = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::FinishCompactionOutputFile1", + [&](void* /*arg*/) { running_compaction++; }); + + // Case I: 1 Notify begin compaction, 2 DisableManualCompaction, 3 Compaction + // not run, 4 Notify compaction end. 
+ listener->code_ = Status::kIncomplete; + listener->subcode_ = Status::SubCode::kManualCompactionPaused; + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_GT(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + ASSERT_EQ(running_compaction, 0); + + listener->num_compaction_started_ = 0; + listener->num_compaction_ended_ = 0; + + // Case II: 1 DisableManualCompaction, 2 Notify begin compaction (return + // without notifying), 3 Notify compaction end (return without notifying). + ASSERT_TRUE(dbfull() + ->CompactRange(compact_options, nullptr, nullptr) + .IsManualCompactionPaused()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_EQ(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + ASSERT_EQ(running_compaction, 0); + + // Case III: 1 Notify begin compaction, 2 Compaction in between + // 3. DisableManualCompaction, , 4 Notify compaction end. 
+ // compact_options.canceled->store(false, std::memory_order_release); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionIterator:ProcessKV"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) { + compact_options.canceled->store(true, std::memory_order_release); + }); + + listener->code_ = Status::kOk; + listener->subcode_ = Status::SubCode::kNone; + + compact_options.canceled->store(false, std::memory_order_release); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + ASSERT_GT(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + + // Compaction job will succeed. + ASSERT_GT(running_compaction, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) { + int num_levels = 3; + const int kNumFilesTrigger = 4; + + Options options = CurrentOptions(); + env_->SetBackgroundThreads(0, Env::Priority::HIGH); + env_->SetBackgroundThreads(0, Env::Priority::LOW); + env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); + options.env = env_; + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + + CancelCompactionListener* listener = new CancelCompactionListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + + int num_bottom_thread_compaction_scheduled = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; }); + + int num_compaction_jobs = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():End", + [&](void* /*arg*/) { num_compaction_jobs++; }); + + listener->code_ = Status::kOk; + listener->subcode_ = Status::SubCode::kNone; + + Random rnd(301); + for (int i = 0; i < 1; ++i) { + for (int num = 0; num < kNumFilesTrigger; num++) { + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx, true /* no_wait */); + // use no_wait above because that one waits for flush and compaction. We + // don't want to wait for compaction because the full compaction is + // intentionally blocked while more files are flushed. + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_GT(num_bottom_thread_compaction_scheduled, 0); + ASSERT_EQ(num_compaction_jobs, 1); + ASSERT_GT(listener->num_compaction_started_, 0); + ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, OptimizeForPointLookup) { Options options = CurrentOptions(); Close(); @@ -2625,7 +3608,7 @@ ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("v1", Get("foo")); } @@ -2635,11 +3618,11 @@ options.OptimizeForSmallDb(); // Find the cache object - ASSERT_EQ(std::string(BlockBasedTableFactory::kName), - std::string(options.table_factory->Name())); - BlockBasedTableOptions* table_options = - reinterpret_cast( - options.table_factory->GetOptions()); + ASSERT_TRUE(options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())); + auto table_options = + options.table_factory->GetOptions(); + ASSERT_TRUE(table_options != 
nullptr); std::shared_ptr cache = table_options->block_cache; @@ -2651,7 +3634,7 @@ ASSERT_NE(0, cache->GetUsage()); ASSERT_EQ("v1", Get("foo")); - Flush(); + ASSERT_OK(Flush()); size_t prev_size = cache->GetUsage(); // Remember block cache size, so that we can find that @@ -2666,6 +3649,101 @@ #endif // ROCKSDB_LITE +TEST_F(DBTest2, IterRaceFlush1) { + ASSERT_OK(Put("foo", "v1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"}, + {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread t1([&] { + TEST_SYNC_POINT("DBTest2::IterRaceFlush:1"); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT("DBTest2::IterRaceFlush:2"); + }); + + // iterator is created after the first Put(), and its snapshot sequence is + // assigned after second Put(), so it must see v2. + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ("foo", it->key().ToString()); + ASSERT_EQ("v2", it->value().ToString()); + } + + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, IterRaceFlush2) { + ASSERT_OK(Put("foo", "v1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"}, + {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread t1([&] { + TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1"); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2"); + }); + + // iterator is created after the first Put(), and its snapshot sequence is + // assigned before second Put(), thus it must see v1. 
+ { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ("foo", it->key().ToString()); + ASSERT_EQ("v1", it->value().ToString()); + } + + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, IterRefreshRaceFlush) { + ASSERT_OK(Put("foo", "v1")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"}, + {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ROCKSDB_NAMESPACE::port::Thread t1([&] { + TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1"); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Flush()); + TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2"); + }); + + // iterator is refreshed after the first Put(), and its sequence number is + // assigned after second Put(), thus it must see v2. + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + ASSERT_OK(it->status()); + ASSERT_OK(it->Refresh()); + it->Seek("foo"); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ("foo", it->key().ToString()); + ASSERT_EQ("v2", it->value().ToString()); + } + + t1.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, GetRaceFlush1) { ASSERT_OK(Put("foo", "v1")); @@ -2678,7 +3756,7 @@ ROCKSDB_NAMESPACE::port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::GetRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); - Flush(); + ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::GetRaceFlush:2"); }); @@ -2701,7 +3779,7 @@ port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::GetRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); - Flush(); + ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::GetRaceFlush:2"); }); @@ -2774,6 +3852,7 @@ ASSERT_EQ("second", value); // nothing should be returned using memtable-only iterator after flushing. 
it = db_->NewIterator(ropt, handles_[1]); + ASSERT_OK(it->status()); count = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_TRUE(it->Valid()); @@ -2781,11 +3860,13 @@ } ASSERT_TRUE(!it->Valid()); ASSERT_EQ(0, count); + ASSERT_OK(it->status()); delete it; // Add a key to memtable ASSERT_OK(Put(1, "foobar", "third")); it = db_->NewIterator(ropt, handles_[1]); + ASSERT_OK(it->status()); count = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_TRUE(it->Valid()); @@ -2795,6 +3876,7 @@ } ASSERT_TRUE(!it->Valid()); ASSERT_EQ(1, count); + ASSERT_OK(it->status()); delete it; } @@ -2823,28 +3905,28 @@ WriteOptions wo; for (int i = 0; i < 6; i++) { wo.low_pri = false; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); wo.low_pri = true; - Put("", "", wo); - Flush(); + ASSERT_OK(Put("", "", wo)); + ASSERT_OK(Flush()); } ASSERT_EQ(0, rate_limit_count.load()); wo.low_pri = true; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); wo.low_pri = false; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); TEST_SYNC_POINT("DBTest.LowPriWrite:0"); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); wo.low_pri = true; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); wo.low_pri = false; - Put("", "", wo); + ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); } @@ -2862,7 +3944,8 @@ Options options = CurrentOptions(); options.compression = kNoCompression; options.level0_file_num_compaction_trigger = kNumL0Files; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); options.new_table_reader_for_compaction_inputs = true; // takes roughly one second, split into 100 x 10ms intervals. 
Each interval // permits 5.12KB, which is smaller than the block size, so this test @@ -2877,17 +3960,19 @@ BlockBasedTableOptions bbto; bbto.block_size = 16384; bbto.no_block_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); for (int i = 0; i < kNumL0Files; ++i) { for (int j = 0; j <= kNumKeysPerFile; ++j) { ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey))); } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + if (i + 1 < kNumL0Files) { + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(0, options.rate_limiter->GetTotalBytesThrough(Env::IO_HIGH)); @@ -2906,6 +3991,7 @@ direct_io_extra)); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey)); } @@ -2922,11 +4008,12 @@ // is on levels higher than the new num_levels. 
TEST_F(DBTest2, ReduceLevel) { Options options; + options.env = env_; options.disable_auto_compactions = true; options.num_levels = 7; Reopen(options); - Put("foo", "bar"); - Flush(); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); MoveFilesToLevel(6); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); @@ -2934,7 +4021,7 @@ CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 1; - dbfull()->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel()); #endif // !ROCKSDB_LITE @@ -2950,6 +4037,7 @@ Options options; options.disable_auto_compactions = true; options.num_levels = 7; + options.env = env_; Reopen(options); std::vector snapshots; // Try to create a db with multiple layers and a memtable @@ -2962,35 +4050,35 @@ // the DB instead of assuming what seq the DB used. int i = 1; for (; i < 10; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); // Take a snapshot to avoid the value being removed during compaction auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } - Flush(); + ASSERT_OK(Flush()); for (; i < 20; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); // Take a snapshot to avoid the value being removed during compaction auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(6); #ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE for (; i < 30; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } - Flush(); + ASSERT_OK(Flush()); #ifndef ROCKSDB_LITE ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel()); #endif // !ROCKSDB_LITE // And also add some 
values to the memtable for (; i < 40; i++) { - Put(key, value + std::to_string(i)); + ASSERT_OK(Put(key, value + std::to_string(i))); auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } @@ -3063,40 +4151,46 @@ [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("key", "val"); + ASSERT_OK(Put("key", "val")); FlushOptions flush_opts; flush_opts.wait = false; db_->Flush(flush_opts); TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"); - db_->DisableFileDeletions(); + ASSERT_OK(db_->DisableFileDeletions()); VectorLogPtr log_files; - db_->GetSortedWalFiles(log_files); + ASSERT_OK(db_->GetSortedWalFiles(log_files)); TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"); for (const auto& log_file : log_files) { ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber()))); } - db_->EnableFileDeletions(); + ASSERT_OK(db_->EnableFileDeletions()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, TestNumPread) { Options options = CurrentOptions(); + bool prefetch_supported = + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); // disable block cache BlockBasedTableOptions table_options; table_options.no_block_cache = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); env_->count_random_reads_ = true; - env_->random_file_open_counter_.store(0); ASSERT_OK(Put("bar", "foo")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. 
+ ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3112,19 +4206,30 @@ ASSERT_OK(Put("bar2", "foo2")); ASSERT_OK(Put("foo2", "bar2")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); - // Compaction needs two input blocks, which requires 2 preads, and - // generate a new SST file which needs 4 preads (footer, meta block, - // property block and index block). In total 6. env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ(6, env_->random_read_counter_.Read()); - // All compactin input files should have already been opened. + if (prefetch_supported) { + // Compaction needs two input blocks, which requires 2 preads, and + // generate a new SST file which needs 4 preads (footer, meta block, + // property block and index block). In total 6. + ASSERT_EQ(6, env_->random_read_counter_.Read()); + } else { + // With prefetch off, compaction needs two input blocks, + // followed by a single buffered read. In total 3. + ASSERT_EQ(3, env_->random_read_counter_.Read()); + } + // All compaction input files should have already been opened. 
ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3136,6 +4241,118 @@ ASSERT_EQ(0, env_->random_file_open_counter_.load()); } +class TraceExecutionResultHandler : public TraceRecordResult::Handler { + public: + TraceExecutionResultHandler() {} + ~TraceExecutionResultHandler() override {} + + virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid timestamps."); + } + result.GetStatus().PermitUncheckedError(); + switch (result.GetTraceType()) { + case kTraceWrite: { + total_latency_ += result.GetLatency(); + cnt_++; + writes_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + virtual Status Handle( + const SingleValueTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid timestamps."); + } + result.GetStatus().PermitUncheckedError(); + switch (result.GetTraceType()) { + case kTraceGet: { + total_latency_ += result.GetLatency(); + cnt_++; + gets_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + virtual Status Handle( + const MultiValuesTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid timestamps."); + } + for (const Status& s : result.GetMultiStatus()) { + s.PermitUncheckedError(); + } + switch (result.GetTraceType()) { + case kTraceMultiGet: { + total_latency_ += result.GetLatency(); + cnt_++; + multigets_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + virtual Status Handle(const IteratorTraceExecutionResult& result) override { + if (result.GetStartTimestamp() > result.GetEndTimestamp()) { + return Status::InvalidArgument("Invalid 
timestamps."); + } + result.GetStatus().PermitUncheckedError(); + switch (result.GetTraceType()) { + case kTraceIteratorSeek: + case kTraceIteratorSeekForPrev: { + total_latency_ += result.GetLatency(); + cnt_++; + seeks_++; + break; + } + default: + return Status::Corruption("Type mismatch."); + } + return Status::OK(); + } + + void Reset() { + total_latency_ = 0; + cnt_ = 0; + writes_ = 0; + gets_ = 0; + seeks_ = 0; + multigets_ = 0; + } + + double GetAvgLatency() const { + return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_; + } + + int GetNumWrites() const { return writes_; } + + int GetNumGets() const { return gets_; } + + int GetNumIterSeeks() const { return seeks_; } + + int GetNumMultiGets() const { return multigets_; } + + private: + std::atomic total_latency_{0}; + std::atomic cnt_{0}; + std::atomic writes_{0}; + std::atomic gets_{0}; + std::atomic seeks_{0}; + std::atomic multigets_{0}; +}; + TEST_F(DBTest2, TraceAndReplay) { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreatePutOperator(); @@ -3154,6 +4371,170 @@ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + // 5 Writes + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Merge(0, "b", "2")); + ASSERT_OK(Delete(0, "c")); + ASSERT_OK(SingleDelete(0, "d")); + ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); + + // 6th Write + WriteBatch batch; + ASSERT_OK(batch.Put("f", "11")); + ASSERT_OK(batch.Merge("g", "12")); + ASSERT_OK(batch.Delete("h")); + ASSERT_OK(batch.SingleDelete("i")); + ASSERT_OK(batch.DeleteRange("j", "k")); + ASSERT_OK(db_->Write(wo, &batch)); + + // 2 Seek(ForPrev)s + single_iter = db_->NewIterator(ro); + single_iter->Seek("f"); // Seek 1 + single_iter->SeekForPrev("g"); + ASSERT_OK(single_iter->status()); + delete single_iter; + + // 2 Gets + ASSERT_EQ("1", Get(0, "a")); + ASSERT_EQ("12", Get(0, "g")); + + // 7th and 8th Write, 3rd Get + 
ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "rocksdb", "rocks")); + ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + + // Total Write x 8, Get x 3, Seek x 2. + ASSERT_OK(db_->EndTrace()); + // These should not get into the trace file as it is after EndTrace. + ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay"); + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). + DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + + TraceExecutionResultHandler res_handler; + std::function &&)> res_cb = + [&res_handler](Status exec_s, std::unique_ptr&& res) { + ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported()); + if (res != nullptr) 
{ + ASSERT_OK(res->Accept(&res_handler)); + res.reset(); + } + }; + + // Unprepared replay should fail with Status::Incomplete() + ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete()); + ASSERT_OK(replayer->Prepare()); + // Ok to repeatedly Prepare(). + ASSERT_OK(replayer->Prepare()); + // Replay using 1 thread, 1x speed. + ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb)); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 8); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); + ASSERT_EQ("1", value); + ASSERT_OK(db2->Get(ro, handles[0], "g", &value)); + ASSERT_EQ("12", value); + ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); + + ASSERT_OK(db2->Get(ro, handles[1], "foo", &value)); + ASSERT_EQ("bar", value); + ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); + ASSERT_EQ("rocks", value); + + // Re-replay should fail with Status::Incomplete() if Prepare() was not + // called. Currently we don't distinguish between unprepared and trace end. + ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete()); + + // Re-replay using 2 threads, 2x speed. + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb)); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 8); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // Re-replay using 2 threads, 1/2 speed. 
+ ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb)); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 8); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + replayer.reset(); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + +TEST_F(DBTest2, TraceAndManualReplay) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreatePutOperator(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + Iterator* single_iter = nullptr; + + ASSERT_TRUE(db_->EndTrace().IsIOError()); + + std::string trace_filename = dbname_ + "/rocksdb.trace"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + ASSERT_OK(Put(0, "a", "1")); ASSERT_OK(Merge(0, "b", "2")); ASSERT_OK(Delete(0, "c")); @@ -3171,6 +4552,37 @@ single_iter = db_->NewIterator(ro); single_iter->Seek("f"); single_iter->SeekForPrev("g"); + ASSERT_OK(single_iter->status()); + delete single_iter; + + // Write some sequenced keys for testing lower/upper bounds of iterator. 
+ batch.Clear(); + ASSERT_OK(batch.Put("iter-0", "iter-0")); + ASSERT_OK(batch.Put("iter-1", "iter-1")); + ASSERT_OK(batch.Put("iter-2", "iter-2")); + ASSERT_OK(batch.Put("iter-3", "iter-3")); + ASSERT_OK(batch.Put("iter-4", "iter-4")); + ASSERT_OK(db_->Write(wo, &batch)); + + ReadOptions bounded_ro = ro; + Slice lower_bound("iter-1"); + Slice upper_bound("iter-3"); + bounded_ro.iterate_lower_bound = &lower_bound; + bounded_ro.iterate_upper_bound = &upper_bound; + single_iter = db_->NewIterator(bounded_ro); + single_iter->Seek("iter-0"); + ASSERT_EQ(single_iter->key().ToString(), "iter-1"); + single_iter->Seek("iter-2"); + ASSERT_EQ(single_iter->key().ToString(), "iter-2"); + single_iter->Seek("iter-4"); + ASSERT_FALSE(single_iter->Valid()); + single_iter->SeekForPrev("iter-0"); + ASSERT_FALSE(single_iter->Valid()); + single_iter->SeekForPrev("iter-2"); + ASSERT_EQ(single_iter->key().ToString(), "iter-2"); + single_iter->SeekForPrev("iter-4"); + ASSERT_EQ(single_iter->key().ToString(), "iter-2"); + ASSERT_OK(single_iter->status()); delete single_iter; ASSERT_EQ("1", Get(0, "a")); @@ -3180,14 +4592,18 @@ ASSERT_OK(Put(1, "rocksdb", "rocks")); ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2. + // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6 + // Seek(ForPrev)s. + // Total Write x 9, Get x 3, Seek x 8 ASSERT_OK(db_->EndTrace()); // These should not get into the trace file as it is after EndTrace. 
- Put("hello", "world"); - Merge("foo", "bar"); + ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); // Open another db, replay, and verify the data std::string value; - std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime @@ -3209,7 +4625,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3218,8 +4636,76 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + + TraceExecutionResultHandler res_handler; + + // Manual replay for 2 times. The 2nd checks if the replay can restart. + std::unique_ptr record; + std::unique_ptr result; + for (int i = 0; i < 2; i++) { + // Next should fail if unprepared. + ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); + ASSERT_OK(replayer->Prepare()); + Status s = Status::OK(); + // Looping until trace end. + while (s.ok()) { + s = replayer->Next(&record); + // Skip unsupported operations. 
+ if (s.IsNotSupported()) { + continue; + } + if (s.ok()) { + ASSERT_OK(replayer->Execute(record, &result)); + if (result != nullptr) { + ASSERT_OK(result->Accept(&res_handler)); + if (record->GetTraceType() == kTraceIteratorSeek || + record->GetTraceType() == kTraceIteratorSeekForPrev) { + IteratorSeekQueryTraceRecord* iter_rec = + dynamic_cast(record.get()); + IteratorTraceExecutionResult* iter_res = + dynamic_cast(result.get()); + // Check if lower/upper bounds are correctly saved and decoded. + std::string lower_str = iter_rec->GetLowerBound().ToString(); + std::string upper_str = iter_rec->GetUpperBound().ToString(); + std::string iter_key = iter_res->GetKey().ToString(); + std::string iter_value = iter_res->GetValue().ToString(); + if (!lower_str.empty() && !upper_str.empty()) { + ASSERT_EQ(lower_str, "iter-1"); + ASSERT_EQ(upper_str, "iter-3"); + if (iter_res->GetValid()) { + // If iterator is valid, then lower_bound <= key < upper_bound. + ASSERT_GE(iter_key, lower_str); + ASSERT_LT(iter_key, upper_str); + } else { + // If iterator is invalid, then + // key < lower_bound or key >= upper_bound. + ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str); + } + } + // If iterator is invalid, the key and value should be empty. + if (!iter_res->GetValid()) { + ASSERT_TRUE(iter_key.empty()); + ASSERT_TRUE(iter_value.empty()); + } + } + result.reset(); + } + } + } + // Status::Incomplete() will be returned when manually reading the trace + // end, or Prepare() was not called. 
+ ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 9); + ASSERT_EQ(res_handler.GetNumGets(), 3); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 8); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + } ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); ASSERT_EQ("1", value); @@ -3233,6 +4719,138 @@ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); ASSERT_EQ("rocks", value); + // Test execution of artificially created TraceRecords. + uint64_t fake_ts = 1U; + // Write + batch.Clear(); + ASSERT_OK(batch.Put("trace-record-write1", "write1")); + ASSERT_OK(batch.Put("trace-record-write2", "write2")); + record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Write x 1 + ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value)); + ASSERT_EQ("write1", value); + ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value)); + ASSERT_EQ("write2", value); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 1); + ASSERT_EQ(res_handler.GetNumGets(), 0); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // Get related + // Get an existing key. + record.reset(new GetQueryTraceRecord(handles[0]->GetID(), + "trace-record-write1", fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Get x 1 + // Get an non-existing key, should still return Status::OK(). 
+ record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get", + fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Get x 2 + // Get from an invalid (non-existing) cf_id. + uint32_t invalid_cf_id = handles[1]->GetID() + 1; + record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); + ASSERT_TRUE(result == nullptr); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 0); + ASSERT_EQ(res_handler.GetNumGets(), 2); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // Iteration related + for (IteratorSeekQueryTraceRecord::SeekType seekType : + {IteratorSeekQueryTraceRecord::kSeek, + IteratorSeekQueryTraceRecord::kSeekForPrev}) { + // Seek to an existing key. + record.reset(new IteratorSeekQueryTraceRecord( + seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Seek x 1 in one iteration + // Seek to an non-existing key, should still return Status::OK(). + record.reset(new IteratorSeekQueryTraceRecord( + seekType, handles[0]->GetID(), "trace-record-get", fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // Seek x 2 in one iteration + // Seek from an invalid cf_id. 
+ record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id, + "whatever", fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); + ASSERT_TRUE(result == nullptr); + } + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 0); + ASSERT_EQ(res_handler.GetNumGets(), 0); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 4); // Seek x 2 in two iterations + ASSERT_EQ(res_handler.GetNumMultiGets(), 0); + res_handler.Reset(); + + // MultiGet related + // Get existing keys. + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"a", "foo"}), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 1 + // Get all non-existing keys, should still return Status::OK(). + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"no1", "no2"}), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 2 + // Get mixed of existing and non-existing keys, should still return + // Status::OK(). + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"a", "no2"}), fake_ts++)); + ASSERT_OK(replayer->Execute(record, &result)); + ASSERT_TRUE(result != nullptr); + MultiValuesTraceExecutionResult* mvr = + dynamic_cast(result.get()); + ASSERT_TRUE(mvr != nullptr); + ASSERT_OK(mvr->GetMultiStatus()[0]); + ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound()); + ASSERT_EQ(mvr->GetValues()[0], "1"); + ASSERT_EQ(mvr->GetValues()[1], ""); + ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 3 + // Get from an invalid (non-existing) cf_id. 
+ record.reset(new MultiGetQueryTraceRecord( + std::vector( + {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}), + std::vector({"a", "foo", "whatever"}), fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); + ASSERT_TRUE(result == nullptr); + // Empty MultiGet + record.reset(new MultiGetQueryTraceRecord( + std::vector(), std::vector(), fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument()); + ASSERT_TRUE(result == nullptr); + // MultiGet size mismatch + record.reset(new MultiGetQueryTraceRecord( + std::vector({handles[0]->GetID(), handles[1]->GetID()}), + std::vector({"a"}), fake_ts++)); + ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument()); + ASSERT_TRUE(result == nullptr); + ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_EQ(res_handler.GetNumWrites(), 0); + ASSERT_EQ(res_handler.GetNumGets(), 0); + ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); + ASSERT_EQ(res_handler.GetNumMultiGets(), 3); + res_handler.Reset(); + + replayer.reset(); + for (auto handle : handles) { delete handle; } @@ -3261,7 +4879,7 @@ ASSERT_OK(Put(0, "c", "1")); ASSERT_OK(db_->EndTrace()); - std::string dbname2 = test::TmpDir(env_) + "/db_replay2"; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2"); std::string value; ASSERT_OK(DestroyDB(dbname2, options)); @@ -3284,7 +4902,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3294,8 +4914,12 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr 
replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); + replayer.reset(); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); @@ -3330,7 +4954,7 @@ ASSERT_OK(Put(0, "e", "5")); ASSERT_OK(db_->EndTrace()); - std::string dbname2 = test::TmpDir(env_) + "/db_replay_sampling"; + std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling"); std::string value; ASSERT_OK(DestroyDB(dbname2, options)); @@ -3352,7 +4976,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); @@ -3363,8 +4989,12 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); + replayer.reset(); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); @@ -3425,12 +5055,12 @@ ASSERT_OK(db_->EndTrace()); // These should not get into the trace file as it is after EndTrace. 
- Put("hello", "world"); - Merge("foo", "bar"); + ASSERT_OK(Put("hello", "world")); + ASSERT_OK(Merge("foo", "bar")); // Open another db, replay, and verify the data std::string value; - std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + std::string dbname2 = test::PerThreadDBPath(env_, "db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime @@ -3452,7 +5082,9 @@ column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3461,8 +5093,12 @@ std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); - Replayer replayer(db2, handles_, std::move(trace_reader)); - ASSERT_OK(replayer.Replay()); + std::unique_ptr replayer; + ASSERT_OK( + db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); + ASSERT_OK(replayer->Prepare()); + ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); + replayer.reset(); // All the key-values should not present since we filter out the WRITE ops. ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); @@ -3479,7 +5115,7 @@ ASSERT_OK(DestroyDB(dbname2, options)); // Set up a new db. 
- std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read"; + std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read"); ASSERT_OK(DestroyDB(dbname3, options)); DB* db3_init = nullptr; @@ -3498,7 +5134,7 @@ handles.clear(); DB* db3 = nullptr; - ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3)); + ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3554,6 +5190,11 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { Options options = CurrentOptions(); + options.env = env_; + if (!IsMemoryMappedAccessSupported()) { + ROCKSDB_GTEST_SKIP("Test requires default environment"); + return; + } options.allow_mmap_reads = true; options.max_open_files = 100; options.compression = kNoCompression; @@ -3568,9 +5209,9 @@ ASSERT_FALSE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); - dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); // Ensure pinned_value doesn't rely on memory munmap'd by the above // compaction. It crashes if it does. @@ -3606,18 +5247,18 @@ bbto.cache_index_and_filter_blocks = false; bbto.block_cache = NewLRUCache(100000); bbto.block_size = 400; // small block size - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); - std::string v = RandomString(&rnd, 400); + std::string v = rnd.RandomString(400); // Since v is the size of a block, each key should take a block // of 400+ bytes. 
- Put("1", v); - Put("3", v); - Put("5", v); - Put("7", v); + ASSERT_OK(Put("1", v)); + ASSERT_OK(Put("3", v)); + ASSERT_OK(Put("5", v)); + ASSERT_OK(Put("7", v)); ASSERT_OK(Flush()); ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); @@ -3646,16 +5287,18 @@ iter->Seek("3"); ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); } ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); // Test compaction case - Put("2", v); - Put("5", v); - Put("6", v); - Put("8", v); + ASSERT_OK(Put("2", v)); + ASSERT_OK(Put("5", v)); + ASSERT_OK(Put("6", v)); + ASSERT_OK(Put("8", v)); ASSERT_OK(Flush()); // Clear existing data in block cache @@ -3714,20 +5357,20 @@ }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("1", "1"); - Put("9", "1"); - Flush(); + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); expected_lower_bound = 0; expected_higher_bound = 8 * 1024; - Put("1", "1"); - Put("9", "1"); - Flush(); - - Put("1", "1"); - Put("9", "1"); - Flush(); + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); // Full compaction to make sure there is no L0 file after the open. 
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -3760,13 +5403,13 @@ options.max_open_files = -1; Reopen(options); - Put("1", "1"); - Put("9", "1"); - Flush(); - - Put("1", "1"); - Put("9", "1"); - Flush(); + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("1", "1")); + ASSERT_OK(Put("9", "1")); + ASSERT_OK(Flush()); ASSERT_TRUE(called.load()); called = false; @@ -3797,7 +5440,7 @@ CreateColumnFamilies({"test1", "test2"}, Options()); ASSERT_EQ(handles_.size(), 2); - DBImpl* dbi = reinterpret_cast(db_); + DBImpl* dbi = static_cast_with_check(db_); port::Thread user_thread1([&]() { auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID()); ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); @@ -3832,6 +5475,7 @@ SyncPoint::GetInstance()->EnableProcessing(); Options options; + options.env = env_; options.num_levels = 2; options.disable_auto_compactions = true; Reopen(options); @@ -3866,31 +5510,36 @@ GetSstFiles(env_, dbname_, &files); ASSERT_EQ(files.size(), 2); - port::Thread user_thread1( - [&]() { db_->CompactFiles(CompactionOptions(), handle, files, 1); }); + Status user_thread1_status; + port::Thread user_thread1([&]() { + user_thread1_status = + db_->CompactFiles(CompactionOptions(), handle, files, 1); + }); + Status user_thread2_status; port::Thread user_thread2([&]() { - ASSERT_OK(db_->IngestExternalFile(handle, {external_file2}, - IngestExternalFileOptions())); + user_thread2_status = db_->IngestExternalFile(handle, {external_file2}, + IngestExternalFileOptions()); TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1"); }); user_thread1.join(); user_thread2.join(); + ASSERT_OK(user_thread1_status); + ASSERT_OK(user_thread2_status); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } #endif // ROCKSDB_LITE -// TODO: figure out why this test fails in appveyor -#ifndef OS_WIN TEST_F(DBTest2, 
MultiDBParallelOpenTest) { const int kNumDbs = 2; Options options = CurrentOptions(); std::vector dbnames; for (int i = 0; i < kNumDbs; ++i) { - dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i)); + dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + ToString(i))); ASSERT_OK(DestroyDB(dbnames.back(), options)); } @@ -3915,7 +5564,6 @@ } // Verify non-empty DBs can be recovered in parallel - dbs.clear(); open_threads.clear(); for (int i = 0; i < kNumDbs; ++i) { open_threads.emplace_back( @@ -3932,11 +5580,11 @@ ASSERT_OK(DestroyDB(dbnames[i], options)); } } -#endif // OS_WIN namespace { class DummyOldStats : public Statistics { public: + const char* Name() const override { return "DummyOldStats"; } uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; } void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override { num_rt++; @@ -3956,8 +5604,8 @@ } bool HistEnabledForType(uint32_t /*type*/) const override { return false; } std::string ToString() const override { return ""; } - int num_rt = 0; - int num_mt = 0; + std::atomic num_rt{0}; + std::atomic num_mt{0}; }; } // namespace @@ -3969,7 +5617,7 @@ options.statistics = stats; Reopen(options); - Put("foo", "bar"); + ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo")); @@ -4017,6 +5665,7 @@ ASSERT_OK(Put("bbb1", "")); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); // Seeking into f1, the iterator will check bloom filter which returns the // file iterator ot be invalidate, and the cursor will put into f2, with @@ -4055,6 +5704,7 @@ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); // Bloom filter is filterd out by f1. // This is just one of several valid position following the contract. @@ -4062,6 +5712,7 @@ // the behavior of the current implementation. 
If underlying implementation // changes, the test might fail here. iter->Seek("bbb1"); + ASSERT_OK(iter->status()); ASSERT_FALSE(iter->Valid()); delete iter; @@ -4149,7 +5800,7 @@ for (const auto& f : filenames) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) { + if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) { std::string fname = dbname_ + "/" + f; std::string file_content; ASSERT_OK(ReadFileToString(env_, fname, &file_content)); @@ -4208,6 +5859,7 @@ ReadOptions ro; ro.total_order_seek = true; std::unique_ptr iter(db_->NewIterator(ro)); + ASSERT_OK(iter->status()); iter->Seek("e"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("x", iter->key().ToString()); @@ -4225,6 +5877,7 @@ ASSERT_OK(Put("a", "a")); Iterator* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); ASSERT_OK(Flush()); size_t value = options.write_buffer_manager->memory_usage(); ASSERT_GT(value, base_value); @@ -4283,7 +5936,7 @@ ASSERT_OK(Put("key", "2")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "3")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "4")); - Flush(); + ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; @@ -4291,14 +5944,14 @@ nullptr)); ASSERT_OK(db_->Merge(WriteOptions(), "key", "5")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "6")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "7")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "8")); - Flush(); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); #ifndef ROCKSDB_LITE ASSERT_EQ("0,4,1", FilesPerLevel()); #endif // ROCKSDB_LITE @@ -4306,6 +5959,24 @@ ASSERT_EQ("2,3,4,5,6,7,8", Get("key")); } +TEST_F(DBTest2, FileConsistencyCheckInOpen) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack( + 
"VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { + Status* ret_s = static_cast(arg); + *ret_s = Status::Corruption("fcc"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.force_consistency_checks = true; + ASSERT_NOK(TryReopen(options)); + + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) { // create a DB with block prefix index BlockBasedTableOptions table_options; @@ -4320,15 +5991,16 @@ Reopen(options); Random rnd(301); - std::string large_value = RandomString(&rnd, 500); + std::string large_value = rnd.RandomString(500); ASSERT_OK(Put("a1", large_value)); ASSERT_OK(Put("x1", large_value)); ASSERT_OK(Put("y1", large_value)); - Flush(); + ASSERT_OK(Flush()); { std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + ASSERT_OK(iterator->status()); iterator->SeekForPrev("x3"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); @@ -4367,6 +6039,46 @@ } } +TEST_F(DBTest2, PartitionedIndexPrefetchFailure) { + Options options = last_options_; + options.env = env_; + options.max_open_files = 20; + BlockBasedTableOptions bbto; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.metadata_block_size = 128; + bbto.block_size = 128; + bbto.block_cache = NewLRUCache(16777216); + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + // Force no table cache so every read will preload the SST file. + dbfull()->TEST_table_cache()->SetCapacity(0); + bbto.block_cache->SetCapacity(0); + + Random rnd(301); + for (int i = 0; i < 4096; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(32))); + } + ASSERT_OK(Flush()); + + // Try different random failures in table open for 300 times. 
+ for (int i = 0; i < 300; i++) { + env_->num_reads_fails_ = 0; + env_->rand_reads_fail_odd_ = 8; + + std::string value; + Status s = dbfull()->Get(ReadOptions(), Key(1), &value); + if (env_->num_reads_fails_ > 0) { + ASSERT_NOK(s); + } else { + ASSERT_OK(s); + } + } + + env_->rand_reads_fail_odd_ = 0; +} + TEST_F(DBTest2, ChangePrefixExtractor) { for (bool use_partitioned_filter : {true, false}) { // create a DB with block prefix index @@ -4400,7 +6112,7 @@ ASSERT_OK(Put("xx1", "")); ASSERT_OK(Put("xz1", "")); ASSERT_OK(Put("zz", "")); - Flush(); + ASSERT_OK(Flush()); // After reopening DB with prefix size 2 => 1, prefix extractor // won't take effective unless it won't change results based @@ -4410,6 +6122,7 @@ { std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + ASSERT_OK(iterator->status()); iterator->Seek("xa"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); @@ -4434,6 +6147,7 @@ { std::unique_ptr iterator(db_->NewIterator(ro)); + ASSERT_OK(iterator->status()); // SeekForPrev() never uses prefix bloom if it is changed. 
iterator->SeekForPrev("xg0"); @@ -4448,6 +6162,7 @@ ub = Slice(ub_str); { std::unique_ptr iterator(db_->NewIterator(ro)); + ASSERT_OK(iterator->status()); iterator->Seek("x"); ASSERT_TRUE(iterator->Valid()); @@ -4494,6 +6209,8 @@ if (expect_filter_check) { ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); } + + ASSERT_OK(iterator->status()); } { std::unique_ptr iterator(db_->NewIterator(ro)); @@ -4511,6 +6228,8 @@ if (expect_filter_check) { ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); } + + ASSERT_OK(iterator->status()); } ub_str = "xg9"; @@ -4523,6 +6242,7 @@ if (expect_filter_check) { ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); } + ASSERT_OK(iterator->status()); } } } @@ -4542,29 +6262,29 @@ Reopen(options); ASSERT_OK(Put("b1", "ok")); - Flush(); + ASSERT_OK(Flush()); // Flushing several files so that the chance that hash bucket // is empty fo "b" in at least one of the files is high. ASSERT_OK(Put("a1", "")); ASSERT_OK(Put("c1", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a2", "")); ASSERT_OK(Put("c2", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a3", "")); ASSERT_OK(Put("c3", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a4", "")); ASSERT_OK(Put("c4", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("a5", "")); ASSERT_OK(Put("c5", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("ok", Get("b1")); } @@ -4582,12 +6302,12 @@ Reopen(options); Random rnd(301); - std::string large_value = RandomString(&rnd, 500); + std::string large_value = rnd.RandomString(500); ASSERT_OK(Put("a1", large_value)); ASSERT_OK(Put("x1", large_value)); ASSERT_OK(Put("y1", large_value)); - Flush(); + ASSERT_OK(Flush()); ReadOptions ro; ro.total_order_seek = false; @@ -4598,6 +6318,7 @@ ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } std::string 
ub_str = "b9"; @@ -4609,6 +6330,7 @@ iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } ub_str = "z"; @@ -4619,6 +6341,7 @@ ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } ub_str = "c"; @@ -4628,6 +6351,7 @@ iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); } // The same queries without recreating iterator @@ -4640,6 +6364,7 @@ iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + ASSERT_OK(iterator->status()); ub_str = "z"; ub = Slice(ub_str); @@ -4676,16 +6401,469 @@ ASSERT_EQ("a1", iterator->key().ToString()); } } + +class RenameCurrentTest : public DBTestBase, + public testing::WithParamInterface { + public: + RenameCurrentTest() + : DBTestBase("rename_current_test", /*env_do_fsync=*/true), + sync_point_(GetParam()) {} + + ~RenameCurrentTest() override {} + + void SetUp() override { + env_->no_file_overwrite_.store(true, std::memory_order_release); + } + + void TearDown() override { + env_->no_file_overwrite_.store(false, std::memory_order_release); + } + + void SetupSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) { + Status* s = reinterpret_cast(arg); + assert(s); + *s = Status::IOError("Injected IO error."); + }); + } + + const std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest, + ::testing::Values("SetCurrentFile:BeforeRename", + "SetCurrentFile:AfterRename")); + +TEST_P(RenameCurrentTest, Open) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + 
SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = TryReopen(options); + ASSERT_NOK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); +} + +TEST_P(RenameCurrentTest, Flush) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("key", "value")); + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(Flush()); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST_P(RenameCurrentTest, Compaction) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("a", "a_value")); + ASSERT_OK(Put("c", "c_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b", "b_value")); + ASSERT_OK(Put("d", "d_value")); + ASSERT_OK(Flush()); + + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ("d_value", Get("d")); +} + +TEST_F(DBTest2, BottommostTemperature) { + Options options = CurrentOptions(); + options.bottommost_temperature = Temperature::kWarm; + options.level0_file_num_compaction_trigger = 2; + options.statistics = CreateDBStatistics(); + Reopen(options); + + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + 
ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + // non-bottommost file still has unknown temperature + 
ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("bar")); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // reopen and check the information is persisted + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // check other non-exist temperatures + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty( + 
DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); + + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); +} + +TEST_F(DBTest2, BottommostTemperatureUniversal) { + const int kTriggerNum = 3; + const int kNumLevels = 5; + const int kBottommostLevel = kNumLevels - 1; + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = kTriggerNum; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + DestroyAndReopen(options); + + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + get_iostats_context()->Reset(); + IOStatsContext* iostats = get_iostats_context(); + + for (int i = 0; i < kTriggerNum; i++) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, + metadata.levels[kBottommostLevel].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); 
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + ASSERT_EQ("bar", Get("foo")); + + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); + ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + + // Update bottommost temperature + options.bottommost_temperature = Temperature::kWarm; + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + // Should not impact existing ones + ASSERT_EQ(Temperature::kUnknown, + metadata.levels[kBottommostLevel].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + 
ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + + // new generated file should have the new settings + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, + metadata.levels[kBottommostLevel].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); + ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); + ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); + + // non-bottommost file still has unknown temperature + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // check other non-exist temperatures + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty( + DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); +} #endif // ROCKSDB_LITE -} // namespace ROCKSDB_NAMESPACE -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** 
argv); +// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery. +TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value0")); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + bool should_inject_error = false; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RecoverLogFiles:BeforeReadWal", + [&](void* /*arg*/) { should_inject_error = true; }); + SyncPoint::GetInstance()->SetCallBack( + "LogReader::ReadMore:AfterReadFile", [&](void* arg) { + if (should_inject_error) { + ASSERT_NE(nullptr, arg); + *reinterpret_cast(arg) = Status::IOError("Injected IOError"); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + options.avoid_flush_during_recovery = true; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + Status s = TryReopen(options); + ASSERT_TRUE(s.IsIOError()); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:Start:1", + "PointInTimeRecoveryWithSyncFailureInCFCreation:1"}, + {"PointInTimeRecoveryWithSyncFailureInCFCreation:2", + "DBImpl::BackgroundCallFlush:Start:2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CreateColumnFamilies({"test1"}, Options()); + ASSERT_OK(Put("foo", "bar")); + + // Creating a CF when a flush is going on, log is synced but the + // closed log file is not synced and corrupted. 
+ port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); }); + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1"); + CreateColumnFamilies({"test2"}, Options()); + env_->corrupt_in_sync_ = true; + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2"); + flush_thread.join(); + env_->corrupt_in_sync_ = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Reopening the DB should not corrupt anything + Options options = CurrentOptions(); + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + ReopenWithColumnFamilies({"default", "test1", "test2"}, options); +} + +TEST_F(DBTest2, RenameDirectory) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value0")); + Close(); + auto old_dbname = dbname_; + auto new_dbname = dbname_ + "_2"; + EXPECT_OK(env_->RenameFile(dbname_, new_dbname)); + options.create_if_missing = false; + dbname_ = new_dbname; + ASSERT_OK(TryReopen(options)); + ASSERT_EQ("value0", Get("foo")); + Destroy(options); + dbname_ = old_dbname; +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, GetLatestSeqAndTsForKey) { + Destroy(last_options_); + + Options options = CurrentOptions(); + options.max_write_buffer_size_to_maintain = 64 << 10; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.comparator = test::ComparatorWithU64Ts(); + options.statistics = CreateDBStatistics(); + + Reopen(options); + + constexpr uint64_t kTsU64Value = 12; + + for (uint64_t key = 0; key < 100; ++key) { + std::string ts_str; + PutFixed64(&ts_str, kTsU64Value); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + + std::string key_str; + PutFixed64(&key_str, key); + std::reverse(key_str.begin(), key_str.end()); + ASSERT_OK(Put(key_str, "value", write_opts)); + } + + ASSERT_OK(Flush()); + + constexpr bool cache_only = true; + constexpr SequenceNumber lower_bound_seq = 0; + auto* cfhi = static_cast_with_check( + 
dbfull()->DefaultColumnFamily()); + assert(cfhi); + assert(cfhi->cfd()); + SuperVersion* sv = cfhi->cfd()->GetSuperVersion(); + for (uint64_t key = 0; key < 100; ++key) { + std::string key_str; + PutFixed64(&key_str, key); + std::reverse(key_str.begin(), key_str.end()); + std::string ts; + SequenceNumber seq = kMaxSequenceNumber; + bool found_record_for_key = false; + bool is_blob_index = false; + + const Status s = dbfull()->GetLatestSequenceForKey( + sv, key_str, cache_only, lower_bound_seq, &seq, &ts, + &found_record_for_key, &is_blob_index); + ASSERT_OK(s); + std::string expected_ts; + PutFixed64(&expected_ts, kTsU64Value); + ASSERT_EQ(expected_ts, ts); + ASSERT_TRUE(found_record_for_key); + ASSERT_FALSE(is_blob_index); + } + + // Verify that no read to SST files. + ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0)); +} +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,21 +8,35 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" + #include "db/forward_iterator.h" +#include "env/mock_env.h" +#include "rocksdb/convenience.h" #include "rocksdb/env_encryption.h" +#include "rocksdb/unique_id.h" #include "rocksdb/utilities/object_registry.h" +#include "table/format.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { +namespace { +int64_t MaybeCurrentTime(Env* env) { + int64_t time = 1337346000; // arbitrary fallback default + env->GetCurrentTime(&time).PermitUncheckedError(); + return time; +} +} // namespace + // Special Env used to delay background operations -SpecialEnv::SpecialEnv(Env* base) +SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep) : EnvWrapper(base), + maybe_starting_time_(MaybeCurrentTime(base)), rnd_(301), sleep_counter_(this), - addon_time_(0), - time_elapse_only_sleep_(false), - no_slowdown_(false) { + time_elapse_only_sleep_(time_elapse_only_sleep), + no_slowdown_(time_elapse_only_sleep) { delay_sstable_sync_.store(false, std::memory_order_release); drop_writes_.store(false, std::memory_order_release); no_space_.store(false, std::memory_order_release); @@ -32,6 +46,7 @@ manifest_sync_error_.store(false, std::memory_order_release); manifest_write_error_.store(false, std::memory_order_release); log_write_error_.store(false, std::memory_order_release); + no_file_overwrite_.store(false, std::memory_order_release); random_file_open_counter_.store(0, std::memory_order_relaxed); delete_count_.store(0, std::memory_order_relaxed); num_open_wal_file_.store(0); @@ -43,37 +58,33 @@ non_writable_count_ = 0; table_write_callback_ = nullptr; } -#ifndef ROCKSDB_LITE -ROT13BlockCipher rot13Cipher_(16); -#endif // ROCKSDB_LITE - -DBTestBase::DBTestBase(const std::string path) +DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - 
Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + ConfigOptions config_options; + EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); if (getenv("MEM_ENV")) { - mem_env_ = new MockEnv(base_env); + mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock()); } #ifndef ROCKSDB_LITE if (getenv("ENCRYPTED_ENV")) { - encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, - new CTREncryptionProvider(rot13Cipher_)); + std::shared_ptr provider; + std::string provider_id = getenv("ENCRYPTED_ENV"); + if (provider_id.find("=") == std::string::npos && + !EndsWith(provider_id, "://test")) { + provider_id = provider_id + "://test"; + } + EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id, + &provider)); + encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider); } #endif // !ROCKSDB_LITE env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_ : (mem_env_ ? 
mem_env_ : base_env)); env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); + env_->skip_fsync_ = !env_do_fsync; dbname_ = test::PerThreadDBPath(env_, path); alternative_wal_dir_ = dbname_ + "/wal"; alternative_db_log_dir_ = dbname_ + "/db_log_dir"; @@ -189,28 +200,28 @@ Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompaction) { option_config_ = kUniversalCompactionMultiLevel; Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompactionMultiLevel) { option_config_ = kLevelSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kLevelSubcompactions) { option_config_ = kUniversalSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -225,7 +236,7 @@ auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kDBLogDir) { option_config_ = kWalDirAndMmapReads; @@ -233,14 +244,14 @@ auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kWalDirAndMmapReads) { option_config_ = kRecycleLogFiles; Destroy(last_options_); auto options = CurrentOptions(); Destroy(options); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -320,7 +331,7 @@ return GetOptions(option_config_, default_options, options_override); } -Options DBTestBase::GetDefaultOptions() { +Options 
DBTestBase::GetDefaultOptions() const { Options options; options.write_buffer_size = 4090 * 4096; options.target_file_size_base = 2 * 1024 * 1024; @@ -328,6 +339,10 @@ options.max_open_files = 5000; options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; options.compaction_pri = CompactionPri::kByCompensatedSize; + options.env = env_; + if (!env_->skip_fsync_) { + options.track_and_verify_wals_in_manifest = true; + } return options; } @@ -356,28 +371,28 @@ options.unordered_write = false; break; case kPlainTableFirstBytePrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefixNonMmap: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = false; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableAllBytesPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; @@ -399,20 +414,7 @@ options.use_direct_reads = true; options.use_direct_io_for_flush_and_compaction = true; options.compaction_readahead_size = 2 * 1024 * 
1024; - #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ - !defined(OS_AIX) && !defined(OS_OPENBSD) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "NewWritableFile:O_DIRECT", [&](void* arg) { - int* val = static_cast(arg); - *val &= ~O_DIRECT; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "NewRandomAccessFile:O_DIRECT", [&](void* arg) { - int* val = static_cast(arg); - *val &= ~O_DIRECT; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); -#endif + SetupSyncPointsToMockDirectIO(); break; } #endif // ROCKSDB_LITE @@ -474,16 +476,15 @@ case kInfiniteMaxOpenFiles: options.max_open_files = -1; break; - case kxxHashChecksum: { - table_options.checksum = kxxHash; - break; - } - case kxxHash64Checksum: { - table_options.checksum = kxxHash64; + case kXXH3Checksum: { + table_options.checksum = kXXH3; + // Thrown in here for basic coverage: + options.DisableExtraChecks(); break; } case kFIFOCompaction: { options.compaction_style = kCompactionStyleFIFO; + options.max_open_files = -1; break; } case kBlockBasedTableWithPrefixHashIndex: { @@ -497,6 +498,7 @@ break; } case kBlockBasedTableWithPartitionedIndex: { + table_options.format_version = 3; table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.prefix_extractor.reset(NewNoopTransform()); break; @@ -517,6 +519,11 @@ table_options.index_block_restart_interval = 8; break; } + case kBlockBasedTableWithLatestFormat: { + // In case different from default + table_options.format_version = kLatestFormatVersion; + break; + } case kOptimizeFiltersForHits: { options.optimize_filters_for_hits = true; set_block_based_table_factory = true; @@ -608,6 +615,39 @@ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } +void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) { + time_elapse_only_sleep_on_reopen_ = true; + + // Need to disable stats dumping and persisting which also use + // RepeatableThread, which uses 
InstrumentedCondVar::TimedWaitInternal. + // With time_elapse_only_sleep_, this can hang on some platforms (MacOS) + // because (a) on some platforms, pthread_cond_timedwait does not appear + // to release the lock for other threads to operate if the deadline time + // is already passed, and (b) TimedWait calls are currently a bad abstraction + // because the deadline parameter is usually computed from Env time, + // but is interpreted in real clock time. + options->stats_dump_period_sec = 0; + options->stats_persist_period_sec = 0; +} + +void DBTestBase::MaybeInstallTimeElapseOnlySleep(const DBOptions& options) { + if (time_elapse_only_sleep_on_reopen_) { + assert(options.env == env_ || + static_cast_with_check(options.env) + ->env_target() == env_); + assert(options.stats_dump_period_sec == 0); + assert(options.stats_persist_period_sec == 0); + // We cannot set these before destroying the last DB because they might + // cause a deadlock or similar without the appropriate options set in + // the DB. + env_->time_elapse_only_sleep_ = true; + env_->no_slowdown_ = true; + } else { + // Going back in same test run is not yet supported, so no + // reset in this case. 
+ } +} + Status DBTestBase::TryReopenWithColumnFamilies( const std::vector& cfs, const std::vector& options) { Close(); @@ -618,6 +658,7 @@ } DBOptions db_opts = DBOptions(options[0]); last_options_ = options[0]; + MaybeInstallTimeElapseOnlySleep(db_opts); return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); } @@ -634,7 +675,7 @@ void DBTestBase::Close() { for (auto h : handles_) { - db_->DestroyColumnFamilyHandle(h); + EXPECT_OK(db_->DestroyColumnFamilyHandle(h)); } handles_.clear(); delete db_; @@ -644,7 +685,7 @@ void DBTestBase::DestroyAndReopen(const Options& options) { // Destroy using last options Destroy(last_options_); - ASSERT_OK(TryReopen(options)); + Reopen(options); } void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) { @@ -652,7 +693,8 @@ if (delete_cf_paths) { for (size_t i = 0; i < handles_.size(); ++i) { ColumnFamilyDescriptor cfdescriptor; - handles_[i]->GetDescriptor(&cfdescriptor); + // GetDescriptor is not implemented for ROCKSDB_LITE + handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError(); column_families.push_back(cfdescriptor); } } @@ -661,6 +703,7 @@ } Status DBTestBase::ReadOnlyReopen(const Options& options) { + MaybeInstallTimeElapseOnlySleep(options); return DB::OpenForReadOnly(options, dbname_, &db_); } @@ -670,11 +713,12 @@ // Note: operator= is an unsafe approach here since it destructs // std::shared_ptr in the same order of their creation, in contrast to // destructors which destructs them in the opposite order of creation. One - // particular problme is that the cache destructor might invoke callback + // particular problem is that the cache destructor might invoke callback // functions that use Option members such as statistics. To work around this - // problem, we manually call destructor of table_facotry which eventually + // problem, we manually call destructor of table_factory which eventually // clears the block cache. 
last_options_ = options; + MaybeInstallTimeElapseOnlySleep(options); return DB::Open(options, dbname_, &db_); } @@ -909,12 +953,13 @@ InternalKeyComparator icmp(options.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); + ReadOptions read_options; ScopedArenaIterator iter; if (cf == 0) { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber)); } else { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); @@ -927,7 +972,8 @@ bool first = true; while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - if (!ParseInternalKey(iter->key(), &ikey)) { + if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) != + Status::OK()) { result += "CORRUPTED"; } else { if (!last_options_.comparator->Equal(ikey.user_key, user_key)) { @@ -1029,12 +1075,12 @@ std::string property; if (cf == 0) { // default cfd - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + EXPECT_TRUE(db_->GetProperty("rocksdb.num-files-at-level" + ToString(level), + &property)); } else { - EXPECT_TRUE(db_->GetProperty( - handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level), - &property)); + EXPECT_TRUE(db_->GetProperty(handles_[cf], + "rocksdb.num-files-at-level" + ToString(level), + &property)); } return atoi(property.c_str()); } @@ -1044,12 +1090,10 @@ if (cf == 0) { // default cfd EXPECT_TRUE(db_->GetProperty( - "rocksdb.compression-ratio-at-level" + NumberToString(level), - &property)); + "rocksdb.compression-ratio-at-level" + ToString(level), &property)); } else { EXPECT_TRUE(db_->GetProperty( - handles_[cf], - "rocksdb.compression-ratio-at-level" + NumberToString(level), + 
handles_[cf], "rocksdb.compression-ratio-at-level" + ToString(level), &property)); } return std::stod(property); @@ -1084,29 +1128,77 @@ result.resize(last_non_zero_offset); return result; } + #endif // !ROCKSDB_LITE +std::vector DBTestBase::GetBlobFileNumbers() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + std::vector result; + result.reserve(blob_files.size()); + + for (const auto& blob_file : blob_files) { + result.emplace_back(blob_file.first); + } + + return result; +} + size_t DBTestBase::CountFiles() { + size_t count = 0; std::vector files; - env_->GetChildren(dbname_, &files); + if (env_->GetChildren(dbname_, &files).ok()) { + count += files.size(); + } - std::vector logfiles; if (dbname_ != last_options_.wal_dir) { - env_->GetChildren(last_options_.wal_dir, &logfiles); + if (env_->GetChildren(last_options_.wal_dir, &files).ok()) { + count += files.size(); + } } - return files.size() + logfiles.size(); + return count; +}; + +Status DBTestBase::CountFiles(size_t* count) { + std::vector files; + Status s = env_->GetChildren(dbname_, &files); + if (!s.ok()) { + return s; + } + size_t files_count = files.size(); + + if (dbname_ != last_options_.wal_dir) { + s = env_->GetChildren(last_options_.wal_dir, &files); + if (!s.ok()) { + return s; + } + *count = files_count + files.size(); + } + + return Status::OK(); } -uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) { +Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf, + uint64_t* size) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 
1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit, @@ -1147,9 +1239,9 @@ void DBTestBase::MoveFilesToLevel(int level, int cf) { for (int l = 0; l < level; ++l) { if (cf > 0) { - dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]); + EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf])); } else { - dbfull()->TEST_CompactRange(l, nullptr, nullptr); + EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr)); } } } @@ -1176,7 +1268,7 @@ void DBTestBase::GetSstFiles(Env* env, std::string path, std::vector* files) { - env->GetChildren(path, files); + EXPECT_OK(env->GetChildren(path, files)); files->erase( std::remove_if(files->begin(), files->end(), [](std::string name) { @@ -1196,24 +1288,24 @@ void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx, bool nowait) { for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { - ASSERT_OK(Put(cf, Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990))); + ASSERT_OK(Put(cf, Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990))); (*key_idx)++; } if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } // this will generate non-overlapping files since it keeps increasing key_idx void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) { for (int i = 0; i < KNumKeysByGenerateNewFile; i++) { - ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990))); + ASSERT_OK(Put(Key(*key_idx), rnd->RandomString((i == 99) ? 
1 : 990))); (*key_idx)++; } if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -1221,12 +1313,12 @@ void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) { for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) { - ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 2000))); + ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(2000))); } - ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 200))); + ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200))); if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -1324,21 +1416,22 @@ kMaxSequenceNumber /* upper_bound */); // This should be defined after range_del_agg so that it destructs the // assigned iterator before it range_del_agg is already destructed. 
+ ReadOptions read_options; ScopedArenaIterator iter; if (cf != 0) { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber, handles_[cf])); } else { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + iter.set(dbfull()->NewInternalIterator(read_options, &arena, &range_del_agg, kMaxSequenceNumber)); } iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); + ASSERT_OK(iter->status()); int seq = numValues; while (iter->Valid()) { ParsedInternalKey ikey; ikey.clear(); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); // checks sequence number for updates ASSERT_EQ(ikey.sequence, (unsigned)seq--); @@ -1371,36 +1464,40 @@ ASSERT_OK(destfile->Close()); } -std::unordered_map DBTestBase::GetAllSSTFiles( - uint64_t* total_size) { - std::unordered_map res; - +Status DBTestBase::GetAllDataFiles( + const FileType file_type, std::unordered_map* files, + uint64_t* total_size /* = nullptr */) { if (total_size) { *total_size = 0; } - std::vector files; - env_->GetChildren(dbname_, &files); - for (auto& file_name : files) { - uint64_t number; - FileType type; - std::string file_path = dbname_ + "/" + file_name; - if (ParseFileName(file_name, &number, &type) && type == kTableFile) { - uint64_t file_size = 0; - env_->GetFileSize(file_path, &file_size); - res[file_path] = file_size; - if (total_size) { - *total_size += file_size; + std::vector children; + Status s = env_->GetChildren(dbname_, &children); + if (s.ok()) { + for (auto& file_name : children) { + uint64_t number; + FileType type; + if (ParseFileName(file_name, &number, &type) && type == file_type) { + std::string file_path = dbname_ + "/" + file_name; + uint64_t file_size = 0; + s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + break; + } + (*files)[file_path] = file_size; + if (total_size) { + 
*total_size += file_size; + } } } } - return res; + return s; } std::vector DBTestBase::ListTableFiles(Env* env, const std::string& path) { std::vector files; std::vector file_numbers; - env->GetChildren(path, &files); + EXPECT_OK(env->GetChildren(path, &files)); uint64_t number; FileType type; for (size_t i = 0; i < files.size(); ++i) { @@ -1532,13 +1629,14 @@ InternalKeyComparator icmp(last_options_.comparator); ReadRangeDelAggregator range_del_agg(&icmp, kMaxSequenceNumber /* upper_bound */); - auto iter = - dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber); + ReadOptions read_options; + auto iter = dbfull()->NewInternalIterator(read_options, &arena, + &range_del_agg, kMaxSequenceNumber); iter->SeekToFirst(); for (auto p : true_data) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey ikey; - ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_EQ(p.first, ikey.user_key); ASSERT_EQ(p.second, iter->value()); iter->Next(); @@ -1561,4 +1659,14 @@ } #endif // ROCKSDB_LITE +void VerifySstUniqueIds(const TablePropertiesCollection& props) { + ASSERT_FALSE(props.empty()); // suspicious test if empty + std::unordered_set seen; + for (auto& pair : props) { + std::string id; + ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id)); + ASSERT_TRUE(seen.insert(id).second); + } +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_test_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_test_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,9 @@ #pragma once #include -#include #include +#include #include #include #include @@ -22,10 +22,7 @@ #include #include "db/db_impl/db_impl.h" -#include "db/dbformat.h" -#include "env/mock_env.h" #include "file/filename.h" 
-#include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" @@ -38,21 +35,18 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" -#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" -#include "test_util/mock_time_env.h" -#include "util/compression.h" -#include "util/mutexlock.h" - #include "test_util/sync_point.h" #include "test_util/testharness.h" -#include "test_util/testutil.h" +#include "util/cast_util.h" +#include "util/compression.h" +#include "util/mutexlock.h" #include "util/string_util.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { +class MockEnv; namespace anon { class AtomicCounter { @@ -116,98 +110,13 @@ enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 }; -// A hacky skip list mem table that triggers flush after number of entries. -class SpecialMemTableRep : public MemTableRep { - public: - explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable, - int num_entries_flush) - : MemTableRep(allocator), - memtable_(memtable), - num_entries_flush_(num_entries_flush), - num_entries_(0) {} - - virtual KeyHandle Allocate(const size_t len, char** buf) override { - return memtable_->Allocate(len, buf); - } - - // Insert key into the list. - // REQUIRES: nothing that compares equal to key is currently in the list. - virtual void Insert(KeyHandle handle) override { - num_entries_++; - memtable_->Insert(handle); - } - - void InsertConcurrently(KeyHandle handle) override { - num_entries_++; - memtable_->Insert(handle); - } - - // Returns true iff an entry that compares equal to key is in the list. 
- virtual bool Contains(const char* key) const override { - return memtable_->Contains(key); - } - - virtual size_t ApproximateMemoryUsage() override { - // Return a high memory usage when number of entries exceeds the threshold - // to trigger a flush. - return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024; - } - - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override { - memtable_->Get(k, callback_args, callback_func); - } - - uint64_t ApproximateNumEntries(const Slice& start_ikey, - const Slice& end_ikey) override { - return memtable_->ApproximateNumEntries(start_ikey, end_ikey); - } - - virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { - return memtable_->GetIterator(arena); - } - - virtual ~SpecialMemTableRep() override {} - - private: - std::unique_ptr memtable_; - int num_entries_flush_; - int num_entries_; -}; - -// The factory for the hacky skip list mem table that triggers flush after -// number of entries exceeds a threshold. -class SpecialSkipListFactory : public MemTableRepFactory { - public: - // After number of inserts exceeds `num_entries_flush` in a mem table, trigger - // flush. 
- explicit SpecialSkipListFactory(int num_entries_flush) - : num_entries_flush_(num_entries_flush) {} - - using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* /*logger*/) override { - return new SpecialMemTableRep( - allocator, factory_.CreateMemTableRep(compare, allocator, transform, 0), - num_entries_flush_); - } - virtual const char* Name() const override { return "SkipListFactory"; } - - bool IsInsertConcurrentlySupported() const override { - return factory_.IsInsertConcurrentlySupported(); - } - - private: - SkipListFactory factory_; - int num_entries_flush_; -}; - // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: - explicit SpecialEnv(Env* base); + explicit SpecialEnv(Env* base, bool time_elapse_only_sleep = false); + + static const char* kClassName() { return "SpecialEnv"; } + const char* Name() const override { return kClassName(); } Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& soptions) override { @@ -233,6 +142,11 @@ return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { if (env_->table_write_callback_) { (*env_->table_write_callback_)(); @@ -247,6 +161,11 @@ return base_->PositionedAppend(data, offset); } } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& /* verification_info */) override { + return PositionedAppend(data, offset); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status RangeSync(uint64_t offset, uint64_t nbytes) override { Status s = base_->RangeSync(offset, nbytes); @@ -276,7 +195,10 @@ while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) { 
env_->SleepForMicroseconds(100000); } - Status s = base_->Sync(); + Status s; + if (!env_->skip_fsync_) { + s = base_->Sync(); + } #if !(defined NDEBUG) || !defined(OS_WIN) TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s); #endif // !(defined NDEBUG) || !defined(OS_WIN) @@ -294,6 +216,9 @@ Status Allocate(uint64_t offset, uint64_t len) override { return base_->Allocate(offset, len); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return base_->GetUniqueId(id, max_size); + } }; class ManifestFile : public WritableFile { public: @@ -306,6 +231,12 @@ return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status Close() override { return base_->Close(); } Status Flush() override { return base_->Flush(); } @@ -314,7 +245,11 @@ if (env_->manifest_sync_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated sync error"); } else { - return base_->Sync(); + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } } } uint64_t GetFileSize() override { return base_->GetFileSize(); } @@ -353,15 +288,26 @@ #endif return s; } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } + void PrepareWrite(size_t offset, size_t len) override { + base_->PrepareWrite(offset, len); + } + void SetPreallocationBlockSize(size_t size) override { + base_->SetPreallocationBlockSize(size); + } Status Close() override { // SyncPoint is not supported in Released Windows Mode. #if !(defined NDEBUG) || !defined(OS_WIN) // Check preallocation size - // preallocation size is never passed to base file. 
- size_t preallocation_size = preallocation_block_size(); + size_t block_size, last_allocated_block; + base_->GetPreallocationStatus(&block_size, &last_allocated_block); TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus", - &preallocation_size); + &block_size); #endif // !(defined NDEBUG) || !defined(OS_WIN) return base_->Close(); @@ -369,7 +315,15 @@ Status Flush() override { return base_->Flush(); } Status Sync() override { ++env_->sync_counter_; - return base_->Sync(); + if (env_->corrupt_in_sync_) { + EXPECT_OK(Append(std::string(33000, ' '))); + return Status::IOError("Ingested Sync Failure"); + } + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } } bool IsSyncThreadSafe() const override { return env_->is_wal_sync_thread_safe_.load(); @@ -382,6 +336,40 @@ SpecialEnv* env_; std::unique_ptr base_; }; + class OtherFile : public WritableFile { + public: + OtherFile(SpecialEnv* env, std::unique_ptr&& b) + : env_(env), base_(std::move(b)) {} + Status Append(const Slice& data) override { return base_->Append(data); } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } + Status Close() override { return base_->Close(); } + Status Flush() override { return base_->Flush(); } + Status Sync() override { + if (env_->skip_fsync_) { + return Status::OK(); + } else { + return base_->Sync(); + } + } + uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } + + private: + SpecialEnv* env_; + std::unique_ptr base_; + }; + + if (no_file_overwrite_.load(std::memory_order_acquire) && + target()->FileExists(f).ok()) { + return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true."); + } if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { uint32_t 
random_number; @@ -416,6 +404,8 @@ r->reset(new ManifestFile(this, std::move(*r))); } else if (strstr(f.c_str(), "log") != nullptr) { r->reset(new WalFile(this, std::move(*r))); + } else { + r->reset(new OtherFile(this, std::move(*r))); } } return s; @@ -452,12 +442,44 @@ std::atomic* bytes_read_; }; + class RandomFailureFile : public RandomAccessFile { + public: + RandomFailureFile(std::unique_ptr&& target, + std::atomic* failure_cnt, uint32_t fail_odd) + : target_(std::move(target)), + fail_cnt_(failure_cnt), + fail_odd_(fail_odd) {} + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + if (Random::GetTLSInstance()->OneIn(fail_odd_)) { + fail_cnt_->fetch_add(1); + return Status::IOError("random error"); + } + return target_->Read(offset, n, result, scratch); + } + + virtual Status Prefetch(uint64_t offset, size_t n) override { + return target_->Prefetch(offset, n); + } + + private: + std::unique_ptr target_; + std::atomic* fail_cnt_; + uint32_t fail_odd_; + }; + Status s = target()->NewRandomAccessFile(f, r, soptions); random_file_open_counter_++; - if (s.ok() && count_random_reads_) { - r->reset(new CountingFile(std::move(*r), &random_read_counter_, - &random_read_bytes_counter_)); + if (s.ok()) { + if (count_random_reads_) { + r->reset(new CountingFile(std::move(*r), &random_read_counter_, + &random_read_bytes_counter_)); + } else if (rand_reads_fail_odd_ > 0) { + r->reset(new RandomFailureFile(std::move(*r), &num_reads_fails_, + rand_reads_fail_odd_)); + } } + if (s.ok() && soptions.compaction_readahead_size > 0) { compaction_readahead_size_ = soptions.compaction_readahead_size; } @@ -493,20 +515,35 @@ virtual void SleepForMicroseconds(int micros) override { sleep_counter_.Increment(); if (no_slowdown_ || time_elapse_only_sleep_) { - addon_time_.fetch_add(micros); + addon_microseconds_.fetch_add(micros); } if (!no_slowdown_) { target()->SleepForMicroseconds(micros); } } + void MockSleepForMicroseconds(int64_t 
micros) { + sleep_counter_.Increment(); + assert(no_slowdown_); + addon_microseconds_.fetch_add(micros); + } + + void MockSleepForSeconds(int64_t seconds) { + sleep_counter_.Increment(); + assert(no_slowdown_); + addon_microseconds_.fetch_add(seconds * 1000000); + } + virtual Status GetCurrentTime(int64_t* unix_time) override { Status s; - if (!time_elapse_only_sleep_) { + if (time_elapse_only_sleep_) { + *unix_time = maybe_starting_time_; + } else { s = target()->GetCurrentTime(unix_time); } if (s.ok()) { - *unix_time += addon_time_.load(); + // mock microseconds elapsed to seconds of time + *unix_time += addon_microseconds_.load() / 1000000; } return s; } @@ -518,12 +555,12 @@ virtual uint64_t NowNanos() override { return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) + - addon_time_.load() * 1000; + addon_microseconds_.load() * 1000; } virtual uint64_t NowMicros() override { return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) + - addon_time_.load(); + addon_microseconds_.load(); } virtual Status DeleteFile(const std::string& fname) override { @@ -531,6 +568,37 @@ return target()->DeleteFile(fname); } + void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; } + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override { + if (!skip_fsync_) { + return target()->NewDirectory(name, result); + } else { + class NoopDirectory : public Directory { + public: + NoopDirectory() {} + ~NoopDirectory() {} + + Status Fsync() override { return Status::OK(); } + }; + + result->reset(new NoopDirectory()); + return Status::OK(); + } + } + + Status RenameFile(const std::string& src, const std::string& dest) override { + rename_count_.fetch_add(1); + if (rename_error_.load(std::memory_order_acquire)) { + return Status::NotSupported("Simulated `RenameFile()` error."); + } + return target()->RenameFile(src, dest); + } + + // Something to return when mocking current time + const int64_t maybe_starting_time_; + Random rnd_; port::Mutex 
rnd_mutex_; // Lock to pretect rnd_ @@ -555,13 +623,21 @@ // Force write to log files to fail while this pointer is non-nullptr std::atomic log_write_error_; + // Force `RenameFile()` to fail while this pointer is non-nullptr + std::atomic rename_error_{false}; + // Slow down every log write, in micro-seconds. std::atomic log_write_slowdown_; + // If true, returns Status::NotSupported for file overwrite. + std::atomic no_file_overwrite_; + // Number of WAL files that are still open for write. std::atomic num_open_wal_file_; bool count_random_reads_; + uint32_t rand_reads_fail_odd_ = 0; + std::atomic num_reads_fails_; anon::AtomicCounter random_read_counter_; std::atomic random_read_bytes_counter_; std::atomic random_file_open_counter_; @@ -575,6 +651,12 @@ std::atomic sync_counter_; + // If true, all fsync to files and directories are skipped. + bool skip_fsync_ = false; + + // If true, ingest the corruption to file during sync. + bool corrupt_in_sync_ = false; + std::atomic non_writeable_rate_; std::atomic new_writable_count_; @@ -583,25 +665,33 @@ std::function* table_write_callback_; - std::atomic addon_time_; - std::atomic now_cpu_count_; std::atomic delete_count_; - std::atomic time_elapse_only_sleep_; - - bool no_slowdown_; + std::atomic rename_count_{0}; std::atomic is_wal_sync_thread_safe_{true}; std::atomic compaction_readahead_size_{}; + + private: // accessing these directly is prone to error + friend class DBTestBase; + + std::atomic addon_microseconds_{0}; + + // Do not modify in the env of a running DB (could cause deadlock) + std::atomic time_elapse_only_sleep_; + + bool no_slowdown_; }; #ifndef ROCKSDB_LITE class OnFileDeletionListener : public EventListener { public: OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "OnFileDeletionListener"; } void SetExpectedFileName(const std::string file_name) { expected_file_name_ = 
file_name; @@ -623,6 +713,19 @@ size_t matched_count_; std::string expected_file_name_; }; + +class FlushCounterListener : public EventListener { + public: + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "FlushCounterListener"; } + std::atomic count{0}; + std::atomic expected_flush_reason{FlushReason::kOthers}; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason); + } +}; #endif // A test merge operator mimics put but also fails if one of merge operands is @@ -647,6 +750,86 @@ virtual const char* Name() const override { return "TestPutOperator"; } }; +// A wrapper around Cache that can easily be extended with instrumentation, +// etc. +class CacheWrapper : public Cache { + public: + explicit CacheWrapper(std::shared_ptr target) + : target_(std::move(target)) {} + + const char* Name() const override { return target_->Name(); } + + using Cache::Insert; + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + using Cache::Lookup; + Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { + return target_->Lookup(key, stats); + } + + bool Ref(Handle* handle) override { return target_->Ref(handle); } + + using Cache::Release; + bool Release(Handle* handle, bool force_erase = false) override { + return target_->Release(handle, force_erase); + } + + void* Value(Handle* handle) override { return target_->Value(handle); } + + void Erase(const Slice& key) override { target_->Erase(key); } + uint64_t NewId() override { return target_->NewId(); } + + void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) 
override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + size_t GetCapacity() const override { return target_->GetCapacity(); } + + size_t GetUsage() const override { return target_->GetUsage(); } + + size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } + + size_t GetCharge(Handle* handle) const override { + return target_->GetCharge(handle); + } + + DeleterFn GetDeleter(Handle* handle) const override { + return target_->GetDeleter(handle); + } + + void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + + void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } + + protected: + std::shared_ptr target_; +}; + class DBTestBase : public testing::Test { public: // Sequence of option configurations to try @@ -674,7 +857,7 @@ kUniversalCompactionMultiLevel = 20, kCompressedBlockCache = 21, kInfiniteMaxOpenFiles = 22, - kxxHashChecksum = 23, + kXXH3Checksum = 23, kFIFOCompaction = 24, kOptimizeFiltersForHits = 25, kRowCache = 26, @@ -687,9 +870,9 @@ kBlockBasedTableWithIndexRestartInterval, kBlockBasedTableWithPartitionedIndex, kBlockBasedTableWithPartitionedIndexFormat4, + kBlockBasedTableWithLatestFormat, kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, - kxxHash64Checksum, kUnorderedWrite, // This must be the last line kEnd, @@ -730,16 +913,13 @@ // requires. kSkipMmapReads; - explicit DBTestBase(const std::string path); + // `env_do_fsync` decides whether the special Env would do real + // fsync for files and directories. 
Skipping fsync can speed up + // tests, but won't cover the exact fsync logic. + DBTestBase(const std::string path, bool env_do_fsync); ~DBTestBase(); - static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; - } - static std::string Key(int i) { char buf[100]; snprintf(buf, sizeof(buf), "key%06d", i); @@ -773,14 +953,17 @@ const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; - static Options GetDefaultOptions(); + Options GetDefaultOptions() const; + + Options GetOptions(int option_config) const { + return GetOptions(option_config, GetDefaultOptions()); + } - Options GetOptions(int option_config, - const Options& default_options = GetDefaultOptions(), + Options GetOptions(int option_config, const Options& default_options, const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; - DBImpl* dbfull() { return reinterpret_cast(db_); } + DBImpl* dbfull() { return static_cast_with_check(db_); } void CreateColumnFamilies(const std::vector& cfs, const Options& options); @@ -886,12 +1069,20 @@ int TotalTableFiles(int cf = 0, int levels = -1); #endif // ROCKSDB_LITE + std::vector GetBlobFileNumbers(); + // Return spread of files per level std::string FilesPerLevel(int cf = 0); size_t CountFiles(); - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0); + Status CountFiles(size_t* count); + + Status Size(const Slice& start, const Slice& limit, uint64_t* size) { + return Size(start, limit, 0, size); + } + + Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size); void Compact(int cf, const Slice& start, const Slice& limit, uint32_t target_path_id); @@ -969,8 +1160,9 @@ void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0); - std::unordered_map GetAllSSTFiles( - uint64_t* total_size = nullptr); + Status GetAllDataFiles(const FileType file_type, + std::unordered_map* sst_files, + uint64_t* 
total_size = nullptr); std::vector ListTableFiles(Env* env, const std::string& path); @@ -995,6 +1187,19 @@ Tickers ticker_type) { return options.statistics->getAndResetTickerCount(ticker_type); } + + // Note: reverting this setting within the same test run is not yet + // supported + void SetTimeElapseOnlySleepOnReopen(DBOptions* options); + + private: // Prone to error on direct use + void MaybeInstallTimeElapseOnlySleep(const DBOptions& options); + + bool time_elapse_only_sleep_on_reopen_ = false; }; +// For verifying that all files generated by current version have SST +// unique ids. +void VerifySstUniqueIds(const TablePropertiesCollection& props); + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_universal_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,8 @@ #if !defined(ROCKSDB_LITE) #include "rocksdb/utilities/table_properties_collectors.h" #include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -25,8 +27,8 @@ : public DBTestBase, public ::testing::WithParamInterface> { public: - explicit DBTestUniversalCompactionBase( - const std::string& path) : DBTestBase(path) {} + explicit DBTestUniversalCompactionBase(const std::string& path) + : DBTestBase(path, /*env_do_fsync=*/false) {} void SetUp() override { num_levels_ = std::get<0>(GetParam()); exclusive_manual_compaction_ = std::get<1>(GetParam()); @@ -43,7 +45,8 @@ class DBTestUniversalCompaction2 : public DBTestBase { public: - DBTestUniversalCompaction2() : DBTestBase("/db_universal_compaction_test2") {} + DBTestUniversalCompaction2() + : DBTestBase("db_universal_compaction_test2", /*env_do_fsync=*/false) {} }; 
namespace { @@ -90,36 +93,6 @@ std::atomic_bool expect_full_compaction_; std::atomic_bool expect_manual_compaction_; }; - -class DelayFilter : public CompactionFilter { - public: - explicit DelayFilter(DBTestBase* d) : db_test(d) {} - bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, - std::string* /*new_value*/, - bool* /*value_changed*/) const override { - db_test->env_->addon_time_.fetch_add(1000); - return true; - } - - const char* Name() const override { return "DelayFilter"; } - - private: - DBTestBase* db_test; -}; - -class DelayFilterFactory : public CompactionFilterFactory { - public: - explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& /*context*/) override { - return std::unique_ptr(new DelayFilter(db_test)); - } - - const char* Name() const override { return "DelayFilterFactory"; } - - private: - DBTestBase* db_test; -}; } // namespace // Make sure we don't trigger a problem if the trigger condtion is given @@ -154,11 +127,11 @@ for (int num = 0; num < 16; num++) { // Write 100KB file. And immediately it should be compacted to one file. 
GenerateNewFile(&rnd, &key_idx); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumSortedRuns(0), 1); } ASSERT_OK(Put(Key(key_idx), "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumSortedRuns(0), 1); } @@ -179,7 +152,7 @@ options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.optimize_filters_for_hits = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.memtable_factory.reset(new SpecialSkipListFactory(3)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(3)); DestroyAndReopen(options); @@ -190,15 +163,15 @@ Env::Priority::LOW); for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { - Put(Key(num * 10), "val"); + ASSERT_OK(Put(Key(num * 10), "val")); if (num) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - Put(Key(30 + num * 10), "val"); - Put(Key(60 + num * 10), "val"); + ASSERT_OK(Put(Key(30 + num * 10), "val")); + ASSERT_OK(Put(Key(60 + num * 10), "val")); } - Put("", ""); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("", "")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // Query set of non existing keys for (int i = 5; i < 90; i += 10) { @@ -218,7 +191,7 @@ // Unblock compaction and wait it for happening. sleeping_task_low.WakeUp(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // The same queries will not trigger bloom filter for (int i = 5; i < 90; i += 10) { @@ -322,7 +295,7 @@ // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a // new file of size 1. GenerateNewFile(1, &rnd, &key_idx); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Level-0 compaction is triggered, but no file will be picked up. ASSERT_EQ(NumSortedRuns(1), 4); @@ -331,7 +304,7 @@ // a new file of size 1. 
filter->expect_full_compaction_.store(true); GenerateNewFile(1, &rnd, &key_idx); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All files at level 0 will be compacted into a single one. ASSERT_EQ(NumSortedRuns(1), 1); @@ -361,10 +334,10 @@ num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumSortedRuns(1), num + 1); } ASSERT_EQ(NumSortedRuns(1), 2); @@ -374,7 +347,7 @@ // but will instead trigger size amplification. ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that size amplification did occur ASSERT_EQ(NumSortedRuns(1), 1); @@ -419,10 +392,10 @@ num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumSortedRuns(1), num + 1); } ASSERT_EQ(NumSortedRuns(1), 2); @@ -432,7 +405,7 @@ // but could instead trigger size amplification if it's set // to 110. 
ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify compaction did not happen ASSERT_EQ(NumSortedRuns(1), 3); @@ -453,7 +426,7 @@ ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal .max_size_amplification_percent); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that size amplification did happen ASSERT_EQ(NumSortedRuns(1), 1); ASSERT_EQ(total_picked_compactions, 1); @@ -498,10 +471,10 @@ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 11; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumSortedRuns(1), num + 1); } ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger); @@ -509,7 +482,7 @@ // Flush whatever is remaining in memtable. This is typically small, about // 30KB. ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify compaction did not happen ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1); ASSERT_EQ(total_picked_compactions, 0); @@ -538,7 +511,7 @@ ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width, 2u); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Files in L0 are approx: 0.3 (30KB), 1, 1, 1. 
// On compaction: the files are below the size amp threshold, so we @@ -576,10 +549,10 @@ ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); Random rnd(301); for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(1, ToString(key), rnd.RandomString(kTestValueSize))); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); std::vector compaction_input_file_names; @@ -639,17 +612,17 @@ // Generate 3 overlapping files Random rnd(301); for (int i = 0; i < 210; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); for (int i = 200; i < 300; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); for (int i = 250; i < 260; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 100))); + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); @@ -659,11 +632,11 @@ compact_options.change_level = true; compact_options.target_level = 4; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); } -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) class DBTestUniversalCompactionMultiLevels : public DBTestUniversalCompactionBase { public: @@ -693,7 +666,7 @@ ASSERT_OK(Put(1, Key(i % num_keys), Key(i))); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = num_keys; i < num_keys * 2; i++) { ASSERT_EQ(Get(1, Key(i % 
num_keys)), Key(i)); @@ -740,7 +713,7 @@ std::vector values; ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(trivial_move, 0); ASSERT_GT(non_trivial_move, 0); @@ -764,6 +737,7 @@ Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.num_levels = num_levels_; + options.env = env_; options.write_buffer_size = 1 << 10; // 1KB options.level0_file_num_compaction_trigger = 3; options.max_background_compactions = 3; @@ -803,7 +777,7 @@ for (int i = 0; i < num_keys * 2; i++) { ASSERT_OK(Put(1, Key(i % num_keys), Key(i))); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(num_compactions_running.load(), 0); @@ -881,7 +855,7 @@ // Hold the 1st compaction from finishing TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // There should only be one picked compaction as the score drops below one // after the first one is picked. 
@@ -929,7 +903,7 @@ // Hold the 1st and 2nd compaction from finishing TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // This time we will trigger a compaction because of size ratio and // another compaction because of number of files that are not compacted @@ -940,7 +914,7 @@ INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel, ::testing::Combine(::testing::Values(1, 10), ::testing::Values(false))); -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { Options options = CurrentOptions(); @@ -960,17 +934,17 @@ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { // Write 100KB (100 values, each 1K) for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); if (num < options.level0_file_num_compaction_trigger - 1) { ASSERT_EQ(NumSortedRuns(1), num + 1); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumSortedRuns(1), 1); } @@ -998,20 +972,20 @@ num++) { // Write 100KB (100 values, each 1K) for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumSortedRuns(), num + 1); } // Generate one more file at level-0, which should trigger level-0 // compaction. 
for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Suppose each file flushed from mem table has size 1. Now we compact // (level0_file_num_compaction_trigger+1)=4 files and should have a big // file of size 4. @@ -1024,25 +998,25 @@ // First, clean up memtable before inserting new data. This will generate // a level-0 file, with size around 0.4 (according to previously written // data amount). - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; num++) { // Write 110KB (11 values, each 10K) for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumSortedRuns(), num + 3); } // Generate one more file at level-0, which should trigger level-0 // compaction. for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. // After compaction, we should have 3 files, with size 4, 0.4, 2. ASSERT_EQ(NumSortedRuns(), 3); @@ -1050,10 +1024,10 @@ // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one // more file at level-0, which should trigger level-0 compaction. 
for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990))); + ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990))); key_idx++; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Level-0 compaction is triggered, but no file will be picked up. ASSERT_EQ(NumSortedRuns(), 4); } @@ -1082,8 +1056,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 110000U * 2 * 0.9); @@ -1094,8 +1068,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 110000 * 4 * 0.9); @@ -1107,8 +1081,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 110000 * 6 * 0.9); @@ -1120,8 +1094,8 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2); } @@ -1150,13 +1124,13 @@ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000))); key_idx++; } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2); } -#ifndef ROCKSDB_VALGRIND_RUN +#if 
!defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) // Test that checks trivial move in universal compaction TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) { int32_t trivial_move = 0; @@ -1197,7 +1171,7 @@ std::vector values; ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(trivial_move, 0); ASSERT_GT(non_trivial_move, 0); @@ -1243,13 +1217,13 @@ std::vector values; ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(trivial_move, 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) { Options options = CurrentOptions(); @@ -1258,7 +1232,7 @@ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleUniversal; options.compaction_options_universal.size_ratio = 5; options.write_buffer_size = 111 << 10; // 114KB @@ -1267,12 +1241,14 @@ options.num_levels = 1; std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) { + // Delete archival files. 
+ for (size_t i = 0; i < filenames.size(); ++i) { + ASSERT_OK( + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i])); + } + ASSERT_OK(env_->DeleteDir(options.db_paths[1].path)); } - env_->DeleteDir(options.db_paths[1].path); Reopen(options); Random rnd(301); @@ -1360,7 +1336,7 @@ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_style = kCompactionStyleUniversal; options.compaction_options_universal.size_ratio = 10; options.write_buffer_size = 111 << 10; // 114KB @@ -1524,18 +1500,19 @@ options.num_levels = 1; options.write_buffer_size = 200 << 10; // 200KB options.level0_file_num_compaction_trigger = 3; - options.memtable_factory.reset(new SpecialSkipListFactory(KNumKeysPerFile)); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(KNumKeysPerFile)); options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i <= max_key1; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Stage 2: reopen with universal compaction, num_levels=4 options.compaction_style = kCompactionStyleUniversal; @@ -1548,12 +1525,12 @@ // Insert more keys for (int i = max_key1 + 1; i <= max_key2; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, Key(i), 
rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(max_key2); // Compaction to non-L0 has happened. @@ -1568,7 +1545,8 @@ compact_options.change_level = true; compact_options.target_level = 0; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); // Need to restart it once to remove higher level records in manifest. ReopenWithColumnFamilies({"default", "pikachu"}, options); // Final reopen @@ -1580,12 +1558,12 @@ // Insert more keys for (int i = max_key2 + 1; i <= max_key3; i++) { // each value is 10K - ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(max_key3); } @@ -1604,15 +1582,17 @@ options.level0_file_num_compaction_trigger = 2; options.num_levels = 1; options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) { + // Delete archival files. 
+ for (size_t i = 0; i < filenames.size(); ++i) { + ASSERT_OK( + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i])); + } + ASSERT_OK(env_->DeleteDir(options.db_paths[1].path)); } - env_->DeleteDir(options.db_paths[1].path); Reopen(options); Random rnd(301); @@ -1700,6 +1680,7 @@ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; + options.max_background_compactions = 2; options.num_levels = num_levels_; options.write_buffer_size = 100 << 10; // 100KB options.target_file_size_base = 32 << 10; // 32KB @@ -1708,6 +1689,10 @@ options.compaction_options_universal.max_size_amplification_percent = 110; DestroyAndReopen(options); + // Need to get a token to enable compaction parallelism up to + // `max_background_compactions` jobs. + auto pressure_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {// wait for the full compaction to be picked before adding files intended // for the second one. @@ -1727,14 +1712,14 @@ // use no_wait above because that one waits for flush and compaction. We // don't want to wait for compaction because the full compaction is // intentionally blocked while more files are flushed. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } if (i == 0) { TEST_SYNC_POINT( "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // First compaction should output to bottom level. Second should output to L0 // since older L0 files pending compaction prevent it from being placed lower. 
@@ -1752,7 +1737,7 @@ const int kNumFilesTrigger = 8; Options options = CurrentOptions(); options.memtable_factory.reset( - new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2; options.compaction_options_universal.max_size_amplification_percent = static_cast(-1); @@ -1773,7 +1758,7 @@ int key_idx = 0; GenerateNewFile(&rnd, &key_idx); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Compacting the first four files was enough to bring the score below one so // there's no need to schedule any more compactions. ASSERT_EQ(1, num_compactions_attempted); @@ -1803,9 +1788,9 @@ auto stop_token = dbfull()->TEST_write_controler().GetCompactionPressureToken(); - Put("key", "val"); - Flush(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1); ColumnFamilyMetaData cf_meta; ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily(); @@ -1829,10 +1814,10 @@ TEST_SYNC_POINT( "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"); for (int i = 0; i < 2; ++i) { - Put("key", "val"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); compact_files_thread.join(); } @@ -1863,7 +1848,7 @@ DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); MakeTables(3, "p", "q", 1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(2, TotalLiveFiles(1)); ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); @@ -1872,7 +1857,7 @@ CompactRangeOptions compact_options; compact_options.target_path_id 
= 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -1895,7 +1880,7 @@ // Full compaction to DB path 0 compact_options.target_path_id = 0; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_EQ(1, TotalLiveFiles(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); @@ -1932,27 +1917,28 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(6), 0); } +#if defined(ENABLE_SINGLE_LEVEL_DTC) TEST_F(DBTestUniversalCompaction2, SingleLevel) { const int kNumKeys = 3000; const int kWindowSize = 100; @@ -1974,23 +1960,24 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - 
kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); } +#endif // ENABLE_SINGLE_LEVEL_DTC TEST_F(DBTestUniversalCompaction2, MultipleLevels) { const int kWindowSize = 100; @@ -2011,50 +1998,50 @@ // during flush int i; for (i = 0; i < 500; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 500; i < 1000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1000; i < 1500; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1500; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(6), 0); for (i = 1999; i < 2333; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2333; i < 2666; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2666; i < 2999; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_EQ(0, NumTableFilesAtLevel(2)); @@ -2083,
+2070,23 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2000; i < 3000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 3500; i < 4000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2900; i < 3100; ++i) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(2, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(6), 0); } @@ -2125,23 +2112,23 @@ // during flush int i; for (i = 0; i < 2000; ++i) { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(6)); ASSERT_GT(NumTableFilesAtLevel(5), 0); @@ -2150,7 +2137,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { Options options; options.compaction_style = kCompactionStyleUniversal; - + options.env = env_; KeepFilterFactory* filter = new KeepFilterFactory(true); options.compaction_filter_factory.reset(filter); Reopen(options); @@ -2182,9 +2169,11 @@ opts.compaction_options_universal.max_size_amplification_percent = 200; opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days opts.num_levels = 5; - env_->addon_time_.store(0); + 
env_->SetMockSleep(); Reopen(opts); + // NOTE: Presumed unnecessary and removed: resetting mock time in env + int periodic_compactions = 0; int start_level = -1; int output_level = -1; @@ -2203,16 +2192,16 @@ // Case 1: Oldest flushed file excceeds periodic compaction threshold. ASSERT_OK(Put("foo", "bar")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(0, periodic_compactions); // Move clock forward so that the flushed file would qualify periodic // compaction. - env_->addon_time_.store(48 * 60 * 60 + 100); + env_->MockSleepForSeconds(48 * 60 * 60 + 100); // Another flush would trigger compaction the oldest file. ASSERT_OK(Put("foo", "bar2")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); ASSERT_EQ(0, start_level); @@ -2222,16 +2211,16 @@ periodic_compactions = 0; // A flush doesn't trigger a periodic compaction when threshold not hit ASSERT_OK(Put("foo", "bar2")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, periodic_compactions); // After periodic compaction threshold hits, a flush will trigger // a compaction ASSERT_OK(Put("foo", "bar2")); - env_->addon_time_.fetch_add(48 * 60 * 60 + 100); - Flush(); - dbfull()->TEST_WaitForCompact(); + env_->MockSleepForSeconds(48 * 60 * 60 + 100); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); ASSERT_EQ(0, start_level); ASSERT_EQ(4, output_level); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_wal_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_wal_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_wal_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_wal_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,26 +8,58 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" -#include "env/composite_env_wrapper.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { -class DBWALTest : public DBTestBase { - public: - DBWALTest() : DBTestBase("/db_wal_test") {} +class DBWALTestBase : public DBTestBase { + protected: + explicit DBWALTestBase(const std::string& dir_name) + : DBTestBase(dir_name, /*env_do_fsync=*/true) {} #if defined(ROCKSDB_PLATFORM_POSIX) + public: +#if defined(ROCKSDB_FALLOCATE_PRESENT) + bool IsFallocateSupported() { + // Test fallocate support of running file system. + // Skip this test if fallocate is not supported. + std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + assert(fd > 0); + int alloc_status = fallocate(fd, 0, 0, 1); + int err_number = errno; + close(fd); + assert(env_->DeleteFile(fname_test_fallocate) == Status::OK()); + if (err_number == ENOSYS || err_number == EOPNOTSUPP) { + fprintf(stderr, "Skipped preallocated space check: %s\n", + errnoStr(err_number).c_str()); + return false; + } + assert(alloc_status == 0); + return true; + } +#endif // ROCKSDB_FALLOCATE_PRESENT + uint64_t GetAllocatedFileSize(std::string file_name) { struct stat sbuf; int err = stat(file_name.c_str(), &sbuf); assert(err == 0); return sbuf.st_blocks * 512; } -#endif +#endif // ROCKSDB_PLATFORM_POSIX +}; + +class DBWALTest : public DBWALTestBase { + public: + DBWALTest() : DBWALTestBase("/db_wal_test") {} }; // A SpecialEnv enriched to give more insight about deleted files @@ -40,8 +72,8 @@ InstrumentedMutexLock l(&env_mutex_); if (f == skipped_wal) { deleted_wal_reopened = true; - if 
(IsWAL(f) && largetest_deleted_wal.size() != 0 && - f.compare(largetest_deleted_wal) <= 0) { + if (IsWAL(f) && largest_deleted_wal.size() != 0 && + f.compare(largest_deleted_wal) <= 0) { gap_in_wals = true; } } @@ -55,9 +87,9 @@ // remember its name partly because the application might attempt to // delete the file again. if (skipped_wal.size() != 0 && skipped_wal != fname) { - if (largetest_deleted_wal.size() == 0 || - largetest_deleted_wal.compare(fname) < 0) { - largetest_deleted_wal = fname; + if (largest_deleted_wal.size() == 0 || + largest_deleted_wal.compare(fname) < 0) { + largest_deleted_wal = fname; } } else { skipped_wal = fname; @@ -75,7 +107,7 @@ // the wal whose actual delete was skipped by the env std::string skipped_wal = ""; // the largest WAL that was requested to be deleted - std::string largetest_deleted_wal = ""; + std::string largest_deleted_wal = ""; // number of WALs that were successfully deleted std::atomic deleted_wal_cnt = {0}; // the WAL whose delete from fs was skipped is reopened during recovery @@ -86,7 +118,8 @@ class DBWALTestWithEnrichedEnv : public DBTestBase { public: - DBWALTestWithEnrichedEnv() : DBTestBase("/db_wal_test") { + DBWALTestWithEnrichedEnv() + : DBTestBase("db_wal_test", /*env_do_fsync=*/true) { enriched_env_ = new EnrichedSpecialEnv(env_->target()); auto options = CurrentOptions(); options.env = enriched_env_; @@ -330,18 +363,319 @@ } while (ChangeWalOptions()); } +TEST_F(DBWALTest, RecoverWithBlob) { + // Write a value that's below the prospective size limit for blobs and another + // one that's above. Note that blob files are not actually enabled at this + // point. 
+ constexpr uint64_t min_blob_size = 10; + + constexpr char short_value[] = "short"; + static_assert(sizeof(short_value) - 1 < min_blob_size, + "short_value too long"); + + constexpr char long_value[] = "long_value"; + static_assert(sizeof(long_value) - 1 >= min_blob_size, + "long_value too short"); + + ASSERT_OK(Put("key1", short_value)); + ASSERT_OK(Put("key2", long_value)); + + // There should be no files just yet since we haven't flushed. + { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled. A new table file/blob file + // pair should be written during recovery. 
+ Options options; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + ASSERT_EQ(Get("key1"), short_value); + ASSERT_EQ(Get("key2"), long_value); + + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_EQ(l0_files.size(), 1); + + const FileMetaData* const table_file = l0_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), "key1"); + ASSERT_EQ(table_file->largest.user_key(), "key2"); + ASSERT_EQ(table_file->fd.smallest_seqno, 1); + ASSERT_EQ(table_file->fd.largest_seqno, 2); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 1); + +#ifndef ROCKSDB_LITE + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_FALSE(compaction_stats.empty()); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); + + const uint64_t* const cf_stats_value = 
internal_stats->TEST_GetCFStatsValue(); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); +#endif // ROCKSDB_LITE +} + +TEST_F(DBWALTest, RecoverWithBlobMultiSST) { + // Write several large (4 KB) values without flushing. Note that blob files + // are not actually enabled at this point. + std::string large_value(1 << 12, 'a'); + + constexpr int num_keys = 64; + + for (int i = 0; i < num_keys; ++i) { + ASSERT_OK(Put(Key(i), large_value)); + } + + // There should be no files just yet since we haven't flushed. + { + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled and write buffer size set to a + // smaller value. Multiple table files+blob files should be written and added + // to the Version during recovery. 
+ Options options; + options.write_buffer_size = 1 << 16; // 64 KB + options.enable_blob_files = true; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + for (int i = 0; i < num_keys; ++i) { + ASSERT_EQ(Get(Key(i)), large_value); + } + + VersionSet* const versions = dbfull()->GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_GT(l0_files.size(), 1); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_GT(blob_files.size(), 1); + + ASSERT_EQ(l0_files.size(), blob_files.size()); +} + +TEST_F(DBWALTest, WALWithChecksumHandoff) { +#ifndef ROCKSDB_ASSERT_STATUS_CHECKED + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + do { + Options options = CurrentOptions(); + + options.checksum_handoff_file_types.Add(FileType::kWalFile); + options.env = fault_fs_env.get(); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + + CreateAndReopenWithCF({"pikachu"}, options); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + 
writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Both value's should be present. + ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); + + writeOpt.disableWAL = true; + // This put, data is persisted by Flush + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + writeOpt.disableWAL = false; + // Data is persisted in the WAL + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3")); + // The hash does not match, write fails + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Due to the write failure, Get should not find + ASSERT_NE("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "zoo")); + ASSERT_EQ("v3", Get(1, "bar")); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + // Each write will be similated as corrupted. + fault_fs->IngestDataCorruptionBeforeWrite(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4")); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_NE("v4", Get(1, "foo")); + ASSERT_NE("v4", Get(1, "bar")); + fault_fs->NoDataCorruptionBeforeWrite(); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + // The file system does not provide checksum method and verification. 
+ writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v5", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "bar")); + + Destroy(options); + } while (ChangeWalOptions()); +#endif // ROCKSDB_ASSERT_STATUS_CHECKED +} + +class DBRecoveryTestBlobError + : public DBWALTest, + public testing::WithParamInterface { + public: + DBRecoveryTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) { + // Write a value. Note that blob files are not actually enabled at this point. + ASSERT_OK(Put("key", "blob")); + + // Reopen with blob files enabled but make blob file writing fail during + // recovery. + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options; + options.enable_blob_files = true; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + ASSERT_NOK(TryReopen(options)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Make sure the files generated by the failed recovery have been deleted. 
+ std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kTableFile; + + if (!ParseFileName(file, &number, &type)) { + continue; + } + + ASSERT_NE(type, kTableFile); + ASSERT_NE(type, kBlobFile); + } +} + TEST_F(DBWALTest, IgnoreRecoveredLog) { std::string backup_logs = dbname_ + "/backup_logs"; do { // delete old files in backup_logs directory - env_->CreateDirIfMissing(backup_logs); + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); std::vector old_files; - env_->GetChildren(backup_logs, &old_files); + ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." && file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } Options options = CurrentOptions(); options.create_if_missing = true; @@ -359,11 +693,9 @@ // copy the logs to backup std::vector logs; - env_->GetChildren(options.wal_dir, &logs); + ASSERT_OK(env_->GetChildren(options.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -374,9 +706,7 @@ // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // this should ignore the log files, recovery should not happen again // if the recovery happens, the same merge operator would be called twice, @@ -390,11 +720,9 @@ Close(); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // assert that we successfully recovered only from logs, even though we // destroyed the DB @@ -405,16 +733,14 @@ // Recovery will fail if DB directory doesn't exist. Destroy(options); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - // we won't be needing this file no more - env_->DeleteFile(backup_logs + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log)); } Status s = TryReopen(options); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(s); Destroy(options); } while (ChangeWalOptions()); } @@ -452,9 +778,9 @@ called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -471,9 +797,9 @@ called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -491,9 +817,9 @@ called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -512,9 +838,9 @@ 
called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -522,7 +848,10 @@ #endif // !(defined NDEBUG) || !defined(OS_WIN) #ifndef ROCKSDB_LITE -TEST_F(DBWALTest, FullPurgePreservesRecycledLog) { +TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) { + // TODO(ajkr): Disabled until WAL recycling is fixed for + // `kPointInTimeRecovery`. + // For github issue #1303 for (int i = 0; i < 2; ++i) { Options options = CurrentOptions(); @@ -558,7 +887,10 @@ } } -TEST_F(DBWALTest, FullPurgePreservesLogPendingReuse) { +TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) { + // TODO(ajkr): Disabled until WAL recycling is fixed for + // `kPointInTimeRecovery`. + // Ensures full purge cannot delete a WAL while it's in the process of being // recycled. In particular, we force the full purge after a file has been // chosen for reuse, but before it has been renamed. 
@@ -734,7 +1066,7 @@ // Make 'dobrynia' to be flushed and new WAL file to be created ASSERT_OK(Put(2, Key(10), DummyString(7500000))); ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); { auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), static_cast(1)); @@ -788,7 +1120,7 @@ // Make 'nikitich' memtable to be flushed ASSERT_OK(Put(3, Key(10), DummyString(1002400))); ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // 4 memtable are not flushed, 1 sst file { auto tables = ListTableFiles(env_, dbname_); @@ -808,7 +1140,7 @@ ASSERT_OK(Put(3, Key(10), DummyString(1002400))); // make it flush ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // There are still 4 memtable not flushed, and 2 sst tables ASSERT_OK(Put(0, Key(1), DummyString(1))); ASSERT_OK(Put(1, Key(1), DummyString(1))); @@ -856,10 +1188,10 @@ for (uint64_t b = 0; b < kNumBatches; b++) { batch.Clear(); for (int i = 0; i < kBatchSize; i++) { - batch.Put(Key(i), DummyString(128)); + ASSERT_OK(batch.Put(Key(i), DummyString(128))); } - dbfull()->Write(wo, &batch); + ASSERT_OK(dbfull()->Write(wo, &batch)); } ASSERT_OK(dbfull()->SyncWAL()); @@ -887,7 +1219,7 @@ ASSERT_OK(Flush(0)); ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5 ASSERT_EQ("v5", Get(0, "key")); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); // Simulate a crash. 
fault_env->SetFilesystemActive(false); Close(); @@ -905,16 +1237,16 @@ class RecoveryTestHelper { public: // Number of WAL files to generate - static const int kWALFilesCount = 10; + static constexpr int kWALFilesCount = 10; // Starting number for the WAL file name like 00010.log - static const int kWALFileOffset = 10; + static constexpr int kWALFileOffset = 10; // Keys to be written per WAL file - static const int kKeysPerWALFile = 133; + static constexpr int kKeysPerWALFile = 133; // Size of the value - static const int kValueSize = 96; + static constexpr int kValueSize = 96; // Create WAL files with values filled in - static void FillData(DBWALTest* test, const Options& options, + static void FillData(DBWALTestBase* test, const Options& options, const size_t wal_count, size_t* count) { // Calling internal functions requires sanitized options. Options sanitized_options = SanitizeOptions(test->dbname_, options); @@ -923,29 +1255,31 @@ *count = 0; std::shared_ptr table_cache = NewLRUCache(50, 0); - EnvOptions env_options; + FileOptions file_options; WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); std::unique_ptr versions; std::unique_ptr wal_manager; WriteController write_controller; - versions.reset(new VersionSet(test->dbname_, &db_options, env_options, + versions.reset(new VersionSet(test->dbname_, &db_options, file_options, table_cache.get(), &write_buffer_manager, &write_controller, - /*block_cache_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")); - wal_manager.reset(new WalManager(db_options, env_options)); + wal_manager.reset( + new WalManager(db_options, file_options, /*io_tracer=*/nullptr)); std::unique_ptr current_log_writer; for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { uint64_t current_log_number = j; std::string fname = LogFileName(test->dbname_, current_log_number); - std::unique_ptr file; - ASSERT_OK(db_options.env->NewWritableFile(fname, &file, 
env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(), + fname, file_options, &file_writer, + nullptr)); current_log_writer.reset( new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0)); @@ -954,12 +1288,13 @@ for (int i = 0; i < kKeysPerWALFile; i++) { std::string key = "key" + ToString((*count)++); std::string value = test->DummyString(kValueSize); - assert(current_log_writer.get() != nullptr); + ASSERT_NE(current_log_writer.get(), nullptr); uint64_t seq = versions->LastSequence() + 1; batch.Clear(); - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(current_log_writer->AddRecord( + WriteBatchInternal::Contents(&batch))); versions->SetLastAllocatedSequence(seq); versions->SetLastPublishedSequence(seq); versions->SetLastSequence(seq); @@ -968,7 +1303,7 @@ } // Recreate and fill the store with some data - static size_t FillData(DBWALTest* test, Options* options) { + static size_t FillData(DBWALTestBase* test, Options* options) { options->create_if_missing = true; test->DestroyAndReopen(*options); test->Close(); @@ -979,7 +1314,7 @@ } // Read back all the keys we wrote and return the number of keys found - static size_t GetData(DBWALTest* test) { + static size_t GetData(DBWALTestBase* test) { size_t count = 0; for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) { if (test->Get("key" + ToString(i)) != "NOT_FOUND") { @@ -990,7 +1325,7 @@ } // Manuall corrupt the specified WAL - static void CorruptWAL(DBWALTest* test, const Options& options, + static void CorruptWAL(DBWALTestBase* test, const Options& options, const double off, const double len, const int wal_file_id, const bool trunc = false) { 
Env* env = options.env; @@ -1007,104 +1342,110 @@ test->Close(); #endif if (trunc) { - ASSERT_EQ(0, truncate(fname.c_str(), static_cast(size * off))); + ASSERT_OK( + test::TruncateFile(env, fname, static_cast(size * off))); } else { - InduceCorruption(fname, static_cast(size * off + 8), - static_cast(size * len)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(size * off + 8), + static_cast(size * len), false)); } } +}; - // Overwrite data with 'a' from offset for length len - static void InduceCorruption(const std::string& filename, size_t offset, - size_t len) { - ASSERT_GT(len, 0U); - - int fd = open(filename.c_str(), O_RDWR); - - // On windows long is 32-bit - ASSERT_LE(offset, std::numeric_limits::max()); - - ASSERT_GT(fd, 0); - ASSERT_EQ(offset, lseek(fd, static_cast(offset), SEEK_SET)); - - void* buf = alloca(len); - memset(buf, 'b', len); - ASSERT_EQ(len, write(fd, buf, static_cast(len))); +class DBWALTestWithParams + : public DBWALTestBase, + public ::testing::WithParamInterface> { + public: + DBWALTestWithParams() : DBWALTestBase("/db_wal_test_with_params") {} +}; - close(fd); - } +INSTANTIATE_TEST_CASE_P( + Wal, DBWALTestWithParams, + ::testing::Combine(::testing::Bool(), ::testing::Range(0, 4, 1), + ::testing::Range(RecoveryTestHelper::kWALFileOffset, + RecoveryTestHelper::kWALFileOffset + + RecoveryTestHelper::kWALFilesCount, + 1))); + +class DBWALTestWithParamsVaryingRecoveryMode + : public DBWALTestBase, + public ::testing::WithParamInterface< + std::tuple> { + public: + DBWALTestWithParamsVaryingRecoveryMode() + : DBWALTestBase("/db_wal_test_with_params_mode") {} }; +INSTANTIATE_TEST_CASE_P( + Wal, DBWALTestWithParamsVaryingRecoveryMode, + ::testing::Combine( + ::testing::Bool(), ::testing::Range(0, 4, 1), + ::testing::Range(RecoveryTestHelper::kWALFileOffset, + RecoveryTestHelper::kWALFileOffset + + RecoveryTestHelper::kWALFilesCount, + 1), + ::testing::Values(WALRecoveryMode::kTolerateCorruptedTailRecords, + 
WALRecoveryMode::kAbsoluteConsistency, + WALRecoveryMode::kPointInTimeRecovery, + WALRecoveryMode::kSkipAnyCorruptedRecords))); + // Test scope: // - We expect to open the data store when there is incomplete trailing writes // at the end of any of the logs // - We do not expect to open the data store for corruption -TEST_F(DBWALTest, kTolerateCorruptedTailRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 3; i++) { /* Corruption offset position */ - for (int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, &options); - // test checksum failure or parsing - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, /*wal=*/j, trunc); - - if (trunc) { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - const size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_TRUE(i == 0 || recovered_row_count > 0); - ASSERT_LT(recovered_row_count, row_count); - } else { - options.wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords; - ASSERT_NOK(TryReopen(options)); - } - } - } +TEST_P(DBWALTestWithParams, kTolerateCorruptedTailRecords) { + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file + + // Fill data for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + // test checksum failure or parsing + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + + options.wal_recovery_mode = 
WALRecoveryMode::kTolerateCorruptedTailRecords; + if (trunc) { + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + const size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_TRUE(corrupt_offset == 0 || recovered_row_count > 0); + ASSERT_LT(recovered_row_count, row_count); + } else { + ASSERT_NOK(TryReopen(options)); } } // Test scope: // We don't expect the data store to be opened if there is any corruption // (leading, middle or trailing -- incomplete writes or corruption) -TEST_F(DBWALTest, kAbsoluteConsistency) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - +TEST_P(DBWALTestWithParams, kAbsoluteConsistency) { // Verify clean slate behavior Options options = CurrentOptions(); const size_t row_count = RecoveryTestHelper::FillData(this, &options); - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; options.create_if_missing = false; ASSERT_OK(TryReopen(options)); ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count); - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset position */ - if (trunc && i == 0) { - continue; - } - - for (int j = jstart; j < jend; j++) { /* wal files */ - // fill with new date - RecoveryTestHelper::FillData(this, &options); - // corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - // verify - options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; - options.create_if_missing = false; - ASSERT_NOK(TryReopen(options)); - } - } + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file + + if (trunc && corrupt_offset == 0) { + return; } + + // fill with new date + RecoveryTestHelper::FillData(this, &options); + // corrupt the wal + 
RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + // verify + options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; + options.create_if_missing = false; + ASSERT_NOK(TryReopen(options)); } // Test scope: @@ -1129,100 +1470,186 @@ ASSERT_OK(Put(1, "key3", "val3")); // Corrupt WAL at location of key3 - RecoveryTestHelper::InduceCorruption( - fname, static_cast(offset_to_corrupt), static_cast(4)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(offset_to_corrupt), + 4, false)); ASSERT_OK(Put(2, "key4", "val4")); ASSERT_OK(Put(1, "key5", "val5")); - Flush(2); + ASSERT_OK(Flush(2)); // PIT recovery & verify options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options)); } +TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { + Options options = CurrentOptions(); + options.env = env_; + options.track_and_verify_wals_in_manifest = true; + // The following make sure there are two bg flush threads. 
+ options.max_background_jobs = 8; + + const std::string cf1_name("cf1"); + CreateAndReopenWithCF({cf1_name}, options); + assert(handles_.size() == 2); + + { + dbfull()->TEST_LockMutex(); + ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes); + dbfull()->TEST_UnlockMutex(); + } + + ASSERT_OK(dbfull()->PauseBackgroundWork()); + + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value")); + + ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[1])); + + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[0])); + + bool called = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + // This callback will be called when the first bg flush thread reaches the + // point before entering the MANIFEST write queue after flushing the SST + // file. + // The purpose of the sync points here is to ensure both bg flush threads + // finish computing `min_wal_number_to_keep` before any of them updates the + // `log_number` for the column family that's being flushed. + SyncPoint::GetInstance()->SetCallBack( + "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep", + [&](void* /*arg*/) { + dbfull()->mutex()->AssertHeld(); + if (!called) { + // We are the first bg flush thread in the MANIFEST write queue. + // We set up the dependency between sync points for two threads that + // will be executing the same code. + // For the interleaving of events, see + // https://github.com/facebook/rocksdb/pull/9715. + // bg flush thread1 will release the db mutex while in the MANIFEST + // write queue. In the meantime, bg flush thread2 locks db mutex and + // computes the min_wal_number_to_keep (before thread1 writes to + // MANIFEST thus before cf1->log_number is updated). Bg thread2 joins + // the MANIFEST write queue afterwards and bg flush thread1 proceeds + // with writing to MANIFEST. 
+ called = true; + SyncPoint::GetInstance()->LoadDependency({ + {"VersionSet::LogAndApply:WriteManifestStart", + "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"}, + {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2", + "VersionSet::LogAndApply:WriteManifest"}, + }); + } else { + // The other bg flush thread has already been in the MANIFEST write + // queue, and we are after. + TEST_SYNC_POINT( + "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + + ASSERT_TRUE(called); + + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + DB* db1 = nullptr; + Status s = DB::OpenForReadOnly(options, dbname_, &db1); + ASSERT_OK(s); + assert(db1); + delete db1; +} + // Test scope: // - We expect to open data store under all circumstances // - We expect only data upto the point where the first error was encountered -TEST_F(DBWALTest, kPointInTimeRecovery) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; +TEST_P(DBWALTestWithParams, kPointInTimeRecovery) { const int maxkeys = RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile; - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Offset of corruption */ - for (int j = jstart; j < jend; j++) { /* WAL file */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, &options); - - // Corrupt the wal - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify - options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; - options.create_if_missing = 
false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); - - bool expect_data = true; - for (size_t k = 0; k < maxkeys; ++k) { - bool found = Get("key" + ToString(i)) != "NOT_FOUND"; - if (expect_data && !found) { - expect_data = false; - } - ASSERT_EQ(found, expect_data); - } + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file - const size_t min = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset); - ASSERT_GE(recovered_row_count, min); - if (!trunc && i != 0) { - const size_t max = RecoveryTestHelper::kKeysPerWALFile * - (j - RecoveryTestHelper::kWALFileOffset + 1); - ASSERT_LE(recovered_row_count, max); - } + // Fill data for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + + // Corrupt the wal + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + + // Verify + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + + // Probe data for invariants + size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_LT(recovered_row_count, row_count); + + // Verify a prefix of keys were recovered. But not in the case of full WAL + // truncation, because we have no way to know there was a corruption when + // truncation happened on record boundaries (preventing recovery holes in + // that case requires using `track_and_verify_wals_in_manifest`). 
+ if (!trunc || corrupt_offset != 0) { + bool expect_data = true; + for (size_t k = 0; k < maxkeys; ++k) { + bool found = Get("key" + ToString(k)) != "NOT_FOUND"; + if (expect_data && !found) { + expect_data = false; } + ASSERT_EQ(found, expect_data); } } + + const size_t min = RecoveryTestHelper::kKeysPerWALFile * + (wal_file_id - RecoveryTestHelper::kWALFileOffset); + ASSERT_GE(recovered_row_count, min); + if (!trunc && corrupt_offset != 0) { + const size_t max = RecoveryTestHelper::kKeysPerWALFile * + (wal_file_id - RecoveryTestHelper::kWALFileOffset + 1); + ASSERT_LE(recovered_row_count, max); + } } // Test scope: // - We expect to open the data store under all scenarios // - We expect to have recovered records past the corruption zone -TEST_F(DBWALTest, kSkipAnyCorruptedRecords) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; - - for (auto trunc : {true, false}) { /* Corruption style */ - for (int i = 0; i < 4; i++) { /* Corruption offset */ - for (int j = jstart; j < jend; j++) { /* wal files */ - // Fill data for testing - Options options = CurrentOptions(); - const size_t row_count = RecoveryTestHelper::FillData(this, &options); - - // Corrupt the WAL - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, j, trunc); - - // Verify behavior - options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords; - options.create_if_missing = false; - ASSERT_OK(TryReopen(options)); - - // Probe data for invariants - size_t recovered_row_count = RecoveryTestHelper::GetData(this); - ASSERT_LT(recovered_row_count, row_count); +TEST_P(DBWALTestWithParams, kSkipAnyCorruptedRecords) { + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file - if (!trunc) { - ASSERT_TRUE(i != 0 || recovered_row_count > 0); - } - } - } + // Fill data 
for testing + Options options = CurrentOptions(); + const size_t row_count = RecoveryTestHelper::FillData(this, &options); + + // Corrupt the WAL + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + + // Verify behavior + options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + + // Probe data for invariants + size_t recovered_row_count = RecoveryTestHelper::GetData(this); + ASSERT_LT(recovered_row_count, row_count); + + if (!trunc) { + ASSERT_TRUE(corrupt_offset != 0 || recovered_row_count > 0); } } @@ -1288,7 +1715,7 @@ for (int i = 0; i < 2; ++i) { if (i > 0) { // Flush() triggers deletion of obsolete tracked files - Flush(); + ASSERT_OK(Flush()); } VectorLogPtr log_files; ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); @@ -1330,7 +1757,7 @@ ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); // manual flush and insert again - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); ASSERT_OK(Put("foo", "foo_v3")); @@ -1351,7 +1778,9 @@ auto countWalFiles = [this]() { VectorLogPtr log_files; - dbfull()->GetSortedWalFiles(log_files); + if (!dbfull()->GetSortedWalFiles(log_files).ok()) { + return size_t{0}; + } return log_files.size(); }; @@ -1359,11 +1788,11 @@ CreateAndReopenWithCF({"one", "two"}, options); ASSERT_OK(Put(0, "key1", kSmallValue)); ASSERT_OK(Put(1, "key2", kLargeValue)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, countWalFiles()); ASSERT_OK(Put(0, "key3", kSmallValue)); ASSERT_OK(Put(2, "key4", kLargeValue)); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(2, countWalFiles()); // Reopen, insert and flush. 
@@ -1377,9 +1806,9 @@ ASSERT_OK(Put(0, "key5", kLargeValue)); ASSERT_OK(Put(1, "key6", kLargeValue)); ASSERT_EQ(3, countWalFiles()); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(2, "key7", kLargeValue)); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); ASSERT_EQ(4, countWalFiles()); // Reopen twice and validate. @@ -1401,9 +1830,8 @@ // 2. Open with avoid_flush_during_recovery = true; // 3. Append more data without flushing, which creates new WAL log. // 4. Open again. See if it can correctly handle previous corruption. -TEST_F(DBWALTest, RecoverFromCorruptedWALWithoutFlush) { - const int jstart = RecoveryTestHelper::kWALFileOffset; - const int jend = jstart + RecoveryTestHelper::kWALFilesCount; +TEST_P(DBWALTestWithParamsVaryingRecoveryMode, + RecoverFromCorruptedWALWithoutFlush) { const int kAppendKeys = 100; Options options = CurrentOptions(); options.avoid_flush_during_recovery = true; @@ -1422,60 +1850,47 @@ delete iter; return data; }; - for (auto& mode : wal_recovery_mode_string_map) { - options.wal_recovery_mode = mode.second; - for (auto trunc : {true, false}) { - for (int i = 0; i < 4; i++) { - for (int j = jstart; j < jend; j++) { - // Create corrupted WAL - RecoveryTestHelper::FillData(this, &options); - RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3, - /*len%=*/.1, /*wal=*/j, trunc); - // Skip the test if DB won't open. - if (!TryReopen(options).ok()) { - ASSERT_TRUE(options.wal_recovery_mode == - WALRecoveryMode::kAbsoluteConsistency || - (!trunc && - options.wal_recovery_mode == - WALRecoveryMode::kTolerateCorruptedTailRecords)); - continue; - } - ASSERT_OK(TryReopen(options)); - // Append some more data. - for (int k = 0; k < kAppendKeys; k++) { - std::string key = "extra_key" + ToString(k); - std::string value = DummyString(RecoveryTestHelper::kValueSize); - ASSERT_OK(Put(key, value)); - } - // Save data for comparison. - auto data = getAll(); - // Reopen. Verify data. 
- ASSERT_OK(TryReopen(options)); - auto actual_data = getAll(); - ASSERT_EQ(data, actual_data); - } - } - } + + bool trunc = std::get<0>(GetParam()); // Corruption style + // Corruption offset position + int corrupt_offset = std::get<1>(GetParam()); + int wal_file_id = std::get<2>(GetParam()); // WAL file + WALRecoveryMode recovery_mode = std::get<3>(GetParam()); + + options.wal_recovery_mode = recovery_mode; + // Create corrupted WAL + RecoveryTestHelper::FillData(this, &options); + RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3, + /*len%=*/.1, wal_file_id, trunc); + // Skip the test if DB won't open. + if (!TryReopen(options).ok()) { + ASSERT_TRUE(options.wal_recovery_mode == + WALRecoveryMode::kAbsoluteConsistency || + (!trunc && options.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords)); + return; } + ASSERT_OK(TryReopen(options)); + // Append some more data. + for (int k = 0; k < kAppendKeys; k++) { + std::string key = "extra_key" + ToString(k); + std::string value = DummyString(RecoveryTestHelper::kValueSize); + ASSERT_OK(Put(key, value)); + } + // Save data for comparison. + auto data = getAll(); + // Reopen. Verify data. + ASSERT_OK(TryReopen(options)); + auto actual_data = getAll(); + ASSERT_EQ(data, actual_data); } // Tests that total log size is recovered if we set // avoid_flush_during_recovery=true. // Flush should trigger if max_total_wal_size is reached. 
TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { - class TestFlushListener : public EventListener { - public: - std::atomic count{0}; - - TestFlushListener() = default; - - void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { - count++; - assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason); - } - }; - std::shared_ptr test_listener = - std::make_shared(); + auto test_listener = std::make_shared(); + test_listener->expected_flush_reason = FlushReason::kWalFull; constexpr size_t kKB = 1024; constexpr size_t kMB = 1024 * 1024; @@ -1515,7 +1930,9 @@ 1 * kMB); // Write one more key to trigger flush. ASSERT_OK(Put(0, "foo", "v2")); - dbfull()->TEST_WaitForFlushMemTable(); + for (auto* h : handles_) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h)); + } // Flushed two column families. ASSERT_EQ(2, test_listener->count.load()); } @@ -1527,7 +1944,16 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { constexpr size_t kKB = 1024; Options options = CurrentOptions(); + options.env = env_; options.avoid_flush_during_recovery = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + DestroyAndReopen(options); size_t preallocated_size = dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); @@ -1549,6 +1975,175 @@ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), preallocated_size); } +// Tests that we will truncate the preallocated space of the last log from +// previous. 
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + options.avoid_flush_during_shutdown = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + // The log file has preallocated space. + Close(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // After the flush during Open, the log file should get deleted. However, + // if the process is in a crash loop, the log file may not get + // deleted and thte preallocated space will keep accumulating. So we need + // to ensure it gets trtuncated. 
+ EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + Close(); + std::vector filenames; + std::string last_log; + uint64_t last_log_num = 0; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + for (auto fname : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(fname, &number, &type, nullptr)) { + if (type == kWalFile && number > last_log_num) { + last_log = fname; + } + } + } + ASSERT_NE(last_log, ""); + last_log = dbname_ + '/' + last_log; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", + [](void* arg) { *(reinterpret_cast(arg)) = 0; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Preallocate space for the empty log file. This could happen if WAL data + // was buffered in memory and the process crashed. 
+ std::unique_ptr log_file; + ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions())); + log_file->SetPreallocationBlockSize(preallocated_size); + log_file->PrepareWrite(0, 4096); + log_file.reset(); + + ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size); + + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // The preallocated space should be truncated. + EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + // create DB and close with file truncate disabled + std::atomic_bool enable_truncate{false}; + + SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", [&](void* arg) { + if (!enable_truncate) { + *(reinterpret_cast(arg)) = 0; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. 
+ auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName()); + ASSERT_GE(db_size, preallocated_size); + Close(); + + // enable truncate and open DB as readonly, the file should not be truncated + // and DB size is not changed. + enable_truncate = true; + ASSERT_OK(ReadOnlyReopen(options)); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName()); + // The preallocated space should NOT be truncated. + // the DB size is almost the same. + ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size, + db_size / 100); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif // ROCKSDB_FALLOCATE_PRESENT #endif // ROCKSDB_PLATFORM_POSIX @@ -1566,9 +2161,9 @@ wo.disableWAL = false; WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); batch.MarkWalTerminationPoint(); - batch.Put("foo2", "bar2"); + ASSERT_OK(batch.Put("foo2", "bar2")); ASSERT_OK(dbfull()->Write(wo, &batch)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,3217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "test_util/testutil.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { +class DBBasicTestWithTimestampBase : public DBTestBase { + public: + explicit DBBasicTestWithTimestampBase(const std::string& dbname) + : DBTestBase(dbname, /*env_do_fsync=*/true) {} + + protected: + static std::string Key1(uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return ret; + } + + static std::string KeyWithPrefix(std::string prefix, uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return prefix + ret; + } + + static std::vector ConvertStrToSlice( + std::vector& strings) { + std::vector ret; + for (const auto& s : strings) { + ret.emplace_back(s); + } + return ret; + } + + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return -CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - 
timestamp_size(), timestamp_size())); + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + if (a_has_ts) { + assert(a.size() >= timestamp_size()); + } + if (b_has_ts) { + assert(b.size() >= timestamp_size()); + } + Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a; + Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b; + return cmp_without_ts_->Compare(lhs, rhs); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + const size_t kSize = ts1.size(); + std::unique_ptr ts1_buf(new char[kSize]); + memcpy(ts1_buf.get(), ts1.data(), ts1.size()); + std::unique_ptr ts2_buf(new char[kSize]); + memcpy(ts2_buf.get(), ts2.data(), ts2.size()); + Slice ts1_copy = Slice(ts1_buf.get(), kSize); + Slice ts2_copy = Slice(ts2_buf.get(), kSize); + auto* ptr1 = const_cast(&ts1_copy); + auto* ptr2 = const_cast(&ts2_copy); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return -1; + } else if (high1 > high2) { + return 1; + } + if (low1 < low2) { + return -1; + } else if (low1 > low2) { + return 1; + } + return 0; + } + }; + + std::string Timestamp(uint64_t low, uint64_t high) { + std::string ts; + PutFixed64(&ts, low); + PutFixed64(&ts, high); + return ts; + } + + void CheckIterUserEntry(const Iterator* it, const Slice& expected_key, + ValueType expected_value_type, + const Slice& expected_value, + const Slice& expected_ts) const { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + 
ASSERT_EQ(expected_key, it->key()); + if (kTypeValue == expected_value_type) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); + } + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + SequenceNumber expected_seq, ValueType expected_val_type, + const Slice& expected_value, const Slice& expected_ts) { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + std::string ukey_and_ts; + ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); + ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + ParsedInternalKey parsed_ikey; + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); + ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key); + ASSERT_EQ(expected_val_type, parsed_ikey.type); + ASSERT_EQ(expected_seq, parsed_ikey.sequence); + if (expected_val_type == kTypeValue) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); + } + + void CheckIterEntry(const Iterator* it, const Slice& expected_ukey, + ValueType expected_val_type, const Slice& expected_value, + const Slice& expected_ts) { + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + std::string ukey_and_ts; + ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); + ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + + ParsedInternalKey parsed_ikey; + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); + ASSERT_EQ(expected_val_type, parsed_ikey.type); + ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key); + if (expected_val_type == kTypeValue) { + ASSERT_EQ(expected_value, it->value()); + } + ASSERT_EQ(expected_ts, it->timestamp()); + } +}; + +class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { + public: + DBBasicTestWithTimestamp() + : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {} +}; + +TEST_F(DBBasicTestWithTimestamp, MixedCfs) { + Options options = CurrentOptions(); + options.env = env_; + 
options.create_if_missing = true; + options.avoid_flush_during_shutdown = true; + DestroyAndReopen(options); + + Options options1 = CurrentOptions(); + options1.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options1.comparator = &test_cmp; + ColumnFamilyHandle* handle = nullptr; + Status s = db_->CreateColumnFamily(options1, "data", &handle); + ASSERT_OK(s); + + WriteBatch wb; + ASSERT_OK(wb.Put("a", "value")); + { + std::string key("a"); + std::string ts(kTimestampSize, '\0'); + std::array key_with_ts_slices{{key, ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::string value_str("value"); + Slice value_slice(value_str.data(), value_str.size()); + SliceParts value(&value_slice, 1); + ASSERT_OK(wb.Put(handle, key_with_ts, value)); + } + { + std::string ts = Timestamp(1, 0); + std::vector ts_list({Slice(), ts}); + ASSERT_OK(wb.AssignTimestamps(ts_list)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + + const auto verify_db = [this](ColumnFamilyHandle* h) { + ASSERT_EQ("value", Get("a")); + std::string ts = Timestamp(1, 0); + Slice read_ts_slice(ts); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + std::string value; + ASSERT_OK(db_->Get(read_opts, h, "a", &value)); + ASSERT_EQ("value", value); + }; + + verify_db(handle); + + delete handle; + Close(); + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back("data", options1); + options.create_if_missing = false; + s = DB::Open(options, dbname_, cf_descs, &handles_, &db_); + ASSERT_OK(s); + + verify_db(handles_[1]); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + 
WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + std::string start_str = "foo"; + std::string end_str = "foo2"; + Slice start(start_str), end(end_str); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, GcPreserveLatestVersionBelowFullHistoryLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + std::string ts_str = Timestamp(1, 0); + WriteOptions wopts; + Slice ts = ts_str; + wopts.timestamp = &ts; + ASSERT_OK(db_->Put(wopts, "k1", "v1")); + ASSERT_OK(db_->Put(wopts, "k2", "v2")); + ASSERT_OK(db_->Put(wopts, "k3", "v3")); + + ts_str = Timestamp(2, 0); + ts = ts_str; + wopts.timestamp = &ts; + ASSERT_OK(db_->Delete(wopts, "k3")); + + ts_str = Timestamp(4, 0); + ts = ts_str; + wopts.timestamp = &ts; + ASSERT_OK(db_->Put(wopts, "k1", "v5")); + + ts_str = Timestamp(3, 0); + ts = ts_str; + CompactRangeOptions cro; + cro.full_history_ts_low = &ts; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_OK(Flush()); + + ReadOptions ropts; + ropts.timestamp = &ts; + std::string value; + Status s = db_->Get(ropts, "k1", &value); + ASSERT_OK(s); + ASSERT_EQ("v1", value); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + const std::string kKey = "test kKey"; + + // Test set ts_low first 
and flush() + int current_ts_low = 5; + std::string ts_low_str = Timestamp(current_ts_low, 0); + Slice ts_low = ts_low_str; + CompactRangeOptions comp_opts; + comp_opts.full_history_ts_low = &ts_low; + comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + + auto* cfd = + static_cast_with_check(db_->DefaultColumnFamily()) + ->cfd(); + auto result_ts_low = cfd->GetFullHistoryTsLow(); + + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + for (int i = 0; i < 10; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + ASSERT_OK(Flush()); + + // TODO return a non-ok for read ts < current_ts_low and test it. + for (int i = 0; i < 10; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + if (i < current_ts_low - 1) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + } + + // Test set ts_low and then trigger compaction + for (int i = 10; i < 20; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + + ASSERT_OK(Flush()); + + current_ts_low = 15; + ts_low_str = Timestamp(current_ts_low, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + result_ts_low = cfd->GetFullHistoryTsLow(); + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + // TODO return a non-ok for read ts < current_ts_low and test it. 
+ for (int i = current_ts_low; i < 20; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + + // Test invalid compaction with range + Slice start(kKey), end(kKey); + Status s = db_->CompactRange(comp_opts, &start, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, &start, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, nullptr, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Test invalid compaction with the decreasing ts_low + ts_low_str = Timestamp(current_ts_low - 1, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + s = db_->CompactRange(comp_opts, nullptr, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + std::string ts_low_str = Timestamp(9, 0); + ASSERT_OK( + db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str)); + std::string result_ts_low; + ASSERT_OK(db_->GetFullHistoryTsLow(nullptr, &result_ts_low)); + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low_str, result_ts_low) == 0); + // test increase full_history_low backward + std::string ts_low_str_back = Timestamp(8, 0); + auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + ts_low_str_back); + ASSERT_EQ(s, Status::InvalidArgument()); + // test IncreaseFullHistoryTsLow with a timestamp whose length is longger + // than the cf's timestamp size + std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a'); + s = 
db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + ts_low_str_long); + ASSERT_EQ(s, Status::InvalidArgument()); + // test IncreaseFullHistoryTsLow with a timestamp which is null + std::string ts_low_str_null = ""; + s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + ts_low_str_null); + ASSERT_EQ(s, Status::InvalidArgument()); + // test IncreaseFullHistoryTsLow for a column family that does not enable + // timestamp + options.comparator = BytewiseComparator(); + DestroyAndReopen(options); + ts_low_str = Timestamp(10, 0); + s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str); + ASSERT_EQ(s, Status::InvalidArgument()); + // test GetFullHistoryTsLow for a column family that does not enable + // timestamp + std::string current_ts_low; + s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), ¤t_ts_low); + ASSERT_EQ(s, Status::InvalidArgument()); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(db_->Put(write_opts, Key(i), rnd.RandomString(1024))); + } + + uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 
204800); + + // test multiple ranges + std::vector ranges; + std::string start_tmp = Key(10); + std::string end_tmp = Key(20); + ranges.emplace_back(Range(start_tmp, end_tmp)); + ranges.emplace_back(Range(start, end)); + uint64_t range_sizes[2]; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, + ranges.data(), 2, range_sizes)); + + ASSERT_EQ(range_sizes[1], size); + + // Zero if not including mem table + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(500); + end = Key(600); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + // Test range boundaries + ASSERT_OK(db_->Put(write_opts, Key(1000), rnd.RandomString(1024))); + // Should include start key + start = Key(1000); + end = Key(1100); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 0); + + // Should exclude end key + start = Key(900); + end = Key(1000); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleIterate) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector start_keys = {1, 0}; + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + Slice write_ts = write_timestamps[i]; + 
write_opts.timestamp = &write_ts; + for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + // Forward iterate. + for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid(); + it->Next(), ++count, ++key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + size_t expected_count = kMaxKey - start_keys[i] + 1; + ASSERT_EQ(expected_count, count); + + // Backward iterate. + count = 0; + for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid(); + it->Prev(), ++count, --key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); + + // SeekToFirst()/SeekToLast() with lower/upper bounds. + // Then iter with lower and upper bounds. 
+ uint64_t l = 0; + uint64_t r = kMaxKey + 1; + while (l < r) { + std::string lb_str = Key1(l); + Slice lb = lb_str; + std::string ub_str = Key1(r); + Slice ub = ub_str; + read_opts.iterate_lower_bound = &lb; + read_opts.iterate_upper_bound = &ub; + it.reset(db_->NewIterator(read_opts)); + for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0; + it->Valid(); it->Next(), ++key, ++count) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(r - std::max(l, start_keys[i]), count); + + for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; + it->Valid(); it->Prev(), --key, ++count) { + CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + l += (kMaxKey / 100); + r -= (kMaxKey / 100); + } + } + Close(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Create 2 tables + for (int table = 0; table < 2; ++table) { + for (int i = 0; i < 10; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "key", Key(i))); + } + ASSERT_OK(Flush()); + } + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(2U, props.size()); + for (const auto& item : props) { + auto& user_collected = item.second->user_collected_properties; + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") != + user_collected.end()); + ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") != + user_collected.end()); + ASSERT_EQ(user_collected.at("rocksdb.timestamp_min"), Timestamp(0, 0)); + ASSERT_EQ(user_collected.at("rocksdb.timestamp_max"), Timestamp(9, 0)); + } 
+ Close(); +} +#endif // !ROCKSDB_LITE + +class DBBasicTestWithTimestampTableOptions + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + explicit DBBasicTestWithTimestampTableOptions() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_table_options") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampTableOptions, + testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)); + +TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.index_type = GetParam(); + bbto.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator cmp(kTimestampSize); + options.comparator = &cmp; + DestroyAndReopen(options); + constexpr uint64_t kNumKeys = 1024; + for (uint64_t k = 0; k < kNumKeys; ++k) { + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, Key1(k), "value" + std::to_string(k))); + } + ASSERT_OK(Flush()); + { + ReadOptions read_opts; + read_opts.total_order_seek = true; + std::string ts_str = Timestamp(2, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + // verify Get() + for (it->SeekToFirst(); it->Valid(); it->Next()) { + std::string value_from_get; + std::string key_str(it->key().data(), it->key().size()); + std::string timestamp; + ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, ×tamp)); + ASSERT_EQ(it->value(), value_from_get); + 
ASSERT_EQ(Timestamp(1, 0), timestamp); + } + + // verify MultiGet() + constexpr uint64_t step = 2; + static_assert(0 == (kNumKeys % step), + "kNumKeys must be a multiple of step"); + for (uint64_t k = 0; k < kNumKeys; k += 2) { + std::vector key_strs; + std::vector keys; + for (size_t i = 0; i < step; ++i) { + key_strs.push_back(Key1(k + i)); + } + for (size_t i = 0; i < step; ++i) { + keys.emplace_back(key_strs[i]); + } + std::vector values; + std::vector timestamps; + std::vector statuses = + db_->MultiGet(read_opts, keys, &values, ×tamps); + ASSERT_EQ(step, statuses.size()); + ASSERT_EQ(step, values.size()); + ASSERT_EQ(step, timestamps.size()); + for (uint64_t i = 0; i < step; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("value" + std::to_string(k + i), values[i]); + ASSERT_EQ(Timestamp(1, 0), timestamps[i]); + } + } + } + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + 
ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(1, 0); + ts = read_ts; + read_opts.timestamp = &ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + iter->Seek("bbb"); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + // All of the keys or this test must be longer than 3 characters + constexpr int kMinKeyLen = 3; + options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure 
the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar1")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar2")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (int i = 3; i < 9; ++i) { + ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), + "bar" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + std::string up_bound = "foo5"; // exclusive + Slice up_bound_slice = up_bound; + std::string lo_bound = "foo2"; // inclusive + Slice lo_bound_slice = lo_bound; + read_opts.iterate_upper_bound = &up_bound_slice; + read_opts.iterate_lower_bound = &lo_bound_slice; + read_opts.auto_prefix_mode = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + 
// may return invalid result. + iter->Seek("foo"); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekForPrev("g"); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + } + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), + Timestamp(4, 3)}; + const std::vector> kvs = { + std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")}; + for (const auto& ts : timestamps) { + WriteBatch wb; + for (const auto& kv : kvs) { + const std::string& key = std::get<0>(kv); + const std::string& value = std::get<1>(kv); + std::array key_with_ts_slices{{Slice(key), Slice(ts)}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{Slice(value)}}; + SliceParts values(value_slices.data(), 1); + ASSERT_OK(wb.Put(key_with_ts, values)); + } + + ASSERT_OK(wb.AssignTimestamp(ts)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + std::string read_ts_str = Timestamp(5, 3); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + + it->SeekToFirst(); + ASSERT_TRUE(it->Valid()); + it->Prev(); + ASSERT_FALSE(it->Valid()); + + it->SeekToLast(); + ASSERT_TRUE(it->Valid()); + uint64_t prev_reseek_count = + 
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(0, prev_reseek_count); + it->Next(); + ASSERT_FALSE(it->Valid()); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->Seek(std::get<0>(kvs[0])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(1, prev_reseek_count); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->SeekForPrev(std::get<0>(kvs[1])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it.reset(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { + constexpr int kNumKeysPerFile = 128; + constexpr uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + 
options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + const std::vector read_timestamps_lb = {Timestamp(1, 0), + Timestamp(1, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + Slice write_ts = write_timestamps[i]; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + Slice read_ts_lb = read_timestamps_lb[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + if (i > 0) { + it->Next(); + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i - 1), + write_timestamps[i - 1]); + } + } + size_t expected_count = kMaxKey + 1; + ASSERT_EQ(expected_count, count); + } + // Delete all keys@ts=5 and check iteration result with start ts set + { + std::string write_timestamp = Timestamp(5, 0); + WriteOptions write_opts; + Slice write_ts = write_timestamp; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key < kMaxKey + 1; ++key) { + Status s = db_->Delete(write_opts, Key1(key)); + ASSERT_OK(s); + } + + std::string read_timestamp = Timestamp(6, 0); + ReadOptions read_opts; + Slice read_ts = read_timestamp; + read_opts.timestamp = &read_ts; + std::string read_timestamp_lb = Timestamp(2, 0); + Slice read_ts_lb = read_timestamp_lb; + read_opts.iter_start_ts = 
&read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), + write_ts); + // Skip key@ts=3 and land on tombstone key@ts=5 + it->Next(); + } + ASSERT_EQ(kMaxKey + 1, count); + } + Close(); +} + +class DBBasicDeletionTestWithTimestamp + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + DBBasicDeletionTestWithTimestamp() + : DBBasicTestWithTimestampBase("db_basic_deletion_test_with_timestamp") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicDeletionTestWithTimestamp, + ::testing::Values(ValueType::kTypeSingleDeletion, + ValueType::kTypeDeletionWithTimestamp)); + +TEST_P(DBBasicDeletionTestWithTimestamp, ForwardIterateStartSeqnum) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 0xffffffffffffffff; + const uint64_t kMinKey = kMaxKey - 1023; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + ValueType op_type = GetParam(); + // Need to disable compaction to bottommost level when sequence number will be + // zeroed out, causing the verification of sequence number to fail in this + // test. 
+ options.disable_auto_compactions = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + std::vector start_seqs; + + const int kNumTimestamps = 4; + std::vector write_ts_list; + for (int t = 0; t != kNumTimestamps; ++t) { + write_ts_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + WriteOptions write_opts; + for (size_t i = 0; i != write_ts_list.size(); ++i) { + Slice write_ts = write_ts_list[i]; + write_opts.timestamp = &write_ts; + for (uint64_t k = kMaxKey; k >= kMinKey; --k) { + Status s; + if (k % 2) { + s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i)); + } else { + if (op_type == ValueType::kTypeDeletionWithTimestamp) { + s = db_->Delete(write_opts, Key1(k)); + } else if (op_type == ValueType::kTypeSingleDeletion) { + s = db_->SingleDelete(write_opts, Key1(k)); + } + } + ASSERT_OK(s); + } + start_seqs.push_back(db_->GetLatestSequenceNumber()); + } + std::vector read_ts_list; + for (int t = 0; t != kNumTimestamps - 1; ++t) { + read_ts_list.push_back(Timestamp(2 * t + 3, /*do not care*/ 17)); + } + + ReadOptions read_opts; + // Scan with only read_opts.iter_start_seqnum set. + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr iter(db_->NewIterator(read_opts)); + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + uint64_t key = kMinKey; + for (iter->Seek(Key1(kMinKey)); iter->Valid(); iter->Next()) { + CheckIterEntry( + iter.get(), Key1(key), expected_seq, (key % 2) ? kTypeValue : op_type, + (key % 2) ? 
"value" + std::to_string(i + 1) : std::string(), + write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + // Scan with both read_opts.iter_start_seqnum and read_opts.iter_start_ts set. + std::vector read_ts_lb_list; + for (int t = 0; t < kNumTimestamps - 1; ++t) { + read_ts_lb_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + for (size_t i = 0; i < read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + Slice read_ts_lb = read_ts_lb_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr it(db_->NewIterator(read_opts)); + uint64_t key = kMinKey; + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + for (it->Seek(Key1(kMinKey)); it->Valid(); it->Next()) { + CheckIterEntry(it.get(), Key1(key), expected_seq, + (key % 2) ? kTypeValue : op_type, + "value" + std::to_string(i + 1), write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Insert kNumKeys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "foo", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + 
CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + ts_str = Timestamp(kNumKeys, 0); + ts = ts_str; + read_opts.timestamp = &ts; + iter.reset(db_->NewIterator(read_opts)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, + "value" + std::to_string(kNumKeys - 1), ts_str); + ASSERT_EQ( + 2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Write kNumKeys + 1 keys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + WriteBatch batch; + const std::string dummy_ts(kTimestampSize, '\0'); + { + std::array key_with_ts_slices{{"a", dummy_ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{"new_value"}}; + SliceParts values(value_slices.data(), 1); + ASSERT_OK(batch.Put(key_with_ts, values)); + } + { + std::string key_with_ts("b"); + key_with_ts.append(dummy_ts); + ASSERT_OK(batch.Put(key_with_ts, "new_value")); + } + s = batch.AssignTimestamp(ts_str); + ASSERT_OK(s); + s = db_->Write(write_opts, &batch); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + 
Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("a"); + iter->Next(); + CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) { + Options options = GetDefaultOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + Status s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value")); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + iter->Prev(); + CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + 
options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = 
db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t 
kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + // random data + for (int i = 0; i < 3; i++) { + auto key = ToString(i * 10); + auto value = ToString(i * 10); + Slice key_slice = key; + Slice value_slice = value; + ASSERT_OK(db_->Put(write_opts, key_slice, value_slice)); + ASSERT_OK(Flush()); + } + + // Make num_levels to 2 to do key range filtering of sst files + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + 
Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector timestamps(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + std::vector cfhs(keys.size(), cfh); + std::vector statuses = + db_->MultiGet(read_opts, cfhs, keys, &values, ×tamps); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + constexpr size_t max_skippable_internal_keys = 2; + const size_t kNumKeys = max_skippable_internal_keys + 2; + WriteOptions write_opts; + Status s; + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value")); + } + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + read_opts.max_skippable_internal_keys = max_skippable_internal_keys; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + iter->Next(); + ASSERT_TRUE(iter->status().IsIncomplete()); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) { + Options options = GetDefaultOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t 
kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + constexpr size_t max_skippable_internal_keys = 2; + const size_t kNumKeys = max_skippable_internal_keys + 2; + WriteOptions write_opts; + Status s; + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "b", "value")); + } + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + read_opts.max_skippable_internal_keys = max_skippable_internal_keys; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + iter->Prev(); + ASSERT_TRUE(iter->status().IsIncomplete()); + } + Close(); +} + +// Create two L0, and compact them to a new L1. In this test, L1 is L_bottom. +// Two L0s: +// f1 f2 +// ... +// Since f2.smallest < f1.largest < f2.largest +// f1 and f2 will be the inputs of a real compaction instead of trivial move. 
+TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.num_levels = 2; + options.level0_file_num_compaction_trigger = 2; + DestroyAndReopen(options); + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value0")); + ASSERT_OK(Flush()); + + ts_str = Timestamp(2, 0); + ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "b", "value0")); + ts_str = Timestamp(3, 0); + ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Delete(write_opts, "a")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ReadOptions read_opts; + ts_str = Timestamp(1, 0); + ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status s = db_->Get(read_opts, "a", &value); + ASSERT_OK(s); + ASSERT_EQ("value0", value); + + ts_str = Timestamp(3, 0); + ts = ts_str; + read_opts.timestamp = &ts; + s = db_->Get(read_opts, "a", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Time-travel to the past before deletion + ts_str = Timestamp(2, 0); + ts = ts_str; + read_opts.timestamp = &ts; + s = db_->Get(read_opts, "a", &value); + ASSERT_OK(s); + ASSERT_EQ("value0", value); + Close(); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class DBBasicTestWithTimestampFilterPrefixSettings + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, bool, bool, + std::shared_ptr, bool, double, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTimestampFilterPrefixSettings() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_filter_prefix") {} +}; + +TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, 
GetAndMultiGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = std::get<1>(GetParam()); + bbto.cache_index_and_filter_blocks = std::get<2>(GetParam()); + bbto.index_type = std::get<6>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor = std::get<3>(GetParam()); + options.memtable_whole_key_filtering = std::get<4>(GetParam()); + options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam()); + + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + const int kMaxKey = 1000; + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + int idx = 0; + for (; idx < kMaxKey / 4; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (; idx < kMaxKey / 2; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + + for (; idx < kMaxKey; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + + ReadOptions read_opts_total_order; + read_opts_total_order.timestamp = &ts; + read_opts_total_order.total_order_seek = true; + + for (idx = 0; idx < kMaxKey; idx++) { + size_t batch_size = 4; + std::vector keys_str(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + ColumnFamilyHandle* cfh = 
db_->DefaultColumnFamily(); + + keys_str[0] = Key1(idx); + keys_str[1] = KeyWithPrefix("foo", idx); + keys_str[2] = Key1(kMaxKey + idx); + keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx); + + auto keys = ConvertStrToSlice(keys_str); + + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + for (int i = 0; i < 2; i++) { + ASSERT_OK(statuses[i]); + } + for (int i = 2; i < 4; i++) { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + + for (int i = 0; i < 2; i++) { + std::string value; + ASSERT_OK(db_->Get(read_opts, keys[i], &value)); + std::unique_ptr it1(db_->NewIterator(read_opts)); + ASSERT_NE(nullptr, it1); + ASSERT_OK(it1->status()); + // TODO(zjay) Fix seek with prefix + // it1->Seek(keys[i]); + // ASSERT_TRUE(it1->Valid()); + } + + for (int i = 2; i < 4; i++) { + std::string value; + Status s = db_->Get(read_opts, keys[i], &value); + ASSERT_TRUE(s.IsNotFound()); + } + } + Close(); +} + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampFilterPrefixSettings, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(nullptr), + std::shared_ptr(NewBloomFilterPolicy(10, true)), + std::shared_ptr(NewBloomFilterPolicy(10, + false))), + ::testing::Bool(), ::testing::Bool(), + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), + std::shared_ptr(NewFixedPrefixTransform(4)), + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Bool(), ::testing::Values(0, 0.1), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +class DataVisibilityTest : public DBBasicTestWithTimestampBase { + public: + DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") { + // 
Initialize test data + for (int i = 0; i < kTestDataSize; i++) { + test_data_[i].key = "key" + ToString(i); + test_data_[i].value = "value" + ToString(i); + test_data_[i].timestamp = Timestamp(i, 0); + test_data_[i].ts = i; + test_data_[i].seq_num = kMaxSequenceNumber; + } + } + + protected: + struct TestData { + std::string key; + std::string value; + int ts; + std::string timestamp; + SequenceNumber seq_num; + }; + + constexpr static int kTestDataSize = 3; + TestData test_data_[kTestDataSize]; + + void PutTestData(int index, ColumnFamilyHandle* cfh = nullptr) { + ASSERT_LE(index, kTestDataSize); + WriteOptions write_opts; + Slice ts_slice = test_data_[index].timestamp; + write_opts.timestamp = &ts_slice; + + if (cfh == nullptr) { + ASSERT_OK( + db_->Put(write_opts, test_data_[index].key, test_data_[index].value)); + const Snapshot* snap = db_->GetSnapshot(); + test_data_[index].seq_num = snap->GetSequenceNumber(); + if (index > 0) { + ASSERT_GT(test_data_[index].seq_num, test_data_[index - 1].seq_num); + } + db_->ReleaseSnapshot(snap); + } else { + ASSERT_OK(db_->Put(write_opts, cfh, test_data_[index].key, + test_data_[index].value)); + } + } + + void AssertVisibility(int ts, SequenceNumber seq, + std::vector statuses) { + ASSERT_EQ(kTestDataSize, statuses.size()); + for (int i = 0; i < kTestDataSize; i++) { + if (test_data_[i].seq_num <= seq && test_data_[i].ts <= ts) { + ASSERT_OK(statuses[i]); + } else { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + } + } + + std::vector GetKeys() { + std::vector ret(kTestDataSize); + for (int i = 0; i < kTestDataSize; i++) { + ret[i] = test_data_[i].key; + } + return ret; + } + + void VerifyDefaultCF(int ts, const Snapshot* snap = nullptr) { + ReadOptions read_opts; + std::string read_ts = Timestamp(ts, 0); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + read_opts.snapshot = snap; + + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + std::vector cfs(kTestDataSize, cfh); + SequenceNumber seq = 
+ snap ? snap->GetSequenceNumber() : kMaxSequenceNumber - 1; + + // There're several MultiGet interfaces with not exactly the same + // implementations, query data with all of them. + auto keys = GetKeys(); + std::vector values; + auto s1 = db_->MultiGet(read_opts, cfs, keys, &values); + AssertVisibility(ts, seq, s1); + + auto s2 = db_->MultiGet(read_opts, keys, &values); + AssertVisibility(ts, seq, s2); + + std::vector timestamps; + auto s3 = db_->MultiGet(read_opts, cfs, keys, &values, ×tamps); + AssertVisibility(ts, seq, s3); + + auto s4 = db_->MultiGet(read_opts, keys, &values, ×tamps); + AssertVisibility(ts, seq, s4); + + std::vector values_ps5(kTestDataSize); + std::vector s5(kTestDataSize); + db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps5.data(), + s5.data()); + AssertVisibility(ts, seq, s5); + + std::vector values_ps6(kTestDataSize); + std::vector s6(kTestDataSize); + std::vector timestamps_array(kTestDataSize); + db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps6.data(), + timestamps_array.data(), s6.data()); + AssertVisibility(ts, seq, s6); + + std::vector values_ps7(kTestDataSize); + std::vector s7(kTestDataSize); + db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(), + values_ps7.data(), s7.data()); + AssertVisibility(ts, seq, s7); + + std::vector values_ps8(kTestDataSize); + std::vector s8(kTestDataSize); + db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(), + values_ps8.data(), timestamps_array.data(), s8.data()); + AssertVisibility(ts, seq, s8); + } + + void VerifyDefaultCF(const Snapshot* snap = nullptr) { + for (int i = 0; i <= kTestDataSize; i++) { + VerifyDefaultCF(i, snap); + } + } +}; +constexpr int DataVisibilityTest::kTestDataSize; + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=90 +// ts=100 +// seq=10 +// seq'=11 +// write finishes +// GetImpl(ts,seq) +// It is OK to return if ts>=t1 AND seq>=s1. 
If ts>=1t1 but seqDisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::GetImpl:3", + "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"}, + {"DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut", + "DBImpl::GetImpl:4"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value"); + ASSERT_OK(s); + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut"); + }); + ReadOptions read_opts; + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + + writer_thread.join(); + ASSERT_TRUE(s.IsNotFound()); + Close(); +} + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=90 +// ts=100 +// seq=10 +// seq'=11 +// write finishes +// Flush +// GetImpl(ts,seq) +// It is OK to return if ts>=t1 AND seq>=s1. 
If ts>=t1 but seqDisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::GetImpl:3", + "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"}, + {"DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut", + "DBImpl::GetImpl:4"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value"); + ASSERT_OK(s); + ASSERT_OK(Flush()); + + write_ts_str = Timestamp(2, 0); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + s = db_->Put(write_opts, "bar", "value"); + ASSERT_OK(s); + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut"); + }); + ReadOptions read_opts; + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + writer_thread.join(); + ASSERT_TRUE(s.IsNotFound()); + Close(); +} + +// Application specifies both timestamp and snapshot. +// reader writer +// seq=10 +// ts'=90 +// ts=100 +// seq'=11 +// write finishes +// GetImpl(ts,seq) +// Since application specifies both timestamp and snapshot, application expects +// to see data that visible in BOTH timestamp and sequence number. Therefore, +// can be returned only if t1<=ts AND s1<=seq. 
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot1) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap", + "DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"}, + {"DataVisibilityTest::PointLookupWithSnapshot1:AfterPut", + "DBImpl::GetImpl:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value"); + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:AfterPut"); + ASSERT_OK(s); + }); + ReadOptions read_opts; + const Snapshot* snap = db_->GetSnapshot(); + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap"); + read_opts.snapshot = snap; + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + writer_thread.join(); + + ASSERT_TRUE(s.IsNotFound()); + + db_->ReleaseSnapshot(snap); + Close(); +} + +// Application specifies both timestamp and snapshot. +// reader writer +// seq=10 +// ts'=90 +// ts=100 +// seq'=11 +// write finishes +// Flush +// GetImpl(ts,seq) +// Since application specifies both timestamp and snapshot, application expects +// to see data that visible in BOTH timestamp and sequence number. Therefore, +// can be returned only if t1<=ts AND s1<=seq. 
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot2) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap", + "DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + std::string write_ts_str = Timestamp(1, 0); + Slice write_ts = write_ts_str; + WriteOptions write_opts; + write_opts.timestamp = &write_ts; + TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"); + Status s = db_->Put(write_opts, "foo", "value1"); + ASSERT_OK(s); + ASSERT_OK(Flush()); + + write_ts_str = Timestamp(2, 0); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + s = db_->Put(write_opts, "bar", "value2"); + ASSERT_OK(s); + }); + const Snapshot* snap = db_->GetSnapshot(); + TEST_SYNC_POINT( + "DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap"); + writer_thread.join(); + std::string read_ts_str = Timestamp(3, 0); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.snapshot = snap; + read_opts.timestamp = &read_ts; + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + db_->ReleaseSnapshot(snap); + Close(); +} + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=90 +// ts=100 +// seq=10 +// seq'=11 +// write finishes +// scan(ts,seq) +// can be seen in scan as long as ts>=t1 AND seq>=s1. 
If ts>=t1 but +// seqDisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::NewIterator:3", + "DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + WriteOptions write_opts; + TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"); + for (int i = 0; i < 3; ++i) { + std::string write_ts_str = Timestamp(i + 1, 0); + Slice write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + Status s = db_->Put(write_opts, "key" + std::to_string(i), + "value" + std::to_string(i)); + ASSERT_OK(s); + } + }); + std::string read_ts_str = Timestamp(10, 0); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.total_order_seek = true; + read_opts.timestamp = &read_ts; + Iterator* it = db_->NewIterator(read_opts); + ASSERT_NE(nullptr, it); + writer_thread.join(); + it->SeekToFirst(); + ASSERT_FALSE(it->Valid()); + delete it; + Close(); +} + +// Application specifies both timestamp and snapshot. +// reader writer +// seq=10 +// ts'=90 +// ts=100 seq'=11 +// write finishes +// scan(ts,seq) +// can be seen by the scan only if t1<=ts AND s1<=seq. If t1<=ts +// but s1>seq, then the key should not be returned. 
+TEST_F(DataVisibilityTest, RangeScanWithSnapshot) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot", + "DataVisibilityTest::RangeScanWithSnapshot:BeforePut"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + WriteOptions write_opts; + TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithSnapshot:BeforePut"); + for (int i = 0; i < 3; ++i) { + std::string write_ts_str = Timestamp(i + 1, 0); + Slice write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + Status s = db_->Put(write_opts, "key" + std::to_string(i), + "value" + std::to_string(i)); + ASSERT_OK(s); + } + }); + const Snapshot* snap = db_->GetSnapshot(); + TEST_SYNC_POINT( + "DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot"); + + writer_thread.join(); + + std::string read_ts_str = Timestamp(10, 0); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.snapshot = snap; + read_opts.total_order_seek = true; + read_opts.timestamp = &read_ts; + Iterator* it = db_->NewIterator(read_opts); + ASSERT_NE(nullptr, it); + it->Seek("key0"); + ASSERT_FALSE(it->Valid()); + + delete it; + db_->ReleaseSnapshot(snap); + Close(); +} + +// Application specifies both timestamp and snapshot. +// Query each combination and make sure for MultiGet key , only +// return keys that ts>=t1 AND seq>=s1. 
+TEST_F(DataVisibilityTest, MultiGetWithTimestamp) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + const Snapshot* snap0 = db_->GetSnapshot(); + PutTestData(0); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + + const Snapshot* snap1 = db_->GetSnapshot(); + PutTestData(1); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + + ASSERT_OK(Flush()); + + const Snapshot* snap2 = db_->GetSnapshot(); + PutTestData(2); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + VerifyDefaultCF(snap2); + + db_->ReleaseSnapshot(snap0); + db_->ReleaseSnapshot(snap1); + db_->ReleaseSnapshot(snap2); + + Close(); +} + +// Application specifies timestamp but not snapshot. +// reader writer +// ts'=0, 1 +// ts=3 +// seq=10 +// seq'=11, 12 +// write finishes +// MultiGet(ts,seq) +// For MultiGet , only return keys that ts>=t1 AND seq>=s1. 
+TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::MultiGet:AfterGetSeqNum1", + "DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"}, + {"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut", + "DBImpl::MultiGet:AfterGetSeqNum2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + port::Thread writer_thread([this]() { + TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"); + PutTestData(0); + PutTestData(1); + TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut"); + }); + + ReadOptions read_opts; + std::string read_ts = Timestamp(kTestDataSize, 0); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + auto keys = GetKeys(); + std::vector values; + auto ss = db_->MultiGet(read_opts, keys, &values); + + writer_thread.join(); + for (auto s : ss) { + ASSERT_TRUE(s.IsNotFound()); + } + VerifyDefaultCF(); + Close(); +} + +TEST_F(DataVisibilityTest, MultiGetCrossCF) { + Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + CreateAndReopenWithCF({"second"}, options); + ColumnFamilyHandle* second_cf = handles_[1]; + + const Snapshot* snap0 = db_->GetSnapshot(); + PutTestData(0); + PutTestData(0, second_cf); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + + const Snapshot* snap1 = db_->GetSnapshot(); + PutTestData(1); + PutTestData(1, second_cf); + VerifyDefaultCF(); + VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + + ASSERT_OK(Flush()); + + const Snapshot* snap2 = db_->GetSnapshot(); + PutTestData(2); + PutTestData(2, second_cf); + VerifyDefaultCF(); 
+ VerifyDefaultCF(snap0); + VerifyDefaultCF(snap1); + VerifyDefaultCF(snap2); + + ReadOptions read_opts; + std::string read_ts = Timestamp(kTestDataSize, 0); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + read_opts.snapshot = snap1; + auto keys = GetKeys(); + auto keys2 = GetKeys(); + keys.insert(keys.end(), keys2.begin(), keys2.end()); + std::vector cfs(kTestDataSize, + db_->DefaultColumnFamily()); + std::vector cfs2(kTestDataSize, second_cf); + cfs.insert(cfs.end(), cfs2.begin(), cfs2.end()); + + std::vector values; + auto ss = db_->MultiGet(read_opts, cfs, keys, &values); + for (int i = 0; i < 2 * kTestDataSize; i++) { + if (i % 3 == 0) { + // only the first key for each column family should be returned + ASSERT_OK(ss[i]); + } else { + ASSERT_TRUE(ss[i].IsNotFound()); + } + } + + db_->ReleaseSnapshot(snap0); + db_->ReleaseSnapshot(snap1); + db_->ReleaseSnapshot(snap2); + Close(); +} + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class DBBasicTestWithTimestampCompressionSettings + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, CompressionType, + uint32_t, uint32_t>> { + public: + DBBasicTestWithTimestampCompressionSettings() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_compression") {} +}; + +TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGet) { + const int kNumKeysPerFile = 1024; + const size_t kNumTimestamps = 4; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + size_t ts_sz = Timestamp(0, 0).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const CompressionType comp_type = 
std::get<1>(GetParam()); +#if LZ4_VERSION_NUMBER < 10400 // r124+ + if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) { + return; + } +#endif // LZ4_VERSION_NUMBER >= 10400 + if (!ZSTD_Supported() && comp_type == kZSTD) { + return; + } + if (!Zlib_Supported() && comp_type == kZlibCompression) { + return; + } + + options.compression = comp_type; + options.compression_opts.max_dict_bytes = std::get<2>(GetParam()); + if (comp_type == kZSTD) { + options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam()); + } + options.compression_opts.parallel_threads = std::get<3>(GetParam()); + options.target_file_size_base = 1 << 26; // 64MB + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.push_back(Timestamp(i * 2, 0)); + read_ts_list.push_back(Timestamp(1 + i * 2, 0)); + const Slice write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put(cf, Key1(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); + Close(); +} + +TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) { + Options options = 
CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + const int kNumKeysPerFile = 1024; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const CompressionType comp_type = std::get<1>(GetParam()); +#if LZ4_VERSION_NUMBER < 10400 // r124+ + if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) { + return; + } +#endif // LZ4_VERSION_NUMBER >= 10400 + if (!ZSTD_Supported() && comp_type == kZSTD) { + return; + } + if (!Zlib_Supported() && comp_type == kZlibCompression) { + return; + } + + options.compression = comp_type; + options.compression_opts.max_dict_bytes = std::get<2>(GetParam()); + if (comp_type == kZSTD) { + options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam()); + } + options.compression_opts.parallel_threads = std::get<3>(GetParam()); + options.target_file_size_base = 1 << 26; // 64MB + + DestroyAndReopen(options); + + const size_t kNumL0Files = + static_cast(Options().level0_file_num_compaction_trigger); + { + // Half of the keys will go through Deletion and remaining half with + // SingleDeletion. 
Generate enough L0 files with ts=1 to trigger compaction + // to L1 + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + WriteOptions wopts; + wopts.timestamp = &ts; + for (size_t i = 0; i < kNumL0Files; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(db_->Put(wopts, Key1(j), "value" + std::to_string(i))); + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Generate another L0 at ts=3 + ts_str = Timestamp(3, 0); + ts = ts_str; + wopts.timestamp = &ts; + for (int i = 0; i < kNumKeysPerFile; ++i) { + std::string key_str = Key1(i); + Slice key(key_str); + if ((i % 3) == 0) { + if (i < kNumKeysPerFile / 2) { + ASSERT_OK(db_->Delete(wopts, key)); + } else { + ASSERT_OK(db_->SingleDelete(wopts, key)); + } + } else { + ASSERT_OK(db_->Put(wopts, key, "new_value")); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + // Populate memtable at ts=5 + ts_str = Timestamp(5, 0); + ts = ts_str; + wopts.timestamp = &ts; + for (int i = 0; i != kNumKeysPerFile; ++i) { + std::string key_str = Key1(i); + Slice key(key_str); + if ((i % 3) == 1) { + if (i < kNumKeysPerFile / 2) { + ASSERT_OK(db_->Delete(wopts, key)); + } else { + ASSERT_OK(db_->SingleDelete(wopts, key)); + } + } else if ((i % 3) == 2) { + ASSERT_OK(db_->Put(wopts, key, "new_value_2")); + } + } + } + { + std::string ts_str = Timestamp(6, 0); + Slice ts = ts_str; + ReadOptions ropts; + ropts.timestamp = &ts; + for (uint64_t i = 0; i != static_cast(kNumKeysPerFile); ++i) { + std::string value; + Status s = db_->Get(ropts, Key1(i), &value); + if ((i % 3) == 2) { + ASSERT_OK(s); + ASSERT_EQ("new_value_2", value); + } else { + ASSERT_TRUE(s.IsNotFound()); + } + } + } +} + +#ifndef ROCKSDB_LITE +// A class which remembers the name of each flushed file. 
+class FlushedFileCollector : public EventListener { + public: + FlushedFileCollector() {} + ~FlushedFileCollector() override {} + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + InstrumentedMutexLock lock(&mutex_); + flushed_files_.push_back(info.file_path); + } + + std::vector GetFlushedFiles() { + std::vector result; + { + InstrumentedMutexLock lock(&mutex_); + result = flushed_files_; + } + return result; + } + + void ClearFlushedFiles() { + InstrumentedMutexLock lock(&mutex_); + flushed_files_.clear(); + } + + private: + std::vector flushed_files_; + InstrumentedMutex mutex_; +}; + +TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGetWithCompaction) { + const int kNumKeysPerFile = 1024; + const size_t kNumTimestamps = 2; + const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps; + const size_t kSplitPosBase = kNumKeysPerTimestamp / 2; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + size_t ts_sz = Timestamp(0, 0).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const CompressionType comp_type = std::get<1>(GetParam()); +#if LZ4_VERSION_NUMBER < 10400 // r124+ + if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) { + return; + } +#endif // LZ4_VERSION_NUMBER >= 10400 + if (!ZSTD_Supported() && comp_type == kZSTD) { + return; + } + if (!Zlib_Supported() && comp_type == kZlibCompression) { + return; + } + + options.compression = comp_type; + options.compression_opts.max_dict_bytes = std::get<2>(GetParam()); + if (comp_type == kZSTD) { + 
options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam()); + } + options.compression_opts.parallel_threads = std::get<3>(GetParam()); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_list; + std::vector read_ts_list; + + const auto& verify_records_func = [&](size_t i, size_t begin, size_t end, + ColumnFamilyHandle* cfh) { + std::string value; + std::string timestamp; + + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + std::string expected_timestamp = + std::string(write_ts_list[i].data(), write_ts_list[i].size()); + + for (size_t j = begin; j <= end; ++j) { + ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value, ×tamp)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), value); + ASSERT_EQ(expected_timestamp, timestamp); + } + }; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.push_back(Timestamp(i * 2, 0)); + read_ts_list.push_back(Timestamp(1 + i * 2, 0)); + const Slice write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + size_t memtable_get_start = 0; + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + ASSERT_OK(Put(cf, Key1(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) { + verify_records_func(i, memtable_get_start, j, handles_[cf]); + memtable_get_start = j + 1; + + // flush all keys with the same timestamp to two sst files, split at + // incremental positions such that lowerlevel[1].smallest.userkey == + // higherlevel[0].largest.userkey + ASSERT_OK(Flush(cf)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // wait for flush (which + // is also a compaction) + + // compact files (2 at each level) to a lower level such that all + // keys with the same timestamp is at one level, with 
newer versions + // at higher levels. + CompactionOptions compact_opt; + compact_opt.compression = kNoCompression; + ASSERT_OK(db_->CompactFiles(compact_opt, handles_[cf], + collector->GetFlushedFiles(), + static_cast(kNumTimestamps - i))); + collector->ClearFlushedFiles(); + } + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + std::string expected_timestamp(write_ts_list[i].data(), + write_ts_list[i].size()); + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + verify_records_func(i, 0, kNumKeysPerTimestamp - 1, cfh); + } + } + }; + verify_db_func(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 2; + const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps; + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.memtable_whole_key_filtering = true; + + size_t ts_sz = Timestamp(0, 0).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_list; + std::vector read_ts_list; + + const auto& verify_records_func = [&](size_t i, ColumnFamilyHandle* cfh) { + std::vector keys; + std::vector key_vals; + std::vector values; + std::vector timestamps; + + for (size_t j = 0; j != 
kNumKeysPerTimestamp; ++j) { + key_vals.push_back(Key1(j)); + } + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + keys.push_back(key_vals[j]); + } + + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + std::string expected_timestamp(write_ts_list[i].data(), + write_ts_list[i].size()); + + std::vector cfhs(keys.size(), cfh); + std::vector statuses = + db_->MultiGet(ropts, cfhs, keys, &values, ×tamps); + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + ASSERT_OK(statuses[j]); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + values[j]); + ASSERT_EQ(expected_timestamp, timestamps[j]); + } + }; + + const std::string dummy_ts(ts_sz, '\0'); + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.push_back(Timestamp(i * 2, 0)); + read_ts_list.push_back(Timestamp(1 + i * 2, 0)); + const Slice& write_ts = write_ts_list.back(); + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + WriteOptions wopts; + WriteBatch batch; + for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) { + const std::string key = Key1(j); + const std::string value = + "value_" + std::to_string(j) + "_" + std::to_string(i); + std::array key_with_ts_slices{{key, dummy_ts}}; + SliceParts key_with_ts(key_with_ts_slices.data(), 2); + std::array value_slices{{value}}; + SliceParts values(value_slices.data(), 1); + ASSERT_OK(batch.Put(handles_[cf], key_with_ts, values)); + } + ASSERT_OK(batch.AssignTimestamp(write_ts)); + ASSERT_OK(db_->Write(wopts, &batch)); + + verify_records_func(i, handles_[cf]); + + ASSERT_OK(Flush(cf)); + } + } + + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + const Slice read_ts = read_ts_list[i]; + ropts.timestamp = &read_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + verify_records_func(i, cfh); + } + } + }; + verify_db_func(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, 
MultiGetNoReturnTs) { + Options options = CurrentOptions(); + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "foo", "value")); + ASSERT_OK(db_->Put(write_opts, "bar", "value")); + ASSERT_OK(db_->Put(write_opts, "fooxxxxxxxxxxxxxxxx", "value")); + ASSERT_OK(db_->Put(write_opts, "barxxxxxxxxxxxxxxxx", "value")); + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + { + ColumnFamilyHandle* column_families[] = {cfh, cfh}; + Slice keys[] = {"foo", "bar"}; + PinnableSlice values[] = {PinnableSlice(), PinnableSlice()}; + Status statuses[] = {Status::OK(), Status::OK()}; + dbfull()->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0], + &values[0], &statuses[0], /*sorted_input=*/false); + for (const auto& s : statuses) { + ASSERT_OK(s); + } + } + { + ColumnFamilyHandle* column_families[] = {cfh, cfh, cfh, cfh}; + // Make user keys longer than configured timestamp size (16 bytes) to + // verify RocksDB does not use the trailing bytes 'x' as timestamp. 
+ Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo", "bar"}; + PinnableSlice values[] = {PinnableSlice(), PinnableSlice(), PinnableSlice(), + PinnableSlice()}; + Status statuses[] = {Status::OK(), Status::OK(), Status::OK(), + Status::OK()}; + dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0], &keys[0], + &values[0], &statuses[0], /*sorted_input=*/false); + for (const auto& s : statuses) { + ASSERT_OK(s); + } + } + Close(); +} + +#endif // !ROCKSDB_LITE + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampCompressionSettings, + ::testing::Combine( + ::testing::Values(std::shared_ptr(nullptr), + std::shared_ptr( + NewBloomFilterPolicy(10, false))), + ::testing::Values(kNoCompression, kZlibCompression, kLZ4Compression, + kLZ4HCCompression, kZSTD), + ::testing::Values(0, 1 << 14), ::testing::Values(1, 4))); + +class DBBasicTestWithTimestampPrefixSeek + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, + std::shared_ptr, bool, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTimestampPrefixSeek() + : DBBasicTestWithTimestampBase( + "/db_basic_test_with_timestamp_prefix_seek") {} +}; + +TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { + const size_t kNumKeysPerFile = 128; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor = std::get<0>(GetParam()); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const uint64_t kMaxKey = 0xffffffffffffffff; + const uint64_t kMinKey = 0xfffffffffffff000; + const 
std::vector write_ts_list = {Timestamp(3, 0xffffffff), + Timestamp(6, 0xffffffff)}; + WriteOptions write_opts; + { + for (size_t i = 0; i != write_ts_list.size(); ++i) { + Slice write_ts = write_ts_list[i]; + write_opts.timestamp = &write_ts; + for (uint64_t key = kMaxKey; key >= kMinKey; --key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + } + const std::vector read_ts_list = {Timestamp(5, 0xffffffff), + Timestamp(9, 0xffffffff)}; + { + ReadOptions read_opts; + read_opts.total_order_seek = false; + read_opts.prefix_same_as_start = std::get<2>(GetParam()); + fprintf(stdout, "%s %s %d\n", options.prefix_extractor->Name(), + bbto.filter_policy ? bbto.filter_policy->Name() : "null", + static_cast(read_opts.prefix_same_as_start)); + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + + // Seek to kMaxKey + iter->Seek(Key1(kMaxKey)); + CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + // Seek to kMinKey + iter->Seek(Key1(kMinKey)); + CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + } + const std::vector targets = {kMinKey, kMinKey + 0x10, + kMinKey + 0x100, kMaxKey}; + const SliceTransform* const pe = options.prefix_extractor.get(); + ASSERT_NE(nullptr, pe); + const size_t kPrefixShift = + 8 * (Key1(0).size() - pe->Transform(Key1(0)).size()); + const uint64_t kPrefixMask = + ~((static_cast(1) << kPrefixShift) - 1); + const uint64_t kNumKeysWithinPrefix = + (static_cast(1) << kPrefixShift); + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + // Forward and 
backward iterate. + for (size_t j = 0; j != targets.size(); ++j) { + std::string start_key = Key1(targets[j]); + uint64_t expected_ub = + (targets[j] & kPrefixMask) - 1 + kNumKeysWithinPrefix; + uint64_t expected_key = targets[j]; + size_t count = 0; + it->Seek(Key1(targets[j])); + while (it->Valid()) { + std::string saved_prev_key; + saved_prev_key.assign(it->key().data(), it->key().size()); + + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(saved_prev_key) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + ++expected_key; + it->Next(); + } + ASSERT_EQ(expected_ub - targets[j] + 1, count); + + count = 0; + expected_key = targets[j]; + it->SeekForPrev(start_key); + uint64_t expected_lb = (targets[j] & kPrefixMask); + while (it->Valid()) { + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(it->key()) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + --expected_key; + it->Prev(); + } + ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count); + } + } + } + Close(); +} + +// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g. +// NoopTransform. 
+INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampPrefixSeek, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), + std::shared_ptr(NewFixedPrefixTransform(4)), + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Values(std::shared_ptr(nullptr), + std::shared_ptr( + NewBloomFilterPolicy(10 /*bits_per_key*/, false)), + std::shared_ptr( + NewBloomFilterPolicy(20 /*bits_per_key*/, + false))), + ::testing::Bool(), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); + +class DBBasicTestWithTsIterTombstones + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, + std::shared_ptr, int, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTsIterTombstones() + : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {} +}; + +TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) { + constexpr size_t kNumKeysPerFile = 128; + Options options = CurrentOptions(); + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor = std::get<0>(GetParam()); + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.num_levels = std::get<2>(GetParam()); + DestroyAndReopen(options); + std::vector write_ts_strs = {Timestamp(2, 0), Timestamp(4, 0)}; + constexpr uint64_t kMaxKey = 0xffffffffffffffff; + constexpr uint64_t kMinKey = 0xfffffffffffff000; + // Insert kMinKey...kMaxKey + uint64_t 
key = kMinKey; + WriteOptions write_opts; + Slice ts = write_ts_strs[0]; + write_opts.timestamp = &ts; + do { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key)); + ASSERT_OK(s); + if (kMaxKey == key) { + break; + } + ++key; + } while (true); + + ts = write_ts_strs[1]; + write_opts.timestamp = &ts; + for (key = kMaxKey; key >= kMinKey; --key) { + Status s; + if (0 != (key % 2)) { + s = db_->Put(write_opts, Key1(key), "value1" + std::to_string(key)); + } else { + s = db_->Delete(write_opts, Key1(key)); + } + ASSERT_OK(s); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + { + std::string read_ts = Timestamp(4, 0); + ts = read_ts; + ReadOptions read_opts; + read_opts.total_order_seek = true; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + size_t count = 0; + key = kMinKey + 1; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++count, key += 2) { + ASSERT_EQ(Key1(key), iter->key()); + ASSERT_EQ("value1" + std::to_string(key), iter->value()); + } + ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); + + for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid(); + key -= 2, ++count, iter->Prev()) { + ASSERT_EQ(Key1(key), iter->key()); + ASSERT_EQ("value1" + std::to_string(key), iter->value()); + } + ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); + } + Close(); +} + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTsIterTombstones, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Values(std::shared_ptr(nullptr), + std::shared_ptr( + NewBloomFilterPolicy(10, false)), + std::shared_ptr( + NewBloomFilterPolicy(20, false))), + ::testing::Values(2, 6), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); 
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,121 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction/compaction.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +std::string Key1(uint64_t key) { + std::string ret; + PutFixed64(&ret, key); + std::reverse(ret.begin(), ret.end()); + return ret; +} + +std::string Timestamp(uint64_t ts) { + std::string ret; + PutFixed64(&ret, ts); + return ret; +} +} // anonymous namespace + +class TimestampCompatibleCompactionTest : public DBTestBase { + public: + TimestampCompatibleCompactionTest() + : DBTestBase("ts_compatible_compaction_test", /*env_do_fsync=*/true) {} + + std::string Get(const std::string& key, uint64_t ts) { + ReadOptions read_opts; + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + read_opts.timestamp = &ts_slice; + std::string value; + Status s = db_->Get(read_opts, key, &value); + if (s.IsNotFound()) { + value.assign("NOT_FOUND"); + } else if (!s.ok()) { + value.assign(s.ToString()); + } + return value; + } +}; + +TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) { + Options options = CurrentOptions(); + options.env = env_; + options.compaction_style = kCompactionStyleLevel; + options.comparator = test::ComparatorWithU64Ts(); + options.level0_file_num_compaction_trigger = 3; + constexpr size_t kNumKeysPerFile = 101; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + const auto* compaction = reinterpret_cast(arg); + ASSERT_NE(nullptr, compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1, compaction->num_input_levels()); + // Check that all 3 L0 ssts are picked for level compaction. 
+ ASSERT_EQ(3, compaction->num_input_files(0)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + // Write a L0 with keys 0, 1, ..., 99 with ts from 100 to 199. + uint64_t ts = 100; + uint64_t key = 0; + WriteOptions write_opts; + for (; key < kNumKeysPerFile - 1; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + write_opts.timestamp = &ts_slice; + ASSERT_OK(db_->Put(write_opts, Key1(key), "foo_" + std::to_string(key))); + } + // Write another L0 with keys 99 with newer ts. + ASSERT_OK(Flush()); + uint64_t saved_read_ts1 = ts++; + key = 99; + for (int i = 0; i < 4; ++i, ++ts) { + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + write_opts.timestamp = &ts_slice; + ASSERT_OK(db_->Put(write_opts, Key1(key), "bar_" + std::to_string(key))); + } + ASSERT_OK(Flush()); + uint64_t saved_read_ts2 = ts++; + // Write another L0 with keys 99, 100, 101, ..., 150 + for (; key <= 150; ++key, ++ts) { + std::string ts_str = Timestamp(ts); + Slice ts_slice = ts_str; + write_opts.timestamp = &ts_slice; + ASSERT_OK(db_->Put(write_opts, Key1(key), "foo1_" + std::to_string(key))); + } + ASSERT_OK(Flush()); + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + uint64_t read_ts = ts; + ASSERT_EQ("foo_99", Get(Key1(99), saved_read_ts1)); + ASSERT_EQ("bar_99", Get(Key1(99), saved_read_ts2)); + ASSERT_EQ("foo1_99", Get(Key1(99), read_ts)); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,793 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "db/write_thread.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBWriteBufferManagerTest : public DBTestBase, + public testing::WithParamInterface { + public: + DBWriteBufferManagerTest() + : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {} + bool cost_cache_; +}; + +TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". 
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + // This make sures write will go through and if stall was in effect, it will + // end. + ASSERT_OK(Put(0, Key(2), DummyString(1), wo)); +} + +// Test Single DB with multiple writer threads get blocked when +// WriteBufferManager execeeds buffer_size_ and flush is waiting to be +// finished. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. 
+ + std::unordered_set w_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_set.insert(w); + // Allow the flush to continue if all writer threads are blocked. + if (w_set.size() == (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // and they will be blocked. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(writer, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. 
+ { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_writers; i++) { + threads.emplace_back(writer, i % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_writers); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush +// is waiting to be finished but DBs tries to write meanwhile. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager Limit exceeded. 
+ std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Since this is the last DB, signal Flush to continue. + if (wait_count_db == num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + // Write to DB. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // db_ will write and will be blocked (as Flush will on hold and will create + // stall in effect). + // | + // multiple dbs writers will be created to write to that db and they will be + // blocked. + // | + // | + // Last writer will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(write_db, db_); + // Wait untill first DB is blocked and then create the multiple writers for + // different DBs which will be blocked from getting added to the queue because + // stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_dbs; i++) { + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple DBs and multiple columns get +// blocked when stall by WriteBufferManager is in effect. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. 
+ std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_set; + std::vector writer_threads; + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + thread_num.fetch_add(1); + cv.Signal(); + // Allow the flush to continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + { + InstrumentedMutexLock lock(&mutex); + w_set.insert(w); + thread_num.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + // Write to multiple columns of db_. + std::function write_cf = [&](int cf) { + Status tmp = Put(cf, Key(3), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + // Write to multiple DBs. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s2 = s2 && tmp.ok(); + }; + + // Flow: + // thread will write to db_ will be blocked (as Flush will on hold, + // buffer_size_ has exceeded and will create stall in effect). 
+ // | + // | + // multiple writers threads writing to different DBs and to db_ across + // multiple columns will be created and they will be blocked due to stall. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + threads.emplace_back(write_db, db_); + // Wait untill first thread is blocked and then create the multiple writer + // threads. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i++) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_cf, i % 3); + // Write to different dbs. + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + + // Number of DBs blocked. + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_dbs); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ by passing +// different values to WriteOption.no_slown_down. 
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // db_ is completed. 
+ + std::unordered_set w_slowdown_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load( + std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + + std::function write_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + + (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // with different values of WriteOptions.no_slowdown. Some of them will + // be blocked and some of them will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_writers; i += 2) { + threads.emplace_back(write_no_slow_down, (i) % 4); + threads.emplace_back(write_slow_down, (i + 1) % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_slowdown_set.size(), num_writers / 2); + // Number of Writer threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ and different +// dbs by passing different values to WriteOption.no_slown_down. 
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 4; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_slowdown_set; + std::vector writer_threads; + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + InstrumentedMutexLock lock(&mutex); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + std::function write_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // first thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). 
+ // | + // | + // multiple writer threads will be created to write across multiple columns + // of db_ and different DBs with different values of + // WriteOptions.no_slowdown. Some of them will be blocked and some of them + // will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, db_); + // Wait untill first thread writing to DB is blocked and then + // create the multiple writers. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i += 2) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_slow_down, db_); + writer_threads.emplace_back(write_no_slow_down, db_); + // Write to different DBs. + threads.emplace_back(write_slow_down, dbs[i]); + threads.emplace_back(write_no_slow_down, dbs[i + 1]); + } + + for (auto& t : threads) { + t.join(); + } + + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ((num_dbs / 2) + 1, wait_count_db); + // Number of writer threads writing to db_ blocked from getting added to the + // queue. + ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2); + // Number of threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs); + + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, + testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/db_write_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/db_write_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,25 +4,27 @@ // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include #include -#include + #include "db/db_test_util.h" #include "db/write_batch_internal.h" #include "db/write_thread.h" #include "port/port.h" #include "port/stack_trace.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" +#include "util/random.h" #include "util/string_util.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { // Test variations of WriteImpl. 
class DBWriteTest : public DBTestBase, public testing::WithParamInterface { public: - DBWriteTest() : DBTestBase("/db_write_test") {} + DBWriteTest() : DBTestBase("db_write_test", /*env_do_fsync=*/true) {} Options GetOptions() { return DBTestBase::GetOptions(GetParam()); } @@ -40,6 +42,126 @@ ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument()); } +TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { + Options options = GetOptions(); + options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = + 4; + std::vector threads; + std::atomic thread_num(0); + port::Mutex mutex; + port::CondVar cv(&mutex); + // Guarded by mutex + int writers = 0; + + Reopen(options); + + std::function write_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = false; + ASSERT_OK(dbfull()->Put(wo, key, "bar")); + }; + std::function write_no_slowdown_func = [&]() { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions wo; + wo.no_slowdown = true; + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); + }; + std::function unblock_main_thread_func = [&](void*) { + mutex.Lock(); + ++writers; + cv.SignalAll(); + mutex.Unlock(); + }; + + // Create 3 L0 files and schedule 4th without waiting + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1", + 
"DBImpl::BackgroundCallFlush:start"}, + {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2", + "DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"}, + // Make compaction start wait for the write stall to be detected and + // implemented by a write group leader + {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Schedule creation of 4th L0 file without waiting. This will seal the + // memtable and then wait for a sync point before writing the file. We need + // to do it this way because SwitchMemtable() needs to enter the + // write_thread + FlushOptions fopt; + fopt.wait = false; + ASSERT_OK(dbfull()->Flush(fopt)); + + // Create a mix of slowdown/no_slowdown write threads + mutex.Lock(); + // First leader + threads.emplace_back(write_slowdown_func); + while (writers != 1) { + cv.Wait(); + } + + // Second leader. Will stall writes + // Build a writers list with no slowdown in the middle: + // +-------------+ + // | slowdown +<----+ newest + // +--+----------+ + // | + // v + // +--+----------+ + // | no slowdown | + // +--+----------+ + // | + // v + // +--+----------+ + // | slowdown + + // +-------------+ + threads.emplace_back(write_slowdown_func); + while (writers != 2) { + cv.Wait(); + } + threads.emplace_back(write_no_slowdown_func); + while (writers != 3) { + cv.Wait(); + } + threads.emplace_back(write_slowdown_func); + while (writers != 4) { + cv.Wait(); + } + + mutex.Unlock(); + + TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1"); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); + // This would have triggered a write stall. Unblock the write group leader + TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2"); + // The leader is going to create missing newer links. 
When the leader + // finishes, the next leader is going to delay writes and fail writers with + // no_slowdown + + TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3"); + for (auto& t : threads) { + t.join(); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { Options options = GetOptions(); options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4; @@ -47,6 +169,8 @@ std::atomic thread_num(0); port::Mutex mutex; port::CondVar cv(&mutex); + // Guarded by mutex + int writers = 0; Reopen(options); @@ -55,29 +179,31 @@ std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; - dbfull()->Put(wo, key, "bar"); + ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; - dbfull()->Put(wo, key, "bar"); + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void *) { mutex.Lock(); + ++writers; cv.SignalAll(); mutex.Unlock(); }; // Create 3 L0 files and schedule 4th without waiting - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); @@ -98,28 +224,28 @@ // write_thread FlushOptions fopt; fopt.wait = false; - dbfull()->Flush(fopt); + ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); // First leader threads.emplace_back(write_slowdown_func); - cv.Wait(); + while (writers != 1) { + cv.Wait(); + } // Second leader. Will stall writes threads.emplace_back(write_slowdown_func); - cv.Wait(); threads.emplace_back(write_no_slowdown_func); - cv.Wait(); threads.emplace_back(write_slowdown_func); - cv.Wait(); threads.emplace_back(write_no_slowdown_func); - cv.Wait(); threads.emplace_back(write_slowdown_func); - cv.Wait(); + while (writers != 6) { + cv.Wait(); + } mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1"); - dbfull()->TEST_WaitForFlushMemTable(nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); // The leader is going to create missing newer links. When the leader finishes, @@ -129,12 +255,14 @@ for (auto& t : threads) { t.join(); } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { constexpr int kNumThreads = 5; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -181,6 +309,11 @@ threads[i].join(); } ASSERT_EQ(1, leader_count); + + // The Failed PUT operations can cause a BG error to be set. + // Mark it as Checked for the ASSERT_STATUS_CHECKED + dbfull()->Resume().PermitUncheckedError(); + // Close before mock_env destruct. 
Close(); } @@ -194,7 +327,7 @@ ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_TRUE(Put("key" + ToString(0), "value").ok()); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); @@ -203,7 +336,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -225,7 +358,9 @@ } */ if (!options.manual_wal_flush) { - ASSERT_FALSE(res.ok()); + ASSERT_NOK(res); + } else { + ASSERT_OK(res); } } // Close before mock_env destruct. @@ -235,7 +370,7 @@ TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) { Random rnd(301); std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); options.writable_file_max_buffer_size = 4 * 1024 * 1024; @@ -246,7 +381,7 @@ mock_env->SetFilesystemActive(false, Status::IOError("Not active")); Status s; for (int i = 0; i < 4 * 512; ++i) { - s = Put(Key(i), RandomString(&rnd, 1024)); + s = Put(Key(i), rnd.RandomString(1024)); if (!s.ok()) { break; } @@ -269,7 +404,7 @@ ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false)); ASSERT_OK(dbfull()->UnlockWAL()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_OK(Put("key" + ToString(0), "value")); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_OK(dbfull()->LockWAL()); @@ -297,13 +432,14 @@ ROCKSDB_NAMESPACE::WriteOptions write_option_default; std::string no_wal_key = no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i); - this->Put(no_wal_key, no_wal_value, 
write_option_disable); + ASSERT_OK( + this->Put(no_wal_key, no_wal_value, write_option_disable)); std::string wal_key = wal_key_prefix + std::to_string(i) + "_" + std::to_string(i); - this->Put(wal_key, wal_value, write_option_default); - dbfull()->SyncWAL(); + ASSERT_OK(this->Put(wal_key, wal_value, write_option_default)); + ASSERT_OK(dbfull()->SyncWAL()); } - return 0; + return; }); } for (auto& t: threads) { @@ -325,5 +461,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,10 @@ #include "db/dbformat.h" #include + #include + +#include "db/lookup_key.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/coding.h" @@ -23,14 +26,9 @@ // and the value type is embedded as the low 8 bits in the sequence // number in internal keys, we need to use the highest-numbered // ValueType, not the lowest). 
-const ValueType kValueTypeForSeek = kTypeBlobIndex; +const ValueType kValueTypeForSeek = kTypeDeletionWithTimestamp; const ValueType kValueTypeForSeekForPrev = kTypeDeletion; - -uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { - assert(seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(t)); - return (seq << 8) | t; -} +const std::string kDisableUserTimestamp(""); EntryType GetEntryType(ValueType value_type) { switch (value_type) { @@ -38,6 +36,8 @@ return kEntryPut; case kTypeDeletion: return kEntryDelete; + case kTypeDeletionWithTimestamp: + return kEntryDeleteWithTimestamp; case kTypeSingleDeletion: return kEntrySingleDelete; case kTypeMerge: @@ -51,41 +51,53 @@ } } -bool ParseFullKey(const Slice& internal_key, FullKey* fkey) { - ParsedInternalKey ikey; - if (!ParseInternalKey(internal_key, &ikey)) { - return false; - } - fkey->user_key = ikey.user_key; - fkey->sequence = ikey.sequence; - fkey->type = GetEntryType(ikey.type); - return true; -} - -void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) { - *seq = packed >> 8; - *t = static_cast(packed & 0xff); - - assert(*seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(*t)); -} - void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { result->append(key.user_key.data(), key.user_key.size()); PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); } +void AppendInternalKeyWithDifferentTimestamp(std::string* result, + const ParsedInternalKey& key, + const Slice& ts) { + assert(key.user_key.size() >= ts.size()); + result->append(key.user_key.data(), key.user_key.size() - ts.size()); + result->append(ts.data(), ts.size()); + PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); +} + void AppendInternalKeyFooter(std::string* result, SequenceNumber s, ValueType t) { PutFixed64(result, PackSequenceAndType(s, t)); } -std::string ParsedInternalKey::DebugString(bool hex) const { +void AppendKeyWithMinTimestamp(std::string* result, const Slice& 
key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMin(ts_sz, static_cast(0)); + result->append(key.data(), key.size()); + result->append(kTsMin.data(), ts_sz); +} + +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMax(ts_sz, static_cast(0xff)); + result->append(key.data(), key.size()); + result->append(kTsMax.data(), ts_sz); +} + +std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { + std::string result = "'"; + if (log_err_key) { + result += user_key.ToString(hex); + } else { + result += ""; + } + char buf[50]; snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence, static_cast(type)); - std::string result = "'"; - result += user_key.ToString(hex); + result += buf; return result; } @@ -93,8 +105,8 @@ std::string InternalKey::DebugString(bool hex) const { std::string result; ParsedInternalKey parsed; - if (ParseInternalKey(rep_, &parsed)) { - result = parsed.DebugString(hex); + if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) { + result = parsed.DebugString(true /* log_err_key */, hex); // TODO } else { result = "(bad)"; result.append(EscapeString(rep_)); @@ -102,7 +114,12 @@ return result; } -const char* InternalKeyComparator::Name() const { return name_.c_str(); } +const char* InternalKeyComparator::Name() const { + if (name_.empty()) { + return "rocksdb.anonymous.InternalKeyComparator"; + } + return name_.c_str(); +} int InternalKeyComparator::Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,14 @@ #pragma once #include + #include #include #include -#include 
"db/lookup_key.h" -#include "db/merge_context.h" -#include "logging/logging.h" -#include "monitoring/perf_context_imp.h" + #include "rocksdb/comparator.h" -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" #include "util/user_comparator_wrapper.h" @@ -69,7 +64,9 @@ // generated by WriteUnprepared write policy is not mistakenly read by // another. kTypeBeginUnprepareXID = 0x13, // WAL only. - kMaxValue = 0x7F // Not used for storing records. + kTypeDeletionWithTimestamp = 0x14, + kTypeCommitXIDAndTimestamp = 0x15, // WAL only + kMaxValue = 0x7F // Not used for storing records. }; // Defined in dbformat.cc @@ -79,7 +76,8 @@ // Checks whether a type is an inline value type // (i.e. a type used in memtable skiplist and sst file datablock). inline bool IsValueType(ValueType t) { - return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex; + return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex || + kTypeDeletionWithTimestamp == t; } // Checks whether a type is from user operation @@ -94,6 +92,11 @@ static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; +constexpr uint64_t kNumInternalBytes = 8; + +// Defined in dbformat.cc +extern const std::string kDisableUserTimestamp; + // The data structure that represents an internal key in the way that user_key, // sequence number and type are stored in separated forms. struct ParsedInternalKey { @@ -102,59 +105,95 @@ ValueType type; ParsedInternalKey() - : sequence(kMaxSequenceNumber) // Make code analyzer happy - {} // Intentionally left uninitialized (for speed) + : sequence(kMaxSequenceNumber), + type(kTypeDeletion) // Make code analyzer happy + {} // Intentionally left uninitialized (for speed) + // u contains timestamp if user timestamp feature is enabled. 
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) : user_key(u), sequence(seq), type(t) {} - std::string DebugString(bool hex = false) const; + std::string DebugString(bool log_err_key, bool hex) const; void clear() { user_key.clear(); sequence = 0; type = kTypeDeletion; } + + void SetTimestamp(const Slice& ts) { + assert(ts.size() <= user_key.size()); + const char* addr = user_key.data() + user_key.size() - ts.size(); + memcpy(const_cast(addr), ts.data(), ts.size()); + } }; // Return the length of the encoding of "key". inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { - return key.user_key.size() + 8; + return key.user_key.size() + kNumInternalBytes; } // Pack a sequence number and a ValueType into a uint64_t -extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t); +inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { + assert(seq <= kMaxSequenceNumber); + assert(IsExtendedValueType(t)); + return (seq << 8) | t; +} // Given the result of PackSequenceAndType, store the sequence number in *seq // and the ValueType in *t. -extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t); +inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, + ValueType* t) { + *seq = packed >> 8; + *t = static_cast(packed & 0xff); + + // Commented the following two assertions in order to test key-value checksum + // on corrupted keys without crashing ("DbKvChecksumTest"). + // assert(*seq <= kMaxSequenceNumber); + // assert(IsExtendedValueType(*t)); +} EntryType GetEntryType(ValueType value_type); // Append the serialization of "key" to *result. extern void AppendInternalKey(std::string* result, const ParsedInternalKey& key); + +// Append the serialization of "key" to *result, replacing the original +// timestamp with argument ts. 
+extern void AppendInternalKeyWithDifferentTimestamp( + std::string* result, const ParsedInternalKey& key, const Slice& ts); + // Serialized internal key consists of user key followed by footer. // This function appends the footer to *result, assuming that *result already // contains the user key at the end. extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s, ValueType t); +// Append the key and a minimal timestamp to *result +extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// Append the key and a maximal timestamp to *result +extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + // Attempt to parse an internal key from "internal_key". On success, // stores the parsed data in "*result", and returns true. // // On error, returns false, leaves "*result" in an undefined state. -extern bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); +extern Status ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result, bool log_err_key); // Returns the user key portion of an internal key. 
inline Slice ExtractUserKey(const Slice& internal_key) { - assert(internal_key.size() >= 8); - return Slice(internal_key.data(), internal_key.size() - 8); + assert(internal_key.size() >= kNumInternalBytes); + return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes); } inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, size_t ts_sz) { - assert(internal_key.size() >= 8 + ts_sz); - return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz); + assert(internal_key.size() >= kNumInternalBytes + ts_sz); + return Slice(internal_key.data(), + internal_key.size() - kNumInternalBytes - ts_sz); } inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { @@ -162,10 +201,15 @@ return Slice(user_key.data(), user_key.size() - ts_sz); } +inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { + assert(user_key.size() >= ts_sz); + return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz); +} + inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { - assert(internal_key.size() >= 8); + assert(internal_key.size() >= kNumInternalBytes); const size_t n = internal_key.size(); - return DecodeFixed64(internal_key.data() + n - 8); + return DecodeFixed64(internal_key.data() + n - kNumInternalBytes); } inline ValueType ExtractValueType(const Slice& internal_key) { @@ -186,10 +230,22 @@ std::string name_; public: - explicit InternalKeyComparator(const Comparator* c) - : user_comparator_(c), - name_("rocksdb.InternalKeyComparator:" + - std::string(user_comparator_.Name())) {} + // `InternalKeyComparator`s constructed with the default constructor are not + // usable and will segfault on any attempt to use them for comparisons. + InternalKeyComparator() = default; + + // @param named If true, assign a name to this comparator based on the + // underlying comparator's name. This involves an allocation and copy in + // this constructor to precompute the result of `Name()`. 
To avoid this + // overhead, set `named` to false. In that case, `Name()` will return a + // generic name that is non-specific to the underlying comparator. + explicit InternalKeyComparator(const Comparator* c, bool named = true) + : Comparator(c->timestamp_size()), user_comparator_(c) { + if (named) { + name_ = "rocksdb.InternalKeyComparator:" + + std::string(user_comparator_.Name()); + } + } virtual ~InternalKeyComparator() {} virtual const char* Name() const override; @@ -206,6 +262,12 @@ int Compare(const InternalKey& a, const InternalKey& b) const; int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; + // In this `Compare()` overload, the sequence numbers provided in + // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a` + // and `b`, respectively. To disable sequence number override(s), provide the + // value `kDisableGlobalSequenceNumber`. + int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b, + SequenceNumber b_global_seqno) const; virtual const Comparator* GetRootComparator() const override { return user_comparator_.GetRootComparator(); } @@ -238,7 +300,8 @@ bool Valid() const { ParsedInternalKey parsed; - return ParseInternalKey(Slice(rep_), &parsed); + return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */) + .ok()); // TODO } void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } @@ -271,7 +334,7 @@ AppendInternalKeyFooter(&rep_, s, t); } - std::string DebugString(bool hex = false) const; + std::string DebugString(bool hex) const; }; inline int InternalKeyComparator::Compare(const InternalKey& a, @@ -279,36 +342,47 @@ return Compare(a.Encode(), b.Encode()); } -inline bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { +inline Status ParseInternalKey(const Slice& internal_key, + ParsedInternalKey* result, bool log_err_key) { const size_t n = internal_key.size(); - if (n < 8) return false; - uint64_t num = 
DecodeFixed64(internal_key.data() + n - 8); + + if (n < kNumInternalBytes) { + return Status::Corruption("Corrupted Key: Internal Key too small. Size=" + + std::to_string(n) + ". "); + } + + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); assert(result->type <= ValueType::kMaxValue); - result->user_key = Slice(internal_key.data(), n - 8); - return IsExtendedValueType(result->type); + result->user_key = Slice(internal_key.data(), n - kNumInternalBytes); + + if (IsExtendedValueType(result->type)) { + return Status::OK(); + } else { + return Status::Corruption("Corrupted Key", + result->DebugString(log_err_key, true)); + } } // Update the sequence number in the internal key. // Guarantees not to invalidate ikey.data(). inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) { size_t ikey_sz = ikey->size(); - assert(ikey_sz >= 8); + assert(ikey_sz >= kNumInternalBytes); uint64_t newval = (seq << 8) | t; // Note: Since C++11, strings are guaranteed to be stored contiguously and // string::operator[]() is guaranteed not to change ikey.data(). - EncodeFixed64(&(*ikey)[ikey_sz - 8], newval); + EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval); } // Get the sequence number from the internal key inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { const size_t n = internal_key.size(); - assert(n >= 8); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + assert(n >= kNumInternalBytes); + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); return num >> 8; } @@ -347,8 +421,8 @@ if (IsUserKey()) { return Slice(key_, key_size_); } else { - assert(key_size_ >= 8); - return Slice(key_, key_size_ - 8); + assert(key_size_ >= kNumInternalBytes); + return Slice(key_, key_size_ - kNumInternalBytes); } } @@ -406,9 +480,9 @@ // and returns a Slice referencing the new copy. 
Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) { size_t key_n = key.size(); - assert(key_n >= 8); + assert(key_n >= kNumInternalBytes); SetInternalKey(key); - ikey->user_key = Slice(key_, key_n - 8); + ikey->user_key = Slice(key_, key_n - kNumInternalBytes); return Slice(key_, key_n); } @@ -423,35 +497,48 @@ // Update the sequence number in the internal key. Guarantees not to // invalidate slices to the key (and the user key). - void UpdateInternalKey(uint64_t seq, ValueType t) { + void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) { assert(!IsKeyPinned()); - assert(key_size_ >= 8); + assert(key_size_ >= kNumInternalBytes); + if (ts) { + assert(key_size_ >= kNumInternalBytes + ts->size()); + memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(), + ts->size()); + } uint64_t newval = (seq << 8) | t; - EncodeFixed64(&buf_[key_size_ - 8], newval); + EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval); } bool IsKeyPinned() const { return (key_ != buf_); } + // user_key does not have timestamp. void SetInternalKey(const Slice& key_prefix, const Slice& user_key, SequenceNumber s, - ValueType value_type = kValueTypeForSeek) { + ValueType value_type = kValueTypeForSeek, + const Slice* ts = nullptr) { size_t psize = key_prefix.size(); size_t usize = user_key.size(); - EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t)); + size_t ts_sz = (ts != nullptr ? 
ts->size() : 0); + EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz); if (psize > 0) { memcpy(buf_, key_prefix.data(), psize); } memcpy(buf_ + psize, user_key.data(), usize); - EncodeFixed64(buf_ + usize + psize, PackSequenceAndType(s, value_type)); + if (ts) { + memcpy(buf_ + psize + usize, ts->data(), ts_sz); + } + EncodeFixed64(buf_ + usize + psize + ts_sz, + PackSequenceAndType(s, value_type)); key_ = buf_; - key_size_ = psize + usize + sizeof(uint64_t); + key_size_ = psize + usize + sizeof(uint64_t) + ts_sz; is_user_key_ = false; } void SetInternalKey(const Slice& user_key, SequenceNumber s, - ValueType value_type = kValueTypeForSeek) { - SetInternalKey(Slice(), user_key, s, value_type); + ValueType value_type = kValueTypeForSeek, + const Slice* ts = nullptr) { + SetInternalKey(Slice(), user_key, s, value_type, ts); } void Reserve(size_t size) { @@ -528,7 +615,7 @@ void EnlargeBuffer(size_t key_size); }; -// Convert from a SliceTranform of user keys, to a SliceTransform of +// Convert from a SliceTransform of user keys, to a SliceTransform of // user keys. class InternalKeySliceTransform : public SliceTransform { public: @@ -568,7 +655,7 @@ // Read record from a write batch piece from input. // tag, column_family, key, value and blob are return values. Callers own the -// Slice they point to. +// slice they point to. // Tag is defined as ValueType. // input will be advanced to after the record. 
extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, @@ -625,8 +712,10 @@ // decreasing type (though sequence# should be enough to disambiguate) int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + const uint64_t anum = + DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes); + const uint64_t bnum = + DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes); if (anum > bnum) { r = -1; } else if (anum < bnum) { @@ -644,14 +733,42 @@ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); if (r == 0) { // Shift the number to exclude the last byte which contains the value type - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8; - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8; + const uint64_t anum = + DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8; + const uint64_t bnum = + DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8; if (anum > bnum) { r = -1; } else if (anum < bnum) { r = +1; } } + return r; +} + +inline int InternalKeyComparator::Compare(const Slice& a, + SequenceNumber a_global_seqno, + const Slice& b, + SequenceNumber b_global_seqno) const { + int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b)); + if (r == 0) { + uint64_t a_footer, b_footer; + if (a_global_seqno == kDisableGlobalSequenceNumber) { + a_footer = ExtractInternalKeyFooter(a); + } else { + a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a)); + } + if (b_global_seqno == kDisableGlobalSequenceNumber) { + b_footer = ExtractInternalKeyFooter(b); + } else { + b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b)); + } + if (a_footer > b_footer) { + r = -1; + } else if (a_footer < b_footer) { + r = +1; + } + } return r; } diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/dbformat_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/dbformat_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,8 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#include "logging/logging.h" + #include "test_util/testharness.h" +#include "test_util/testutil.h" namespace ROCKSDB_NAMESPACE { @@ -41,12 +42,12 @@ Slice in(encoded); ParsedInternalKey decoded("", 0, kTypeValue); - ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(key, decoded.user_key.ToString()); ASSERT_EQ(seq, decoded.sequence); ASSERT_EQ(vt, decoded.type); - ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); + ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */)); } class FormatTest : public testing::Test {}; @@ -186,7 +187,7 @@ Slice in(ikey); ParsedInternalKey decoded; - ASSERT_TRUE(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(user_key, decoded.user_key.ToString()); ASSERT_EQ(new_seq, decoded.sequence); ASSERT_EQ(new_val_type, decoded.type); @@ -203,5 +204,6 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/deletefile_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/deletefile_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/deletefile_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/deletefile_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -35,12 +35,12 @@ const std::string wal_dir_; DeleteFileTest() - : DBTestBase("/deletefile_test"), + : 
DBTestBase("deletefile_test", /*env_do_fsync=*/true), numlevels_(7), wal_dir_(dbname_ + "/wal_files") {} void SetOptions(Options* options) { - assert(options); + ASSERT_NE(options, nullptr); options->delete_obsolete_files_period_micros = 0; // always do full purge options->enable_thread_tracking = true; options->write_buffer_size = 1024 * 1024 * 1000; @@ -105,21 +105,27 @@ void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; for (auto file : filenames) { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } } - ASSERT_EQ(required_log, log_cnt); - ASSERT_EQ(required_sst, sst_cnt); - ASSERT_EQ(required_manifest, manifest_cnt); + if (required_log >= 0) { + ASSERT_EQ(required_log, log_cnt); + } + if (required_sst >= 0) { + ASSERT_EQ(required_sst, sst_cnt); + } + if (required_manifest >= 0) { + ASSERT_EQ(required_manifest, manifest_cnt); + } } static void DoSleep(void* arg) { @@ -180,7 +186,8 @@ ASSERT_TRUE(status.IsInvalidArgument()); // Lowest level file deletion should succeed. 
- ASSERT_OK(db_->DeleteFile(level2file)); + status = db_->DeleteFile(level2file); + ASSERT_OK(status); } TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { @@ -201,7 +208,7 @@ compact_options.change_level = true; compact_options.target_level = 2; Slice first_slice(first), last_slice(last); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 1 sst after compaction CheckFileTypeCounts(dbname_, 0, 1, 1); @@ -210,7 +217,9 @@ Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); + ASSERT_OK(itr->status()); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -237,7 +246,8 @@ ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); test::SleepingBackgroundTask sleeping_task_before; @@ -260,6 +270,41 @@ CheckFileTypeCounts(dbname_, 0, 1, 1); } +TEST_F(DeleteFileTest, PurgeDuringOpen) { + Options options = CurrentOptions(); + CheckFileTypeCounts(dbname_, -1, 0, -1); + Close(); + std::unique_ptr file; + ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file, + EnvOptions())); + ASSERT_OK(file->Close()); + CheckFileTypeCounts(dbname_, -1, 1, -1); + options.avoid_unnecessary_blocking_io = false; + options.create_if_missing = false; + Reopen(options); + CheckFileTypeCounts(dbname_, -1, 0, -1); + Close(); + + // test background purge + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + 
ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file, + EnvOptions())); + ASSERT_OK(file->Close()); + CheckFileTypeCounts(dbname_, -1, 1, -1); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + // the obsolete file is not deleted until the background purge job is ran + CheckFileTypeCounts(dbname_, -1, 1, -1); + TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1"); + ASSERT_OK(dbfull()->TEST_WaitForPurge()); + CheckFileTypeCounts(dbname_, -1, 0, -1); +} + TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { Options options = CurrentOptions(); SetOptions(&options); @@ -306,6 +351,11 @@ do_test(false); } + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + Reopen(options); + ASSERT_OK(dbfull()->TEST_WaitForPurge()); + SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->LoadDependency( @@ -313,9 +363,6 @@ "DBImpl::BGWorkPurge:start"}}); SyncPoint::GetInstance()->EnableProcessing(); - options.avoid_unnecessary_blocking_io = true; - options.create_if_missing = false; - Reopen(options); { SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); do_test(true); @@ -344,11 +391,12 @@ ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); + ASSERT_OK(itr->status()); // ReadOptions is deleted, but iterator cleanup function should not be // affected } - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -382,9 +430,11 @@ ReadOptions read_options; 
read_options.background_purge_on_iterator_cleanup = true; Iterator* itr1 = db_->NewIterator(read_options); + ASSERT_OK(itr1->status()); CreateTwoLevels(); Iterator* itr2 = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr2->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 5 sst files after 2 compactions with 2 live iterators CheckFileTypeCounts(dbname_, 0, 5, 1); @@ -417,6 +467,7 @@ CreateTwoLevels(); ReadOptions read_options; Iterator* it = db_->NewIterator(read_options); + ASSERT_OK(it->status()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -432,7 +483,7 @@ Status status = db_->DeleteFile(level2file); fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(), status.ToString().c_str()); - ASSERT_TRUE(status.ok()); + ASSERT_OK(status); it->SeekToFirst(); int numKeysIterated = 0; while(it->Valid()) { @@ -452,7 +503,7 @@ AddKeys(10, 0); VectorLogPtr logfiles; - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); ASSERT_GT(logfiles.size(), 0UL); // Take the last log file which is expected to be alive and try to delete it // Should not succeed because live logs are not allowed to be deleted @@ -461,7 +512,7 @@ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); fprintf(stdout, "Deleting alive log file %s\n", alive_log->PathName().c_str()); - ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); + ASSERT_NOK(db_->DeleteFile(alive_log->PathName())); ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); logfiles.clear(); @@ -469,10 +520,10 @@ // Call Flush again to flush out memtable and move alive log to archived log // and try to delete the archived log file FlushOptions fopts; - db_->Flush(fopts); + ASSERT_OK(db_->Flush(fopts)); AddKeys(10, 0); - db_->Flush(fopts); - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->Flush(fopts)); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); 
ASSERT_GT(logfiles.size(), 0UL); std::unique_ptr archived_log = std::move(logfiles.front()); ASSERT_EQ(archived_log->Type(), kArchivedLogFile); @@ -480,8 +531,8 @@ fprintf(stdout, "Deleting archived log file %s\n", archived_log->PathName().c_str()); ASSERT_OK(db_->DeleteFile(archived_log->PathName())); - ASSERT_EQ(Status::NotFound(), - env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); + ASSERT_TRUE( + env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound()); } TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { @@ -520,6 +571,7 @@ { std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); + ASSERT_OK(itr->status()); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); @@ -544,14 +596,6 @@ } // namespace ROCKSDB_NAMESPACE -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); -} -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS - int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,9 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). 
// #include "db/error_handler.h" + #include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" namespace ROCKSDB_NAMESPACE { @@ -32,6 +34,14 @@ Status::Code::kIOError, Status::SubCode::kSpaceLimit, true), Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kNoSpace, true), @@ -42,6 +52,12 @@ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, Status::SubCode::kSpaceLimit, true), Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kIOFenced, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kIOFenced, false), + Status::Severity::kFatalError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, Status::SubCode::kNoSpace, @@ -51,9 +67,74 @@ Status::Code::kIOError, Status::SubCode::kNoSpace, false), Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during MANIFEST write + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + 
Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during BG flush with WAL disabled + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kSpaceLimit, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + // Errors during MANIFEST write when WAL is disabled + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, + }; -std::map, Status::Severity> +std::map, + Status::Severity> DefaultErrorSeverityMap = { // 
Errors during BG compaction {std::make_tuple(BackgroundErrorReason::kCompaction, @@ -75,11 +156,11 @@ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kCorruption, false), Status::Severity::kNoError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, true), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + true), Status::Severity::kFatalError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, false), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + false), Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, @@ -94,30 +175,55 @@ {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, false), Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, + // Errors during BG flush with WAL disabled + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kCorruption, true), + Status::Severity::kUnrecoverableError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kCorruption, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, + Status::Code::kIOError, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, }; std::map, Status::Severity> DefaultReasonMap = { // Errors during BG compaction 
{std::make_tuple(BackgroundErrorReason::kCompaction, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kCompaction, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kFlush, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kWriteCallback, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, // Errors during Memtable update {std::make_tuple(BackgroundErrorReason::kMemTable, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kMemTable, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, }; void ErrorHandler::CancelErrorRecovery() { @@ -138,6 +244,10 @@ recovery_in_prog_ = false; } } + + // If auto recovery is also runing to resume from the retryable error, + // we should wait and end the auto recovery. + EndAutoRecovery(); #endif } @@ -159,16 +269,23 @@ // This can also get called as part of a recovery operation. 
In that case, we // also track the error separately in recovery_error_ so we can tell in the // end whether recovery succeeded or not -Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { +const Status& ErrorHandler::SetBGError(const Status& bg_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); - if (bg_err.ok()) { - return Status::OK(); + return bg_err; } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set regular background error\n"); + bool paranoid = db_options_.paranoid_checks; Status::Severity sev = Status::Severity::kFatalError; Status new_bg_err; + DBRecoverContext context; bool found = false; { @@ -210,7 +327,8 @@ } // Allow some error specific overrides - if (new_bg_err == Status::NoSpace()) { + if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || + new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery); } @@ -227,18 +345,146 @@ } } + recover_context_ = context; if (auto_recovery) { recovery_in_prog_ = true; // Kick-off error specific recovery - if (bg_error_ == Status::NoSpace()) { + if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace || + new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) { RecoverFromNoSpace(); } } return bg_error_; } -Status ErrorHandler::OverrideNoSpaceError(Status bg_error, +// This is the main function for looking at IO related error during the +// background operations. The main logic is: +// 1) File scope IO error is treated as retryable IO error in the write +// path. In RocksDB, If a file has write IO error and it is at file scope, +// RocksDB never write to the same file again. RocksDB will create a new +// file and rewrite the whole content. Thus, it is retryable. +// 1) if the error is caused by data loss, the error is mapped to +// unrecoverable error. 
Application/user must take action to handle +// this situation (File scope case is excluded). +// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error, +// or its retryable flag is set and not a data loss error), auto resume +// will be called and the auto resume can be controlled by resume count +// and resume interval options. There are three sub-cases: +// a) if the error happens during compaction, it is mapped to a soft error. +// the compaction thread will reschedule a new compaction. +// b) if the error happens during flush and also WAL is empty, it is mapped +// to a soft error. Note that, it includes the case that IO error happens +// in SST or manifest write during flush. +// c) all other errors are mapped to hard error. +// 3) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason +// reason) will be called to handle other error cases. +const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason) { + db_mutex_->AssertHeld(); + if (bg_io_err.ok()) { + return bg_io_err; + } + ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", + bg_io_err.ToString().c_str()); + + if (recovery_in_prog_ && recovery_io_error_.ok()) { + recovery_io_error_ = bg_io_err; + } + if (BackgroundErrorReason::kManifestWrite == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { + // Always returns ok + ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions"); + db_->DisableFileDeletionsWithLock().PermitUncheckedError(); + } + + Status new_bg_io_err = bg_io_err; + DBRecoverContext context; + if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && + bg_io_err.GetDataLoss()) { + // First, data loss (non file scope) is treated as unrecoverable error. So + // it can directly overwrite any existing bg_error_. 
+ bool auto_recovery = false; + Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); + CheckAndSetRecoveryAndBGError(bg_err); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Set background IO error as unrecoverable error\n"); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &bg_err, db_mutex_, &auto_recovery); + recover_context_ = context; + return bg_error_; + } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace && + (bg_io_err.GetScope() == + IOStatus::IOErrorScope::kIOErrorScopeFile || + bg_io_err.GetRetryable())) { + // Second, check if the error is a retryable IO error (file scope IO error + // is also treated as retryable IO error in RocksDB write path). if it is + // retryable error and its severity is higher than bg_error_, overwrite the + // bg_error_ with new error. In current stage, for retryable IO error of + // compaction, treat it as soft error. In other cases, treat the retryable + // IO error as hard error. Note that, all the NoSpace error should be + // handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter + // it is retryable or file scope, this logic will be bypassed. + bool auto_recovery = false; + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &new_bg_io_err, db_mutex_, + &auto_recovery); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set background retryable IO error\n"); + if (BackgroundErrorReason::kCompaction == reason) { + // We map the retryable IO error during compaction to soft error. 
Since + // compaction can reschedule by itself. We will not set the BG error in + // this case + // TODO: a better way to set or clean the retryable IO error which + // happens during compaction SST file write. + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Compaction will schedule by itself to resume\n"); + return bg_error_; + } else if (BackgroundErrorReason::kFlushNoWAL == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { + // When the BG Retryable IO error reason is flush without WAL, + // We map it to a soft error. At the same time, all the background work + // should be stopped except the BG work from recovery. Therefore, we + // set the soft_error_no_bg_work_ to true. At the same time, since DB + // continues to receive writes when BG error is soft error, to avoid + // to many small memtable being generated during auto resume, the flush + // reason is set to kErrorRecoveryRetryFlush. 
+ Status bg_err(new_bg_io_err, Status::Severity::kSoftError); + CheckAndSetRecoveryAndBGError(bg_err); + soft_error_no_bg_work_ = true; + context.flush_reason = FlushReason::kErrorRecoveryRetryFlush; + recover_context_ = context; + return StartRecoverFromRetryableBGIOError(bg_io_err); + } else { + Status bg_err(new_bg_io_err, Status::Severity::kHardError); + CheckAndSetRecoveryAndBGError(bg_err); + recover_context_ = context; + return StartRecoverFromRetryableBGIOError(bg_io_err); + } + } else { + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + return SetBGError(new_bg_io_err, reason); + } +} + +Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery) { #ifndef ROCKSDB_LITE if (bg_error.severity() >= Status::Severity::kFatalError) { @@ -294,10 +540,17 @@ // Signal that recovery succeeded if (recovery_error_.ok()) { Status old_bg_error = bg_error_; + // old_bg_error is only for notifying listeners, so may not be checked + old_bg_error.PermitUncheckedError(); + // Clear and check the recovery IO and BG error bg_error_ = Status::OK(); + recovery_io_error_ = IOStatus::OK(); + bg_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); recovery_in_prog_ = false; - EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, - old_bg_error, db_mutex_); + soft_error_no_bg_work_ = false; + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error, + bg_error_, db_mutex_); } return recovery_error_; #else @@ -308,6 +561,7 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { #ifndef ROCKSDB_LITE InstrumentedMutexLock l(db_mutex_); + bool no_bg_work_original_flag = soft_error_no_bg_work_; if (is_manual) { // If its a manual recovery and there's a background recovery in progress // return busy status @@ -315,9 +569,24 @@ return Status::Busy(); } recovery_in_prog_ = true; + + // In manual resume, we allow the bg work to run. 
If it is a auto resume, + // the bg work should follow this tag. + soft_error_no_bg_work_ = false; + + // In manual resume, if the bg error is a soft error and also requires + // no bg work, the error must be recovered by call the flush with + // flush reason: kErrorRecoveryRetryFlush. In other case, the flush + // reason is set to kErrorRecovery. + if (no_bg_work_original_flag) { + recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush; + } else { + recover_context_.flush_reason = FlushReason::kErrorRecovery; + } } - if (bg_error_.severity() == Status::Severity::kSoftError) { + if (bg_error_.severity() == Status::Severity::kSoftError && + recover_context_.flush_reason == FlushReason::kErrorRecovery) { // Simply clear the background error and return recovery_error_ = Status::OK(); return ClearBGError(); @@ -327,7 +596,14 @@ // during the recovery process. While recovering, the only operations that // can generate background errors should be the flush operations recovery_error_ = Status::OK(); - Status s = db_->ResumeImpl(); + recovery_error_.PermitUncheckedError(); + Status s = db_->ResumeImpl(recover_context_); + if (s.ok()) { + soft_error_no_bg_work_ = false; + } else { + soft_error_no_bg_work_ = no_bg_work_original_flag; + } + // For manual recover, shutdown, and fatal error cases, set // recovery_in_prog_ to false. For automatic background recovery, leave it // as is regardless of success or failure as it will be retried @@ -341,4 +617,186 @@ return bg_error_; #endif } + +const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( + const IOStatus& io_error) { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + if (bg_error_.ok()) { + return bg_error_; + } else if (io_error.ok()) { + return io_error; + } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { + // Auto resume BG error is not enabled, directly return bg_error_. 
+ return bg_error_; + } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); + if (recovery_thread_) { + // In this case, if recovery_in_prog_ is false, current thread should + // wait the previous recover thread to finish and create a new thread + // to recover from the bg error. + db_mutex_->Unlock(); + recovery_thread_->join(); + db_mutex_->Lock(); + } + + recovery_in_prog_ = true; + recovery_thread_.reset( + new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); + + if (recovery_io_error_.ok() && recovery_error_.ok()) { + return recovery_error_; + } else { + return bg_error_; + } +#else + (void)io_error; + return bg_error_; +#endif +} + +// Automatic recover from Retryable BG IO error. Must be called after db +// mutex is released. +void ErrorHandler::RecoverFromRetryableBGIOError() { +#ifndef ROCKSDB_LITE + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart"); + InstrumentedMutexLock l(db_mutex_); + if (end_recovery_) { + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, + Status::ShutdownInProgress(), + db_mutex_); + return; + } + DBRecoverContext context = recover_context_; + int resume_count = db_options_.max_bgerror_resume_count; + uint64_t wait_interval = db_options_.bgerror_resume_retry_interval; + uint64_t retry_count = 0; + // Recover from the retryable error. Create a separate thread to do it. 
+ while (resume_count > 0) { + if (end_recovery_) { + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, + Status::ShutdownInProgress(), + db_mutex_); + return; + } + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0"); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1"); + recovery_io_error_ = IOStatus::OK(); + recovery_error_ = Status::OK(); + retry_count++; + Status s = db_->ResumeImpl(context); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT); + } + if (s.IsShutdownInProgress() || + bg_error_.severity() >= Status::Severity::kFatalError) { + // If DB shutdown in progress or the error severity is higher than + // Hard Error, stop auto resume and returns. + recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, + bg_error_, db_mutex_); + return; + } + if (!recovery_io_error_.ok() && + recovery_error_.severity() <= Status::Severity::kHardError && + recovery_io_error_.GetRetryable()) { + // If new BG IO error happens during auto recovery and it is retryable + // and its severity is Hard Error or lower, the auto resmue sleep for + // a period of time and redo auto resume if it is allowed. + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0"); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1"); + int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; + cv_.TimedWait(wait_until); + } else { + // There are three possibility: 1) recover_io_error is set during resume + // and the error is not retryable, 2) recover is successful, 3) other + // error happens during resume and cannot be resumed here. + if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) { + // recover from the retryable IO error and no other BG errors. 
Clean + // the bg_error and notify user. + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); + Status old_bg_error = bg_error_; + bg_error_ = Status::OK(); + bg_error_.PermitUncheckedError(); + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, old_bg_error, bg_error_, db_mutex_); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + recovery_in_prog_ = false; + if (soft_error_no_bg_work_) { + soft_error_no_bg_work_ = false; + } + return; + } else { + // In this case: 1) recovery_io_error is more serious or not retryable + // 2) other Non IO recovery_error happens. The auto recovery stops. + recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, bg_error_, + !recovery_io_error_.ok() + ? recovery_io_error_ + : (!recovery_error_.ok() ? 
recovery_error_ : s), + db_mutex_); + return; + } + } + resume_count--; + } + recovery_in_prog_ = false; + EventHelpers::NotifyOnErrorRecoveryEnd( + db_options_.listeners, bg_error_, + Status::Aborted("Exceeded resume retry count"), db_mutex_); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } + return; +#else + return; +#endif +} + +void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { + if (recovery_in_prog_ && recovery_error_.ok()) { + recovery_error_ = bg_err; + } + if (bg_err.severity() > bg_error_.severity()) { + bg_error_ = bg_err; + } + return; +} + +void ErrorHandler::EndAutoRecovery() { + db_mutex_->AssertHeld(); + if (!end_recovery_) { + end_recovery_ = true; + } + cv_.SignalAll(); + db_mutex_->Unlock(); + if (recovery_thread_) { + recovery_thread_->join(); + } + db_mutex_->Lock(); + return; +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" +#include "rocksdb/io_status.h" #include "rocksdb/listener.h" #include "rocksdb/status.h" @@ -13,18 +14,36 @@ class DBImpl; +// This structure is used to store the DB recovery context. The context is +// the information that related to the recover actions. For example, it contains +// FlushReason, which tells the flush job why this flush is called. 
+struct DBRecoverContext { + FlushReason flush_reason; + + DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {} + + DBRecoverContext(FlushReason reason) : flush_reason(reason) {} +}; + class ErrorHandler { public: ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options, InstrumentedMutex* db_mutex) : db_(db), db_options_(db_options), - bg_error_(Status::OK()), - recovery_error_(Status::OK()), + cv_(db_mutex), + end_recovery_(false), + recovery_thread_(nullptr), db_mutex_(db_mutex), auto_recovery_(false), - recovery_in_prog_(false) {} - ~ErrorHandler() {} + recovery_in_prog_(false), + soft_error_no_bg_work_(false), + bg_error_stats_(db_options.statistics) { + // Clear the checked flag for uninitialized errors + bg_error_.PermitUncheckedError(); + recovery_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); + } void EnableAutoRecovery() { auto_recovery_ = true; } @@ -32,11 +51,14 @@ Status::Code code, Status::SubCode subcode); - Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason); + + const Status& SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason); - Status GetBGError() { return bg_error_; } + Status GetBGError() const { return bg_error_; } - Status GetRecoveryError() { return recovery_error_; } + Status GetRecoveryError() const { return recovery_error_; } Status ClearBGError(); @@ -48,14 +70,18 @@ bool IsBGWorkStopped() { return !bg_error_.ok() && (bg_error_.severity() >= Status::Severity::kHardError || - !auto_recovery_); + !auto_recovery_ || soft_error_no_bg_work_); } + bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; } + bool IsRecoveryInProgress() { return recovery_in_prog_; } Status RecoverFromBGError(bool is_manual = false); void CancelErrorRecovery(); + void EndAutoRecovery(); + private: DBImpl* db_; const ImmutableDBOptions& db_options_; @@ -63,13 +89,37 @@ // A separate Status variable 
used to record any errors during the // recovery process from hard errors Status recovery_error_; + // A separate IO Status variable used to record any IO errors during + // the recovery process. At the same time, recovery_error_ is also set. + IOStatus recovery_io_error_; + // The condition variable used with db_mutex during auto resume for time + // wait. + InstrumentedCondVar cv_; + bool end_recovery_; + std::unique_ptr recovery_thread_; + InstrumentedMutex* db_mutex_; // A flag indicating whether automatic recovery from errors is enabled bool auto_recovery_; bool recovery_in_prog_; + // A flag to indicate that for the soft error, we should not allow any + // background work except the work is from recovery. + bool soft_error_no_bg_work_; + + // Used to store the context for recover, such as flush reason. + DBRecoverContext recover_context_; + + // The pointer of DB statistics. + std::shared_ptr bg_error_stats_; - Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); void RecoverFromNoSpace(); + const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); + void RecoverFromRetryableBGIOError(); + // First, if it is in recovery and the recovery_error is ok. Set the + // recovery_error_ to bg_err. Second, if the severity is higher than the + // current bg_error_, overwrite it. + void CheckAndSetRecoveryAndBGError(const Status& bg_err); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_fs_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,2663 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" +#include "port/stack_trace.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { + +class DBErrorHandlingFSTest : public DBTestBase { + public: + DBErrorHandlingFSTest() + : DBTestBase("db_error_handling_fs_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::string GetManifestNameFromLiveFiles() { + std::vector live_files; + uint64_t manifest_size; + + Status s = dbfull()->GetLiveFiles(live_files, &manifest_size, false); + if (!s.ok()) { + return ""; + } + for (auto& file : live_files) { + uint64_t num = 0; + FileType type; + if (ParseFileName(file, &num, &type) && type == kDescriptorFile) { + return file; + } + } + return ""; + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +class ErrorHandlerFSListener : public EventListener { + public: + ErrorHandlerFSListener() + : mutex_(), + cv_(&mutex_), + no_auto_recovery_(false), + recovery_complete_(false), + file_creation_started_(false), + override_bg_error_(false), + file_count_(0), + fault_fs_(nullptr) {} + ~ErrorHandlerFSListener() { + file_creation_error_.PermitUncheckedError(); + bg_error_.PermitUncheckedError(); + 
new_bg_error_.PermitUncheckedError(); + } + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*ti*/) override { + InstrumentedMutexLock l(&mutex_); + file_creation_started_ = true; + if (file_count_ > 0) { + if (--file_count_ == 0) { + fault_fs_->SetFilesystemActive(false, file_creation_error_); + file_creation_error_ = IOStatus::OK(); + } + } + cv_.SignalAll(); + } + + void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, Status bg_error, + bool* auto_recovery) override { + bg_error.PermitUncheckedError(); + if (*auto_recovery && no_auto_recovery_) { + *auto_recovery = false; + } + } + + void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override { + InstrumentedMutexLock l(&mutex_); + recovery_complete_ = true; + cv_.SignalAll(); + new_bg_error_ = info.new_bg_error; + } + + bool WaitForRecovery(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!recovery_complete_) { + cv_.Wait(/*abs_time_us*/); + } + if (recovery_complete_) { + recovery_complete_ = false; + return true; + } + return false; + } + + void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!file_creation_started_) { + cv_.Wait(/*abs_time_us*/); + } + file_creation_started_ = false; + } + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (override_bg_error_) { + *bg_error = bg_error_; + override_bg_error_ = false; + } + } + + void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } + + void OverrideBGError(Status bg_err) { + bg_error_ = bg_err; + override_bg_error_ = true; + } + + void InjectFileCreationError(FaultInjectionTestFS* fs, int file_count, + IOStatus io_s) { + fault_fs_ = fs; + file_count_ = file_count; + file_creation_error_ = io_s; + } + + Status new_bg_error() { return new_bg_error_; } + + private: + InstrumentedMutex mutex_; + InstrumentedCondVar cv_; + bool no_auto_recovery_; + bool recovery_complete_; + 
bool file_creation_started_; + bool override_bg_error_; + int file_count_; + IOStatus file_creation_error_; + Status bg_error_; + Status new_bg_error_; + FaultInjectionTestFS* fault_fs_; +}; + +TEST_F(DBErrorHandlingFSTest, FLushWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + Destroy(options); +} + +// All the NoSpace IOError will be handled as the regular BG Error no matter the +// retryable flag is set of not. So the auto resume for retryable IO Error will +// not be triggered. Also, it is mapped as hard error. 
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + 
options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, 
error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + 
+ ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + // not file scope, but retyrable set + error_msg.SetDataLoss(false); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + 
wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.atomic_flush = true; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +// The flush error is injected before we finish the 
table build +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Destroy(options); +} + +// The retryable 
IO error is injected before we sync table +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +// The retryable IO error is injected before we close the table file +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + + 
ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_OK(Put(Key(2), "val2", wo)); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val2", Get(Key(2))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + 
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + 
ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + 
SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + fault_fs_->SetFilesystemActive(true); + + // This Resume() will attempt to create a new manifest file and fail again + s = dbfull()->Resume(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // A successful Resume() will create a new manifest file + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock 
environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + s = Flush(); + ASSERT_OK(s); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteError:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, + // Wait for DB instance to clear bg_error before calling + // TEST_WaitForCompact + {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}}); + // trigger manifest write failure in compaction thread + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + // This Flush will trigger a compaction, which will fail when appending to + // the manifest + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteError:0"); + // Clear all errors so when the compaction is retried, it will 
succeed + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("CompactionManifestWriteError:1"); + TEST_SYNC_POINT("CompactionManifestWriteError:2"); + + s = dbfull()->TEST_WaitForCompact(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteError:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}}); + // trigger manifest write failure in compaction thread + 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteError:0"); + TEST_SYNC_POINT("CompactionManifestWriteError:1"); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError( + Status(Status::NoSpace(), Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs_->SetFilesystemActive(false, + 
IOStatus::NoSpace("Out of space")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + 
Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, CorruptionError) { + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + Status s; + 
DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs_->SetFilesystemActive(false, + IOStatus::Corruption("Corruption")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_NOK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + s = Put(Key(1), "val"); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, 
options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + // We should be able to shutdown the database while auto recovery is going + // on in the background + Close(); + DestroyDB(dbname_, options).PermitUncheckedError(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for 
(auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Reopen(options); + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 0; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + 
ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOError()); + } + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + ASSERT_OK(dbfull()->Resume()); + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + Random rnd(301); 
+ + listener->EnableAutoRecovery(); + CreateAndReopenWithCF({"one", "two", "three"}, options); + + { + WriteBatch batch; + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 100; ++j) { + ASSERT_OK(batch.Put(handles_[i], Key(j), rnd.RandomString(1024))); + } + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + // Write to one CF + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(handles_[2], Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsNoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + for (auto i = 1; i < 4; ++i) { + // Every CF should have been flushed + ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); + } + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); + std::vector> fault_envs; + std::vector fault_fs; + std::vector options; + 
std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerFSListener()); + options.emplace_back(GetDefaultOptions()); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); + std::shared_ptr fs(fault_fs.back()); + fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); + options[i].env = fault_envs.back().get(); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_fs[i], 3, + IOStatus::NoSpace("Out of space")); + snprintf(buf, sizeof(buf), "_%d", i); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + 
ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + fault_fs[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + SstFileManagerImpl* sfmImpl = + static_cast_with_check(sfm.get()); + sfmImpl->Close(); + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + delete db[i]; + fault_fs[i]->SetFilesystemActive(true); + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + } + } + options.clear(); + sfm.reset(); + delete def_env; +} + +TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); + std::vector> fault_envs; + std::vector fault_fs; + std::vector options; + std::vector> listener; + std::vector db; + std::shared_ptr sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerFSListener()); + options.emplace_back(GetDefaultOptions()); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); + std::shared_ptr fs(fault_fs.back()); + fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); + options[i].env = fault_envs.back().get(); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + 
options[i].writable_file_max_buffer_size = 32768; + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + switch (i) { + case 0: + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_fs[i], 3, + IOStatus::NoSpace("Out of space")); + break; + case 1: + // Setup for returning error after the 1st SST, which would result + // in a hard error + listener[i]->InjectFileCreationError(fault_fs[i], 2, + IOStatus::NoSpace("Out of space")); + break; + default: + break; + } + snprintf(buf, sizeof(buf), "_%d", i); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(db[i]->Write(wopts, &batch)); + if (i != 1) { + ASSERT_OK(db[i]->Flush(FlushOptions())); + } else { + ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace()); + } + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast(db[i])->TEST_WaitForCompact(true); + switch (i) { + case 0: + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + break; + case 1: + ASSERT_EQ(s.severity(), Status::Severity::kHardError); + break; + case 2: + ASSERT_OK(s); + break; + } + fault_fs[i]->SetFilesystemActive(true); + } + + 
def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + if (i < 2) { + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + } + if (i == 1) { + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); + } + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE( + db[i]->GetProperty("rocksdb.num-files-at-level" + ToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + SstFileManagerImpl* sfmImpl = + static_cast_with_check(sfm.get()); + sfmImpl->Close(); + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + fault_fs[i]->SetFilesystemActive(true); + delete db[i]; + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + } + } + options.clear(); + delete def_env; +} + +// When Put the KV-pair, the write option is set to disable WAL. +// If retryable error happens in this condition, map the bg error +// to soft error and trigger auto resume. During auto resume, SwitchMemtable +// is disabled to avoid small SST tables. Write can still be applied before +// the bg error is cleaned unless the memtable is full. 
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { + // Activate the FS before the first resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(1), "val1", wo)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:LoopOut", + "FLushWritNoWALRetryableeErrorAutoRecover1:1"}}); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1"); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ("val1", Get(Key(1))); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, 
options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); + ASSERT_OK(Put(Key(2), "val2", wo)); + s = Flush(); + // Since auto resume fails, the bg error is not cleand, flush will + // return the bg_error set before. + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ("val2", Get(Key(2))); + + // call auto resume + ASSERT_OK(dbfull()->Resume()); + ASSERT_OK(Put(Key(3), "val3", wo)); + // After resume is successful, the flush should be ok. + ASSERT_OK(Flush()); + ASSERT_EQ("val3", Get(Key(3))); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) { + // Activate the FS before the first resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(1), "val1", wo)); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + 
ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); + ASSERT_OK(Put(Key(2), "val2", wo)); + s = Flush(); + // Since auto resume is successful, the bg error is cleaned, flush will + // be successful. + ASSERT_OK(s); + ASSERT_EQ("val2", Get(Key(2))); + Destroy(options); +} + +// Auto resume fromt the flush retryable IO error. Activate the FS before the +// first resume. 
Resume is successful +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) { + // Activate the FS before the first resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + ASSERT_EQ("val1", Get(Key(1))); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(2), "val2")); + ASSERT_OK(Flush()); + ASSERT_EQ("val2", Get(Key(2))); + + Destroy(options); +} + +// Auto resume fromt the flush retryable IO error and set the retry limit count. 
+// Never activate the FS and auto resume should fail at the end +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { + // Fail all the resume and let user to resume + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"FLushWritRetryableeErrorAutoRecover2:0", + "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:LoopOut", + "FLushWritRetryableeErrorAutoRecover2:1"}}); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0"); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_EQ("val1", Get(Key(1))); + // Auto resume fails due to FS does not recover during resume. User call + // resume manually here. + s = dbfull()->Resume(); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(s); + ASSERT_OK(Put(Key(2), "val2")); + ASSERT_OK(Flush()); + ASSERT_EQ("val2", Get(Key(2))); + + Destroy(options); +} + +// Auto resume fromt the flush retryable IO error and set the retry limit count. 
+// Fail the first resume and let the second resume be successful. +TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeStart", + "ManifestWriteRetryableErrorAutoRecover:0"}, + {"ManifestWriteRetryableErrorAutoRecover:1", + "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteRetryableErrorAutoRecover:2"}}); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2"); + SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + 
Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeStart", + "ManifestWriteNoWALRetryableErrorAutoRecover:0"}, + {"ManifestWriteNoWALRetryableErrorAutoRecover:1", + "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteNoWALRetryableErrorAutoRecover:2"}}); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2"); + 
SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, + CompactionManifestWriteRetryableErrorAutoRecover) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + std::string old_manifest; + std::string new_manifest; + std::atomic fail_manifest(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Put(Key(2), "val")); + ASSERT_OK(Flush()); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + // Wait for flush of 2nd L0 file before starting compaction + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + // Wait for compaction to detect manifest write error + {"BackgroundCallCompaction:1", "CompactionManifestWriteErrorAR:0"}, + // Make compaction thread wait for error to be cleared + {"CompactionManifestWriteErrorAR:1", + "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, + {"CompactionManifestWriteErrorAR:2", + "RecoverFromRetryableBGIOError:BeforeStart"}, + // Fail the first resume, before the wait in resume + {"RecoverFromRetryableBGIOError:BeforeResume0", + "CompactionManifestWriteErrorAR:3"}, + // Activate the FS before the second resume + {"CompactionManifestWriteErrorAR:4", + 
"RecoverFromRetryableBGIOError:BeforeResume1"}, + // Wait the auto resume be sucessful + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "CompactionManifestWriteErrorAR:5"}}); + // trigger manifest write failure in compaction thread + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + if (fail_manifest.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1"); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4"); + TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5"); + SyncPoint::GetInstance()->DisableProcessing(); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + ASSERT_EQ("val", Get(Key(2))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { + // In this test, in the first round of compaction, the FS is set to error. + // So the first compaction fails due to retryable IO error and it is mapped + // to soft error. Then, compaction is rescheduled, in the second round of + // compaction, the FS is set to active and compaction is successful, so + // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync + // point. 
+ std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + Status s; + std::atomic fail_first(false); + std::atomic fail_second(true); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}, + {"CompactionJob::FinishCompactionOutputFile1", + "CompactionWriteRetryableErrorAutoRecover0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", + [&](void*) { fault_fs_->SetFilesystemActive(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", [&](void*) { + if (fail_first.load() && fail_second.load()) { + fault_fs_->SetFilesystemActive(false, error_msg); + fail_second.store(false); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_OK(s); + TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0"); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options 
options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"}, + {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}}); + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(true, s.IsIOError()); + TEST_SYNC_POINT("WALWriteErrorDone"); + + TEST_SYNC_POINT("WALWriteError1:0"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("WALWriteError1:1"); + 
TEST_SYNC_POINT("WALWriteError1:2"); + } + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { + // Fail the first recover and try second time. + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = true; + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + Random rnd(301); + + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + // For the first batch, write is successful, require sync + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + // For the second batch, the first 2 file Append are successful, then the + // following Append fails due to file system retryable IOError. 
+ { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 200; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:BeforeWait0", "WALWriteError2:0"}, + {"WALWriteError2:1", "RecoverFromRetryableBGIOError:BeforeWait1"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError2:2"}}); + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, error_msg); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(true, s.IsIOError()); + + TEST_SYNC_POINT("WALWriteError2:0"); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + TEST_SYNC_POINT("WALWriteError2:1"); + TEST_SYNC_POINT("WALWriteError2:2"); + } + SyncPoint::GetInstance()->DisableProcessing(); + + // Data in corrupted WAL are not stored + for (auto i = 0; i < 199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + + // Resume and write a new batch, should be in the WAL + { + WriteBatch batch; + + for (auto i = 200; i < 300; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + Reopen(options); + for (auto i = 0; i < 300; ++i) { + if (i < 100 || i >= 200) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +// Fail auto resume from a flush retryable error and verify that +// OnErrorRecoveryEnd listener callback is called +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) { + // Activate the FS before the first resume + std::shared_ptr listener( + new 
ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + ASSERT_EQ(listener->new_bg_error(), Status::Aborted()); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + + Destroy(options); +} + +class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest, + public testing::WithParamInterface {}; + +TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "val")); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + 
ASSERT_TRUE(s.IsIOFenced()); + Destroy(options); +} + +TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_TRUE(s.IsIOFenced()); + Close(); +} + +TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->EnableAutoRecovery(true); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", 
[&](void*) { + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + ASSERT_TRUE(s.IsIOFenced()); + + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_TRUE(s.IsIOFenced()); + Destroy(options); +} + +TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.listeners.emplace_back(listener); + options.paranoid_checks = GetParam(); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(true); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_OK(dbfull()->Write(wopts, &batch)); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i < 199; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_fs_->SetFilesystemActive(false, + IOStatus::IOFenced("IO fenced")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOFenced()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + { + WriteBatch batch; + + for (auto i = 0; i < 100; ++i) { + ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); + } + + WriteOptions wopts; + wopts.sync = true; + s = 
dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOFenced()); + } + Close(); +} + +INSTANTIATE_TEST_CASE_P(DBErrorHandlingFSTest, DBErrorHandlingFencingTest, + ::testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/error_handler_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/error_handler_test.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,871 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE - -#include "db/db_test_util.h" -#include "port/stack_trace.h" -#include "rocksdb/perf_context.h" -#include "rocksdb/sst_file_manager.h" -#include "test_util/fault_injection_test_env.h" -#if !defined(ROCKSDB_LITE) -#include "test_util/sync_point.h" -#endif - -namespace ROCKSDB_NAMESPACE { - -class DBErrorHandlingTest : public DBTestBase { - public: - DBErrorHandlingTest() : DBTestBase("/db_error_handling_test") {} - - std::string GetManifestNameFromLiveFiles() { - std::vector live_files; - uint64_t manifest_size; - - dbfull()->GetLiveFiles(live_files, &manifest_size, false); - for (auto& file : live_files) { - uint64_t num = 0; - FileType type; - if (ParseFileName(file, &num, &type) && type == kDescriptorFile) { - return file; - } - } - return ""; - } -}; - -class DBErrorHandlingEnv : public EnvWrapper { - public: - DBErrorHandlingEnv() : EnvWrapper(Env::Default()), - trig_no_space(false), trig_io_error(false) {} - - void SetTrigNoSpace() {trig_no_space = true;} - void SetTrigIoError() {trig_io_error = true;} - private: - bool trig_no_space; - bool trig_io_error; -}; - -class ErrorHandlerListener : public EventListener { - public: - ErrorHandlerListener() - : mutex_(), - cv_(&mutex_), - no_auto_recovery_(false), - recovery_complete_(false), - file_creation_started_(false), - override_bg_error_(false), - file_count_(0), - fault_env_(nullptr) {} - - void OnTableFileCreationStarted( - const TableFileCreationBriefInfo& /*ti*/) override { - InstrumentedMutexLock l(&mutex_); - file_creation_started_ = true; - if (file_count_ > 0) { - if (--file_count_ == 0) { - fault_env_->SetFilesystemActive(false, file_creation_error_); - file_creation_error_ = Status::OK(); - } - } - cv_.SignalAll(); - } - - void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, - Status /*bg_error*/, - bool* auto_recovery) override { - if (*auto_recovery && no_auto_recovery_) { - *auto_recovery = false; - } - } - - void OnErrorRecoveryCompleted(Status /*old_bg_error*/) 
override { - InstrumentedMutexLock l(&mutex_); - recovery_complete_ = true; - cv_.SignalAll(); - } - - bool WaitForRecovery(uint64_t /*abs_time_us*/) { - InstrumentedMutexLock l(&mutex_); - while (!recovery_complete_) { - cv_.Wait(/*abs_time_us*/); - } - if (recovery_complete_) { - recovery_complete_ = false; - return true; - } - return false; - } - - void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { - InstrumentedMutexLock l(&mutex_); - while (!file_creation_started_) { - cv_.Wait(/*abs_time_us*/); - } - file_creation_started_ = false; - } - - void OnBackgroundError(BackgroundErrorReason /*reason*/, - Status* bg_error) override { - if (override_bg_error_) { - *bg_error = bg_error_; - override_bg_error_ = false; - } - } - - void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } - - void OverrideBGError(Status bg_err) { - bg_error_ = bg_err; - override_bg_error_ = true; - } - - void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count, - Status s) { - fault_env_ = env; - file_count_ = file_count; - file_creation_error_ = s; - } - - private: - InstrumentedMutex mutex_; - InstrumentedCondVar cv_; - bool no_auto_recovery_; - bool recovery_complete_; - bool file_creation_started_; - bool override_bg_error_; - int file_count_; - Status file_creation_error_; - Status bg_error_; - FaultInjectionTestEnv* fault_env_; -}; - -TEST_F(DBErrorHandlingTest, FLushWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - Put(Key(0), "val"); - SyncPoint::GetInstance()->SetCallBack( - "FlushJob::Start", [&](void *) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - 
SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, ManifestWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - std::string old_manifest; - std::string new_manifest; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - old_manifest = GetManifestNameFromLiveFiles(); - - Put(Key(0), "val"); - Flush(); - Put(Key(1), "val"); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::LogAndApply:WriteManifest", [&](void *) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - - new_manifest = GetManifestNameFromLiveFiles(); - ASSERT_NE(new_manifest, old_manifest); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - Close(); -} - -TEST_F(DBErrorHandlingTest, DoubleManifestWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status 
s; - std::string old_manifest; - std::string new_manifest; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - old_manifest = GetManifestNameFromLiveFiles(); - - Put(Key(0), "val"); - Flush(); - Put(Key(1), "val"); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::LogAndApply:WriteManifest", [&](void *) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_env->SetFilesystemActive(true); - - // This Resume() will attempt to create a new manifest file and fail again - s = dbfull()->Resume(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_env->SetFilesystemActive(true); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - - // A successful Resume() will create a new manifest file - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - - new_manifest = GetManifestNameFromLiveFiles(); - ASSERT_NE(new_manifest, old_manifest); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - Close(); -} - -TEST_F(DBErrorHandlingTest, CompactionManifestWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.listeners.emplace_back(listener); - options.env = fault_env.get(); - Status s; - std::string old_manifest; - std::string new_manifest; - std::atomic fail_manifest(false); - DestroyAndReopen(options); - old_manifest = GetManifestNameFromLiveFiles(); - - Put(Key(0), "val"); - Put(Key(2), "val"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - // Wait for flush of 2nd L0 file before 
starting compaction - {{"DBImpl::FlushMemTable:FlushMemTableFinished", - "BackgroundCallCompaction:0"}, - // Wait for compaction to detect manifest write error - {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"}, - // Make compaction thread wait for error to be cleared - {"CompactionManifestWriteError:1", - "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}, - // Wait for DB instance to clear bg_error before calling - // TEST_WaitForCompact - {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}}); - // trigger manifest write failure in compaction thread - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "VersionSet::LogAndApply:WriteManifest", [&](void*) { - if (fail_manifest.load()) { - fault_env->SetFilesystemActive(false, - Status::NoSpace("Out of space")); - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Put(Key(1), "val"); - // This Flush will trigger a compaction, which will fail when appending to - // the manifest - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - TEST_SYNC_POINT("CompactionManifestWriteError:0"); - // Clear all errors so when the compaction is retried, it will succeed - fault_env->SetFilesystemActive(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("CompactionManifestWriteError:1"); - TEST_SYNC_POINT("CompactionManifestWriteError:2"); - - s = dbfull()->TEST_WaitForCompact(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ASSERT_EQ(s, Status::OK()); - - new_manifest = GetManifestNameFromLiveFiles(); - ASSERT_NE(new_manifest, old_manifest); - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - ASSERT_EQ("val", Get(Key(2))); - Close(); -} - -TEST_F(DBErrorHandlingTest, CompactionWriteError) { - std::unique_ptr fault_env( - new 
FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.listeners.emplace_back(listener); - options.env = fault_env.get(); - Status s; - DestroyAndReopen(options); - - Put(Key(0), "va;"); - Put(Key(2), "va;"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - listener->OverrideBGError( - Status(Status::NoSpace(), Status::Severity::kHardError) - ); - listener->EnableAutoRecovery(false); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::FlushMemTable:FlushMemTableFinished", - "BackgroundCallCompaction:0"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void*) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Put(Key(1), "val"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, CorruptionError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.env = fault_env.get(); - Status s; - DestroyAndReopen(options); - - Put(Key(0), "va;"); - Put(Key(2), "va;"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::FlushMemTable:FlushMemTableFinished", - "BackgroundCallCompaction:0"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void*) { - fault_env->SetFilesystemActive(false, 
Status::Corruption("Corruption")); - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Put(Key(1), "val"); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - - s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), - ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); - - fault_env->SetFilesystemActive(true); - s = dbfull()->Resume(); - ASSERT_NE(s, Status::OK()); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - - listener->EnableAutoRecovery(); - DestroyAndReopen(options); - - Put(Key(0), "val"); - SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - ASSERT_EQ(listener->WaitForRecovery(5000000), true); - - s = Put(Key(1), "val"); - ASSERT_EQ(s, Status::OK()); - - Reopen(options); - ASSERT_EQ("val", Get(Key(0))); - ASSERT_EQ("val", Get(Key(1))); - Destroy(options); -} - -TEST_F(DBErrorHandlingTest, FailRecoverFlushError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - - listener->EnableAutoRecovery(); - DestroyAndReopen(options); - - Put(Key(0), "val"); - SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - 
fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - // We should be able to shutdown the database while auto recovery is going - // on in the background - Close(); - DestroyDB(dbname_, options); -} - -TEST_F(DBErrorHandlingTest, WALWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.writable_file_max_buffer_size = 32768; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - Random rnd(301); - - listener->EnableAutoRecovery(); - DestroyAndReopen(options); - - { - WriteBatch batch; - - for (auto i = 0; i<100; ++i) { - batch.Put(Key(i), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); - }; - - { - WriteBatch batch; - int write_error = 0; - - for (auto i = 100; i<199; ++i) { - batch.Put(Key(i), RandomString(&rnd, 1024)); - } - - SyncPoint::GetInstance()->SetCallBack("WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { - write_error++; - if (write_error > 2) { - fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - } - }); - SyncPoint::GetInstance()->EnableProcessing(); - WriteOptions wopts; - wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(s, s.NoSpace()); - } - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - ASSERT_EQ(listener->WaitForRecovery(5000000), true); - for (auto i=0; i<199; ++i) { - if (i < 100) { - ASSERT_NE(Get(Key(i)), "NOT_FOUND"); - } else { - ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); - } - } - Reopen(options); - for (auto i=0; i<199; ++i) { - if (i < 100) { - ASSERT_NE(Get(Key(i)), "NOT_FOUND"); - } else { 
- ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); - } - } - Close(); -} - -TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) { - std::unique_ptr fault_env( - new FaultInjectionTestEnv(Env::Default())); - std::shared_ptr listener(new ErrorHandlerListener()); - Options options = GetDefaultOptions(); - options.create_if_missing = true; - options.writable_file_max_buffer_size = 32768; - options.env = fault_env.get(); - options.listeners.emplace_back(listener); - Status s; - Random rnd(301); - - listener->EnableAutoRecovery(); - CreateAndReopenWithCF({"one", "two", "three"}, options); - - { - WriteBatch batch; - - for (auto i = 1; i < 4; ++i) { - for (auto j = 0; j < 100; ++j) { - batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024)); - } - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); - }; - - { - WriteBatch batch; - int write_error = 0; - - // Write to one CF - for (auto i = 100; i < 199; ++i) { - batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024)); - } - - SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { - write_error++; - if (write_error > 2) { - fault_env->SetFilesystemActive(false, - Status::NoSpace("Out of space")); - } - }); - SyncPoint::GetInstance()->EnableProcessing(); - WriteOptions wopts; - wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(s, s.NoSpace()); - } - SyncPoint::GetInstance()->DisableProcessing(); - fault_env->SetFilesystemActive(true); - ASSERT_EQ(listener->WaitForRecovery(5000000), true); - - for (auto i = 1; i < 4; ++i) { - // Every CF should have been flushed - ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); - } - - for (auto i = 1; i < 4; ++i) { - for (auto j = 0; j < 199; ++j) { - if (j < 100) { - ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); - } else { - ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); - } - } - } - ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); - for (auto i = 1; i < 4; ++i) { - for (auto j 
= 0; j < 199; ++j) { - if (j < 100) { - ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); - } else { - ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); - } - } - } - Close(); -} - -TEST_F(DBErrorHandlingTest, MultiDBCompactionError) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); - std::vector> fault_env; - std::vector options; - std::vector> listener; - std::vector db; - std::shared_ptr sfm(NewSstFileManager(def_env)); - int kNumDbInstances = 3; - Random rnd(301); - - for (auto i = 0; i < kNumDbInstances; ++i) { - listener.emplace_back(new ErrorHandlerListener()); - options.emplace_back(GetDefaultOptions()); - fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); - options[i].create_if_missing = true; - options[i].level0_file_num_compaction_trigger = 2; - options[i].writable_file_max_buffer_size = 32768; - options[i].env = fault_env[i].get(); - options[i].listeners.emplace_back(listener[i]); - options[i].sst_file_manager = sfm; - DB* dbptr; - char buf[16]; - - listener[i]->EnableAutoRecovery(); - // Setup for returning error for the 3rd SST, which would be level 1 - listener[i]->InjectFileCreationError(fault_env[i].get(), 3, - Status::NoSpace("Out of space")); - snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); - db.emplace_back(dbptr); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - for (auto j = 0; j <= 100; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } - - def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - // Write to one CF - for (auto j = 100; j < 199; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - 
WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(true); - ASSERT_EQ(s.severity(), Status::Severity::kSoftError); - fault_env[i]->SetFilesystemActive(true); - } - - def_env->SetFilesystemActive(true); - for (auto i = 0; i < kNumDbInstances; ++i) { - std::string prop; - ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(0), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 0); - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(1), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 1); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - char buf[16]; - snprintf(buf, sizeof(buf), "_%d", i); - delete db[i]; - fault_env[i]->SetFilesystemActive(true); - if (getenv("KEEP_DB")) { - printf("DB is still at %s%s\n", dbname_.c_str(), buf); - } else { - Status s = DestroyDB(dbname_ + std::string(buf), options[i]); - } - } - options.clear(); - sfm.reset(); - delete def_env; -} - -TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); - std::vector> fault_env; - std::vector options; - std::vector> listener; - std::vector db; - std::shared_ptr sfm(NewSstFileManager(def_env)); - int kNumDbInstances = 3; - Random rnd(301); - - for (auto i = 0; i < kNumDbInstances; ++i) { - listener.emplace_back(new ErrorHandlerListener()); - options.emplace_back(GetDefaultOptions()); - fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); - options[i].create_if_missing = true; - options[i].level0_file_num_compaction_trigger = 2; - options[i].writable_file_max_buffer_size = 32768; - options[i].env = fault_env[i].get(); - 
options[i].listeners.emplace_back(listener[i]); - options[i].sst_file_manager = sfm; - DB* dbptr; - char buf[16]; - - listener[i]->EnableAutoRecovery(); - switch (i) { - case 0: - // Setup for returning error for the 3rd SST, which would be level 1 - listener[i]->InjectFileCreationError(fault_env[i].get(), 3, - Status::NoSpace("Out of space")); - break; - case 1: - // Setup for returning error after the 1st SST, which would result - // in a hard error - listener[i]->InjectFileCreationError(fault_env[i].get(), 2, - Status::NoSpace("Out of space")); - break; - default: - break; - } - snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); - db.emplace_back(dbptr); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - for (auto j = 0; j <= 100; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } - - def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); - for (auto i = 0; i < kNumDbInstances; ++i) { - WriteBatch batch; - - // Write to one CF - for (auto j = 100; j < 199; ++j) { - batch.Put(Key(j), RandomString(&rnd, 1024)); - } - - WriteOptions wopts; - wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - if (i != 1) { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); - } else { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace()); - } - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(true); - switch (i) { - case 0: - ASSERT_EQ(s.severity(), Status::Severity::kSoftError); - break; - case 1: - ASSERT_EQ(s.severity(), Status::Severity::kHardError); - break; - case 2: - ASSERT_EQ(s, Status::OK()); - break; - } - fault_env[i]->SetFilesystemActive(true); - } 
- - def_env->SetFilesystemActive(true); - for (auto i = 0; i < kNumDbInstances; ++i) { - std::string prop; - if (i < 2) { - ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - } - if (i == 1) { - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); - } - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(0), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 0); - EXPECT_TRUE(db[i]->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(1), &prop)); - EXPECT_EQ(atoi(prop.c_str()), 1); - } - - for (auto i = 0; i < kNumDbInstances; ++i) { - char buf[16]; - snprintf(buf, sizeof(buf), "_%d", i); - fault_env[i]->SetFilesystemActive(true); - delete db[i]; - if (getenv("KEEP_DB")) { - printf("DB is still at %s%s\n", dbname_.c_str(), buf); - } else { - DestroyDB(dbname_ + std::string(buf), options[i]); - } - } - options.clear(); - delete def_env; -} - -} // namespace ROCKSDB_NAMESPACE - -int main(int argc, char** argv) { - ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,18 @@ #include "db/event_helpers.h" +#include "rocksdb/convenience.h" +#include "rocksdb/listener.h" +#include "rocksdb/utilities/customizable_util.h" + namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +Status EventListener::CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result) { + return 
LoadSharedObject(config_options, id, nullptr, result); +} +#endif // ROCKSDB_LITE namespace { template @@ -26,6 +37,9 @@ const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, TableFileCreationReason reason) { + if (listeners.empty()) { + return; + } TableFileCreationBriefInfo info; info.db_name = db_name; info.cf_name = cf_name; @@ -43,7 +57,7 @@ BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex, bool* auto_recovery) { #ifndef ROCKSDB_LITE - if (listeners.size() == 0U) { + if (listeners.empty()) { return; } db_mutex->AssertHeld(); @@ -51,6 +65,7 @@ db_mutex->Unlock(); for (auto& listener : listeners) { listener->OnBackgroundError(reason, bg_error); + bg_error->PermitUncheckedError(); if (*auto_recovery) { listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery); } @@ -71,14 +86,18 @@ const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, uint64_t oldest_blob_file_number, const TableProperties& table_properties, - TableFileCreationReason reason, const Status& s) { + TableFileCreationReason reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name) { if (s.ok() && event_logger) { JSONWriter jwriter; AppendCurrentTime(&jwriter); jwriter << "cf_name" << cf_name << "job" << job_id << "event" << "table_file_creation" << "file_number" << fd.GetNumber() << "file_size" - << fd.GetFileSize(); + << fd.GetFileSize() << "file_checksum" + << Slice(file_checksum).ToString(true) << "file_checksum_func_name" + << file_checksum_func_name; // table_properties { @@ -104,6 +123,7 @@ table_properties.num_entries) << "num_data_blocks" << table_properties.num_data_blocks << "num_entries" << table_properties.num_entries + << "num_filter_entries" << table_properties.num_filter_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << 
table_properties.num_merge_operands << "num_range_deletions" << table_properties.num_range_deletions @@ -121,7 +141,14 @@ << table_properties.compression_options << "creation_time" << table_properties.creation_time << "oldest_key_time" << table_properties.oldest_key_time << "file_creation_time" - << table_properties.file_creation_time; + << table_properties.file_creation_time + << "slow_compression_estimated_data_size" + << table_properties.slow_compression_estimated_data_size + << "fast_compression_estimated_data_size" + << table_properties.fast_compression_estimated_data_size + << "db_id" << table_properties.db_id << "db_session_id" + << table_properties.db_session_id << "orig_file_number" + << table_properties.orig_file_number; // user collected properties for (const auto& prop : table_properties.readable_properties) { @@ -140,7 +167,7 @@ } #ifndef ROCKSDB_LITE - if (listeners.size() == 0) { + if (listeners.empty()) { return; } TableFileCreationInfo info; @@ -152,9 +179,12 @@ info.table_properties = table_properties; info.reason = reason; info.status = s; + info.file_checksum = file_checksum; + info.file_checksum_func_name = file_checksum_func_name; for (auto& listener : listeners) { listener->OnTableFileCreated(info); } + info.status.PermitUncheckedError(); #else (void)listeners; (void)db_name; @@ -184,6 +214,9 @@ event_logger->Log(jwriter); #ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } TableFileDeletionInfo info; info.db_name = dbname; info.job_id = job_id; @@ -192,6 +225,7 @@ for (auto& listener : listeners) { listener->OnTableFileDeleted(info); } + info.status.PermitUncheckedError(); #else (void)file_path; (void)dbname; @@ -199,25 +233,126 @@ #endif // !ROCKSDB_LITE } -void EventHelpers::NotifyOnErrorRecoveryCompleted( +void EventHelpers::NotifyOnErrorRecoveryEnd( const std::vector>& listeners, - Status old_bg_error, InstrumentedMutex* db_mutex) { + const Status& old_bg_error, const Status& new_bg_error, + InstrumentedMutex* db_mutex) { 
#ifndef ROCKSDB_LITE - if (listeners.size() == 0U) { - return; - } - db_mutex->AssertHeld(); - // release lock while notifying events - db_mutex->Unlock(); - for (auto& listener : listeners) { - listener->OnErrorRecoveryCompleted(old_bg_error); + if (!listeners.empty()) { + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + BackgroundErrorRecoveryInfo info; + info.old_bg_error = old_bg_error; + info.new_bg_error = new_bg_error; + listener->OnErrorRecoveryCompleted(old_bg_error); + listener->OnErrorRecoveryEnd(info); + info.old_bg_error.PermitUncheckedError(); + info.new_bg_error.PermitUncheckedError(); + } + db_mutex->Lock(); } - db_mutex->Lock(); #else (void)listeners; (void)old_bg_error; + (void)new_bg_error; (void)db_mutex; #endif // ROCKSDB_LITE } +#ifndef ROCKSDB_LITE +void EventHelpers::NotifyBlobFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, + BlobFileCreationReason creation_reason) { + if (listeners.empty()) { + return; + } + BlobFileCreationBriefInfo info(db_name, cf_name, file_path, job_id, + creation_reason); + for (const auto& listener : listeners) { + listener->OnBlobFileCreationStarted(info); + } +} +#endif // !ROCKSDB_LITE + +void EventHelpers::LogAndNotifyBlobFileCreationFinished( + EventLogger* event_logger, + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, uint64_t file_number, + BlobFileCreationReason creation_reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name, uint64_t total_blob_count, + uint64_t total_blob_bytes) { + if (s.ok() && event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + jwriter << "cf_name" << cf_name << "job" << job_id << "event" + << "blob_file_creation" + << "file_number" << file_number << 
"total_blob_count" + << total_blob_count << "total_blob_bytes" << total_blob_bytes + << "file_checksum" << file_checksum << "file_checksum_func_name" + << file_checksum_func_name << "status" << s.ToString(); + + jwriter.EndObject(); + event_logger->Log(jwriter); + } + +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + BlobFileCreationInfo info(db_name, cf_name, file_path, job_id, + creation_reason, total_blob_count, total_blob_bytes, + s, file_checksum, file_checksum_func_name); + for (const auto& listener : listeners) { + listener->OnBlobFileCreated(info); + } + info.status.PermitUncheckedError(); +#else + (void)listeners; + (void)db_name; + (void)file_path; + (void)creation_reason; +#endif +} + +void EventHelpers::LogAndNotifyBlobFileDeletion( + EventLogger* event_logger, + const std::vector>& listeners, int job_id, + uint64_t file_number, const std::string& file_path, const Status& status, + const std::string& dbname) { + if (event_logger) { + JSONWriter jwriter; + AppendCurrentTime(&jwriter); + + jwriter << "job" << job_id << "event" + << "blob_file_deletion" + << "file_number" << file_number; + if (!status.ok()) { + jwriter << "status" << status.ToString(); + } + + jwriter.EndObject(); + event_logger->Log(jwriter); + } +#ifndef ROCKSDB_LITE + if (listeners.empty()) { + return; + } + BlobFileDeletionInfo info(dbname, file_path, job_id, status); + for (const auto& listener : listeners) { + listener->OnBlobFileDeleted(info); + } + info.status.PermitUncheckedError(); +#else + (void)listeners; + (void)dbname; + (void)file_path; +#endif // !ROCKSDB_LITE +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/event_helpers.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/event_helpers.h 2025-05-19 16:14:27.000000000 +0000 @@ -35,15 +35,42 @@ const std::string& 
db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, uint64_t oldest_blob_file_number, const TableProperties& table_properties, - TableFileCreationReason reason, const Status& s); + TableFileCreationReason reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name); static void LogAndNotifyTableFileDeletion( EventLogger* event_logger, int job_id, uint64_t file_number, const std::string& file_path, const Status& status, const std::string& db_name, const std::vector>& listeners); - static void NotifyOnErrorRecoveryCompleted( + static void NotifyOnErrorRecoveryEnd( const std::vector>& listeners, - Status bg_error, InstrumentedMutex* db_mutex); + const Status& old_bg_error, const Status& new_bg_error, + InstrumentedMutex* db_mutex); + +#ifndef ROCKSDB_LITE + static void NotifyBlobFileCreationStarted( + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, + BlobFileCreationReason creation_reason); +#endif // !ROCKSDB_LITE + + static void LogAndNotifyBlobFileCreationFinished( + EventLogger* event_logger, + const std::vector>& listeners, + const std::string& db_name, const std::string& cf_name, + const std::string& file_path, int job_id, uint64_t file_number, + BlobFileCreationReason creation_reason, const Status& s, + const std::string& file_checksum, + const std::string& file_checksum_func_name, uint64_t total_blob_count, + uint64_t total_blob_bytes); + + static void LogAndNotifyBlobFileDeletion( + EventLogger* event_logger, + const std::vector>& listeners, int job_id, + uint64_t file_number, const std::string& file_path, const Status& status, + const std::string& db_name); private: static void LogAndNotifyTableFileCreation( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,11 +6,14 @@ #include #include "db/db_test_util.h" +#include "db/version_edit.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "test_util/fault_injection_test_env.h" +#include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { @@ -19,15 +22,32 @@ : public DBTestBase, public ::testing::WithParamInterface> { public: - ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") { - sst_files_dir_ = dbname_ + "/sst_files/"; - fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); + ExternalSSTFileBasicTest() + : DBTestBase("external_sst_file_basic_test", /*env_do_fsync=*/true) { + sst_files_dir_ = dbname_ + "_sst_files/"; + fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_)); DestroyAndRecreateExternalSSTFilesDir(); + + // Check if the Env supports RandomRWFile + std::string file_path = sst_files_dir_ + "test_random_rw_file"; + std::unique_ptr wfile; + assert(env_->NewWritableFile(file_path, &wfile, EnvOptions()).ok()); + wfile.reset(); + std::unique_ptr rwfile; + Status s = env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()); + if (s.IsNotSupported()) { + random_rwfile_supported_ = false; + } else { + EXPECT_OK(s); + random_rwfile_supported_ = true; + } + rwfile.reset(); + EXPECT_OK(env_->DeleteFile(file_path)); } void DestroyAndRecreateExternalSSTFilesDir() { - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); } Status DeprecatedAddFile(const std::vector& files, @@ -41,6 +61,29 @@ return 
db_->IngestExternalFile(files, opts); } + Status AddFileWithFileChecksum( + const std::vector& files, + const std::vector& files_checksums, + const std::vector& files_checksum_func_names, + bool verify_file_checksum = true, bool move_files = false, + bool skip_snapshot_check = false, bool write_global_seqno = true) { + IngestExternalFileOptions opts; + opts.move_files = move_files; + opts.snapshot_consistency = !skip_snapshot_check; + opts.allow_global_seqno = false; + opts.allow_blocking_flush = false; + opts.write_global_seqno = write_global_seqno; + opts.verify_file_checksum = verify_file_checksum; + + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = files; + arg.options = opts; + arg.files_checksums = files_checksums; + arg.files_checksum_func_names = files_checksum_func_names; + return db_->IngestExternalFiles({arg}); + } + Status GenerateAndAddExternalFile( const Options options, std::vector keys, const std::vector& value_types, @@ -137,12 +180,23 @@ } ~ExternalSSTFileBasicTest() override { - test::DestroyDir(env_, sst_files_dir_); + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); } protected: std::string sst_files_dir_; std::unique_ptr fault_injection_test_env_; + bool random_rwfile_supported_; +#ifndef ROCKSDB_LITE + uint64_t GetSstSizeHelper(Temperature temperature) { + std::string prop; + EXPECT_TRUE( + dbfull()->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature + + ToString(static_cast(temperature)), + &prop)); + return static_cast(std::atoi(prop.c_str())); + } +#endif // ROCKSDB_LITE }; TEST_F(ExternalSSTFileBasicTest, Basic) { @@ -162,7 +216,7 @@ } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); // Current file size should be non-zero after success write. 
ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -174,16 +228,18 @@ ASSERT_EQ(file1_info.num_range_del_entries, 0); ASSERT_EQ(file1_info.smallest_range_del_key, ""); ASSERT_EQ(file1_info.largest_range_del_key, ""); + ASSERT_EQ(file1_info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName); // sst_file_writer already finished, cannot add this value s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); s = sst_file_writer.DeleteRange(Key(100), Key(200)); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); DestroyAndReopen(options); // Add file using file path s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -192,6 +248,391 @@ DestroyAndRecreateExternalSSTFilesDir(); } +class ChecksumVerifyHelper { + private: + Options options_; + + public: + ChecksumVerifyHelper(Options& options) : options_(options) {} + ~ChecksumVerifyHelper() {} + + Status GetSingleFileChecksumAndFuncName( + const std::string& file_path, std::string* file_checksum, + std::string* file_checksum_func_name) { + Status s; + EnvOptions soptions; + std::unique_ptr file_reader; + s = options_.env->NewSequentialFile(file_path, &file_reader, soptions); + if (!s.ok()) { + return s; + } + std::unique_ptr scratch(new char[2048]); + Slice result; + FileChecksumGenFactory* file_checksum_gen_factory = + options_.file_checksum_gen_factory.get(); + if (file_checksum_gen_factory == nullptr) { + *file_checksum = kUnknownFileChecksum; + *file_checksum_func_name = kUnknownFileChecksumFuncName; + return Status::OK(); + } else { + FileChecksumGenContext gen_context; + std::unique_ptr file_checksum_gen = + file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context); + *file_checksum_func_name = 
file_checksum_gen->Name(); + s = file_reader->Read(2048, &result, scratch.get()); + if (!s.ok()) { + return s; + } + while (result.size() != 0) { + file_checksum_gen->Update(scratch.get(), result.size()); + s = file_reader->Read(2048, &result, scratch.get()); + if (!s.ok()) { + return s; + } + } + file_checksum_gen->Finalize(); + *file_checksum = file_checksum_gen->GetChecksum(); + } + return Status::OK(); + } +}; + +TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) { + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + ChecksumVerifyHelper checksum_helper(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + // file1.sst (0 => 99) + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 0; k < 100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + std::string file_checksum, file_checksum_func_name; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file1, &file_checksum, &file_checksum_func_name)); + + // Current file size should be non-zero after success write. 
+ ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(0)); + ASSERT_EQ(file1_info.largest_key, Key(99)); + ASSERT_EQ(file1_info.num_range_del_entries, 0); + ASSERT_EQ(file1_info.smallest_range_del_key, ""); + ASSERT_EQ(file1_info.largest_range_del_key, ""); + ASSERT_EQ(file1_info.file_checksum, file_checksum); + ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name); + // sst_file_writer already finished, cannot add this value + s = sst_file_writer.Put(Key(100), "bad_val"); + ASSERT_NOK(s) << s.ToString(); + s = sst_file_writer.DeleteRange(Key(100), Key(200)); + ASSERT_NOK(s) << s.ToString(); + + DestroyAndReopen(options); + // Add file using file path + s = DeprecatedAddFile({file1}); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 100; k++) { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + + DestroyAndRecreateExternalSSTFilesDir(); +} + +TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { + Options old_options = CurrentOptions(); + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + const ImmutableCFOptions ioptions(options); + ChecksumVerifyHelper checksum_helper(options); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file01.sst (1000 => 1099) + std::string file1 = sst_files_dir_ + "file01.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 1000; k < 1100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(1000)); + ASSERT_EQ(file1_info.largest_key, Key(1099)); + std::string file_checksum1, file_checksum_func_name1; + 
ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file1, &file_checksum1, &file_checksum_func_name1)); + ASSERT_EQ(file1_info.file_checksum, file_checksum1); + ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1); + + // file02.sst (1100 => 1299) + std::string file2 = sst_files_dir_ + "file02.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + for (int k = 1100; k < 1300; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file2_info; + s = sst_file_writer.Finish(&file2_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file2_info.file_path, file2); + ASSERT_EQ(file2_info.num_entries, 200); + ASSERT_EQ(file2_info.smallest_key, Key(1100)); + ASSERT_EQ(file2_info.largest_key, Key(1299)); + std::string file_checksum2, file_checksum_func_name2; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file2, &file_checksum2, &file_checksum_func_name2)); + ASSERT_EQ(file2_info.file_checksum, file_checksum2); + ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2); + + // file03.sst (1300 => 1499) + std::string file3 = sst_files_dir_ + "file03.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + for (int k = 1300; k < 1500; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file3_info; + s = sst_file_writer.Finish(&file3_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file3_info.file_path, file3); + ASSERT_EQ(file3_info.num_entries, 200); + ASSERT_EQ(file3_info.smallest_key, Key(1300)); + ASSERT_EQ(file3_info.largest_key, Key(1499)); + std::string file_checksum3, file_checksum_func_name3; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file3, &file_checksum3, &file_checksum_func_name3)); + ASSERT_EQ(file3_info.file_checksum, file_checksum3); + ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3); + + // file04.sst (1500 => 1799) + std::string file4 = sst_files_dir_ + "file04.sst"; + 
ASSERT_OK(sst_file_writer.Open(file4)); + for (int k = 1500; k < 1800; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file4_info; + s = sst_file_writer.Finish(&file4_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file4_info.file_path, file4); + ASSERT_EQ(file4_info.num_entries, 300); + ASSERT_EQ(file4_info.smallest_key, Key(1500)); + ASSERT_EQ(file4_info.largest_key, Key(1799)); + std::string file_checksum4, file_checksum_func_name4; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file4, &file_checksum4, &file_checksum_func_name4)); + ASSERT_EQ(file4_info.file_checksum, file_checksum4); + ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4); + + // file05.sst (1800 => 1899) + std::string file5 = sst_files_dir_ + "file05.sst"; + ASSERT_OK(sst_file_writer.Open(file5)); + for (int k = 1800; k < 2000; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file5_info; + s = sst_file_writer.Finish(&file5_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file5_info.file_path, file5); + ASSERT_EQ(file5_info.num_entries, 200); + ASSERT_EQ(file5_info.smallest_key, Key(1800)); + ASSERT_EQ(file5_info.largest_key, Key(1999)); + std::string file_checksum5, file_checksum_func_name5; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file5, &file_checksum5, &file_checksum_func_name5)); + ASSERT_EQ(file5_info.file_checksum, file_checksum5); + ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5); + + // file06.sst (2000 => 2199) + std::string file6 = sst_files_dir_ + "file06.sst"; + ASSERT_OK(sst_file_writer.Open(file6)); + for (int k = 2000; k < 2200; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); + } + ExternalSstFileInfo file6_info; + s = sst_file_writer.Finish(&file6_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file6_info.file_path, file6); + ASSERT_EQ(file6_info.num_entries, 
200); + ASSERT_EQ(file6_info.smallest_key, Key(2000)); + ASSERT_EQ(file6_info.largest_key, Key(2199)); + std::string file_checksum6, file_checksum_func_name6; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + file6, &file_checksum6, &file_checksum_func_name6)); + ASSERT_EQ(file6_info.file_checksum, file_checksum6); + ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6); + + s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"}, + {file_checksum1}, true, false, false, false); + // does not care the checksum input since db does not enable file checksum + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file1)); + std::vector live_files; + dbfull()->GetLiveFilesMetaData(&live_files); + std::set set1; + for (auto f : live_files) { + set1.insert(f.name); + ASSERT_EQ(f.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(f.file_checksum_func_name, kUnknownFileChecksumFuncName); + } + + // check the temperature of the file being ingested + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[6].files[0].temperature); + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + + // Reopen Db with checksum enabled + Reopen(options); + // Enable verify_file_checksum option + // The checksum vector does not match, fail the ingestion + s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"}, + {file_checksum_func_name2}, true, false, false, + false); + ASSERT_NOK(s) << s.ToString(); + + // Enable verify_file_checksum option + // The checksum name does not match, fail the ingestion + s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false, + false, false); + ASSERT_NOK(s) << 
s.ToString(); + + // Enable verify_file_checksum option + // The checksum itself does not match, fail the ingestion + s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2}, + true, false, false, false); + ASSERT_NOK(s) << s.ToString(); + + // Enable verify_file_checksum option + // All matches, ingestion is successful + s = AddFileWithFileChecksum({file2}, {file_checksum2}, + {file_checksum_func_name2}, true, false, false, + false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files1; + dbfull()->GetLiveFilesMetaData(&live_files1); + for (auto f : live_files1) { + if (set1.find(f.name) == set1.end()) { + ASSERT_EQ(f.file_checksum, file_checksum2); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name2); + set1.insert(f.name); + } + } + ASSERT_OK(env_->FileExists(file2)); + + // Enable verify_file_checksum option + // No checksum information is provided, generate it when ingesting + std::vector checksum, checksum_func; + s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false, + false, false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files2; + dbfull()->GetLiveFilesMetaData(&live_files2); + for (auto f : live_files2) { + if (set1.find(f.name) == set1.end()) { + ASSERT_EQ(f.file_checksum, file_checksum3); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file3)); + + // Does not enable verify_file_checksum options + // The checksum name does not match, fail the ingestion + s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false, + false, false); + ASSERT_NOK(s) << s.ToString(); + + // Does not enable verify_file_checksum options + // Checksum function name matches, store the checksum being ingested. 
+ s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4}, + false, false, false, false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files3; + dbfull()->GetLiveFilesMetaData(&live_files3); + for (auto f : live_files3) { + if (set1.find(f.name) == set1.end()) { + ASSERT_FALSE(f.file_checksum == file_checksum4); + ASSERT_EQ(f.file_checksum, "asd"); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file4)); + + // enable verify_file_checksum options, DB enable checksum, and enable + // write_global_seq. So the checksum stored is different from the one + // ingested due to the sequence number changes. + s = AddFileWithFileChecksum({file5}, {file_checksum5}, + {file_checksum_func_name5}, true, false, false, + true); + ASSERT_OK(s) << s.ToString(); + std::vector live_files4; + dbfull()->GetLiveFilesMetaData(&live_files4); + for (auto f : live_files4) { + if (set1.find(f.name) == set1.end()) { + std::string cur_checksum5, cur_checksum_func_name5; + ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( + dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5)); + ASSERT_EQ(f.file_checksum, cur_checksum5); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file5)); + + // Does not enable verify_file_checksum options and also the ingested file + // checksum information is empty. DB will generate and store the checksum + // in Manifest. 
+ std::vector files_c6, files_name6; + s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false, + false, false); + ASSERT_OK(s) << s.ToString(); + std::vector live_files6; + dbfull()->GetLiveFilesMetaData(&live_files6); + for (auto f : live_files6) { + if (set1.find(f.name) == set1.end()) { + ASSERT_EQ(f.file_checksum, file_checksum6); + ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6); + set1.insert(f.name); + } + } + ASSERT_OK(s) << s.ToString(); + ASSERT_OK(env_->FileExists(file6)); + db_->GetColumnFamilyMetaData(&metadata); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); +} + TEST_F(ExternalSSTFileBasicTest, NoCopy) { Options options = CurrentOptions(); const ImmutableCFOptions ioptions(options); @@ -206,7 +647,7 @@ } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -220,7 +661,7 @@ } ExternalSstFileInfo file2_info; s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -234,23 +675,23 @@ } ExternalSstFileInfo file3_info; s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 15); ASSERT_EQ(file3_info.smallest_key, Key(110)); ASSERT_EQ(file3_info.largest_key, Key(124)); s = DeprecatedAddFile({file1}, true /* move file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + 
ASSERT_OK(s) << s.ToString(); ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); s = DeprecatedAddFile({file2}, false /* copy file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file2)); // This file has overlapping values with the existing data s = DeprecatedAddFile({file3}, true /* move file */); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file3)); for (int k = 0; k < 300; k++) { @@ -706,12 +1147,31 @@ "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}}; for (size_t i = 0; i < test_cases.size(); i++) { + bool no_sync = false; SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) { fault_injection_test_env_->SetFilesystemActive(false); }); SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) { fault_injection_test_env_->SetFilesystemActive(true); }); + if (i == 0) { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) { + Status* status = static_cast(s); + if (status->IsNotSupported()) { + no_sync = true; + } + }); + } + if (i == 2) { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::NewRandomRWFile", [&](void* s) { + Status* status = static_cast(s); + if (status->IsNotSupported()) { + no_sync = true; + } + }); + } SyncPoint::GetInstance()->EnableProcessing(); DestroyAndReopen(options); @@ -720,6 +1180,7 @@ } Options sst_file_writer_options; + sst_file_writer_options.env = fault_injection_test_env_.get(); std::unique_ptr sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = @@ -736,7 +1197,12 @@ if (i == 2) { ingest_opt.write_global_seqno = true; } - ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + Status s = db_->IngestExternalFile({file_name}, ingest_opt); + if (no_sync) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } db_->ReleaseSnapshot(snapshot); 
SyncPoint::GetInstance()->DisableProcessing(); @@ -745,20 +1211,56 @@ } } +TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) { + Options options; + options.create_if_missing = true; + options.env = env_; + + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) { + Status* s = static_cast(arg); + *s = Status::NotSupported(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + Options sst_file_writer_options; + sst_file_writer_options.env = env_; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "reopen_not_supported_test_" + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = true; + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { Options options; options.create_if_missing = true; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_); options.env = &senv; DestroyAndReopen(options); Options sst_file_writer_options; + sst_file_writer_options.env = env_; std::unique_ptr sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst"; ASSERT_OK(sst_file_writer->Open(file_name)); Random rnd(301); - std::string value = DBTestBase::RandomString(&rnd, 4000); + std::string value = rnd.RandomString(4000); for (int i = 0; i < 5000; i++) { ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value)); } @@ -796,6 +1298,45 @@ Destroy(options); } +TEST_F(ExternalSSTFileBasicTest, 
IngestRangeDeletionTombstoneWithGlobalSeqno) { + for (int i = 5; i < 25; i++) { + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i), + Key(i) + "_val")); + } + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file.sst (delete 0 => 30) + std::string file = sst_files_dir_ + "file.sst"; + ASSERT_OK(sst_file_writer.Open(file)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(30))); + ExternalSstFileInfo file_info; + ASSERT_OK(sst_file_writer.Finish(&file_info)); + ASSERT_EQ(file_info.file_path, file); + ASSERT_EQ(file_info.num_entries, 0); + ASSERT_EQ(file_info.smallest_key, ""); + ASSERT_EQ(file_info.largest_key, ""); + ASSERT_EQ(file_info.num_range_del_entries, 1); + ASSERT_EQ(file_info.smallest_range_del_key, Key(0)); + ASSERT_EQ(file_info.largest_range_del_key, Key(30)); + + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.snapshot_consistency = true; + ifo.allow_global_seqno = true; + ifo.write_global_seqno = true; + ifo.verify_checksums_before_ingest = false; + ASSERT_OK(db_->IngestExternalFile({file}, ifo)); + + for (int i = 5; i < 25; i++) { + std::string res; + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &res).IsNotFound()); + } +} + TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { int kNumLevels = 7; Options options = CurrentOptions(); @@ -896,7 +1437,7 @@ ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); ExternalSstFileInfo file8_info; Status s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -911,7 +1452,7 @@ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file9_info; s = sst_file_writer.Finish(&file9_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << 
s.ToString(); ASSERT_EQ(file9_info.file_path, file9); ASSERT_EQ(file9_info.num_entries, 0); ASSERT_EQ(file9_info.smallest_key, ""); @@ -923,7 +1464,7 @@ // Range deletion tombstones are exclusive on their end key, so these SSTs // should not be considered as overlapping. s = DeprecatedAddFile({file8, file9}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); DestroyAndRecreateExternalSSTFilesDir(); } @@ -964,6 +1505,10 @@ } TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { + if (!random_rwfile_supported_) { + ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support"); + return; + } SyncPoint::GetInstance()->DisableProcessing(); int file_id = 0; EnvOptions env_options; @@ -1013,6 +1558,11 @@ TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) { bool verify_checksums_before_ingest = std::get<1>(GetParam()); if (!verify_checksums_before_ingest) { + ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest"); + return; + } + if (!random_rwfile_supported_) { + ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support"); return; } uint64_t props_block_offset = 0; @@ -1111,6 +1661,141 @@ ASSERT_EQ(2, NumTableFilesAtLevel(0)); } +TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { + // Repro https://github.com/facebook/rocksdb/issues/6245. + // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction + // via trivial move. The bug happened when L1 files were incorrectly sorted + // resulting in an old value for "k" returned by `Get()`. + Options options = CurrentOptions(); + + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. 
+ ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("k", "b")); + + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + IngestExternalFileOptions ifo; + s = db_->IngestExternalFile({file1}, ifo); + ASSERT_OK(s); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(Get("k"), "b"); +} + +TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) { + Options options = CurrentOptions(); + const ImmutableCFOptions ioptions(options); + options.bottommost_temperature = Temperature::kWarm; + SstFileWriter sst_file_writer(EnvOptions(), options); + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + auto size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + + // create file01.sst (1000 => 1099) and ingest it + std::string file1 = sst_files_dir_ + "file01.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + for (int k = 1000; k < 1100; k++) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s); + ASSERT_EQ(file1_info.file_path, file1); + ASSERT_EQ(file1_info.num_entries, 100); + ASSERT_EQ(file1_info.smallest_key, Key(1000)); + ASSERT_EQ(file1_info.largest_key, Key(1099)); + + std::vector files; + std::vector files_checksums; + std::vector files_checksum_func_names; + Temperature file_temperature = Temperature::kWarm; + + files.push_back(file1); + IngestExternalFileOptions in_opts; + in_opts.move_files = false; + in_opts.snapshot_consistency = true; + in_opts.allow_global_seqno = false; + 
in_opts.allow_blocking_flush = false; + in_opts.write_global_seqno = true; + in_opts.verify_file_checksum = false; + IngestExternalFileArg arg; + arg.column_family = db_->DefaultColumnFamily(); + arg.external_files = files; + arg.options = in_opts; + arg.files_checksums = files_checksums; + arg.files_checksum_func_names = files_checksum_func_names; + arg.file_temperature = file_temperature; + s = db_->IngestExternalFiles({arg}); + ASSERT_OK(s); + + // check the temperature of the file being ingested + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 1); + + // non-bottommost file still has unknown temperature + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // reopen and check the information is persisted + Reopen(options); + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(2, metadata.file_count); + ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); + ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature); + size = GetSstSizeHelper(Temperature::kUnknown); + ASSERT_GT(size, 0); + size = GetSstSizeHelper(Temperature::kWarm); + ASSERT_GT(size, 0); + + // check other non-exist temperatures + size = GetSstSizeHelper(Temperature::kHot); + ASSERT_EQ(size, 0); + size = GetSstSizeHelper(Temperature::kCold); + ASSERT_EQ(size, 0); + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty( + 
DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), + &prop)); + ASSERT_EQ(std::atoi(prop.c_str()), 0); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), @@ -1124,5 +1809,6 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,7 @@ #include "db/version_edit.h" #include "file/file_util.h" #include "file/random_access_file_reader.h" +#include "logging/logging.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" @@ -28,26 +29,39 @@ Status ExternalSstFileIngestionJob::Prepare( const std::vector& external_files_paths, - uint64_t next_file_number, SuperVersion* sv) { + const std::vector& files_checksums, + const std::vector& files_checksum_func_names, + const Temperature& file_temperature, uint64_t next_file_number, + SuperVersion* sv) { Status status; // Read the information of files we are ingesting for (const std::string& file_path : external_files_paths) { IngestedFileInfo file_to_ingest; - status = GetIngestedFileInfo(file_path, &file_to_ingest, sv); + status = + GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv); if (!status.ok()) { return status; } - files_to_ingest_.push_back(file_to_ingest); - } - for (const IngestedFileInfo& f : files_to_ingest_) { - if (f.cf_id != + if (file_to_ingest.cf_id != 
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && - f.cf_id != cfd_->GetID()) { + file_to_ingest.cf_id != cfd_->GetID()) { return Status::InvalidArgument( - "External file column family id dont match"); + "External file column family id don't match"); + } + + if (file_to_ingest.num_entries == 0 && + file_to_ingest.num_range_deletions == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!file_to_ingest.smallest_internal_key.Valid() || + !file_to_ingest.largest_internal_key.Valid()) { + return Status::Corruption("Generated table have corrupted keys"); } + + files_to_ingest_.emplace_back(std::move(file_to_ingest)); } const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); @@ -55,7 +69,7 @@ if (num_files == 0) { return Status::InvalidArgument("The list of files is empty"); } else if (num_files > 1) { - // Verify that passed files dont have overlapping ranges + // Verify that passed files don't have overlapping ranges autovector sorted_files; for (size_t i = 0; i < num_files; i++) { sorted_files.push_back(&files_to_ingest_[i]); @@ -68,7 +82,7 @@ info2->smallest_internal_key) < 0; }); - for (size_t i = 0; i < num_files - 1; i++) { + for (size_t i = 0; i + 1 < num_files; i++) { if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, sorted_files[i + 1]->smallest_internal_key) >= 0) { files_overlap_ = true; @@ -77,24 +91,18 @@ } } - if (ingestion_options_.ingest_behind && files_overlap_) { - return Status::NotSupported("Files have overlapping ranges"); + // Hanlde the file temperature + for (size_t i = 0; i < num_files; i++) { + files_to_ingest_[i].file_temperature = file_temperature; } - for (IngestedFileInfo& f : files_to_ingest_) { - if (f.num_entries == 0 && f.num_range_deletions == 0) { - return Status::InvalidArgument("File contain no entries"); - } - - if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { - return Status::Corruption("Generated table have corrupted keys"); - } 
+ if (ingestion_options_.ingest_behind && files_overlap_) { + return Status::NotSupported("Files have overlapping ranges"); } // Copy/Move external files into DB std::unordered_set ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { - f.fd = FileDescriptor(next_file_number++, 0, f.file_size); f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = @@ -108,17 +116,26 @@ // directory before ingest the file. For integrity of RocksDB we need // to sync the file. std::unique_ptr file_to_sync; - status = fs_->ReopenWritableFile(path_inside_db, env_options_, - &file_to_sync, nullptr); - if (status.ok()) { - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); - status = SyncIngestedFile(file_to_sync.get()); - TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to sync ingested file %s: %s", - path_inside_db.c_str(), status.ToString().c_str()); + Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, + &file_to_sync, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen", + &s); + // Some file systems (especially remote/distributed) don't support + // reopening a file for writing and don't require reopening and + // syncing the file. Ignore the NotSupported error in that case. 
+ if (!s.IsNotSupported()) { + status = s; + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } } } } else if (status.IsNotSupported() && @@ -134,21 +151,26 @@ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", nullptr); // CopyFile also sync the new file. - status = CopyFile(fs_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + status = CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, + db_options_.use_fsync, io_tracer_); } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { break; } f.internal_file_path = path_inside_db; + // Initialize the checksum information of ingested files. + f.file_checksum = kUnknownFileChecksum; + f.file_checksum_func_name = kUnknownFileChecksumFuncName; ingestion_path_ids.insert(f.fd.GetPathId()); } TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); if (status.ok()) { for (auto path_id : ingestion_path_ids) { - status = directories_->GetDataDir(path_id)->Fsync(); + status = directories_->GetDataDir(path_id)->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Failed to sync directory %" ROCKSDB_PRIszt @@ -160,14 +182,141 @@ } TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir"); + // Generate and check the sst file checksum. Note that, if + // IngestExternalFileOptions::write_global_seqno is true, we will not update + // the checksum information in the files_to_ingests_ here, since the file is + // upadted with the new global_seqno. 
After global_seqno is updated, DB will + // generate the new checksum and store it in the Manifest. In all other cases + // if ingestion_options_.write_global_seqno == true and + // verify_file_checksum is false, we only check the checksum function name. + if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) { + if (ingestion_options_.verify_file_checksum == false && + files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Only when verify_file_checksum == false and the checksum for ingested + // files are provided, DB will use the provided checksum and does not + // generate the checksum for ingested files. + need_generate_file_checksum_ = false; + } else { + need_generate_file_checksum_ = true; + } + FileChecksumGenContext gen_context; + std::unique_ptr file_checksum_gen = + db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + std::vector generated_checksums; + std::vector generated_checksum_func_names; + // Step 1: generate the checksum for ingested sst file. 
+ if (need_generate_file_checksum_) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + std::string generated_checksum; + std::string generated_checksum_func_name; + std::string requested_checksum_func_name; + IOStatus io_s = GenerateOneFileChecksum( + fs_.get(), files_to_ingest_[i].internal_file_path, + db_options_.file_checksum_gen_factory.get(), + requested_checksum_func_name, &generated_checksum, + &generated_checksum_func_name, + ingestion_options_.verify_checksums_readahead_size, + db_options_.allow_mmap_reads, io_tracer_, + db_options_.rate_limiter.get()); + if (!io_s.ok()) { + status = io_s; + ROCKS_LOG_WARN(db_options_.info_log, + "Sst file checksum generation of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + if (ingestion_options_.write_global_seqno == false) { + files_to_ingest_[i].file_checksum = generated_checksum; + files_to_ingest_[i].file_checksum_func_name = + generated_checksum_func_name; + } + generated_checksums.push_back(generated_checksum); + generated_checksum_func_names.push_back(generated_checksum_func_name); + } + } + + // Step 2: based on the verify_file_checksum and ingested checksum + // information, do the verification. + if (status.ok()) { + if (files_checksums.size() == files_to_ingest_.size() && + files_checksum_func_names.size() == files_to_ingest_.size()) { + // Verify the checksum and checksum function name. 
+ if (ingestion_options_.verify_file_checksum) { + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != + generated_checksum_func_names[i]) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + if (files_checksums[i] != generated_checksums[i]) { + status = Status::Corruption( + "Ingested checksum does not match with the generated " + "checksum"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + files_to_ingest_[i].internal_file_path.c_str(), + status.ToString().c_str()); + break; + } + } + } else { + // If verify_file_checksum is not enabled, we only verify the + // checksum function name. If it does not match, fail the ingestion. + // If matches, we trust the ingested checksum information and store + // in the Manifest. + for (size_t i = 0; i < files_to_ingest_.size(); i++) { + if (files_checksum_func_names[i] != file_checksum_gen->Name()) { + status = Status::InvalidArgument( + "Checksum function name does not match with the checksum " + "function name of this DB"); + ROCKS_LOG_WARN( + db_options_.info_log, + "Sst file checksum verification of file: %s failed: %s", + external_files_paths[i].c_str(), status.ToString().c_str()); + break; + } + files_to_ingest_[i].file_checksum = files_checksums[i]; + files_to_ingest_[i].file_checksum_func_name = + files_checksum_func_names[i]; + } + } + } else if (files_checksums.size() != files_checksum_func_names.size() || + (files_checksums.size() == files_checksum_func_names.size() && + files_checksums.size() != 0)) { + // The checksum or checksum function name vector are not both empty + // and they are incomplete. 
+ status = Status::InvalidArgument( + "The checksum information of ingested sst files are nonempty and " + "the size of checksums or the size of the checksum function " + "names " + "does not match with the number of ingested sst files"); + ROCKS_LOG_WARN( + db_options_.info_log, + "The ingested sst files checksum information is incomplete: %s", + status.ToString().c_str()); + } + } + } + // TODO: The following is duplicated with Cleanup(). if (!status.ok()) { + IOOptions io_opts; // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -186,8 +335,8 @@ ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), file_to_ingest.largest_internal_key.user_key()); } - Status status = - cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed); + Status status = cfd_->RangesOverlapWithMemtables( + ranges, super_version, db_options_.allow_data_in_errors, flush_needed); if (status.ok() && *flush_needed && !ingestion_options_.allow_blocking_flush) { status = Status::InvalidArgument("External file requires flush"); @@ -205,6 +354,12 @@ // with the files we are ingesting bool need_flush = false; status = NeedsFlush(&need_flush, super_version); + if (!status.ok()) { + return status; + } + if (need_flush) { + return Status::TryAgain(); + } assert(status.ok() && need_flush == false); #endif @@ -212,7 +367,7 @@ if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) { // We need to assign a global sequence number to all the files even - // if the dont overlap with any ranges since we have snapshots + // if the don't overlap with any ranges since we have snapshots force_global_seqno = true; } // It is safe to use this instead of 
LastAllocatedSequence since we are @@ -230,9 +385,32 @@ super_version, force_global_seqno, cfd_->ioptions()->compaction_style, last_seqno, &f, &assigned_seqno); } + + // Modify the smallest/largest internal key to include the sequence number + // that we just learned. Only overwrite sequence number zero. There could + // be a nonzero sequence number already to indicate a range tombstone's + // exclusive endpoint. + ParsedInternalKey smallest_parsed, largest_parsed; + if (status.ok()) { + status = ParseInternalKey(*f.smallest_internal_key.rep(), + &smallest_parsed, false /* log_err_key */); + } + if (status.ok()) { + status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, + false /* log_err_key */); + } if (!status.ok()) { return status; } + if (smallest_parsed.sequence == 0) { + UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + smallest_parsed.type); + } + if (largest_parsed.sequence == 0) { + UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + largest_parsed.type); + } + status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); @@ -245,21 +423,29 @@ return status; } + status = GenerateChecksumForIngestedFile(&f); + if (!status.ok()) { + return status; + } + // We use the import time as the ancester time. This is the time the data // is written to the database. 
int64_t temp_current_time = 0; uint64_t current_time = kUnknownFileCreationTime; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } - - edit_.AddFile( - f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), + FileMetaData f_metadata( + f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, - f.assigned_seqno, false, kInvalidBlobFileNumber, oldest_ancester_time, - current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName); + f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, + oldest_ancester_time, current_time, f.file_checksum, + f.file_checksum_func_name, kDisableUserTimestamp, + kDisableUserTimestamp); + f_metadata.temperature = f.file_temperature; + edit_.AddFile(f.picked_level, f_metadata); } return status; } @@ -268,7 +454,7 @@ // Update internal stats for new ingested files uint64_t total_keys = 0; uint64_t total_l0_files = 0; - uint64_t total_time = env_->NowMicros() - job_start_time_; + uint64_t total_time = clock_->NowMicros() - job_start_time_; EventLoggerStream stream = event_logger_->Log(); stream << "event" @@ -324,6 +510,7 @@ } void ExternalSstFileIngestionJob::Cleanup(const Status& status) { + IOOptions io_opts; if (!status.ok()) { // We failed to add the files to the database // remove all the files we copied @@ -331,7 +518,7 @@ if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -343,7 +530,7 @@ } else if (status.ok() && ingestion_options_.move_files) { // The files were moved and added successfully, remove original file links for 
(IngestedFileInfo& f : files_to_ingest_) { - Status s = env_->DeleteFile(f.external_file_path); + Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN( db_options_.info_log, @@ -356,8 +543,8 @@ } Status ExternalSstFileIngestionJob::GetIngestedFileInfo( - const std::string& external_file, IngestedFileInfo* file_to_ingest, - SuperVersion* sv) { + const std::string& external_file, uint64_t new_file_number, + IngestedFileInfo* file_to_ingest, SuperVersion* sv) { file_to_ingest->external_file_path = external_file; // Get external file size @@ -367,6 +554,10 @@ return status; } + // Assign FD with number + file_to_ingest->fd = + FileDescriptor(new_file_number, 0, file_to_ingest->file_size); + // Create TableReader for external file std::unique_ptr table_reader; std::unique_ptr sst_file; @@ -377,13 +568,18 @@ if (!status.ok()) { return status; } - sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file), - external_file)); + sst_file_reader.reset(new RandomAccessFileReader( + std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); status = cfd_->ioptions()->table_factory->NewTableReader( - TableReaderOptions(*cfd_->ioptions(), - sv->mutable_cf_options.prefix_extractor.get(), - env_options_, cfd_->internal_comparator()), + TableReaderOptions( + *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, + env_options_, cfd_->internal_comparator(), + /*skip_filters*/ false, /*immortal*/ false, + /*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(), + /*cur_file_num*/ new_file_number), std::move(sst_file_reader), file_to_ingest->file_size, &table_reader); if (!status.ok()) { return status; @@ -423,14 +619,12 @@ // Set the global sequence number file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str()); - auto offsets_iter = props->properties_offsets.find( - ExternalSstFilePropertyNames::kGlobalSeqno); 
- if (offsets_iter == props->properties_offsets.end() || - offsets_iter->second == 0) { + if (props->external_sst_file_global_seqno_offset == 0) { file_to_ingest->global_seqno_offset = 0; return Status::Corruption("Was not able to find file global seqno field"); } - file_to_ingest->global_seqno_offset = static_cast(offsets_iter->second); + file_to_ingest->global_seqno_offset = + static_cast(props->external_sst_file_global_seqno_offset); } else if (file_to_ingest->version == 1) { // SST file V1 should not have global seqno field assert(seqno_iter == uprops.end()); @@ -467,22 +661,28 @@ file_to_ingest->largest_internal_key = InternalKey("", 0, ValueType::kTypeValue); bool bounds_set = false; + bool allow_data_in_errors = db_options_.allow_data_in_errors; iter->SeekToFirst(); if (iter->Valid()) { - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->smallest_internal_key.SetFrom(key); iter->SeekToLast(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. 
", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->largest_internal_key.SetFrom(key); @@ -495,8 +695,11 @@ if (range_del_iter != nullptr) { for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); range_del_iter->Next()) { - if (!ParseInternalKey(range_del_iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } RangeTombstone tombstone(key, range_del_iter->value()); @@ -570,10 +773,11 @@ const std::vector& level_files = vstorage->LevelFiles(lvl); const SequenceNumber level_largest_seqno = - (*max_element(level_files.begin(), level_files.end(), - [](FileMetaData* f1, FileMetaData* f2) { - return f1->fd.largest_seqno < f2->fd.largest_seqno; - })) + (*std::max_element(level_files.begin(), level_files.end(), + [](FileMetaData* f1, FileMetaData* f2) { + return f1->fd.largest_seqno < + f2->fd.largest_seqno; + })) ->fd.largest_seqno; // should only assign seqno to current level's largest seqno when // the file fits @@ -588,7 +792,7 @@ continue; } - // We dont overlap with any keys in this level, but we still need to check + // We don't overlap with any keys in this level, but we still need to check // if our file can fit in it if (IngestedFileFitInLevel(file_to_ingest, lvl)) { target_level = lvl; @@ -646,7 +850,7 @@ return Status::InvalidArgument("Global seqno is required, but disabled"); } else if (file_to_ingest->global_seqno_offset == 0) { return Status::InvalidArgument( - "Trying to set global seqno for a file that dont have a global seqno " + "Trying to set global seqno for a file that don't have a global seqno " "field"); } @@ -658,14 +862,18 @@ Status 
status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_, &rwfile, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile", + &status); if (status.ok()) { + FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_, + file_to_ingest->internal_file_path); std::string seqno_val; PutFixed64(&seqno_val, seqno); - status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val, - IOOptions(), nullptr); + status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, + IOOptions(), nullptr); if (status.ok()) { TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); - status = SyncIngestedFile(rwfile.get()); + status = SyncIngestedFile(fsptr.get()); TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -687,6 +895,33 @@ return Status::OK(); } +IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( + IngestedFileInfo* file_to_ingest) { + if (db_options_.file_checksum_gen_factory == nullptr || + need_generate_file_checksum_ == false || + ingestion_options_.write_global_seqno == false) { + // If file_checksum_gen_factory is not set, we are not able to generate + // the checksum. if write_global_seqno is false, it means we will use + // file checksum generated during Prepare(). This step will be skipped. 
+ return IOStatus::OK(); + } + std::string file_checksum; + std::string file_checksum_func_name; + std::string requested_checksum_func_name; + IOStatus io_s = GenerateOneFileChecksum( + fs_.get(), file_to_ingest->internal_file_path, + db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, + &file_checksum, &file_checksum_func_name, + ingestion_options_.verify_checksums_readahead_size, + db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get()); + if (!io_s.ok()) { + return io_s; + } + file_to_ingest->file_checksum = file_checksum; + file_to_ingest->file_checksum_func_name = file_checksum_func_name; + return IOStatus::OK(); +} + bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( const IngestedFileInfo* file_to_ingest, int level) { if (level == 0) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_ingestion_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,20 @@ #include #include "db/column_family.h" -#include "db/dbformat.h" #include "db/internal_stats.h" #include "db/snapshot_impl.h" +#include "env/file_system_tracer.h" #include "logging/event_logger.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { class Directories; +class SystemClock; struct IngestedFileInfo { // External file path @@ -63,18 +64,25 @@ // ingestion_options.move_files is false by default, thus copy_file is true // by default. 
bool copy_file = true; + // The checksum of ingested file + std::string file_checksum; + // The name of checksum function that generate the checksum + std::string file_checksum_func_name; + // The temperature of the file to be ingested + Temperature file_temperature = Temperature::kUnknown; }; class ExternalSstFileIngestionJob { public: ExternalSstFileIngestionJob( - Env* env, VersionSet* versions, ColumnFamilyData* cfd, + VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, const IngestExternalFileOptions& ingestion_options, - Directories* directories, EventLogger* event_logger) - : env_(env), - fs_(db_options.fs.get()), + Directories* directories, EventLogger* event_logger, + const std::shared_ptr& io_tracer) + : clock_(db_options.clock), + fs_(db_options.fs, io_tracer), versions_(versions), cfd_(cfd), db_options_(db_options), @@ -83,14 +91,18 @@ ingestion_options_(ingestion_options), directories_(directories), event_logger_(event_logger), - job_start_time_(env_->NowMicros()), - consumed_seqno_count_(0) { + job_start_time_(clock_->NowMicros()), + consumed_seqno_count_(0), + io_tracer_(io_tracer) { assert(directories != nullptr); } // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, - uint64_t next_file_number, SuperVersion* sv); + const std::vector& files_checksums, + const std::vector& files_checksum_func_names, + const Temperature& file_temperature, uint64_t next_file_number, + SuperVersion* sv); // Check if we need to flush the memtable before running the ingestion job // This will be true if the files we are ingesting are overlapping with any @@ -126,10 +138,11 @@ // Open the external file and populate `file_to_ingest` with all the // external information we need to ingest this file. 
Status GetIngestedFileInfo(const std::string& external_file, + uint64_t new_file_number, IngestedFileInfo* file_to_ingest, SuperVersion* sv); - // Assign `file_to_ingest` the appropriate sequence number and the lowest + // Assign `file_to_ingest` the appropriate sequence number and the lowest // possible level that it can be ingested to according to compaction_style. // REQUIRES: Mutex held Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv, @@ -148,6 +161,8 @@ // Set the file global sequence number to `seqno` Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest, SequenceNumber seqno); + // Generate the file checksum and store in the IngestedFileInfo + IOStatus GenerateChecksumForIngestedFile(IngestedFileInfo* file_to_ingest); // Check if `file_to_ingest` can fit in level `level` // REQUIRES: Mutex held @@ -158,8 +173,8 @@ template Status SyncIngestedFile(TWritableFile* file); - Env* env_; - FileSystem* fs_; + SystemClock* clock_; + FileSystemPtr fs_; VersionSet* versions_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; @@ -175,6 +190,10 @@ // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are // ingested in L0 bool files_overlap_{false}; + // Set in ExternalSstFileIngestionJob::Prepare(), if true and DB + // file_checksum_gen_factory is set, DB will generate checksum each file. 
+ bool need_generate_file_checksum_{true}; + std::shared_ptr io_tracer_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/external_sst_file_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/external_sst_file_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,13 +6,19 @@ #ifndef ROCKSDB_LITE #include + #include "db/db_test_util.h" +#include "db/dbformat.h" #include "file/filename.h" +#include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/testutil.h" +#include "util/random.h" +#include "util/thread_guard.h" +#include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { @@ -21,6 +27,8 @@ public: ExternalSSTTestEnv(Env* t, bool fail_link) : EnvWrapper(t), fail_link_(fail_link) {} + static const char* kClassName() { return "ExternalSSTTestEnv"; } + const char* Name() const override { return kClassName(); } Status LinkFile(const std::string& s, const std::string& t) override { if (fail_link_) { @@ -35,16 +43,33 @@ bool fail_link_; }; +class ExternalSSTFileTestBase : public DBTestBase { + public: + ExternalSSTFileTestBase() + : DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) { + sst_files_dir_ = dbname_ + "/sst_files/"; + DestroyAndRecreateExternalSSTFilesDir(); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); + } + + ~ExternalSSTFileTestBase() override { + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); + } + + protected: + std::string sst_files_dir_; +}; + class ExternSSTFileLinkFailFallbackTest - : public DBTestBase, + : public 
ExternalSSTFileTestBase, public ::testing::WithParamInterface> { public: ExternSSTFileLinkFailFallbackTest() - : DBTestBase("/external_sst_file_test"), - test_env_(new ExternalSSTTestEnv(env_, true)) { - sst_files_dir_ = dbname_ + "/sst_files/"; - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + : test_env_(new ExternalSSTTestEnv(env_, true)) { options_ = CurrentOptions(); options_.disable_auto_compactions = true; options_.env = test_env_; @@ -59,24 +84,15 @@ } protected: - std::string sst_files_dir_; Options options_; ExternalSSTTestEnv* test_env_; }; class ExternalSSTFileTest - : public DBTestBase, + : public ExternalSSTFileTestBase, public ::testing::WithParamInterface> { public: - ExternalSSTFileTest() : DBTestBase("/external_sst_file_test") { - sst_files_dir_ = dbname_ + "/sst_files/"; - DestroyAndRecreateExternalSSTFilesDir(); - } - - void DestroyAndRecreateExternalSSTFilesDir() { - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); - } + ExternalSSTFileTest() {} Status GenerateOneExternalFile( const Options& options, ColumnFamilyHandle* cfh, @@ -111,7 +127,7 @@ for (const auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -166,7 +182,7 @@ for (auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -208,11 +224,10 @@ size_t num_cfs = column_families.size(); assert(ifos.size() == num_cfs); assert(data.size() == num_cfs); - Status s; std::vector args(num_cfs); for (size_t i = 0; i != num_cfs; ++i) { std::string external_file_path; - s = GenerateOneExternalFile( + Status s = GenerateOneExternalFile( options, column_families[i], data[i], file_id, sort_data, &external_file_path, true_data.size() == num_cfs ? 
&true_data[i] : nullptr); @@ -225,8 +240,7 @@ args[i].external_files.push_back(external_file_path); args[i].options = ifos[i]; } - s = db_->IngestExternalFiles(args); - return s; + return db_->IngestExternalFiles(args); } Status GenerateAndAddExternalFile( @@ -277,11 +291,8 @@ return db_->IngestExternalFile(files, opts); } - ~ExternalSSTFileTest() override { test::DestroyDir(env_, sst_files_dir_); } - protected: int last_file_id_ = 0; - std::string sst_files_dir_; }; TEST_F(ExternalSSTFileTest, Basic) { @@ -300,8 +311,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); // Current file size should be non-zero after success write. ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -314,8 +324,7 @@ ASSERT_EQ(file1_info.smallest_range_del_key, ""); ASSERT_EQ(file1_info.largest_range_del_key, ""); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -324,11 +333,9 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -342,9 +349,8 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); + 
ASSERT_OK(sst_file_writer.Finish(&file3_info)); - ASSERT_TRUE(s.ok()) << s.ToString(); // Current file size should be non-zero after success finish. ASSERT_GT(sst_file_writer.FileSize(), 0); ASSERT_EQ(file3_info.file_path, file3); @@ -360,8 +366,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -374,8 +379,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(400)); @@ -384,10 +388,9 @@ // file6.sst (delete 400 => 500) std::string file6 = sst_files_dir_ + "file6.sst"; ASSERT_OK(sst_file_writer.Open(file6)); - sst_file_writer.DeleteRange(Key(400), Key(500)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -399,17 +402,16 @@ // file7.sst (delete 500 => 570, put 520 => 599 divisible by 2) std::string file7 = sst_files_dir_ + "file7.sst"; ASSERT_OK(sst_file_writer.Open(file7)); - sst_file_writer.DeleteRange(Key(500), Key(550)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550))); for (int k = 520; k < 560; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - sst_file_writer.DeleteRange(Key(525), Key(575)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(525), 
Key(575))); for (int k = 560; k < 600; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); ASSERT_EQ(file7_info.num_entries, 40); ASSERT_EQ(file7_info.smallest_key, Key(520)); @@ -421,10 +423,9 @@ // file8.sst (delete 600 => 700) std::string file8 = sst_files_dir_ + "file8.sst"; ASSERT_OK(sst_file_writer.Open(file8)); - sst_file_writer.DeleteRange(Key(600), Key(700)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700))); ExternalSstFileInfo file8_info; - s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file8_info)); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -436,13 +437,11 @@ // Cannot create an empty sst file std::string file_empty = sst_files_dir_ + "file_empty.sst"; ExternalSstFileInfo file_empty_info; - s = sst_file_writer.Finish(&file_empty_info); - ASSERT_NOK(s); + ASSERT_NOK(sst_file_writer.Finish(&file_empty_info)); DestroyAndReopen(options); // Add file using file path - s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile({file1})); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -463,12 +462,10 @@ } // This file has overlapping values with the existing data - s = DeprecatedAddFile({file3}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file3})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file4}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file4})); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -476,17 +473,16 @@ } 
ASSERT_NE(db_->GetLatestSequenceNumber(), 0U); - // Key range of file5 (400 => 499) dont overlap with any keys in DB + // Key range of file5 (400 => 499) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file5})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file6}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file6})); - // Key range of file7 (500 => 598) dont overlap with any keys in DB + // Key range of file7 (500 => 598) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file7})); - // Key range of file7 (600 => 700) dont overlap with any keys in DB + // Key range of file7 (600 => 700) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file8})); // Make sure values are correct before and after flush/compaction @@ -609,15 +605,13 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); ASSERT_EQ(file1_info.largest_key, Key(99)); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -626,11 +620,9 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); 
ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -644,8 +636,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 5); ASSERT_EQ(file3_info.smallest_key, Key(195)); @@ -659,8 +650,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -673,8 +663,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(200)); @@ -686,8 +675,7 @@ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75))); ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -701,8 +689,7 @@ ASSERT_OK(sst_file_writer.Open(file7)); ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201))); ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); 
ASSERT_EQ(file7_info.num_entries, 0); ASSERT_EQ(file7_info.smallest_key, ""); @@ -722,17 +709,13 @@ DestroyAndReopen(options); // These lists of files have key ranges that overlap with each other - s = DeprecatedAddFile(file_list1); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list1)); // Both of the following overlap on the range deletion tombstone. - s = DeprecatedAddFile(file_list4); - ASSERT_FALSE(s.ok()) << s.ToString(); - s = DeprecatedAddFile(file_list5); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list4)); + ASSERT_NOK(DeprecatedAddFile(file_list5)); // Add files using file path list - s = DeprecatedAddFile(file_list0); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile(file_list0)); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 200; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -773,8 +756,7 @@ } // This file list has overlapping values with the existing data - s = DeprecatedAddFile(file_list3); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list3)); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -842,16 +824,14 @@ for (int k = i * 100; k < (i + 1) * 100; k++) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - Status s = sst_file_writer.Finish(&files_info[i]); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&files_info[i])); ASSERT_EQ(files_info[i].file_path, files[i]); ASSERT_EQ(files_info[i].num_entries, 100); ASSERT_EQ(files_info[i].smallest_key, Key(i * 100)); ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1)); } files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst"); - auto s = DeprecatedAddFile(files); - ASSERT_NOK(s) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(files)); for (int k = 0; k < n * 100; k++) { ASSERT_EQ("NOT_FOUND", Get(Key(k))); } @@ -873,17 +853,14 @@ // file1.sst (0 => 500) std::string sst_file_path = 
sst_files_dir_ + "file1.sst"; - Status s = sst_file_writer.Open(sst_file_path); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Open(sst_file_path)); for (int i = 0; i < 500; i++) { std::string k = Key(i); - s = sst_file_writer.Put(k, k + "_val"); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put(k, k + "_val")); } ExternalSstFileInfo sst_file_info; - s = sst_file_writer.Finish(&sst_file_info); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Finish(&sst_file_info)); options.delete_obsolete_files_period_micros = 0; options.disable_auto_compactions = true; @@ -895,12 +872,11 @@ ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", "xxx")); ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - s = DeprecatedAddFile({sst_file_path}); - ASSERT_OK(s); + ASSERT_OK(DeprecatedAddFile({sst_file_path})); for (int i = 0; i < 500; i++) { std::string k = Key(i); @@ -923,8 +899,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -937,8 +912,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -967,8 +941,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); 
ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 100); ASSERT_EQ(file3_info.smallest_key, Key(300)); @@ -985,6 +958,7 @@ } TEST_F(ExternalSSTFileTest, MultiThreaded) { + env_->skip_fsync_ = true; // Bulk load 10 files every file contain 1000 keys int num_files = 10; int keys_per_file = 1000; @@ -1013,8 +987,7 @@ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k))); } - Status s = sst_file_writer.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish()); }; // Write num_files files in parallel std::vector sst_writer_threads; @@ -1076,8 +1049,7 @@ // Overwrite values of keys divisible by 100 for (int k = 0; k < num_files * keys_per_file; k += 100) { std::string key = Key(k); - Status s = Put(key, key + "_new"); - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(key, key + "_new")); } for (int i = 0; i < 2; i++) { @@ -1097,6 +1069,7 @@ } TEST_F(ExternalSSTFileTest, OverlappingRanges) { + env_->skip_fsync_ = true; Random rnd(301); SequenceNumber assigned_seqno = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -1120,6 +1093,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); do { Options options = CurrentOptions(); + env_->skip_fsync_ = true; DestroyAndReopen(options); SstFileWriter sst_file_writer(EnvOptions(), options); @@ -1159,7 +1133,8 @@ // Generate the file containing the range std::string file_name = sst_files_dir_ + env_->GenerateUniqueId(); - ASSERT_OK(sst_file_writer.Open(file_name)); + s = sst_file_writer.Open(file_name); + ASSERT_OK(s); for (int k = range_start; k <= range_end; k++) { s = sst_file_writer.Put(Key(k), range_val); ASSERT_OK(s); @@ -1204,10 +1179,10 @@ // Flush / Compact the DB if (i && i % 50 == 0) { - Flush(); + ASSERT_OK(Flush()); } if (i && i % 75 == 0) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } } @@ -1228,6 +1203,7 @@ } TEST_P(ExternalSSTFileTest, PickedLevel) { + 
env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 4; @@ -1284,7 +1260,7 @@ // Hold compaction from finishing TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,1,1,2"); size_t kcnt = 0; @@ -1294,6 +1270,7 @@ } TEST_F(ExternalSSTFileTest, PickedLevelBug) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 3; @@ -1319,8 +1296,11 @@ // We have 2 overlapping files in L0 EXPECT_EQ(FilesPerLevel(), "2"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::PickedLevelBug:0"}, + {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", + "ExternalSSTFileTest::PickedLevelBug:0"}, {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"}, {"ExternalSSTFileTest::PickedLevelBug:2", "DBImpl::RunManualCompaction:0"}, @@ -1334,37 +1314,47 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - // While writing the MANIFEST start a thread that will ask for compaction - ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() { - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - }); - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); - - // Start a thread that will ingest a new file - ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() { - file_keys = {1, 2, 3}; - ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, 1)); - }); + Status bg_compact_status; + Status bg_addfile_status; - // Wait for AddFile to start picking levels and writing MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); - - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); - - // We need to verify that no compactions 
can run while AddFile is - // ingesting the files into the levels it find suitable. So we will - // wait for 2 seconds to give a chance for compactions to run during - // this period, and then make sure that no compactions where able to run - env_->SleepForMicroseconds(1000000 * 2); - ASSERT_FALSE(bg_compact_started.load()); - - // Hold AddFile from finishing writing the MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); + { + // While writing the MANIFEST start a thread that will ask for compaction + ThreadGuard bg_compact(port::Thread([&]() { + bg_compact_status = + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + })); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); + + // Start a thread that will ingest a new file + ThreadGuard bg_addfile(port::Thread([&]() { + file_keys = {1, 2, 3}; + bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1); + })); + + // Wait for AddFile to start picking levels and writing MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); + + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); + + // We need to verify that no compactions can run while AddFile is + // ingesting the files into the levels it find suitable. So we will + // wait for 2 seconds to give a chance for compactions to run during + // this period, and then make sure that no compactions where able to run + env_->SleepForMicroseconds(1000000 * 2); + bool bg_compact_started_tmp = bg_compact_started.load(); + + // Hold AddFile from finishing writing the MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); + + // check the status at the end, so even if the ASSERT fails the threads + // could be joined and return. 
+ ASSERT_FALSE(bg_compact_started_tmp); + } - bg_addfile.join(); - bg_compact.join(); + ASSERT_OK(bg_addfile_status); + ASSERT_OK(bg_compact_status); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); int total_keys = 0; Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1401,7 +1391,7 @@ // After full compaction, there should be only 1 file. std::vector files; - env_->GetChildren(dbname_, &files); + ASSERT_OK(env_->GetChildren(dbname_, &files)); int num_sst_files = 0; for (auto& f : files) { uint64_t number; @@ -1413,7 +1403,9 @@ ASSERT_EQ(1, num_sst_files); } +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 2; @@ -1469,8 +1461,10 @@ } } } +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 4; @@ -1521,7 +1515,7 @@ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2"); // Output of the compaction will go to L3 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,0,0,2"); Close(); @@ -1609,15 +1603,15 @@ generated_files[7]}; ASSERT_NOK(DeprecatedAddFile(in_files)); - // These 2 files dont overlap with each other + // These 2 files don't overlap with each other in_files = {generated_files[0], generated_files[2]}; ASSERT_OK(DeprecatedAddFile(in_files)); - // These 2 files dont overlap with each other but overlap with keys in DB + // These 2 files don't overlap with each other but overlap with keys in DB in_files = {generated_files[3], generated_files[7]}; ASSERT_NOK(DeprecatedAddFile(in_files)); - // Files dont 
overlap and dont overlap with DB key range + // Files don't overlap and don't overlap with DB key range in_files = {generated_files[4], generated_files[6], generated_files[8]}; ASSERT_OK(DeprecatedAddFile(in_files)); @@ -1663,7 +1657,7 @@ cro.exclusive_manual_compaction = false; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -1715,9 +1709,9 @@ Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("bar", "v2"); }); + port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); }); TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"); ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1, @@ -1729,7 +1723,9 @@ SyncPoint::GetInstance()->ClearAllCallBacks(); } +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { + env_->skip_fsync_ = true; Options options = CurrentOptions(); options.IncreaseParallelism(20); options.level0_slowdown_writes_trigger = 256; @@ -1746,10 +1742,8 @@ for (int i = 0; i < 500; i++) { std::vector> random_data; for (int j = 0; j < 100; j++) { - std::string k; - std::string v; - test::RandomString(&rnd, rnd.Next() % 20, &k); - test::RandomString(&rnd, rnd.Next() % 50, &v); + std::string k = rnd.RandomString(rnd.Next() % 20); + std::string v = rnd.RandomString(rnd.Next() % 50); random_data.emplace_back(k, v); } @@ -1767,10 +1761,11 @@ } size_t kcnt = 0; VerifyDBFromMap(true_data, &kcnt, false); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &kcnt, false); } } +#endif // !defined(ROCKSDB_VALGRIND_RUN) 
|| defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { Options options = CurrentOptions(); @@ -1797,7 +1792,7 @@ options, file_data, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - // This file dont overlap with anything in the DB, will go to L4 + // This file don't overlap with anything in the DB, will go to L4 ASSERT_EQ("0,0,0,0,1", FilesPerLevel()); // Insert 80 -> 130 using AddFile @@ -1822,7 +1817,7 @@ options, file_data, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - // This file dont overlap with anything in the DB and fit in L4 as well + // This file don't overlap with anything in the DB and fit in L4 as well ASSERT_EQ("2,0,0,0,2", FilesPerLevel()); // Insert 10 -> 40 using AddFile @@ -1851,8 +1846,8 @@ ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); bool write_global_seqno = std::get<0>(GetParam()); @@ -1861,40 +1856,40 @@ ASSERT_OK(GenerateAndAddExternalFile( options, {90, 100, 110}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {19, 20, 21}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); 
ASSERT_EQ(entries_in_memtable, 0); for (int k : {200, 201, 205, 206}) { ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // No need for flush, this file keys fit between the memtable keys ASSERT_OK(GenerateAndAddExternalFile( options, {202, 203, 204}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {206, 207}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_EQ(entries_in_memtable, 0); size_t kcnt = 0; @@ -2001,7 +1996,8 @@ if (running_threads.load() == 0) { break; } - env_->SleepForMicroseconds(500000); + // Make sure we do a "real sleep", not a mock one. 
+ SystemClock::Default()->SleepForMicroseconds(500000); } ASSERT_EQ(running_threads.load(), 0); @@ -2059,16 +2055,16 @@ IngestExternalFileOptions ifo; - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo)); - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo)); // SST CF match ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo)); - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo)); - // SST CF dont match + // SST CF don't match ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo)); // SST CF match ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo)); @@ -2292,7 +2288,7 @@ ASSERT_OK(Put(Key(i), "memtable")); true_data[Key(i)] = "memtable"; } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Universal picker should go at second from the bottom level ASSERT_EQ("0,1", FilesPerLevel()); ASSERT_OK(GenerateAndAddExternalFile( @@ -2306,7 +2302,7 @@ verify_checksums_before_ingest, true /*ingest_behind*/, false /*sort_data*/, &true_data)); ASSERT_EQ("0,1,1", FilesPerLevel()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // bottom level should be empty ASSERT_EQ("0,1", FilesPerLevel()); @@ -2383,20 +2379,67 @@ Random rnd(301); std::vector> random_data; for (int i = 0; i < kNumEntries; i++) { - std::string val; - test::RandomString(&rnd, kNumBytesPerEntry, &val); + std::string val = rnd.RandomString(kNumBytesPerEntry); random_data.emplace_back(Key(i), std::move(val)); } ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data))); ASSERT_EQ(1, num_compression_dicts); } +class ExternalSSTBlockChecksumTest + : public ExternalSSTFileTestBase, + public 
testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P(FormatVersions, ExternalSSTBlockChecksumTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); + +// Very slow, not worth the cost to run regularly +TEST_P(ExternalSSTBlockChecksumTest, DISABLED_HugeBlockChecksum) { + BlockBasedTableOptions table_options; + table_options.format_version = GetParam(); + for (auto t : GetSupportedChecksums()) { + table_options.checksum = t; + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // 2^32 - 1, will lead to data block with more than 2^32 bytes + size_t huge_size = port::kMaxUint32; + + std::string f = sst_files_dir_ + "f.sst"; + ASSERT_OK(sst_file_writer.Open(f)); + { + Random64 r(123); + std::string huge(huge_size, 0); + for (size_t j = 0; j + 7 < huge_size; j += 8) { + EncodeFixed64(&huge[j], r.Next()); + } + ASSERT_OK(sst_file_writer.Put("Huge", huge)); + } + + ExternalSstFileInfo f_info; + ASSERT_OK(sst_file_writer.Finish(&f_info)); + ASSERT_GT(f_info.file_size, uint64_t{huge_size} + 10); + + SstFileReader sst_file_reader(options); + ASSERT_OK(sst_file_reader.Open(f)); + ASSERT_OK(sst_file_reader.VerifyChecksum()); + } +} + TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { std::unique_ptr fault_injection_env( new FaultInjectionTestEnv(env_)); Options options = CurrentOptions(); options.env = fault_injection_env.get(); CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + // Exercise different situations in different column families: two are empty + // (so no new sequence number is needed), but at least one overlaps with the + // DB and needs to bump the sequence number. 
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "oldvalue")); + std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); @@ -2420,9 +2463,8 @@ // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_OK(s); + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, options); @@ -2603,9 +2645,8 @@ std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" @@ -2673,9 +2714,8 @@ std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" @@ -2748,9 +2788,8 @@ std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" @@ -2761,7 +2800,7 @@ "PartialManifestWriteFail:1"); ingest_thread.join(); - 
fault_injection_env->DropUnsyncedFileData(); + ASSERT_OK(fault_injection_env->DropUnsyncedFileData()); fault_injection_env->SetFilesystemActive(true); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, @@ -2796,7 +2835,103 @@ // sure that it won't enter the 2nd writer queue for the second time. std::vector> data; data.push_back(std::make_pair("1001", "v2")); - GenerateAndAddExternalFile(options, data); + ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true)); +} + +TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + constexpr size_t kValueSize = 8; + Random rnd(301); + std::string value = rnd.RandomString(kValueSize); + + // Write some key to make global seqno larger than zero + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("ab" + Key(i), value)); + } + // Get a Snapshot to make RocksDB assign global seqno to ingested sst files. + auto snap = dbfull()->GetSnapshot(); + + std::string fname = sst_files_dir_ + "test_file"; + ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options); + ASSERT_OK(writer.Open(fname)); + std::string key1 = "ab"; + std::string key2 = "ab"; + + // Make the prefix of key2 is same with key1 add zero seqno. 
The tail of every + // key is composed as (seqno << 8 | value_type), and here `1` represents + // ValueType::kTypeValue + + PutFixed64(&key2, PackSequenceAndType(0, kTypeValue)); + key2 += "cdefghijkl"; + + ASSERT_OK(writer.Put(key1, value)); + ASSERT_OK(writer.Put(key2, value)); + + ExternalSstFileInfo info; + ASSERT_OK(writer.Finish(&info)); + + ASSERT_OK(dbfull()->IngestExternalFile({info.file_path}, + IngestExternalFileOptions())); + dbfull()->ReleaseSnapshot(snap); + ASSERT_EQ(value, Get(key1)); + // You will get error here + ASSERT_EQ(value, Get(key2)); +} + +TEST_P(ExternalSSTFileTest, + DeltaEncodingWhileGlobalSeqnoPresentIteratorSwitch) { + // Regression test for bug where global seqno corrupted the shared bytes + // buffer when switching from reverse iteration to forward iteration. + constexpr size_t kValueSize = 8; + Options options = CurrentOptions(); + + Random rnd(301); + std::string value = rnd.RandomString(kValueSize); + + std::string key0 = "aa"; + std::string key1 = "ab"; + // Make the prefix of key2 is same with key1 add zero seqno. The tail of every + // key is composed as (seqno << 8 | value_type), and here `1` represents + // ValueType::kTypeValue + std::string key2 = "ab"; + PutFixed64(&key2, PackSequenceAndType(0, kTypeValue)); + key2 += "cdefghijkl"; + std::string key3 = key2 + "_"; + + // Write some key to make global seqno larger than zero + ASSERT_OK(Put(key0, value)); + + std::string fname = sst_files_dir_ + "test_file"; + ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options); + ASSERT_OK(writer.Open(fname)); + + // key0 is a dummy to ensure the turnaround point (key1) comes from Prev + // cache rather than block (restart keys are pinned in block). 
+ ASSERT_OK(writer.Put(key0, value)); + ASSERT_OK(writer.Put(key1, value)); + ASSERT_OK(writer.Put(key2, value)); + ASSERT_OK(writer.Put(key3, value)); + + ExternalSstFileInfo info; + ASSERT_OK(writer.Finish(&info)); + + ASSERT_OK(dbfull()->IngestExternalFile({info.file_path}, + IngestExternalFileOptions())); + ReadOptions read_opts; + // Prevents Seek() when switching directions, which circumvents the bug. + read_opts.total_order_seek = true; + Iterator* iter = db_->NewIterator(read_opts); + // Scan backwards to key2. File iterator will then be positioned at key1. + iter->Seek(key3); + ASSERT_EQ(key3, iter->key()); + iter->Prev(); + ASSERT_EQ(key2, iter->key()); + // Scan forwards and make sure key3 is present. Previously key3 would be + // corrupted by the global seqno from key1. + iter->Next(); + ASSERT_EQ(key3, iter->key()); + delete iter; } INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/fault_injection_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/fault_injection_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/fault_injection_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/fault_injection_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,17 +16,21 @@ #include "db/version_set.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "test_util/fault_injection_test_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/mutexlock.h" +#include "util/random.h" +#include "utilities/fault_injection_env.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif namespace ROCKSDB_NAMESPACE { @@ -57,7 +61,6 @@ bool sequential_order_; - protected: public: enum 
ExpectedVerifResult { kValExpectFound, kValExpectNoError }; enum ResetMethod { @@ -81,7 +84,11 @@ sync_use_compact_(true), base_env_(nullptr), env_(nullptr), - db_(nullptr) {} + db_(nullptr) { + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &system_env_, &env_guard_)); + EXPECT_NE(system_env_, nullptr); + } ~FaultInjectionTest() override { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -94,7 +101,7 @@ return false; } else { if (option_config_ == kMultiLevels) { - base_env_.reset(new MockEnv(Env::Default())); + base_env_.reset(MockEnv::Create(system_env_)); } return true; } @@ -146,8 +153,7 @@ assert(tiny_cache_ == nullptr); assert(env_ == nullptr); - env_ = - new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default()); + env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : system_env_); options_ = CurrentOptions(); options_.env = env_; @@ -192,7 +198,7 @@ for (int i = start_idx; i < start_idx + num_vals; i++) { Slice key = Key(i, &key_space); batch.Clear(); - batch.Put(key, Value(i, &value_space)); + ASSERT_OK(batch.Put(key, Value(i, &value_space))); ASSERT_OK(db_->Write(write_options, &batch)); } } @@ -249,7 +255,8 @@ // Return the value to associate with the specified key Slice Value(int k, std::string* storage) const { Random r(k); - return test::RandomString(&r, kValueSize, storage); + *storage = r.RandomString(kValueSize); + return Slice(*storage); } void CloseDB() { @@ -271,12 +278,12 @@ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); } - + ASSERT_OK(iter->status()); delete iter; FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); } // rnd cannot be null for kResetDropRandomUnsyncedData @@ -309,7 +316,7 @@ Build(write_options, 0, num_pre_sync); if (sync_use_compact_) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + 
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } write_options.sync = false; Build(write_options, num_pre_sync, num_post_sync); @@ -341,9 +348,13 @@ } void WaitCompactionFinish() { - static_cast(db_->GetRootDB())->TEST_WaitForCompact(); + ASSERT_OK(static_cast(db_->GetRootDB())->TEST_WaitForCompact()); ASSERT_OK(db_->Put(WriteOptions(), "", "")); } + + private: + Env* system_env_; + std::shared_ptr env_guard_; }; class FaultInjectionTestSplitted : public FaultInjectionTest {}; @@ -408,7 +419,7 @@ write_options.sync = true; ASSERT_OK( db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(false)); env_->SetFilesystemActive(false); NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); @@ -449,7 +460,7 @@ Build(WriteOptions(), 0, kNumKeys); FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); ASSERT_OK(db_->Put(WriteOptions(), "", "")); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0"); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1"); @@ -520,9 +531,9 @@ wo.sync = true; wo.disableWAL = false; WriteBatch batch; - batch.Put("cats", "dogs"); + ASSERT_OK(batch.Put("cats", "dogs")); batch.MarkWalTerminationPoint(); - batch.Put("boys", "girls"); + ASSERT_OK(batch.Put("boys", "girls")); ASSERT_OK(db_->Write(wo, &batch)); env_->SetFilesystemActive(false); @@ -535,6 +546,76 @@ ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound()); } +TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) { + auto fault_fs = std::make_shared(FileSystem::Default()); + fault_fs->EnableWriteErrorInjection(); + fault_fs->SetFilesystemDirectWritable(false); + const std::string file_name = NormalizePath(dbname_ + "/test_file"); + std::unique_ptr log_writer = nullptr; + constexpr uint64_t log_number = 0; + { + std::unique_ptr file; + const Status s = + fault_fs->NewWritableFile(file_name, FileOptions(), &file, nullptr); + 
ASSERT_OK(s); + std::unique_ptr fwriter( + new WritableFileWriter(std::move(file), file_name, FileOptions())); + log_writer.reset(new log::Writer(std::move(fwriter), log_number, + /*recycle_log_files=*/false)); + } + + fault_fs->SetRandomWriteError( + 0xdeadbeef, /*one_in=*/1, IOStatus::IOError("Injected IOError"), + /*inject_for_all_file_types=*/true, /*types=*/{}); + + { + VersionEdit edit; + edit.SetColumnFamily(0); + std::string buf; + assert(edit.EncodeTo(&buf)); + const Status s = log_writer->AddRecord(buf); + ASSERT_NOK(s); + } + + fault_fs->DisableWriteErrorInjection(); + + // Closing the log writer will cause WritableFileWriter::Close() and flush + // remaining data from its buffer to underlying file. + log_writer.reset(); + + { + std::unique_ptr file; + Status s = + fault_fs->NewSequentialFile(file_name, FileOptions(), &file, nullptr); + ASSERT_OK(s); + std::unique_ptr freader( + new SequentialFileReader(std::move(file), file_name)); + Status log_read_s; + class LogReporter : public log::Reader::Reporter { + public: + Status* status_; + explicit LogReporter(Status* _s) : status_(_s) {} + void Corruption(size_t /*bytes*/, const Status& _s) override { + if (status_->ok()) { + *status_ = _s; + } + } + } reporter(&log_read_s); + std::unique_ptr log_reader(new log::Reader( + nullptr, std::move(freader), &reporter, /*checksum=*/true, log_number)); + Slice record; + std::string data; + size_t count = 0; + while (log_reader->ReadRecord(&record, &data) && log_read_s.ok()) { + VersionEdit edit; + ASSERT_OK(edit.DecodeFrom(data)); + ++count; + } + // Verify that only one version edit exists in the file. 
+ ASSERT_EQ(1, count); + } +} + INSTANTIATE_TEST_CASE_P( FaultTest, FaultInjectionTest, ::testing::Values(std::make_tuple(false, kDefault, kEnd), @@ -551,5 +632,6 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/filename_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/filename_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/filename_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/filename_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include "file/filename.h" #include "db/dbformat.h" -#include "logging/logging.h" #include "port/port.h" #include "test_util/testharness.h" @@ -35,23 +34,23 @@ FileType type; char mode; } cases[] = { - {"100.log", 100, kLogFile, kAllMode}, - {"0.log", 0, kLogFile, kAllMode}, - {"0.sst", 0, kTableFile, kAllMode}, - {"CURRENT", 0, kCurrentFile, kAllMode}, - {"LOCK", 0, kDBLockFile, kAllMode}, - {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, - {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, - {"METADB-2", 2, kMetaDatabase, kAllMode}, - {"METADB-7", 7, kMetaDatabase, kAllMode}, - {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, - {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, - {"18446744073709551615.log", 18446744073709551615ull, kLogFile, - kAllMode}, }; + {"100.log", 100, kWalFile, kAllMode}, + {"0.log", 0, kWalFile, kAllMode}, + {"0.sst", 0, kTableFile, kAllMode}, + {"CURRENT", 0, kCurrentFile, kAllMode}, + {"LOCK", 0, kDBLockFile, kAllMode}, + {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, + {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, + {"METADB-2", 2, kMetaDatabase, 
kAllMode}, + {"METADB-7", 7, kMetaDatabase, kAllMode}, + {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, + {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, + {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode}, + }; for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) { for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir"); @@ -108,7 +107,7 @@ TEST_F(FileNameTest, InfoLogFileName) { std::string dbname = ("/data/rocksdb"); std::string db_absolute_path; - Env::Default()->GetAbsolutePath(dbname, &db_absolute_path); + ASSERT_OK(Env::Default()->GetAbsolutePath(dbname, &db_absolute_path)); ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, "")); ASSERT_EQ("/data/rocksdb/LOG.old.666", @@ -142,7 +141,7 @@ ASSERT_EQ("foo/", std::string(fname.data(), 4)); ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(192U, number); - ASSERT_EQ(kLogFile, type); + ASSERT_EQ(kWalFile, type); fname = TableFileName({DbPath("bar", 0)}, 200, 0); std::string fname1 = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -39,8 +39,6 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block_based/block.h" -#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include 
"table/two_level_iterator.h" @@ -77,28 +75,32 @@ return "Manual Flush"; case FlushReason::kErrorRecovery: return "Error Recovery"; + case FlushReason::kWalFull: + return "WAL Full"; default: return "Invalid"; } } -FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, - const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, - const uint64_t* max_memtable_id, - const FileOptions& file_options, VersionSet* versions, - InstrumentedMutex* db_mutex, - std::atomic* shutting_down, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, JobContext* job_context, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_file_directory, - CompressionType output_compression, Statistics* stats, - EventLogger* event_logger, bool measure_io_stats, - const bool sync_output_directory, const bool write_manifest, - Env::Priority thread_pri) +FlushJob::FlushJob( + const std::string& dbname, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, JobContext* job_context, + LogBuffer* log_buffer, FSDirectory* db_directory, + FSDirectory* output_file_directory, CompressionType output_compression, + Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) : dbname_(dbname), + db_id_(db_id), + db_session_id_(db_session_id), cfd_(cfd), db_options_(db_options), 
mutable_cf_options_(mutable_cf_options), @@ -123,13 +125,18 @@ edit_(nullptr), base_(nullptr), pick_memtable_called(false), - thread_pri_(thread_pri) { + thread_pri_(thread_pri), + io_tracer_(io_tracer), + clock_(db_options_.clock), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { // Update the thread status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); } FlushJob::~FlushJob() { + io_status_.PermitUncheckedError(); ThreadStatusUtil::ResetThreadStatus(); } @@ -159,7 +166,6 @@ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } - void FlushJob::PickMemTable() { db_mutex_->AssertHeld(); assert(!pick_memtable_called); @@ -190,8 +196,8 @@ base_->Ref(); // it is likely that we do not need this reference } -Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, - FileMetaData* file_meta) { +Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, + bool* switched_to_mempurge) { TEST_SYNC_POINT("FlushJob::Start"); db_mutex_->AssertHeld(); assert(pick_memtable_called); @@ -221,9 +227,43 @@ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); } - - // This will release and re-acquire the mutex. - Status s = WriteLevel0Table(); + Status mempurge_s = Status::NotFound("No MemPurge."); + if ((db_options_.experimental_mempurge_threshold > 0.0) && + (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) && + (!mems_.empty()) && MemPurgeDecider()) { + mempurge_s = MemPurge(); + if (!mempurge_s.ok()) { + // Mempurge is typically aborted when the output + // bytes cannot be contained onto a single output memtable. + if (mempurge_s.IsAborted()) { + ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n", + mempurge_s.ToString().c_str()); + } else { + // However the mempurge process can also fail for + // other reasons (eg: new_mem->Add() fails). 
+ ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n", + mempurge_s.ToString().c_str()); + } + } else { + if (switched_to_mempurge) { + *switched_to_mempurge = true; + } else { + // The mempurge process was successful, but no switch_to_mempurge + // pointer provided so no way to propagate the state of flush job. + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge process succeeded" + "but no 'switched_to_mempurge' ptr provided.\n"); + } + } + } + Status s; + if (mempurge_s.ok()) { + base_->Unref(); + s = Status::OK(); + } else { + // This will release and re-acquire the mutex. + s = WriteLevel0Table(); + } if (s.ok() && cfd_->IsDropped()) { s = Status::ColumnFamilyDropped("Column family dropped during compaction"); @@ -238,10 +278,17 @@ } else if (write_manifest_) { TEST_SYNC_POINT("FlushJob::InstallResults"); // Replace immutable memtable with the generated Table + IOStatus tmp_io_s; s = cfd_->imm()->TryInstallMemtableFlushResults( cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, - log_buffer_, &committed_flush_jobs_info_); + log_buffer_, &committed_flush_jobs_info_, &tmp_io_s, + !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted), + but 'false' if mempurge successful: no new min log number + or new level 0 file path to write to manifest. 
*/); + if (!tmp_io_s.ok()) { + io_status_ = tmp_io_s; + } } if (s.ok() && file_meta != nullptr) { @@ -262,6 +309,13 @@ stream << vstorage->NumLevelFiles(level); } stream.EndArray(); + + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + stream << "blob_file_head" << blob_files.begin()->first; + stream << "blob_file_tail" << blob_files.rbegin()->first; + } + stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed(); if (measure_io_stats_) { @@ -289,13 +343,457 @@ base_->Unref(); } +Status FlushJob::MemPurge() { + Status s; + db_mutex_->AssertHeld(); + db_mutex_->Unlock(); + assert(!mems_.empty()); + + // Measure purging time. + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); + + MemTable* new_mem = nullptr; + // For performance/log investigation purposes: + // look at how much useful payload we harvest in the new_mem. + // This value is then printed to the DB log. + double new_mem_capacity = 0.0; + + // Create two iterators, one for the memtable data (contains + // info from puts + deletes), and one for the memtable + // Range Tombstones (from DeleteRanges). + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + std::vector memtables; + std::vector> + range_del_iters; + for (MemTable* m : mems_) { + memtables.push_back(m->NewIterator(ro, &arena)); + auto* range_del_iter = m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } + } + + assert(!memtables.empty()); + SequenceNumber first_seqno = kMaxSequenceNumber; + SequenceNumber earliest_seqno = kMaxSequenceNumber; + // Pick first and earliest seqno as min of all first_seqno + // and earliest_seqno of the mempurged memtables. + for (const auto& mem : mems_) { + first_seqno = mem->GetFirstSequenceNumber() < first_seqno + ? 
mem->GetFirstSequenceNumber() + : first_seqno; + earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno + ? mem->GetEarliestSequenceNumber() + : earliest_seqno; + } + + ScopedArenaIterator iter( + NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(), + static_cast(memtables.size()), &arena)); + + auto* ioptions = cfd_->ioptions(); + + // Place iterator at the First (meaning most recent) key node. + iter->SeekToFirst(); + + std::unique_ptr range_del_agg( + new CompactionRangeDelAggregator(&(cfd_->internal_comparator()), + existing_snapshots_)); + for (auto& rd_iter : range_del_iters) { + range_del_agg->AddTombstones(std::move(rd_iter)); + } + + // If there is valid data in the memtable, + // or at least range tombstones, copy over the info + // to the new memtable. + if (iter->Valid() || !range_del_agg->IsEmpty()) { + // MaxSize is the size of a memtable. + size_t maxSize = mutable_cf_options_.write_buffer_size; + std::unique_ptr compaction_filter; + if (ioptions->compaction_filter_factory != nullptr && + ioptions->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kFlush)) { + CompactionFilter::Context ctx; + ctx.is_full_compaction = false; + ctx.is_manual_compaction = false; + ctx.column_family_id = cfd_->GetID(); + ctx.reason = TableFileCreationReason::kFlush; + compaction_filter = + ioptions->compaction_filter_factory->CreateCompactionFilter(ctx); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return s; + } + } + + new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()), + mutable_cf_options_, cfd_->write_buffer_mgr(), + earliest_seqno, cfd_->GetID()); + assert(new_mem != nullptr); + + Env* env = db_options_.env; + assert(env); + MergeHelper merge( + env, (cfd_->internal_comparator()).user_comparator(), + (ioptions->merge_operator).get(), 
compaction_filter.get(), + ioptions->logger, true /* internal key corruption is not ok */, + existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), + snapshot_checker_); + CompactionIterator c_iter( + iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge, + kMaxSequenceNumber, &existing_snapshots_, + earliest_write_conflict_snapshot_, snapshot_checker_, env, + ShouldReportDetailedTime(env, ioptions->stats), + true /* internal key corruption is not ok */, range_del_agg.get(), + nullptr, ioptions->allow_data_in_errors, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, + /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, ioptions->info_log, + &(cfd_->GetFullHistoryTsLow())); + + // Set earliest sequence number in the new memtable + // to be equal to the earliest sequence number of the + // memtable being flushed (See later if there is a need + // to update this number!). + new_mem->SetEarliestSequenceNumber(earliest_seqno); + // Likewise for first seq number. + new_mem->SetFirstSequenceNumber(first_seqno); + SequenceNumber new_first_seqno = kMaxSequenceNumber; + + c_iter.SeekToFirst(); + + // Key transfer + for (; c_iter.Valid(); c_iter.Next()) { + const ParsedInternalKey ikey = c_iter.ikey(); + const Slice value = c_iter.value(); + new_first_seqno = + ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno; + + // Should we update "OldestKeyTime" ???? -> timestamp appear + // to still be an "experimental" feature. + s = new_mem->Add( + ikey.sequence, ikey.type, ikey.user_key, value, + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. 
+ nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted("Mempurge filled more than one memtable."); + new_mem_capacity = 1.0; + break; + } + } + + // Check status and propagate + // potential error status from c_iter + if (!s.ok()) { + c_iter.status().PermitUncheckedError(); + } else if (!c_iter.status().ok()) { + s = c_iter.status(); + } + + // Range tombstone transfer. + if (s.ok()) { + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + new_first_seqno = + tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno; + s = new_mem->Add( + tombstone.seq_, // Sequence number + kTypeRangeDeletion, // KV type + tombstone.start_key_, // Key is start key. + tombstone.end_key_, // Value is end key. + nullptr, // KV protection info set as nullptr since it + // should only be useful for the first add to + // the original memtable. + false, // : allow concurrent_memtable_writes_ + // Not seen as necessary for now. + nullptr, // get_post_process_info(m) must be nullptr + // when concurrent_memtable_writes is switched off. + nullptr); // hint, only used when concurrent_memtable_writes_ + // is switched on. + + if (!s.ok()) { + break; + } + + // If new_mem has size greater than maxSize, + // then rollback to regular flush operation, + // and destroy new_mem. + if (new_mem->ApproximateMemoryUsage() > maxSize) { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + break; + } + } + } + + // If everything happened smoothly and new_mem contains valid data, + // decide if it is flushed to storage or kept in the imm() + // memtable list (memory). 
+ if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) { + // Rectify the first sequence number, which (unlike the earliest seq + // number) needs to be present in the new memtable. + new_mem->SetFirstSequenceNumber(new_first_seqno); + + // The new_mem is added to the list of immutable memtables + // only if it filled at less than 100% capacity and isn't flagged + // as in need of being flushed. + if (new_mem->ApproximateMemoryUsage() < maxSize && + !(new_mem->ShouldFlushNow())) { + db_mutex_->Lock(); + uint64_t new_mem_id = mems_[0]->GetID(); + + new_mem->SetID(new_mem_id); + + // This addition will not trigger another flush, because + // we do not call SchedulePendingFlush(). + cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free); + new_mem->Ref(); +#ifndef ROCKSDB_LITE + // Piggyback FlushJobInfo on the first flushed memtable. + db_mutex_->AssertHeld(); + meta_.fd.file_size = 0; + mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); +#endif // !ROCKSDB_LITE + db_mutex_->Unlock(); + } else { + s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); + new_mem_capacity = 1.0; + if (new_mem) { + job_context_->memtables_to_free.push_back(new_mem); + } + } + } else { + // In this case, the newly allocated new_mem is empty. + assert(new_mem != nullptr); + job_context_->memtables_to_free.push_back(new_mem); + } + } + + // Reacquire the mutex for WriteLevel0 function. + db_mutex_->Lock(); + + // If mempurge successful, don't write input tables to level0, + // but write any full output table to level0. + if (s.ok()) { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful"); + } else { + TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful"); + } + const uint64_t micros = clock_->NowMicros() - start_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Mempurge lasted %" PRIu64 + " microseconds, and %" PRIu64 + " cpu " + "microseconds. Status is %s ok. 
Perc capacity: %f\n", + cfd_->GetName().c_str(), job_context_->job_id, micros, + cpu_micros, s.ok() ? "" : "not", new_mem_capacity); + + return s; +} + +bool FlushJob::MemPurgeDecider() { + double threshold = db_options_.experimental_mempurge_threshold; + // Never trigger mempurge if threshold is not a strictly positive value. + if (!(threshold > 0.0)) { + return false; + } + if (threshold > (1.0 * mems_.size())) { + return true; + } + // Payload and useful_payload (in bytes). + // The useful payload ratio of a given MemTable + // is estimated to be useful_payload/payload. + uint64_t payload = 0, useful_payload = 0, entry_size = 0; + + // Local variables used repetitively inside the for-loop + // when iterating over the sampled entries. + Slice key_slice, value_slice; + ParsedInternalKey res; + SnapshotImpl min_snapshot; + std::string vget; + Status mget_s, parse_s; + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0, sqno = 0, + min_seqno_snapshot = 0; + bool get_res, can_be_useful_payload, not_in_next_mems; + + // If estimated_useful_payload is > threshold, + // then flush to storage, else MemPurge. + double estimated_useful_payload = 0.0; + // Cochran formula for determining sample size. + // 95% confidence interval, 7% precision. + // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 + double n0 = 196.0; + ReadOptions ro; + ro.total_order_seek = true; + + // Iterate over each memtable of the set. + for (auto mem_iter = std::begin(mems_); mem_iter != std::end(mems_); + mem_iter++) { + MemTable* mt = *mem_iter; + + // Else sample from the table. + uint64_t nentries = mt->num_entries(); + // Corrected Cochran formula for small populations + // (converges to n0 for large populations). + uint64_t target_sample_size = + static_cast(ceil(n0 / (1.0 + (n0 / nentries)))); + std::unordered_set sentries = {}; + // Populate sample entries set. 
+ mt->UniqueRandomSample(target_sample_size, &sentries); + + // Estimate the garbage ratio by comparing if + // each sample corresponds to a valid entry. + for (const char* ss : sentries) { + key_slice = GetLengthPrefixedSlice(ss); + parse_s = ParseInternalKey(key_slice, &res, true /*log_err_key*/); + if (!parse_s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Memtable Decider: ParseInternalKey did not parse " + "key_slice %s successfully.", + key_slice.data()); + } + + // Size of the entry is "key size (+ value size if KV entry)" + entry_size = key_slice.size(); + if (res.type == kTypeValue) { + value_slice = + GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); + entry_size += value_slice.size(); + } + + // Count entry bytes as payload. + payload += entry_size; + + LookupKey lkey(res.user_key, kMaxSequenceNumber); + + // Paranoia: zero out these values just in case. + max_covering_tombstone_seq = 0; + sqno = 0; + + // Pick the oldest existing snapshot that is more recent + // than the sequence number of the sampled entry. + min_seqno_snapshot = kMaxSequenceNumber; + for (SequenceNumber seq_num : existing_snapshots_) { + if (seq_num > res.sequence && seq_num < min_seqno_snapshot) { + min_seqno_snapshot = seq_num; + } + } + min_snapshot.number_ = min_seqno_snapshot; + ro.snapshot = + min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr; + + // Estimate if the sample entry is valid or not. + get_res = mt->Get(lkey, &vget, nullptr, &mget_s, &merge_context, + &max_covering_tombstone_seq, &sqno, ro); + if (!get_res) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Memtable Get returned false when Get(sampled entry). " + "Yet each sample entry should exist somewhere in the memtable, " + "unrelated to whether it has been deleted or not."); + } + + // TODO(bjlemaire): evaluate typeMerge. + // This is where the sampled entry is estimated to be + // garbage or not. 
Note that this is a garbage *estimation* + // because we do not include certain items such as + // CompactionFitlers triggered at flush, or if the same delete + // has been inserted twice or more in the memtable. + + // Evaluate if the entry can be useful payload + // Situation #1: entry is a KV entry, was found in the memtable mt + // and the sequence numbers match. + can_be_useful_payload = (res.type == kTypeValue) && get_res && + mget_s.ok() && (sqno == res.sequence); + + // Situation #2: entry is a delete entry, was found in the memtable mt + // (because gres==true) and no valid KV entry is found. + // (note: duplicate delete entries are also taken into + // account here, because the sequence number 'sqno' + // in memtable->Get(&sqno) operation is set to be equal + // to the most recent delete entry as well). + can_be_useful_payload |= + ((res.type == kTypeDeletion) || (res.type == kTypeSingleDeletion)) && + mget_s.IsNotFound() && get_res && (sqno == res.sequence); + + // If there is a chance that the entry is useful payload + // Verify that the entry does not appear in the following memtables + // (memtables with greater memtable ID/larger sequence numbers). + if (can_be_useful_payload) { + not_in_next_mems = true; + for (auto next_mem_iter = mem_iter + 1; + next_mem_iter != std::end(mems_); next_mem_iter++) { + if ((*next_mem_iter) + ->Get(lkey, &vget, nullptr, &mget_s, &merge_context, + &max_covering_tombstone_seq, &sqno, ro)) { + not_in_next_mems = false; + break; + } + } + if (not_in_next_mems) { + useful_payload += entry_size; + } + } + } + if (payload > 0) { + // We use the estimated useful payload ratio to + // evaluate how many of the memtable bytes are useful bytes. 
+ estimated_useful_payload += + (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload); + + ROCKS_LOG_INFO( + db_options_.info_log, + "Mempurge sampling - found garbage ratio from sampling: %f.\n", + (payload - useful_payload) * 1.0 / payload); + } else { + ROCKS_LOG_WARN(db_options_.info_log, + "Mempurge sampling: null payload measured, and collected " + "sample size is %zu\n.", + sentries.size()); + } + } + // We convert the total number of useful payload bytes + // into the proportion of memtable necessary to store all these bytes. + // We compare this proportion with the threshold value. + return ((estimated_useful_payload / mutable_cf_options_.write_buffer_size) < + threshold); +} + Status FlushJob::WriteLevel0Table() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_FLUSH_WRITE_L0); db_mutex_->AssertHeld(); - const uint64_t start_micros = db_options_.env->NowMicros(); - const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000; + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUMicros(); Status s; + + std::vector blob_file_additions; + { auto write_hint = cfd_->CalculateSSTWriteHint(0); db_mutex_->Unlock(); @@ -342,7 +840,7 @@ { ScopedArenaIterator iter( - NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], + NewMergingIterator(&cfd_->internal_comparator(), memtables.data(), static_cast(memtables.size()), &arena)); ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started", @@ -352,7 +850,7 @@ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", &output_compression_); int64_t _current_time = 0; - auto status = db_options_.env->GetCurrentTime(&_current_time); + auto status = clock_->GetCurrentTime(&_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed. if (!status.ok()) { ROCKS_LOG_WARN( @@ -368,23 +866,65 @@ // It's not clear whether oldest_key_time is always available. 
In case // it is not available, use current_time. - meta_.oldest_ancester_time = std::min(current_time, oldest_key_time); + uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time); + + TEST_SYNC_POINT_CALLBACK( + "FlushJob::WriteLevel0Table:oldest_ancester_time", + &oldest_ancester_time); + meta_.oldest_ancester_time = oldest_ancester_time; + meta_.file_creation_time = current_time; + uint64_t creation_time = (cfd_->ioptions()->compaction_style == + CompactionStyle::kCompactionStyleFIFO) + ? current_time + : meta_.oldest_ancester_time; + + uint64_t num_input_entries = 0; + uint64_t memtable_payload_bytes = 0; + uint64_t memtable_garbage_bytes = 0; + IOStatus io_s; + const std::string* const full_history_ts_low = + (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_; + TableBuilderOptions tboptions( + *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), output_compression_, + mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kFlush, creation_time, oldest_key_time, + current_time, db_id_, db_session_id_, 0 /* target_file_size */, + meta_.fd.GetNumber()); s = BuildTable( - dbname_, db_options_.env, db_options_.fs.get(), *cfd_->ioptions(), - mutable_cf_options_, file_options_, cfd_->table_cache(), iter.get(), - std::move(range_del_iters), &meta_, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), - cfd_->GetName(), existing_snapshots_, + dbname_, versions_, db_options_, tboptions, file_options_, + cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, + &blob_file_additions, existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, - output_compression_, mutable_cf_options_.sample_for_compression, - cfd_->ioptions()->compression_opts, mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), - 
TableFileCreationReason::kFlush, event_logger_, job_context_->job_id, - Env::IO_HIGH, &table_properties_, 0 /* level */, - meta_.oldest_ancester_time, oldest_key_time, write_hint, - current_time); + &io_s, io_tracer_, BlobFileCreationReason::kFlush, event_logger_, + job_context_->job_id, Env::IO_HIGH, &table_properties_, write_hint, + full_history_ts_low, blob_callback_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); + if (!io_s.ok()) { + io_status_ = io_s; + } + if (num_input_entries != total_num_entries && s.ok()) { + std::string msg = "Expected " + ToString(total_num_entries) + + " entries in memtables, but read " + + ToString(num_input_entries); + ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", + cfd_->GetName().c_str(), job_context_->job_id, + msg.c_str()); + if (db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + if (tboptions.reason == TableFileCreationReason::kFlush) { + TEST_SYNC_POINT("DBImpl::FlushJob:Flush"); + RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + memtable_payload_bytes); + RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + memtable_garbage_bytes); + } LogFlush(db_options_.info_log); } ROCKS_LOG_INFO(db_options_.info_log, @@ -397,7 +937,9 @@ meta_.marked_for_compaction ? " (needs compaction)" : ""); if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { - s = output_file_directory_->Fsync(); + s = output_file_directory_->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced)); } TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); db_mutex_->Lock(); @@ -406,7 +948,10 @@ // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. 
- if (s.ok() && meta_.fd.GetFileSize() > 0) { + const bool has_output = meta_.fd.GetFileSize() > 0; + + if (s.ok() && has_output) { + TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated"); // if we have more than 1 background thread, then we cannot // insert files directly into higher levels because some other // threads could be concurrently producing compacted files for @@ -415,9 +960,13 @@ edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, meta_.fd.smallest_seqno, meta_.fd.largest_seqno, - meta_.marked_for_compaction, meta_.oldest_blob_file_number, - meta_.oldest_ancester_time, meta_.file_creation_time, - meta_.file_checksum, meta_.file_checksum_func_name); + meta_.marked_for_compaction, meta_.temperature, + meta_.oldest_blob_file_number, meta_.oldest_ancester_time, + meta_.file_creation_time, meta_.file_checksum, + meta_.file_checksum_func_name, meta_.min_timestamp, + meta_.max_timestamp); + + edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } #ifndef ROCKSDB_LITE // Piggyback FlushJobInfo on the first first flushed memtable. 
@@ -426,14 +975,36 @@ // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = db_options_.env->NowMicros() - start_micros; - stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros; - stats.bytes_written = meta_.fd.GetFileSize(); + const uint64_t micros = clock_->NowMicros() - start_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; + stats.micros = micros; + stats.cpu_micros = cpu_micros; + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Flush lasted %" PRIu64 + " microseconds, and %" PRIu64 " cpu microseconds.\n", + cfd_->GetName().c_str(), job_context_->job_id, micros, + cpu_micros); + + if (has_output) { + stats.bytes_written = meta_.fd.GetFileSize(); + stats.num_output_files = 1; + } + + const auto& blobs = edit_->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); + } + + stats.num_output_files_blob = static_cast(blobs.size()); + RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros); cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats); - cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - meta_.fd.GetFileSize()); + cfd_->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordFlushIOStats(); + return s; } @@ -455,8 +1026,21 @@ info->largest_seqno = meta_.fd.largest_seqno; info->table_properties = table_properties_; info->flush_reason = cfd_->GetFlushReason(); + info->blob_compression_type = mutable_cf_options_.blob_compression_type; + + // Update BlobFilesInfo. 
+ for (const auto& blob_file : edit_->GetBlobFileAdditions()) { + BlobFileAdditionInfo blob_file_addition_info( + BlobFileName(cfd_->ioptions()->cf_paths.front().path, + blob_file.GetBlobFileNumber()) /*blob_file_path*/, + blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(), + blob_file.GetTotalBlobBytes()); + info->blob_file_addition_infos.emplace_back( + std::move(blob_file_addition_info)); + } return info; } + #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,8 +17,8 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" -#include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" #include "db/job_context.h" @@ -60,18 +60,21 @@ // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, - const uint64_t* max_memtable_id, const FileOptions& file_options, - VersionSet* versions, InstrumentedMutex* db_mutex, - std::atomic* shutting_down, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, SnapshotChecker* snapshot_checker, JobContext* job_context, - LogBuffer* log_buffer, Directory* db_directory, - Directory* output_file_directory, CompressionType output_compression, - Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + LogBuffer* log_buffer, FSDirectory* db_directory, + 
FSDirectory* output_file_directory, + CompressionType output_compression, Statistics* stats, + EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, - Env::Priority thread_pri); + Env::Priority thread_pri, const std::shared_ptr& io_tracer, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); ~FlushJob(); @@ -79,7 +82,8 @@ // Once PickMemTable() is called, either Run() or Cancel() has to be called. void PickMemTable(); Status Run(LogsWithPrepTracker* prep_tracker = nullptr, - FileMetaData* file_meta = nullptr); + FileMetaData* file_meta = nullptr, + bool* switched_to_mempurge = nullptr); void Cancel(); const autovector& GetMemTables() const { return mems_; } @@ -89,25 +93,52 @@ } #endif // !ROCKSDB_LITE + // Return the IO status + IOStatus io_status() const { return io_status_; } + private: void ReportStartedFlush(); void ReportFlushInputSize(const autovector& mems); void RecordFlushIOStats(); Status WriteLevel0Table(); + + // Memtable Garbage Collection algorithm: a MemPurge takes the list + // of immutable memtables and filters out (or "purge") the outdated bytes + // out of it. The output (the filtered bytes, or "useful payload") is + // then transfered into a new memtable. If this memtable is filled, then + // the mempurge is aborted and rerouted to a regular flush process. Else, + // depending on the heuristics, placed onto the immutable memtable list. + // The addition to the imm list will not trigger a flush operation. The + // flush of the imm list will instead be triggered once the mutable memtable + // is added to the imm list. + // This process is typically intended for workloads with heavy overwrites + // when we want to avoid SSD writes (and reads) as much as possible. + // "MemPurge" is an experimental feature still at a very early stage + // of development. 
At the moment it is only compatible with the Get, Put, + // Delete operations as well as Iterators and CompactionFilters. + // For this early version, "MemPurge" is called by setting the + // options.experimental_mempurge_threshold value as >0.0. When this is + // the case, ALL automatic flush operations (kWRiteBufferManagerFull) will + // first go through the MemPurge process. Therefore, we strongly + // recommend all users not to set this flag as true given that the MemPurge + // process has not matured yet. + Status MemPurge(); + bool MemPurgeDecider(); #ifndef ROCKSDB_LITE std::unique_ptr GetFlushJobInfo() const; #endif // !ROCKSDB_LITE const std::string& dbname_; + const std::string db_id_; + const std::string db_session_id_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; const MutableCFOptions& mutable_cf_options_; - // Pointer to a variable storing the largest memtable id to flush in this + // A variable storing the largest memtable id to flush in this // flush job. RocksDB uses this variable to select the memtables to flush in // this job. All memtables in this column family with an ID smaller than or - // equal to *max_memtable_id_ will be selected for flush. If null, then all - // memtables in the column family will be selected. - const uint64_t* max_memtable_id_; + // equal to max_memtable_id_ will be selected for flush. 
+ uint64_t max_memtable_id_; const FileOptions file_options_; VersionSet* versions_; InstrumentedMutex* db_mutex_; @@ -117,8 +148,8 @@ SnapshotChecker* snapshot_checker_; JobContext* job_context_; LogBuffer* log_buffer_; - Directory* db_directory_; - Directory* output_file_directory_; + FSDirectory* db_directory_; + FSDirectory* output_file_directory_; CompressionType output_compression_; Statistics* stats_; EventLogger* event_logger_; @@ -153,6 +184,13 @@ Version* base_; bool pick_memtable_called; Env::Priority thread_pri_; + IOStatus io_status_; + + const std::shared_ptr io_tracer_; + SystemClock* clock_; + + const std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_job_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_job_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,22 +3,25 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "db/flush_job.h" + #include #include #include #include -#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" -#include "db/flush_job.h" #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" +#include "rocksdb/file_system.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -26,49 +29,35 @@ // TODO(icanadi) Mock out everything else: // 1. VersionSet // 2. 
Memtable -class FlushJobTest : public testing::Test { - public: - FlushJobTest() +class FlushJobTestBase : public testing::Test { + protected: + FlushJobTestBase(std::string dbname, const Comparator* ucmp) : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("flush_job_test")), + fs_(env_->GetFileSystem()), + dbname_(std::move(dbname)), + ucmp_(ucmp), options_(), db_options_(options_), column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), - mock_table_factory_(new mock::MockTableFactory()) { - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - db_options_.db_paths.emplace_back(dbname_, - std::numeric_limits::max()); - db_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - // TODO(icanadi) Remove this once we mock out VersionSet - NewDB(); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; - for (const auto& cf_name : column_family_names_) { - column_families.emplace_back(cf_name, cf_options_); - } + mock_table_factory_(new mock::MockTableFactory()) {} - db_options_.env = env_; - db_options_.fs = fs_; - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - /*block_cache_tracer=*/nullptr)); - EXPECT_OK(versions_->Recover(column_families, false)); + virtual ~FlushJobTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still in %s\n", dbname_.c_str()); + } else { + // destroy versions_ to release all file handles + versions_.reset(); + EXPECT_OK(DestroyDir(env_, dbname_)); + } } void NewDB() { - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); 
- } + new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -80,6 +69,7 @@ VersionEdit new_cf; new_cf.AddColumnFamily(column_family_names_[i]); new_cf.SetColumnFamily(cf_id++); + new_cf.SetComparatorName(ucmp_->Name()); new_cf.SetLogNumber(0); new_cf.SetNextFile(2); new_cf.SetLastSequence(last_seq++); @@ -87,17 +77,19 @@ } const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, EnvOptions())); + { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); + ASSERT_OK(s); for (const auto& e : new_cfs) { record.clear(); @@ -108,12 +100,42 @@ } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. 
- s = SetCurrentFile(env_, dbname_, 1, nullptr); + s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + } + + void SetUp() override { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + // TODO(icanadi) Remove this once we mock out VersionSet + NewDB(); + + db_options_.env = env_; + db_options_.fs = fs_; + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.statistics = CreateDBStatistics(); + + cf_options_.comparator = ucmp_; + + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + for (const auto& cf_name : column_family_names_) { + column_families.emplace_back(cf_name, cf_options_); + } + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families, false)); } Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; Options options_; ImmutableDBOptions db_options_; @@ -128,19 +150,26 @@ std::shared_ptr mock_table_factory_; }; +class FlushJobTest : public FlushJobTestBase { + public: + FlushJobTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"), + BytewiseComparator()) {} +}; + TEST_F(FlushJobTest, Empty) { JobContext job_context(0); auto cfd = versions_->GetColumnFamilySet()->GetDefault(); EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - nullptr /* memtable_id */, env_options_, versions_.get(), - &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, - kNoCompression, nullptr, &event_logger, false, - true /* sync_output_directory */, - true /* write_manifest */, 
Env::Priority::USER); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, {}, + kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, + nullptr, kNoCompression, nullptr, &event_logger, false, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); { InstrumentedMutexLock l(&mutex_); flush_job.PickMemTable(); @@ -164,25 +193,26 @@ for (int i = 1; i < 10000; ++i) { std::string key(ToString((i + 1000) % 10000)); std::string value("value" + key); - new_mem->Add(SequenceNumber(i), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value, + nullptr /* kv_prot_info */)); if ((i + 1000) % 10000 < 9995) { InternalKey internal_key(key, SequenceNumber(i), kTypeValue); - inserted_keys.insert({internal_key.Encode().ToString(), value}); + inserted_keys.push_back({internal_key.Encode().ToString(), value}); } } { - new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); + ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", + "9999a", nullptr /* kv_prot_info */)); InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); - inserted_keys.insert({internal_key.Encode().ToString(), "9999a"}); + inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"}); } -#ifndef ROCKSDB_LITE // Note: the first two blob references will not be considered when resolving // the oldest blob file referenced (the first one is inlined TTL, while the // second one is TTL and thus points to a TTL blob file). 
- constexpr std::array blob_file_numbers{ - kInvalidBlobFileNumber, 5, 103, 17, 102, 101}; + constexpr std::array blob_file_numbers{{ + kInvalidBlobFileNumber, 5, 103, 17, 102, 101}}; for (size_t i = 0; i < blob_file_numbers.size(); ++i) { std::string key(ToString(i + 10001)); std::string blob_index; @@ -200,13 +230,13 @@ } const SequenceNumber seq(i + 10001); - new_mem->Add(seq, kTypeBlobIndex, key, blob_index); + ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index, + nullptr /* kv_prot_info */)); InternalKey internal_key(key, seq, kTypeBlobIndex); - inserted_keys.emplace_hint(inserted_keys.end(), - internal_key.Encode().ToString(), blob_index); + inserted_keys.push_back({internal_key.Encode().ToString(), blob_index}); } -#endif + mock::SortKVVector(&inserted_keys); autovector to_delete; cfd->imm()->Add(new_mem, &to_delete); @@ -216,14 +246,14 @@ EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - nullptr /* memtable_id */, env_options_, versions_.get(), - &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, - kNoCompression, db_options_.statistics.get(), - &event_logger, true, true /* sync_output_directory */, - true /* write_manifest */, Env::Priority::USER); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, {}, + kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); HistogramData hist; FileMetaData file_meta; @@ -237,12 +267,8 @@ 
ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); ASSERT_EQ("9999a", file_meta.largest.user_key().ToString()); ASSERT_EQ(1, file_meta.fd.smallest_seqno); -#ifndef ROCKSDB_LITE ASSERT_EQ(10006, file_meta.fd.largest_seqno); ASSERT_EQ(17, file_meta.oldest_blob_file_number); -#else - ASSERT_EQ(10000, file_meta.fd.largest_seqno); -#endif mock_table_factory_->AssertSingleFile(inserted_keys); job_context.Clean(); } @@ -266,8 +292,8 @@ for (size_t j = 0; j < num_keys_per_table; ++j) { std::string key(ToString(j + i * num_keys_per_table)); std::string value("value" + key); - mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key, - value); + ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, + key, value, nullptr /* kv_prot_info */)); } } @@ -282,15 +308,14 @@ assert(memtable_ids.size() == num_mems); uint64_t smallest_memtable_id = memtable_ids.front(); uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; - - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - &flush_memtable_id, env_options_, versions_.get(), &mutex_, - &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, - &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, - true /* write_manifest */, Env::Priority::USER); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, + versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); HistogramData hist; FileMetaData file_meta; mutex_.Lock(); @@ -340,7 +365,8 @@ for (size_t 
j = 0; j != num_keys_per_memtable; ++j) { std::string key(ToString(j + i * num_keys_per_memtable)); std::string value("value" + key); - mem->Add(curr_seqno++, kTypeValue, key, value); + ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value, + nullptr /* kv_prot_info */)); } cfd->imm()->Add(mem, &to_delete); @@ -357,12 +383,12 @@ std::vector snapshot_seqs; flush_jobs.emplace_back(new FlushJob( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), - &memtable_ids[k], env_options_, versions_.get(), &mutex_, + memtable_ids[k], env_options_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, false /* sync_output_directory */, false /* write_manifest */, - Env::Priority::USER)); + Env::Priority::USER, nullptr /*IOTracer*/)); k++; } HistogramData hist; @@ -392,10 +418,18 @@ for (auto cfd : all_cfds) { mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); } + autovector>*> + committed_flush_jobs_info; +#ifndef ROCKSDB_LITE + for (auto& job : flush_jobs) { + committed_flush_jobs_info.push_back(job->GetCommittedFlushJobsInfo()); + } +#endif //! 
ROCKSDB_LITE Status s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free, + versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs, + committed_flush_jobs_info, &job_context.memtables_to_free, nullptr /* db_directory */, nullptr /* log_buffer */); ASSERT_OK(s); @@ -448,9 +482,10 @@ std::string key(ToString(i)); int insertions = rnd.Uniform(max_inserts_per_keys); for (int j = 0; j < insertions; ++j) { - std::string value(test::RandomHumanReadableString(&rnd, 10)); + std::string value(rnd.HumanReadableString(10)); auto seqno = ++current_seqno; - new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value, + nullptr /* kv_prot_info */)); // a key is visible only if: // 1. it's the last one written (j == insertions - 1) // 2. there's a snapshot pointing at it @@ -458,10 +493,11 @@ (snapshots_set.find(seqno) != snapshots_set.end()); if (visible) { InternalKey internal_key(key, seqno, kTypeValue); - inserted_keys.insert({internal_key.Encode().ToString(), value}); + inserted_keys.push_back({internal_key.Encode().ToString(), value}); } } } + mock::SortKVVector(&inserted_keys); autovector to_delete; cfd->imm()->Add(new_mem, &to_delete); @@ -471,14 +507,14 @@ EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - nullptr /* memtable_id */, env_options_, versions_.get(), - &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, - kNoCompression, db_options_.statistics.get(), - &event_logger, true, true /* sync_output_directory */, - true /* write_manifest */, Env::Priority::USER); + FlushJob flush_job( + dbname_, 
versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, + env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots, + kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/); mutex_.Lock(); flush_job.PickMemTable(); ASSERT_OK(flush_job.Run()); @@ -490,6 +526,136 @@ job_context.Clean(); } +class FlushJobTimestampTest : public FlushJobTestBase { + public: + FlushJobTimestampTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"), + test::ComparatorWithU64Ts()) {} + + void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts, + SequenceNumber seq, ValueType value_type, + Slice value) { + std::string key_str(std::move(key)); + PutFixed64(&key_str, ts); + ASSERT_OK(memtable->Add(seq, value_type, key_str, value, + nullptr /* kv_prot_info */)); + } + + protected: + static constexpr uint64_t kStartTs = 10; + static constexpr SequenceNumber kStartSeq = 0; + SequenceNumber curr_seq_{kStartSeq}; + std::atomic curr_ts_{kStartTs}; +}; + +TEST_F(FlushJobTimestampTest, AllKeysExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeDeletionWithTimestamp, ""); + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector 
snapshots; + constexpr SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string key = test::EncodeInt(0); + key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp); + ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode()); + } + + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(FlushJobTimestampTest, NoKeyExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector snapshots; + SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + 
std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 0); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string ukey = test::EncodeInt(0); + std::string smallest_key = + ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); + std::string largest_key = ukey + test::EncodeInt(kStartTs); + InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); + InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); + ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(largest.Encode(), fmeta.largest.Encode()); + } + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_scheduler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_scheduler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/flush_scheduler.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/flush_scheduler.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,11 @@ #pragma once -#include #include +#include #include #include + #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,10 +33,11 @@ // iter.Next() class ForwardLevelIterator : public InternalIterator { public: - ForwardLevelIterator(const ColumnFamilyData* const cfd, - const ReadOptions& read_options, - const std::vector& files, - const SliceTransform* prefix_extractor) + ForwardLevelIterator( + const ColumnFamilyData* const cfd, const ReadOptions& read_options, + const std::vector& files, + const std::shared_ptr& prefix_extractor, + bool allow_unprepared_value) : cfd_(cfd), read_options_(read_options), files_(files), @@ -44,7 +45,10 @@ file_index_(std::numeric_limits::max()), file_iter_(nullptr), pinned_iters_mgr_(nullptr), - prefix_extractor_(prefix_extractor) {} + prefix_extractor_(prefix_extractor), + allow_unprepared_value_(allow_unprepared_value) { + status_.PermitUncheckedError(); // Allow uninitialized status through + } ~ForwardLevelIterator() override { // Reset current pointer @@ -82,8 +86,9 @@ prefix_extractor_, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -171,6 +176,16 @@ } return Status::OK(); } + bool PrepareValue() override { + assert(valid_); + if (file_iter_->PrepareValue()) { + return true; + } + + assert(!file_iter_->Valid()); + valid_ = false; + return false; + } bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_->IsKeyPinned(); @@ -196,17 +211,21 @@ Status status_; InternalIterator* file_iter_; 
PinnedIteratorsManager* pinned_iters_mgr_; - const SliceTransform* prefix_extractor_; + // Kept alive by ForwardIterator::sv_->mutable_cf_options + const std::shared_ptr& prefix_extractor_; + const bool allow_unprepared_value_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, ColumnFamilyData* cfd, - SuperVersion* current_sv) + SuperVersion* current_sv, + bool allow_unprepared_value) : db_(db), read_options_(read_options), cfd_(cfd), prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()), user_comparator_(cfd->user_comparator()), + allow_unprepared_value_(allow_unprepared_value), immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), sv_(current_sv), mutable_iter_(nullptr), @@ -222,6 +241,12 @@ if (sv_) { RebuildIterators(false); } + + // immutable_status_ is a local aggregation of the + // status of the immutable Iterators. + // We have to PermitUncheckedError in case it is never + // used, otherwise it will fail ASSERT_STATUS_CHECKED. + immutable_status_.PermitUncheckedError(); } ForwardIterator::~ForwardIterator() { @@ -402,7 +427,7 @@ if (seek_to_first) { l0_iters_[i]->SeekToFirst(); } else { - // If the target key passes over the larget key, we are sure Next() + // If the target key passes over the largest key, we are sure Next() // won't go over this file. 
if (user_comparator_->Compare(target_user_key, l0[i]->largest.user_key()) > 0) { @@ -560,6 +585,22 @@ return immutable_status_; } +bool ForwardIterator::PrepareValue() { + assert(valid_); + if (current_->PrepareValue()) { + return true; + } + + assert(!current_->Valid()); + assert(!current_->status().ok()); + assert(current_ != mutable_iter_); // memtable iterator can't fail + assert(immutable_status_.ok()); + + valid_ = false; + immutable_status_ = current_->status(); + return false; +} + Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) { assert(prop != nullptr); if (prop_name == "rocksdb.iterator.super-version-number") { @@ -629,8 +670,10 @@ sv_->mem->NewRangeTombstoneIterator( read_options_, sv_->current->version_set()->LastSequence())); range_del_agg.AddTombstones(std::move(range_del_iter)); - sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, - &range_del_agg); + // Always return Status::OK(). + Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, + &range_del_agg); + assert(temp_s.ok()); } has_iter_trimmed_for_upper_bound_ = false; @@ -650,14 +693,15 @@ l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - sv_->mutable_cf_options.prefix_extractor.get(), + sv_->mutable_cf_options.prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); } - BuildLevelIterators(vstorage); + BuildLevelIterators(vstorage, sv_); current_ = nullptr; is_prev_set_ = false; @@ -691,8 +735,10 @@ svnew->mem->NewRangeTombstoneIterator( read_options_, sv_->current->version_set()->LastSequence())); range_del_agg.AddTombstones(std::move(range_del_iter)); - svnew->imm->AddRangeTombstoneIterators(read_options_, &arena_, - &range_del_agg); + // Always return Status::OK(). + Status temp_s = svnew->imm->AddRangeTombstoneIterators( + read_options_, &arena_, &range_del_agg); + assert(temp_s.ok()); } const auto* vstorage = sv_->current->storage_info(); @@ -727,12 +773,13 @@ read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files_new[inew], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - svnew->mutable_cf_options.prefix_extractor.get(), + svnew->mutable_cf_options.prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); } for (auto* f : l0_iters_) { @@ -745,7 +792,7 @@ DeleteIterator(l); } level_iters_.clear(); - BuildLevelIterators(vstorage_new); + BuildLevelIterators(vstorage_new, svnew); current_ = nullptr; is_prev_set_ = false; SVCleanup(); @@ -759,7 +806,8 @@ } } -void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage) { +void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, + SuperVersion* sv) { level_iters_.reserve(vstorage->num_levels() - 1); for (int32_t level = 1; level < vstorage->num_levels(); ++level) { const auto& level_files = vstorage->LevelFiles(level); @@ -775,7 +823,7 @@ } else { level_iters_.push_back(new ForwardLevelIterator( cfd_, read_options_, level_files, - sv_->mutable_cf_options.prefix_extractor.get())); + sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_)); } } } @@ -791,12 +839,13 @@ l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files[i], /*range_del_agg=*/nullptr, - sv_->mutable_cf_options.prefix_extractor.get(), + sv_->mutable_cf_options.prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } @@ -945,9 +994,9 @@ 
uint32_t ForwardIterator::FindFileInRange( const std::vector& files, const Slice& internal_key, uint32_t left, uint32_t right) { - auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool { + auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool { return cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), key) < 0; + f->largest.Encode(), k) < 0; }; const auto &b = files.begin(); return static_cast(std::lower_bound(b + left, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include #include -#include "db/dbformat.h" #include "memory/arena.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" @@ -39,8 +38,9 @@ const Comparator* comparator_; }; -typedef std::priority_queue, - MinIterComparator> MinIterHeap; +using MinIterHeap = + std::priority_queue, + MinIterComparator>; /** * ForwardIterator is a special type of iterator that only supports Seek() @@ -52,7 +52,8 @@ class ForwardIterator : public InternalIterator { public: ForwardIterator(DBImpl* db, const ReadOptions& read_options, - ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr); + ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr, + bool allow_unprepared_value = false); virtual ~ForwardIterator(); void SeekForPrev(const Slice& /*target*/) override { @@ -75,6 +76,7 @@ virtual Slice key() const override; virtual Slice value() const override; virtual Status status() const override; + virtual bool PrepareValue() override; virtual Status GetProperty(std::string prop_name, std::string* prop) override; virtual void SetPinnedItersMgr( PinnedIteratorsManager* pinned_iters_mgr) override; @@ -95,7 +97,8 @@ void RebuildIterators(bool refresh_sv); void 
RenewIterators(); - void BuildLevelIterators(const VersionStorageInfo* vstorage); + void BuildLevelIterators(const VersionStorageInfo* vstorage, + SuperVersion* sv); void ResetIncompleteIterators(); void SeekInternal(const Slice& internal_key, bool seek_to_first); void UpdateCurrent(); @@ -120,6 +123,7 @@ ColumnFamilyData* const cfd_; const SliceTransform* const prefix_extractor_; const Comparator* user_comparator_; + const bool allow_unprepared_value_; MinIterHeap immutable_min_heap_; SuperVersion* sv_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/forward_iterator_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -272,7 +272,6 @@ : db_(db), thread_(&StatsThread::run, this) {} void run() { - // using namespace std::chrono; auto tstart = std::chrono::steady_clock::now(), tlast = tstart; uint64_t wlast = 0, rlast = 0; while (!done_.load()) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include "db/version_edit.h" #include "file/file_util.h" #include "file/random_access_file_reader.h" +#include "logging/logging.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" @@ -26,14 +27,14 @@ for (const auto& file_metadata : metadata_) { const auto file_path = file_metadata.db_path + "/" + file_metadata.name; IngestedFileInfo file_to_import; - status = GetIngestedFileInfo(file_path, &file_to_import, sv); 
+ status = + GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv); if (!status.ok()) { return status; } files_to_import_.push_back(file_to_import); } - const auto ucmp = cfd_->internal_comparator().user_comparator(); auto num_files = files_to_import_.size(); if (num_files == 0) { return Status::InvalidArgument("The list of files is empty"); @@ -55,17 +56,18 @@ } } - std::sort(sorted_files.begin(), sorted_files.end(), - [&ucmp](const IngestedFileInfo* info1, - const IngestedFileInfo* info2) { - return sstableKeyCompare(ucmp, info1->smallest_internal_key, - info2->smallest_internal_key) < 0; - }); - - for (size_t i = 0; i < sorted_files.size() - 1; i++) { - if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, - sorted_files[i + 1]->smallest_internal_key) >= - 0) { + std::sort( + sorted_files.begin(), sorted_files.end(), + [this](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { + return cfd_->internal_comparator().Compare( + info1->smallest_internal_key, + info2->smallest_internal_key) < 0; + }); + + for (size_t i = 0; i + 1 < sorted_files.size(); i++) { + if (cfd_->internal_comparator().Compare( + sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= 0) { return Status::InvalidArgument("Files have overlapping ranges"); } } @@ -85,8 +87,6 @@ // Copy/Move external files into DB auto hardlink_files = import_options_.move_files; for (auto& f : files_to_import_) { - f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - const auto path_outside_db = f.external_file_path; const auto path_inside_db = TableFileName( cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); @@ -100,8 +100,8 @@ } } if (!hardlink_files) { - status = CopyFile(fs_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + status = CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, + db_options_.use_fsync, io_tracer_); } if (!status.ok()) { break; @@ -140,7 +140,7 @@ int64_t temp_current_time = 
0; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; uint64_t current_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } @@ -152,9 +152,10 @@ edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, file_metadata.smallest_seqno, - file_metadata.largest_seqno, false, kInvalidBlobFileNumber, - oldest_ancester_time, current_time, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + file_metadata.largest_seqno, false, file_metadata.temperature, + kInvalidBlobFileNumber, oldest_ancester_time, current_time, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); // If incoming sequence number is higher, update local sequence number. if (file_metadata.largest_seqno > versions_->LastSequence()) { @@ -196,8 +197,8 @@ } Status ImportColumnFamilyJob::GetIngestedFileInfo( - const std::string& external_file, IngestedFileInfo* file_to_import, - SuperVersion* sv) { + const std::string& external_file, uint64_t new_file_number, + IngestedFileInfo* file_to_import, SuperVersion* sv) { file_to_import->external_file_path = external_file; // Get external file size @@ -207,6 +208,10 @@ return status; } + // Assign FD with number + file_to_import->fd = + FileDescriptor(new_file_number, 0, file_to_import->file_size); + // Create TableReader for external file std::unique_ptr table_reader; std::unique_ptr sst_file; @@ -217,13 +222,18 @@ if (!status.ok()) { return status; } - sst_file_reader.reset( - new RandomAccessFileReader(std::move(sst_file), external_file)); + sst_file_reader.reset(new RandomAccessFileReader( + std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); status = cfd_->ioptions()->table_factory->NewTableReader( - TableReaderOptions(*cfd_->ioptions(), - 
sv->mutable_cf_options.prefix_extractor.get(), - env_options_, cfd_->internal_comparator()), + TableReaderOptions( + *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, + env_options_, cfd_->internal_comparator(), + /*skip_filters*/ false, /*immortal*/ false, + /*force_direct_prefetch*/ false, /*level*/ -1, + /*block_cache_tracer*/ nullptr, + /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(), + /*cur_file_num*/ new_file_number), std::move(sst_file_reader), file_to_import->file_size, &table_reader); if (!status.ok()) { return status; @@ -252,15 +262,21 @@ // Get first (smallest) key from file iter->SeekToFirst(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. ", + pik_status.getState()); } file_to_import->smallest_internal_key.SetFrom(key); // Get last (largest) key from file iter->SeekToLast(); - if (!ParseInternalKey(iter->key(), &key)) { - return Status::Corruption("external file have corrupted keys"); + pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. 
", + pik_status.getState()); } file_to_import->largest_internal_key.SetFrom(key); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_job.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_job.h 2025-05-19 16:14:27.000000000 +0000 @@ -4,35 +4,37 @@ #include #include "db/column_family.h" -#include "db/dbformat.h" #include "db/external_sst_file_ingestion_job.h" #include "db/snapshot_impl.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/metadata.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +struct EnvOptions; +class SystemClock; // Imports a set of sst files as is into a new column family. Logic is similar // to ExternalSstFileIngestionJob. class ImportColumnFamilyJob { public: - ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd, + ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, const ImportColumnFamilyOptions& import_options, - const std::vector& metadata) - : env_(env), + const std::vector& metadata, + const std::shared_ptr& io_tracer) + : clock_(db_options.clock), versions_(versions), cfd_(cfd), db_options_(db_options), - fs_(db_options_.fs.get()), + fs_(db_options_.fs, io_tracer), env_options_(env_options), import_options_(import_options), - metadata_(metadata) {} + metadata_(metadata), + io_tracer_(io_tracer) {} // Prepare the job by copying external files into the DB. Status Prepare(uint64_t next_file_number, SuperVersion* sv); @@ -54,19 +56,21 @@ // Open the external file and populate `file_to_import` with all the // external information we need to import this file. 
Status GetIngestedFileInfo(const std::string& external_file, + uint64_t new_file_number, IngestedFileInfo* file_to_import, SuperVersion* sv); - Env* env_; + SystemClock* clock_; VersionSet* versions_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; - FileSystem* fs_; + const FileSystemPtr fs_; const EnvOptions& env_options_; autovector files_to_import_; VersionEdit edit_; const ImportColumnFamilyOptions& import_options_; std::vector metadata_; + const std::shared_ptr io_tracer_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/import_column_family_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/import_column_family_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -1,20 +1,23 @@ #ifndef ROCKSDB_LITE #include + #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" #include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { class ImportColumnFamilyTest : public DBTestBase { public: - ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") { + ImportColumnFamilyTest() + : DBTestBase("import_column_family_test", /*env_do_fsync=*/true) { sst_files_dir_ = dbname_ + "/sst_files/"; + export_files_dir_ = test::PerThreadDBPath(env_, "export"); DestroyAndRecreateExternalSSTFilesDir(); - export_files_dir_ = test::TmpDir(env_) + "/export"; import_cfh_ = nullptr; import_cfh2_ = nullptr; metadata_ptr_ = nullptr; @@ -22,27 +25,27 @@ ~ImportColumnFamilyTest() { if (import_cfh_) { - db_->DropColumnFamily(import_cfh_); - db_->DestroyColumnFamilyHandle(import_cfh_); + EXPECT_OK(db_->DropColumnFamily(import_cfh_)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); import_cfh_ = nullptr; } if (import_cfh2_) { - 
db_->DropColumnFamily(import_cfh2_); - db_->DestroyColumnFamilyHandle(import_cfh2_); + EXPECT_OK(db_->DropColumnFamily(import_cfh2_)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh2_)); import_cfh2_ = nullptr; } if (metadata_ptr_) { delete metadata_ptr_; metadata_ptr_ = nullptr; } - test::DestroyDir(env_, sst_files_dir_); - test::DestroyDir(env_, export_files_dir_); + EXPECT_OK(DestroyDir(env_, sst_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir_)); } void DestroyAndRecreateExternalSSTFilesDir() { - test::DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); - test::DestroyDir(env_, export_files_dir_); + EXPECT_OK(DestroyDir(env_, sst_files_dir_)); + EXPECT_OK(env_->CreateDir(sst_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir_)); } LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path, @@ -101,9 +104,9 @@ ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value)); ASSERT_EQ(value, "V2"); ASSERT_OK(db_->DropColumnFamily(import_cfh_)); ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); @@ -122,9 +125,9 @@ ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value)); ASSERT_EQ(value, "V2"); } } @@ -140,7 +143,7 @@ const std::string file3_sst = sst_files_dir_ + file3_sst_name; ASSERT_OK(sfw_cf1.Open(file3_sst)); for (int i = 0; i < 100; ++i) { - sfw_cf1.Put(Key(i), Key(i) + "_val"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_val")); } ASSERT_OK(sfw_cf1.Finish()); @@ -149,7 +152,7 @@ const std::string 
file2_sst = sst_files_dir_ + file2_sst_name; ASSERT_OK(sfw_cf1.Open(file2_sst)); for (int i = 0; i < 100; i += 2) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite1")); } ASSERT_OK(sfw_cf1.Finish()); @@ -158,7 +161,7 @@ const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; ASSERT_OK(sfw_cf1.Open(file1a_sst)); for (int i = 0; i < 52; i += 4) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2")); } ASSERT_OK(sfw_cf1.Finish()); @@ -167,7 +170,7 @@ const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; ASSERT_OK(sfw_cf1.Open(file1b_sst)); for (int i = 52; i < 100; i += 4) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2")); } ASSERT_OK(sfw_cf1.Finish()); @@ -176,7 +179,7 @@ const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; ASSERT_OK(sfw_cf1.Open(file0a_sst)); for (int i = 0; i < 100; i += 16) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite3")); } ASSERT_OK(sfw_cf1.Finish()); @@ -185,7 +188,7 @@ const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; ASSERT_OK(sfw_cf1.Open(file0b_sst)); for (int i = 0; i < 100; i += 16) { - sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"); + ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite4")); } ASSERT_OK(sfw_cf1.Finish()); @@ -211,7 +214,7 @@ for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 16 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite4"); } else if (i % 4 == 0) { @@ -232,7 +235,7 @@ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + 
"_overwrite5"); } else if (i % 16 == 0) { @@ -251,7 +254,7 @@ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite5"); } else if (i % 16 == 0) { @@ -271,7 +274,7 @@ CreateAndReopenWithCF({"koko"}, options); for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_val"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_val")); } ASSERT_OK(Flush(1)); @@ -280,13 +283,13 @@ // Overwrite the value in the same set of keys. for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_overwrite"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite")); } // Flush to create L0 file. ASSERT_OK(Flush(1)); for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2")); } // Flush again to create another L0 file. It should have higher sequencer. 
@@ -315,12 +318,12 @@ std::string value1, value2; for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Get(1, Key(i)), value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -337,16 +340,16 @@ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -360,16 +363,16 @@ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } } @@ -379,7 +382,7 @@ CreateAndReopenWithCF({"koko"}, options); for (int i = 0; i < 100; ++i) { - Put(1, Key(i), Key(i) + "_val"); + ASSERT_OK(Put(1, Key(i), Key(i) + 
"_val")); } ASSERT_OK(Flush(1)); @@ -389,14 +392,14 @@ // Overwrite the value in the same set of keys. for (int i = 0; i < 50; ++i) { - Put(1, Key(i), Key(i) + "_overwrite"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite")); } // Flush to create L0 file. ASSERT_OK(Flush(1)); for (int i = 0; i < 25; ++i) { - Put(1, Key(i), Key(i) + "_overwrite2"); + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2")); } // Flush again to create another L0 file. It should have higher sequencer. @@ -411,7 +414,7 @@ // Create a new db and import the files. DB* db_copy; - test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); ColumnFamilyHandle* cfh = nullptr; ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", @@ -421,13 +424,75 @@ for (int i = 0; i < 100; ++i) { std::string value; - db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value)); ASSERT_EQ(Get(1, Key(i)), value); } - db_copy->DropColumnFamily(cfh); - db_copy->DestroyColumnFamilyHandle(cfh); + ASSERT_OK(db_copy->DropColumnFamily(cfh)); + ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh)); + delete db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); +} + +TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) { + // Imports a column family containing a level where two files overlap at their + // endpoints. "Overlap" means the largest user key in one file is the same as + // the smallest user key in the second file. + const int kFileBytes = 128 << 10; // 128KB + const int kValueBytes = 1 << 10; // 1KB + const int kNumFiles = 4; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 2; + CreateAndReopenWithCF({"koko"}, options); + + Random rnd(301); + // Every key is snapshot protected to ensure older versions will not be + // dropped during compaction. 
+ std::vector snapshots; + snapshots.reserve(kFileBytes / kValueBytes * kNumFiles); + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = rnd.RandomString(kValueBytes); + ASSERT_OK(Put(1, "key", value)); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Flush(1)); + } + + // Compact to create overlapping L1 files. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + // Create a new db and import the files. + DB* db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + + { + std::string value; + ASSERT_OK(db_copy->Get(ReadOptions(), cfh, "key", &value)); + } + ASSERT_OK(db_copy->DropColumnFamily(cfh)); + ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh)); delete db_copy; - test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); + for (const Snapshot* snapshot : snapshots) { + db_->ReleaseSnapshot(snapshot); + } } TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,14 +12,21 @@ #include #include +#include #include +#include #include 
#include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_entry_stats.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" -#include "table/block_based/block_based_table_factory.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "table/block_based/cachable_entry.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -49,6 +56,27 @@ {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}}, {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}}, {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}}, + {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}}, + {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}}, +}; + +const std::map + InternalStats::db_stats_type_to_info = { + {InternalStats::kIntStatsWalFileBytes, + DBStatInfo{"db.wal_bytes_written"}}, + {InternalStats::kIntStatsWalFileSynced, DBStatInfo{"db.wal_syncs"}}, + {InternalStats::kIntStatsBytesWritten, + DBStatInfo{"db.user_bytes_written"}}, + {InternalStats::kIntStatsNumKeysWritten, + DBStatInfo{"db.user_keys_written"}}, + {InternalStats::kIntStatsWriteDoneByOther, + DBStatInfo{"db.user_writes_by_other"}}, + {InternalStats::kIntStatsWriteDoneBySelf, + DBStatInfo{"db.user_writes_by_self"}}, + {InternalStats::kIntStatsWriteWithWal, + DBStatInfo{"db.user_writes_with_wal"}}, + {InternalStats::kIntStatsWriteStallMicros, + DBStatInfo{"db.user_write_stall_micros"}}, }; namespace { @@ -60,12 +88,14 @@ const std::string& group_by) { int written_size = snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str()); + written_size = std::min(written_size, static_cast(len)); auto hdr = [](LevelStatType t) { return InternalStats::compaction_level_stats.at(t).header_name.c_str(); }; int line_size = snprintf( buf + written_size, len - written_size, - "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s " + "%s\n", // Note that we 
skip COMPACTED_FILES and merge it with Files column group_by.c_str(), hdr(LevelStatType::NUM_FILES), hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), @@ -76,9 +106,11 @@ hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), - hdr(LevelStatType::KEY_DROP)); + hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB), + hdr(LevelStatType::W_BLOB_GB)); written_size += line_size; + written_size = std::min(written_size, static_cast(len)); snprintf(buf + written_size, len - written_size, "%s\n", std::string(line_size, '-').c_str()); } @@ -87,10 +119,12 @@ int num_files, int being_compacted, double total_file_size, double score, double w_amp, const InternalStats::CompactionStats& stats) { - uint64_t bytes_read = - stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; - double elapsed = (stats.micros + 1) / kMicrosInSec; + const uint64_t bytes_read = stats.bytes_read_non_output_levels + + stats.bytes_read_output_level + + stats.bytes_read_blob; + const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; + const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; + const double elapsed = (stats.micros + 1) / kMicrosInSec; (*level_stats)[LevelStatType::NUM_FILES] = num_files; (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted; @@ -105,8 +139,7 @@ (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB; (*level_stats)[LevelStatType::WRITE_AMP] = w_amp; (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed; - (*level_stats)[LevelStatType::WRITE_MBPS] = - stats.bytes_written / kMB / elapsed; + (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed; (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec; 
(*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec; (*level_stats)[LevelStatType::COMP_COUNT] = stats.count; @@ -116,6 +149,8 @@ static_cast(stats.num_input_records); (*level_stats)[LevelStatType::KEY_DROP] = static_cast(stats.num_dropped_records); + (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB; + (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB; } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -140,7 +175,9 @@ "%9d " /* Comp(cnt) */ "%8.3f " /* Avg(sec) */ "%7s " /* KeyIn */ - "%6s\n", /* KeyDrop */ + "%6s " /* KeyDrop */ + "%9.1f " /* Rblob(GB) */ + "%9.1f\n", /* Wblob(GB) */ name.c_str(), static_cast(stat_value.at(LevelStatType::NUM_FILES)), static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), BytesToHumanString( @@ -165,7 +202,9 @@ .c_str(), NumberToHumanString( static_cast(stat_value.at(LevelStatType::KEY_DROP))) - .c_str()); + .c_str(), + stat_value.at(LevelStatType::R_BLOB_GB), + stat_value.at(LevelStatType::W_BLOB_GB)); } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -206,6 +245,7 @@ static const std::string cf_file_histogram = "cf-file-histogram"; static const std::string dbstats = "dbstats"; static const std::string levelstats = "levelstats"; +static const std::string block_cache_entry_stats = "block-cache-entry-stats"; static const std::string num_immutable_mem_table = "num-immutable-mem-table"; static const std::string num_immutable_mem_table_flushed = "num-immutable-mem-table-flushed"; @@ -242,6 +282,8 @@ static const std::string base_level_str = "base-level"; static const std::string total_sst_files_size = "total-sst-files-size"; static const std::string live_sst_files_size = "live-sst-files-size"; +static const std::string live_sst_files_size_at_temperature = + "live-sst-files-size-at-temperature"; static const std::string estimate_pending_comp_bytes = "estimate-pending-compaction-bytes"; static const std::string 
aggregated_table_properties = @@ -258,6 +300,10 @@ static const std::string block_cache_usage = "block-cache-usage"; static const std::string block_cache_pinned_usage = "block-cache-pinned-usage"; static const std::string options_statistics = "options-statistics"; +static const std::string num_blob_files = "num-blob-files"; +static const std::string blob_stats = "blob-stats"; +static const std::string total_blob_file_size = "total-blob-file-size"; +static const std::string live_blob_file_size = "live-blob-file-size"; const std::string DB::Properties::kNumFilesAtLevelPrefix = rocksdb_prefix + num_files_at_level_prefix; @@ -272,6 +318,8 @@ rocksdb_prefix + cf_file_histogram; const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; +const std::string DB::Properties::kBlockCacheEntryStats = + rocksdb_prefix + block_cache_entry_stats; const std::string DB::Properties::kNumImmutableMemTable = rocksdb_prefix + num_immutable_mem_table; const std::string DB::Properties::kNumImmutableMemTableFlushed = @@ -347,6 +395,15 @@ rocksdb_prefix + block_cache_pinned_usage; const std::string DB::Properties::kOptionsStatistics = rocksdb_prefix + options_statistics; +const std::string DB::Properties::kLiveSstFilesSizeAtTemperature = + rocksdb_prefix + live_sst_files_size_at_temperature; +const std::string DB::Properties::kNumBlobFiles = + rocksdb_prefix + num_blob_files; +const std::string DB::Properties::kBlobStats = rocksdb_prefix + blob_stats; +const std::string DB::Properties::kTotalBlobFileSize = + rocksdb_prefix + total_blob_file_size; +const std::string DB::Properties::kLiveBlobFileSize = + rocksdb_prefix + live_blob_file_size; const std::unordered_map InternalStats::ppt_name_to_info = { @@ -370,15 +427,20 @@ {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr, nullptr}}, {DB::Properties::kDBStats, - {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}}, + {false, 
&InternalStats::HandleDBStats, nullptr, + &InternalStats::HandleDBMapStats, nullptr}}, + {DB::Properties::kBlockCacheEntryStats, + {true, &InternalStats::HandleBlockCacheEntryStats, nullptr, + &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, {DB::Properties::kSSTables, {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}}, {DB::Properties::kAggregatedTableProperties, {false, &InternalStats::HandleAggregatedTableProperties, nullptr, - nullptr, nullptr}}, + &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}}, {DB::Properties::kAggregatedTablePropertiesAtLevel, {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, - nullptr, nullptr, nullptr}}, + nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap, + nullptr}}, {DB::Properties::kNumImmutableMemTable, {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr, nullptr}}, @@ -456,6 +518,9 @@ {DB::Properties::kLiveSstFilesSize, {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr, nullptr}}, + {DB::Properties::kLiveSstFilesSizeAtTemperature, + {true, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr, + nullptr, nullptr}}, {DB::Properties::kEstimatePendingCompactionBytes, {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes, nullptr, nullptr}}, @@ -484,10 +549,253 @@ {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr, nullptr}}, {DB::Properties::kOptionsStatistics, - {false, nullptr, nullptr, nullptr, + {true, nullptr, nullptr, nullptr, &DBImpl::GetPropertyHandleOptionsStatistics}}, + {DB::Properties::kNumBlobFiles, + {false, nullptr, &InternalStats::HandleNumBlobFiles, nullptr, + nullptr}}, + {DB::Properties::kBlobStats, + {false, &InternalStats::HandleBlobStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kTotalBlobFileSize, + {false, nullptr, &InternalStats::HandleTotalBlobFileSize, nullptr, + nullptr}}, + {DB::Properties::kLiveBlobFileSize, + {false, nullptr, 
&InternalStats::HandleLiveBlobFileSize, nullptr, + nullptr}}, }; +InternalStats::InternalStats(int num_levels, SystemClock* clock, + ColumnFamilyData* cfd) + : db_stats_{}, + cf_stats_value_{}, + cf_stats_count_{}, + comp_stats_(num_levels), + comp_stats_by_pri_(Env::Priority::TOTAL), + file_read_latency_(num_levels), + bg_error_count_(0), + number_levels_(num_levels), + clock_(clock), + cfd_(cfd), + started_at_(clock->NowMicros()) { + Cache* block_cache = nullptr; + bool ok = GetBlockCacheForStats(&block_cache); + if (ok) { + assert(block_cache); + // Extract or create stats collector. Could fail in rare cases. + Status s = CacheEntryStatsCollector::GetShared( + block_cache, clock_, &cache_entry_stats_collector_); + if (s.ok()) { + assert(cache_entry_stats_collector_); + } else { + assert(!cache_entry_stats_collector_); + } + } else { + assert(!block_cache); + } +} + +void InternalStats::TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, + bool foreground) { + CollectCacheEntryStats(foreground); + if (cache_entry_stats_collector_) { + cache_entry_stats_collector_->GetStats(stats); + } +} + +void InternalStats::CollectCacheEntryStats(bool foreground) { + // This function is safe to call from any thread because + // cache_entry_stats_collector_ field is const after constructor + // and ->GetStats does its own synchronization, which also suffices for + // cache_entry_stats_. + + if (!cache_entry_stats_collector_) { + return; // nothing to do (e.g. no block cache) + } + + // For "background" collections, strictly cap the collection time by + // expanding effective cache TTL. For foreground, be more aggressive about + // getting latest data. + int min_interval_seconds = foreground ? 10 : 180; + // 1/500 = max of 0.2% of one CPU thread + int min_interval_factor = foreground ? 
10 : 500; + cache_entry_stats_collector_->CollectStats(min_interval_seconds, + min_interval_factor); +} + +std::function +InternalStats::CacheEntryRoleStats::GetEntryCallback() { + return [&](const Slice& /*key*/, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + auto e = role_map_.find(deleter); + size_t role_idx; + if (e == role_map_.end()) { + role_idx = static_cast(CacheEntryRole::kMisc); + } else { + role_idx = static_cast(e->second); + } + entry_counts[role_idx]++; + total_charges[role_idx] += charge; + }; +} + +void InternalStats::CacheEntryRoleStats::BeginCollection( + Cache* cache, SystemClock*, uint64_t start_time_micros) { + Clear(); + last_start_time_micros_ = start_time_micros; + ++collection_count; + role_map_ = CopyCacheDeleterRoleMap(); + std::ostringstream str; + str << cache->Name() << "@" << static_cast(cache) << "#" + << port::GetProcessID(); + cache_id = str.str(); + cache_capacity = cache->GetCapacity(); +} + +void InternalStats::CacheEntryRoleStats::EndCollection( + Cache*, SystemClock*, uint64_t end_time_micros) { + last_end_time_micros_ = end_time_micros; +} + +void InternalStats::CacheEntryRoleStats::SkippedCollection() { + ++copies_of_last_collection; +} + +uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const { + if (last_end_time_micros_ > last_start_time_micros_) { + return last_end_time_micros_ - last_start_time_micros_; + } else { + return 0U; + } +} + +std::string InternalStats::CacheEntryRoleStats::ToString( + SystemClock* clock) const { + std::ostringstream str; + str << "Block cache " << cache_id + << " capacity: " << BytesToHumanString(cache_capacity) + << " collections: " << collection_count + << " last_copies: " << copies_of_last_collection + << " last_secs: " << (GetLastDurationMicros() / 1000000.0) + << " secs_since: " + << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n"; + str << "Block cache entry stats(count,size,portion):"; + for (size_t i = 0; i < kNumCacheEntryRoles; 
++i) { + if (entry_counts[i] > 0) { + str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i] + << "," << BytesToHumanString(total_charges[i]) << "," + << (100.0 * total_charges[i] / cache_capacity) << "%)"; + } + } + str << "\n"; + return str.str(); +} + +void InternalStats::CacheEntryRoleStats::ToMap( + std::map* values, SystemClock* clock) const { + values->clear(); + auto& v = *values; + v["id"] = cache_id; + v["capacity"] = ROCKSDB_NAMESPACE::ToString(cache_capacity); + v["secs_for_last_collection"] = + ROCKSDB_NAMESPACE::ToString(GetLastDurationMicros() / 1000000.0); + v["secs_since_last_collection"] = ROCKSDB_NAMESPACE::ToString( + (clock->NowMicros() - last_end_time_micros_) / 1000000U); + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + std::string role = kCacheEntryRoleToHyphenString[i]; + v["count." + role] = ROCKSDB_NAMESPACE::ToString(entry_counts[i]); + v["bytes." + role] = ROCKSDB_NAMESPACE::ToString(total_charges[i]); + v["percent." + role] = + ROCKSDB_NAMESPACE::ToString(100.0 * total_charges[i] / cache_capacity); + } +} + +bool InternalStats::HandleBlockCacheEntryStats(std::string* value, + Slice /*suffix*/) { + if (!cache_entry_stats_collector_) { + return false; + } + CollectCacheEntryStats(/*foreground*/ true); + CacheEntryRoleStats stats; + cache_entry_stats_collector_->GetStats(&stats); + *value = stats.ToString(clock_); + return true; +} + +bool InternalStats::HandleBlockCacheEntryStatsMap( + std::map* values, Slice /*suffix*/) { + if (!cache_entry_stats_collector_) { + return false; + } + CollectCacheEntryStats(/*foreground*/ true); + CacheEntryRoleStats stats; + cache_entry_stats_collector_->GetStats(&stats); + stats.ToMap(values, clock_); + return true; +} + +bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value, + Slice suffix) { + uint64_t temperature; + bool ok = ConsumeDecimalNumber(&suffix, &temperature) && suffix.empty(); + if (!ok) { + return false; + } + + uint64_t size = 0; + const auto* 
vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + for (const auto& file_meta : vstorage->LevelFiles(level)) { + if (static_cast(file_meta->temperature) == temperature) { + size += file_meta->fd.GetFileSize(); + } + } + } + + *value = ToString(size); + return true; +} + +bool InternalStats::HandleNumBlobFiles(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + const auto* vstorage = cfd_->current()->storage_info(); + const auto& blob_files = vstorage->GetBlobFiles(); + *value = blob_files.size(); + return true; +} + +bool InternalStats::HandleBlobStats(std::string* value, Slice /*suffix*/) { + std::ostringstream oss; + auto* current_version = cfd_->current(); + const auto& blob_files = current_version->storage_info()->GetBlobFiles(); + uint64_t current_num_blob_files = blob_files.size(); + uint64_t current_file_size = 0; + uint64_t current_garbage_size = 0; + for (const auto& pair : blob_files) { + const auto& meta = pair.second; + current_file_size += meta->GetBlobFileSize(); + current_garbage_size += meta->GetGarbageBlobBytes(); + } + oss << "Number of blob files: " << current_num_blob_files + << "\nTotal size of blob files: " << current_file_size + << "\nTotal size of garbage in blob files: " << current_garbage_size + << '\n'; + value->append(oss.str()); + return true; +} + +bool InternalStats::HandleTotalBlobFileSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetTotalBlobFileSize(); + return true; +} + +bool InternalStats::HandleLiveBlobFileSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->GetTotalBlobFileSize(); + return true; +} + const DBPropertyInfo* GetPropertyInfo(const Slice& property) { std::string ppt_name = GetPropertyNameAndArg(property).first.ToString(); auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name); @@ -507,11 +815,12 @@ } bool 
InternalStats::GetMapProperty(const DBPropertyInfo& property_info, - const Slice& /*property*/, + const Slice& property, std::map* value) { assert(value != nullptr); assert(property_info.handle_map != nullptr); - return (this->*(property_info.handle_map))(value); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_map))(value, arg); } bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info, @@ -587,7 +896,7 @@ } bool InternalStats::HandleCFMapStats( - std::map* cf_stats) { + std::map* cf_stats, Slice /*suffix*/) { DumpCFMapStats(cf_stats); return true; } @@ -609,6 +918,12 @@ return true; } +bool InternalStats::HandleDBMapStats( + std::map* db_stats, Slice /*suffix*/) { + DumpDBMapStats(db_stats); + return true; +} + bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) { DumpDBStats(value); return true; @@ -631,7 +946,27 @@ return true; } -bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, +static std::map MapUint64ValuesToString( + const std::map& from) { + std::map to; + for (const auto& e : from) { + to[e.first] = ToString(e.second); + } + return to; +} + +bool InternalStats::HandleAggregatedTablePropertiesMap( + std::map* values, Slice /*suffix*/) { + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, Slice suffix) { uint64_t level; bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); @@ -644,7 +979,24 @@ if (!s.ok()) { return false; } - *value = tp->ToString(); + *values = tp->ToString(); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || 
static_cast(level) >= number_levels_) { + return false; + } + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties( + &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); return true; } @@ -698,21 +1050,24 @@ bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable - *value = cfd_->mem()->ApproximateMemoryUsage(); + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast(); return true; } bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable + immutable memtables - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); return true; } bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateMemoryUsage(); return true; } @@ -798,7 +1153,7 @@ bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, Version* /*version*/) { - *value = db->IsFileDeletionsEnabled(); + *value = db->IsFileDeletionsEnabled() ? 
1 : 0; return true; } @@ -903,29 +1258,19 @@ return *value > 0 && *value < std::numeric_limits::max(); } -bool InternalStats::HandleBlockCacheStat(Cache** block_cache) { +bool InternalStats::GetBlockCacheForStats(Cache** block_cache) { assert(block_cache != nullptr); - auto* table_factory = cfd_->ioptions()->table_factory; + auto* table_factory = cfd_->ioptions()->table_factory.get(); assert(table_factory != nullptr); - if (BlockBasedTableFactory::kName != table_factory->Name()) { - return false; - } - auto* table_options = - reinterpret_cast(table_factory->GetOptions()); - if (table_options == nullptr) { - return false; - } - *block_cache = table_options->block_cache.get(); - if (table_options->no_block_cache || *block_cache == nullptr) { - return false; - } - return true; + *block_cache = + table_factory->GetOptions(TableFactory::kBlockCacheOpts()); + return *block_cache != nullptr; } bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { Cache* block_cache; - bool ok = HandleBlockCacheStat(&block_cache); + bool ok = GetBlockCacheForStats(&block_cache); if (!ok) { return false; } @@ -936,7 +1281,7 @@ bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { Cache* block_cache; - bool ok = HandleBlockCacheStat(&block_cache); + bool ok = GetBlockCacheForStats(&block_cache); if (!ok) { return false; } @@ -947,7 +1292,7 @@ bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { Cache* block_cache; - bool ok = HandleBlockCacheStat(&block_cache); + bool ok = GetBlockCacheForStats(&block_cache); if (!ok) { return false; } @@ -955,10 +1300,21 @@ return true; } +void InternalStats::DumpDBMapStats( + std::map* db_stats) { + for (int i = 0; i < static_cast(kIntStatsNumMax); ++i) { + InternalDBStatsType type = static_cast(i); + (*db_stats)[db_stats_type_to_info.at(type).property_name] = + std::to_string(GetDBStats(type)); + } + double 
seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec; + (*db_stats)["db.uptime"] = std::to_string(seconds_up); +} + void InternalStats::DumpDBStats(std::string* value) { char buf[1000]; // DB-level stats, only available from default column family - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec; double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n", @@ -995,8 +1351,10 @@ NumberToHumanString(write_other + write_self).c_str(), NumberToHumanString(num_keys_written).c_str(), NumberToHumanString(write_self).c_str(), - (write_other + write_self) / static_cast(write_self + 1), - user_bytes_written / kGB, user_bytes_written / kMB / seconds_up); + (write_other + write_self) / + std::max(1.0, static_cast(write_self)), + user_bytes_written / kGB, + user_bytes_written / kMB / std::max(seconds_up, 0.001)); value->append(buf); // WAL snprintf(buf, sizeof(buf), @@ -1004,8 +1362,8 @@ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", NumberToHumanString(write_with_wal).c_str(), NumberToHumanString(wal_synced).c_str(), - write_with_wal / static_cast(wal_synced + 1), - wal_bytes / kGB, wal_bytes / kMB / seconds_up); + write_with_wal / std::max(1.0, static_cast(wal_synced)), + wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001)); value->append(buf); // Stall AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true); @@ -1028,7 +1386,7 @@ NumberToHumanString(interval_num_keys_written).c_str(), NumberToHumanString(interval_write_self).c_str(), static_cast(interval_write_other + interval_write_self) / - (interval_write_self + 1), + std::max(1.0, static_cast(interval_write_self)), (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB, (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB / std::max(interval_seconds_up, 0.001)), 
@@ -1039,15 +1397,15 @@ uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced; uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes; - snprintf( - buf, sizeof(buf), - "Interval WAL: %s writes, %s syncs, " - "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n", - NumberToHumanString(interval_write_with_wal).c_str(), - NumberToHumanString(interval_wal_synced).c_str(), - interval_write_with_wal / static_cast(interval_wal_synced + 1), - interval_wal_bytes / kGB, - interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); + snprintf(buf, sizeof(buf), + "Interval WAL: %s writes, %s syncs, " + "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", + NumberToHumanString(interval_write_with_wal).c_str(), + NumberToHumanString(interval_wal_synced).c_str(), + interval_write_with_wal / + std::max(1.0, static_cast(interval_wal_synced)), + interval_wal_bytes / kGB, + interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); value->append(buf); // Stall @@ -1080,9 +1438,10 @@ */ void InternalStats::DumpCFMapStats( std::map* cf_stats) { + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); CompactionStats compaction_stats_sum; std::map> levels_stats; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (auto const& level_ent : levels_stats) { auto level_str = level_ent.first == -1 ? 
"Sum" : "L" + ToString(level_ent.first); @@ -1099,9 +1458,10 @@ } void InternalStats::DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* levels_stats, CompactionStats* compaction_stats_sum) { - const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); + assert(vstorage); int num_levels_to_check = (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) @@ -1142,12 +1502,14 @@ if (level == 0) { input_bytes = curr_ingest; } else { - input_bytes = comp_stats_[level].bytes_read_non_output_levels; + input_bytes = comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; } double w_amp = (input_bytes == 0) ? 0.0 - : static_cast(comp_stats_[level].bytes_written) / + : static_cast(comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob) / input_bytes; std::map level_stats; PrepareLevelStats(&level_stats, files, files_being_compacted[level], @@ -1157,8 +1519,11 @@ } } // Cumulative summary - double w_amp = compaction_stats_sum->bytes_written / - static_cast(curr_ingest + 1); + double w_amp = (0 == curr_ingest) + ? 
0.0 + : (compaction_stats_sum->bytes_written + + compaction_stats_sum->bytes_written_blob) / + static_cast(curr_ingest); // Stats summary across levels std::map sum_stats; PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted, @@ -1224,9 +1589,10 @@ value->append(buf); // Print stats for each level + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); std::map> levels_stats; CompactionStats compaction_stats_sum; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (int l = 0; l < number_levels_; ++l) { if (levels_stats.find(l) != levels_stats.end()) { PrintLevelStats(buf, sizeof(buf), "L" + ToString(l), levels_stats[l]); @@ -1262,7 +1628,8 @@ CompactionStats interval_stats(compaction_stats_sum); interval_stats.Subtract(cf_stats_snapshot_.comp_stats); double w_amp = - interval_stats.bytes_written / static_cast(interval_ingest); + (interval_stats.bytes_written + interval_stats.bytes_written_blob) / + static_cast(interval_ingest); PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats); value->append(buf); @@ -1281,7 +1648,14 @@ } } - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + snprintf(buf, sizeof(buf), + "\nBlob file count: %" ROCKSDB_PRIszt ", total size: %.1f GB\n\n", + vstorage->GetBlobFiles().size(), + vstorage->GetTotalBlobFileSize() / kGB); + value->append(buf); + + uint64_t now_micros = clock_->NowMicros(); + double seconds_up = (now_micros - started_at_) / kMicrosInSec; double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", seconds_up, interval_seconds_up); @@ -1321,16 +1695,20 @@ uint64_t compact_micros = 0; for (int level = 0; level < number_levels_; level++) { compact_bytes_read += comp_stats_[level].bytes_read_output_level + - comp_stats_[level].bytes_read_non_output_levels; - compact_bytes_write += 
comp_stats_[level].bytes_written; + comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; + compact_bytes_write += comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob; compact_micros += comp_stats_[level].micros; } snprintf(buf, sizeof(buf), "Cumulative compaction: %.2f GB write, %.2f MB/s write, " "%.2f GB read, %.2f MB/s read, %.1f seconds\n", - compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up, - compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up, + compact_bytes_write / kGB, + compact_bytes_write / kMB / std::max(seconds_up, 0.001), + compact_bytes_read / kGB, + compact_bytes_read / kMB / std::max(seconds_up, 0.001), compact_micros / kMicrosInSec); value->append(buf); @@ -1393,24 +1771,45 @@ cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile; cf_stats_snapshot_.comp_stats = compaction_stats_sum; cf_stats_snapshot_.stall_count = total_stall_count; + + // Do not gather cache entry stats during CFStats because DB + // mutex is held. 
Only dump last cached collection (rely on DB + // periodic stats dump to update) + if (cache_entry_stats_collector_) { + CacheEntryRoleStats stats; + // thread safe + cache_entry_stats_collector_->GetStats(&stats); + + constexpr uint64_t kDayInMicros = uint64_t{86400} * 1000000U; + + // Skip if stats are extremely old (> 1 day, incl not yet populated) + if (now_micros - stats.last_end_time_micros_ < kDayInMicros) { + value->append(stats.ToString(clock_)); + } + } } void InternalStats::DumpCFFileHistogram(std::string* value) { - char buf[2000]; - snprintf(buf, sizeof(buf), - "\n** File Read Latency Histogram By Level [%s] **\n", - cfd_->GetName().c_str()); - value->append(buf); + assert(value); + assert(cfd_); + + std::ostringstream oss; + oss << "\n** File Read Latency Histogram By Level [" << cfd_->GetName() + << "] **\n"; for (int level = 0; level < number_levels_; level++) { if (!file_read_latency_[level].Empty()) { - char buf2[5000]; - snprintf(buf2, sizeof(buf2), - "** Level %d read latency histogram (micros):\n%s\n", level, - file_read_latency_[level].ToString().c_str()); - value->append(buf2); + oss << "** Level " << level << " read latency histogram (micros):\n" + << file_read_latency_[level].ToString() << '\n'; } } + + if (!blob_file_read_latency_.Empty()) { + oss << "** Blob file read latency histogram (micros):\n" + << blob_file_read_latency_.ToString() << '\n'; + } + + value->append(oss.str()); } #else diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/internal_stats.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/internal_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,16 +9,22 @@ // #pragma once + #include +#include #include #include +#include "cache/cache_entry_roles.h" #include "db/version_set.h" +#include "rocksdb/system_clock.h" class ColumnFamilyData; namespace ROCKSDB_NAMESPACE { 
+template +class CacheEntryStatsCollector; class DBImpl; class MemTableList; @@ -44,7 +50,9 @@ Version* version); // @param props Map of general properties to populate - bool (InternalStats::*handle_map)(std::map* props); + // @param suffix Argument portion of the property. (see handle_string) + bool (InternalStats::*handle_map)(std::map* props, + Slice suffix); // handle the string type properties rely on DBImpl methods // @param value Value-result argument for storing the property's string value @@ -76,6 +84,8 @@ AVG_SEC, KEY_IN, KEY_DROP, + R_BLOB_GB, + W_BLOB_GB, TOTAL // total number of types }; @@ -86,6 +96,11 @@ std::string header_name; }; +struct DBStatInfo { + // This what will be property_name in the flat map returned to the user + std::string property_name; +}; + class InternalStats { public: static const std::map compaction_level_stats; @@ -120,18 +135,9 @@ kIntStatsNumMax, }; - InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) - : db_stats_{}, - cf_stats_value_{}, - cf_stats_count_{}, - comp_stats_(num_levels), - comp_stats_by_pri_(Env::Priority::TOTAL), - file_read_latency_(num_levels), - bg_error_count_(0), - number_levels_(num_levels), - env_(env), - cfd_(cfd), - started_at_(env->NowMicros()) {} + static const std::map db_stats_type_to_info; + + InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd); // Per level compaction stats. comp_stats_[level] stores the stats for // compactions that produced data for the specified "level". @@ -139,32 +145,42 @@ uint64_t micros; uint64_t cpu_micros; - // The number of bytes read from all non-output levels + // The number of bytes read from all non-output levels (table files) uint64_t bytes_read_non_output_levels; - // The number of bytes read from the compaction output level. 
+ // The number of bytes read from the compaction output level (table files) uint64_t bytes_read_output_level; - // Total number of bytes written during compaction + // The number of bytes read from blob files + uint64_t bytes_read_blob; + + // Total number of bytes written to table files during compaction uint64_t bytes_written; - // Total number of bytes moved to the output level + // Total number of bytes written to blob files during compaction + uint64_t bytes_written_blob; + + // Total number of bytes moved to the output level (table files) uint64_t bytes_moved; - // The number of compaction input files in all non-output levels. + // The number of compaction input files in all non-output levels (table + // files) int num_input_files_in_non_output_levels; - // The number of compaction input files in the output level. + // The number of compaction input files in the output level (table files) int num_input_files_in_output_level; - // The number of compaction output files. + // The number of compaction output files (table files) int num_output_files; + // The number of compaction output files (blob files) + int num_output_files_blob; + // Total incoming entries during compaction between levels N and N+1 uint64_t num_input_records; // Accumulated diff number of entries - // (num input entries - num output entires) for compaction levels N and N+1 + // (num input entries - num output entries) for compaction levels N and N+1 uint64_t num_dropped_records; // Number of compactions done @@ -178,11 +194,14 @@ cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(0) { @@ -197,11 +216,14 @@ cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), 
bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(c) { @@ -222,12 +244,15 @@ cpu_micros(c.cpu_micros), bytes_read_non_output_levels(c.bytes_read_non_output_levels), bytes_read_output_level(c.bytes_read_output_level), + bytes_read_blob(c.bytes_read_blob), bytes_written(c.bytes_written), + bytes_written_blob(c.bytes_written_blob), bytes_moved(c.bytes_moved), num_input_files_in_non_output_levels( c.num_input_files_in_non_output_levels), num_input_files_in_output_level(c.num_input_files_in_output_level), num_output_files(c.num_output_files), + num_output_files_blob(c.num_output_files_blob), num_input_records(c.num_input_records), num_dropped_records(c.num_dropped_records), count(c.count) { @@ -242,12 +267,15 @@ cpu_micros = c.cpu_micros; bytes_read_non_output_levels = c.bytes_read_non_output_levels; bytes_read_output_level = c.bytes_read_output_level; + bytes_read_blob = c.bytes_read_blob; bytes_written = c.bytes_written; + bytes_written_blob = c.bytes_written_blob; bytes_moved = c.bytes_moved; num_input_files_in_non_output_levels = c.num_input_files_in_non_output_levels; num_input_files_in_output_level = c.num_input_files_in_output_level; num_output_files = c.num_output_files; + num_output_files_blob = c.num_output_files_blob; num_input_records = c.num_input_records; num_dropped_records = c.num_dropped_records; count = c.count; @@ -264,11 +292,14 @@ this->cpu_micros = 0; this->bytes_read_non_output_levels = 0; this->bytes_read_output_level = 0; + this->bytes_read_blob = 0; this->bytes_written = 0; + this->bytes_written_blob = 0; this->bytes_moved = 0; this->num_input_files_in_non_output_levels = 0; this->num_input_files_in_output_level = 0; this->num_output_files = 0; + this->num_output_files_blob = 0; this->num_input_records = 0; this->num_dropped_records = 0; this->count = 0; @@ 
-283,13 +314,16 @@ this->cpu_micros += c.cpu_micros; this->bytes_read_non_output_levels += c.bytes_read_non_output_levels; this->bytes_read_output_level += c.bytes_read_output_level; + this->bytes_read_blob += c.bytes_read_blob; this->bytes_written += c.bytes_written; + this->bytes_written_blob += c.bytes_written_blob; this->bytes_moved += c.bytes_moved; this->num_input_files_in_non_output_levels += c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level += c.num_input_files_in_output_level; this->num_output_files += c.num_output_files; + this->num_output_files_blob += c.num_output_files_blob; this->num_input_records += c.num_input_records; this->num_dropped_records += c.num_dropped_records; this->count += c.count; @@ -304,13 +338,16 @@ this->cpu_micros -= c.cpu_micros; this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels; this->bytes_read_output_level -= c.bytes_read_output_level; + this->bytes_read_blob -= c.bytes_read_blob; this->bytes_written -= c.bytes_written; + this->bytes_written_blob -= c.bytes_written_blob; this->bytes_moved -= c.bytes_moved; this->num_input_files_in_non_output_levels -= c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level -= c.num_input_files_in_output_level; this->num_output_files -= c.num_output_files; + this->num_output_files_blob -= c.num_output_files_blob; this->num_input_records -= c.num_input_records; this->num_dropped_records -= c.num_dropped_records; this->count -= c.count; @@ -321,6 +358,39 @@ } }; + // For use with CacheEntryStatsCollector + struct CacheEntryRoleStats { + uint64_t cache_capacity = 0; + std::string cache_id; + std::array total_charges; + std::array entry_counts; + uint32_t collection_count = 0; + uint32_t copies_of_last_collection = 0; + uint64_t last_start_time_micros_ = 0; + uint64_t last_end_time_micros_ = 0; + + void Clear() { + // Wipe everything except collection_count + uint32_t saved_collection_count = collection_count; + *this = 
CacheEntryRoleStats(); + collection_count = saved_collection_count; + } + + void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); + std::function + GetEntryCallback(); + void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); + void SkippedCollection(); + + std::string ToString(SystemClock* clock) const; + void ToMap(std::map* values, + SystemClock* clock) const; + + private: + std::unordered_map role_map_; + uint64_t GetLastDurationMicros() const; + }; + void Clear() { for (int i = 0; i < kIntStatsNumMax; i++) { db_stats_[i].store(0); @@ -335,10 +405,11 @@ for (auto& h : file_read_latency_) { h.Clear(); } + blob_file_read_latency_.Clear(); cf_stats_snapshot_.Clear(); db_stats_snapshot_.Clear(); bg_error_count_ = 0; - started_at_ = env_->NowMicros(); + started_at_ = clock_->NowMicros(); } void AddCompactionStats(int level, Env::Priority thread_pri, @@ -375,6 +446,8 @@ return &file_read_latency_[level]; } + HistogramImpl* GetBlobFileReadHist() { return &blob_file_read_latency_; } + uint64_t GetBackgroundErrorCount() const { return bg_error_count_; } uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } @@ -392,18 +465,31 @@ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info, Version* version, uint64_t* value); + // Unless there is a recent enough collection of the stats, collect and + // saved new cache entry stats. If `foreground`, require data to be more + // recent to skip re-collection. + // + // This should only be called while NOT holding the DB mutex. + void CollectCacheEntryStats(bool foreground); + + const uint64_t* TEST_GetCFStatsValue() const { return cf_stats_value_; } + const std::vector& TEST_GetCompactionStats() const { return comp_stats_; } + void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground); + // Store a mapping from the user-facing DB::Properties string to our // DBPropertyInfo struct used internally for retrieving properties. 
static const std::unordered_map ppt_name_to_info; private: + void DumpDBMapStats(std::map* db_stats); void DumpDBStats(std::string* value); void DumpCFMapStats(std::map* cf_stats); void DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* level_stats, CompactionStats* compaction_stats_sum); void DumpCFMapStatsByPriority( @@ -413,17 +499,25 @@ void DumpCFStatsNoFileHistogram(std::string* value); void DumpCFFileHistogram(std::string* value); - bool HandleBlockCacheStat(Cache** block_cache); + bool GetBlockCacheForStats(Cache** block_cache); // Per-DB stats std::atomic db_stats_[kIntStatsNumMax]; // Per-ColumnFamily stats uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX]; uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX]; + // Initialize/reference the collector in constructor so that we don't need + // additional synchronization in InternalStats, relying on synchronization + // in CacheEntryStatsCollector::GetStats. This collector is pinned in cache + // (through a shared_ptr) so that it does not get immediately ejected from + // a full cache, which would force a re-scan on the next GetStats. 
+ std::shared_ptr> + cache_entry_stats_collector_; // Per-ColumnFamily/level compaction stats std::vector comp_stats_; std::vector comp_stats_by_pri_; std::vector file_read_latency_; + HistogramImpl blob_file_read_latency_; // Used to compute per-interval statistics struct CFStatsSnapshot { @@ -519,14 +613,21 @@ bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix); bool HandleLevelStats(std::string* value, Slice suffix); bool HandleStats(std::string* value, Slice suffix); - bool HandleCFMapStats(std::map* compaction_stats); + bool HandleCFMapStats(std::map* compaction_stats, + Slice suffix); bool HandleCFStats(std::string* value, Slice suffix); bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix); bool HandleCFFileHistogram(std::string* value, Slice suffix); + bool HandleDBMapStats(std::map* compaction_stats, + Slice suffix); bool HandleDBStats(std::string* value, Slice suffix); bool HandleSsTables(std::string* value, Slice suffix); bool HandleAggregatedTableProperties(std::string* value, Slice suffix); bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); + bool HandleAggregatedTablePropertiesMap( + std::map* values, Slice suffix); + bool HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix); bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, Version* version); bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db, @@ -581,6 +682,14 @@ bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version); bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlockCacheEntryStats(std::string* value, Slice suffix); + bool HandleBlockCacheEntryStatsMap(std::map* values, + Slice suffix); + bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix); + bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlobStats(std::string* value, Slice suffix); + bool 
HandleTotalBlobFileSize(uint64_t* value, DBImpl* db, Version* version); + bool HandleLiveBlobFileSize(uint64_t* value, DBImpl* db, Version* version); // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. The failure can // be caused by any possible reason, including file system errors, out of @@ -589,7 +698,7 @@ uint64_t bg_error_count_; const int number_levels_; - Env* env_; + SystemClock* clock_; ColumnFamilyData* cfd_; uint64_t started_at_; }; @@ -628,18 +737,22 @@ kIntStatsNumMax, }; - InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {} + InternalStats(int /*num_levels*/, SystemClock* /*clock*/, + ColumnFamilyData* /*cfd*/) {} struct CompactionStats { uint64_t micros; uint64_t cpu_micros; uint64_t bytes_read_non_output_levels; uint64_t bytes_read_output_level; + uint64_t bytes_read_blob; uint64_t bytes_written; + uint64_t bytes_written_blob; uint64_t bytes_moved; int num_input_files_in_non_output_levels; int num_input_files_in_output_level; int num_output_files; + int num_output_files_blob; uint64_t num_input_records; uint64_t num_dropped_records; int count; @@ -667,6 +780,8 @@ HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; } + HistogramImpl* GetBlobFileReadHist() { return nullptr; } + uint64_t GetBackgroundErrorCount() const { return 0; } uint64_t BumpAndGetBackgroundErrorCount() { return 0; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/job_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/job_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/job_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/job_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,8 +12,9 @@ #include #include -#include "db/log_writer.h" #include "db/column_family.h" +#include "db/log_writer.h" +#include "db/version_set.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +24,7 @@ struct SuperVersionContext { struct 
WriteStallNotification { WriteStallInfo write_stall_info; - const ImmutableCFOptions* immutable_cf_options; + const ImmutableOptions* immutable_options; }; autovector superversions_to_free; @@ -57,15 +58,16 @@ #endif } - void PushWriteStallNotification( - WriteStallCondition old_cond, WriteStallCondition new_cond, - const std::string& name, const ImmutableCFOptions* ioptions) { + void PushWriteStallNotification(WriteStallCondition old_cond, + WriteStallCondition new_cond, + const std::string& name, + const ImmutableOptions* ioptions) { #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) WriteStallNotification notif; notif.write_stall_info.cf_name = name; notif.write_stall_info.condition.prev = old_cond; notif.write_stall_info.condition.cur = new_cond; - notif.immutable_cf_options = ioptions; + notif.immutable_options = ioptions; write_stall_notifications.push_back(notif); #else (void)old_cond; @@ -79,7 +81,7 @@ #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) // notify listeners on changed write stall conditions for (auto& notif : write_stall_notifications) { - for (auto& listener : notif.immutable_cf_options->listeners) { + for (auto& listener : notif.immutable_options->listeners) { listener->OnStallConditionsChanged(notif.write_stall_info); } } @@ -102,8 +104,9 @@ struct JobContext { inline bool HaveSomethingToDelete() const { - return full_scan_candidate_files.size() || sst_delete_files.size() || - log_delete_files.size() || manifest_delete_files.size(); + return !(full_scan_candidate_files.empty() && sst_delete_files.empty() && + blob_delete_files.empty() && log_delete_files.empty() && + manifest_delete_files.empty()); } inline bool HaveSomethingToClean() const { @@ -115,7 +118,7 @@ } } return memtables_to_free.size() > 0 || logs_to_free.size() > 0 || - sv_have_sth; + job_snapshot != nullptr || sv_have_sth; } // Structure to store information for candidate files to delete. 
@@ -140,11 +143,17 @@ std::vector full_scan_candidate_files; // the list of all live sst files that cannot be deleted - std::vector sst_live; + std::vector sst_live; - // a list of sst files that we need to delete + // the list of sst files that we need to delete std::vector sst_delete_files; + // the list of all live blob files that cannot be deleted + std::vector blob_live; + + // the list of blob files that we need to delete + std::vector blob_delete_files; + // a list of log files that we need to delete std::vector log_delete_files; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/kv_checksum.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/kv_checksum.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/kv_checksum.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/kv_checksum.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,394 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file contains classes containing fields to protect individual entries. +// The classes are named "ProtectionInfo", where indicates the +// combination of fields that are covered. Each field has a single letter +// abbreviation as follows. +// +// K = key +// V = value +// O = optype aka value type +// S = seqno +// C = CF ID +// +// Then, for example, a class that protects an entry consisting of key, value, +// optype, and CF ID (i.e., a `WriteBatch` entry) would be named +// `ProtectionInfoKVOC`. +// +// The `ProtectionInfo.*` classes are templated on the integer type used to hold +// the XOR of hashes for each field. Only unsigned integer types are supported, +// and the maximum supported integer width is 64 bits. 
When the integer type is +// narrower than the hash values, we lop off the most significant bits to make +// them fit. +// +// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do +// not currently make the byte order consistent for integer fields before +// hashing them, so the resulting values are endianness-dependent. + +#pragma once + +#include + +#include "db/dbformat.h" +#include "rocksdb/types.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +template +class ProtectionInfo; +template +class ProtectionInfoKVO; +template +class ProtectionInfoKVOC; +template +class ProtectionInfoKVOS; + +// Aliases for 64-bit protection infos. +using ProtectionInfo64 = ProtectionInfo; +using ProtectionInfoKVO64 = ProtectionInfoKVO; +using ProtectionInfoKVOC64 = ProtectionInfoKVOC; +using ProtectionInfoKVOS64 = ProtectionInfoKVOS; + +template +class ProtectionInfo { + public: + ProtectionInfo() = default; + + Status GetStatus() const; + ProtectionInfoKVO ProtectKVO(const Slice& key, const Slice& value, + ValueType op_type) const; + ProtectionInfoKVO ProtectKVO(const SliceParts& key, + const SliceParts& value, + ValueType op_type) const; + + private: + friend class ProtectionInfoKVO; + friend class ProtectionInfoKVOS; + friend class ProtectionInfoKVOC; + + // Each field is hashed with an independent value so we can catch fields being + // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall, + // and we should instead vary our seeds by a large odd number. This value by + // which we increment (0xD28AAD72F49BD50B) was taken from + // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an odd + // number. The values are computed manually since the Windows C++ compiler + // complains about the overflow when adding constants. 
+ static const uint64_t kSeedK = 0; + static const uint64_t kSeedV = 0xD28AAD72F49BD50B; + static const uint64_t kSeedO = 0xA5155AE5E937AA16; + static const uint64_t kSeedS = 0x77A00858DDD37F21; + static const uint64_t kSeedC = 0x4A2AB5CBD26F542C; + + ProtectionInfo(T val) : val_(val) { + static_assert(sizeof(ProtectionInfo) == sizeof(T), ""); + } + + T GetVal() const { return val_; } + void SetVal(T val) { val_ = val; } + + T val_ = 0; +}; + +template +class ProtectionInfoKVO { + public: + ProtectionInfoKVO() = default; + + ProtectionInfo StripKVO(const Slice& key, const Slice& value, + ValueType op_type) const; + ProtectionInfo StripKVO(const SliceParts& key, const SliceParts& value, + ValueType op_type) const; + + ProtectionInfoKVOC ProtectC(ColumnFamilyId column_family_id) const; + ProtectionInfoKVOS ProtectS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key); + void UpdateK(const SliceParts& old_key, const SliceParts& new_key); + void UpdateV(const Slice& old_value, const Slice& new_value); + void UpdateV(const SliceParts& old_value, const SliceParts& new_value); + void UpdateO(ValueType old_op_type, ValueType new_op_type); + + private: + friend class ProtectionInfo; + friend class ProtectionInfoKVOS; + friend class ProtectionInfoKVOC; + + explicit ProtectionInfoKVO(T val) : info_(val) { + static_assert(sizeof(ProtectionInfoKVO) == sizeof(T), ""); + } + + T GetVal() const { return info_.GetVal(); } + void SetVal(T val) { info_.SetVal(val); } + + ProtectionInfo info_; +}; + +template +class ProtectionInfoKVOC { + public: + ProtectionInfoKVOC() = default; + + ProtectionInfoKVO StripC(ColumnFamilyId column_family_id) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + 
kvo_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvo_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType old_op_type, ValueType new_op_type) { + kvo_.UpdateO(old_op_type, new_op_type); + } + void UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id); + + private: + friend class ProtectionInfoKVO; + + explicit ProtectionInfoKVOC(T val) : kvo_(val) { + static_assert(sizeof(ProtectionInfoKVOC) == sizeof(T), ""); + } + + T GetVal() const { return kvo_.GetVal(); } + void SetVal(T val) { kvo_.SetVal(val); } + + ProtectionInfoKVO kvo_; +}; + +template +class ProtectionInfoKVOS { + public: + ProtectionInfoKVOS() = default; + + ProtectionInfoKVO StripS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvo_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + kvo_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvo_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType old_op_type, ValueType new_op_type) { + kvo_.UpdateO(old_op_type, new_op_type); + } + void UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number); + + private: + friend class ProtectionInfoKVO; + + explicit ProtectionInfoKVOS(T val) : kvo_(val) { + static_assert(sizeof(ProtectionInfoKVOS) == sizeof(T), ""); + } + + T GetVal() const { return kvo_.GetVal(); } + void SetVal(T val) { kvo_.SetVal(val); } + + ProtectionInfoKVO kvo_; +}; + +template +Status ProtectionInfo::GetStatus() const { + if (val_ != 0) { + return Status::Corruption("ProtectionInfo mismatch"); + } + return Status::OK(); +} + +template +ProtectionInfoKVO ProtectionInfo::ProtectKVO(const Slice& key, + const Slice& value, + ValueType op_type) 
const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfoKVO(val); +} + +template +ProtectionInfoKVO ProtectionInfo::ProtectKVO(const SliceParts& key, + const SliceParts& value, + ValueType op_type) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfoKVO(val); +} + +template +void ProtectionInfoKVO::UpdateK(const Slice& old_key, const Slice& new_key) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSliceNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateK(const SliceParts& old_key, + const SliceParts& new_key) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateV(const Slice& old_value, + const Slice& new_value) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(GetSliceNPHash64(new_value, ProtectionInfo::kSeedV)); + SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateV(const SliceParts& old_value, + const SliceParts& new_value) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_value, ProtectionInfo::kSeedV)); + 
SetVal(val); +} + +template +void ProtectionInfoKVO::UpdateO(ValueType old_op_type, + ValueType new_op_type) { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&old_op_type), + sizeof(old_op_type), + ProtectionInfo::kSeedO)); + val = val ^ static_cast(NPHash64(reinterpret_cast(&new_op_type), + sizeof(new_op_type), + ProtectionInfo::kSeedO)); + SetVal(val); +} + +template +ProtectionInfo ProtectionInfoKVO::StripKVO(const Slice& key, + const Slice& value, + ValueType op_type) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfo(val); +} + +template +ProtectionInfo ProtectionInfoKVO::StripKVO(const SliceParts& key, + const SliceParts& value, + ValueType op_type) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + return ProtectionInfo(val); +} + +template +ProtectionInfoKVOC ProtectionInfoKVO::ProtectC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVOC(val); +} + +template +ProtectionInfoKVO ProtectionInfoKVOC::StripC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVO(val); +} + +template +void ProtectionInfoKVOC::UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id) { + T val = 
GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_column_family_id), + sizeof(old_column_family_id), ProtectionInfo::kSeedC)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_column_family_id), + sizeof(new_column_family_id), ProtectionInfo::kSeedC)); + SetVal(val); +} + +template +ProtectionInfoKVOS ProtectionInfoKVO::ProtectS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVOS(val); +} + +template +ProtectionInfoKVO ProtectionInfoKVOS::StripS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVO(val); +} + +template +void ProtectionInfoKVOS::UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number) { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_sequence_number), + sizeof(old_sequence_number), ProtectionInfo::kSeedS)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_sequence_number), + sizeof(new_sequence_number), ProtectionInfo::kSeedS)); + SetVal(val); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/listener_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/listener_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/listener_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/listener_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,15 +3,13 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" -#include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -24,8 +22,6 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block_based/block_based_table_factory.h" -#include "table/plain/plain_table_factory.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -41,7 +37,7 @@ class EventListenerTest : public DBTestBase { public: - EventListenerTest() : DBTestBase("/listener_test") {} + EventListenerTest() : DBTestBase("listener_test", /*env_do_fsync=*/true) {} static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, uint64_t size) { @@ -195,10 +191,10 @@ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size()); @@ -214,6 +210,10 @@ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { db_closed = false; } + + virtual ~TestFlushListener() { + prev_fc_info_.status.PermitUncheckedError(); // Ignore the status + } void OnTableFileCreated( const TableFileCreationInfo& info) override { // remember the info for later checking the FlushJobInfo. 
@@ -227,6 +227,8 @@ ASSERT_GT(info.table_properties.raw_value_size, 0U); ASSERT_GT(info.table_properties.num_data_blocks, 0U); ASSERT_GT(info.table_properties.num_entries, 0U); + ASSERT_EQ(info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); #ifdef ROCKSDB_USING_THREAD_STATUS // Verify the id of the current thread that created this table @@ -272,6 +274,9 @@ ASSERT_TRUE(test_); if (db == test_->db_) { std::vector> files_by_level; + ASSERT_LT(info.cf_id, test_->handles_.size()); + ASSERT_GE(info.cf_id, 0u); + ASSERT_NE(test_->handles_[info.cf_id], nullptr); test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id], &files_by_level); @@ -334,7 +339,7 @@ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); } @@ -353,32 +358,39 @@ #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env, this); - options.listeners.emplace_back(listener); - options.table_properties_collector_factories.push_back( - std::make_shared()); - std::vector cf_names = { - "pikachu", "ilya", "muromec", "dobrynia", - "nikitich", "alyosha", "popovich"}; - CreateAndReopenWithCF(cf_names, options); - - ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); - ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); - ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); - ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); - ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); - ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); - ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); - for (int i = 1; i < 8; ++i) { - ASSERT_OK(Flush(i)); - 
ASSERT_EQ(listener->flushed_dbs_.size(), i); - ASSERT_EQ(listener->flushed_column_family_names_.size(), i); - } + for (auto atomic_flush : {false, true}) { + options.atomic_flush = atomic_flush; + options.create_if_missing = true; + DestroyAndReopen(options); + TestFlushListener* listener = new TestFlushListener(options.env, this); + options.listeners.emplace_back(listener); + options.table_properties_collector_factories.push_back( + std::make_shared()); + std::vector cf_names = {"pikachu", "ilya", "muromec", + "dobrynia", "nikitich", "alyosha", + "popovich"}; + CreateAndReopenWithCF(cf_names, options); + + ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); + for (int i = 1; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(listener->flushed_dbs_.size(), i); + ASSERT_EQ(listener->flushed_column_family_names_.size(), i); + } - // make sure callback functions are called in the right order - for (size_t i = 0; i < cf_names.size(); i++) { - ASSERT_EQ(listener->flushed_dbs_[i], db_); - ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + // make sure callback functions are called in the right order + for (size_t i = 0; i < cf_names.size(); i++) { + ASSERT_EQ(listener->flushed_dbs_[i], db_); + ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + } + Close(); } } @@ -418,7 +430,7 @@ ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db)); for (size_t c = 0; c < cf_names.size(); ++c) { ColumnFamilyHandle* handle; - db->CreateColumnFamily(cf_opts, cf_names[c], &handle); + ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle)); 
handles.push_back(handle); } @@ -436,7 +448,8 @@ for (size_t c = 0; c < cf_names.size(); ++c) { for (int d = 0; d < kNumDBs; ++d) { ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c])); - reinterpret_cast(dbs[d])->TEST_WaitForFlushMemTable(); + ASSERT_OK( + static_cast_with_check(dbs[d])->TEST_WaitForFlushMemTable()); } } @@ -495,13 +508,16 @@ // keep writing until writes are forced to stop. for (int i = 0; static_cast(cf_meta.file_count) < kSlowdownTrigger * 10; ++i) { - Put(1, ToString(i), std::string(10000, 'x'), WriteOptions()); + ASSERT_OK(Put(1, ToString(i), std::string(10000, 'x'), WriteOptions())); FlushOptions fo; fo.allow_write_stall = true; - db_->Flush(fo, handles_[1]); + ASSERT_OK(db_->Flush(fo, handles_[1])); db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); } ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); + // We don't want the listener executing during DBTestBase::Close() due to + // race on handles_. + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); } class TestCompactionReasonListener : public EventListener { @@ -519,8 +535,8 @@ Options options; options.env = CurrentOptions().env; options.create_if_missing = true; - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); TestCompactionReasonListener* listener = new TestCompactionReasonListener(); options.listeners.emplace_back(listener); @@ -535,7 +551,7 @@ for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(listener->compaction_reasons_.size(), 1); ASSERT_EQ(listener->compaction_reasons_[0], @@ -552,14 +568,14 @@ } // Do a trivial move from L0 -> L1 - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); 
options.max_bytes_for_level_base = 1; Close(); listener->compaction_reasons_.clear(); Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 1); for (auto compaction_reason : listener->compaction_reasons_) { @@ -571,7 +587,7 @@ listener->compaction_reasons_.clear(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -585,8 +601,8 @@ Options options; options.env = CurrentOptions().env; options.create_if_missing = true; - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); TestCompactionReasonListener* listener = new TestCompactionReasonListener(); options.listeners.emplace_back(listener); @@ -605,7 +621,7 @@ for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -623,7 +639,7 @@ for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -635,7 +651,7 @@ listener->compaction_reasons_.clear(); Reopen(options); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -647,8 +663,8 @@ Options options; options.env = CurrentOptions().env; 
options.create_if_missing = true; - options.memtable_factory.reset( - new SpecialSkipListFactory(DBTestBase::kNumKeysByGenerateNewRandomFile)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory( + DBTestBase::kNumKeysByGenerateNewRandomFile)); TestCompactionReasonListener* listener = new TestCompactionReasonListener(); options.listeners.emplace_back(listener); @@ -664,7 +680,7 @@ for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -676,7 +692,9 @@ public: class TestEnv : public EnvWrapper { public: - TestEnv() : EnvWrapper(Env::Default()) {} + explicit TestEnv(Env* t) : EnvWrapper(t) {} + static const char* kClassName() { return "TestEnv"; } + const char* Name() const override { return kClassName(); } void SetStatus(Status s) { status_ = s; } @@ -688,7 +706,7 @@ return status_; } } - return Env::Default()->NewWritableFile(fname, result, options); + return target()->NewWritableFile(fname, result, options); } private: @@ -751,6 +769,8 @@ ASSERT_GT(info.cf_name.size(), 0U); ASSERT_GT(info.file_path.size(), 0U); ASSERT_GT(info.job_id, 0); + ASSERT_EQ(info.file_checksum, kUnknownFileChecksum); + ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); if (info.status.ok()) { ASSERT_GT(info.table_properties.data_size, 0U); ASSERT_GT(info.table_properties.raw_key_size, 0U); @@ -760,57 +780,72 @@ } else { if (idx >= 0) { failure_[idx]++; + last_failure_ = info.status; } } } - TestEnv test_env; int started_[2]; int finished_[2]; int failure_[2]; + Status last_failure_; }; TEST_F(EventListenerTest, TableFileCreationListenersTest) { auto listener = std::make_shared(); Options options; + std::unique_ptr test_env( + new TableFileCreationListener::TestEnv(CurrentOptions().env)); options.create_if_missing = true; 
options.listeners.push_back(listener); - options.env = &listener->test_env; + options.env = test_env.get(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); ASSERT_OK(Put("bar", "bbb")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); - ASSERT_OK(Put("foo", "aaa1")); ASSERT_OK(Put("bar", "bbb1")); - listener->test_env.SetStatus(Status::NotSupported("not supported")); + test_env->SetStatus(Status::NotSupported("not supported")); ASSERT_NOK(Flush()); listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); - listener->test_env.SetStatus(Status::OK()); + ASSERT_TRUE(listener->last_failure_.IsNotSupported()); + test_env->SetStatus(Status::OK()); Reopen(options); ASSERT_OK(Put("foo", "aaa2")); ASSERT_OK(Put("bar", "bbb2")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); const Slice kRangeStart = "a"; const Slice kRangeEnd = "z"; - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0); + // Verify that an empty table file that is immediately deleted gives Aborted + // status to listener. 
+ ASSERT_OK(Put("baz", "z")); + ASSERT_OK(SingleDelete("baz")); + ASSERT_OK(Flush()); + listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); + ASSERT_TRUE(listener->last_failure_.IsAborted()); + ASSERT_OK(Put("foo", "aaa3")); ASSERT_OK(Put("bar", "bbb3")); ASSERT_OK(Flush()); - listener->test_env.SetStatus(Status::NotSupported("not supported")); - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + test_env->SetStatus(Status::NotSupported("not supported")); + ASSERT_NOK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1); + ASSERT_TRUE(listener->last_failure_.IsNotSupported()); + Close(); } class MemTableSealedListener : public EventListener { @@ -831,6 +866,7 @@ TEST_F(EventListenerTest, MemTableSealedListenerTest) { auto listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.listeners.push_back(listener); DestroyAndReopen(options); @@ -895,7 +931,7 @@ // can succeed. 
*bg_error = Status::OK(); env_->drop_writes_.store(false, std::memory_order_release); - env_->no_slowdown_ = false; + env_->SetMockSleep(false); } ++counter_; } @@ -909,7 +945,7 @@ options.create_if_missing = true; options.env = env_; options.listeners.push_back(listener); - options.memtable_factory.reset(new SpecialSkipListFactory(1)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); options.paranoid_checks = true; DestroyAndReopen(options); @@ -921,7 +957,7 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); env_->drop_writes_.store(true, std::memory_order_release); - env_->no_slowdown_ = true; + env_->SetMockSleep(); ASSERT_OK(Put("key0", "val")); ASSERT_OK(Put("key1", "val")); @@ -940,7 +976,7 @@ options.env = env_; options.level0_file_num_compaction_trigger = 2; options.listeners.push_back(listener); - options.memtable_factory.reset(new SpecialSkipListFactory(2)); + options.memtable_factory.reset(test::NewSpecialSkipListFactory(2)); options.paranoid_checks = true; DestroyAndReopen(options); @@ -955,7 +991,7 @@ ASSERT_EQ(2, NumTableFilesAtLevel(0)); env_->drop_writes_.store(true, std::memory_order_release); - env_->no_slowdown_ = true; + env_->SetMockSleep(); ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, listener->counter()); @@ -977,6 +1013,21 @@ file_reads_success_.store(0); file_writes_.store(0); file_writes_success_.store(0); + file_flushes_.store(0); + file_flushes_success_.store(0); + file_closes_.store(0); + file_closes_success_.store(0); + file_syncs_.store(0); + file_syncs_success_.store(0); + file_truncates_.store(0); + file_truncates_success_.store(0); + file_seq_reads_.store(0); + blob_file_reads_.store(0); + blob_file_writes_.store(0); + blob_file_flushes_.store(0); + blob_file_closes_.store(0); + blob_file_syncs_.store(0); + blob_file_truncates_.store(0); } void OnFileReadFinish(const FileOperationInfo& info) override { @@ 
-984,6 +1035,12 @@ if (info.status.ok()) { ++file_reads_success_; } + if (info.path.find("MANIFEST") != std::string::npos) { + ++file_seq_reads_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_reads_; + } ReportDuration(info); } @@ -992,6 +1049,53 @@ if (info.status.ok()) { ++file_writes_success_; } + if (EndsWith(info.path, ".blob")) { + ++blob_file_writes_; + } + ReportDuration(info); + } + + void OnFileFlushFinish(const FileOperationInfo& info) override { + ++file_flushes_; + if (info.status.ok()) { + ++file_flushes_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_flushes_; + } + ReportDuration(info); + } + + void OnFileCloseFinish(const FileOperationInfo& info) override { + ++file_closes_; + if (info.status.ok()) { + ++file_closes_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_closes_; + } + ReportDuration(info); + } + + void OnFileSyncFinish(const FileOperationInfo& info) override { + ++file_syncs_; + if (info.status.ok()) { + ++file_syncs_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_syncs_; + } + ReportDuration(info); + } + + void OnFileTruncateFinish(const FileOperationInfo& info) override { + ++file_truncates_; + if (info.status.ok()) { + ++file_truncates_success_; + } + if (EndsWith(info.path, ".blob")) { + ++blob_file_truncates_; + } ReportDuration(info); } @@ -1001,12 +1105,25 @@ std::atomic file_reads_success_; std::atomic file_writes_; std::atomic file_writes_success_; + std::atomic file_flushes_; + std::atomic file_flushes_success_; + std::atomic file_closes_; + std::atomic file_closes_success_; + std::atomic file_syncs_; + std::atomic file_syncs_success_; + std::atomic file_truncates_; + std::atomic file_truncates_success_; + std::atomic file_seq_reads_; + std::atomic blob_file_reads_; + std::atomic blob_file_writes_; + std::atomic blob_file_flushes_; + std::atomic blob_file_closes_; + std::atomic blob_file_syncs_; + std::atomic blob_file_truncates_; private: void ReportDuration(const 
FileOperationInfo& info) const { - auto duration = std::chrono::duration_cast( - info.finish_timestamp - info.start_timestamp); - ASSERT_GT(duration.count(), 0); + ASSERT_GT(info.duration.count(), 0); } }; @@ -1018,18 +1135,430 @@ TestFileOperationListener* listener = new TestFileOperationListener(); options.listeners.emplace_back(listener); + options.use_direct_io_for_flush_and_compaction = false; + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + options.use_direct_io_for_flush_and_compaction = false; + } else { + ASSERT_OK(s); + } DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); - dbfull()->Flush(FlushOptions()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_GE(listener->file_writes_.load(), listener->file_writes_success_.load()); ASSERT_GT(listener->file_writes_.load(), 0); + ASSERT_GE(listener->file_flushes_.load(), + listener->file_flushes_success_.load()); + ASSERT_GT(listener->file_flushes_.load(), 0); Close(); Reopen(options); ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load()); ASSERT_GT(listener->file_reads_.load(), 0); + ASSERT_GE(listener->file_closes_.load(), + listener->file_closes_success_.load()); + ASSERT_GT(listener->file_closes_.load(), 0); + ASSERT_GE(listener->file_syncs_.load(), listener->file_syncs_success_.load()); + ASSERT_GT(listener->file_syncs_.load(), 0); + if (true == options.use_direct_io_for_flush_and_compaction) { + ASSERT_GE(listener->file_truncates_.load(), + listener->file_truncates_success_.load()); + ASSERT_GT(listener->file_truncates_.load(), 0); + } +} + +TEST_F(EventListenerTest, OnBlobFileOperationTest) { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + TestFileOperationListener* listener = new TestFileOperationListener(); + options.listeners.emplace_back(listener); + options.disable_auto_compactions = true; + 
options.enable_blob_files = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + ASSERT_GT(listener->blob_file_writes_.load(), 0U); + ASSERT_GT(listener->blob_file_flushes_.load(), 0U); + Close(); + + Reopen(options); + ASSERT_GT(listener->blob_file_closes_.load(), 0U); + ASSERT_GT(listener->blob_file_syncs_.load(), 0U); + if (true == options.use_direct_io_for_flush_and_compaction) { + ASSERT_GT(listener->blob_file_truncates_.load(), 0U); + } +} + +TEST_F(EventListenerTest, ReadManifestAndWALOnRecovery) { + Options options; + options.env = CurrentOptions().env; + options.create_if_missing = true; + + TestFileOperationListener* listener = new TestFileOperationListener(); + options.listeners.emplace_back(listener); + + options.use_direct_io_for_flush_and_compaction = false; + Status s = TryReopen(options); + if (s.IsInvalidArgument()) { + options.use_direct_io_for_flush_and_compaction = false; + } else { + ASSERT_OK(s); + } + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "aaa")); + Close(); + + size_t seq_reads = listener->file_seq_reads_.load(); + Reopen(options); + ASSERT_GT(listener->file_seq_reads_.load(), seq_reads); +} + +class BlobDBJobLevelEventListenerTest : public EventListener { + public: + explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test) + : test_(test), call_count_(0) {} + + std::shared_ptr GetBlobFileMetaData( + const VersionStorageInfo::BlobFiles& blob_files, + uint64_t blob_file_number) { + const auto it = 
blob_files.find(blob_file_number); + + if (it == blob_files.end()) { + return nullptr; + } + + const auto& meta = it->second; + assert(meta); + + return meta; + } + + const VersionStorageInfo::BlobFiles& GetBlobFiles() { + VersionSet* const versions = test_->dbfull()->GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + EXPECT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + EXPECT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + EXPECT_NE(storage_info, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + return blob_files; + } + + std::vector GetFlushedFiles() { + std::lock_guard lock(mutex_); + std::vector result; + for (const auto& fname : flushed_files_) { + result.push_back(fname); + } + return result; + } + + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { + call_count_++; + EXPECT_FALSE(info.blob_file_addition_infos.empty()); + const auto& blob_files = GetBlobFiles(); + { + std::lock_guard lock(mutex_); + flushed_files_.push_back(info.file_path); + } + EXPECT_EQ(info.blob_compression_type, kNoCompression); + + for (const auto& blob_file_addition_info : info.blob_file_addition_infos) { + const auto meta = GetBlobFileMetaData( + blob_files, blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetBlobFileNumber(), + blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetTotalBlobBytes(), + blob_file_addition_info.total_blob_bytes); + EXPECT_EQ(meta->GetTotalBlobCount(), + blob_file_addition_info.total_blob_count); + EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty()); + } + } + + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + call_count_++; + EXPECT_FALSE(ci.blob_file_garbage_infos.empty()); + const auto& blob_files = GetBlobFiles(); + EXPECT_EQ(ci.blob_compression_type, kNoCompression); + + for (const auto& blob_file_addition_info 
: ci.blob_file_addition_infos) { + const auto meta = GetBlobFileMetaData( + blob_files, blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetBlobFileNumber(), + blob_file_addition_info.blob_file_number); + EXPECT_EQ(meta->GetTotalBlobBytes(), + blob_file_addition_info.total_blob_bytes); + EXPECT_EQ(meta->GetTotalBlobCount(), + blob_file_addition_info.total_blob_count); + EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty()); + } + + for (const auto& blob_file_garbage_info : ci.blob_file_garbage_infos) { + EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U); + EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty()); + } + } + + EventListenerTest* test_; + uint32_t call_count_; + + private: + std::vector flushed_files_; + std::mutex mutex_; +}; + +// Test OnFlushCompleted EventListener called for blob files +TEST_F(EventListenerTest, BlobDBOnFlushCompleted) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + + options.min_blob_size = 0; + BlobDBJobLevelEventListenerTest* blob_event_listener = + new BlobDBJobLevelEventListenerTest(this); + options.listeners.emplace_back(blob_event_listener); + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("Key1"), "blob_value1"); + ASSERT_EQ(Get("Key2"), "blob_value2"); + ASSERT_EQ(Get("Key3"), "blob_value3"); + + ASSERT_GT(blob_event_listener->call_count_, 0U); +} + +// Test OnCompactionCompleted EventListener called for blob files +TEST_F(EventListenerTest, BlobDBOnCompactionCompleted) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + 
options.create_if_missing = true; + options.disable_auto_compactions = true; + options.min_blob_size = 0; + BlobDBJobLevelEventListenerTest* blob_event_listener = + new BlobDBJobLevelEventListenerTest(this); + options.listeners.emplace_back(blob_event_listener); + + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + blob_event_listener->call_count_ = 0; + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + // On compaction, because of blob_garbage_collection_age_cutoff, it will + // delete the oldest blob file and create new blob file during compaction. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + // Make sure, OnCompactionCompleted is called. + ASSERT_GT(blob_event_listener->call_count_, 0U); +} + +// Test CompactFiles calls OnCompactionCompleted EventListener for blob files +// and populate the blob files info. 
+TEST_F(EventListenerTest, BlobDBCompactFiles) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + BlobDBJobLevelEventListenerTest* blob_event_listener = + new BlobDBJobLevelEventListenerTest(this); + options.listeners.emplace_back(blob_event_listener); + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + std::vector output_file_names; + CompactionJobInfo compaction_job_info; + + // On compaction, because of blob_garbage_collection_age_cutoff, it will + // delete the oldest blob file and create new blob file during compaction + // which will be populated in output_files_names. 
+ ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), blob_event_listener->GetFlushedFiles(), 1, -1, + &output_file_names, &compaction_job_info)); + + bool is_blob_in_output = false; + for (const auto& file : output_file_names) { + if (EndsWith(file, ".blob")) { + is_blob_in_output = true; + } + } + ASSERT_TRUE(is_blob_in_output); + + for (const auto& blob_file_addition_info : + compaction_job_info.blob_file_addition_infos) { + EXPECT_GT(blob_file_addition_info.blob_file_number, 0U); + EXPECT_GT(blob_file_addition_info.total_blob_bytes, 0U); + EXPECT_GT(blob_file_addition_info.total_blob_count, 0U); + EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty()); + } + + for (const auto& blob_file_garbage_info : + compaction_job_info.blob_file_garbage_infos) { + EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U); + EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U); + EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty()); + } +} + +class BlobDBFileLevelEventListener : public EventListener { + public: + void OnBlobFileCreationStarted( + const BlobFileCreationBriefInfo& info) override { + files_started_++; + EXPECT_FALSE(info.db_name.empty()); + EXPECT_FALSE(info.cf_name.empty()); + EXPECT_FALSE(info.file_path.empty()); + EXPECT_GT(info.job_id, 0); + } + + void OnBlobFileCreated(const BlobFileCreationInfo& info) override { + files_created_++; + EXPECT_FALSE(info.db_name.empty()); + EXPECT_FALSE(info.cf_name.empty()); + EXPECT_FALSE(info.file_path.empty()); + EXPECT_GT(info.job_id, 0); + EXPECT_GT(info.total_blob_count, 0U); + EXPECT_GT(info.total_blob_bytes, 0U); + EXPECT_EQ(info.file_checksum, kUnknownFileChecksum); + EXPECT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName); + EXPECT_TRUE(info.status.ok()); + } + + void OnBlobFileDeleted(const BlobFileDeletionInfo& info) override { + files_deleted_++; + EXPECT_FALSE(info.db_name.empty()); + 
EXPECT_FALSE(info.file_path.empty()); + EXPECT_GT(info.job_id, 0); + EXPECT_TRUE(info.status.ok()); + } + + void CheckCounters() { + EXPECT_EQ(files_started_, files_created_); + EXPECT_GT(files_started_, 0U); + EXPECT_GT(files_deleted_, 0U); + EXPECT_LT(files_deleted_, files_created_); + } + + private: + std::atomic files_started_{}; + std::atomic files_created_{}; + std::atomic files_deleted_{}; +}; + +TEST_F(EventListenerTest, BlobDBFileTest) { + Options options; + options.env = CurrentOptions().env; + options.enable_blob_files = true; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.min_blob_size = 0; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + BlobDBFileLevelEventListener* blob_event_listener = + new BlobDBFileLevelEventListener(); + options.listeners.emplace_back(blob_event_listener); + + DestroyAndReopen(options); + + ASSERT_OK(Put("Key1", "blob_value1")); + ASSERT_OK(Put("Key2", "blob_value2")); + ASSERT_OK(Put("Key3", "blob_value3")); + ASSERT_OK(Put("Key4", "blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key3", "new_blob_value3")); + ASSERT_OK(Put("Key4", "new_blob_value4")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("Key5", "blob_value5")); + ASSERT_OK(Put("Key6", "blob_value6")); + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + // On compaction, because of blob_garbage_collection_age_cutoff, it will + // delete the oldest blob file and create new blob file during compaction. 
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + blob_event_listener->CheckCounters(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,11 +11,11 @@ #include #include "file/sequence_file_reader.h" +#include "port/lang.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/util.h" namespace ROCKSDB_NAMESPACE { namespace log { @@ -119,16 +119,26 @@ break; case kBadHeader: - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. ReportCorruption(drop_size, "truncated header"); } FALLTHROUGH_INTENDED; case kEof: if (in_fragmented_record) { - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. 
Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -142,8 +152,13 @@ if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) { // Treat a record from a previous instance of the log as EOF. if (in_fragmented_record) { - if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, + // which higher layers can choose to ignore when it's provable + // there is no hole. ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -164,6 +179,20 @@ break; case kBadRecordLen: + if (eof_) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency || + wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { + // In clean shutdown we don't expect any error in the log files. + // In point-in-time recovery an incomplete record at the end could + // produce a hole in the recovered data. Report an error here, which + // higher layers can choose to ignore when it's provable there is no + // hole. 
+ ReportCorruption(drop_size, "truncated record body"); + } + return false; + } + FALLTHROUGH_INTENDED; + case kBadRecordChecksum: if (recycled_ && wal_recovery_mode == @@ -202,6 +231,10 @@ return last_record_offset_; } +uint64_t Reader::LastRecordEnd() { + return end_of_buffer_offset_ - buffer_.size(); +} + void Reader::UnmarkEOF() { if (read_error_) { return; @@ -281,6 +314,7 @@ // Last read was a full read, so this is a trailer to skip buffer_.clear(); Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status); end_of_buffer_offset_ += buffer_.size(); if (!status.ok()) { buffer_.clear(); @@ -350,18 +384,14 @@ } } if (header_size + length > buffer_.size()) { + assert(buffer_.size() >= static_cast(header_size)); *drop_size = buffer_.size(); buffer_.clear(); - if (!eof_) { - return kBadRecordLen; - } - // If the end of the file has been reached without reading |length| - // bytes of payload, assume the writer died in the middle of writing the - // record. Don't report a corruption unless requested. - if (*drop_size) { - return kBadHeader; - } - return kEof; + // If the end of the read has been reached without seeing + // `header_size + length` bytes of payload, report a corruption. The + // higher layers can decide how to handle it based on the recovery mode, + // whether this occurred at EOF, whether this is the final WAL, etc. + return kBadRecordLen; } if (type == kZeroType && length == 0) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -49,7 +49,6 @@ // // If "checksum" is true, verify checksums if available. 
Reader(std::shared_ptr info_log, - // @lint-ignore TXT2 T25377293 Grandfathered in std::unique_ptr&& file, Reporter* reporter, bool checksum, uint64_t log_num); // No copying allowed @@ -72,6 +71,11 @@ // Undefined before the first call to ReadRecord. uint64_t LastRecordOffset(); + // Returns the first physical offset after the last record returned by + // ReadRecord, or zero before first call to ReadRecord. This can also be + // thought of as the "current" position in processing the file bytes. + uint64_t LastRecordEnd(); + // returns true if the reader has encountered an eof condition. bool IsEOF() { return eof_; @@ -159,7 +163,6 @@ class FragmentBufferedReader : public Reader { public: FragmentBufferedReader(std::shared_ptr info_log, - // @lint-ignore TXT2 T25377293 Grandfathered in std::unique_ptr&& _file, Reporter* reporter, bool checksum, uint64_t log_num) : Reader(info_log, std::move(_file), reporter, checksum, log_num), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,6 @@ #include "db/log_reader.h" #include "db/log_writer.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "rocksdb/env.h" @@ -50,7 +49,7 @@ // get<1>(tuple): true if allow retry after read EOF, false otherwise class LogTest : public ::testing::TestWithParam> { private: - class StringSource : public SequentialFile { + class StringSource : public FSSequentialFile { public: Slice& contents_; bool force_error_; @@ -68,7 +67,8 @@ returned_partial_(false), fail_after_read_partial_(fail_after_read_partial) {} - Status Read(size_t n, Slice* result, char* scratch) override { + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* 
scratch, IODebugContext* /*dbg*/) override { if (fail_after_read_partial_) { EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error"; } @@ -81,7 +81,7 @@ contents_.remove_prefix(force_error_position_); force_error_ = false; returned_partial_ = true; - return Status::Corruption("read error"); + return IOStatus::Corruption("read error"); } } @@ -106,28 +106,21 @@ *result = Slice(scratch, n); contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } - Status Skip(uint64_t n) override { + IOStatus Skip(uint64_t n) override { if (n > contents_.size()) { contents_.clear(); - return Status::NotFound("in-memory file skipepd past end"); + return IOStatus::NotFound("in-memory file skipepd past end"); } contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } }; - inline StringSource* GetStringSourceFromLegacyReader( - SequentialFileReader* reader) { - LegacySequentialFileWrapper* file = - static_cast(reader->file()); - return static_cast(file->target()); - } - class ReportCollector : public Reader::Reporter { public: size_t dropped_bytes_; @@ -140,29 +133,17 @@ } }; - std::string& dest_contents() { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } - - const std::string& dest_contents() const { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } - - void reset_source_contents() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - assert(src); - src->contents_ = dest_contents(); - } + std::string& dest_contents() { return sink_->contents_; } + + const std::string& dest_contents() const { return sink_->contents_; } + + void reset_source_contents() { source_->contents_ = dest_contents(); } Slice reader_contents_; - std::unique_ptr dest_holder_; - std::unique_ptr source_holder_; + test::StringSink* sink_; + StringSource* source_; ReportCollector report_; - Writer writer_; + std::unique_ptr 
writer_; std::unique_ptr reader_; protected: @@ -171,19 +152,23 @@ public: LogTest() : reader_contents_(), - dest_holder_(test::GetWritableFileWriter( - new test::StringSink(&reader_contents_), "" /* don't care */)), - source_holder_(test::GetSequentialFileReader( - new StringSource(reader_contents_, !std::get<1>(GetParam())), - "" /* file name */)), - writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())), + sink_(new test::StringSink(&reader_contents_)), + source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))), allow_retry_read_(std::get<1>(GetParam())) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink_holder), "" /* don't care */, FileOptions())); + writer_.reset( + new Writer(std::move(file_writer), 123, std::get<0>(GetParam()))); + std::unique_ptr source_holder(source_); + std::unique_ptr file_reader( + new SequentialFileReader(std::move(source_holder), "" /* file name */)); if (allow_retry_read_) { - reader_.reset(new FragmentBufferedReader( - nullptr, std::move(source_holder_), &report_, true /* checksum */, - 123 /* log_number */)); + reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader), + &report_, true /* checksum */, + 123 /* log_number */)); } else { - reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_, + reader_.reset(new Reader(nullptr, std::move(file_reader), &report_, true /* checksum */, 123 /* log_number */)); } } @@ -191,7 +176,7 @@ Slice* get_reader_contents() { return &reader_contents_; } void Write(const std::string& msg) { - writer_.AddRecord(Slice(msg)); + ASSERT_OK(writer_->AddRecord(Slice(msg))); } size_t WrittenBytes() const { @@ -219,11 +204,7 @@ dest_contents()[offset] = new_byte; } - void ShrinkSize(int bytes) { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - dest->Drop(bytes); - } + void ShrinkSize(int bytes) { sink_->Drop(bytes); } void FixChecksum(int header_offset, int len, 
bool recyclable) { // Compute crc of type/len/data @@ -235,9 +216,8 @@ } void ForceError(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_error_ = true; - src->force_error_position_ = position; + source_->force_error_ = true; + source_->force_error_position_ = position; } size_t DroppedBytes() const { @@ -249,14 +229,12 @@ } void ForceEOF(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_eof_ = true; - src->force_eof_position_ = position; + source_->force_eof_ = true; + source_->force_eof_position_ = position; } void UnmarkEOF() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->returned_partial_ = false; + source_->returned_partial_ = false; reader_->UnmarkEOF(); } @@ -465,7 +443,7 @@ ShrinkSize(1); ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); - ASSERT_EQ("OK", MatchError("Corruption: truncated header")); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); } TEST_P(LogTest, ChecksumMismatch) { @@ -573,9 +551,7 @@ ShrinkSize(1); ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); - ASSERT_EQ("OK", MatchError( - "Corruption: truncated headerCorruption: " - "error reading trailing data")); + ASSERT_EQ("OK", MatchError("Corruption: truncated record body")); } TEST_P(LogTest, ErrorJoinsRecords) { @@ -687,12 +663,13 @@ while (get_reader_contents()->size() < log::kBlockSize * 2) { Write("xxxxxxxxxxxxxxxx"); } - std::unique_ptr dest_holder(test::GetWritableFileWriter( - new test::OverwritingStringSink(get_reader_contents()), - "" /* don't care */)); + std::unique_ptr sink( + new test::OverwritingStringSink(get_reader_contents())); + std::unique_ptr dest_holder(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Writer recycle_writer(std::move(dest_holder), 123, true); - recycle_writer.AddRecord(Slice("foooo")); - 
recycle_writer.AddRecord(Slice("bar")); + ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); ASSERT_EQ("foooo", Read()); ASSERT_EQ("bar", Read()); @@ -720,10 +697,9 @@ }; Slice contents_; - std::unique_ptr dest_holder_; + test::StringSink* sink_; std::unique_ptr log_writer_; Env* env_; - EnvOptions env_options_; const std::string test_dir_; const std::string log_file_; std::unique_ptr writer_; @@ -734,61 +710,58 @@ public: RetriableLogTest() : contents_(), - dest_holder_(nullptr), + sink_(new test::StringSink(&contents_)), log_writer_(nullptr), env_(Env::Default()), test_dir_(test::PerThreadDBPath("retriable_log_test")), log_file_(test_dir_ + "/log"), writer_(nullptr), reader_(nullptr), - log_reader_(nullptr) {} + log_reader_(nullptr) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr wfw(new WritableFileWriter( + std::move(sink_holder), "" /* file name */, FileOptions())); + log_writer_.reset(new Writer(std::move(wfw), 123, GetParam())); + } Status SetupTestEnv() { - dest_holder_.reset(test::GetWritableFileWriter( - new test::StringSink(&contents_), "" /* file name */)); - assert(dest_holder_ != nullptr); - log_writer_.reset(new Writer(std::move(dest_holder_), 123, GetParam())); - assert(log_writer_ != nullptr); - Status s; - s = env_->CreateDirIfMissing(test_dir_); - std::unique_ptr writable_file; + FileOptions fopts; + auto fs = env_->GetFileSystem(); + s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr); + std::unique_ptr writable_file; if (s.ok()) { - s = env_->NewWritableFile(log_file_, &writable_file, env_options_); + s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr); } if (s.ok()) { - writer_.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), log_file_, - env_options_)); - assert(writer_ != nullptr); + writer_.reset( + new WritableFileWriter(std::move(writable_file), 
log_file_, fopts)); + EXPECT_NE(writer_, nullptr); } - std::unique_ptr seq_file; + std::unique_ptr seq_file; if (s.ok()) { - s = env_->NewSequentialFile(log_file_, &seq_file, env_options_); + s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr); } if (s.ok()) { - reader_.reset(new SequentialFileReader( - NewLegacySequentialFileWrapper(seq_file), log_file_)); - assert(reader_ != nullptr); + reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); + EXPECT_NE(reader_, nullptr); log_reader_.reset(new FragmentBufferedReader( nullptr, std::move(reader_), &report_, true /* checksum */, 123 /* log_number */)); - assert(log_reader_ != nullptr); + EXPECT_NE(log_reader_, nullptr); } return s; } - std::string contents() { - auto file = test::GetStringSinkFromLegacyWriter(log_writer_->file()); - assert(file != nullptr); - return file->contents_; - } + std::string contents() { return sink_->contents_; } - void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); } + void Encode(const std::string& msg) { + ASSERT_OK(log_writer_->AddRecord(Slice(msg))); + } void Write(const Slice& data) { - writer_->Append(data); - writer_->Sync(true); + ASSERT_OK(writer_->Append(data)); + ASSERT_OK(writer_->Sync(true)); } bool TryRead(std::string* result) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,14 +33,14 @@ Writer::~Writer() { if (dest_) { - WriteBuffer(); + WriteBuffer().PermitUncheckedError(); } } -Status Writer::WriteBuffer() { return dest_->Flush(); } +IOStatus Writer::WriteBuffer() { return dest_->Flush(); } -Status Writer::Close() { - Status s; +IOStatus Writer::Close() { + IOStatus s; if (dest_) { s = dest_->Close(); dest_.reset(); @@ -48,7 +48,7 @@ return s; } 
-Status Writer::AddRecord(const Slice& slice) { +IOStatus Writer::AddRecord(const Slice& slice) { const char* ptr = slice.data(); size_t left = slice.size(); @@ -59,7 +59,7 @@ // Fragment the record if necessary and emit it. Note that if slice // is empty, we still want to iterate once to emit a single // zero-length record - Status s; + IOStatus s; bool begin = true; do { const int64_t leftover = kBlockSize - block_offset_; @@ -114,7 +114,7 @@ bool Writer::TEST_BufferIsEmpty() { return dest_->TEST_BufferIsEmpty(); } -Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { +IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes size_t header_size; @@ -145,14 +145,17 @@ } // Compute the crc of the record type and the payload. - crc = crc32c::Extend(crc, ptr, n); + uint32_t payload_crc = crc32c::Value(ptr, n); + crc = crc32c::Crc32cCombine(crc, payload_crc, n); crc = crc32c::Mask(crc); // Adjust for storage + TEST_SYNC_POINT_CALLBACK("LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", + &crc); EncodeFixed32(buf, crc); // Write the header and the payload - Status s = dest_->Append(Slice(buf, header_size)); + IOStatus s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { - s = dest_->Append(Slice(ptr, n)); + s = dest_->Append(Slice(ptr, n), payload_crc); } block_offset_ += header_size + n; return s; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/log_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/log_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,11 +8,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#include - +#include #include #include "db/log_format.h" +#include "rocksdb/io_status.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -79,16 +79,16 @@ ~Writer(); - Status AddRecord(const Slice& slice); + IOStatus AddRecord(const Slice& slice); WritableFileWriter* file() { return dest_.get(); } const WritableFileWriter* file() const { return dest_.get(); } uint64_t get_log_number() const { return log_number_; } - Status WriteBuffer(); + IOStatus WriteBuffer(); - Status Close(); + IOStatus Close(); bool TEST_BufferIsEmpty(); @@ -103,7 +103,7 @@ // record type stored in the header. uint32_t type_crc_[kMaxRecordType + 1]; - Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); + IOStatus EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); // If true, it does not flush after each write. Instead it relies on the upper // layer to manually does the flush by calling ::WriteBuffer() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/logs_with_prep_tracker.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,8 @@ // #pragma once -#include #include +#include #include #include #include diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/lookup_key.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/lookup_key.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/lookup_key.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/lookup_key.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #pragma once #include #include -#include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/types.h" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/malloc_stats.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/malloc_stats.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/malloc_stats.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/malloc_stats.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,10 +19,10 @@ #ifdef ROCKSDB_JEMALLOC -typedef struct { +struct MallocStatus { char* cur; char* end; -} MallocStatus; +}; static void GetJemallocStatus(void* mstat_arg, const char* status) { MallocStatus* mstat = reinterpret_cast(mstat_arg); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/manual_compaction_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/manual_compaction_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/manual_compaction_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/manual_compaction_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,8 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // // Test for issue 178: a manual compaction causes deleted data to reappear. -#include -#include #include #include "port/port.h" @@ -15,7 +13,19 @@ #include "rocksdb/write_batch.h" #include "test_util/testharness.h" -using namespace ROCKSDB_NAMESPACE; +using ROCKSDB_NAMESPACE::CompactionFilter; +using ROCKSDB_NAMESPACE::CompactionStyle; +using ROCKSDB_NAMESPACE::CompactRangeOptions; +using ROCKSDB_NAMESPACE::CompressionType; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DestroyDB; +using ROCKSDB_NAMESPACE::FlushOptions; +using ROCKSDB_NAMESPACE::Iterator; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteOptions; namespace { @@ -40,8 +50,9 @@ public: ManualCompactionTest() { // Get rid of any state from an old run. 
- dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath("rocksdb_cbug_test"); - DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options()); + dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath( + "rocksdb_manual_compaction_test"); + DestroyDB(dbname_, Options()); } std::string dbname_; @@ -60,28 +71,55 @@ const char* Name() const override { return "DestroyAllCompactionFilter"; } }; +class LogCompactionFilter : public CompactionFilter { + public: + const char* Name() const override { return "LogCompactionFilter"; } + + bool Filter(int level, const Slice& key, const Slice& /*existing_value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + key_level_[key.ToString()] = level; + return false; + } + + void Reset() { key_level_.clear(); } + + size_t NumKeys() const { return key_level_.size(); } + + int KeyLevel(const Slice& key) { + auto it = key_level_.find(key.ToString()); + if (it == key_level_.end()) { + return -1; + } + return it->second; + } + + private: + mutable std::map key_level_; +}; + TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { for (int iter = 0; iter < 2; ++iter) { DB* db; Options options; if (iter == 0) { // level compaction options.num_levels = 3; - options.compaction_style = kCompactionStyleLevel; + options.compaction_style = CompactionStyle::kCompactionStyleLevel; } else { // universal compaction - options.compaction_style = kCompactionStyleUniversal; + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; } options.create_if_missing = true; - options.compression = ROCKSDB_NAMESPACE::kNoCompression; + options.compression = CompressionType::kNoCompression; options.compaction_filter = new DestroyAllCompactionFilter(); ASSERT_OK(DB::Open(options, dbname_, &db)); - db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key3"), Slice("value3")); - db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + 
ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy"))); Slice key4("key4"); - db->CompactRange(CompactRangeOptions(), nullptr, &key4); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &key4)); Iterator* itr = db->NewIterator(ReadOptions()); itr->SeekToFirst(); ASSERT_TRUE(itr->Valid()); @@ -100,46 +138,45 @@ // Open database. Disable compression since it affects the creation // of layers and the code below is trying to test against a very // specific scenario. - ROCKSDB_NAMESPACE::DB* db; - ROCKSDB_NAMESPACE::Options db_options; + DB* db; + Options db_options; db_options.write_buffer_size = 1024; db_options.create_if_missing = true; - db_options.compression = ROCKSDB_NAMESPACE::kNoCompression; - ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(db_options, dbname_, &db)); + db_options.compression = CompressionType::kNoCompression; + ASSERT_OK(DB::Open(db_options, dbname_, &db)); // create first key range - ROCKSDB_NAMESPACE::WriteBatch batch; + WriteBatch batch; for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key1(i), "value for range 1 key"); + ASSERT_OK(batch.Put(Key1(i), "value for range 1 key")); } - ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + ASSERT_OK(db->Write(WriteOptions(), &batch)); // create second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key2(i), "value for range 2 key"); + ASSERT_OK(batch.Put(Key2(i), "value for range 2 key")); } - ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + ASSERT_OK(db->Write(WriteOptions(), &batch)); // delete second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Delete(Key2(i)); + ASSERT_OK(batch.Delete(Key2(i))); } - ASSERT_OK(db->Write(ROCKSDB_NAMESPACE::WriteOptions(), &batch)); + 
ASSERT_OK(db->Write(WriteOptions(), &batch)); // compact database std::string start_key = Key1(0); std::string end_key = Key1(kNumKeys - 1); - ROCKSDB_NAMESPACE::Slice least(start_key.data(), start_key.size()); - ROCKSDB_NAMESPACE::Slice greatest(end_key.data(), end_key.size()); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); // commenting out the line below causes the example to work correctly - db->CompactRange(CompactRangeOptions(), &least, &greatest); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest)); // count the keys - ROCKSDB_NAMESPACE::Iterator* iter = - db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + Iterator* iter = db->NewIterator(ReadOptions()); int num_keys = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { num_keys++; @@ -149,7 +186,119 @@ // close database delete db; - DestroyDB(dbname_, ROCKSDB_NAMESPACE::Options()); + DestroyDB(dbname_, Options()); +} + +TEST_F(ManualCompactionTest, SkipLevel) { + DB* db; + Options options; + options.num_levels = 3; + // Initially, flushed L0 files won't exceed 100. 
+ options.level0_file_num_compaction_trigger = 100; + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.create_if_missing = true; + options.compression = CompressionType::kNoCompression; + LogCompactionFilter* filter = new LogCompactionFilter(); + options.compaction_filter = filter; + ASSERT_OK(DB::Open(options, dbname_, &db)); + + WriteOptions wo; + FlushOptions fo; + ASSERT_OK(db->Put(wo, "1", "")); + ASSERT_OK(db->Flush(fo)); + ASSERT_OK(db->Put(wo, "2", "")); + ASSERT_OK(db->Flush(fo)); + ASSERT_OK(db->Put(wo, "4", "")); + ASSERT_OK(db->Put(wo, "8", "")); + ASSERT_OK(db->Flush(fo)); + + { + // L0: 1, 2, [4, 8] + // no file has keys in range [5, 7] + Slice start("5"); + Slice end("7"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2, [4, 8] + // [3, 7] overlaps with 4 in L0 + Slice start("3"); + Slice end("7"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(2, filter->NumKeys()); + ASSERT_EQ(0, filter->KeyLevel("4")); + ASSERT_EQ(0, filter->KeyLevel("8")); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // no file has keys in range (-inf, 0] + Slice end("0"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // no file has keys in range [9, inf) + Slice start("9"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(0, filter->NumKeys()); + } + + { + // L0: 1, 2 + // L1: [4, 8] + // [2, 2] overlaps with 2 in L0 + Slice start("2"); + Slice end("2"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(1, filter->NumKeys()); + ASSERT_EQ(0, filter->KeyLevel("2")); + } + + { + // L0: 1 + // L1: 2, [4, 8] + // [2, 5] overlaps with 2 and [4, 8) in L1, skip L0 + Slice start("2"); + Slice 
end("5"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_EQ(3, filter->NumKeys()); + ASSERT_EQ(1, filter->KeyLevel("2")); + ASSERT_EQ(1, filter->KeyLevel("4")); + ASSERT_EQ(1, filter->KeyLevel("8")); + } + + { + // L0: 1 + // L1: [2, 4, 8] + // [0, inf) overlaps all files + Slice start("0"); + filter->Reset(); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_EQ(4, filter->NumKeys()); + // 1 is first compacted to L1 and then further compacted into [2, 4, 8], + // so finally the logged level for 1 is L1. + ASSERT_EQ(1, filter->KeyLevel("1")); + ASSERT_EQ(1, filter->KeyLevel("2")); + ASSERT_EQ(1, filter->KeyLevel("4")); + ASSERT_EQ(1, filter->KeyLevel("8")); + } + + delete filter; + delete db; + DestroyDB(dbname_, options); } } // anonymous namespace diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,22 +13,27 @@ #include #include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" +#include "logging/logging.h" #include "memory/arena.h" #include "memory/memory_usage.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" +#include "port/lang.h" #include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/types.h" #include "rocksdb/write_buffer_manager.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" @@ -36,12 +41,11 @@ #include "util/autovector.h" 
#include "util/coding.h" #include "util/mutexlock.h" -#include "util/util.h" namespace ROCKSDB_NAMESPACE { ImmutableMemTableOptions::ImmutableMemTableOptions( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options) : arena_block_size(mutable_cf_options.arena_block_size), memtable_prefix_bloom_bits( @@ -56,12 +60,13 @@ inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), inplace_callback(ioptions.inplace_callback), max_successive_merges(mutable_cf_options.max_successive_merges), - statistics(ioptions.statistics), - merge_operator(ioptions.merge_operator), - info_log(ioptions.info_log) {} + statistics(ioptions.stats), + merge_operator(ioptions.merge_operator.get()), + info_log(ioptions.logger), + allow_data_in_errors(ioptions.allow_data_in_errors) {} MemTable::MemTable(const InternalKeyComparator& cmp, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber latest_seq, uint32_t column_family_id) @@ -79,9 +84,9 @@ mutable_cf_options.memtable_huge_page_size), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), - ioptions.info_log, column_family_id)), + ioptions.logger, column_family_id)), range_del_table_(SkipListFactory().CreateMemTableRep( - comparator_, &arena_, nullptr /* transform */, ioptions.info_log, + comparator_, &arena_, nullptr /* transform */, ioptions.logger, column_family_id)), is_range_del_table_empty_(true), data_size_(0), @@ -101,9 +106,9 @@ : 0), prefix_extractor_(mutable_cf_options.prefix_extractor.get()), flush_state_(FLUSH_NOT_REQUESTED), - env_(ioptions.env), + clock_(ioptions.clock), insert_with_hint_prefix_extractor_( - ioptions.memtable_insert_with_hint_prefix_extractor), + ioptions.memtable_insert_with_hint_prefix_extractor.get()), oldest_key_time_(std::numeric_limits::max()), 
atomic_flush_seqno_(kMaxSequenceNumber), approximate_memory_usage_(0) { @@ -117,7 +122,7 @@ bloom_filter_.reset( new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, 6 /* hard coded 6 probes */, - moptions_.memtable_huge_page_size, ioptions.info_log)); + moptions_.memtable_huge_page_size, ioptions.logger)); } } @@ -220,7 +225,7 @@ uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed); if (oldest_key_time == std::numeric_limits::max()) { int64_t current_time = 0; - auto s = env_->GetCurrentTime(¤t_time); + auto s = clock_->GetCurrentTime(¤t_time); if (s.ok()) { assert(current_time >= 0); // If fail, the timestamp is already set. @@ -327,9 +332,11 @@ PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { // iterator should only use prefix bloom filter - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -344,9 +351,11 @@ PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -375,8 +384,19 @@ PERF_COUNTER_ADD(next_on_memtable_count, 1); assert(Valid()); 
iter_->Next(); + TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_); valid_ = iter_->Valid(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = valid_; + if (is_valid) { + result->key = key(); + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = true; + } + return is_valid; + } void Prev() override { PERF_COUNTER_ADD(prev_on_memtable_count, 1); assert(Valid()); @@ -428,11 +448,13 @@ is_range_del_table_empty_.load(std::memory_order_relaxed)) { return nullptr; } + return NewRangeTombstoneIteratorInternal(read_options, read_seq); +} + +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq) { auto* unfragmented_iter = new MemTableIterator( *this, read_options, nullptr /* arena */, true /* use_range_del_table */); - if (unfragmented_iter == nullptr) { - return nullptr; - } auto fragmented_tombstone_list = std::make_shared( std::unique_ptr(unfragmented_iter), @@ -444,7 +466,7 @@ } port::RWMutex* MemTable::GetLock(const Slice& key) { - return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())]; + return &locks_[GetSliceRangedNPHash(key, locks_.size())]; } MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, @@ -468,10 +490,52 @@ return {entry_count * (data_size / n), entry_count}; } -bool MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, /* user key */ - const Slice& value, bool allow_concurrent, - MemTablePostProcessInfo* post_process_info, void** hint) { +Status MemTable::VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOS64& kv_prot_info) { + uint32_t ikey_len = 0; + if (!GetVarint32(&encoded, &ikey_len)) { + return Status::Corruption("Unable to parse internal key length"); + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (ikey_len < 8 + ts_sz) { + return Status::Corruption("Internal key length too short"); + } + 
if (ikey_len > encoded.size()) { + return Status::Corruption("Internal key length too long"); + } + uint32_t value_len = 0; + const size_t user_key_len = ikey_len - 8; + Slice key(encoded.data(), user_key_len); + encoded.remove_prefix(user_key_len); + + uint64_t packed = DecodeFixed64(encoded.data()); + ValueType value_type = kMaxValue; + SequenceNumber sequence_number = kMaxSequenceNumber; + UnPackSequenceAndType(packed, &sequence_number, &value_type); + encoded.remove_prefix(8); + + if (!GetVarint32(&encoded, &value_len)) { + return Status::Corruption("Unable to parse value length"); + } + if (value_len < encoded.size()) { + return Status::Corruption("Value length too short"); + } + if (value_len > encoded.size()) { + return Status::Corruption("Value length too long"); + } + Slice value(encoded.data(), value_len); + + return kv_prot_info.StripS(sequence_number) + .StripKVO(key, value, value_type) + .GetStatus(); +} + +Status MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info, + bool allow_concurrent, + MemTablePostProcessInfo* post_process_info, void** hint) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -498,7 +562,17 @@ p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + if (kv_prot_info != nullptr) { + Slice encoded(buf, encoded_len); + TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded); + Status status = VerifyEncodedEntry(encoded, *kv_prot_info); + if (!status.ok()) { + return status; + } + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz); if (!allow_concurrent) { // Extract prefix for insert with hint. 
@@ -507,12 +581,12 @@ Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } else { bool res = table->InsertKey(handle); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } @@ -528,11 +602,11 @@ } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->Add(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->Add(key_without_ts); } // The first sequence number inserted into the memtable @@ -553,7 +627,7 @@ ? table->InsertKeyConcurrently(handle) : table->InsertKeyWithHintConcurrently(handle, hint); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } assert(post_process_info != nullptr); @@ -564,11 +638,12 @@ } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->AddConcurrently( + prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->AddConcurrently(key_without_ts); } // atomically update first_seqno_ and earliest_seqno_. 
@@ -587,7 +662,7 @@ is_range_del_table_empty_.store(false, std::memory_order_relaxed); } UpdateOldestKeyTime(); - return true; + return Status::OK(); } // Callback from MemTable::Get() @@ -600,6 +675,7 @@ bool* merge_in_progress; std::string* value; SequenceNumber seq; + std::string* timestamp; const MergeOperator* merge_operator; // the merge operations encountered; MergeContext* merge_context; @@ -609,10 +685,11 @@ Statistics* statistics; bool inplace_update_support; bool do_merge; - Env* env_; + SystemClock* clock; + ReadCallback* callback_; bool* is_blob_index; - + bool allow_data_in_errors; bool CheckCallback(SequenceNumber _seq) { if (callback_) { return callback_->IsVisible(_seq); @@ -640,12 +717,15 @@ // Check that it belongs to same user key. We do not check the // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. - uint32_t key_length; + uint32_t key_length = 0; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + assert(key_length >= 8); Slice user_key_slice = Slice(key_ptr, key_length - 8); - if (s->mem->GetInternalKeyComparator() - .user_comparator() - ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { + const Comparator* user_comparator = + s->mem->GetInternalKeyComparator().user_comparator(); + size_t ts_sz = user_comparator->timestamp_size(); + if (user_comparator->EqualWithoutTimestamp(user_key_slice, + s->key->user_key())) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -690,7 +770,7 @@ *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), &v, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { // Preserve the value with the goal of returning it as part of @@ -713,9 +793,15 @@ if (s->is_blob_index != nullptr) { *(s->is_blob_index) = 
(type == kTypeBlobIndex); } + + if (ts_sz > 0 && s->timestamp != nullptr) { + Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz); + s->timestamp->assign(ts.data(), ts.size()); + } return false; } case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: { if (*(s->merge_in_progress)) { @@ -723,7 +809,7 @@ *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { *(s->status) = Status::NotFound(); @@ -751,15 +837,24 @@ *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, - s->env_, nullptr /* result_operand */, true); + s->clock, nullptr /* result_operand */, true); *(s->found_final_value) = true; return false; } return true; } - default: - assert(false); - return true; + default: { + std::string msg("Corrupted value not expected."); + if (s->allow_data_in_errors) { + msg.append("Unrecognized value type: " + + std::to_string(static_cast(type)) + ". "); + msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) + + ". 
"); + msg.append("seq: " + std::to_string(seq) + "."); + } + *(s->status) = Status::Corruption(msg.c_str()); + return false; + } } } @@ -767,7 +862,8 @@ return false; } -bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, +bool MemTable::Get(const LookupKey& key, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, @@ -788,22 +884,21 @@ range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key())); } - Slice user_key = key.user_key(); bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = - bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); + may_contain = bloom_filter_->MayContain(user_key_without_ts); } else { assert(prefix_extractor_); - may_contain = - !prefix_extractor_->InDomain(user_key) || - bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); + may_contain = !prefix_extractor_->InDomain(user_key_without_ts) || + bloom_filter_->MayContain( + prefix_extractor_->Transform(user_key_without_ts)); } } @@ -816,7 +911,7 @@ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback, - is_blob_index, value, s, merge_context, seq, + is_blob_index, value, timestamp, s, merge_context, seq, &found_final_value, &merge_in_progress); } @@ -831,7 +926,8 @@ void MemTable::GetFromTable(const LookupKey& key, SequenceNumber max_covering_tombstone_seq, bool do_merge, ReadCallback* callback, - bool* is_blob_index, std::string* value, 
Status* s, + bool* is_blob_index, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* seq, bool* found_final_value, bool* merge_in_progress) { Saver saver; @@ -840,6 +936,7 @@ saver.merge_in_progress = merge_in_progress; saver.key = &key; saver.value = value; + saver.timestamp = timestamp; saver.seq = kMaxSequenceNumber; saver.mem = this; saver.merge_context = merge_context; @@ -848,16 +945,17 @@ saver.logger = moptions_.info_log; saver.inplace_update_support = moptions_.inplace_update_support; saver.statistics = moptions_.statistics; - saver.env_ = env_; + saver.clock = clock_; saver.callback_ = callback; saver.is_blob_index = is_blob_index; saver.do_merge = do_merge; + saver.allow_data_in_errors = moptions_.allow_data_in_errors; table_->Get(key, &saver, SaveValue); *seq = saver.seq; } void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -865,52 +963,59 @@ } PERF_TIMER_GUARD(get_from_memtable_time); + // For now, memtable Bloom filter is effectively disabled if there are any + // range tombstones. This is the simplest way to ensure range tombstones are + // handled. 
TODO: allow Bloom checks where max_covering_tombstone_seq==0 + bool no_range_del = read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed); MultiGetRange temp_range(*range, range->begin(), range->end()); - if (bloom_filter_) { - std::array keys; - std::array may_match = {{true}}; - autovector prefixes; + if (bloom_filter_ && no_range_del) { + bool whole_key = + !prefix_extractor_ || moptions_.memtable_whole_key_filtering; + std::array bloom_keys; + std::array may_match; + std::array range_indexes; int num_keys = 0; for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (!prefix_extractor_) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor_->InDomain(iter->ukey)) { - prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey)); - keys[num_keys++] = &prefixes.back(); - } - } - bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); - int idx = 0; - for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) { + if (whole_key) { + bloom_keys[num_keys] = iter->ukey_without_ts; + range_indexes[num_keys++] = iter.index(); + } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) { + bloom_keys[num_keys] = + prefix_extractor_->Transform(iter->ukey_without_ts); + range_indexes[num_keys++] = iter.index(); + } else { + // TODO: consider not counting these as Bloom hits to more closely + // match bloom_sst_hit_count PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); - continue; } - if (!may_match[idx]) { - temp_range.SkipKey(iter); + } + bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]); + for (int i = 0; i < num_keys; ++i) { + if (!may_match[i]) { + temp_range.SkipIndex(range_indexes[i]); PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); } else { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } - idx++; } } for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - 
SequenceNumber seq = kMaxSequenceNumber; bool found_final_value{false}; bool merge_in_progress = iter->s->IsMergeInProgress(); - std::unique_ptr range_del_iter( - NewRangeTombstoneIterator( - read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); - if (range_del_iter != nullptr) { + if (!no_range_del) { + std::unique_ptr range_del_iter( + NewRangeTombstoneIteratorInternal( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); iter->max_covering_tombstone_seq = std::max( iter->max_covering_tombstone_seq, range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); } + SequenceNumber dummy_seq; GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, - callback, is_blob, iter->value->GetSelf(), iter->s, - &(iter->merge_context), &seq, &found_final_value, - &merge_in_progress); + callback, &iter->is_blob_index, iter->value->GetSelf(), + iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq, + &found_final_value, &merge_in_progress); if (!found_final_value && merge_in_progress) { *(iter->s) = Status::MergeInProgress(); @@ -918,16 +1023,26 @@ if (found_final_value) { iter->value->PinSelf(); + range->AddValueSize(iter->value->size()); range->MarkKeyDone(iter); RecordTick(moptions_.statistics, MEMTABLE_HIT); + if (range->GetValueSize() > read_options.value_size_soft_limit) { + // Set all remaining keys in range to Abort + for (auto range_iter = range->begin(); range_iter != range->end(); + ++range_iter) { + range->MarkKeyDone(range_iter); + *(range_iter->s) = Status::Aborted(); + } + break; + } } } PERF_COUNTER_ADD(get_from_memtable_count, 1); } -void MemTable::Update(SequenceNumber seq, - const Slice& key, - const Slice& value) { +Status MemTable::Update(SequenceNumber seq, const Slice& key, + const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info) { LookupKey lkey(key, seq); Slice mem_key = lkey.memtable_key(); @@ -971,22 +1086,26 @@ (unsigned)(VarintLength(key_length) + key_length + VarintLength(value.size()) + 
value.size())); RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); - return; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. + updated_kv_prot_info.UpdateS(seq, existing_seq); + Slice encoded(entry, p + value.size() - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } } } } - // key doesn't exist - bool add_res __attribute__((__unused__)); - add_res = Add(seq, kTypeValue, key, value); - // We already checked unused != seq above. In that case, Add should not fail. - assert(add_res); + // The latest value is not `kTypeValue` or key doesn't exist + return Add(seq, kTypeValue, key, value, kv_prot_info); } -bool MemTable::UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta) { +Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOS64* kv_prot_info) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -1012,8 +1131,8 @@ // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; - uint64_t unused; - UnPackSequenceAndType(tag, &unused, &type); + uint64_t existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); switch (type) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); @@ -1040,16 +1159,35 @@ } RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); UpdateFlushState(); - return true; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. 
+ updated_kv_prot_info.UpdateS(seq, existing_seq); + updated_kv_prot_info.UpdateV(delta, + Slice(prev_buffer, new_prev_size)); + Slice encoded(entry, prev_buffer + new_prev_size - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } else if (status == UpdateStatus::UPDATED) { - Add(seq, kTypeValue, key, Slice(str_value)); + Status s; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(delta, str_value); + s = Add(seq, kTypeValue, key, Slice(str_value), + &updated_kv_prot_info); + } else { + s = Add(seq, kTypeValue, key, Slice(str_value), + nullptr /* kv_prot_info */); + } RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); UpdateFlushState(); - return true; + return s; } else if (status == UpdateStatus::UPDATE_FAILED) { - // No action required. Return. + // `UPDATE_FAILED` is named incorrectly. It indicates no update + // happened. It does not indicate a failure happened. 
UpdateFlushState(); - return true; + return Status::OK(); } } default: @@ -1057,9 +1195,8 @@ } } } - // If the latest value is not kTypeValue - // or key doesn't exist - return false; + // The latest value is not `kTypeValue` or key doesn't exist + return Status::NotFound(); } size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,8 +14,11 @@ #include #include #include +#include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" @@ -24,7 +27,6 @@ #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "table/multiget_context.h" #include "util/dynamic_bloom.h" @@ -36,9 +38,10 @@ class Mutex; class MemTableIterator; class MergeContext; +class SystemClock; struct ImmutableMemTableOptions { - explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, + explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options); size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; @@ -54,6 +57,7 @@ Statistics* statistics; MergeOperator* merge_operator; Logger* info_log; + bool allow_data_in_errors; }; // Batched counters to updated when inserting keys in one write batch. @@ -69,7 +73,7 @@ // Note: Many of the methods in this class have comments indicating that // external synchronization is required as these methods are not thread-safe. // It is up to higher layers of code to decide how to prevent concurrent -// invokation of these methods. 
This is usually done by acquiring either +// invocation of these methods. This is usually done by acquiring either // the db mutex or the single writer thread. // // Some of these methods are documented to only require external @@ -100,7 +104,7 @@ // used, but this may prevent some transactions from succeeding until the // first key is inserted into the memtable. explicit MemTable(const InternalKeyComparator& comparator, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber earliest_seq, uint32_t column_family_id); @@ -136,12 +140,39 @@ // operations on the same MemTable (unless this Memtable is immutable). size_t ApproximateMemoryUsage(); - // As a cheap version of `ApproximateMemoryUsage()`, this function doens't + // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't // require external synchronization. The value may be less accurate though size_t ApproximateMemoryUsageFast() const { return approximate_memory_usage_.load(std::memory_order_relaxed); } + // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast + size_t MemoryAllocatedBytes() const { + return table_->ApproximateMemoryUsage() + + range_del_table_->ApproximateMemoryUsage() + + arena_.MemoryAllocatedBytes(); + } + + // Returns a vector of unique random memtable entries of size 'sample_size'. + // + // Note: the entries are stored in the unordered_set as length-prefixed keys, + // hence their representation in the set as "const char*". + // Note2: the size of the output set 'entries' is not enforced to be strictly + // equal to 'target_sample_size'. Its final size might be slightly + // greater or slightly less than 'target_sample_size' + // + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + // REQUIRES: SkipList memtable representation. 
This function is not + // implemented for any other type of memtable representation (vectorrep, + // hashskiplist,...). + void UniqueRandomSample(const uint64_t& target_sample_size, + std::unordered_set* entries) { + // TODO(bjlemaire): at the moment, only supported by skiplistrep. + // Extend it to all other memtable representations. + table_->UniqueRandomSample(num_entries(), target_sample_size, entries); + } + // This method heuristically determines if the memtable should continue to // host more data. bool ShouldScheduleFlush() const { @@ -174,6 +205,9 @@ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options, SequenceNumber read_seq); + Status VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOS64& kv_prot_info); + // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. // Typically value will be empty if type==kTypeDeletion. @@ -181,12 +215,14 @@ // REQUIRES: if allow_concurrent = false, external synchronization to prevent // simultaneous operations on the same MemTable. // - // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and - // the already exists. - bool Add(SequenceNumber seq, ValueType type, const Slice& key, - const Slice& value, bool allow_concurrent = false, - MemTablePostProcessInfo* post_process_info = nullptr, - void** hint = nullptr); + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. + Status Add(SequenceNumber seq, ValueType type, const Slice& key, + const Slice& value, const ProtectionInfoKVOS64* kv_prot_info, + bool allow_concurrent = false, + MemTablePostProcessInfo* post_process_info = nullptr, + void** hint = nullptr); // Used to Get value associated with key or Get Merge Operands associated // with key. 
@@ -212,50 +248,62 @@ MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr, bool do_merge = true) { + return Get(key, value, /*timestamp=*/nullptr, s, merge_context, + max_covering_tombstone_seq, seq, read_opts, callback, + is_blob_index, do_merge); + } + + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, bool do_merge = true); - bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, bool do_merge = true) { SequenceNumber seq; - return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index, do_merge); + return Get(key, value, timestamp, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, callback, + is_blob_index, do_merge); } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); + ReadCallback* callback); - // Attempts to update the new_value inplace, else does normal Add - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // if new sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else add(key, new_value) + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. 
Otherwise + // adds the new value to the memtable out-of-place. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. - void Update(SequenceNumber seq, - const Slice& key, - const Slice& value); - - // If prev_value for key exists, attempts to update it inplace. - // else returns false - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // new_value = delta(prev_value) - // if sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else return false + Status Update(SequenceNumber seq, const Slice& key, const Slice& value, + const ProtectionInfoKVOS64* kv_prot_info); + + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // if `key` exists in current memtable with type `kTypeValue`, adds the new + // value to the memtable out-of-place. + // + // Returns `Status::NotFound` if `key` does not exist in current memtable or + // the latest version of `key` does not have `kTypeValue`. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. 
- bool UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta); + Status UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOS64* kv_prot_info); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -321,6 +369,14 @@ return first_seqno_.load(std::memory_order_relaxed); } + // Returns the sequence number of the first element that was inserted + // into the memtable. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). + void SetFirstSequenceNumber(SequenceNumber first_seqno) { + return first_seqno_.store(first_seqno, std::memory_order_relaxed); + } + // Returns the sequence number that is guaranteed to be smaller than or equal // to the sequence number of any key that could be inserted into this // memtable. It can then be assumed that any write with a larger(or equal) @@ -332,6 +388,15 @@ return earliest_seqno_.load(std::memory_order_relaxed); } + // Sets the sequence number that is guaranteed to be smaller than or equal + // to the sequence number of any key that could be inserted into this + // memtable. It can then be assumed that any write with a larger(or equal) + // sequence number will be present in this memtable or a later memtable. + // Used only for MemPurge operation + void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) { + return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed); + } + // DB's latest sequence ID when the memtable is created. This number // may be updated to a more recent one before any key is inserted. 
SequenceNumber GetCreationSeq() const { return creation_seq_; } @@ -434,6 +499,9 @@ } #endif // !ROCKSDB_LITE + // Returns a heuristic flush decision + bool ShouldFlushNow(); + private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; @@ -492,7 +560,7 @@ std::atomic flush_state_; - Env* env_; + SystemClock* clock_; // Extract sequential insert prefixes. const SliceTransform* insert_with_hint_prefix_extractor_; @@ -513,7 +581,7 @@ SequenceNumber atomic_flush_seqno_; // keep track of memory usage in table_, arena_, and range_del_table_. - // Gets refrshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` + // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` std::atomic approximate_memory_usage_; #ifndef ROCKSDB_LITE @@ -521,9 +589,6 @@ std::unique_ptr flush_job_info_; #endif // !ROCKSDB_LITE - // Returns a heuristic flush decision - bool ShouldFlushNow(); - // Updates flush_state_ using ShouldFlushNow() void UpdateFlushState(); @@ -532,9 +597,13 @@ void GetFromTable(const LookupKey& key, SequenceNumber max_covering_tombstone_seq, bool do_merge, ReadCallback* callback, bool* is_blob_index, - std::string* value, Status* s, MergeContext* merge_context, - SequenceNumber* seq, bool* found_final_value, - bool* merge_in_progress); + std::string* value, std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress); + + // Always returns non-null and assumes certain pre-checks are done + FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal( + const ReadOptions& read_options, SequenceNumber read_seq); }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,15 +5,18 @@ // #include "db/memtable_list.h" +#include #include #include #include #include + #include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -43,22 +46,20 @@ } MemTableListVersion::MemTableListVersion( - size_t* parent_memtable_list_memory_usage, MemTableListVersion* old) + size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old) : max_write_buffer_number_to_maintain_( - old->max_write_buffer_number_to_maintain_), + old.max_write_buffer_number_to_maintain_), max_write_buffer_size_to_maintain_( - old->max_write_buffer_size_to_maintain_), + old.max_write_buffer_size_to_maintain_), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) { - if (old != nullptr) { - memlist_ = old->memlist_; - for (auto& m : memlist_) { - m->Ref(); - } + memlist_ = old.memlist_; + for (auto& m : memlist_) { + m->Ref(); + } - memlist_history_ = old->memlist_history_; - for (auto& m : memlist_history_) { - m->Ref(); - } + memlist_history_ = old.memlist_history_; + for (auto& m : memlist_history_) { + m->Ref(); } } @@ -104,20 +105,21 @@ // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. 
bool MemTableListVersion::Get(const LookupKey& key, std::string* value, - Status* s, MergeContext* merge_context, + std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) { - return GetFromList(&memlist_, key, value, s, merge_context, + return GetFromList(&memlist_, key, value, timestamp, s, merge_context, max_covering_tombstone_seq, seq, read_opts, callback, is_blob_index); } void MemTableListVersion::MultiGet(const ReadOptions& read_options, - MultiGetRange* range, ReadCallback* callback, - bool* is_blob) { + MultiGetRange* range, + ReadCallback* callback) { for (auto memtable : memlist_) { - memtable->MultiGet(read_options, range, callback, is_blob); + memtable->MultiGet(read_options, range, callback); if (range->empty()) { return; } @@ -128,9 +130,9 @@ const LookupKey& key, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { for (MemTable* memtable : memlist_) { - bool done = memtable->Get(key, nullptr, s, merge_context, - max_covering_tombstone_seq, read_opts, nullptr, - nullptr, false); + bool done = memtable->Get(key, /*value*/ nullptr, /*timestamp*/ nullptr, s, + merge_context, max_covering_tombstone_seq, + read_opts, nullptr, nullptr, false); if (done) { return true; } @@ -139,17 +141,17 @@ } bool MemTableListVersion::GetFromHistory( - const LookupKey& key, std::string* value, Status* s, + const LookupKey& key, std::string* value, std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { - return GetFromList(&memlist_history_, key, value, s, merge_context, + return GetFromList(&memlist_history_, key, value, timestamp, s, merge_context, max_covering_tombstone_seq, seq, read_opts, nullptr /*read_callback*/, is_blob_index); } 
bool MemTableListVersion::GetFromList( std::list* list, const LookupKey& key, std::string* value, - Status* s, MergeContext* merge_context, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) { *seq = kMaxSequenceNumber; @@ -157,9 +159,9 @@ for (auto& memtable : *list) { SequenceNumber current_seq = kMaxSequenceNumber; - bool done = - memtable->Get(key, value, s, merge_context, max_covering_tombstone_seq, - ¤t_seq, read_opts, callback, is_blob_index); + bool done = memtable->Get(key, value, timestamp, s, merge_context, + max_covering_tombstone_seq, ¤t_seq, + read_opts, callback, is_blob_index); if (*seq == kMaxSequenceNumber) { // Store the most recent sequence number of any operation on this key. // Since we only care about the most recent change, we only need to @@ -257,8 +259,8 @@ void MemTableListVersion::Add(MemTable* m, autovector* to_delete) { assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable AddMemTable(m); - - TrimHistory(to_delete, m->ApproximateMemoryUsage()); + // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast + TrimHistory(to_delete, 0); } // Removes m from list of memtables not flushed. Caller should NOT Unref m. 
@@ -280,16 +282,16 @@ } // return the total memory usage assuming the oldest flushed memtable is dropped -size_t MemTableListVersion::ApproximateMemoryUsageExcludingLast() const { +size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const { size_t total_memtable_size = 0; for (auto& memtable : memlist_) { - total_memtable_size += memtable->ApproximateMemoryUsage(); + total_memtable_size += memtable->MemoryAllocatedBytes(); } for (auto& memtable : memlist_history_) { - total_memtable_size += memtable->ApproximateMemoryUsage(); + total_memtable_size += memtable->MemoryAllocatedBytes(); } if (!memlist_history_.empty()) { - total_memtable_size -= memlist_history_.back()->ApproximateMemoryUsage(); + total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes(); } return total_memtable_size; } @@ -299,7 +301,7 @@ // calculate the total memory usage after dropping the oldest flushed // memtable, compare with max_write_buffer_size_to_maintain_ to decide // whether to trim history - return ApproximateMemoryUsageExcludingLast() + usage >= + return MemoryAllocatedBytesExcludingLast() + usage >= static_cast(max_write_buffer_size_to_maintain_); } else if (max_write_buffer_number_to_maintain_ > 0) { return memlist_.size() + memlist_history_.size() > @@ -310,14 +312,17 @@ } // Make sure we don't use up too much space in history -void MemTableListVersion::TrimHistory(autovector* to_delete, +bool MemTableListVersion::TrimHistory(autovector* to_delete, size_t usage) { + bool ret = false; while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) { MemTable* x = memlist_history_.back(); memlist_history_.pop_back(); UnrefMemTable(to_delete, x); + ret = true; } + return ret; } // Returns true if there is at least one memtable on which flush has @@ -332,18 +337,26 @@ } // Returns the memtables that need to be flushed. 
-void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, +void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, autovector* ret) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); const auto& memlist = current_->memlist_; bool atomic_flush = false; + + // Note: every time MemTableList::Add(mem) is called, it adds the new mem + // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by + // iterating through the memlist starting at the end, the vector + // ret is filled with memtables already sorted in increasing MemTable ID. + // However, when the mempurge feature is activated, new memtables with older + // IDs will be added to the memlist. Therefore we std::sort(ret) at the end to + // return a vector of memtables sorted by increasing memtable ID. for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { atomic_flush = true; } - if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { + if (m->GetID() > max_memtable_id) { break; } if (!m->flush_in_progress_) { @@ -359,6 +372,15 @@ if (!atomic_flush || num_flush_not_started_ == 0) { flush_requested_ = false; // start-flush request is complete } + + // Sort the list of memtables by increasing memtable ID. + // This is useful when the mempurge feature is activated + // and the memtables are not guaranteed to be sorted in + // the memlist vector. 
+ std::sort(ret->begin(), ret->end(), + [](const MemTable* m1, const MemTable* m2) -> bool { + return m1->GetID() < m2->GetID(); + }); } void MemTableList::RollbackMemtableFlush(const autovector& mems, @@ -387,9 +409,10 @@ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, const autovector& mems, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, - autovector* to_delete, Directory* db_directory, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer, - std::list>* committed_flush_jobs_info) { + std::list>* committed_flush_jobs_info, + IOStatus* io_s, bool write_edits) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); @@ -442,9 +465,18 @@ } if (it == memlist.rbegin() || batch_file_number != m->file_number_) { batch_file_number = m->file_number_; - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 " started", - cfd->GetName().c_str(), m->file_number_); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 " started", + cfd->GetName().c_str(), m->file_number_); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files) started", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size()); + } + edit_list.push_back(&m->edit_); memtables_to_flush.push_back(m); #ifndef ROCKSDB_LITE @@ -461,67 +493,67 @@ // TODO(myabandeh): Not sure how batch_count could be 0 here. 
if (batch_count > 0) { + uint64_t min_wal_number_to_keep = 0; + assert(edit_list.size() > 0); if (vset->db_options()->allow_2pc) { - assert(edit_list.size() > 0); - // We piggyback the information of earliest log file to keep in the + // Note that if mempurge is successful, the edit_list will + // not be applicable (contains info of new min_log number to keep, + // and level 0 file path of SST file created during normal flush, + // so both pieces of information are irrelevant after a successful + // mempurge operation). + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, *cfd, edit_list, memtables_to_flush, prep_tracker); + + // We piggyback the information of earliest log file to keep in the // manifest entry for the last file flushed. - edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep( - vset, *cfd, edit_list, memtables_to_flush, prep_tracker)); + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); } + edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep); - // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, - db_directory); - - // we will be changing the version in the next code path, - // so we better create a new one, since versions are immutable - InstallNewVersion(); - - // All the later memtables that have the same filenum - // are part of the same batch. They can be committed now. - uint64_t mem_id = 1; // how many memtables have been flushed. - - // commit new state only if the column family is NOT dropped. - // The reason is as follows (refer to - // ColumnFamilyTest.FlushAndDropRaceCondition). - // If the column family is dropped, then according to LogAndApply, its - // corresponding flush operation is NOT written to the MANIFEST. This - // means the DB is not aware of the L0 files generated from the flush. - // By committing the new state, we remove the memtable from the memtable - // list. 
Creating an iterator on this column family will not be able to - // read full data since the memtable is removed, and the DB is not aware - // of the L0 files, causing MergingIterator unable to build child - // iterators. RocksDB contract requires that the iterator can be created - // on a dropped column family, and we must be able to - // read full data as long as column family handle is not deleted, even if - // the column family is dropped. - if (s.ok() && !cfd->IsDropped()) { // commit new state - while (batch_count-- > 0) { - MemTable* m = current_->memlist_.back(); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - cfd->GetName().c_str(), m->file_number_, mem_id); - assert(m->file_number_ > 0); - current_->Remove(m, to_delete); - UpdateCachedValuesFromMemTableListVersion(); - ResetTrimHistoryNeeded(); - ++mem_id; + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (min_wal_number_to_keep > + vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_list.push_back(wal_deletion.get()); } + TEST_SYNC_POINT_CALLBACK( + "MemTableList::TryInstallMemtableFlushResults:" + "AfterComputeMinWalToKeep", + nullptr); + } + + const auto manifest_write_cb = [this, cfd, batch_count, log_buffer, + to_delete, mu](const Status& status) { + RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer, + to_delete, mu); + }; + if (write_edits) { + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, + db_directory, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, + manifest_write_cb); + *io_s = vset->io_status(); } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { - MemTable* m = *it; - // commit failed. setup state so that we can flush again. 
- ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - m->file_number_, mem_id); - m->flush_completed_ = false; - m->flush_in_progress_ = false; - m->edit_.Clear(); - num_flush_not_started_++; - m->file_number_ = 0; - imm_flush_needed.store(true, std::memory_order_release); - ++mem_id; - } + // If write_edit is false (e.g: successful mempurge), + // then remove old memtables, wake up manifest write queue threads, + // and don't commit anything to the manifest file. + RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer, + to_delete, mu); + // Note: cfd->SetLogNumber is only called when a VersionEdit + // is written to MANIFEST. When mempurge is succesful, we skip + // this step, therefore cfd->GetLogNumber is always is + // earliest log with data unflushed. + // Notify new head of manifest write queue. + // wake up all the waiting writers + // TODO(bjlemaire): explain full reason WakeUpWaitingManifestWriters + // needed or investigate more. + vset->WakeUpWaitingManifestWriters(); + *io_s = IOStatus::OK(); } } } @@ -535,7 +567,7 @@ InstallNewVersion(); // this method is used to move mutable memtable into an immutable list. // since mutable memtable is already refcounted by the DBImpl, - // and when moving to the imutable list we don't unref it, + // and when moving to the immutable list we don't unref it, // we don't have to ref the memtable here. we just take over the // reference from the DBImpl. current_->Add(m, to_delete); @@ -548,11 +580,12 @@ ResetTrimHistoryNeeded(); } -void MemTableList::TrimHistory(autovector* to_delete, size_t usage) { +bool MemTableList::TrimHistory(autovector* to_delete, size_t usage) { InstallNewVersion(); - current_->TrimHistory(to_delete, usage); + bool ret = current_->TrimHistory(to_delete, usage); UpdateCachedValuesFromMemTableListVersion(); ResetTrimHistoryNeeded(); + return ret; } // Returns an estimate of the number of bytes of data in use. 
@@ -566,9 +599,9 @@ size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; } -size_t MemTableList::ApproximateMemoryUsageExcludingLast() const { - const size_t usage = - current_memory_usage_excluding_last_.load(std::memory_order_relaxed); +size_t MemTableList::MemoryAllocatedBytesExcludingLast() const { + const size_t usage = current_memory_allocted_bytes_excluding_last_.load( + std::memory_order_relaxed); return usage; } @@ -579,9 +612,9 @@ void MemTableList::UpdateCachedValuesFromMemTableListVersion() { const size_t total_memtable_size = - current_->ApproximateMemoryUsageExcludingLast(); - current_memory_usage_excluding_last_.store(total_memtable_size, - std::memory_order_relaxed); + current_->MemoryAllocatedBytesExcludingLast(); + current_memory_allocted_bytes_excluding_last_.store( + total_memtable_size, std::memory_order_relaxed); const bool has_history = current_->HasHistory(); current_has_history_.store(has_history, std::memory_order_relaxed); @@ -600,27 +633,99 @@ } else { // somebody else holds the current version, we need to create new one MemTableListVersion* version = current_; - current_ = new MemTableListVersion(¤t_memory_usage_, current_); + current_ = new MemTableListVersion(¤t_memory_usage_, *version); current_->Ref(); version->Unref(); } } +void MemTableList::RemoveMemTablesOrRestoreFlags( + const Status& s, ColumnFamilyData* cfd, size_t batch_count, + LogBuffer* log_buffer, autovector* to_delete, + InstrumentedMutex* mu) { + assert(mu); + mu->AssertHeld(); + assert(to_delete); + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. + + // commit new state only if the column family is NOT dropped. 
+ // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); + ++mem_id; + } + } else { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. 
+ if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64 + " failed", + m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } + } +} + uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush) { + const std::unordered_set* memtables_to_flush) { uint64_t min_log = 0; for (auto& m : current_->memlist_) { - // Assume the list is very short, we can live with O(m*n). We can optimize - // if the performance has some problem. - bool should_skip = false; - for (MemTable* m_to_flush : memtables_to_flush) { - if (m == m_to_flush) { - should_skip = true; - break; - } - } - if (should_skip) { + if (memtables_to_flush && memtables_to_flush->count(m)) { continue; } @@ -640,8 +745,11 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_metas, - autovector* to_delete, Directory* db_directory, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_metas, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); @@ -652,6 +760,10 @@ if (imm_lists != nullptr) { assert(imm_lists->size() == num); } + if (num == 0) { + return Status::OK(); + } + for (size_t k = 0; k != num; ++k) { #ifndef NDEBUG const auto* imm = @@ -666,6 +778,17 @@ 
(*mems_list[k])[i]->SetFlushCompleted(true); (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber()); } +#ifndef ROCKSDB_LITE + if (committed_flush_jobs_info[k]) { + assert(!mems_list[k]->empty()); + assert((*mems_list[k])[0]); + std::unique_ptr flush_job_info = + (*mems_list[k])[0]->ReleaseFlushJobInfo(); + committed_flush_jobs_info[k]->push_back(std::move(flush_job_info)); + } +#else //! ROCKSDB_LITE + (void)committed_flush_jobs_info; +#endif // ROCKSDB_LITE } Status s; @@ -680,12 +803,36 @@ ++num_entries; edit_lists.emplace_back(edits); } + + WalNumber min_wal_number_to_keep = 0; + if (vset->db_options()->allow_2pc) { + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, cfds, edit_lists, mems_list, prep_tracker); + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); + } + edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep); + + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_lists.back().push_back(wal_deletion.get()); + ++num_entries; + } + } + // Mark the version edits as an atomic group if the number of version edits // exceeds 1. 
if (cfds.size() > 1) { - for (auto& edits : edit_lists) { - assert(edits.size() == 1); - edits[0]->MarkAtomicGroup(--num_entries); + for (size_t i = 0; i < edit_lists.size(); i++) { + assert((edit_lists[i].size() == 1) || + ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1))); + for (auto& e : edit_lists[i]) { + e->MarkAtomicGroup(--num_entries); + } } assert(0 == num_entries); } @@ -708,11 +855,25 @@ for (auto m : *mems_list[i]) { assert(m->GetFileNumber() > 0); uint64_t mem_id = m->GetID(); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - cfds[i]->GetName().c_str(), m->GetFileNumber(), - mem_id); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + imm->current_->Remove(m, to_delete); imm->UpdateCachedValuesFromMemTableListVersion(); imm->ResetTrimHistoryNeeded(); @@ -723,11 +884,25 @@ auto* imm = (imm_lists == nullptr) ? 
cfds[i]->imm() : imm_lists->at(i); for (auto m : *mems_list[i]) { uint64_t mem_id = m->GetID(); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - cfds[i]->GetName().c_str(), m->GetFileNumber(), - mem_id); + + const VersionEdit* const edit = m->GetEdits(); + assert(edit); + + if (edit->GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + edit->GetBlobFileAdditions().size(), mem_id); + } + m->SetFlushCompleted(false); m->SetFlushInProgress(false); m->GetEdits()->Clear(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include -#include "db/dbformat.h" #include "db/logs_with_prep_tracker.h" #include "db/memtable.h" #include "db/range_del_aggregator.h" @@ -44,7 +43,7 @@ class MemTableListVersion { public: explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, - MemTableListVersion* old = nullptr); + const MemTableListVersion& old); explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, int max_write_buffer_number_to_maintain, int64_t max_write_buffer_size_to_maintain); @@ -58,24 +57,25 @@ // If any operation was found for this key, its most recent sequence number // will be stored in *seq on success (regardless of whether true/false is // returned). Otherwise, *seq will be set to kMaxSequenceNumber. 
- bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); - bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, + bool Get(const LookupKey& key, std::string* value, std::string* timestamp, + Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr) { SequenceNumber seq; - return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index); + return Get(key, value, timestamp, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, callback, + is_blob_index); } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); + ReadCallback* callback); // Returns all the merge operands corresponding to the key by searching all // memtables starting from the most recent one. @@ -88,18 +88,20 @@ // have already been flushed. Should only be used from in-memory only // queries (such as Transaction validation) as the history may contain // writes that are also present in the SST files. 
- bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, + bool GetFromHistory(const LookupKey& key, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index = nullptr); - bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, + bool GetFromHistory(const LookupKey& key, std::string* value, + std::string* timestamp, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, bool* is_blob_index = nullptr) { SequenceNumber seq; - return GetFromHistory(key, value, s, merge_context, + return GetFromHistory(key, value, timestamp, s, merge_context, max_covering_tombstone_seq, &seq, read_opts, is_blob_index); } @@ -135,9 +137,11 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, - autovector* to_delete, Directory* db_directory, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); // REQUIRE: m is an immutable memtable @@ -145,10 +149,12 @@ // REQUIRE: m is an immutable memtable void Remove(MemTable* m, autovector* to_delete); - void TrimHistory(autovector* to_delete, size_t usage); + // Return true if memtable is trimmed + bool TrimHistory(autovector* to_delete, size_t usage); bool GetFromList(std::list* list, const LookupKey& key, - std::string* value, Status* s, MergeContext* merge_context, + std::string* value, std::string* timestamp, Status* s, + MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, @@ -162,7 +168,7 @@ // excluding 
the last MemTable in memlist_history_. The reason for excluding // the last MemTable is to see if dropping the last MemTable will keep total // memory usage above or equal to max_write_buffer_size_to_maintain_ - size_t ApproximateMemoryUsageExcludingLast() const; + size_t MemoryAllocatedBytesExcludingLast() const; // Whether this version contains flushed memtables that are only kept around // for transaction conflict checking. @@ -215,7 +221,7 @@ commit_in_progress_(false), flush_requested_(false), current_memory_usage_(0), - current_memory_usage_excluding_last_(0), + current_memory_allocted_bytes_excluding_last_(0), current_has_history_(false) { current_->Ref(); } @@ -246,7 +252,7 @@ // Returns the earliest memtables that needs to be flushed. The returned // memtables are guaranteed to be in the ascending order of created time. - void PickMemtablesToFlush(const uint64_t* max_memtable_id, + void PickMemtablesToFlush(uint64_t max_memtable_id, autovector* mems); // Reset status of the given memtable list back to pending state so that @@ -260,33 +266,39 @@ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, const autovector& m, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, - autovector* to_delete, Directory* db_directory, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer, - std::list>* committed_flush_jobs_info); + std::list>* committed_flush_jobs_info, + IOStatus* io_s, bool write_edits = true); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). + // By default, adding memtables will flag that the memtable list needs to be + // flushed, but in certain situations, like after a mempurge, we may want to + // avoid flushing the memtable list upon addition of a memtable. void Add(MemTable* m, autovector* to_delete); // Returns an estimate of the number of bytes of data in use. 
size_t ApproximateMemoryUsage(); - // Returns the cached current_memory_usage_excluding_last_ value. - size_t ApproximateMemoryUsageExcludingLast() const; + // Returns the cached current_memory_allocted_bytes_excluding_last_ value. + size_t MemoryAllocatedBytesExcludingLast() const; // Returns the cached current_has_history_ value. bool HasHistory() const; - // Updates current_memory_usage_excluding_last_ and current_has_history_ - // from MemTableListVersion. Must be called whenever InstallNewVersion is - // called. + // Updates current_memory_allocted_bytes_excluding_last_ and + // current_has_history_ from MemTableListVersion. Must be called whenever + // InstallNewVersion is called. void UpdateCachedValuesFromMemTableListVersion(); // `usage` is the current size of the mutable Memtable. When // max_write_buffer_size_to_maintain is used, total size of mutable and // immutable memtables is checked against it to decide whether to trim // memtable list. - void TrimHistory(autovector* to_delete, size_t usage); + // + // Return true if memtable is trimmed + bool TrimHistory(autovector* to_delete, size_t usage); // Returns an estimate of the number of bytes of data used by // the unflushed mem-tables. @@ -300,7 +312,18 @@ // non-empty (regardless of the min_write_buffer_number_to_merge // parameter). This flush request will persist until the next time // PickMemtablesToFlush() is called. - void FlushRequested() { flush_requested_ = true; } + void FlushRequested() { + flush_requested_ = true; + // If there are some memtables stored in imm() that dont trigger + // flush (eg: mempurge output memtable), then update imm_flush_needed. + // Note: if race condition and imm_flush_needed is set to true + // when there is num_flush_not_started_==0, then there is no + // impact whatsoever. Imm_flush_needed is only used in an assert + // in IsFlushPending(). 
+ if (num_flush_not_started_ > 0) { + imm_flush_needed.store(true, std::memory_order_release); + } + } bool HasFlushRequested() { return flush_requested_; } @@ -327,7 +350,7 @@ // Returns the min log containing the prep section after memtables listsed in // `memtables_to_flush` are flushed and their status is persisted in manifest. uint64_t PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush); + const std::unordered_set* memtables_to_flush = nullptr); uint64_t GetEarliestMemTableID() const { auto& memlist = current_->memlist_; @@ -373,14 +396,23 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, - autovector* to_delete, Directory* db_directory, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); // DB mutex held void InstallNewVersion(); + // DB mutex held + // Called after writing to MANIFEST + void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd, + size_t batch_count, LogBuffer* log_buffer, + autovector* to_delete, + InstrumentedMutex* mu); + const int min_write_buffer_number_to_merge_; MemTableListVersion* current_; @@ -398,8 +430,8 @@ // The current memory usage. size_t current_memory_usage_; - // Cached value of current_->ApproximateMemoryUsageExcludingLast(). - std::atomic current_memory_usage_excluding_last_; + // Cached value of current_->MemoryAllocatedBytesExcludingLast(). + std::atomic current_memory_allocted_bytes_excluding_last_; // Cached value of current_->HasHistory(). 
std::atomic current_has_history_; @@ -416,7 +448,10 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_meta, - autovector* to_delete, Directory* db_directory, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_meta, + const autovector>*>& + committed_flush_jobs_info, + autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/memtable_list_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/memtable_list_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,14 +30,14 @@ MemTableListTest() : db(nullptr), file_number(1) { dbname = test::PerThreadDBPath("memtable_list_test"); options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); } // Create a test db if not yet created void CreateDB() { if (db == nullptr) { options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); // Open DB only with default column family ColumnFamilyOptions cf_options; std::vector cf_descs; @@ -65,18 +65,20 @@ ~MemTableListTest() override { if (db) { std::vector cf_descs(handles.size()); +#ifndef ROCKSDB_LITE for (int i = 0; i != static_cast(handles.size()); ++i) { - handles[i]->GetDescriptor(&cf_descs[i]); + EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i])); } +#endif // !ROCKSDB_LITE for (auto h : handles) { if (h) { - db->DestroyColumnFamilyHandle(h); + EXPECT_OK(db->DestroyColumnFamilyHandle(h)); } } handles.clear(); delete db; db = nullptr; - DestroyDB(dbname, options, cf_descs); + EXPECT_OK(DestroyDB(dbname, options, cf_descs)); } } @@ -92,7 +94,6 @@ CreateDB(); // Create a mock 
VersionSet DBOptions db_options; - db_options.file_system = FileSystem::Default(); ImmutableDBOptions immutable_db_options(db_options); EnvOptions env_options; std::shared_ptr table_cache(NewLRUCache(50000, 16)); @@ -101,7 +102,8 @@ VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller, /*block_cache_tracer=*/nullptr); + &write_controller, /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -115,13 +117,15 @@ auto cfd = column_family_set->GetDefault(); EXPECT_TRUE(nullptr != cfd); uint64_t file_num = file_number.fetch_add(1); + IOStatus io_s; // Create dummy mutex. InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); std::list> flush_jobs_info; Status s = list->TryInstallMemtableFlushResults( cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex, - file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info); + file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info, &io_s); + EXPECT_OK(io_s); return s; } @@ -139,7 +143,6 @@ CreateDB(); // Create a mock VersionSet DBOptions db_options; - db_options.file_system.reset(new LegacyFileSystemWrapper(db_options.env)); ImmutableDBOptions immutable_db_options(db_options); EnvOptions env_options; @@ -149,7 +152,8 @@ VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller, /*block_cache_tracer=*/nullptr); + &write_controller, /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -178,11 +182,21 @@ for (auto& meta : file_metas) { file_meta_ptrs.push_back(&meta); } + std::vector>> + 
committed_flush_jobs_info_storage(cf_ids.size()); + autovector>*> + committed_flush_jobs_info; + for (int i = 0; i < static_cast(cf_ids.size()); ++i) { + committed_flush_jobs_info.push_back( + &committed_flush_jobs_info_storage[i]); + } + InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); return InstallMemtableAtomicFlushResults( - &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex, - file_meta_ptrs, to_delete, nullptr, &log_buffer); + &lists, cfds, mutable_cf_options_list, mems_list, &versions, + nullptr /* prep_tracker */, &mutex, file_meta_ptrs, + committed_flush_jobs_info, to_delete, nullptr, &log_buffer); } }; @@ -195,7 +209,7 @@ ASSERT_FALSE(list.IsFlushPending()); autovector mems; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &mems); ASSERT_EQ(0, mems.size()); autovector to_delete; @@ -221,15 +235,16 @@ autovector to_delete; LookupKey lkey("key1", seq); - bool found = list.current()->Get(lkey, &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + bool found = list.current()->Get( + lkey, &value, /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Create a MemTable InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -237,26 +252,33 @@ mem->Ref(); // Write some keys to this memtable. 
- mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key1", "value1"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); - found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value1"); merge_context.Clear(); - found = mem->Get(LookupKey("key1", 2), &value, &s, &merge_context, + found = mem->Get(LookupKey("key1", 2), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); // MemTable found out that this key is *not* found (at this sequence#) ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.2"); @@ -275,36 +297,39 @@ kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key2", "value2.3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); // Fetch keys via MemTableList merge_context.Clear(); - found = - 
list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get( + LookupKey("key1", seq), &value, /*timestamp*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = list.current()->Get(LookupKey("key1", saved_seq), &value, &s, - &merge_context, &max_covering_tombstone_seq, - ReadOptions()); + found = list.current()->Get( + LookupKey("key1", saved_seq), &value, /*timestamp*/nullptr, + &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ("value1", value); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get( + LookupKey("key2", seq), &value, /*timestamp*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.3"); merge_context.Clear(); - found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get( + LookupKey("key2", 1), &value, /*timestamp*/nullptr, &s, + &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); ASSERT_EQ(2, list.NumNotFlushed()); @@ -319,7 +344,7 @@ // Create MemTableList int min_write_buffer_number_to_merge = 2; int max_write_buffer_number_to_maintain = 2; - int64_t max_write_buffer_size_to_maintain = 2000; + int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize; MemTableList list(min_write_buffer_number_to_merge, max_write_buffer_number_to_maintain, max_write_buffer_size_to_maintain); @@ -333,15 +358,16 @@ autovector to_delete; LookupKey lkey("key1", seq); - bool found = list.current()->Get(lkey, &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + bool found = 
list.current()->Get( + lkey, &value, /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Create a MemTable InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -349,19 +375,24 @@ mem->Ref(); // Write some keys to this memtable. - mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); - found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); // MemTable found out that this key is *not* found (at this sequence#) ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, + found = mem->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.2"); @@ -372,22 +403,22 @@ // Fetch keys via MemTableList merge_context.Clear(); - found = - list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, 
ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ("value2.2", value); // Flush this memtable from the list. // (It will then be a part of the memtable history). autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); MutableCFOptions mutable_cf_options(options); @@ -400,27 +431,27 @@ // Verify keys are no longer in MemTableList merge_context.Clear(); - found = - list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Verify keys are present in history merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key1", seq), &value, &s, &merge_context, + LookupKey("key1", seq), &value, /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key2", seq), &value, &s, &merge_context, + LookupKey("key2", seq), &value, /*timestamp*/nullptr, &s, &merge_context, 
&max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found); ASSERT_EQ("value2.2", value); @@ -431,15 +462,17 @@ kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key3", "value3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); ASSERT_EQ(0, to_delete.size()); to_flush.clear(); - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); // Flush second memtable @@ -462,42 +495,42 @@ // Verify keys are no longer in MemTableList merge_context.Clear(); - found = - list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key1", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = - list.current()->Get(LookupKey("key3", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key3", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Verify that the second memtable's keys are in the history merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key1", seq), &value, &s, &merge_context, + 
LookupKey("key1", seq), &value, /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); found = list.current()->GetFromHistory( - LookupKey("key3", seq), &value, &s, &merge_context, + LookupKey("key3", seq), &value, /*timestamp*/nullptr, &s, &merge_context, &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found); ASSERT_EQ("value3", value); // Verify that key2 from the first memtable is no longer in the history merge_context.Clear(); - found = - list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &max_covering_tombstone_seq, ReadOptions()); + found = list.current()->Get(LookupKey("key2", seq), &value, + /*timestamp*/nullptr, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Cleanup @@ -515,7 +548,7 @@ auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); autovector to_delete; @@ -542,11 +575,16 @@ std::string value; MergeContext merge_context; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr 
/* kv_prot_info */)); tables.push_back(mem); } @@ -555,7 +593,7 @@ ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); // Request a flush even though there is nothing to flush @@ -564,7 +602,7 @@ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Attempt to 'flush' to clear request for flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -588,7 +626,7 @@ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(2, to_flush.size()); ASSERT_EQ(2, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -609,7 +647,7 @@ ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(3, to_flush.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -617,7 +655,7 @@ // Pick tables to flush again autovector to_flush2; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(0, to_flush2.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -635,7 +673,7 @@ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush again - 
list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(1, to_flush2.size()); ASSERT_EQ(4, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -656,7 +694,7 @@ ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); // Should pick 4 of 5 since 1 table has been picked in to_flush2 ASSERT_EQ(4, to_flush.size()); ASSERT_EQ(5, list.NumNotFlushed()); @@ -665,7 +703,7 @@ // Pick tables to flush again autovector to_flush3; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush3); ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -726,7 +764,7 @@ autovector to_flush4; list.FlushRequested(); ASSERT_TRUE(list.HasFlushRequested()); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_TRUE(to_flush4.empty()); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -737,7 +775,7 @@ // equal to 5. Therefore, only tables[5] will be selected. 
memtable_id = 5; list.FlushRequested(); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_EQ(1, static_cast(to_flush4.size())); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -779,7 +817,7 @@ auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); @@ -811,11 +849,16 @@ std::string value; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr /* kv_prot_info */)); elem.push_back(mem); } @@ -829,7 +872,8 @@ auto* list = lists[i]; ASSERT_FALSE(list->IsFlushPending()); ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); - list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]); + list->PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, + &flush_candidates[i]); ASSERT_EQ(0, flush_candidates[i].size()); } // Request flush even though there is nothing to flush @@ -859,8 +903,7 @@ // Pick memtables to flush for (auto i = 0; i != num_cfs; ++i) { flush_candidates[i].clear(); - 
lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i], - &flush_candidates[i]); + lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]); ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, static_cast(flush_candidates[i].size())); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ } // Get the operand at the index. - Slice GetOperand(int index) { + Slice GetOperand(int index) const { assert(operand_list_); SetDirectionForward(); @@ -76,13 +76,21 @@ } // Same as GetOperandsDirectionForward - const std::vector& GetOperands() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperands() const { return GetOperandsDirectionForward(); } // Return all the operands in the order as they were merged (passed to // FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionForward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperandsDirectionForward() const { if (!operand_list_) { return empty_operand_list; } @@ -93,7 +101,11 @@ // Return all the operands in the reversed order relative to how they were // merged (passed to FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionBackward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. 
+ const std::vector& GetOperandsDirectionBackward() const { if (!operand_list_) { return empty_operand_list; } @@ -110,14 +122,14 @@ } } - void SetDirectionForward() { + void SetDirectionForward() const { if (operands_reversed_ == true) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = false; } } - void SetDirectionBackward() { + void SetDirectionBackward() const { if (operands_reversed_ == false) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = true; @@ -125,10 +137,10 @@ } // List of operands - std::unique_ptr> operand_list_; + mutable std::unique_ptr> operand_list_; // Copy of operands that are not pinned. std::unique_ptr>> copied_operands_; - bool operands_reversed_ = true; + mutable bool operands_reversed_ = true; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,10 @@ #include +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_index.h" +#include "db/blob/prefetch_buffer_collection.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" @@ -14,6 +18,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "table/format.h" #include "table/internal_iterator.h" @@ -28,6 +33,7 @@ Statistics* stats, const std::atomic* shutting_down) : env_(env), + clock_(env->GetSystemClock().get()), user_comparator_(user_comparator), user_merge_operator_(user_merge_operator), compaction_filter_(compaction_filter), @@ -39,7 +45,7 @@ snapshot_checker_(snapshot_checker), level_(level), keys_(), - 
filter_timer_(env_), + filter_timer_(clock_), total_filter_time_(0U), stats_(stats) { assert(user_comparator_ != nullptr); @@ -52,7 +58,7 @@ const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand, bool update_num_ops_stats) { assert(merge_operator != nullptr); @@ -75,7 +81,7 @@ MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); { // Setup to time the merge - StopWatchNano timer(env, statistics != nullptr); + StopWatchNano timer(clock, statistics != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); // Do the merge @@ -116,7 +122,11 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, CompactionRangeDelAggregator* range_del_agg, const SequenceNumber stop_before, - const bool at_bottom) { + const bool at_bottom, + const bool allow_data_in_errors, + const BlobFetcher* blob_fetcher, + PrefetchBufferCollection* prefetch_buffers, + CompactionIterationStats* c_iter_stats) { // Get a copy of the internal key, before it's invalidated by iter->Next() // Also maintain the list of merge operands seen. 
assert(HasOperator()); @@ -138,27 +148,27 @@ // orig_ikey is backed by original_key if keys_.empty() // orig_ikey is backed by keys_.back() if !keys_.empty() ParsedInternalKey orig_ikey; - bool succ = ParseInternalKey(original_key, &orig_ikey); - assert(succ); - if (!succ) { - return Status::Corruption("Cannot parse key in MergeUntil"); - } - Status s; + Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors); + assert(s.ok()); + if (!s.ok()) return s; + bool hit_the_next_user_key = false; for (; iter->Valid(); iter->Next(), original_key_is_iter = false) { if (IsShuttingDown()) { - return Status::ShutdownInProgress(); + s = Status::ShutdownInProgress(); + return s; } ParsedInternalKey ikey; assert(keys_.size() == merge_context_.GetNumOperands()); - if (!ParseInternalKey(iter->key(), &ikey)) { + Status pik_status = + ParseInternalKey(iter->key(), &ikey, allow_data_in_errors); + if (!pik_status.ok()) { // stop at corrupted key if (assert_valid_internal_key_) { - assert(!"Corrupted internal key not expected."); - return Status::Corruption("Corrupted internal key not expected."); + return pik_status; } break; } else if (first_key) { @@ -182,7 +192,6 @@ assert(IsValueType(ikey.type)); if (ikey.type != kTypeMerge) { - // hit a put/delete/single delete // => merge the put value or a nullptr with operands_ // => store result in operands_.back() (and update keys_.back()) @@ -193,7 +202,7 @@ // the compaction iterator to write out the key we're currently at, which // is the put/delete we just encountered. if (keys_.empty()) { - return Status::OK(); + return s; } // TODO(noetzli) If the merge operator returns false, we are currently @@ -201,19 +210,52 @@ // want. Also if we're in compaction and it's a put, it would be nice to // run compaction filter on it. 
const Slice val = iter->value(); + PinnableSlice blob_value; const Slice* val_ptr; - if (kTypeValue == ikey.type && + if ((kTypeValue == ikey.type || kTypeBlobIndex == ikey.type) && (range_del_agg == nullptr || !range_del_agg->ShouldDelete( ikey, RangeDelPositioningMode::kForwardTraversal))) { - val_ptr = &val; + if (ikey.type == kTypeBlobIndex) { + BlobIndex blob_index; + + s = blob_index.DecodeFrom(val); + if (!s.ok()) { + return s; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher); + + s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, + prefetch_buffer, &blob_value, + &bytes_read); + if (!s.ok()) { + return s; + } + + val_ptr = &blob_value; + + if (c_iter_stats) { + ++c_iter_stats->num_blobs_read; + c_iter_stats->total_blob_bytes_read += bytes_read; + } + } else { + val_ptr = &val; + } } else { val_ptr = nullptr; } std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) @@ -268,7 +310,10 @@ if (keys_.size() == 1) { // we need to re-anchor the orig_ikey because it was anchored by // original_key before - ParseInternalKey(keys_.back(), &orig_ikey); + pik_status = + ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); } if (filter == CompactionFilter::Decision::kKeep) { merge_context_.PushOperand( @@ -284,14 +329,14 @@ keys_.clear(); merge_context_.Clear(); has_compaction_filter_skip_until_ = true; - return Status::OK(); + return s; } } } if (merge_context_.GetNumOperands() == 0) { // we filtered out all the merge operands - return Status::OK(); + return s; } // We are sure we have seen 
this key's entire history if: @@ -321,7 +366,7 @@ std::string merge_result; s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of @@ -344,7 +389,7 @@ bool merge_success = false; std::string merge_result; { - StopWatchNano timer(env_, stats_ != nullptr); + StopWatchNano timer(clock_, stats_ != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); merge_success = user_merge_operator_->PartialMergeMulti( orig_ikey.user_key, @@ -410,7 +455,9 @@ kValueTypeForSeek); } } - total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) { + total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + } return ret; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,12 @@ #include #include -#include "db/dbformat.h" #include "db/merge_context.h" #include "db/range_del_aggregator.h" #include "db/snapshot_checker.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "util/stop_watch.h" @@ -25,6 +25,10 @@ class Logger; class MergeOperator; class Statistics; +class SystemClock; +class BlobFetcher; +class PrefetchBufferCollection; +struct CompactionIterationStats; class MergeHelper { public: @@ -48,7 +52,7 @@ const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand = nullptr, bool 
update_num_ops_stats = false); @@ -66,6 +70,12 @@ // 0 means no restriction // at_bottom: (IN) true if the iterator covers the bottem level, which means // we could reach the start of the history of this user key. + // allow_data_in_errors: (IN) if true, data details will be displayed in + // error/log messages. + // blob_fetcher: (IN) blob fetcher object for the compaction's input version. + // prefetch_buffers: (IN/OUT) a collection of blob file prefetch buffers + // used for compaction readahead. + // c_iter_stats: (OUT) compaction iteration statistics. // // Returns one of the following statuses: // - OK: Entries were successfully merged. @@ -78,9 +88,12 @@ // // REQUIRED: The first key in the input is not corrupted. Status MergeUntil(InternalIterator* iter, - CompactionRangeDelAggregator* range_del_agg = nullptr, - const SequenceNumber stop_before = 0, - const bool at_bottom = false); + CompactionRangeDelAggregator* range_del_agg, + const SequenceNumber stop_before, const bool at_bottom, + const bool allow_data_in_errors, + const BlobFetcher* blob_fetcher, + PrefetchBufferCollection* prefetch_buffers, + CompactionIterationStats* c_iter_stats); // Filters a merge operand using the compaction filter specified // in the constructor. Returns the decision that the filter made. @@ -137,6 +150,7 @@ private: Env* env_; + SystemClock* clock_; const Comparator* user_comparator_; const MergeOperator* user_merge_operator_; const CompactionFilter* compaction_filter_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_helper_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_helper_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,34 +3,39 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "db/merge_helper.h" + #include #include #include -#include "db/merge_helper.h" +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/vector_iterator.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { class MergeHelperTest : public testing::Test { public: - MergeHelperTest() { env_ = Env::Default(); } + MergeHelperTest() : icmp_(BytewiseComparator()) { env_ = Env::Default(); } ~MergeHelperTest() override = default; Status Run(SequenceNumber stop_before, bool at_bottom, SequenceNumber latest_snapshot = 0) { - iter_.reset(new test::VectorIterator(ks_, vs_)); + iter_.reset(new VectorIterator(ks_, vs_, &icmp_)); iter_->SeekToFirst(); - merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(), + merge_helper_.reset(new MergeHelper(env_, icmp_.user_comparator(), merge_op_.get(), filter_.get(), nullptr, false, latest_snapshot)); - return merge_helper_->MergeUntil(iter_.get(), nullptr /* range_del_agg */, - stop_before, at_bottom); + return merge_helper_->MergeUntil( + iter_.get(), nullptr /* range_del_agg */, stop_before, at_bottom, + false /* allow_data_in_errors */, nullptr /* blob_fetcher */, + nullptr /* prefetch_buffers */, nullptr /* c_iter_stats */); } void AddKeyVal(const std::string& user_key, const SequenceNumber& seq, @@ -45,7 +50,8 @@ } Env* env_; - std::unique_ptr iter_; + InternalKeyComparator icmp_; + std::unique_ptr iter_; std::shared_ptr merge_op_; std::unique_ptr merge_helper_; std::vector ks_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/merge_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/merge_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,8 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). 
// #include -#include + #include +#include #include "db/db_impl/db_impl.h" #include "db/dbformat.h" @@ -18,6 +19,7 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" +#include "util/coding.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -47,12 +49,8 @@ return true; } - return mergeOperator_->PartialMerge( - key, - *existing_value, - value, - new_value, - logger); + return mergeOperator_->PartialMerge(key, *existing_value, value, new_value, + logger); } bool PartialMergeMulti(const Slice& key, @@ -71,6 +69,31 @@ std::shared_ptr mergeOperator_; }; +class EnvMergeTest : public EnvWrapper { + public: + EnvMergeTest() : EnvWrapper(Env::Default()) {} + static const char* kClassName() { return "MergeEnv"; } + const char* Name() const override { return kClassName(); } + // ~EnvMergeTest() override {} + + uint64_t NowNanos() override { + ++now_nanos_count_; + return target()->NowNanos(); + } + + static uint64_t now_nanos_count_; + + static std::unique_ptr singleton_; + + static EnvMergeTest* GetInstance() { + if (nullptr == singleton_) singleton_.reset(new EnvMergeTest); + return singleton_.get(); + } +}; + +uint64_t EnvMergeTest::now_nanos_count_{0}; +std::unique_ptr EnvMergeTest::singleton_; + std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, const size_t max_successive_merges = 0) { DB* db; @@ -78,8 +101,9 @@ options.create_if_missing = true; options.merge_operator = std::make_shared(); options.max_successive_merges = max_successive_merges; + options.env = EnvMergeTest::GetInstance(); + EXPECT_OK(DestroyDB(dbname, Options())); Status s; - DestroyDB(dbname, Options()); // DBWithTTL is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE if (ttl) { @@ -93,10 +117,11 @@ assert(!ttl); s = DB::Open(options, dbname, &db); #endif // !ROCKSDB_LITE - if (!s.ok()) { - std::cerr << s.ToString() << std::endl; - assert(false); - } + EXPECT_OK(s); + assert(s.ok()); + // 
Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for + // session ID) + EnvMergeTest::now_nanos_count_ = 0; return std::shared_ptr(db); } @@ -106,7 +131,6 @@ // set, add, get and remove // This is a quick implementation without a Merge operation. class Counters { - protected: std::shared_ptr db_; @@ -190,7 +214,6 @@ return get(key, &base) && set(key, base + value); } - // convenience functions for testing void assert_set(const std::string& key, uint64_t value) { assert(set(key, value)); @@ -202,27 +225,25 @@ uint64_t value = default_; int result = get(key, &value); assert(result); - if (result == 0) exit(1); // Disable unused variable warning. + if (result == 0) exit(1); // Disable unused variable warning. return value; } void assert_add(const std::string& key, uint64_t value) { int result = add(key, value); assert(result); - if (result == 0) exit(1); // Disable unused variable warning. + if (result == 0) exit(1); // Disable unused variable warning. } }; // Implement 'add' directly with the new Merge operation class MergeBasedCounters : public Counters { private: - WriteOptions merge_option_; // for merge + WriteOptions merge_option_; // for merge public: explicit MergeBasedCounters(std::shared_ptr db, uint64_t defaultCount = 0) - : Counters(db, defaultCount), - merge_option_() { - } + : Counters(db, defaultCount), merge_option_() {} // mapped to a rocksdb Merge operation bool add(const std::string& key, uint64_t value) override { @@ -243,34 +264,37 @@ void dumpDb(DB* db) { auto it = std::unique_ptr(db->NewIterator(ReadOptions())); for (it->SeekToFirst(); it->Valid(); it->Next()) { - //uint64_t value = DecodeFixed64(it->value().data()); - //std::cout << it->key().ToString() << ": " << value << std::endl; + // uint64_t value = DecodeFixed64(it->value().data()); + // std::cout << it->key().ToString() << ": " << value << std::endl; } assert(it->status().ok()); // Check for any errors found during the scan } void testCounters(Counters& counters, 
DB* db, bool test_compaction) { - FlushOptions o; o.wait = true; counters.assert_set("a", 1); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } - assert(counters.assert_get("a") == 1); + ASSERT_EQ(counters.assert_get("a"), 1); counters.assert_remove("b"); // defaut value is 0 if non-existent - assert(counters.assert_get("b") == 0); + ASSERT_EQ(counters.assert_get("b"), 0); counters.assert_add("a", 2); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } // 1+2 = 3 - assert(counters.assert_get("a")== 3); + ASSERT_EQ(counters.assert_get("a"), 3); dumpDb(db); @@ -280,25 +304,114 @@ counters.assert_add("b", i); sum += i; } - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("b"), sum); dumpDb(db); if (test_compaction) { - db->Flush(o); + ASSERT_OK(db->Flush(o)); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); dumpDb(db); - assert(counters.assert_get("a")== 3); - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("a"), 3); + ASSERT_EQ(counters.assert_get("b"), sum); } } +void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { + ASSERT_OK(db->Put({}, "1", "1")); + ASSERT_OK(db->Flush(FlushOptions())); + + std::atomic cnt{0}; + const auto get_thread_id = [&cnt]() { + thread_local int thread_id{cnt++}; + return thread_id; + }; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (1 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_compact_thread:0"); + } else if (2 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:0"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + 
"VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (0 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:0"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:1"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) { + auto* mutex = reinterpret_cast(arg); + mutex->AssertHeld(); + int thread_id = get_thread_id(); + ASSERT_EQ(2, thread_id); + mutex->Unlock(); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:1"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:2"); + mutex->Lock(); + }); + SyncPoint::GetInstance()->LoadDependency({ + {"testCountersWithFlushAndCompaction::set_options_thread:0", + "testCountersWithCompactionAndFlush:BeforeCompact"}, + {"testCountersWithFlushAndCompaction::bg_compact_thread:0", + "testCountersWithFlushAndCompaction:BeforeIncCounters"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:0", + "testCountersWithFlushAndCompaction::set_options_thread:1"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:1", + "testCountersWithFlushAndCompaction:BeforeVerification"}, + {"testCountersWithFlushAndCompaction:AfterGet", + "testCountersWithFlushAndCompaction::bg_flush_thread:2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread set_options_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->SetOptions( + {{"disable_auto_compactions", "false"}})); + }); + TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact"); + port::Thread compact_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->CompactRange( + CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr)); + }); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters"); + counters.add("test-key", 1); + + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db->Flush(flush_opts)); + + 
TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification"); + std::string expected; + PutFixed64(&expected, 1); + std::string actual; + Status s = db->Get(ReadOptions(), "test-key", &actual); + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet"); + set_options_thread.join(); + compact_thread.join(); + ASSERT_OK(s); + ASSERT_EQ(expected, actual); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + void testSuccessiveMerge(Counters& counters, size_t max_num_merges, size_t num_merges) { - counters.assert_remove("z"); uint64_t sum = 0; @@ -308,14 +421,14 @@ sum += i; if (i % (max_num_merges + 1) == 0) { - assert(num_merge_operator_calls == max_num_merges + 1); + ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1); } else { - assert(num_merge_operator_calls == 0); + ASSERT_EQ(num_merge_operator_calls, 0); } resetNumMergeOperatorCalls(); - assert(counters.assert_get("z") == sum); - assert(num_merge_operator_calls == i % (max_num_merges + 1)); + ASSERT_EQ(counters.assert_get("z"), sum); + ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1)); } } @@ -332,8 +445,8 @@ counters->assert_add("b", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("b")); if (count > max_merge) { // in this case, FullMerge should be called instead. @@ -346,20 +459,23 @@ // Test case 2: partial merge should not be called when a put is found. 
resetNumPartialMergeCalls(); tmp_sum = 0; - db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"); + ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10")); for (size_t i = 1; i <= count; i++) { counters->assert_add("c", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("c")); ASSERT_EQ(num_partial_merge_calls, 0U); + // NowNanos was previously called in MergeHelper::FilterMerge(), which + // harmed performance. + ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U); } void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, size_t num_merges) { - assert(num_merges > max_num_merges); + ASSERT_GT(num_merges, max_num_merges); Slice key("BatchSuccessiveMerge"); uint64_t merge_value = 1; @@ -370,15 +486,12 @@ // Create the batch WriteBatch batch; for (size_t i = 0; i < num_merges; ++i) { - batch.Merge(key, merge_value_slice); + ASSERT_OK(batch.Merge(key, merge_value_slice)); } // Apply to memtable and count the number of merges resetNumMergeOperatorCalls(); - { - Status s = db->Write(WriteOptions(), &batch); - assert(s.ok()); - } + ASSERT_OK(db->Write(WriteOptions(), &batch)); ASSERT_EQ( num_merge_operator_calls, static_cast(num_merges - (num_merges % (max_num_merges + 1)))); @@ -386,10 +499,7 @@ // Get the value resetNumMergeOperatorCalls(); std::string get_value_str; - { - Status s = db->Get(ReadOptions(), key, &get_value_str); - assert(s.ok()); - } + ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str)); assert(get_value_str.size() == sizeof(uint64_t)); uint64_t get_value = DecodeFixed64(&get_value_str[0]); ASSERT_EQ(get_value, num_merges * merge_value); @@ -398,7 +508,6 @@ } void runTest(const std::string& dbname, const bool use_ttl = false) { - { auto db = OpenDb(dbname, use_ttl); @@ -413,7 +522,7 @@ } } - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, 
Options())); { size_t max_merge = 5; @@ -422,7 +531,8 @@ testCounters(counters, db.get(), use_compression); testSuccessiveMerge(counters, max_merge, max_merge * 2); testSingleBatchSuccessiveMerge(db.get(), 5, 7); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { @@ -433,14 +543,16 @@ auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, count); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, min_merge * 10); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } } @@ -451,15 +563,15 @@ counters.add("test-key", 1); counters.add("test-key", 1); counters.add("test-key", 1); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } DB* reopen_db; ASSERT_OK(DB::Open(Options(), dbname, &reopen_db)); std::string value; - ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok())); + ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value)); delete reopen_db; - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, Options())); } /* Temporary remove this test @@ -488,6 +600,19 @@ runTest(test::PerThreadDBPath("merge_testdbttl"), true); // Run test on TTL database } + +TEST_F(MergeTest, MergeWithCompactionAndFlush) { + const std::string dbname = + test::PerThreadDBPath("merge_with_compaction_and_flush"); + { + auto db = OpenDb(dbname); + { + MergeBasedCounters counters(db, 0); + testCountersWithFlushAndCompaction(counters, db.get()); + } + } + ASSERT_OK(DestroyDB(dbname, Options())); +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/obsolete_files_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/obsolete_files_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/obsolete_files_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/obsolete_files_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #ifndef ROCKSDB_LITE #include +#include #include #include #include @@ -27,17 +28,14 @@ #include "test_util/testutil.h" #include "util/string_util.h" -using std::cerr; -using std::cout; -using std::endl; -using std::flush; namespace ROCKSDB_NAMESPACE { class ObsoleteFilesTest : public DBTestBase { public: ObsoleteFilesTest() - : DBTestBase("/obsolete_files_test"), wal_dir_(dbname_ + "/wal_files") {} + : DBTestBase("obsolete_files_test", /*env_do_fsync=*/true), + wal_dir_(dbname_ + "/wal_files") {} void AddKeys(int numkeys, int startkey) { WriteOptions options; @@ -56,14 +54,16 @@ AddKeys(numKeysPerFile, startKey); startKey += numKeysPerFile; ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK( + dbfull()->TEST_WaitForCompact()); // wait for background flush (flush + // is also a kind of compaction). 
} } void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0; int sst_cnt = 0; @@ -72,7 +72,7 @@ uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -96,6 +96,12 @@ options.WAL_ttl_seconds = 300; // Used to test log files options.WAL_size_limit_MB = 1024; // Used to test log files options.wal_dir = wal_dir_; + + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the + // periodic stat dumping thread. + options.stats_dump_period_sec = 0; + Destroy(options); Reopen(options); } @@ -145,18 +151,6 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { ReopenDB(); - SyncPoint::GetInstance()->DisableProcessing(); - std::vector optsfiles_nums; - std::vector optsfiles_keep; - SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", [&](void* arg) { - optsfiles_nums.push_back(*reinterpret_cast(arg)); - }); - SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", [&](void* arg) { - optsfiles_keep.push_back(*reinterpret_cast(arg)); - }); - SyncPoint::GetInstance()->EnableProcessing(); createLevel0Files(2, 50000); CheckFileTypeCounts(wal_dir_, 1, 0, 0); @@ -172,7 +166,6 @@ } } ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */)); - ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size()); Close(); @@ -193,15 +186,127 @@ ASSERT_EQ(2, opts_file_count); } -} // namespace ROCKSDB_NAMESPACE +TEST_F(ObsoleteFilesTest, BlobFiles) { + ReopenDB(); -#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS -extern "C" { -void RegisterCustomObjects(int argc, char** argv); 
+ VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const ImmutableCFOptions* const ioptions = cfd->ioptions(); + assert(ioptions); + assert(!ioptions->cf_paths.empty()); + + const std::string& path = ioptions->cf_paths.front().path; + + // Add an obsolete blob file. + constexpr uint64_t first_blob_file_number = 234; + versions->AddObsoleteBlobFile(first_blob_file_number, path); + + // Add a live blob file. + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + constexpr uint64_t second_blob_file_number = 456; + constexpr uint64_t second_total_blob_count = 100; + constexpr uint64_t second_total_blob_bytes = 2000000; + constexpr char second_checksum_method[] = "CRC32B"; + constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a"; + + auto shared_meta = SharedBlobFileMetaData::Create( + second_blob_file_number, second_total_blob_count, second_total_blob_bytes, + second_checksum_method, second_checksum_value); + + constexpr uint64_t second_garbage_blob_count = 0; + constexpr uint64_t second_garbage_blob_bytes = 0; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + second_garbage_blob_count, second_garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + + // Check for obsolete files and make sure the first blob file is picked up + // and grabbed for purge. The second blob file should be on the live list. 
+ constexpr int job_id = 0; + JobContext job_context{job_id}; + + dbfull()->TEST_LockMutex(); + constexpr bool force_full_scan = false; + dbfull()->FindObsoleteFiles(&job_context, force_full_scan); + dbfull()->TEST_UnlockMutex(); + + ASSERT_TRUE(job_context.HaveSomethingToDelete()); + ASSERT_EQ(job_context.blob_delete_files.size(), 1); + ASSERT_EQ(job_context.blob_delete_files[0].GetBlobFileNumber(), + first_blob_file_number); + + const auto& files_grabbed_for_purge = + dbfull()->TEST_GetFilesGrabbedForPurge(); + ASSERT_NE(files_grabbed_for_purge.find(first_blob_file_number), + files_grabbed_for_purge.end()); + + ASSERT_EQ(job_context.blob_live.size(), 1); + ASSERT_EQ(job_context.blob_live[0], second_blob_file_number); + + // Hack the job context a bit by adding a few files to the full scan + // list and adjusting the pending file number. We add the two files + // above as well as two additional ones, where one is old + // and should be cleaned up, and the other is still pending. + constexpr uint64_t old_blob_file_number = 123; + constexpr uint64_t pending_blob_file_number = 567; + + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(old_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(first_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(second_blob_file_number), path); + job_context.full_scan_candidate_files.emplace_back( + BlobFileName(pending_blob_file_number), path); + + job_context.min_pending_output = pending_blob_file_number; + + // Purge obsolete files and make sure we purge the old file and the first file + // (and keep the second file and the pending file). 
+ std::vector deleted_files; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", [&](void* arg) { + const std::string* file = static_cast(arg); + assert(file); + + constexpr char blob_extension[] = ".blob"; + + if (file->find(blob_extension) != std::string::npos) { + deleted_files.emplace_back(*file); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + dbfull()->PurgeObsoleteFiles(job_context); + job_context.Clean(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(files_grabbed_for_purge.find(first_blob_file_number), + files_grabbed_for_purge.end()); + + std::sort(deleted_files.begin(), deleted_files.end()); + const std::vector expected_deleted_files{ + BlobFileName(path, old_blob_file_number), + BlobFileName(path, first_blob_file_number)}; + + ASSERT_EQ(deleted_files, expected_deleted_files); } -#else -void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} -#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/options_file_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/options_file_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/options_file_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/options_file_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -25,7 +25,7 @@ std::unordered_set* filename_history, int* options_files_count) { std::vector filenames; - db->GetEnv()->GetChildren(db->GetName(), &filenames); + EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames)); uint64_t number; FileType type; *options_files_count = 0; @@ -42,7 +42,7 @@ DB* db, const std::unordered_set& past_filenames) { std::vector filenames; std::unordered_set current_filenames; - db->GetEnv()->GetChildren(db->GetName(), 
&filenames); + EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames)); uint64_t number; FileType type; for (auto filename : filenames) { @@ -65,7 +65,7 @@ const int kReopenCount = 20; Options opt; opt.create_if_missing = true; - DestroyDB(dbname_, opt); + ASSERT_OK(DestroyDB(dbname_, opt)); std::unordered_set filename_history; DB* db; for (int i = 0; i < kReopenCount; ++i) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/output_validator.h" + +#include "test_util/sync_point.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { +Status OutputValidator::Add(const Slice& key, const Slice& value) { + if (enable_hash_) { + // Generate a rolling 64-bit hash of the key and values + paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_); + paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_); + } + if (enable_order_check_) { + TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check", + /*arg=*/nullptr); + if (key.size() < kNumInternalBytes) { + return Status::Corruption( + "Compaction tries to write a key without internal bytes."); + } + // prev_key_ starts with empty. 
+ if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) { + return Status::Corruption("Compaction sees out-of-order keys."); + } + prev_key_.assign(key.data(), key.size()); + } + return Status::OK(); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/output_validator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/output_validator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "db/dbformat.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +// A class that validates key/value that is inserted to an SST file. +// Pass every key/value of the file using OutputValidator::Add() +// and the class validates key order and optionally calculate a hash +// of all the key and value. +class OutputValidator { + public: + explicit OutputValidator(const InternalKeyComparator& icmp, + bool enable_order_check, bool enable_hash, + uint64_t precalculated_hash = 0) + : icmp_(icmp), + paranoid_hash_(precalculated_hash), + enable_order_check_(enable_order_check), + enable_hash_(enable_hash) {} + + // Add a key to the KV sequence, and return whether the key follows + // criteria, e.g. key is ordered. + Status Add(const Slice& key, const Slice& value); + + // Compare result of two key orders are the same. It can be used + // to compare the keys inserted into a file, and what is read back. + // Return true if the validation passes. 
+ bool CompareValidator(const OutputValidator& other_validator) { + return GetHash() == other_validator.GetHash(); + } + + // Not (yet) intended to be persisted, so subject to change + // without notice between releases. + uint64_t GetHash() const { return paranoid_hash_; } + + private: + const InternalKeyComparator& icmp_; + std::string prev_key_; + uint64_t paranoid_hash_ = 0; + bool enable_order_check_; + bool enable_hash_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/perf_context_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/perf_context_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/perf_context_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/perf_context_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // +#include "rocksdb/perf_context.h" + #include #include #include @@ -15,8 +17,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -76,12 +78,12 @@ std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { std::string key = "k" + ToString(i); - db->Delete(write_options, key); + ASSERT_OK(db->Delete(write_options, key)); } HistogramImpl hist_get; @@ -91,7 +93,7 @@ std::string value; get_perf_context()->Reset(); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); timer.Start(); auto status = db->Get(read_options, key, &value); auto elapsed_nanos = timer.ElapsedNanos(); @@ -110,16 +112,15 @@ 
std::unique_ptr iter(db->NewIterator(read_options)); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->SeekToFirst(); hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count); auto elapsed_nanos = timer.ElapsedNanos(); if (FLAGS_verbose) { - std::cout << "SeekToFirst uesr key comparison: \n" - << hist_seek_to_first.ToString() - << "ikey skipped: " << get_perf_context()->internal_key_skipped_count - << "\n" + std::cout << "SeekToFirst user key comparison: \n" + << hist_seek_to_first.ToString() << "ikey skipped: " + << get_perf_context()->internal_key_skipped_count << "\n" << "idelete skipped: " << get_perf_context()->internal_delete_skipped_count << "\n" << "elapsed: " << elapsed_nanos << "\n"; @@ -132,7 +133,7 @@ std::string key = "k" + ToString(i); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); auto elapsed_nanos = timer.ElapsedNanos(); hist_seek.Add(get_perf_context()->user_key_comparison_count); @@ -146,7 +147,7 @@ get_perf_context()->Reset(); ASSERT_TRUE(iter->Valid()); - StopWatchNano timer2(Env::Default(), true); + StopWatchNano timer2(SystemClock::Default().get(), true); iter->Next(); auto elapsed_nanos2 = timer2.ElapsedNanos(); if (FLAGS_verbose) { @@ -156,7 +157,7 @@ } if (FLAGS_verbose) { - std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString(); + std::cout << "Seek user key comparison: \n" << hist_seek.ToString(); } } @@ -165,7 +166,7 @@ const int kTotalIterations = 1000000; std::vector timings(kTotalIterations); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); for (auto& timing : timings) { timing = timer.ElapsedNanos(true /* reset */); } @@ -186,7 +187,7 @@ uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(Env::Default(), nullptr, 0, &elapsed); + StopWatch 
timer(SystemClock::Default().get(), nullptr, 0, &elapsed); for (auto& timing : timings) { timing = elapsed; } @@ -251,7 +252,7 @@ } if (FLAGS_random_key) { - std::random_shuffle(keys.begin(), keys.end()); + RandomShuffle(std::begin(keys), std::end(keys)); } #ifndef NDEBUG ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U); @@ -270,7 +271,7 @@ std::vector values; get_perf_context()->Reset(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); if (++num_mutex_waited > 3) { #ifndef NDEBUG ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); @@ -314,7 +315,10 @@ hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -324,9 +328,10 @@ } if (FLAGS_verbose) { - std::cout << "Put uesr key comparison: \n" << hist_put.ToString() - << "Get uesr key comparison: \n" << hist_get.ToString() - << "MultiGet uesr key comparison: \n" << hist_get.ToString(); + std::cout << "Put user key comparison: \n" + << hist_put.ToString() << "Get user key comparison: \n" + << hist_get.ToString() << "MultiGet user key comparison: \n" + << hist_get.ToString(); std::cout << "Put(): Pre and Post Process Time: \n" << hist_write_pre_post.ToString() << " Writing WAL time: \n" << hist_write_wal_time.ToString() << "\n" @@ -428,7 +433,10 @@ hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } 
hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -438,8 +446,9 @@ } if (FLAGS_verbose) { - std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString() - << "ReadOnly MultiGet uesr key comparison: \n" + std::cout << "ReadOnly Get user key comparison: \n" + << hist_get.ToString() + << "ReadOnly MultiGet user key comparison: \n" << hist_mget.ToString(); std::cout << "ReadOnly Get(): Time to get snapshot: \n" @@ -524,7 +533,7 @@ } if (FLAGS_random_key) { - std::random_shuffle(keys.begin(), keys.end()); + RandomShuffle(std::begin(keys), std::end(keys)); } HistogramImpl hist_put_time; @@ -532,14 +541,14 @@ HistogramImpl hist_time_diff; SetPerfLevel(kEnableTime); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); for (const int i : keys) { std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); get_perf_context()->Reset(); timer.Start(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); hist_wal_time.Add(get_perf_context()->write_wal_time); @@ -573,7 +582,7 @@ iter->Next(); hist_next.Add(get_perf_context()->user_key_comparison_count); } - + ASSERT_OK(iter->status()); if (FLAGS_verbose) { std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n" << hist_next.ToString(); @@ -585,25 +594,26 @@ for (PerfLevel perf_level_test : {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) { for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); - mutex.Lock(); - ROCKSDB_NAMESPACE::port::Thread child_thread([&] { - SetPerfLevel(perf_level_test); - get_perf_context()->Reset(); - ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + 
stats_code[c]); mutex.Lock(); - mutex.Unlock(); - if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || - stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ROCKSDB_NAMESPACE::port::Thread child_thread([&] { + SetPerfLevel(perf_level_test); + get_perf_context()->Reset(); ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); - } else { - // increment the counter only when it's a DB Mutex - ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); - } - }); - Env::Default()->SleepForMicroseconds(100); - mutex.Unlock(); - child_thread.join(); + mutex.Lock(); + mutex.Unlock(); + if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || + stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + } else { + // increment the counter only when it's a DB Mutex + ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); + } + }); + SystemClock::Default()->SleepForMicroseconds(100); + mutex.Unlock(); + child_thread.join(); } } } @@ -612,7 +622,8 @@ SetPerfLevel(kEnableTime); int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + stats_code[c]); InstrumentedCondVar lock(&mutex); get_perf_context()->Reset(); mutex.Lock(); @@ -817,6 +828,11 @@ } TEST_F(PerfContextTest, CPUTimer) { + if (SystemClock::Default()->CPUNanos() == 0) { + ROCKSDB_GTEST_SKIP("Target without CPUNanos support"); + return; + } + DestroyDB(kDbName, Options()); auto db = OpenDb(); WriteOptions write_options; @@ -830,7 +846,7 @@ std::string value = "v" + i_str; max_str = max_str > i_str ? 
max_str : i_str; - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } std::string last_key = "k" + max_str; std::string last_value = "v" + max_str; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/periodic_work_scheduler.h" + +#include "db/db_impl/db_impl.h" +#include "rocksdb/system_clock.h" + +#ifndef ROCKSDB_LITE +namespace ROCKSDB_NAMESPACE { + +PeriodicWorkScheduler::PeriodicWorkScheduler( + const std::shared_ptr& clock) { + timer = std::unique_ptr(new Timer(clock.get())); +} + +void PeriodicWorkScheduler::Register(DBImpl* dbi, + unsigned int stats_dump_period_sec, + unsigned int stats_persist_period_sec) { + MutexLock l(&timer_mu_); + static std::atomic initial_delay(0); + timer->Start(); + if (stats_dump_period_sec > 0) { + timer->Add([dbi]() { dbi->DumpStats(); }, GetTaskName(dbi, "dump_st"), + initial_delay.fetch_add(1) % + static_cast(stats_dump_period_sec) * + kMicrosInSecond, + static_cast(stats_dump_period_sec) * kMicrosInSecond); + } + if (stats_persist_period_sec > 0) { + timer->Add( + [dbi]() { dbi->PersistStats(); }, GetTaskName(dbi, "pst_st"), + initial_delay.fetch_add(1) % + static_cast(stats_persist_period_sec) * kMicrosInSecond, + static_cast(stats_persist_period_sec) * kMicrosInSecond); + } + timer->Add([dbi]() { dbi->FlushInfoLog(); }, + GetTaskName(dbi, "flush_info_log"), + 
initial_delay.fetch_add(1) % kDefaultFlushInfoLogPeriodSec * + kMicrosInSecond, + kDefaultFlushInfoLogPeriodSec * kMicrosInSecond); +} + +void PeriodicWorkScheduler::Unregister(DBImpl* dbi) { + MutexLock l(&timer_mu_); + timer->Cancel(GetTaskName(dbi, "dump_st")); + timer->Cancel(GetTaskName(dbi, "pst_st")); + timer->Cancel(GetTaskName(dbi, "flush_info_log")); + if (!timer->HasPendingTask()) { + timer->Shutdown(); + } +} + +PeriodicWorkScheduler* PeriodicWorkScheduler::Default() { + // Always use the default SystemClock for the scheduler, as we only use the + // NowMicros which is the same for all clocks. The Env could only be + // overridden in test. + static PeriodicWorkScheduler scheduler(SystemClock::Default()); + return &scheduler; +} + +std::string PeriodicWorkScheduler::GetTaskName(DBImpl* dbi, + const std::string& func_name) { + std::string db_session_id; + // TODO: Should this error be ignored? + dbi->GetDbSessionId(db_session_id).PermitUncheckedError(); + return db_session_id + ":" + func_name; +} + +#ifndef NDEBUG + +// Get the static scheduler. For a new SystemClock, it needs to re-create the +// internal timer, so only re-create it when there's no running task. Otherwise, +// return the existing scheduler. Which means if the unittest needs to update +// MockClock, Close all db instances and then re-open them. 
+PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default( + const std::shared_ptr& clock) { + static PeriodicWorkTestScheduler scheduler(clock); + static port::Mutex mutex; + { + MutexLock l(&mutex); + if (scheduler.timer.get() != nullptr && + scheduler.timer->TEST_GetPendingTaskNum() == 0) { + { + MutexLock timer_mu_guard(&scheduler.timer_mu_); + scheduler.timer->Shutdown(); + } + scheduler.timer.reset(new Timer(clock.get())); + } + } + return &scheduler; +} + +void PeriodicWorkTestScheduler::TEST_WaitForRun( + std::function callback) const { + if (timer != nullptr) { + timer->TEST_WaitForRun(callback); + } +} + +size_t PeriodicWorkTestScheduler::TEST_GetValidTaskNum() const { + if (timer != nullptr) { + return timer->TEST_GetPendingTaskNum(); + } + return 0; +} + +PeriodicWorkTestScheduler::PeriodicWorkTestScheduler( + const std::shared_ptr& clock) + : PeriodicWorkScheduler(clock) {} + +#endif // !NDEBUG +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "db/db_impl/db_impl.h" +#include "util/timer.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; + +// PeriodicWorkScheduler is a singleton object, which is scheduling/running +// DumpStats(), PersistStats(), and FlushInfoLog() for all DB instances. All DB +// instances use the same object from `Default()`. 
+// +// Internally, it uses a single threaded timer object to run the periodic work +// functions. Timer thread will always be started since the info log flushing +// cannot be disabled. +class PeriodicWorkScheduler { + public: + static PeriodicWorkScheduler* Default(); + + PeriodicWorkScheduler() = delete; + PeriodicWorkScheduler(const PeriodicWorkScheduler&) = delete; + PeriodicWorkScheduler(PeriodicWorkScheduler&&) = delete; + PeriodicWorkScheduler& operator=(const PeriodicWorkScheduler&) = delete; + PeriodicWorkScheduler& operator=(PeriodicWorkScheduler&&) = delete; + + void Register(DBImpl* dbi, unsigned int stats_dump_period_sec, + unsigned int stats_persist_period_sec); + + void Unregister(DBImpl* dbi); + + // Periodically flush info log out of application buffer at a low frequency. + // This improves debuggability in case of RocksDB hanging since it ensures the + // log messages leading up to the hang will eventually become visible in the + // log. + static const uint64_t kDefaultFlushInfoLogPeriodSec = 10; + + protected: + std::unique_ptr timer; + // `timer_mu_` serves two purposes currently: + // (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as + // they are currently not implemented in a thread-safe way; and + // (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and + // the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically. + port::Mutex timer_mu_; + + explicit PeriodicWorkScheduler(const std::shared_ptr& clock); + + private: + std::string GetTaskName(DBImpl* dbi, const std::string& func_name); +}; + +#ifndef NDEBUG +// PeriodicWorkTestScheduler is for unittest, which can specify the SystemClock +// It also contains functions for unittest. 
+class PeriodicWorkTestScheduler : public PeriodicWorkScheduler { + public: + static PeriodicWorkTestScheduler* Default( + const std::shared_ptr& clock); + + void TEST_WaitForRun(std::function callback) const; + + size_t TEST_GetValidTaskNum() const; + + private: + explicit PeriodicWorkTestScheduler(const std::shared_ptr& clock); +}; +#endif // !NDEBUG + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,236 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/periodic_work_scheduler.h" + +#include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "test_util/mock_time_env.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +class PeriodicWorkSchedulerTest : public DBTestBase { + public: + PeriodicWorkSchedulerTest() + : DBTestBase("periodic_work_scheduler_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } + + protected: + std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { + auto* periodic_work_scheduler_ptr = + reinterpret_cast(arg); + *periodic_work_scheduler_ptr = + PeriodicWorkTestScheduler::Default(mock_clock_); + }); + } +}; + +TEST_F(PeriodicWorkSchedulerTest, Basic) { + constexpr unsigned int kPeriodSec = + PeriodicWorkScheduler::kDefaultFlushInfoLogPeriodSec; + Close(); + Options options; + options.stats_dump_period_sec = kPeriodSec; + options.stats_persist_period_sec = kPeriodSec; + options.create_if_missing = true; + options.env = mock_env_.get(); + + int dump_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:StartRunning", + [&](void*) { dump_st_counter++; }); + + int pst_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning", + [&](void*) { pst_st_counter++; }); + + int flush_info_log_counter = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushInfoLog:StartRunning", + [&](void*) { flush_info_log_counter++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); + + ASSERT_GT(kPeriodSec, 1u); + dbfull()->TEST_WaitForStatsDumpRun([&] { + 
mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec) - 1); + }); + + auto scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); + ASSERT_NE(nullptr, scheduler); + ASSERT_EQ(3, scheduler->TEST_GetValidTaskNum()); + + ASSERT_EQ(1, dump_st_counter); + ASSERT_EQ(1, pst_st_counter); + ASSERT_EQ(1, flush_info_log_counter); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + + ASSERT_EQ(2, dump_st_counter); + ASSERT_EQ(2, pst_st_counter); + ASSERT_EQ(2, flush_info_log_counter); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + + ASSERT_EQ(3, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(3, flush_info_log_counter); + + // Disable scheduler with SetOption + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_dump_period_sec", "0"}, {"stats_persist_period_sec", "0"}})); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); + + // Info log flush should still run. 
+ dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + ASSERT_EQ(3, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(4, flush_info_log_counter); + + scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); + ASSERT_EQ(1u, scheduler->TEST_GetValidTaskNum()); + + // Re-enable one task + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}})); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); + + scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); + ASSERT_NE(nullptr, scheduler); + ASSERT_EQ(2, scheduler->TEST_GetValidTaskNum()); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + ASSERT_EQ(4, dump_st_counter); + ASSERT_EQ(3, pst_st_counter); + ASSERT_EQ(5, flush_info_log_counter); + + Close(); +} + +TEST_F(PeriodicWorkSchedulerTest, MultiInstances) { + constexpr int kPeriodSec = 5; + const int kInstanceNum = 10; + + Close(); + Options options; + options.stats_dump_period_sec = kPeriodSec; + options.stats_persist_period_sec = kPeriodSec; + options.create_if_missing = true; + options.env = mock_env_.get(); + + int dump_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:2", + [&](void*) { dump_st_counter++; }); + + int pst_st_counter = 0; + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning", + [&](void*) { pst_st_counter++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + auto dbs = std::vector(kInstanceNum); + for (int i = 0; i < kInstanceNum; i++) { + ASSERT_OK( + DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i]))); + } + + auto dbi = static_cast_with_check(dbs[kInstanceNum - 1]); + auto scheduler = dbi->TEST_GetPeriodicWorkScheduler(); + ASSERT_EQ(kInstanceNum * 3, scheduler->TEST_GetValidTaskNum()); + + int expected_run = kInstanceNum; + dbi->TEST_WaitForStatsDumpRun( + 
[&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + expected_run += kInstanceNum; + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + expected_run += kInstanceNum; + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + int half = kInstanceNum / 2; + for (int i = 0; i < half; i++) { + delete dbs[i]; + } + + expected_run += (kInstanceNum - half) * 2; + + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + dbi->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + ASSERT_EQ(expected_run, dump_st_counter); + ASSERT_EQ(expected_run, pst_st_counter); + + for (int i = half; i < kInstanceNum; i++) { + ASSERT_OK(dbs[i]->Close()); + delete dbs[i]; + } +} + +TEST_F(PeriodicWorkSchedulerTest, MultiEnv) { + constexpr int kDumpPeriodSec = 5; + constexpr int kPersistPeriodSec = 10; + Close(); + Options options1; + options1.stats_dump_period_sec = kDumpPeriodSec; + options1.stats_persist_period_sec = kPersistPeriodSec; + options1.create_if_missing = true; + options1.env = mock_env_.get(); + + Reopen(options1); + + std::unique_ptr mock_env2( + new CompositeEnvWrapper(Env::Default(), mock_clock_)); + Options options2; + options2.stats_dump_period_sec = kDumpPeriodSec; + options2.stats_persist_period_sec = kPersistPeriodSec; + options2.create_if_missing = true; + options1.env = mock_env2.get(); + + std::string dbname = test::PerThreadDBPath("multi_env_test"); + DB* db; + ASSERT_OK(DB::Open(options2, dbname, &db)); + DBImpl* dbi = static_cast_with_check(db); + + ASSERT_EQ(dbi->TEST_GetPeriodicWorkScheduler(), + dbfull()->TEST_GetPeriodicWorkScheduler()); + + 
ASSERT_OK(db->Close()); + delete db; + Close(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/pinned_iterators_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -43,7 +43,7 @@ } } - typedef void (*ReleaseFunction)(void* arg1); + using ReleaseFunction = void (*)(void* arg1); void PinPtr(void* ptr, ReleaseFunction release_func) { assert(pinning_enabled); if (ptr == nullptr) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/plain_table_db_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/plain_table_db_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/plain_table_db_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/plain_table_db_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,7 +16,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -32,26 +31,27 @@ #include "table/table_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/hash.h" #include "util/mutexlock.h" +#include "util/random.h" #include "util/string_util.h" #include "utilities/merge_operators.h" -using std::unique_ptr; namespace ROCKSDB_NAMESPACE { class PlainTableKeyDecoderTest : public testing::Test {}; TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) { - std::string tmp; Random rnd(301); const uint32_t kLength = 2222; - Slice contents = test::RandomString(&rnd, kLength, &tmp); + std::string 
tmp = rnd.RandomString(kLength); + Slice contents(tmp); test::StringSource* string_source = new test::StringSource(contents, 0, false); - + std::unique_ptr holder(string_source); std::unique_ptr file_reader( - test::GetRandomAccessFileReader(string_source)); + new RandomAccessFileReader(std::move(holder), "test")); std::unique_ptr file_info( new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(), kLength)); @@ -146,9 +146,7 @@ return options; } - DBImpl* dbfull() { - return reinterpret_cast(db_); - } + DBImpl* dbfull() { return static_cast_with_check(db_); } void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); @@ -222,8 +220,8 @@ int NumTableFilesAtLevel(int level) { std::string property; - EXPECT_TRUE(db_->GetProperty( - "rocksdb.num-files-at-level" + NumberToString(level), &property)); + EXPECT_TRUE(db_->GetProperty("rocksdb.num-files-at-level" + ToString(level), + &property)); return atoi(property.c_str()); } @@ -264,31 +262,26 @@ class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& env_options, - const InternalKeyComparator& icomparator, - EncodingType encoding_type, uint64_t file_size, - int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, - const TableProperties* table_properties, - std::unique_ptr&& file, - const ImmutableCFOptions& ioptions, - const SliceTransform* prefix_extractor, - bool* expect_bloom_not_match, bool store_index_in_file, - uint32_t column_family_id, - const std::string& column_family_name) + TestPlainTableReader( + const EnvOptions& env_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + std::unique_ptr&& props, + std::unique_ptr&& file, + const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, + bool* expect_bloom_not_match, bool store_index_in_file, + uint32_t column_family_id, const 
std::string& column_family_name) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, - encoding_type, file_size, table_properties, + encoding_type, file_size, props.get(), prefix_extractor), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataIfNeeded(); EXPECT_TRUE(s.ok()); - s = PopulateIndex(const_cast(table_properties), - bloom_bits_per_key, hash_table_ratio, index_sparseness, - 2 * 1024 * 1024); + s = PopulateIndex(props.get(), bloom_bits_per_key, hash_table_ratio, + index_sparseness, 2 * 1024 * 1024); EXPECT_TRUE(s.ok()); - TableProperties* props = const_cast(table_properties); EXPECT_EQ(column_family_id, static_cast(props->column_family_id)); EXPECT_EQ(column_family_name, props->column_family_name); if (store_index_in_file) { @@ -302,7 +295,7 @@ EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); } } - table_properties_.reset(props); + table_properties_ = std::move(props); } ~TestPlainTableReader() override {} @@ -336,31 +329,30 @@ column_family_id_(column_family_id), column_family_name_(std::move(column_family_name)) {} + using PlainTableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const override { - TableProperties* props = nullptr; - auto s = - ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, &props, - true /* compression_type_missing */); + std::unique_ptr props; + auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, &props); EXPECT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; - s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, - BloomBlockBuilder::kBloomBlock, 
&bloom_block_handle, - /* compression_type_missing */ true); + s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, + BloomBlockBuilder::kBloomBlock, + &bloom_block_handle); EXPECT_TRUE(s.ok()); BlockHandle index_block_handle; - s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, - PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_handle, /* compression_type_missing */ true); + s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, + table_reader_options.ioptions, + PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_handle); EXPECT_TRUE(s.ok()); } @@ -374,9 +366,9 @@ std::unique_ptr new_reader(new TestPlainTableReader( table_reader_options.env_options, table_reader_options.internal_comparator, encoding_type, file_size, - bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), table_reader_options.ioptions, - table_reader_options.prefix_extractor, expect_bloom_not_match_, + bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, + std::move(props), std::move(file), table_reader_options.ioptions, + table_reader_options.prefix_extractor.get(), expect_bloom_not_match_, store_index_in_file_, column_family_id_, column_family_name_)); *table = std::move(new_reader); @@ -396,7 +388,7 @@ TEST_P(PlainTableDBTest, BadOptions1) { // Build with a prefix extractor ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open without a prefix extractor Options options = CurrentOptions(); @@ -427,7 +419,9 @@ // Build without a prefix extractor // (apparently works even if hash_table_ratio > 0) ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + // Build without a prefix extractor, this call will fail and returns the + // status for this bad attempt. 
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor Status s = TryReopen(&options); @@ -502,14 +496,15 @@ ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-table-readers-mem", &int_num)); ASSERT_GT(int_num, 0U); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK( + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); auto row = ptc.begin(); auto tp = row->second; @@ -594,23 +589,23 @@ DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put("1000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v2", Get("1000000000000foo")); ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v3", Get("0000000000000eee")); ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); ASSERT_OK(Put("0000000000000eee", "v5")); ASSERT_OK(Put("9000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("0000000000000eee")); // Test Bloom Filter @@ -650,7 +645,7 @@ DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); int copied = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -728,7 +723,7 @@ 
ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -798,7 +793,7 @@ expect_bloom_not_match = false; } } - + ASSERT_OK(iter->status()); delete iter; } } @@ -839,7 +834,7 @@ for (unsigned i = 0; i < 2345; ++i) { ASSERT_OK(Put(NthKey(i, 'y'), "added")); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("added", Get(NthKey(42, 'y'))); for (unsigned i = 0; i < 32; ++i) { @@ -897,7 +892,7 @@ ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -945,7 +940,7 @@ ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -980,7 +975,7 @@ ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -1058,7 +1053,7 @@ ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1119,6 +1114,7 @@ iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1152,7 +1148,7 @@ ASSERT_OK(Put("2000000000000fo2", "v")); 
ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1212,6 +1208,7 @@ iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1234,7 +1231,7 @@ ASSERT_OK(Put("5000000000000fo1", "v2")); ASSERT_OK(Put("5000000000000fo2", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1258,6 +1255,7 @@ iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } @@ -1267,15 +1265,9 @@ return std::string(buf); } -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - TEST_P(PlainTableDBTest, CompactionTrigger) { Options options = CurrentOptions(); - options.write_buffer_size = 120 << 10; // 100KB + options.write_buffer_size = 120 << 10; // 120KB options.num_levels = 3; options.level0_file_num_compaction_trigger = 3; Reopen(&options); @@ -1287,22 +1279,22 @@ std::vector values; // Write 120KB (10 values, each 12K) for (int i = 0; i < 10; i++) { - values.push_back(RandomString(&rnd, 12000)); + values.push_back(rnd.RandomString(12 << 10)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); } //generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); + values.push_back(rnd.RandomString(10000)); ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); 
ASSERT_EQ(NumTableFilesAtLevel(1), 1); @@ -1318,7 +1310,7 @@ ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); options.create_if_missing = false; std::shared_ptr block_based_factory( @@ -1334,7 +1326,7 @@ ASSERT_OK(Put("2000000000000foo", "v4")); ASSERT_OK(Put("3000000000000bar", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v4", Get("2000000000000foo")); ASSERT_EQ("v5", Get("3000000000000bar")); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/pre_release_callback.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/pre_release_callback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/pre_release_callback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/pre_release_callback.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,11 +6,10 @@ #pragma once #include "rocksdb/status.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -class DB; - class PreReleaseCallback { public: virtual ~PreReleaseCallback() {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/prefix_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/prefix_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/prefix_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/prefix_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -25,8 +25,10 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "test_util/testharness.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/gflags_compat.h" #include "util/random.h" @@ -310,7 +312,7 @@ ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006")); ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011")); 
ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -330,7 +332,7 @@ ASSERT_OK(db->Put(write_options, "pikachu", "1")); ASSERT_OK(db->Put(write_options, "Meowth", "1")); ASSERT_OK(db->Put(write_options, "Mewtwo", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -350,7 +352,7 @@ std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -370,9 +372,11 @@ ASSERT_TRUE(v16 == iter->value()); iter->Next(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); SeekIterator(iter.get(), 2, 0); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6)); ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5)); @@ -396,9 +400,11 @@ ASSERT_TRUE(v17 == iter->value()); iter->Next(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); SeekIterator(iter.get(), 2, 0); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); // 3. Insert an entry for the same prefix as the head of the bucket. 
Slice v15("v15"); @@ -523,7 +529,7 @@ while (NextOptions(num_buckets)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -538,11 +544,11 @@ PutKey(db.get(), write_options, 12345, 8, v18); PutKey(db.get(), write_options, 12345, 9, v19); PutKey(db.get(), write_options, 12346, 8, v16); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); TestKey test_key(12346, 8); std::string s; - db->Delete(write_options, TestKeyToSlice(s, test_key)); - db->Flush(FlushOptions()); + ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key))); + ASSERT_OK(db->Flush(FlushOptions())); read_options.prefix_same_as_start = true; std::unique_ptr iter(db->NewIterator(read_options)); SeekIterator(iter.get(), 12345, 6); @@ -567,6 +573,7 @@ // Verify seeking past the prefix won't return a result. SeekIterator(iter.get(), 12345, 10); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); } } } @@ -575,7 +582,7 @@ while (NextOptions(FLAGS_bucket_count)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -586,12 +593,11 @@ } if (FLAGS_random_prefix) { - std::random_shuffle(prefixes.begin(), prefixes.end()); + RandomShuffle(prefixes.begin(), prefixes.end()); } HistogramImpl hist_put_time; HistogramImpl hist_put_comparison; - // insert x random prefix, each with y continuous element. 
for (auto prefix : prefixes) { for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { @@ -602,7 +608,7 @@ std::string value(FLAGS_value_size, 0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); ASSERT_OK(db->Put(write_options, key, value)); hist_put_time.Add(timer.ElapsedNanos()); hist_put_comparison.Add(get_perf_context()->user_key_comparison_count); @@ -625,7 +631,7 @@ std::string value = "v" + ToString(0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); auto key_prefix = options.prefix_extractor->Transform(key); uint64_t total_keys = 0; for (iter->Seek(key); @@ -659,11 +665,12 @@ Slice key = TestKeyToSlice(s, test_key); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); hist_no_seek_time.Add(timer.ElapsedNanos()); hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); } std::cout << "non-existing Seek key comparison: \n" @@ -682,7 +689,7 @@ for (size_t m = 1; m < 100; m++) { std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -707,7 +714,7 @@ } } if (i < 2) { - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); } } @@ -767,6 +774,7 @@ SliceToTestKey(iter->key()).prefix != stored_prefix) { break; } + ASSERT_OK(iter->status()); stored_prefix = SliceToTestKey(iter->key()).prefix; ASSERT_TRUE(iter->Valid()); ASSERT_NE(it, whole_map.end()); @@ -798,7 +806,7 @@ options.memtable_factory.reset(new SkipListFactory); options.write_buffer_size = 1024 * 1024; std::string v13("v13"); - 
DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -806,17 +814,20 @@ PutKey(db.get(), write_options, TestKey(1, 4), "v14"); PutKey(db.get(), write_options, TestKey(3, 3), "v33"); PutKey(db.get(), write_options, TestKey(3, 4), "v34"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); PutKey(db.get(), write_options, TestKey(1, 1), "v11"); PutKey(db.get(), write_options, TestKey(1, 3), "v13"); PutKey(db.get(), write_options, TestKey(2, 1), "v21"); PutKey(db.get(), write_options, TestKey(2, 2), "v22"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); std::unique_ptr iter(db->NewIterator(read_options)); SeekIterator(iter.get(), 1, 5); iter->Prev(); + ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->value(), v13); } @@ -831,27 +842,29 @@ Slice upper_bound = TestKeyToSlice(s, upper_bound_key); { - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; read_options.iterate_upper_bound = &upper_bound; PutKey(db.get(), write_options, TestKey(1, 2), "v12"); PutKey(db.get(), write_options, TestKey(1, 4), "v14"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); PutKey(db.get(), write_options, TestKey(1, 1), "v11"); PutKey(db.get(), write_options, TestKey(1, 3), "v13"); PutKey(db.get(), write_options, TestKey(2, 1), "v21"); PutKey(db.get(), write_options, TestKey(2, 2), "v22"); - db->Flush(FlushOptions()); - 
reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); std::unique_ptr iter(db->NewIterator(read_options)); iter->SeekToLast(); ASSERT_EQ(iter->value(), v14); } { - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -860,12 +873,14 @@ PutKey(db.get(), write_options, TestKey(1, 4), "v14"); PutKey(db.get(), write_options, TestKey(3, 3), "v33"); PutKey(db.get(), write_options, TestKey(3, 4), "v34"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); PutKey(db.get(), write_options, TestKey(1, 1), "v11"); PutKey(db.get(), write_options, TestKey(1, 3), "v13"); - db->Flush(FlushOptions()); - reinterpret_cast(db.get())->TEST_WaitForFlushMemTable(); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK( + static_cast_with_check(db.get())->TEST_WaitForFlushMemTable()); std::unique_ptr iter(db->NewIterator(read_options)); iter->SeekToLast(); ASSERT_EQ(iter->value(), v14); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,17 +33,22 @@ if (smallest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_smallest = pinned_bounds_.back(); - if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) { - assert(false); - } + Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); 
+ smallest_ = &parsed_smallest; } if (largest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_largest = pinned_bounds_.back(); - if (!ParseInternalKey(largest->Encode(), &parsed_largest)) { - assert(false); - } + + Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + if (parsed_largest.type == kTypeRangeDeletion && parsed_largest.sequence == kMaxSequenceNumber) { // The file boundary has been artificially extended by a range tombstone. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator.h 2025-05-19 16:14:27.000000000 +0000 @@ -43,12 +43,12 @@ void InternalNext(); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the earliest tombstone that ends after target. void Seek(const Slice& target); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the latest tombstone that starts before target. 
void SeekForPrev(const Slice& target); @@ -283,9 +283,14 @@ bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { ParsedInternalKey parsed; - if (!ParseInternalKey(key, &parsed)) { + + Status pik_status = + ParseInternalKey(key, &parsed, false /* log_err_key */); // TODO + assert(pik_status.ok()); + if (!pik_status.ok()) { return false; } + return ShouldDelete(parsed, mode); } virtual bool ShouldDelete(const ParsedInternalKey& parsed, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,24 +11,24 @@ } #else -#include #include +#include #include #include #include #include #include +#include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" -#include "rocksdb/env.h" -#include "test_util/testutil.h" +#include "rocksdb/system_clock.h" #include "util/coding.h" +#include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" - -#include "util/gflags_compat.h" +#include "util/vector_iterator.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; @@ -147,8 +147,8 @@ keys.push_back(key_and_value.first.Encode().ToString()); values.push_back(key_and_value.second.ToString()); } - return std::unique_ptr( - new test::VectorIterator(keys, values)); + return std::unique_ptr( + new VectorIterator(keys, values, &icmp)); } // convert long to a big-endian slice key @@ -172,6 +172,8 @@ ParseCommandLineFlags(&argc, &argv, true); Stats stats; + ROCKSDB_NAMESPACE::SystemClock* clock = + ROCKSDB_NAMESPACE::SystemClock::Default().get(); ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed); std::default_random_engine random_gen(FLAGS_seed); 
std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, @@ -206,8 +208,6 @@ ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j); } - auto range_del_iter = - ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones); fragmented_range_tombstone_lists.emplace_back( new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList( ROCKSDB_NAMESPACE::MakeRangeDelIterator( @@ -220,7 +220,7 @@ ROCKSDB_NAMESPACE::kMaxSequenceNumber)); ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.AddTombstones(std::move(fragmented_range_del_iter)); stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); } @@ -237,7 +237,7 @@ parsed_key.user_key = key_string; ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.ShouldDelete(parsed_key, mode); uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_del_aggregator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" #include "test_util/testutil.h" +#include "util/vector_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -30,8 +31,8 @@ keys.push_back(key_and_value.first.Encode().ToString()); values.push_back(key_and_value.second.ToString()); } - return std::unique_ptr( - new test::VectorIterator(keys, values)); + return std::unique_ptr( + new VectorIterator(keys, values, &bytewise_icmp)); } std::vector> diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,12 +6,11 @@ #include "db/range_tombstone_fragmenter.h" #include +#include +#include #include #include -#include -#include - #include "util/autovector.h" #include "util/kv_map.h" #include "util/vector_iterator.h" @@ -26,12 +25,15 @@ return; } bool is_sorted = true; - int num_tombstones = 0; InternalKey pinned_last_start_key; Slice last_start_key; + num_unfragmented_tombstones_ = 0; + total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); - unfragmented_tombstones->Next(), num_tombstones++) { - if (num_tombstones > 0 && + unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); + if (num_unfragmented_tombstones_ > 0 && icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) { is_sorted = false; break; @@ -51,10 +53,14 @@ // Sort the tombstones before fragmenting them. std::vector keys, values; - keys.reserve(num_tombstones); - values.reserve(num_tombstones); + keys.reserve(num_unfragmented_tombstones_); + values.reserve(num_unfragmented_tombstones_); + // Reset the counter to zero for the next iteration over keys. 
+ total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); unfragmented_tombstones->Next()) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); keys.emplace_back(unfragmented_tombstones->key().data(), unfragmented_tombstones->key().size()); values.emplace_back(unfragmented_tombstones->value().data(), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter.h 2025-05-19 16:14:27.000000000 +0000 @@ -68,6 +68,14 @@ // number in [lower, upper]. bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const; + uint64_t num_unfragmented_tombstones() const { + return num_unfragmented_tombstones_; + } + + uint64_t total_tombstone_payload_bytes() const { + return total_tombstone_payload_bytes_; + } + private: // Given an ordered range tombstone iterator unfragmented_tombstones, // "fragment" the tombstones into non-overlapping pieces, and store them in @@ -82,6 +90,8 @@ std::set seq_set_; std::list pinned_slices_; PinnedIteratorsManager pinned_iters_mgr_; + uint64_t num_unfragmented_tombstones_; + uint64_t total_tombstone_payload_bytes_; }; // FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del @@ -180,6 +190,13 @@ SequenceNumber upper_bound() const { return upper_bound_; } SequenceNumber lower_bound() const { return lower_bound_; } + uint64_t num_unfragmented_tombstones() const { + return tombstones_->num_unfragmented_tombstones(); + } + uint64_t total_tombstone_payload_bytes() const { + return tombstones_->total_tombstone_payload_bytes(); + } + private: using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack; diff 
-Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/range_tombstone_fragmenter_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,8 +6,10 @@ #include "db/range_tombstone_fragmenter.h" #include "db/db_test_util.h" +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "test_util/testutil.h" +#include "util/vector_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -25,8 +27,8 @@ keys.push_back(key_and_value.first.Encode().ToString()); values.push_back(key_and_value.second.ToString()); } - return std::unique_ptr( - new test::VectorIterator(keys, values)); + return std::unique_ptr( + new VectorIterator(keys, values, &bytewise_icmp)); } void CheckIterPosition(const RangeTombstone& tombstone, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/read_callback.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/read_callback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/read_callback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/read_callback.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,13 +5,14 @@ #pragma once +#include "db/dbformat.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class ReadCallback { public: - ReadCallback(SequenceNumber last_visible_seq) + explicit ReadCallback(SequenceNumber last_visible_seq) : max_visible_seq_(last_visible_seq) {} ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted) : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair.cc 2025-05-19 16:14:27.000000000 +0000 @@ -62,6 +62,7 @@ #ifndef ROCKSDB_LITE #include + #include "db/builder.h" #include "db/db_impl/db_impl.h" #include "db/dbformat.h" @@ -71,9 +72,9 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -94,15 +95,16 @@ const ColumnFamilyOptions& default_cf_opts, const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs) : dbname_(dbname), + db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)), env_(db_options.env), - env_options_(), + file_options_(), db_options_(SanitizeOptions(dbname_, db_options)), immutable_db_options_(ImmutableDBOptions(db_options_)), icmp_(default_cf_opts.comparator), default_cf_opts_( SanitizeOptions(immutable_db_options_, default_cf_opts)), - default_cf_iopts_( - ImmutableCFOptions(immutable_db_options_, default_cf_opts_)), + default_iopts_( + ImmutableOptions(immutable_db_options_, default_cf_opts_)), unknown_cf_opts_( SanitizeOptions(immutable_db_options_, unknown_cf_opts)), create_unknown_cfs_(create_unknown_cfs), @@ -110,16 +112,19 @@ // TableCache can be small since we expect each table to be opened // once. 
NewLRUCache(10, db_options_.table_cache_numshardbits)), - table_cache_(new TableCache(default_cf_iopts_, env_options_, + table_cache_(new TableCache(default_iopts_, &file_options_, raw_table_cache_.get(), - /*block_cache_tracer=*/nullptr)), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, db_session_id_)), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), - vset_(dbname_, &immutable_db_options_, env_options_, + vset_(dbname_, &immutable_db_options_, file_options_, raw_table_cache_.get(), &wb_, &wc_, - /*block_cache_tracer=*/nullptr), + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + db_session_id_), next_file_number_(1), - db_lock_(nullptr) { + db_lock_(nullptr), + closed_(false) { for (const auto& cfd : column_families) { cf_name_to_opts_[cfd.name] = cfd.options; } @@ -163,29 +168,37 @@ return status; } - ~Repairer() { - if (db_lock_ != nullptr) { - env_->UnlockFile(db_lock_); + Status Close() { + Status s = Status::OK(); + if (!closed_) { + if (db_lock_ != nullptr) { + s = env_->UnlockFile(db_lock_); + db_lock_ = nullptr; + } + closed_ = true; } - delete table_cache_; + return s; } + ~Repairer() { Close().PermitUncheckedError(); } + Status Run() { Status status = env_->LockFile(LockFileName(dbname_), &db_lock_); if (!status.ok()) { return status; } status = FindFiles(); + DBImpl* db_impl = nullptr; if (status.ok()) { // Discard older manifests and start a fresh one for (size_t i = 0; i < manifests_.size(); i++) { ArchiveFile(dbname_ + "/" + manifests_[i]); } // Just create a DBImpl temporarily so we can reuse NewDB() - DBImpl* db_impl = new DBImpl(db_options_, dbname_); - status = db_impl->NewDB(); - delete db_impl; + db_impl = new DBImpl(db_options_, dbname_); + status = db_impl->NewDB(/*new_filenames=*/nullptr); } + delete db_impl; if (status.ok()) { // Recover using the fresh manifest created by NewDB() @@ -229,17 +242,18 @@ }; std::string const dbname_; + std::string db_session_id_; Env* const env_; - const 
EnvOptions env_options_; + const FileOptions file_options_; const DBOptions db_options_; const ImmutableDBOptions immutable_db_options_; const InternalKeyComparator icmp_; const ColumnFamilyOptions default_cf_opts_; - const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference + const ImmutableOptions default_iopts_; // table_cache_ holds reference const ColumnFamilyOptions unknown_cf_opts_; const bool create_unknown_cfs_; std::shared_ptr raw_table_cache_; - TableCache* table_cache_; + std::unique_ptr table_cache_; WriteBufferManager wb_; WriteController wc_; VersionSet vset_; @@ -254,6 +268,7 @@ // Lock over the persistent DB state. Non-nullptr iff successfully // acquired. FileLock* db_lock_; + bool closed_; Status FindFiles() { std::vector filenames; @@ -265,21 +280,15 @@ } // search wal_dir if user uses a customize wal_dir - bool same = false; - Status status = env_->AreFilesSame(db_options_.wal_dir, dbname_, &same); - if (status.IsNotSupported()) { - same = db_options_.wal_dir == dbname_; - status = Status::OK(); - } else if (!status.ok()) { - return status; - } - + bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_); if (!same) { - to_search_paths.push_back(db_options_.wal_dir); + to_search_paths.push_back(immutable_db_options_.wal_dir); } for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) { - status = env_->GetChildren(to_search_paths[path_id], &filenames); + ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n", + to_search_paths[path_id].c_str()); + Status status = env_->GetChildren(to_search_paths[path_id], &filenames); if (!status.ok()) { return status; } @@ -297,7 +306,7 @@ if (number + 1 > next_file_number_) { next_file_number_ = number + 1; } - if (type == kLogFile) { + if (type == kWalFile) { logs_.push_back(number); } else if (type == kTableFile) { table_fds_.emplace_back(number, static_cast(path_id), @@ -316,10 +325,11 @@ } void ConvertLogFilesToTables() { + const auto& wal_dir = 
immutable_db_options_.GetWalDir(); for (size_t i = 0; i < logs_.size(); i++) { // we should use LogFileName(wal_dir, logs_[i]) here. user might uses wal_dir option. - std::string logname = LogFileName(db_options_.wal_dir, logs_[i]); - Status status = ConvertLogToTable(logs_[i]); + std::string logname = LogFileName(wal_dir, logs_[i]); + Status status = ConvertLogToTable(wal_dir, logs_[i]); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring conversion error: %s", @@ -329,7 +339,7 @@ } } - Status ConvertLogToTable(uint64_t log) { + Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) { struct LogReporter : public log::Reader::Reporter { Env* env; std::shared_ptr info_log; @@ -342,15 +352,15 @@ }; // Open the log file - std::string logname = LogFileName(db_options_.wal_dir, log); - std::unique_ptr lfile; - Status status = env_->NewSequentialFile( - logname, &lfile, env_->OptimizeForLogRead(env_options_)); + std::string logname = LogFileName(wal_dir, log); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr lfile_reader; + Status status = SequentialFileReader::Create( + fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader, + nullptr); if (!status.ok()) { return status; } - std::unique_ptr lfile_reader(new SequentialFileReader( - NewLegacySequentialFileWrapper(lfile), logname)); // Create the log reader. 
LogReporter reporter; @@ -382,15 +392,16 @@ record.size(), Status::Corruption("log record too small")); continue; } - WriteBatchInternal::SetContents(&batch, record); - status = - WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); - if (status.ok()) { + Status record_status = WriteBatchInternal::SetContents(&batch, record); + if (record_status.ok()) { + record_status = + WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); + } + if (record_status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s", - log, status.ToString().c_str()); - status = Status::OK(); // Keep going with rest of file + log, record_status.ToString().c_str()); } } @@ -410,7 +421,8 @@ Arena arena; ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); int64_t _current_time = 0; - status = env_->GetCurrentTime(&_current_time); // ignore error + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); @@ -423,18 +435,26 @@ range_del_iters.emplace_back(range_del_iter); } - LegacyFileSystemWrapper fs(env_); - status = BuildTable( - dbname_, env_, &fs, *cfd->ioptions(), - *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_, - iter.get(), std::move(range_del_iters), &meta, + IOStatus io_s; + CompressionOptions default_compression; + TableBuilderOptions tboptions( + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber, - snapshot_checker, kNoCompression, 0 /* sample_for_compression */, - CompressionOptions(), false, nullptr /* internal_stats */, - TableFileCreationReason::kRecovery, nullptr /* event_logger */, - 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level 
*/, current_time, write_hint); + kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), + -1 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "DB Repairer" /* db_id */, db_session_id_, 0 /*target_file_size*/, + meta.fd.GetNumber()); + status = BuildTable( + dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, + file_options_, table_cache_.get(), iter.get(), + std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, + {}, kMaxSequenceNumber, snapshot_checker, + false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, + nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery, + nullptr /* event_logger */, 0 /* job_id */, Env::IO_HIGH, + nullptr /* table_properties */, write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), @@ -481,8 +501,8 @@ file_size); std::shared_ptr props; if (status.ok()) { - status = table_cache_->GetTableProperties(env_options_, icmp_, t->meta.fd, - &props); + status = table_cache_->GetTableProperties(file_options_, icmp_, + t->meta.fd, &props); } if (status.ok()) { t->column_family_id = static_cast(props->column_family_id); @@ -522,20 +542,24 @@ ReadOptions ropts; ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( - ropts, env_options_, cfd->internal_comparator(), t->meta, + ropts, file_options_, cfd->internal_comparator(), t->meta, nullptr /* range_del_agg */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), + cfd->GetLatestMutableCFOptions()->prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, - /*level=*/-1, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, + 
/*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); - if (!ParseInternalKey(key, &parsed)) { + Status pik_status = + ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, - "Table #%" PRIu64 ": unparsable key %s", - t->meta.fd.GetNumber(), EscapeString(key).c_str()); + "Table #%" PRIu64 ": unparsable key - %s", + t->meta.fd.GetNumber(), pik_status.getState()); continue; } @@ -553,6 +577,30 @@ t->meta.fd.GetNumber(), counter, status.ToString().c_str()); } + if (status.ok()) { + // XXX/FIXME: This is just basic, naive handling of range tombstones, + // like call to UpdateBoundariesForRange in builder.cc where we assume + // an SST file is a full sorted run. This probably needs the extra logic + // from compaction_job.cc around call to UpdateBoundariesForRange (to + // handle range tombstones extendingg beyond range of other entries). 
+ ReadOptions ropts; + std::unique_ptr r_iter; + status = table_cache_->GetRangeTombstoneIterator( + ropts, cfd->internal_comparator(), t->meta, &r_iter); + + if (r_iter) { + r_iter->SeekToFirst(); + + while (r_iter->Valid()) { + auto tombstone = r_iter->Tombstone(); + auto kv = tombstone.Serialize(); + t->meta.UpdateBoundariesForRange( + kv.first, tombstone.SerializeEndKey(), tombstone.seq_, + cfd->internal_comparator()); + r_iter->Next(); + } + } + } return status; } @@ -585,9 +633,10 @@ table->meta.fd.GetFileSize(), table->meta.smallest, table->meta.largest, table->meta.fd.smallest_seqno, table->meta.fd.largest_seqno, table->meta.marked_for_compaction, - table->meta.oldest_blob_file_number, + table->meta.temperature, table->meta.oldest_blob_file_number, table->meta.oldest_ancester_time, table->meta.file_creation_time, - table->meta.file_checksum, table->meta.file_checksum_func_name); + table->meta.file_checksum, table->meta.file_checksum_func_name, + table->meta.min_timestamp, table->meta.max_timestamp); } assert(next_file_number_ > 0); vset_.MarkFileNumberUsed(next_file_number_ - 1); @@ -614,7 +663,7 @@ new_dir.assign(fname.data(), slash - fname.data()); } new_dir.append("/lost"); - env_->CreateDir(new_dir); // Ignore error + env_->CreateDir(new_dir).PermitUncheckedError(); // Ignore error std::string new_file = new_dir; new_file.append("/"); new_file.append((slash == nullptr) ? 
fname.c_str() : slash + 1); @@ -646,12 +695,16 @@ ) { ColumnFamilyOptions default_cf_opts; Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (!status.ok()) { + return status; + } + + Repairer repairer(dbname, db_options, column_families, default_cf_opts, + ColumnFamilyOptions() /* unknown_cf_opts */, + false /* create_unknown_cfs */); + status = repairer.Run(); if (status.ok()) { - Repairer repairer(dbname, db_options, column_families, - default_cf_opts, - ColumnFamilyOptions() /* unknown_cf_opts */, - false /* create_unknown_cfs */); - status = repairer.Run(); + status = repairer.Close(); } return status; } @@ -661,29 +714,33 @@ const ColumnFamilyOptions& unknown_cf_opts) { ColumnFamilyOptions default_cf_opts; Status status = GetDefaultCFOptions(column_families, &default_cf_opts); + if (!status.ok()) { + return status; + } + + Repairer repairer(dbname, db_options, column_families, default_cf_opts, + unknown_cf_opts, true /* create_unknown_cfs */); + status = repairer.Run(); if (status.ok()) { - Repairer repairer(dbname, db_options, - column_families, default_cf_opts, - unknown_cf_opts, true /* create_unknown_cfs */); - status = repairer.Run(); + status = repairer.Close(); } return status; } Status RepairDB(const std::string& dbname, const Options& options) { Options opts(options); - if (opts.file_system == nullptr) { - opts.file_system.reset(new LegacyFileSystemWrapper(opts.env)); - ; - } - DBOptions db_options(opts); ColumnFamilyOptions cf_options(opts); + Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */, cf_options /* unknown_cf_opts */, true /* create_unknown_cfs */); - return repairer.Run(); + Status status = repairer.Run(); + if (status.ok()) { + status = repairer.Close(); + } + return status; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair_test.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/repair_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/repair_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "rocksdb/options.h" #ifndef ROCKSDB_LITE #include @@ -22,30 +23,35 @@ #ifndef ROCKSDB_LITE class RepairTest : public DBTestBase { public: - RepairTest() : DBTestBase("/repair_test") {} + RepairTest() : DBTestBase("repair_test", /*env_do_fsync=*/true) {} - std::string GetFirstSstPath() { + Status GetFirstSstPath(std::string* first_sst_path) { + assert(first_sst_path != nullptr); + first_sst_path->clear(); uint64_t manifest_size; std::vector files; - db_->GetLiveFiles(files, &manifest_size); - auto sst_iter = - std::find_if(files.begin(), files.end(), [](const std::string& file) { - uint64_t number; - FileType type; - bool ok = ParseFileName(file, &number, &type); - return ok && type == kTableFile; - }); - return sst_iter == files.end() ? "" : dbname_ + *sst_iter; + Status s = db_->GetLiveFiles(files, &manifest_size); + if (s.ok()) { + auto sst_iter = + std::find_if(files.begin(), files.end(), [](const std::string& file) { + uint64_t number; + FileType type; + bool ok = ParseFileName(file, &number, &type); + return ok && type == kTableFile; + }); + *first_sst_path = sst_iter == files.end() ? "" : dbname_ + *sst_iter; + } + return s; } }; TEST_F(RepairTest, LostManifest) { // Add a couple SST files, delete the manifest, and verify RepairDB() saves // the day. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); // Need to get path before Close() deletes db_, but delete it after Close() to // ensure Close() didn't change the manifest. 
std::string manifest_path = @@ -61,12 +67,41 @@ ASSERT_EQ(Get("key2"), "val2"); } +TEST_F(RepairTest, LostManifestMoreDbFeatures) { + // Add a couple SST files, delete the manifest, and verify RepairDB() saves + // the day. + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Put("key3", "val3")); + ASSERT_OK(Put("key4", "val4")); + ASSERT_OK(Flush()); + // Test an SST file containing only a range tombstone + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key2", + "key3z")); + ASSERT_OK(Flush()); + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() didn't change the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + Reopen(CurrentOptions()); + + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("key2"), "NOT_FOUND"); + ASSERT_EQ(Get("key3"), "NOT_FOUND"); + ASSERT_EQ(Get("key4"), "val4"); +} + TEST_F(RepairTest, CorruptManifest) { // Manifest is in an invalid format. Expect a full recovery. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); // Need to get path before Close() deletes db_, but overwrite it after Close() // to ensure Close() didn't change the manifest. 
std::string manifest_path = @@ -75,8 +110,8 @@ Close(); ASSERT_OK(env_->FileExists(manifest_path)); - LegacyFileSystemWrapper fs(env_); - CreateFile(&fs, manifest_path, "blah", false /* use_fsync */); + ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah", + false /* use_fsync */)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); Reopen(CurrentOptions()); @@ -87,13 +122,13 @@ TEST_F(RepairTest, IncompleteManifest) { // In this case, the manifest is valid but does not reference all of the SST // files. Expect a full recovery. - Put("key", "val"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); std::string orig_manifest_path = DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); CopyFile(orig_manifest_path, orig_manifest_path + ".tmp"); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); // Need to get path before Close() deletes db_, but overwrite it after Close() // to ensure Close() didn't change the manifest. std::string new_manifest_path = @@ -113,10 +148,10 @@ TEST_F(RepairTest, PostRepairSstFileNumbering) { // Verify after a DB is repaired, new files will be assigned higher numbers // than old files. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); Close(); @@ -130,11 +165,12 @@ TEST_F(RepairTest, LostSst) { // Delete one of the SST files but preserve the manifest that refers to it, // then verify the DB is still usable for the intact SST. 
- Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); - auto sst_path = GetFirstSstPath(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + std::string sst_path; + ASSERT_OK(GetFirstSstPath(&sst_path)); ASSERT_FALSE(sst_path.empty()); ASSERT_OK(env_->DeleteFile(sst_path)); @@ -149,15 +185,16 @@ TEST_F(RepairTest, CorruptSst) { // Corrupt one of the SST files but preserve the manifest that refers to it, // then verify the DB is still usable for the intact SST. - Put("key", "val"); - Flush(); - Put("key2", "val2"); - Flush(); - auto sst_path = GetFirstSstPath(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + std::string sst_path; + ASSERT_OK(GetFirstSstPath(&sst_path)); ASSERT_FALSE(sst_path.empty()); - LegacyFileSystemWrapper fs(env_); - CreateFile(&fs, sst_path, "blah", false /* use_fsync */); + ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah", + false /* use_fsync */)); Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); @@ -170,13 +207,16 @@ TEST_F(RepairTest, UnflushedSst) { // This test case invokes repair while some data is unflushed, then verifies // that data is in the db. - Put("key", "val"); + ASSERT_OK(Put("key", "val")); VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 1); - uint64_t total_ssts_size; - GetAllSSTFiles(&total_ssts_size); - ASSERT_EQ(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_EQ(total_ssts_size, 0); + } // Need to get path before Close() deletes db_, but delete it after Close() to // ensure Close() didn't change the manifest. 
std::string manifest_path = @@ -190,8 +230,12 @@ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 0); - GetAllSSTFiles(&total_ssts_size); - ASSERT_GT(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_GT(total_ssts_size, 0); + } ASSERT_EQ(Get("key"), "val"); } @@ -199,14 +243,17 @@ do { Options options = CurrentOptions(); DestroyAndReopen(options); - Put("key", "val"); - Put("foo", "bar"); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Put("foo", "bar")); VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 1); - uint64_t total_ssts_size; - GetAllSSTFiles(&total_ssts_size); - ASSERT_EQ(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_EQ(total_ssts_size, 0); + } std::string manifest_path = DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); @@ -221,8 +268,12 @@ Reopen(options); ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 0); - GetAllSSTFiles(&total_ssts_size); - ASSERT_GT(total_ssts_size, 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_GT(total_ssts_size, 0); + } ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("foo"), "bar"); @@ -238,13 +289,13 @@ CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions()); for (int i = 0; i < kNumCfs; ++i) { for (int j = 0; j < kEntriesPerCf; ++j) { - Put(i, "key" + ToString(j), "val" + ToString(j)); + ASSERT_OK(Put(i, "key" + ToString(j), "val" + ToString(j))); if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) { // Leave one unflushed so we can verify WAL entries are properly // associated with column families. 
continue; } - Flush(i); + ASSERT_OK(Flush(i)); } } @@ -283,12 +334,12 @@ std::vector{opts, rev_opts}); for (int i = 0; i < kNumCfs; ++i) { for (int j = 0; j < kEntriesPerCf; ++j) { - Put(i, "key" + ToString(j), "val" + ToString(j)); + ASSERT_OK(Put(i, "key" + ToString(j), "val" + ToString(j))); if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) { // Leave one unflushed so we can verify RepairDB's flush logic continue; } - Flush(i); + ASSERT_OK(Flush(i)); } } Close(); @@ -308,7 +359,7 @@ // Examine table properties to verify RepairDB() used the right options when // converting WAL->SST TablePropertiesCollection fname_to_props; - db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props); + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props)); ASSERT_EQ(fname_to_props.size(), 2U); for (const auto& fname_and_props : fname_to_props) { std::string comparator_name ( @@ -342,8 +393,8 @@ } } - Put("key", "val"); - Flush(); + ASSERT_OK(Put("key", "val")); + ASSERT_OK(Flush()); Close(); ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions())); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/snapshot_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/snapshot_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/snapshot_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/snapshot_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #pragma once #include +#include "db/dbformat.h" #include "rocksdb/db.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +24,7 @@ SequenceNumber number_; // const after creation // It indicates the smallest uncommitted data at the time the snapshot was // taken. This is currently used by WritePrepared transactions to limit the - // scope of queries to IsInSnpashot. + // scope of queries to IsInSnapshot. 
SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; virtual SequenceNumber GetSequenceNumber() const override { return number_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,9 +13,11 @@ #include "db/range_tombstone_fragmenter.h" #include "db/snapshot_impl.h" #include "db/version_edit.h" +#include "file/file_util.h" #include "file/filename.h" #include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/advanced_options.h" #include "rocksdb/statistics.h" #include "table/block_based/block_based_table_reader.h" #include "table/get_context.h" @@ -62,14 +64,21 @@ } // namespace -TableCache::TableCache(const ImmutableCFOptions& ioptions, - const FileOptions& file_options, Cache* const cache, - BlockCacheTracer* const block_cache_tracer) +const int kLoadConcurency = 128; + +TableCache::TableCache(const ImmutableOptions& ioptions, + const FileOptions* file_options, Cache* const cache, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : ioptions_(ioptions), - file_options_(file_options), + file_options_(*file_options), cache_(cache), immortal_tables_(false), - block_cache_tracer_(block_cache_tracer) { + block_cache_tracer_(block_cache_tracer), + loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr), + io_tracer_(io_tracer), + db_session_id_(db_session_id) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. 
@@ -89,38 +98,54 @@ } Status TableCache::GetTableReader( - const FileOptions& file_options, + const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, bool skip_filters, int level, - bool prefetch_index_and_filter_in_cache) { + const std::shared_ptr& prefix_extractor, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; - Status s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, - nullptr); - RecordTick(ioptions_.statistics, NO_FILE_OPENS); - if (s.IsPathNotFound()) { + FileOptions fopts = file_options; + fopts.temperature = file_temperature; + Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + if (s.ok()) { + s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); + } + if (s.ok()) { + RecordTick(ioptions_.stats, NO_FILE_OPENS); + } else if (s.IsPathNotFound()) { fname = Rocks2LevelTableFileName(fname); - s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, nullptr); - RecordTick(ioptions_.statistics, NO_FILE_OPENS); + s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + if (s.ok()) { + s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, + nullptr); + } + if (s.ok()) { + RecordTick(ioptions_.stats, NO_FILE_OPENS); + } } if (s.ok()) { if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } - StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); std::unique_ptr file_reader( new RandomAccessFileReader( - 
std::move(file), fname, ioptions_.env, - record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); + std::move(file), fname, ioptions_.clock, io_tracer_, + record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS, + file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners, + file_temperature)); s = ioptions_.table_factory->NewTableReader( - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, skip_filters, immortal_tables_, - level, fd.largest_seqno, block_cache_tracer_), + ro, + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + skip_filters, immortal_tables_, false /* force_direct_prefetch */, + level, fd.largest_seqno, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, fd.GetNumber()), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -135,16 +160,15 @@ cache_->Erase(key); } -Status TableCache::FindTable(const FileOptions& file_options, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, Cache::Handle** handle, - const SliceTransform* prefix_extractor, - const bool no_io, bool record_read_stats, - HistogramImpl* file_read_hist, bool skip_filters, - int level, - bool prefetch_index_and_filter_in_cache) { - PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env); - Status s; +Status TableCache::FindTable( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, + Cache::Handle** handle, + const std::shared_ptr& prefix_extractor, + const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, + bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { + 
PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); *handle = cache_->Lookup(key); @@ -152,17 +176,25 @@ const_cast(&no_io)); if (*handle == nullptr) { - if (no_io) { // Don't do IO and return a not-found status + if (no_io) { return Status::Incomplete("Table not found in table_cache, no_io is set"); } + MutexLock load_lock(loader_mutex_.get(key)); + // We check the cache again under loading mutex + *handle = cache_->Lookup(key); + if (*handle != nullptr) { + return Status::OK(); + } + std::unique_ptr table_reader; - s = GetTableReader(file_options, internal_comparator, fd, - false /* sequential mode */, record_read_stats, - file_read_hist, &table_reader, prefix_extractor, - skip_filters, level, prefetch_index_and_filter_in_cache); + Status s = GetTableReader( + ro, file_options, internal_comparator, fd, false /* sequential mode */, + record_read_stats, file_read_hist, &table_reader, prefix_extractor, + skip_filters, level, prefetch_index_and_filter_in_cache, + max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(ioptions_.statistics, NO_FILE_ERRORS); + RecordTick(ioptions_.stats, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. 
} else { @@ -173,18 +205,21 @@ table_reader.release(); } } + return s; } - return s; + return Status::OK(); } InternalIterator* TableCache::NewIterator( const ReadOptions& options, const FileOptions& file_options, const InternalKeyComparator& icomparator, const FileMetaData& file_meta, - RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, + RangeDelAggregator* range_del_agg, + const std::shared_ptr& prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, - const InternalKey* largest_compaction_key) { + const InternalKey* largest_compaction_key, bool allow_unprepared_value) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; @@ -197,10 +232,12 @@ auto& fd = file_meta.fd; table_reader = fd.table_reader; if (table_reader == nullptr) { - s = FindTable(file_options, icomparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record_read_stats */, file_read_hist, - skip_filters, level); + s = FindTable( + options, file_options, icomparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { table_reader = GetTableReaderFromHandle(handle); } @@ -211,9 +248,9 @@ !options.table_filter(*table_reader->GetTableProperties())) { result = NewEmptyInternalIterator(arena); } else { - result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, caller, - file_options.compaction_readahead_size); + result = table_reader->NewIterator( + options, prefix_extractor.get(), arena, skip_filters, caller, + file_options.compaction_readahead_size, allow_unprepared_value); } if (handle 
!= nullptr) { result->RegisterCleanup(&UnrefEntry, cache_, handle); @@ -265,19 +302,27 @@ const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::unique_ptr* out_iter) { + assert(out_iter); const FileDescriptor& fd = file_meta.fd; Status s; TableReader* t = fd.table_reader; Cache::Handle* handle = nullptr; if (t == nullptr) { - s = FindTable(file_options_, internal_comparator, fd, &handle); + s = FindTable(options, file_options_, internal_comparator, fd, &handle); if (s.ok()) { t = GetTableReaderFromHandle(handle); } } if (s.ok()) { + // Note: NewRangeTombstoneIterator could return nullptr out_iter->reset(t->NewRangeTombstoneIterator(options)); - assert(out_iter); + } + if (handle) { + if (*out_iter) { + (*out_iter)->RegisterCleanup(&UnrefEntry, cache_, handle); + } else { + ReleaseHandle(handle); + } } return s; } @@ -303,8 +348,7 @@ // Maybe we can include the whole file ifsnapshot == fd.largest_seqno. if (options.snapshot != nullptr && (get_context->has_callback() || - static_cast_with_check( - options.snapshot) + static_cast_with_check(options.snapshot) ->GetSequenceNumber() <= fd.largest_seqno)) { // We should consider to use options.snapshot->GetSequenceNumber() // instead of GetInternalKeySeqno(k), which will make the code @@ -346,22 +390,22 @@ ioptions_.row_cache.get(), row_handle); replayGetContextLog(*found_row_cache_entry, user_key, get_context, &value_pinner); - RecordTick(ioptions_.statistics, ROW_CACHE_HIT); + RecordTick(ioptions_.stats, ROW_CACHE_HIT); found = true; } else { - RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + RecordTick(ioptions_.stats, ROW_CACHE_MISS); } return found; } #endif // ROCKSDB_LITE -Status TableCache::Get(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, - GetContext* get_context, - const SliceTransform* prefix_extractor, - HistogramImpl* file_read_hist, bool skip_filters, - int level) { +Status TableCache::Get( 
+ const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const std::shared_ptr& prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin) { auto& fd = file_meta.fd; std::string* row_cache_entry = nullptr; bool done = false; @@ -384,12 +428,15 @@ Status s; TableReader* t = fd.table_reader; Cache::Handle* handle = nullptr; - if (!done && s.ok()) { + if (!done) { + assert(s.ok()); if (t == nullptr) { - s = FindTable( - file_options_, internal_comparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, level); + s = FindTable(options, file_options_, internal_comparator, fd, &handle, + prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { t = GetTableReaderFromHandle(handle); } @@ -408,7 +455,7 @@ } if (s.ok()) { get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. - s = t->Get(options, k, get_context, prefix_extractor, skip_filters); + s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set @@ -424,8 +471,11 @@ size_t charge = row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string); void* row_ptr = new std::string(std::move(*row_cache_entry)); - ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge, - &DeleteEntry); + // If row cache is full, it's OK to continue. 
+ ioptions_.row_cache + ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -436,13 +486,12 @@ } // Batched version of TableCache::MultiGet. -Status TableCache::MultiGet(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - const MultiGetContext::Range* mget_range, - const SliceTransform* prefix_extractor, - HistogramImpl* file_read_hist, bool skip_filters, - int level) { +Status TableCache::MultiGet( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + const std::shared_ptr& prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, int level) { auto& fd = file_meta.fd; Status s; TableReader* t = fd.table_reader; @@ -467,8 +516,8 @@ for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { - const Slice& user_key = miter->ukey; - ; + const Slice& user_key = miter->ukey_with_ts; + GetContext* get_context = miter->get_context; if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, @@ -486,10 +535,12 @@ // found in the row cache and thus the range may now be empty if (s.ok() && !table_range.empty()) { if (t == nullptr) { - s = FindTable( - file_options_, internal_comparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, level); + s = FindTable(options, file_options_, internal_comparator, fd, &handle, + prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, + level, true /* prefetch_index_and_filter_in_cache */, + 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature); TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); if (s.ok()) { t = GetTableReaderFromHandle(handle); @@ -504,14 +555,14 @@ 
++iter) { SequenceNumber* max_covering_tombstone_seq = iter->get_context->max_covering_tombstone_seq(); - *max_covering_tombstone_seq = - std::max(*max_covering_tombstone_seq, - range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey)); + *max_covering_tombstone_seq = std::max( + *max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts)); } } } if (s.ok()) { - t->MultiGet(options, &table_range, prefix_extractor, skip_filters); + t->MultiGet(options, &table_range, prefix_extractor.get(), skip_filters); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { Status* status = iter->s; @@ -531,7 +582,7 @@ for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { std::string& row_cache_entry = row_cache_entries[row_idx++]; - const Slice& user_key = miter->ukey; + const Slice& user_key = miter->ukey_with_ts; ; GetContext* get_context = miter->get_context; @@ -544,8 +595,11 @@ size_t charge = row_cache_key.Size() + row_cache_entry.size() + sizeof(std::string); void* row_ptr = new std::string(std::move(row_cache_entry)); - ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge, - &DeleteEntry); + // If row cache is full, it's OK. + ioptions_.row_cache + ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry) + .PermitUncheckedError(); } } } @@ -561,19 +615,18 @@ const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, - const SliceTransform* prefix_extractor, bool no_io) { - Status s; + const std::shared_ptr& prefix_extractor, bool no_io) { auto table_reader = fd.table_reader; // table already been pre-loaded? 
if (table_reader) { *properties = table_reader->GetTableProperties(); - return s; + return Status::OK(); } Cache::Handle* table_handle = nullptr; - s = FindTable(file_options, internal_comparator, fd, &table_handle, - prefix_extractor, no_io); + Status s = FindTable(ReadOptions(), file_options, internal_comparator, fd, + &table_handle, prefix_extractor, no_io); if (!s.ok()) { return s; } @@ -587,8 +640,7 @@ size_t TableCache::GetMemoryUsageByTableReader( const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - const SliceTransform* prefix_extractor) { - Status s; + const std::shared_ptr& prefix_extractor) { auto table_reader = fd.table_reader; // table already been pre-loaded? if (table_reader) { @@ -596,8 +648,8 @@ } Cache::Handle* table_handle = nullptr; - s = FindTable(file_options, internal_comparator, fd, &table_handle, - prefix_extractor, true); + Status s = FindTable(ReadOptions(), file_options, internal_comparator, fd, + &table_handle, prefix_extractor, true); if (!s.ok()) { return 0; } @@ -608,6 +660,16 @@ return ret; } +bool TableCache::HasEntry(Cache* cache, uint64_t file_number) { + Cache::Handle* handle = cache->Lookup(GetSliceForFileNumber(&file_number)); + if (handle) { + cache->Release(handle); + return true; + } else { + return false; + } +} + void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } @@ -615,14 +677,14 @@ uint64_t TableCache::ApproximateOffsetOf( const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor) { + const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = fd.table_reader; Cache::Handle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = FindTable(file_options_, internal_comparator, fd, 
&table_handle, - prefix_extractor, false /* no_io */, + Status s = FindTable(ReadOptions(), file_options_, internal_comparator, fd, + &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = GetTableReaderFromHandle(table_handle); @@ -642,14 +704,14 @@ uint64_t TableCache::ApproximateSize( const Slice& start, const Slice& end, const FileDescriptor& fd, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor) { + const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = fd.table_reader; Cache::Handle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); - Status s = FindTable(file_options_, internal_comparator, fd, &table_handle, - prefix_extractor, false /* no_io */, + Status s = FindTable(ReadOptions(), file_options_, internal_comparator, fd, + &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { table_reader = GetTableReaderFromHandle(table_handle); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,9 @@ // Thread-safe (provides internal synchronization) #pragma once +#include #include #include -#include #include "db/dbformat.h" #include "db/range_del_aggregator.h" @@ -48,9 +48,11 @@ // ioptions.row_cache class TableCache { public: - TableCache(const ImmutableCFOptions& ioptions, - const FileOptions& storage_options, Cache* cache, - BlockCacheTracer* const block_cache_tracer); + TableCache(const ImmutableOptions& ioptions, + const FileOptions* storage_options, Cache* cache, + BlockCacheTracer* const 
block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -60,6 +62,7 @@ // the returned iterator. The returned "*table_reader_ptr" object is owned // by the cache and should not be deleted, and is valid for as long as the // returned iterator is live. + // @param options Must outlive the returned iterator. // @param range_del_agg If non-nullptr, adds range deletions to the // aggregator. If an error occurs, returns it in a NewErrorInternalIterator // @param for_compaction If true, a new TableReader may be allocated (but @@ -70,10 +73,12 @@ const ReadOptions& options, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, - HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, - bool skip_filters, int level, const InternalKey* smallest_compaction_key, - const InternalKey* largest_compaction_key); + const std::shared_ptr& prefix_extractor, + TableReader** table_reader_ptr, HistogramImpl* file_read_hist, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, + size_t max_file_size_for_l0_meta_pin, + const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key, bool allow_unprepared_value); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -85,13 +90,13 @@ // recorded // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" - Status Get(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, - GetContext* get_context, - const SliceTransform* prefix_extractor = nullptr, - HistogramImpl* file_read_hist = 
nullptr, bool skip_filters = false, - int level = -1); + Status Get( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + const std::shared_ptr& prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, size_t max_file_size_for_l0_meta_pin = 0); // Return the range delete tombstone iterator of the file specified by // `file_meta`. @@ -110,17 +115,20 @@ // in the embedded GetContext // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" - Status MultiGet(const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - const MultiGetContext::Range* mget_range, - const SliceTransform* prefix_extractor = nullptr, - HistogramImpl* file_read_hist = nullptr, - bool skip_filters = false, int level = -1); + Status MultiGet( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + const std::shared_ptr& prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); + // Query whether specified file number is currently in cache + static bool HasEntry(Cache* cache, uint64_t file_number); + // Clean table handle and erase it from the table cache // Used in DB close, or the file is not live anymore. 
void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle); @@ -128,14 +136,16 @@ // Find table reader // @param skip_filters Disables loading/accessing the filter block // @param level == -1 means not specified - Status FindTable(const FileOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, Cache::Handle**, - const SliceTransform* prefix_extractor = nullptr, - const bool no_io = false, bool record_read_stats = true, - HistogramImpl* file_read_hist = nullptr, - bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true); + Status FindTable( + const ReadOptions& ro, const FileOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileDescriptor& file_fd, Cache::Handle**, + const std::shared_ptr& prefix_extractor = nullptr, + const bool no_io = false, bool record_read_stats = true, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Get TableReader from a cache handle. TableReader* GetTableReaderFromHandle(Cache::Handle* handle); @@ -146,12 +156,13 @@ // @returns: `properties` will be reset on success. Please note that we will // return Status::Incomplete() if table is not present in cache and // we set `no_io` to be true. 
- Status GetTableProperties(const FileOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_meta, - std::shared_ptr* properties, - const SliceTransform* prefix_extractor = nullptr, - bool no_io = false); + Status GetTableProperties( + const FileOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileDescriptor& file_meta, + std::shared_ptr* properties, + const std::shared_ptr& prefix_extractor = nullptr, + bool no_io = false); // Return total memory usage of the table reader of the file. // 0 if table reader of the file is not loaded. @@ -159,27 +170,28 @@ const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - const SliceTransform* prefix_extractor = nullptr); + const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated offset of a key in a file represented by fd. uint64_t ApproximateOffsetOf( const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor = nullptr); + const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated data size between start and end keys in a file // represented by fd (the start key must not be greater than the end key). - uint64_t ApproximateSize(const Slice& start, const Slice& end, - const FileDescriptor& fd, TableReaderCaller caller, - const InternalKeyComparator& internal_comparator, - const SliceTransform* prefix_extractor = nullptr); + uint64_t ApproximateSize( + const Slice& start, const Slice& end, const FileDescriptor& fd, + TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const std::shared_ptr& prefix_extractor = nullptr); // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); Cache* get_cache() const { return cache_; } - // Capacity of the backing Cache that indicates inifinite TableCache capacity. 
+ // Capacity of the backing Cache that indicates infinite TableCache capacity. // For example when max_open_files is -1 we set the backing Cache to this. static const int kInfiniteCapacity = 0x400000; @@ -193,14 +205,16 @@ private: // Build a table reader - Status GetTableReader(const FileOptions& file_options, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, bool sequential_mode, - bool record_read_stats, HistogramImpl* file_read_hist, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor = nullptr, - bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true); + Status GetTableReader( + const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, + const FileDescriptor& fd, bool sequential_mode, bool record_read_stats, + HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + const std::shared_ptr& prefix_extractor = nullptr, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Create a key prefix for looking up the row cache. The prefix is of the // format row_cache_id + fd_number + seq_no. 
Later, the user key can be @@ -215,12 +229,15 @@ bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, size_t prefix_size, GetContext* get_context); - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const FileOptions& file_options_; Cache* const cache_; std::string row_cache_id_; bool immortal_tables_; BlockCacheTracer* const block_cache_tracer_; + Striped loader_mutex_; + std::shared_ptr io_tracer_; + std::string db_session_id_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.cc 2025-05-19 16:14:27.000000000 +0000 @@ -33,8 +33,9 @@ const Slice& value, uint64_t file_size) { ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - return Status::InvalidArgument("Invalid internal key"); + Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!s.ok()) { + return s; } return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type), @@ -42,10 +43,10 @@ } void UserKeyTablePropertiesCollector::BlockAdd( - uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) { - return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) { + return collector_->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } Status UserKeyTablePropertiesCollector::Finish( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,12 +6,14 @@ // This file defines a collection of statistics collectors. #pragma once -#include "rocksdb/table_properties.h" - #include #include #include +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table_properties.h" + namespace ROCKSDB_NAMESPACE { // Base class for internal table properties collector. @@ -27,9 +29,9 @@ virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) = 0; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) = 0; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) = 0; virtual UserCollectedProperties GetReadableProperties() const = 0; @@ -42,12 +44,15 @@ virtual ~IntTblPropCollectorFactory() {} // has to be thread-safe virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) = 0; + uint32_t column_family_id, int level_at_creation) = 0; // The name of the properties collector can be used for debugging purpose. virtual const char* Name() const = 0; }; +using IntTblPropCollectorFactories = + std::vector>; + // When rocksdb creates a new table, it will encode all "user keys" into // "internal keys", which contains meta information of a given entry. 
// @@ -64,9 +69,9 @@ virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) override; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) override; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override; virtual Status Finish(UserCollectedProperties* properties) override; @@ -89,9 +94,10 @@ std::shared_ptr user_collector_factory) : user_collector_factory_(user_collector_factory) {} virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { + uint32_t column_family_id, int level_at_creation) override { TablePropertiesCollectorFactory::Context context; context.column_family_id = column_family_id; + context.level_at_creation = level_at_creation; return new UserKeyTablePropertiesCollector( user_collector_factory_->CreateTablePropertiesCollector(context)); } @@ -104,4 +110,66 @@ std::shared_ptr user_collector_factory_; }; +// When rocksdb creates a newtable, it will encode all "user keys" into +// "internal keys". This class collects min/max timestamp from the encoded +// internal key when Add() is invoked. +// +// @param cmp the user comparator to compare the timestamps in internal key. 
+class TimestampTablePropertiesCollector : public IntTblPropCollector { + public: + explicit TimestampTablePropertiesCollector(const Comparator* cmp) + : cmp_(cmp), + timestamp_min_(kDisableUserTimestamp), + timestamp_max_(kDisableUserTimestamp) {} + + Status InternalAdd(const Slice& key, const Slice& /* value */, + uint64_t /* file_size */) override { + auto user_key = ExtractUserKey(key); + assert(cmp_ && cmp_->timestamp_size() > 0); + if (user_key.size() < cmp_->timestamp_size()) { + return Status::Corruption( + "User key size mismatch when comparing to timestamp size."); + } + auto timestamp_in_key = + ExtractTimestampFromUserKey(user_key, cmp_->timestamp_size()); + if (timestamp_max_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_in_key, timestamp_max_) > 0) { + timestamp_max_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + if (timestamp_min_ == kDisableUserTimestamp || + cmp_->CompareTimestamp(timestamp_min_, timestamp_in_key) > 0) { + timestamp_min_.assign(timestamp_in_key.data(), timestamp_in_key.size()); + } + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { + return; + } + + Status Finish(UserCollectedProperties* properties) override { + assert(timestamp_min_.size() == timestamp_max_.size() && + timestamp_max_.size() == cmp_->timestamp_size()); + properties->insert({"rocksdb.timestamp_min", timestamp_min_}); + properties->insert({"rocksdb.timestamp_max", timestamp_max_}); + return Status::OK(); + } + + const char* Name() const override { + return "TimestampTablePropertiesCollector"; + } + + UserCollectedProperties GetReadableProperties() const override { + return {{"rocksdb.timestamp_min", Slice(timestamp_min_).ToString(true)}, + {"rocksdb.timestamp_max", Slice(timestamp_max_).ToString(true)}}; + } + + protected: + const Comparator* const cmp_; + std::string timestamp_min_; + std::string 
timestamp_max_; +}; + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/table_properties_collector_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "db/table_properties_collector.h" + #include #include #include @@ -11,11 +13,10 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "db/table_properties_collector.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" +#include "rocksdb/flush_block_policy.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" #include "table/meta_blocks.h" @@ -39,24 +40,23 @@ namespace { static const uint32_t kTestColumnFamilyId = 66; static const std::string kTestColumnFamilyName = "test_column_fam"; +static const int kTestLevel = 1; -void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, - const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - std::unique_ptr* writable, - std::unique_ptr* builder) { - std::unique_ptr wf(new test::StringSink); +void MakeBuilder( + const Options& options, const ImmutableOptions& ioptions, + const MutableCFOptions& moptions, + const InternalKeyComparator& internal_comparator, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + std::unique_ptr* writable, + std::unique_ptr* builder) { + std::unique_ptr wf(new test::StringSink); writable->reset( - new 
WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)), - "" /* don't care */, EnvOptions())); - int unknown_level = -1; - builder->reset(NewTableBuilder( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); + TableBuilderOptions tboptions( ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, - kTestColumnFamilyId, kTestColumnFamilyName, writable->get(), - options.compression, options.sample_for_compression, - options.compression_opts, unknown_level)); + options.compression, options.compression_opts, kTestColumnFamilyId, + kTestColumnFamilyName, kTestLevel); + builder->reset(NewTableBuilder(tboptions, writable->get())); } } // namespace @@ -176,9 +176,9 @@ return Status::OK(); } - void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Nothing to do. 
return; } @@ -199,6 +199,7 @@ TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) override { EXPECT_EQ(kTestColumnFamilyId, context.column_family_id); + EXPECT_EQ(kTestLevel, context.level_at_creation); if (!backward_mode_) { return new RegularKeysStartWithA(); } else { @@ -206,7 +207,7 @@ } } IntTblPropCollector* CreateIntTblPropCollector( - uint32_t /*column_family_id*/) override { + uint32_t /*column_family_id*/, int /* level_at_creation */) override { return new RegularKeysStartWithAInternal(); } const char* Name() const override { return "RegularKeysStartWithA"; } @@ -262,10 +263,9 @@ // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writer; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (test_int_tbl_prop_collector) { int_tbl_prop_collector_factories.emplace_back( new RegularKeysStartWithAFactory(backward_mode)); @@ -284,17 +284,16 @@ writer->Flush(); // -- Step 2: Read properties - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writer->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr fake_file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); - TableProperties* props; + new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr props; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), - magic_number, ioptions, &props, - true /* compression_type_missing */); - std::unique_ptr props_guard(props); + magic_number, ioptions, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; @@ -394,8 +393,7 @@ Options 
options; test::PlainInternalKeyComparator pikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; options.table_factory = table_factory; if (sanitized) { options.table_properties_collector_factories.emplace_back( @@ -408,11 +406,11 @@ options.info_log = std::make_shared(); options = SanitizeOptions("db", // just a place holder options); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); options.comparator = comparator; } - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); MutableCFOptions moptions(options); for (int iter = 0; iter < 2; ++iter) { @@ -425,19 +423,18 @@ ASSERT_OK(builder->Finish()); writable->Flush(); - LegacyWritableFileWrapper* file = - static_cast(writable->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writable->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); - TableProperties* props; - Status s = - ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, - ioptions, &props, true /* compression_type_missing */); + new RandomAccessFileReader(std::move(source), "test")); + + std::unique_ptr props; + Status s = ReadTableProperties(reader.get(), fwf->contents().size(), + magic_number, ioptions, &props); ASSERT_OK(s); - std::unique_ptr props_guard(props); auto user_collected = props->user_collected_properties; uint64_t deleted = GetDeletedKeys(user_collected); ASSERT_EQ(5u, deleted); // deletes + single-deletes diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seq, std::unique_ptr files, VersionSet const* const versions, - const bool seq_per_batch) + const bool seq_per_batch, const std::shared_ptr& io_tracer) : dir_(dir), options_(options), read_options_(read_options), @@ -30,10 +30,11 @@ current_batch_seq_(0), current_last_seq_(0), versions_(versions), - seq_per_batch_(seq_per_batch) { + seq_per_batch_(seq_per_batch), + io_tracer_(io_tracer) { assert(files_ != nullptr); assert(versions_ != nullptr); - + current_status_.PermitUncheckedError(); // Clear on start reporter_.env = options_->env; reporter_.info_log = options_->info_log.get(); SeekToStartSequence(); // Seek till starting sequence @@ -42,7 +43,7 @@ Status TransactionLogIteratorImpl::OpenLogFile( const LogFile* log_file, std::unique_ptr* file_reader) { - FileSystem* fs = options_->fs.get(); + FileSystemPtr fs(options_->fs, io_tracer_); std::unique_ptr file; std::string fname; Status s; @@ -62,7 +63,8 @@ } } if (s.ok()) { - file_reader->reset(new SequentialFileReader(std::move(file), fname)); + file_reader->reset(new SequentialFileReader( + std::move(file), fname, io_tracer_, options_->listeners)); } return s; } @@ -223,7 +225,8 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { std::unique_ptr batch(new WriteBatch()); - WriteBatchInternal::SetContents(batch.get(), record); + Status s = WriteBatchInternal::SetContents(batch.get(), record); + s.PermitUncheckedError(); // TODO: What should we do with this error? 
SequenceNumber expected_seq = current_last_seq_ + 1; // If the iterator has started, then confirm that we get continuous batches @@ -263,6 +266,10 @@ sequence_++; return Status::OK(); } + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + ++sequence_; + return Status::OK(); + } Status PutCF(uint32_t /*cf*/, const Slice& /*key*/, const Slice& /*val*/) override { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/transaction_log_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/transaction_log_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include "db/log_reader.h" #include "db/version_set.h" #include "file/filename.h" +#include "logging/logging.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" @@ -63,7 +64,7 @@ const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seqNum, std::unique_ptr files, VersionSet const* const versions, - const bool seq_per_batch); + const bool seq_per_batch, const std::shared_ptr& io_tracer); virtual bool Valid() override; @@ -122,6 +123,7 @@ // Update current batch if a continuous batch is found, else return false void UpdateCurrentWriteBatch(const Slice& record); Status OpenLogReader(const LogFile* file); + std::shared_ptr io_tracer_; }; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,13 +14,16 @@ #include #include #include +#include #include +#include #include #include #include #include 
#include +#include "db/blob/blob_file_meta.h" #include "db/dbformat.h" #include "db/internal_stats.h" #include "db/table_cache.h" @@ -31,90 +34,242 @@ namespace ROCKSDB_NAMESPACE { -bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { - if (a->fd.largest_seqno != b->fd.largest_seqno) { - return a->fd.largest_seqno > b->fd.largest_seqno; - } - if (a->fd.smallest_seqno != b->fd.smallest_seqno) { - return a->fd.smallest_seqno > b->fd.smallest_seqno; - } - // Break ties by file number - return a->fd.GetNumber() > b->fd.GetNumber(); -} +class VersionBuilder::Rep { + class NewestFirstBySeqNo { + public: + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); -namespace { -bool BySmallestKey(FileMetaData* a, FileMetaData* b, - const InternalKeyComparator* cmp) { - int r = cmp->Compare(a->smallest, b->smallest); - if (r != 0) { - return (r < 0); - } - // Break ties by file number - return (a->fd.GetNumber() < b->fd.GetNumber()); -} -} // namespace + if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { + return lhs->fd.largest_seqno > rhs->fd.largest_seqno; + } -class VersionBuilder::Rep { - private: - // Helper to sort files_ in v - // kLevel0 -- NewestFirstBySeqNo - // kLevelNon0 -- BySmallestKey - struct FileComparator { - enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method; - const InternalKeyComparator* internal_comparator; - - FileComparator() : internal_comparator(nullptr) {} - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - switch (sort_method) { - case kLevel0: - return NewestFirstBySeqNo(f1, f2); - case kLevelNon0: - return BySmallestKey(f1, f2, internal_comparator); + if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { + return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; } - assert(false); - return false; + + // Break ties by file number + return lhs->fd.GetNumber() > rhs->fd.GetNumber(); } }; + class BySmallestKey { + public: + explicit BySmallestKey(const 
InternalKeyComparator* cmp) : cmp_(cmp) {} + + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + assert(cmp_); + + const int r = cmp_->Compare(lhs->smallest, rhs->smallest); + if (r != 0) { + return (r < 0); + } + + // Break ties by file number + return (lhs->fd.GetNumber() < rhs->fd.GetNumber()); + } + + private: + const InternalKeyComparator* cmp_; + }; + struct LevelState { std::unordered_set deleted_files; // Map from file number to file meta data. std::unordered_map added_files; }; + // A class that represents the accumulated changes (like additional garbage or + // newly linked/unlinked SST files) for a given blob file after applying a + // series of VersionEdits. + class BlobFileMetaDataDelta { + public: + bool IsEmpty() const { + return !additional_garbage_count_ && !additional_garbage_bytes_ && + newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty(); + } + + uint64_t GetAdditionalGarbageCount() const { + return additional_garbage_count_; + } + + uint64_t GetAdditionalGarbageBytes() const { + return additional_garbage_bytes_; + } + + const std::unordered_set& GetNewlyLinkedSsts() const { + return newly_linked_ssts_; + } + + const std::unordered_set& GetNewlyUnlinkedSsts() const { + return newly_unlinked_ssts_; + } + + void AddGarbage(uint64_t count, uint64_t bytes) { + additional_garbage_count_ += count; + additional_garbage_bytes_ += bytes; + } + + void LinkSst(uint64_t sst_file_number) { + assert(newly_linked_ssts_.find(sst_file_number) == + newly_linked_ssts_.end()); + + // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) 
+ auto it = newly_unlinked_ssts_.find(sst_file_number); + + if (it != newly_unlinked_ssts_.end()) { + newly_unlinked_ssts_.erase(it); + } else { + newly_linked_ssts_.emplace(sst_file_number); + } + } + + void UnlinkSst(uint64_t sst_file_number) { + assert(newly_unlinked_ssts_.find(sst_file_number) == + newly_unlinked_ssts_.end()); + + // Reconcile with newly linked SSTs on the fly. (Note: an SST can be + // linked to and unlinked from the same blob file in the case of a trivial + // move.) + auto it = newly_linked_ssts_.find(sst_file_number); + + if (it != newly_linked_ssts_.end()) { + newly_linked_ssts_.erase(it); + } else { + newly_unlinked_ssts_.emplace(sst_file_number); + } + } + + private: + uint64_t additional_garbage_count_ = 0; + uint64_t additional_garbage_bytes_ = 0; + std::unordered_set newly_linked_ssts_; + std::unordered_set newly_unlinked_ssts_; + }; + + // A class that represents the state of a blob file after applying a series of + // VersionEdits. In addition to the resulting state, it also contains the + // delta (see BlobFileMetaDataDelta above). The resulting state can be used to + // identify obsolete blob files, while the delta makes it possible to + // efficiently detect trivial moves. 
+ class MutableBlobFileMetaData { + public: + // To be used for brand new blob files + explicit MutableBlobFileMetaData( + std::shared_ptr&& shared_meta) + : shared_meta_(std::move(shared_meta)) {} + + // To be used for pre-existing blob files + explicit MutableBlobFileMetaData( + const std::shared_ptr& meta) + : shared_meta_(meta->GetSharedMeta()), + linked_ssts_(meta->GetLinkedSsts()), + garbage_blob_count_(meta->GetGarbageBlobCount()), + garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {} + + const std::shared_ptr& GetSharedMeta() const { + return shared_meta_; + } + + uint64_t GetBlobFileNumber() const { + assert(shared_meta_); + return shared_meta_->GetBlobFileNumber(); + } + + bool HasDelta() const { return !delta_.IsEmpty(); } + + const std::unordered_set& GetLinkedSsts() const { + return linked_ssts_; + } + + uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } + + uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } + + bool AddGarbage(uint64_t count, uint64_t bytes) { + assert(shared_meta_); + + if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() || + garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) { + return false; + } + + delta_.AddGarbage(count, bytes); + + garbage_blob_count_ += count; + garbage_blob_bytes_ += bytes; + + return true; + } + + void LinkSst(uint64_t sst_file_number) { + delta_.LinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end()); + linked_ssts_.emplace(sst_file_number); + } + + void UnlinkSst(uint64_t sst_file_number) { + delta_.UnlinkSst(sst_file_number); + + assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end()); + linked_ssts_.erase(sst_file_number); + } + + private: + std::shared_ptr shared_meta_; + // Accumulated changes + BlobFileMetaDataDelta delta_; + // Resulting state after applying the changes + BlobFileMetaData::LinkedSsts linked_ssts_; + uint64_t garbage_blob_count_ = 0; + uint64_t garbage_blob_bytes_ = 0; + 
}; + const FileOptions& file_options_; - Logger* info_log_; + const ImmutableCFOptions* const ioptions_; TableCache* table_cache_; VersionStorageInfo* base_vstorage_; + VersionSet* version_set_; int num_levels_; LevelState* levels_; - // Store states of levels larger than num_levels_. We do this instead of + // Store sizes of levels larger than num_levels_. We do this instead of // storing them in levels_ to avoid regression in case there are no files // on invalid levels. The version is not consistent if in the end the files // on invalid levels don't cancel out. - std::map> invalid_levels_; + std::unordered_map invalid_level_sizes_; // Whether there are invalid new files or invalid deletion on levels larger // than num_levels_. bool has_invalid_levels_; - FileComparator level_zero_cmp_; - FileComparator level_nonzero_cmp_; + // Current levels of table files affected by additions/deletions. + std::unordered_map table_file_levels_; + NewestFirstBySeqNo level_zero_cmp_; + BySmallestKey level_nonzero_cmp_; + + // Mutable metadata objects for all blob files affected by the series of + // version edits. 
+ std::map mutable_blob_file_metas_; public: - Rep(const FileOptions& file_options, Logger* info_log, - TableCache* table_cache, - VersionStorageInfo* base_vstorage) + Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, + TableCache* table_cache, VersionStorageInfo* base_vstorage, + VersionSet* version_set) : file_options_(file_options), - info_log_(info_log), + ioptions_(ioptions), table_cache_(table_cache), base_vstorage_(base_vstorage), + version_set_(version_set), num_levels_(base_vstorage->num_levels()), - has_invalid_levels_(false) { + has_invalid_levels_(false), + level_nonzero_cmp_(base_vstorage_->InternalComparator()) { + assert(ioptions_); + levels_ = new LevelState[num_levels_]; - level_zero_cmp_.sort_method = FileComparator::kLevel0; - level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; - level_nonzero_cmp_.internal_comparator = - base_vstorage_->InternalComparator(); } ~Rep() { @@ -140,204 +295,836 @@ } } - Status CheckConsistency(VersionStorageInfo* vstorage) { + // Mapping used for checking the consistency of links between SST files and + // blob files. It is built using the forward links (table file -> blob file), + // and is subsequently compared with the inverse mapping stored in the + // BlobFileMetaData objects. 
+ using ExpectedLinkedSsts = + std::unordered_map; + + static void UpdateExpectedLinkedSsts( + uint64_t table_file_number, uint64_t blob_file_number, + ExpectedLinkedSsts* expected_linked_ssts) { + assert(expected_linked_ssts); + + if (blob_file_number == kInvalidBlobFileNumber) { + return; + } + + (*expected_linked_ssts)[blob_file_number].emplace(table_file_number); + } + + template + Status CheckConsistencyDetailsForLevel( + const VersionStorageInfo* vstorage, int level, Checker checker, + const std::string& sync_point, + ExpectedLinkedSsts* expected_linked_ssts) const { #ifdef NDEBUG - if (!vstorage->force_consistency_checks()) { - // Dont run consistency checks in release mode except if - // explicitly asked to + (void)sync_point; +#endif + + assert(vstorage); + assert(level >= 0 && level < num_levels_); + assert(expected_linked_ssts); + + const auto& level_files = vstorage->LevelFiles(level); + + if (level_files.empty()) { return Status::OK(); } -#endif - // make sure the files are sorted correctly - for (int level = 0; level < num_levels_; level++) { - auto& level_files = vstorage->LevelFiles(level); - for (size_t i = 1; i < level_files.size(); i++) { - auto f1 = level_files[i - 1]; - auto f2 = level_files[i]; + + assert(level_files[0]); + UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(), + level_files[0]->oldest_blob_file_number, + expected_linked_ssts); + + for (size_t i = 1; i < level_files.size(); ++i) { + assert(level_files[i]); + UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(), + level_files[i]->oldest_blob_file_number, + expected_linked_ssts); + + auto lhs = level_files[i - 1]; + auto rhs = level_files[i]; + #ifndef NDEBUG - auto pair = std::make_pair(&f1, &f2); - TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency", &pair); + auto pair = std::make_pair(&lhs, &rhs); + TEST_SYNC_POINT_CALLBACK(sync_point, &pair); #endif - if (level == 0) { - if (!level_zero_cmp_(f1, f2)) { - fprintf(stderr, "L0 files are not sorted properly"); - 
return Status::Corruption("L0 files are not sorted properly"); + + const Status s = checker(lhs, rhs); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); + } + + // Make sure table files are sorted correctly and that the links between + // table files and blob files are consistent. + Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const { + assert(vstorage); + + ExpectedLinkedSsts expected_linked_ssts; + + if (num_levels_ > 0) { + // Check L0 + { + auto l0_checker = [this](const FileMetaData* lhs, + const FileMetaData* rhs) { + assert(lhs); + assert(rhs); + + if (!level_zero_cmp_(lhs, rhs)) { + std::ostringstream oss; + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); + + return Status::Corruption("VersionBuilder", oss.str()); } - if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { + if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) { // This is an external file that we ingested - SequenceNumber external_file_seqno = f2->fd.smallest_seqno; - if (!(external_file_seqno < f1->fd.largest_seqno || + const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno; + + if (!(external_file_seqno < lhs->fd.largest_seqno || external_file_seqno == 0)) { - fprintf(stderr, - "L0 file with seqno %" PRIu64 " %" PRIu64 - " vs. file with global_seqno %" PRIu64 "\n", - f1->fd.smallest_seqno, f1->fd.largest_seqno, - external_file_seqno); - return Status::Corruption( - "L0 file with seqno " + - NumberToString(f1->fd.smallest_seqno) + " " + - NumberToString(f1->fd.largest_seqno) + - " vs. file with global_seqno" + - NumberToString(external_file_seqno) + " with fileNumber " + - NumberToString(f1->fd.GetNumber())); + std::ostringstream oss; + oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " + << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno + << " vs. 
file #" << rhs->fd.GetNumber() + << " with global_seqno " << external_file_seqno; + + return Status::Corruption("VersionBuilder", oss.str()); } - } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { - fprintf(stderr, - "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64 - " %" PRIu64 "\n", - f1->fd.smallest_seqno, f1->fd.largest_seqno, - f2->fd.smallest_seqno, f2->fd.largest_seqno); - return Status::Corruption( - "L0 files seqno " + NumberToString(f1->fd.smallest_seqno) + - " " + NumberToString(f1->fd.largest_seqno) + " " + - NumberToString(f1->fd.GetNumber()) + " vs. " + - NumberToString(f2->fd.smallest_seqno) + " " + - NumberToString(f2->fd.largest_seqno) + " " + - NumberToString(f2->fd.GetNumber())); + } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) { + std::ostringstream oss; + oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " + << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno + << " vs. file #" << rhs->fd.GetNumber() << " with seqno " + << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno; + + return Status::Corruption("VersionBuilder", oss.str()); } - } else { - if (!level_nonzero_cmp_(f1, f2)) { - fprintf(stderr, "L%d files are not sorted properly", level); - return Status::Corruption("L" + NumberToString(level) + - " files are not sorted properly"); + + return Status::OK(); + }; + + const Status s = CheckConsistencyDetailsForLevel( + vstorage, /* level */ 0, l0_checker, + "VersionBuilder::CheckConsistency0", &expected_linked_ssts); + if (!s.ok()) { + return s; + } + } + + // Check L1 and up + const InternalKeyComparator* const icmp = vstorage->InternalComparator(); + assert(icmp); + + for (int level = 1; level < num_levels_; ++level) { + auto checker = [this, level, icmp](const FileMetaData* lhs, + const FileMetaData* rhs) { + assert(lhs); + assert(rhs); + + if (!level_nonzero_cmp_(lhs, rhs)) { + std::ostringstream oss; + oss << 'L' << level << " files are not sorted properly: files #" + << lhs->fd.GetNumber() 
<< ", #" << rhs->fd.GetNumber(); + + return Status::Corruption("VersionBuilder", oss.str()); } - // Make sure there is no overlap in levels > 0 - if (vstorage->InternalComparator()->Compare(f1->largest, - f2->smallest) >= 0) { - fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level, - (f1->largest).DebugString(true).c_str(), - (f2->smallest).DebugString(true).c_str()); - return Status::Corruption( - "L" + NumberToString(level) + " have overlapping ranges " + - (f1->largest).DebugString(true) + " vs. " + - (f2->smallest).DebugString(true)); + // Make sure there is no overlap in level + if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) { + std::ostringstream oss; + oss << 'L' << level << " has overlapping ranges: file #" + << lhs->fd.GetNumber() + << " largest key: " << lhs->largest.DebugString(true) + << " vs. file #" << rhs->fd.GetNumber() + << " smallest key: " << rhs->smallest.DebugString(true); + + return Status::Corruption("VersionBuilder", oss.str()); } + + return Status::OK(); + }; + + const Status s = CheckConsistencyDetailsForLevel( + vstorage, level, checker, "VersionBuilder::CheckConsistency1", + &expected_linked_ssts); + if (!s.ok()) { + return s; } } } - return Status::OK(); + + // Make sure that all blob files in the version have non-garbage data and + // the links between them and the table files are consistent. 
+ const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& blob_file_meta = pair.second; + assert(blob_file_meta); + + if (blob_file_meta->GetGarbageBlobCount() >= + blob_file_meta->GetTotalBlobCount()) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number + << " consists entirely of garbage"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (blob_file_meta->GetLinkedSsts() != + expected_linked_ssts[blob_file_number]) { + std::ostringstream oss; + oss << "Links are inconsistent between table files and blob file #" + << blob_file_number; + + return Status::Corruption("VersionBuilder", oss.str()); + } + } + + Status ret_s; + TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn", + &ret_s); + return ret_s; } - Status CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, - int level) { + Status CheckConsistency(const VersionStorageInfo* vstorage) const { + assert(vstorage); + + // Always run consistency checks in debug build #ifdef NDEBUG - if (!base_vstorage_->force_consistency_checks()) { - // Dont run consistency checks in release mode except if - // explicitly asked to + if (!vstorage->force_consistency_checks()) { return Status::OK(); } #endif - // a file to be deleted better exist in the previous version - bool found = false; - for (int l = 0; !found && l < num_levels_; l++) { - const std::vector& base_files = - base_vstorage_->LevelFiles(l); - for (size_t i = 0; i < base_files.size(); i++) { - FileMetaData* f = base_files[i]; - if (f->fd.GetNumber() == number) { - found = true; - break; - } + Status s = CheckConsistencyDetails(vstorage); + if (s.IsCorruption() && s.getState()) { + // Make it clear the error is due to force_consistency_checks = 1 or + // debug build +#ifdef NDEBUG + auto prefix = "force_consistency_checks"; +#else + auto prefix = "force_consistency_checks(DEBUG)"; +#endif + s = 
Status::Corruption(prefix, s.getState()); + } else { + // was only expecting corruption with message, or OK + assert(s.ok()); + } + return s; + } + + bool CheckConsistencyForNumLevels() const { + // Make sure there are no files on or beyond num_levels(). + if (has_invalid_levels_) { + return false; + } + + for (const auto& pair : invalid_level_sizes_) { + const size_t level_size = pair.second; + if (level_size != 0) { + return false; } } - // if the file did not exist in the previous version, then it - // is possibly moved from lower level to higher level in current - // version - for (int l = level + 1; !found && l < num_levels_; l++) { - auto& level_added = levels_[l].added_files; - auto got = level_added.find(number); - if (got != level_added.end()) { - found = true; - break; + + return true; + } + + bool IsBlobFileInVersion(uint64_t blob_file_number) const { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return true; + } + + assert(base_vstorage_); + + const auto& base_blob_files = base_vstorage_->GetBlobFiles(); + + auto base_it = base_blob_files.find(blob_file_number); + if (base_it != base_blob_files.end()) { + return true; + } + + return false; + } + + MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData( + uint64_t blob_file_number) { + auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); + if (mutable_it != mutable_blob_file_metas_.end()) { + return &mutable_it->second; + } + + assert(base_vstorage_); + + const auto& base_blob_files = base_vstorage_->GetBlobFiles(); + + auto base_it = base_blob_files.find(blob_file_number); + if (base_it != base_blob_files.end()) { + assert(base_it->second); + + mutable_it = mutable_blob_file_metas_ + .emplace(blob_file_number, + MutableBlobFileMetaData(base_it->second)) + .first; + return &mutable_it->second; + } + + return nullptr; + } + + Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) { + const 
uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber(); + + if (IsBlobFileInVersion(blob_file_number)) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " already added"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + // Note: we use C++11 for now but in C++14, this could be done in a more + // elegant way using generalized lambda capture. + VersionSet* const vs = version_set_; + const ImmutableCFOptions* const ioptions = ioptions_; + + auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) { + if (vs) { + assert(ioptions); + assert(!ioptions->cf_paths.empty()); + assert(shared_meta); + + vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(), + ioptions->cf_paths.front().path); } + + delete shared_meta; + }; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, blob_file_addition.GetTotalBlobCount(), + blob_file_addition.GetTotalBlobBytes(), + blob_file_addition.GetChecksumMethod(), + blob_file_addition.GetChecksumValue(), deleter); + + mutable_blob_file_metas_.emplace( + blob_file_number, MutableBlobFileMetaData(std::move(shared_meta))); + + return Status::OK(); + } + + Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) { + const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber(); + + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + + if (!mutable_meta) { + std::ostringstream oss; + oss << "Blob file #" << blob_file_number << " not found"; + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(), + blob_file_garbage.GetGarbageBlobBytes())) { + std::ostringstream oss; + oss << "Garbage overflow for blob file #" << blob_file_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + + return Status::OK(); + } + + int GetCurrentLevelForTableFile(uint64_t file_number) const { + auto it = 
table_file_levels_.find(file_number); + if (it != table_file_levels_.end()) { + return it->second; + } + + assert(base_vstorage_); + return base_vstorage_->GetFileLocation(file_number).GetLevel(); + } + + uint64_t GetOldestBlobFileNumberForTableFile(int level, + uint64_t file_number) const { + assert(level < num_levels_); + + const auto& added_files = levels_[level].added_files; + + auto it = added_files.find(file_number); + if (it != added_files.end()) { + const FileMetaData* const meta = it->second; + assert(meta); + + return meta->oldest_blob_file_number; } - // maybe this file was added in a previous edit that was Applied - if (!found) { - auto& level_added = levels_[level].added_files; - auto got = level_added.find(number); - if (got != level_added.end()) { - found = true; + assert(base_vstorage_); + const FileMetaData* const meta = + base_vstorage_->GetFileMetaDataByNumber(file_number); + assert(meta); + + return meta->oldest_blob_file_number; + } + + Status ApplyFileDeletion(int level, uint64_t file_number) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (level != current_level) { + if (level >= num_levels_) { + has_invalid_levels_ = true; + } + + std::ostringstream oss; + oss << "Cannot delete table file #" << file_number << " from level " + << level << " since it is "; + if (current_level == + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + oss << "not in the LSM tree"; + } else { + oss << "on level " << current_level; } + + return Status::Corruption("VersionBuilder", oss.str()); } - if (!found) { - fprintf(stderr, "not found %" PRIu64 "\n", number); - return Status::Corruption("not found " + NumberToString(number)); + + if (level >= num_levels_) { + assert(invalid_level_sizes_[level] > 0); + --invalid_level_sizes_[level]; + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + + return 
Status::OK(); + } + + const uint64_t blob_file_number = + GetOldestBlobFileNumberForTableFile(level, file_number); + + if (blob_file_number != kInvalidBlobFileNumber) { + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + if (mutable_meta) { + mutable_meta->UnlinkSst(file_number); + } } + + auto& level_state = levels_[level]; + + auto& add_files = level_state.added_files; + auto add_it = add_files.find(file_number); + if (add_it != add_files.end()) { + UnrefFile(add_it->second); + add_files.erase(add_it); + } + + auto& del_files = level_state.deleted_files; + assert(del_files.find(file_number) == del_files.end()); + del_files.emplace(file_number); + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + return Status::OK(); } - bool CheckConsistencyForNumLevels() { - // Make sure there are no files on or beyond num_levels(). - if (has_invalid_levels_) { - return false; + Status ApplyFileAddition(int level, const FileMetaData& meta) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const uint64_t file_number = meta.fd.GetNumber(); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (current_level != + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + if (level >= num_levels_) { + has_invalid_levels_ = true; + } + + std::ostringstream oss; + oss << "Cannot add table file #" << file_number << " to level " << level + << " since it is already in the LSM tree on level " << current_level; + return Status::Corruption("VersionBuilder", oss.str()); } - for (auto& level : invalid_levels_) { - if (level.second.size() > 0) { - return false; + + if (level >= num_levels_) { + ++invalid_level_sizes_[level]; + table_file_levels_[file_number] = level; + + return Status::OK(); + } + + auto& level_state = levels_[level]; + + auto& del_files = level_state.deleted_files; + auto del_it = del_files.find(file_number); + if (del_it 
!= del_files.end()) { + del_files.erase(del_it); + } + + FileMetaData* const f = new FileMetaData(meta); + f->refs = 1; + + auto& add_files = level_state.added_files; + assert(add_files.find(file_number) == add_files.end()); + add_files.emplace(file_number, f); + + const uint64_t blob_file_number = f->oldest_blob_file_number; + + if (blob_file_number != kInvalidBlobFileNumber) { + MutableBlobFileMetaData* const mutable_meta = + GetOrCreateMutableBlobFileMetaData(blob_file_number); + if (mutable_meta) { + mutable_meta->LinkSst(file_number); } } - return true; + + table_file_levels_[file_number] = level; + + return Status::OK(); } // Apply all of the edits in *edit to the current state. - Status Apply(VersionEdit* edit) { - Status s = CheckConsistency(base_vstorage_); - if (!s.ok()) { - return s; + Status Apply(const VersionEdit* edit) { + { + const Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } } - // Delete files - const auto& del = edit->GetDeletedFiles(); - for (const auto& del_file : del) { - const auto level = del_file.first; - const auto number = del_file.second; - if (level < num_levels_) { - levels_[level].deleted_files.insert(number); - CheckConsistencyForDeletes(edit, number, level); - - auto exising = levels_[level].added_files.find(number); - if (exising != levels_[level].added_files.end()) { - UnrefFile(exising->second); - levels_[level].added_files.erase(exising); - } - } else { - if (invalid_levels_[level].erase(number) == 0) { - // Deleting an non-existing file on invalid level. - has_invalid_levels_ = true; - } + // Note: we process the blob file related changes first because the + // table file addition/deletion logic depends on the blob files + // already being there. 
+ + // Add new blob files + for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) { + const Status s = ApplyBlobFileAddition(blob_file_addition); + if (!s.ok()) { + return s; + } + } + + // Increase the amount of garbage for blob files affected by GC + for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) { + const Status s = ApplyBlobFileGarbage(blob_file_garbage); + if (!s.ok()) { + return s; + } + } + + // Delete table files + for (const auto& deleted_file : edit->GetDeletedFiles()) { + const int level = deleted_file.first; + const uint64_t file_number = deleted_file.second; + + const Status s = ApplyFileDeletion(level, file_number); + if (!s.ok()) { + return s; } } - // Add new files + // Add new table files for (const auto& new_file : edit->GetNewFiles()) { const int level = new_file.first; - if (level < num_levels_) { - FileMetaData* f = new FileMetaData(new_file.second); - f->refs = 1; - - assert(levels_[level].added_files.find(f->fd.GetNumber()) == - levels_[level].added_files.end()); - levels_[level].deleted_files.erase(f->fd.GetNumber()); - levels_[level].added_files[f->fd.GetNumber()] = f; + const FileMetaData& meta = new_file.second; + + const Status s = ApplyFileAddition(level, meta); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); + } + + // Helper function template for merging the blob file metadata from the base + // version with the mutable metadata representing the state after applying the + // edits. The function objects process_base and process_mutable are + // respectively called to handle a base version object when there is no + // matching mutable object, and a mutable object when there is no matching + // base version object. process_both is called to perform the merge when a + // given blob file appears both in the base version and the mutable list. The + // helper stops processing objects if a function object returns false. Blob + // files with a file number below first_blob_file are not processed. 
+ template + void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base, + ProcessMutable process_mutable, + ProcessBoth process_both) const { + assert(base_vstorage_); + + const auto& base_blob_files = base_vstorage_->GetBlobFiles(); + auto base_it = base_blob_files.lower_bound(first_blob_file); + const auto base_it_end = base_blob_files.end(); + + auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file); + const auto mutable_it_end = mutable_blob_file_metas_.end(); + + while (base_it != base_it_end && mutable_it != mutable_it_end) { + const uint64_t base_blob_file_number = base_it->first; + const uint64_t mutable_blob_file_number = mutable_it->first; + + if (base_blob_file_number < mutable_blob_file_number) { + const auto& base_meta = base_it->second; + + if (!process_base(base_meta)) { + return; + } + + ++base_it; + } else if (mutable_blob_file_number < base_blob_file_number) { + const auto& mutable_meta = mutable_it->second; + + if (!process_mutable(mutable_meta)) { + return; + } + + ++mutable_it; } else { - uint64_t number = new_file.second.fd.GetNumber(); - auto& lvls = invalid_levels_[level]; - if (lvls.count(number) == 0) { - lvls.insert(number); - } else { - // Creating an already existing file on invalid level. - has_invalid_levels_ = true; + assert(base_blob_file_number == mutable_blob_file_number); + + const auto& base_meta = base_it->second; + const auto& mutable_meta = mutable_it->second; + + if (!process_both(base_meta, mutable_meta)) { + return; } + + ++base_it; + ++mutable_it; } } - return s; + + while (base_it != base_it_end) { + const auto& base_meta = base_it->second; + + if (!process_base(base_meta)) { + return; + } + + ++base_it; + } + + while (mutable_it != mutable_it_end) { + const auto& mutable_meta = mutable_it->second; + + if (!process_mutable(mutable_meta)) { + return; + } + + ++mutable_it; + } + } + + // Helper function template for finding the first blob file that has linked + // SSTs. 
+ template + static bool CheckLinkedSsts(const Meta& meta, + uint64_t* min_oldest_blob_file_num) { + assert(min_oldest_blob_file_num); + + if (!meta.GetLinkedSsts().empty()) { + assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber); + + *min_oldest_blob_file_num = meta.GetBlobFileNumber(); + + return false; + } + + return true; + } + + // Find the oldest blob file that has linked SSTs. + uint64_t GetMinOldestBlobFileNumber() const { + uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber; + + auto process_base = + [&min_oldest_blob_file_num]( + const std::shared_ptr& base_meta) { + assert(base_meta); + + return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num); + }; + + auto process_mutable = [&min_oldest_blob_file_num]( + const MutableBlobFileMetaData& mutable_meta) { + return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num); + }; + + auto process_both = [&min_oldest_blob_file_num]( + const std::shared_ptr& base_meta, + const MutableBlobFileMetaData& mutable_meta) { +#ifndef NDEBUG + assert(base_meta); + assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta()); +#else + (void)base_meta; +#endif + + // Look at mutable_meta since it supersedes *base_meta + return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num); + }; + + MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable, + process_both); + + return min_oldest_blob_file_num; } - // Save the current state in *v. - Status SaveTo(VersionStorageInfo* vstorage) { + static std::shared_ptr CreateBlobFileMetaData( + const MutableBlobFileMetaData& mutable_meta) { + return BlobFileMetaData::Create( + mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(), + mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes()); + } + + // Add the blob file specified by meta to *vstorage if it is determined to + // contain valid data (blobs). 
+ template + static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) { + assert(vstorage); + assert(meta); + + if (meta->GetLinkedSsts().empty() && + meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) { + return; + } + + vstorage->AddBlobFile(std::forward(meta)); + } + + // Merge the blob file metadata from the base version with the changes (edits) + // applied, and save the result into *vstorage. + void SaveBlobFilesTo(VersionStorageInfo* vstorage) const { + assert(vstorage); + + const uint64_t oldest_blob_file_with_linked_ssts = + GetMinOldestBlobFileNumber(); + + auto process_base = + [vstorage](const std::shared_ptr& base_meta) { + assert(base_meta); + + AddBlobFileIfNeeded(vstorage, base_meta); + + return true; + }; + + auto process_mutable = + [vstorage](const MutableBlobFileMetaData& mutable_meta) { + AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); + + return true; + }; + + auto process_both = [vstorage]( + const std::shared_ptr& base_meta, + const MutableBlobFileMetaData& mutable_meta) { + assert(base_meta); + assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta()); + + if (!mutable_meta.HasDelta()) { + assert(base_meta->GetGarbageBlobCount() == + mutable_meta.GetGarbageBlobCount()); + assert(base_meta->GetGarbageBlobBytes() == + mutable_meta.GetGarbageBlobBytes()); + assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts()); + + AddBlobFileIfNeeded(vstorage, base_meta); + + return true; + } + + AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); + + return true; + }; + + MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base, + process_mutable, process_both); + } + + void MaybeAddFile(VersionStorageInfo* vstorage, int level, + FileMetaData* f) const { + const uint64_t file_number = f->fd.GetNumber(); + + const auto& level_state = levels_[level]; + + const auto& del_files = level_state.deleted_files; + const auto del_it = del_files.find(file_number); + + if 
(del_it != del_files.end()) { + // f is to-be-deleted table file + vstorage->RemoveCurrentStats(f); + } else { + const auto& add_files = level_state.added_files; + const auto add_it = add_files.find(file_number); + + // Note: if the file appears both in the base version and in the added + // list, the added FileMetaData supersedes the one in the base version. + if (add_it != add_files.end() && add_it->second != f) { + vstorage->RemoveCurrentStats(f); + } else { + vstorage->AddFile(level, f); + } + } + } + + template + void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const { + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *vstorage. + const auto& base_files = base_vstorage_->LevelFiles(level); + const auto& unordered_added_files = levels_[level].added_files; + vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); + + // Sort added files for the level. + std::vector added_files; + added_files.reserve(unordered_added_files.size()); + for (const auto& pair : unordered_added_files) { + added_files.push_back(pair.second); + } + std::sort(added_files.begin(), added_files.end(), cmp); + + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + auto added_iter = added_files.begin(); + auto added_end = added_files.end(); + while (added_iter != added_end || base_iter != base_end) { + if (base_iter == base_end || + (added_iter != added_end && cmp(*added_iter, *base_iter))) { + MaybeAddFile(vstorage, level, *added_iter++); + } else { + MaybeAddFile(vstorage, level, *base_iter++); + } + } + } + + void SaveSSTFilesTo(VersionStorageInfo* vstorage) const { + assert(vstorage); + + if (!num_levels_) { + return; + } + + SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_); + + for (int level = 1; level < num_levels_; ++level) { + SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_); + } + } + + // Save the current state in *vstorage. 
+ Status SaveTo(VersionStorageInfo* vstorage) const { Status s = CheckConsistency(base_vstorage_); if (!s.ok()) { return s; @@ -348,56 +1135,19 @@ return s; } - for (int level = 0; level < num_levels_; level++) { - const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; - // Merge the set of added files with the set of pre-existing files. - // Drop any deleted files. Store the result in *v. - const auto& base_files = base_vstorage_->LevelFiles(level); - const auto& unordered_added_files = levels_[level].added_files; - vstorage->Reserve(level, - base_files.size() + unordered_added_files.size()); - - // Sort added files for the level. - std::vector added_files; - added_files.reserve(unordered_added_files.size()); - for (const auto& pair : unordered_added_files) { - added_files.push_back(pair.second); - } - std::sort(added_files.begin(), added_files.end(), cmp); + SaveSSTFilesTo(vstorage); -#ifndef NDEBUG - FileMetaData* prev_added_file = nullptr; - for (const auto& added : added_files) { - if (level > 0 && prev_added_file != nullptr) { - assert(base_vstorage_->InternalComparator()->Compare( - prev_added_file->smallest, added->smallest) <= 0); - } - prev_added_file = added; - } -#endif - - auto base_iter = base_files.begin(); - auto base_end = base_files.end(); - auto added_iter = added_files.begin(); - auto added_end = added_files.end(); - while (added_iter != added_end || base_iter != base_end) { - if (base_iter == base_end || - (added_iter != added_end && cmp(*added_iter, *base_iter))) { - MaybeAddFile(vstorage, level, *added_iter++); - } else { - MaybeAddFile(vstorage, level, *base_iter++); - } - } - } + SaveBlobFilesTo(vstorage); s = CheckConsistency(vstorage); return s; } - Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, - const SliceTransform* prefix_extractor) { + Status LoadTableHandlers( + InternalStats* internal_stats, int max_threads, + bool 
prefetch_index_and_filter_in_cache, bool is_initial_load, + const std::shared_ptr& prefix_extractor, + size_t max_file_size_for_l0_meta_pin) { assert(table_cache_ != nullptr); size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity(); @@ -405,7 +1155,7 @@ size_t max_load = port::kMaxSizet; if (!always_load) { - // If it is initial loading and not set to always laoding all the + // If it is initial loading and not set to always loading all the // files, we only load up to kInitialLoadLimit files, to limit the // time reopening the DB. const size_t kInitialLoadLimit = 16; @@ -462,11 +1212,13 @@ auto* file_meta = files_meta[file_idx].first; int level = files_meta[file_idx].second; statuses[file_idx] = table_cache_->FindTable( - file_options_, *(base_vstorage_->InternalComparator()), - file_meta->fd, &file_meta->table_reader_handle, prefix_extractor, - false /*no_io */, true /* record_read_stats */, + ReadOptions(), file_options_, + *(base_vstorage_->InternalComparator()), file_meta->fd, + &file_meta->table_reader_handle, prefix_extractor, false /*no_io */, + true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, - prefetch_index_and_filter_in_cache); + prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, + file_meta->temperature); if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( @@ -483,63 +1235,75 @@ for (auto& t : threads) { t.join(); } + Status ret; for (const auto& s : statuses) { if (!s.ok()) { - return s; + if (ret.ok()) { + ret = s; + } } } - return Status::OK(); - } - - void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { - if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { - // f is to-be-deleted table file - vstorage->RemoveCurrentStats(f); - } else { - vstorage->AddFile(level, f, info_log_); - } + return ret; } }; VersionBuilder::VersionBuilder(const FileOptions& 
file_options, + const ImmutableCFOptions* ioptions, TableCache* table_cache, VersionStorageInfo* base_vstorage, - Logger* info_log) - : rep_(new Rep(file_options, info_log, table_cache, base_vstorage)) {} + VersionSet* version_set) + : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage, + version_set)) {} -VersionBuilder::~VersionBuilder() { delete rep_; } - -Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { - return rep_->CheckConsistency(vstorage); -} - -Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, - uint64_t number, int level) { - return rep_->CheckConsistencyForDeletes(edit, number, level); -} +VersionBuilder::~VersionBuilder() = default; bool VersionBuilder::CheckConsistencyForNumLevels() { return rep_->CheckConsistencyForNumLevels(); } -Status VersionBuilder::Apply(VersionEdit* edit) { return rep_->Apply(edit); } +Status VersionBuilder::Apply(const VersionEdit* edit) { + return rep_->Apply(edit); +} -Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { +Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const { return rep_->SaveTo(vstorage); } Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, - const SliceTransform* prefix_extractor) { - return rep_->LoadTableHandlers(internal_stats, max_threads, - prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor); + const std::shared_ptr& prefix_extractor, + size_t max_file_size_for_l0_meta_pin) { + return rep_->LoadTableHandlers( + internal_stats, max_threads, prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); +} + +uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { + return rep_->GetMinOldestBlobFileNumber(); +} + +BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( + ColumnFamilyData* cfd) + : version_builder_(new VersionBuilder( + 
cfd->current()->version_set()->file_options(), cfd->ioptions(), + cfd->table_cache(), cfd->current()->storage_info(), + cfd->current()->version_set())), + version_(cfd->current()) { + version_->Ref(); +} + +BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( + ColumnFamilyData* cfd, Version* v) + : version_builder_(new VersionBuilder( + cfd->current()->version_set()->file_options(), cfd->ioptions(), + cfd->table_cache(), v->storage_info(), v->version_set())), + version_(v) { + assert(version_ != cfd->current()); } -void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, - FileMetaData* f) { - rep_->MaybeAddFile(vstorage, level, f); +BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() { + version_->Unref(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,41 +8,62 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. // #pragma once + +#include + #include "rocksdb/file_system.h" #include "rocksdb/slice_transform.h" namespace ROCKSDB_NAMESPACE { +struct ImmutableCFOptions; class TableCache; class VersionStorageInfo; class VersionEdit; struct FileMetaData; class InternalStats; +class Version; +class VersionSet; +class ColumnFamilyData; // A helper class so we can efficiently apply a whole sequence // of edits to a particular state without creating intermediate // Versions that contain full copies of the intermediate state. 
class VersionBuilder { public: - VersionBuilder(const FileOptions& file_options, TableCache* table_cache, - VersionStorageInfo* base_vstorage, Logger* info_log = nullptr); + VersionBuilder(const FileOptions& file_options, + const ImmutableCFOptions* ioptions, TableCache* table_cache, + VersionStorageInfo* base_vstorage, VersionSet* version_set); ~VersionBuilder(); - Status CheckConsistency(VersionStorageInfo* vstorage); - Status CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, - int level); + bool CheckConsistencyForNumLevels(); - Status Apply(VersionEdit* edit); - Status SaveTo(VersionStorageInfo* vstorage); - Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, - bool is_initial_load, - const SliceTransform* prefix_extractor); - void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); + Status Apply(const VersionEdit* edit); + Status SaveTo(VersionStorageInfo* vstorage) const; + Status LoadTableHandlers( + InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, bool is_initial_load, + const std::shared_ptr& prefix_extractor, + size_t max_file_size_for_l0_meta_pin); + uint64_t GetMinOldestBlobFileNumber() const; private: class Rep; - Rep* rep_; + std::unique_ptr rep_; +}; + +// A wrapper of version builder which references the current version in +// constructor and unref it in the destructor. +// Both of the constructor and destructor need to be called inside DB Mutex. 
+class BaseReferencedVersionBuilder { + public: + explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd); + BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v); + ~BaseReferencedVersionBuilder(); + VersionBuilder* version_builder() const { return version_builder_.get(); } + + private: + std::unique_ptr version_builder_; + Version* version_; }; -extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_builder_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_builder_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include +#include +#include +#include #include + #include "db/version_edit.h" #include "db/version_set.h" -#include "logging/logging.h" +#include "rocksdb/advanced_options.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -18,7 +23,7 @@ const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; uint32_t file_num_; @@ -52,19 +57,22 @@ return InternalKey(ukey, smallest_seq, kTypeValue); } - void Add(int level, uint32_t file_number, const char* smallest, + void Add(int level, uint64_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, uint64_t num_entries = 0, uint64_t num_deletions = 0, bool sampled = false, SequenceNumber smallest_seqno = 0, - SequenceNumber largest_seqno = 0) { + SequenceNumber 
largest_seqno = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq), GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, - /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + /* marked_for_compact */ false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -75,8 +83,77 @@ } } + void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value, + BlobFileMetaData::LinkedSsts linked_ssts, + uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) { + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value)); + auto meta = + BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes); + + vstorage_.AddBlobFile(std::move(meta)); + } + + void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) { + constexpr int level = 0; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr uint64_t file_size = 100; + constexpr uint32_t path_id = 0; + constexpr SequenceNumber smallest_seq = 0; + constexpr SequenceNumber largest_seq = 0; + constexpr uint64_t num_entries = 0; + constexpr uint64_t num_deletions = 0; + constexpr bool sampled = false; + + Add(level, table_file_number, smallest, largest, file_size, path_id, + smallest_seq, 
largest_seq, num_entries, num_deletions, sampled, + smallest_seq, largest_seq, blob_file_number); + } + + void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number, + uint64_t blob_file_number) { + assert(edit); + + constexpr int level = 0; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 100; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 300; + constexpr bool marked_for_compaction = false; + + edit->AddFile(level, table_file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + } + + static std::shared_ptr GetBlobFileMetaData( + const VersionStorageInfo::BlobFiles& blob_files, + uint64_t blob_file_number) { + const auto it = blob_files.find(blob_file_number); + + if (it == blob_files.end()) { + return std::shared_ptr(); + } + + const auto& meta = it->second; + assert(meta); + + return meta; + } + void UpdateVersionStorageInfo() { - vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_.UpdateFilesByCompactionPri(ioptions_, mutable_cf_options_); vstorage_.UpdateNumNonEmptyLevels(); vstorage_.GenerateFileIndexer(); vstorage_.GenerateLevelFilesBrief(); @@ -115,19 +192,23 @@ VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, 
kDisableUserTimestamp); version_edit.DeleteFile(3, 27U); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2)); ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3)); @@ -152,20 +233,24 @@ VersionEdit version_edit; version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0)); ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3)); @@ -192,9 +277,10 @@ VersionEdit 
version_edit; version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -202,13 +288,16 @@ version_edit.DeleteFile(4, 8U); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0)); ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4)); @@ -223,38 +312,46 @@ VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - 
kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); - 
version_builder.Apply(&version_edit); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2)); @@ -265,60 +362,1277 @@ UpdateVersionStorageInfo(); EnvOptions env_options; - VersionBuilder version_builder(env_options, nullptr, &vstorage_); + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false); VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + 
kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); - version_builder.Apply(&version_edit); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); - version_builder.Apply(&version_edit2); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, 
kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); - version_builder.SaveTo(&new_vstorage); + ASSERT_OK(version_builder.Apply(&version_edit2)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2)); UnrefFilesInVersion(&new_vstorage); } +TEST_F(VersionBuilderTest, ApplyFileDeletionIncorrectLevel) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + + Add(level, file_number, smallest, largest); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int incorrect_level = 3; + + edit.DeleteFile(incorrect_level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #2345 from level 3 since " + "it is on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 1234; + + edit.DeleteFile(level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #1234 from level 3 since " + "it is not in the LSM tree")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr uint64_t file_size = 10000; + constexpr uint32_t path_id = 0; + constexpr 
SequenceNumber smallest_seq = 100; + constexpr SequenceNumber largest_seq = 500; + constexpr uint64_t num_entries = 0; + constexpr uint64_t num_deletions = 0; + constexpr bool sampled = false; + constexpr SequenceNumber smallest_seqno = 1; + constexpr SequenceNumber largest_seqno = 1000; + + Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq, + largest_seq, num_entries, num_deletions, sampled, smallest_seqno, + largest_seqno); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + VersionEdit addition; + + constexpr bool marked_for_compaction = false; + + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + ASSERT_OK(builder.Apply(&addition)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + + Add(level, file_number, smallest, largest); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr 
VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int new_level = 2; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, 
kDisableUserTimestamp, + kDisableUserTimestamp); + + ASSERT_OK(builder.Apply(&edit)); + + VersionEdit other_edit; + + constexpr int new_level = 2; + + other_edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + const Status s = builder.Apply(&other_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 3")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + ASSERT_OK(builder.Apply(&addition)); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + constexpr bool force_consistency_checks = 
false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_FALSE(new_vstorage.GetFileLocation(file_number).IsValid()); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAddition) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. 
+ constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&edit, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + ASSERT_EQ(new_meta->GetGarbageBlobCount(), 0); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), 0); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyInBase) { + // Attempt to add a blob file that is already present in the base version. 
+ + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value, BlobFileMetaData::LinkedSsts(), garbage_blob_count, + garbage_blob_bytes); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added")); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyApplied) { + // Attempt to add the same blob file twice using version edits. 
+ + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + ASSERT_OK(builder.Apply(&edit)); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added")); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) { + // Increase the amount of garbage for a blob file present in the base version. + + constexpr uint64_t table_file_number = 1; + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value, BlobFileMetaData::LinkedSsts{table_file_number}, + garbage_blob_count, garbage_blob_bytes); + + const auto meta = + GetBlobFileMetaData(vstorage_.GetBlobFiles(), blob_file_number); + ASSERT_NE(meta, nullptr); + + // Add dummy table file to ensure the blob file is referenced. 
+ AddDummyFile(table_file_number, blob_file_number); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t new_garbage_blob_count = 456; + constexpr uint64_t new_garbage_blob_bytes = 111111; + + edit.AddBlobFileGarbage(blob_file_number, new_garbage_blob_count, + new_garbage_blob_bytes); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetSharedMeta(), meta->GetSharedMeta()); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + ASSERT_EQ(new_meta->GetGarbageBlobCount(), + garbage_blob_count + new_garbage_blob_count); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), + garbage_blob_bytes + new_garbage_blob_bytes); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) { + // Increase the amount of garbage for a blob file added using a version edit. 
+ + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. + constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&addition)); + + constexpr uint64_t garbage_blob_count = 123; + constexpr uint64_t garbage_blob_bytes = 456789; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + ASSERT_OK(builder.Apply(&garbage)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 1); + + const auto new_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(new_meta, nullptr); + ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value); + ASSERT_EQ(new_meta->GetLinkedSsts(), + BlobFileMetaData::LinkedSsts{table_file_number}); + 
ASSERT_EQ(new_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(new_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileNotFound) { + // Attempt to increase the amount of garbage for a blob file that is + // neither in the base version, nor was it added using a version edit. + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t garbage_blob_count = 5678; + constexpr uint64_t garbage_blob_bytes = 999999; + + edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 not found")); +} + +TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) { + // Test that VersionEdits that would result in the count/total size of garbage + // exceeding the count/total size of all blobs are rejected. + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit addition; + + constexpr uint64_t blob_file_number = 1234; + constexpr uint64_t total_blob_count = 5678; + constexpr uint64_t total_blob_bytes = 999999; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52" + "\x5c\xbd"; + + addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + // Add dummy table file to ensure the blob file is referenced. 
+ constexpr uint64_t table_file_number = 1; + AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + + ASSERT_OK(builder.Apply(&addition)); + + { + // Garbage blob count overflow + constexpr uint64_t garbage_blob_count = 5679; + constexpr uint64_t garbage_blob_bytes = 999999; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&garbage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Garbage overflow for blob file #1234")); + } + + { + // Garbage blob bytes overflow + constexpr uint64_t garbage_blob_count = 5678; + constexpr uint64_t garbage_blob_bytes = 1000000; + + VersionEdit garbage; + + garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + + const Status s = builder.Apply(&garbage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Garbage overflow for blob file #1234")); + } +} + +TEST_F(VersionBuilderTest, SaveBlobFilesTo) { + // Add three blob files to base version. + for (uint64_t i = 3; i >= 1; --i) { + const uint64_t table_file_number = i; + const uint64_t blob_file_number = i; + const uint64_t total_blob_count = i * 1000; + const uint64_t total_blob_bytes = i * 1000000; + const uint64_t garbage_blob_count = i * 100; + const uint64_t garbage_blob_bytes = i * 20000; + + AddBlob(blob_file_number, total_blob_count, total_blob_bytes, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), + BlobFileMetaData::LinkedSsts{table_file_number}, garbage_blob_count, + garbage_blob_bytes); + + // Add dummy table file to ensure the blob file is referenced. 
+ AddDummyFile(table_file_number, blob_file_number); + } + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + // Add some garbage to the second and third blob files. The second blob file + // remains valid since it does not consist entirely of garbage yet. The third + // blob file is all garbage after the edit and will not be part of the new + // version. The corresponding dummy table file is also removed for + // consistency. + edit.AddBlobFileGarbage(/* blob_file_number */ 2, + /* garbage_blob_count */ 200, + /* garbage_blob_bytes */ 100000); + edit.AddBlobFileGarbage(/* blob_file_number */ 3, + /* garbage_blob_count */ 2700, + /* garbage_blob_bytes */ 2940000); + edit.DeleteFile(/* level */ 0, /* file_number */ 3); + + // Add a fourth blob file. + edit.AddBlobFile(/* blob_file_number */ 4, /* total_blob_count */ 4000, + /* total_blob_bytes */ 4000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 3); + + const auto meta1 = GetBlobFileMetaData(new_blob_files, 1); + + ASSERT_NE(meta1, nullptr); + ASSERT_EQ(meta1->GetBlobFileNumber(), 1); + ASSERT_EQ(meta1->GetTotalBlobCount(), 1000); + ASSERT_EQ(meta1->GetTotalBlobBytes(), 1000000); + ASSERT_EQ(meta1->GetGarbageBlobCount(), 100); + ASSERT_EQ(meta1->GetGarbageBlobBytes(), 20000); + + const auto meta2 = GetBlobFileMetaData(new_blob_files, 2); + + ASSERT_NE(meta2, nullptr); + ASSERT_EQ(meta2->GetBlobFileNumber(), 2); + 
ASSERT_EQ(meta2->GetTotalBlobCount(), 2000); + ASSERT_EQ(meta2->GetTotalBlobBytes(), 2000000); + ASSERT_EQ(meta2->GetGarbageBlobCount(), 400); + ASSERT_EQ(meta2->GetGarbageBlobBytes(), 140000); + + const auto meta4 = GetBlobFileMetaData(new_blob_files, 4); + + ASSERT_NE(meta4, nullptr); + ASSERT_EQ(meta4->GetBlobFileNumber(), 4); + ASSERT_EQ(meta4->GetTotalBlobCount(), 4000); + ASSERT_EQ(meta4->GetTotalBlobBytes(), 4000000); + ASSERT_EQ(meta4->GetGarbageBlobCount(), 0); + ASSERT_EQ(meta4->GetGarbageBlobBytes(), 0); + + // Delete the first table file, which makes the first blob file obsolete + // since it's at the head and unreferenced. + VersionBuilder second_builder(env_options, &ioptions_, table_cache, + &new_vstorage, version_set); + + VersionEdit second_edit; + second_edit.DeleteFile(/* level */ 0, /* file_number */ 1); + + ASSERT_OK(second_builder.Apply(&second_edit)); + + VersionStorageInfo newer_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &new_vstorage, + force_consistency_checks); + + ASSERT_OK(second_builder.SaveTo(&newer_vstorage)); + + const auto& newer_blob_files = newer_vstorage.GetBlobFiles(); + ASSERT_EQ(newer_blob_files.size(), 2); + + const auto newer_meta1 = GetBlobFileMetaData(newer_blob_files, 1); + + ASSERT_EQ(newer_meta1, nullptr); + + UnrefFilesInVersion(&newer_vstorage); + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { + // When multiple background jobs (flushes/compactions) are executing in + // parallel, it is possible for the VersionEdit adding blob file K to be + // applied *after* the VersionEdit adding blob file N (for N > K). This test + // case makes sure this is handled correctly. + + // Add blob file #4 (referenced by table file #3) to base version. 
+ constexpr uint64_t base_table_file_number = 3; + constexpr uint64_t base_blob_file_number = 4; + constexpr uint64_t base_total_blob_count = 100; + constexpr uint64_t base_total_blob_bytes = 1 << 20; + + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = "\xfa\xce\xb0\x0c"; + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + AddDummyFile(base_table_file_number, base_blob_file_number); + AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes, + checksum_method, checksum_value, + BlobFileMetaData::LinkedSsts{base_table_file_number}, + garbage_blob_count, garbage_blob_bytes); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + // Add blob file #2 (referenced by table file #1). + constexpr int level = 0; + constexpr uint64_t table_file_number = 1; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 1 << 12; + constexpr char smallest[] = "key1"; + constexpr char largest[] = "key987"; + constexpr SequenceNumber smallest_seqno = 0; + constexpr SequenceNumber largest_seqno = 0; + constexpr bool marked_for_compaction = false; + + constexpr uint64_t blob_file_number = 2; + static_assert(blob_file_number < base_blob_file_number, + "Added blob file should have a smaller file number"); + + constexpr uint64_t total_blob_count = 234; + constexpr uint64_t total_blob_bytes = 1 << 22; + + edit.AddFile( + level, table_file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, checksum_value, + checksum_method, kDisableUserTimestamp, kDisableUserTimestamp); + edit.AddBlobFile(blob_file_number, total_blob_count, 
total_blob_bytes, + checksum_method, checksum_value); + + ASSERT_OK(builder.Apply(&edit)); + + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + const auto& new_blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(new_blob_files.size(), 2); + + const auto base_meta = + GetBlobFileMetaData(new_blob_files, base_blob_file_number); + + ASSERT_NE(base_meta, nullptr); + ASSERT_EQ(base_meta->GetBlobFileNumber(), base_blob_file_number); + ASSERT_EQ(base_meta->GetTotalBlobCount(), base_total_blob_count); + ASSERT_EQ(base_meta->GetTotalBlobBytes(), base_total_blob_bytes); + ASSERT_EQ(base_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(base_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + ASSERT_EQ(base_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(base_meta->GetChecksumValue(), checksum_value); + + const auto added_meta = GetBlobFileMetaData(new_blob_files, blob_file_number); + + ASSERT_NE(added_meta, nullptr); + ASSERT_EQ(added_meta->GetBlobFileNumber(), blob_file_number); + ASSERT_EQ(added_meta->GetTotalBlobCount(), total_blob_count); + ASSERT_EQ(added_meta->GetTotalBlobBytes(), total_blob_bytes); + ASSERT_EQ(added_meta->GetGarbageBlobCount(), garbage_blob_count); + ASSERT_EQ(added_meta->GetGarbageBlobBytes(), garbage_blob_bytes); + ASSERT_EQ(added_meta->GetChecksumMethod(), checksum_method); + ASSERT_EQ(added_meta->GetChecksumValue(), checksum_value); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { + // Initialize base version. The first table file points to a valid blob file + // in this version; the second one does not refer to any blob files. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + Add(/* level */ 1, /* file_number */ 23, /* smallest */ "201", + /* largest */ "300", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 200, /* largest_seq */ 200, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 200, /* largest_seqno */ 200, + kInvalidBlobFileNumber); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000); + + UpdateVersionStorageInfo(); + + // Add a new table file that points to the existing blob file, and add a + // new table file--blob file pair. 
+ EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddFile(/* level */ 1, /* file_number */ 606, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("701"), + /* largest */ GetInternalKey("750"), /* smallest_seqno */ 200, + /* largest_seqno */ 200, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("801"), + /* largest */ GetInternalKey("850"), /* smallest_seqno */ 200, + /* largest_seqno */ 200, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, + /* total_blob_bytes */ 200000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + + ASSERT_OK(builder.Apply(&edit)); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesInconsistentLinks) { + // Initialize base version. Links between the table file and the blob file + // are inconsistent. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 256); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr( + s.getState(), + "Links are inconsistent between table files and blob file #16")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbage) { + // Initialize base version. The table file points to a blob file that is + // all garbage. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000); + + UpdateVersionStorageInfo(); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Blob file #16 consists entirely of garbage")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbageLinkedSsts) { + // Initialize base version, with a table file pointing to a blob file + // that has no garbage at this point. 
+ + Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150", + /* largest */ "200", /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100, + /* oldest_blob_file_number */ 16); + + AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000, + /* total_blob_bytes */ 1000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1}, + /* garbage_blob_count */ 0, /* garbage_blob_bytes */ 0); + + UpdateVersionStorageInfo(); + + // Mark the entire blob file garbage but do not remove the linked SST. + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + VersionEdit edit; + + edit.AddBlobFileGarbage(/* blob_file_number */ 16, + /* garbage_blob_count */ 1000, + /* garbage_blob_bytes */ 1000000); + + ASSERT_OK(builder.Apply(&edit)); + + // Save to a new version in order to trigger consistency checks. + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + const Status s = builder.SaveTo(&new_vstorage); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), "Blob file #16 consists entirely of garbage")); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { + // Initialize base version. Table files 1..10 are linked to blob files 1..5, + // while table files 11..20 are not linked to any blob files. 
+ + for (uint64_t i = 1; i <= 10; ++i) { + std::ostringstream oss; + oss << std::setw(2) << std::setfill('0') << i; + + const std::string key = oss.str(); + + Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(), + /* largest */ key.c_str(), /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ i * 100, + /* largest_seqno */ i * 100, + /* oldest_blob_file_number */ ((i - 1) % 5) + 1); + } + + for (uint64_t i = 1; i <= 5; ++i) { + AddBlob(/* blob_file_number */ i, /* total_blob_count */ 2000, + /* total_blob_bytes */ 2000000, + /* checksum_method */ std::string(), + /* checksum_value */ std::string(), + BlobFileMetaData::LinkedSsts{i, i + 5}, + /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000); + } + + for (uint64_t i = 11; i <= 20; ++i) { + std::ostringstream oss; + oss << std::setw(2) << std::setfill('0') << i; + + const std::string key = oss.str(); + + Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(), + /* largest */ key.c_str(), /* file_size */ 100, + /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100, + /* num_entries */ 0, /* num_deletions */ 0, + /* sampled */ false, /* smallest_seqno */ i * 100, + /* largest_seqno */ i * 100, kInvalidBlobFileNumber); + } + + UpdateVersionStorageInfo(); + + { + const auto& blob_files = vstorage_.GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 5); + + const std::vector expected_linked_ssts{ + {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + + for (size_t i = 0; i < 5; ++i) { + const auto meta = + GetBlobFileMetaData(blob_files, /* blob_file_number */ i + 1); + ASSERT_NE(meta, nullptr); + ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]); + } + } + + VersionEdit edit; + + // Add an SST that references a blob file. 
+ edit.AddFile( + /* level */ 1, /* file_number */ 21, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("21", 2100), + /* largest */ GetInternalKey("21", 2100), /* smallest_seqno */ 2100, + /* largest_seqno */ 2100, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + // Add an SST that does not reference any blob files. + edit.AddFile( + /* level */ 1, /* file_number */ 22, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("22", 2200), + /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200, + /* largest_seqno */ 2200, /* marked_for_compaction */ false, + Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + // Delete a file that references a blob file. + edit.DeleteFile(/* level */ 1, /* file_number */ 6); + + // Delete a file that does not reference any blob files. + edit.DeleteFile(/* level */ 1, /* file_number */ 16); + + // Trivially move a file that references a blob file. Note that we save + // the original BlobFileMetaData object so we can check that no new object + // gets created. 
+ auto meta3 = + GetBlobFileMetaData(vstorage_.GetBlobFiles(), /* blob_file_number */ 3); + + edit.DeleteFile(/* level */ 1, /* file_number */ 3); + edit.AddFile(/* level */ 2, /* file_number */ 3, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("03", 300), + /* largest */ GetInternalKey("03", 300), + /* smallest_seqno */ 300, + /* largest_seqno */ 300, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + // Trivially move a file that does not reference any blob files. + edit.DeleteFile(/* level */ 1, /* file_number */ 13); + edit.AddFile(/* level */ 2, /* file_number */ 13, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("13", 1300), + /* largest */ GetInternalKey("13", 1300), + /* smallest_seqno */ 1300, + /* largest_seqno */ 1300, /* marked_for_compaction */ false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + + // Add one more SST file that references a blob file, then promptly + // delete it in a second version edit before the new version gets saved. + // This file should not show up as linked to the blob file in the new version. 
+ edit.AddFile(/* level */ 1, /* file_number */ 23, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("23", 2300), + /* largest */ GetInternalKey("23", 2300), + /* smallest_seqno */ 2300, + /* largest_seqno */ 2300, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); + + VersionEdit edit2; + + edit2.DeleteFile(/* level */ 1, /* file_number */ 23); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_, + version_set); + + ASSERT_OK(builder.Apply(&edit)); + ASSERT_OK(builder.Apply(&edit2)); + + constexpr bool force_consistency_checks = true; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + + { + const auto& blob_files = new_vstorage.GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 5); + + const std::vector expected_linked_ssts{ + {1, 21}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}; + + for (size_t i = 0; i < 5; ++i) { + const auto meta = + GetBlobFileMetaData(blob_files, /* blob_file_number */ i + 1); + ASSERT_NE(meta, nullptr); + ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]); + } + + // Make sure that no new BlobFileMetaData got created for the blob file + // affected by the trivial move. 
+ ASSERT_EQ(GetBlobFileMetaData(blob_files, /* blob_file_number */ 3), meta3); + } + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { + Add(0, 1U, "150", "200", 100U); + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.DeleteFile(0, 1U); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + constexpr VersionSet* version_set = nullptr; + + VersionBuilder version_builder(env_options, &ioptions_, table_cache, + &vstorage_, version_set); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + VersionBuilder version_builder2(env_options, &ioptions_, table_cache, + &new_vstorage, version_set); + VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_NOK(version_builder2.Apply(&version_edit)); + + UnrefFilesInVersion(&new_vstorage); + UnrefFilesInVersion(&new_vstorage2); +} + TEST_F(VersionBuilderTest, EstimatedActiveKeys) { const uint32_t kTotalSamples = 20; const uint32_t kNumLevels = 5; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,7 @@ #include "db/version_edit.h" -#include "db/blob_index.h" +#include "db/blob/blob_index.h" #include "db/version_set.h" #include "logging/event_logger.h" #include "rocksdb/slice.h" @@ -18,61 +18,10 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -// The unknown file checksum. 
-const std::string kUnknownFileChecksum(""); -// The unknown sst file checksum function name. -const std::string kUnknownFileChecksumFuncName("Unknown"); -// Mask for an identified tag from the future which can be safely ignored. -const uint32_t kTagSafeIgnoreMask = 1 << 13; - -// Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. The number should be forward compatible so -// users can down-grade RocksDB safely. A future Tag is ignored by doing '&' -// between Tag and kTagSafeIgnoreMask field. -enum Tag : uint32_t { - kComparator = 1, - kLogNumber = 2, - kNextFileNumber = 3, - kLastSequence = 4, - kCompactPointer = 5, - kDeletedFile = 6, - kNewFile = 7, - // 8 was used for large value refs - kPrevLogNumber = 9, - kMinLogNumberToKeep = 10, - // Ignore-able field - kDbId = kTagSafeIgnoreMask + 1, - - // these are new formats divergent from open source leveldb - kNewFile2 = 100, - kNewFile3 = 102, - kNewFile4 = 103, // 4th (the latest) format version of adding files - kColumnFamily = 200, // specify column family for version edit - kColumnFamilyAdd = 201, - kColumnFamilyDrop = 202, - kMaxColumnFamily = 203, - - kInAtomicGroup = 300, -}; - -enum CustomTag : uint32_t { - kTerminate = 1, // The end of customized fields - kNeedCompaction = 2, - // Since Manifest is not entirely currently forward-compatible, and the only - // forward-compatible part is the CutsomtTag of kNewFile, we currently encode - // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be - // removed when manifest becomes forward-comptabile. - kMinLogNumberToKeepHack = 3, - kOldestBlobFileNumber = 4, - kOldestAncesterTime = 5, - kFileCreationTime = 6, - kFileChecksum = 7, - kFileChecksumFuncName = 8, - kPathId = 65, -}; -// If this bit for the custom tag is set, opening DB should fail if -// we don't know this field. 
-uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6; + +namespace { + +} // anonymous namespace uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { assert(number <= kFileNumberMask); @@ -89,7 +38,6 @@ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); fd.largest_seqno = std::max(fd.largest_seqno, seqno); -#ifndef ROCKSDB_LITE if (value_type == kTypeBlobIndex) { BlobIndex blob_index; const Status s = blob_index.DecodeFrom(value); @@ -116,10 +64,6 @@ oldest_blob_file_number = blob_index.file_number(); } } -#else - (void)value; - (void)value_type; -#endif } void VersionEdit::Clear() { @@ -142,12 +86,17 @@ has_last_sequence_ = false; deleted_files_.clear(); new_files_.clear(); + blob_file_additions_.clear(); + blob_file_garbages_.clear(); + wal_additions_.clear(); + wal_deletion_.Reset(); column_family_ = 0; is_column_family_add_ = false; is_column_family_drop_ = false; column_family_name_.clear(); is_in_atomic_group_ = false; remaining_entries_ = 0; + full_history_ts_low_.clear(); } bool VersionEdit::EncodeTo(std::string* dst) const { @@ -217,45 +166,60 @@ // tag kNeedCompaction: // now only can take one char value 1 indicating need-compaction // - PutVarint32(dst, CustomTag::kOldestAncesterTime); + PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime); std::string varint_oldest_ancester_time; PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", &varint_oldest_ancester_time); PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); - PutVarint32(dst, CustomTag::kFileCreationTime); + PutVarint32(dst, NewFileCustomTag::kFileCreationTime); std::string varint_file_creation_time; PutVarint64(&varint_file_creation_time, f.file_creation_time); TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", &varint_file_creation_time); PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); - PutVarint32(dst, CustomTag::kFileChecksum); + 
PutVarint32(dst, NewFileCustomTag::kFileChecksum); PutLengthPrefixedSlice(dst, Slice(f.file_checksum)); - PutVarint32(dst, CustomTag::kFileChecksumFuncName); + PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName); PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name)); + if (f.max_timestamp != kDisableUserTimestamp) { + if (f.min_timestamp.size() != f.max_timestamp.size()) { + assert(false); + return false; + } + PutVarint32(dst, NewFileCustomTag::kMinTimestamp); + PutLengthPrefixedSlice(dst, Slice(f.min_timestamp)); + PutVarint32(dst, NewFileCustomTag::kMaxTimestamp); + PutLengthPrefixedSlice(dst, Slice(f.max_timestamp)); + } if (f.fd.GetPathId() != 0) { - PutVarint32(dst, CustomTag::kPathId); + PutVarint32(dst, NewFileCustomTag::kPathId); char p = static_cast(f.fd.GetPathId()); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + if (f.temperature != Temperature::kUnknown) { + PutVarint32(dst, NewFileCustomTag::kTemperature); + char p = static_cast(f.temperature); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } if (f.marked_for_compaction) { - PutVarint32(dst, CustomTag::kNeedCompaction); + PutVarint32(dst, NewFileCustomTag::kNeedCompaction); char p = static_cast(1); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } if (has_min_log_number_to_keep_ && !min_log_num_written) { - PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack); + PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack); std::string varint_log_number; PutFixed64(&varint_log_number, min_log_number_to_keep_); PutLengthPrefixedSlice(dst, Slice(varint_log_number)); min_log_num_written = true; } if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { - PutVarint32(dst, CustomTag::kOldestBlobFileNumber); + PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber); std::string oldest_blob_file_number; PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); @@ -263,7 +227,31 @@ 
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); - PutVarint32(dst, CustomTag::kTerminate); + PutVarint32(dst, NewFileCustomTag::kTerminate); + } + + for (const auto& blob_file_addition : blob_file_additions_) { + PutVarint32(dst, kBlobFileAddition); + blob_file_addition.EncodeTo(dst); + } + + for (const auto& blob_file_garbage : blob_file_garbages_) { + PutVarint32(dst, kBlobFileGarbage); + blob_file_garbage.EncodeTo(dst); + } + + for (const auto& wal_addition : wal_additions_) { + PutVarint32(dst, kWalAddition2); + std::string encoded; + wal_addition.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); + } + + if (!wal_deletion_.IsEmpty()) { + PutVarint32(dst, kWalDeletion2); + std::string encoded; + wal_deletion_.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } // 0 is default and does not need to be explicitly written @@ -284,6 +272,11 @@ PutVarint32(dst, kInAtomicGroup); PutVarint32(dst, remaining_entries_); } + + if (HasFullHistoryTsLow()) { + PutVarint32(dst, kFullHistoryTsLow); + PutLengthPrefixedSlice(dst, full_history_ts_low_); + } return true; } @@ -319,9 +312,6 @@ uint64_t file_size = 0; SequenceNumber smallest_seqno = 0; SequenceNumber largest_seqno = kMaxSequenceNumber; - // Since this is the only forward-compatible part of the code, we hack new - // extension into this record. When we do, we set this boolean to distinguish - // the record from the normal NewFile records. 
if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) && GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) && GetInternalKey(input, &f.largest) && @@ -335,6 +325,10 @@ return "new-file4 custom field"; } if (custom_tag == kTerminate) { + if (f.min_timestamp.size() != f.max_timestamp.size()) { + assert(false); + return "new-file4 custom field timestamp size mismatch error"; + } break; } if (!GetLengthPrefixedSlice(input, &field)) { @@ -385,6 +379,22 @@ return "invalid oldest blob file number"; } break; + case kTemperature: + if (field.size() != 1) { + return "temperature field wrong size"; + } else { + Temperature casted_field = static_cast(field[0]); + if (casted_field <= Temperature::kCold) { + f.temperature = casted_field; + } + } + break; + case kMinTimestamp: + f.min_timestamp = field.ToString(); + break; + case kMaxTimestamp: + f.max_timestamp = field.ToString(); + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -404,6 +414,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); +#ifndef NDEBUG + bool ignore_ignorable_tags = false; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags", + &ignore_ignorable_tags); +#endif Slice input = src; const char* msg = nullptr; uint32_t tag = 0; @@ -414,6 +429,11 @@ Slice str; InternalKey key; while (msg == nullptr && GetVarint32(&input, &tag)) { +#ifndef NDEBUG + if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) { + tag = kTagSafeIgnoreMask; + } +#endif switch (tag) { case kDbId: if (GetLengthPrefixedSlice(&input, &str)) { @@ -571,6 +591,86 @@ break; } + case kBlobFileAddition: + case kBlobFileAddition_DEPRECATED: { + BlobFileAddition blob_file_addition; + const Status s = blob_file_addition.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + AddBlobFile(std::move(blob_file_addition)); + break; + } + + case kBlobFileGarbage: + case kBlobFileGarbage_DEPRECATED: { + BlobFileGarbage 
blob_file_garbage; + const Status s = blob_file_garbage.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + AddBlobFileGarbage(std::move(blob_file_garbage)); + break; + } + + case kWalAddition: { + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalAddition2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalAddition not prefixed by length"; + break; + } + + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + + case kWalDeletion: { + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&input); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kWalDeletion2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalDeletion not prefixed by length"; + break; + } + + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + case kColumnFamily: if (!GetVarint32(&input, &column_family_)) { if (!msg) { @@ -603,6 +703,16 @@ } break; + case kFullHistoryTsLow: + if (!GetLengthPrefixedSlice(&input, &str)) { + msg = "full_history_ts_low"; + } else if (str.empty()) { + msg = "full_history_ts_low: empty"; + } else { + full_history_ts_low_.assign(str.data(), str.size()); + } + break; + default: if (tag & kTagSafeIgnoreMask) { // Tag from future which can be safely ignored. 
@@ -691,15 +801,49 @@ r.append(" blob_file:"); AppendNumberTo(&r, f.oldest_blob_file_number); } + if (f.min_timestamp != kDisableUserTimestamp) { + assert(f.max_timestamp != kDisableUserTimestamp); + r.append(" min_timestamp:"); + r.append(Slice(f.min_timestamp).ToString(true)); + r.append(" max_timestamp:"); + r.append(Slice(f.max_timestamp).ToString(true)); + } r.append(" oldest_ancester_time:"); AppendNumberTo(&r, f.oldest_ancester_time); r.append(" file_creation_time:"); AppendNumberTo(&r, f.file_creation_time); r.append(" file_checksum:"); - r.append(f.file_checksum); + r.append(Slice(f.file_checksum).ToString(true)); r.append(" file_checksum_func_name: "); r.append(f.file_checksum_func_name); + if (f.temperature != Temperature::kUnknown) { + r.append(" temperature: "); + // Maybe change to human readable format whenthe feature becomes + // permanent + r.append(ToString(static_cast(f.temperature))); + } + } + + for (const auto& blob_file_addition : blob_file_additions_) { + r.append("\n BlobFileAddition: "); + r.append(blob_file_addition.DebugString()); + } + + for (const auto& blob_file_garbage : blob_file_garbages_) { + r.append("\n BlobFileGarbage: "); + r.append(blob_file_garbage.DebugString()); } + + for (const auto& wal_addition : wal_additions_) { + r.append("\n WalAddition: "); + r.append(wal_addition.DebugString()); + } + + if (!wal_deletion_.IsEmpty()) { + r.append("\n WalDeletion: "); + r.append(wal_deletion_.DebugString()); + } + r.append("\n ColumnFamily: "); AppendNumberTo(&r, column_family_); if (is_column_family_add_) { @@ -714,6 +858,10 @@ AppendNumberTo(&r, remaining_entries_); r.append(" entries remains"); } + if (HasFullHistoryTsLow()) { + r.append("\n FullHistoryTsLow: "); + r.append(Slice(full_history_ts_low_).ToString(hex_key)); + } r.append("\n}\n"); return r; } @@ -773,15 +921,81 @@ jw << "FileSize" << f.fd.GetFileSize(); jw << "SmallestIKey" << f.smallest.DebugString(hex_key); jw << "LargestIKey" << f.largest.DebugString(hex_key); + 
if (f.min_timestamp != kDisableUserTimestamp) { + assert(f.max_timestamp != kDisableUserTimestamp); + jw << "MinTimestamp" << Slice(f.min_timestamp).ToString(true); + jw << "MaxTimestamp" << Slice(f.max_timestamp).ToString(true); + } + jw << "OldestAncesterTime" << f.oldest_ancester_time; + jw << "FileCreationTime" << f.file_creation_time; + jw << "FileChecksum" << Slice(f.file_checksum).ToString(true); + jw << "FileChecksumFuncName" << f.file_checksum_func_name; + if (f.temperature != Temperature::kUnknown) { + jw << "temperature" << ToString(static_cast(f.temperature)); + } if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { jw << "OldestBlobFile" << f.oldest_blob_file_number; } + if (f.temperature != Temperature::kUnknown) { + // Maybe change to human readable format whenthe feature becomes + // permanent + jw << "Temperature" << static_cast(f.temperature); + } + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!blob_file_additions_.empty()) { + jw << "BlobFileAdditions"; + + jw.StartArray(); + + for (const auto& blob_file_addition : blob_file_additions_) { + jw.StartArrayedObject(); + jw << blob_file_addition; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!blob_file_garbages_.empty()) { + jw << "BlobFileGarbages"; + + jw.StartArray(); + + for (const auto& blob_file_garbage : blob_file_garbages_) { + jw.StartArrayedObject(); + jw << blob_file_garbage; jw.EndArrayedObject(); } jw.EndArray(); } + if (!wal_additions_.empty()) { + jw << "WalAdditions"; + + jw.StartArray(); + + for (const auto& wal_addition : wal_additions_) { + jw.StartArrayedObject(); + jw << wal_addition; + jw.EndArrayedObject(); + } + + jw.EndArray(); + } + + if (!wal_deletion_.IsEmpty()) { + jw << "WalDeletion"; + jw.StartObject(); + jw << wal_deletion_; + jw.EndObject(); + } + jw << "ColumnFamily" << column_family_; if (is_column_family_add_) { @@ -794,6 +1008,10 @@ jw << "AtomicGroup" << remaining_entries_; } + if (HasFullHistoryTsLow()) { + jw << 
"FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key); + } + jw.EndObject(); return jw.Get(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,24 +13,93 @@ #include #include #include + +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_garbage.h" #include "db/dbformat.h" +#include "db/wal_edit.h" #include "memory/arena.h" +#include "rocksdb/advanced_options.h" #include "rocksdb/cache.h" #include "table/table_reader.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +// Tag numbers for serialized VersionEdit. These numbers are written to +// disk and should not be changed. The number should be forward compatible so +// users can down-grade RocksDB safely. A future Tag is ignored by doing '&' +// between Tag and kTagSafeIgnoreMask field. +enum Tag : uint32_t { + kComparator = 1, + kLogNumber = 2, + kNextFileNumber = 3, + kLastSequence = 4, + kCompactPointer = 5, + kDeletedFile = 6, + kNewFile = 7, + // 8 was used for large value refs + kPrevLogNumber = 9, + kMinLogNumberToKeep = 10, + + // these are new formats divergent from open source leveldb + kNewFile2 = 100, + kNewFile3 = 102, + kNewFile4 = 103, // 4th (the latest) format version of adding files + kColumnFamily = 200, // specify column family for version edit + kColumnFamilyAdd = 201, + kColumnFamilyDrop = 202, + kMaxColumnFamily = 203, + + kInAtomicGroup = 300, + + kBlobFileAddition = 400, + kBlobFileGarbage, + + // Mask for an unidentified tag from the future which can be safely ignored. 
+ kTagSafeIgnoreMask = 1 << 13, + + // Forward compatible (aka ignorable) records + kDbId, + kBlobFileAddition_DEPRECATED, + kBlobFileGarbage_DEPRECATED, + kWalAddition, + kWalDeletion, + kFullHistoryTsLow, + kWalAddition2, + kWalDeletion2, +}; + +enum NewFileCustomTag : uint32_t { + kTerminate = 1, // The end of customized fields + kNeedCompaction = 2, + // Since Manifest is not entirely forward-compatible, we currently encode + // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed + // when manifest becomes forward-compatible. + kMinLogNumberToKeepHack = 3, + kOldestBlobFileNumber = 4, + kOldestAncesterTime = 5, + kFileCreationTime = 6, + kFileChecksum = 7, + kFileChecksumFuncName = 8, + kTemperature = 9, + kMinTimestamp = 10, + kMaxTimestamp = 11, + + // If this bit for the custom tag is set, opening DB should fail if + // we don't know this field. + kCustomTagNonSafeIgnoreMask = 1 << 6, + + // Forward incompatible (aka unignorable) fields + kPathId, +}; + class VersionSet; constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; -constexpr uint64_t kInvalidBlobFileNumber = 0; constexpr uint64_t kUnknownOldestAncesterTime = 0; constexpr uint64_t kUnknownFileCreationTime = 0; -extern const std::string kUnknownFileChecksum; -extern const std::string kUnknownFileChecksumFuncName; - extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); // A copyable structure contains information needed to read data from an SST @@ -123,6 +192,7 @@ bool marked_for_compaction = false; // True if client asked us nicely to // compact this file. + Temperature temperature = Temperature::kUnknown; // Used only in BlobDB. The file number of the oldest blob file this SST file // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. @@ -130,7 +200,7 @@ // The file could be the compaction output from other SST files, which could // in turn be outputs for compact older SST files. 
We track the memtable - // flush timestamp for the oldest SST file that eventaully contribute data + // flush timestamp for the oldest SST file that eventually contribute data // to this file. 0 means the information is not available. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; @@ -142,6 +212,10 @@ // File checksum function name std::string file_checksum_func_name = kUnknownFileChecksumFuncName; + // Min (oldest) timestamp of keys in this file + std::string min_timestamp; + // Max (newest) timestamp of keys in this file + std::string max_timestamp; FileMetaData() = default; @@ -149,18 +223,23 @@ const InternalKey& smallest_key, const InternalKey& largest_key, const SequenceNumber& smallest_seq, const SequenceNumber& largest_seq, bool marked_for_compact, - uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, - uint64_t _file_creation_time, const std::string& _file_checksum, - const std::string& _file_checksum_func_name) + Temperature _temperature, uint64_t oldest_blob_file, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + const std::string& _file_checksum, + const std::string& _file_checksum_func_name, + std::string _min_timestamp, std::string _max_timestamp) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), marked_for_compaction(marked_for_compact), + temperature(_temperature), oldest_blob_file_number(oldest_blob_file), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), file_checksum(_file_checksum), - file_checksum_func_name(_file_checksum_func_name) { + file_checksum_func_name(_file_checksum_func_name), + min_timestamp(std::move(_min_timestamp)), + max_timestamp(std::move(_max_timestamp)) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -307,16 +386,16 @@ bool HasLastSequence() const { return has_last_sequence_; } SequenceNumber GetLastSequence() const { return last_sequence_; } - // Delete the specified "file" 
from the specified "level". + // Delete the specified table file from the specified level. void DeleteFile(int level, uint64_t file) { deleted_files_.emplace(level, file); } - // Retrieve the files deleted as well as their associated levels. + // Retrieve the table files deleted as well as their associated levels. using DeletedFiles = std::set>; const DeletedFiles& GetDeletedFiles() const { return deleted_files_; } - // Add the specified file at the specified level. + // Add the specified table file at the specified level. // REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file @@ -325,29 +404,120 @@ uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno, bool marked_for_compaction, - uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, - uint64_t file_creation_time, const std::string& file_checksum, - const std::string& file_checksum_func_name) { + Temperature temperature, uint64_t oldest_blob_file_number, + uint64_t oldest_ancester_time, uint64_t file_creation_time, + const std::string& file_checksum, + const std::string& file_checksum_func_name, + const std::string& min_timestamp, + const std::string& max_timestamp) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( - level, FileMetaData(file, file_path_id, file_size, smallest, largest, - smallest_seqno, largest_seqno, - marked_for_compaction, oldest_blob_file_number, - oldest_ancester_time, file_creation_time, - file_checksum, file_checksum_func_name)); + level, + FileMetaData(file, file_path_id, file_size, smallest, largest, + smallest_seqno, largest_seqno, marked_for_compaction, + temperature, oldest_blob_file_number, oldest_ancester_time, + file_creation_time, file_checksum, file_checksum_func_name, + min_timestamp, 
max_timestamp)); + if (!HasLastSequence() || largest_seqno > GetLastSequence()) { + SetLastSequence(largest_seqno); + } } void AddFile(int level, const FileMetaData& f) { assert(f.fd.smallest_seqno <= f.fd.largest_seqno); new_files_.emplace_back(level, f); + if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) { + SetLastSequence(f.fd.largest_seqno); + } } - // Retrieve the files added as well as their associated levels. + // Retrieve the table files added as well as their associated levels. using NewFiles = std::vector>; const NewFiles& GetNewFiles() const { return new_files_; } + // Add a new blob file. + void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, std::string checksum_method, + std::string checksum_value) { + blob_file_additions_.emplace_back( + blob_file_number, total_blob_count, total_blob_bytes, + std::move(checksum_method), std::move(checksum_value)); + } + + void AddBlobFile(BlobFileAddition blob_file_addition) { + blob_file_additions_.emplace_back(std::move(blob_file_addition)); + } + + // Retrieve all the blob files added. + using BlobFileAdditions = std::vector; + const BlobFileAdditions& GetBlobFileAdditions() const { + return blob_file_additions_; + } + + void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) { + assert(blob_file_additions_.empty()); + blob_file_additions_ = std::move(blob_file_additions); + } + + // Add garbage for an existing blob file. Note: intentionally broken English + // follows. + void AddBlobFileGarbage(uint64_t blob_file_number, + uint64_t garbage_blob_count, + uint64_t garbage_blob_bytes) { + blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + } + + void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) { + blob_file_garbages_.emplace_back(std::move(blob_file_garbage)); + } + + // Retrieve all the blob file garbage added. 
+ using BlobFileGarbages = std::vector; + const BlobFileGarbages& GetBlobFileGarbages() const { + return blob_file_garbages_; + } + + void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) { + assert(blob_file_garbages_.empty()); + blob_file_garbages_ = std::move(blob_file_garbages); + } + + // Add a WAL (either just created or closed). + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. + void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) { + assert(NumEntries() == wal_additions_.size()); + wal_additions_.emplace_back(number, std::move(metadata)); + } + + // Retrieve all the added WALs. + const WalAdditions& GetWalAdditions() const { return wal_additions_; } + + bool IsWalAddition() const { return !wal_additions_.empty(); } + + // Delete a WAL (either directly deleted or archived). + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. + void DeleteWalsBefore(WalNumber number) { + assert((NumEntries() == 1) == !wal_deletion_.IsEmpty()); + wal_deletion_ = WalDeletion(number); + } + + const WalDeletion& GetWalDeletion() const { return wal_deletion_; } + + bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); } + + bool IsWalManipulation() const { + size_t entries = NumEntries(); + return (entries > 0) && ((entries == wal_additions_.size()) || + (entries == !wal_deletion_.IsEmpty())); + } + // Number of edits - size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); } + size_t NumEntries() const { + return new_files_.size() + deleted_files_.size() + + blob_file_additions_.size() + blob_file_garbages_.size() + + wal_additions_.size() + !wal_deletion_.IsEmpty(); + } void SetColumnFamily(uint32_t column_family_id) { column_family_ = column_family_id; @@ -375,6 +545,10 @@ return is_column_family_add_ || is_column_family_drop_; } + bool IsColumnFamilyAdd() const { return is_column_family_add_; } + + bool IsColumnFamilyDrop() const { return is_column_family_drop_; } + void 
MarkAtomicGroup(uint32_t remaining_entries) { is_in_atomic_group_ = true; remaining_entries_ = remaining_entries; @@ -382,6 +556,16 @@ bool IsInAtomicGroup() const { return is_in_atomic_group_; } uint32_t GetRemainingEntries() const { return remaining_entries_; } + bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); } + const std::string& GetFullHistoryTsLow() const { + assert(HasFullHistoryTsLow()); + return full_history_ts_low_; + } + void SetFullHistoryTsLow(std::string full_history_ts_low) { + assert(!full_history_ts_low.empty()); + full_history_ts_low_ = std::move(full_history_ts_low); + } + // return true on success. bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); @@ -391,6 +575,11 @@ private: friend class ReactiveVersionSet; + friend class VersionEditHandlerBase; + friend class ListColumnFamiliesHandler; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class VersionSet; friend class Version; friend class AtomicGroupReadBuffer; @@ -421,6 +610,12 @@ DeletedFiles deleted_files_; NewFiles new_files_; + BlobFileAdditions blob_file_additions_; + BlobFileGarbages blob_file_garbages_; + + WalAdditions wal_additions_; + WalDeletion wal_deletion_; + // Each version edit record should have column_family_ set // If it's not set, it is default (0) uint32_t column_family_ = 0; @@ -433,6 +628,8 @@ bool is_in_atomic_group_ = false; uint32_t remaining_entries_ = 0; + + std::string full_history_ts_low_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,980 @@ +// Copyright (c) 2011-present, Facebook, 
Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/version_edit_handler.h" + +#include +#include + +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "logging/logging.h" +#include "monitoring/persistent_stats_history.h" + +namespace ROCKSDB_NAMESPACE { + +void VersionEditHandlerBase::Iterate(log::Reader& reader, + Status* log_read_status) { + Slice record; + std::string scratch; + assert(log_read_status); + assert(log_read_status->ok()); + + size_t recovered_edits = 0; + Status s = Initialize(); + while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() && + reader.ReadRecord(&record, &scratch) && log_read_status->ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + + s = read_buffer_.AddEdit(&edit); + if (!s.ok()) { + break; + } + ColumnFamilyData* cfd = nullptr; + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyVersionEdit(e, &cfd); + if (!s.ok()) { + break; + } + ++recovered_edits; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + s = ApplyVersionEdit(edit, &cfd); + if (s.ok()) { + ++recovered_edits; + } + } + } + if (!log_read_status->ok()) { + s = *log_read_status; + } + + CheckIterationResult(reader, &s); + + if (!s.ok()) { + if (s.IsCorruption()) { + // when we find a Corruption error, something is + // wrong with the underlying file. 
in this case we + // want to report the filename, so in here we append + // the filename to the Corruption message + assert(reader.file()); + + // build a new error message + std::stringstream message; + // append previous dynamic state message + const char* state = s.getState(); + if (state != nullptr) { + message << state; + message << ' '; + } + // append the filename to the corruption message + message << "in file " << reader.file()->file_name(); + // overwrite the status with the extended status + s = Status(s.code(), s.subcode(), s.severity(), message.str()); + } + status_ = s; + } + TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish", + &recovered_edits); +} + +Status ListColumnFamiliesHandler::ApplyVersionEdit( + VersionEdit& edit, ColumnFamilyData** /*unused*/) { + Status s; + if (edit.is_column_family_add_) { + if (column_family_names_.find(edit.column_family_) != + column_family_names_.end()) { + s = Status::Corruption("Manifest adding the same column family twice"); + } else { + column_family_names_.insert( + {edit.column_family_, edit.column_family_name_}); + } + } else if (edit.is_column_family_drop_) { + if (column_family_names_.find(edit.column_family_) == + column_family_names_.end()) { + s = Status::Corruption("Manifest - dropping non-existing column family"); + } else { + column_family_names_.erase(edit.column_family_); + } + } + return s; +} + +Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) { + for (const auto& deleted_file : edit.GetDeletedFiles()) { + Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second); + if (!s.ok()) { + return s; + } + } + for (const auto& new_file : edit.GetNewFiles()) { + Status s = file_checksum_list_.InsertOneFileChecksum( + new_file.second.fd.GetNumber(), new_file.second.file_checksum, + new_file.second.file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + for (const auto& new_blob_file : edit.GetBlobFileAdditions()) { + 
std::string checksum_value = new_blob_file.GetChecksumValue(); + std::string checksum_method = new_blob_file.GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (checksum_method.empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + Status s = file_checksum_list_.InsertOneFileChecksum( + new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +VersionEditHandler::VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, const std::shared_ptr& io_tracer, + bool skip_load_table_files) + : VersionEditHandlerBase(), + read_only_(read_only), + column_families_(std::move(column_families)), + version_set_(version_set), + track_missing_files_(track_missing_files), + no_error_if_files_missing_(no_error_if_files_missing), + io_tracer_(io_tracer), + skip_load_table_files_(skip_load_table_files), + initialized_(false) { + assert(version_set_ != nullptr); +} + +Status VersionEditHandler::Initialize() { + Status s; + if (!initialized_) { + for (const auto& cf_desc : column_families_) { + name_to_options_.emplace(cf_desc.name, cf_desc.options); + } + auto default_cf_iter = name_to_options_.find(kDefaultColumnFamilyName); + if (default_cf_iter == name_to_options_.end()) { + s = Status::InvalidArgument("Default column family not specified"); + } + if (s.ok()) { + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* cfd = + CreateCfAndInit(default_cf_iter->second, default_cf_edit); + assert(cfd != nullptr); +#ifdef NDEBUG + (void)cfd; +#endif + initialized_ = true; + } + } + return s; +} + +Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) { + Status s; + if (edit.is_column_family_add_) { + s = 
OnColumnFamilyAdd(edit, cfd); + } else if (edit.is_column_family_drop_) { + s = OnColumnFamilyDrop(edit, cfd); + } else if (edit.IsWalAddition()) { + s = OnWalAddition(edit); + } else if (edit.IsWalDeletion()) { + s = OnWalDeletion(edit); + } else { + s = OnNonCfOperation(edit, cfd); + } + if (s.ok()) { + assert(cfd != nullptr); + s = ExtractInfoFromVersionEdit(*cfd, edit); + } + return s; +} + +Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit, + ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + Status s; + if (cf_in_builders || cf_in_not_found) { + s = Status::Corruption("MANIFEST adding the same column family twice: " + + edit.column_family_name_); + } + if (s.ok()) { + auto cf_options = name_to_options_.find(edit.column_family_name_); + // implicitly add persistent_stats column family without requiring user + // to specify + ColumnFamilyData* tmp_cfd = nullptr; + bool is_persistent_stats_column_family = + edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + if (cf_options == name_to_options_.end() && + !is_persistent_stats_column_family) { + column_families_not_found_.emplace(edit.column_family_, + edit.column_family_name_); + } else { + if (is_persistent_stats_column_family) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + tmp_cfd = CreateCfAndInit(cfo, edit); + } else { + tmp_cfd = CreateCfAndInit(cf_options->second, edit); + } + *cfd = tmp_cfd; + } + } + return s; +} + +Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit, + ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + ColumnFamilyData* tmp_cfd = nullptr; + Status s; + if (cf_in_builders) { + tmp_cfd = DestroyCfAndCleanup(edit); + } else if 
(cf_in_not_found) { + column_families_not_found_.erase(edit.column_family_); + } else { + s = Status::Corruption("MANIFEST - dropping non-existing column family"); + } + *cfd = tmp_cfd; + return s; +} + +Status VersionEditHandler::OnWalAddition(VersionEdit& edit) { + assert(edit.IsWalAddition()); + return version_set_->wals_.AddWals(edit.GetWalAdditions()); +} + +Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) { + assert(edit.IsWalDeletion()); + return version_set_->wals_.DeleteWalsBefore( + edit.GetWalDeletion().GetLogNumber()); +} + +Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit, + ColumnFamilyData** cfd) { + bool cf_in_not_found = false; + bool cf_in_builders = false; + CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders); + + assert(cfd != nullptr); + *cfd = nullptr; + Status s; + if (!cf_in_not_found) { + if (!cf_in_builders) { + s = Status::Corruption( + "MANIFEST record referencing unknown column family"); + } + ColumnFamilyData* tmp_cfd = nullptr; + if (s.ok()) { + auto builder_iter = builders_.find(edit.column_family_); + assert(builder_iter != builders_.end()); + tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily( + edit.column_family_); + assert(tmp_cfd != nullptr); + s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false); + if (s.ok()) { + s = builder_iter->second->version_builder()->Apply(&edit); + } + } + *cfd = tmp_cfd; + } + return s; +} + +// TODO maybe cache the computation result +bool VersionEditHandler::HasMissingFiles() const { + bool ret = false; + for (const auto& elem : cf_to_missing_files_) { + const auto& missing_files = elem.second; + if (!missing_files.empty()) { + ret = true; + break; + } + } + if (!ret) { + for (const auto& elem : cf_to_missing_blob_files_high_) { + if (elem.second != kInvalidBlobFileNumber) { + ret = true; + break; + } + } + } + return ret; +} + +void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit, + bool* cf_in_not_found, + bool* 
cf_in_builders) const { + assert(cf_in_not_found != nullptr); + assert(cf_in_builders != nullptr); + // Not found means that user didn't supply that column + // family option AND we encountered column family add + // record. Once we encounter column family drop record, + // we will delete the column family from + // column_families_not_found. + bool in_not_found = column_families_not_found_.find(edit.column_family_) != + column_families_not_found_.end(); + // in builders means that user supplied that column family + // option AND that we encountered column family add record + bool in_builders = builders_.find(edit.column_family_) != builders_.end(); + // They cannot both be true + assert(!(in_not_found && in_builders)); + *cf_in_not_found = in_not_found; + *cf_in_builders = in_builders; +} + +void VersionEditHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + assert(s != nullptr); + if (!s->ok()) { + // Do nothing here. + } else if (!version_edit_params_.has_log_number_ || + !version_edit_params_.has_next_file_number_ || + !version_edit_params_.has_last_sequence_) { + std::string msg("no "); + if (!version_edit_params_.has_log_number_) { + msg.append("log_file_number, "); + } + if (!version_edit_params_.has_next_file_number_) { + msg.append("next_file_number, "); + } + if (!version_edit_params_.has_last_sequence_) { + msg.append("last_sequence, "); + } + msg = msg.substr(0, msg.size() - 2); + msg.append(" entry in MANIFEST"); + *s = Status::Corruption(msg); + } + // There were some column families in the MANIFEST that weren't specified + // in the argument. 
This is OK in read_only mode + if (s->ok() && MustOpenAllColumnFamilies() && + !column_families_not_found_.empty()) { + std::string msg; + for (const auto& cf : column_families_not_found_) { + msg.append(", "); + msg.append(cf.second); + } + msg = msg.substr(2); + *s = Status::InvalidArgument("Column families not opened: " + msg); + } + if (s->ok()) { + version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily( + version_edit_params_.max_column_family_); + version_set_->MarkMinLogNumberToKeep( + version_edit_params_.min_log_number_to_keep_); + version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_); + version_set_->MarkFileNumberUsed(version_edit_params_.log_number_); + for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + auto* builder = builder_iter->second->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + *s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + if (s->ok()) { + for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } + if (read_only_) { + cfd->table_cache()->SetTablesAreImmortal(); + } + *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false, + /*is_initial_load=*/true); + if (!s->ok()) { + // If s is IOError::PathNotFound, then we mark the db as corrupted. 
+ if (s->IsPathNotFound()) { + *s = Status::Corruption("Corruption: " + s->ToString()); + } + break; + } + } + } + if (s->ok()) { + for (auto* cfd : *(version_set_->column_family_set_)) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + VersionEdit edit; + *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true); + if (!s->ok()) { + break; + } + } + } + if (s->ok()) { + version_set_->manifest_file_size_ = reader.GetReadOffset(); + assert(version_set_->manifest_file_size_ > 0); + version_set_->next_file_number_.store( + version_edit_params_.next_file_number_ + 1); + SequenceNumber last_seq = version_edit_params_.last_sequence_; + assert(last_seq != kMaxSequenceNumber); + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_allocated_sequence_.load()) { + version_set_->last_allocated_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_published_sequence_.load()) { + version_set_->last_published_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->last_sequence_.load()) { + version_set_->last_sequence_.store(last_seq); + } + if (last_seq != kMaxSequenceNumber && + last_seq > version_set_->descriptor_last_sequence_) { + // This is the maximum last sequence of all `VersionEdit`s iterated. It + // may be greater than the maximum `largest_seqno` of all files in case + // the newest data referred to by the MANIFEST has been dropped or had its + // sequence number zeroed through compaction. 
+ version_set_->descriptor_last_sequence_ = last_seq; + } + version_set_->prev_log_number_ = version_edit_params_.prev_log_number_; + } +} + +ColumnFamilyData* VersionEditHandler::CreateCfAndInit( + const ColumnFamilyOptions& cf_options, const VersionEdit& edit) { + ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit); + assert(cfd != nullptr); + cfd->set_initialized(); + assert(builders_.find(edit.column_family_) == builders_.end()); + builders_.emplace(edit.column_family_, + VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd))); + if (track_missing_files_) { + cf_to_missing_files_.emplace(edit.column_family_, + std::unordered_set()); + cf_to_missing_blob_files_high_.emplace(edit.column_family_, + kInvalidBlobFileNumber); + } + return cfd; +} + +ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup( + const VersionEdit& edit) { + auto builder_iter = builders_.find(edit.column_family_); + assert(builder_iter != builders_.end()); + builders_.erase(builder_iter); + if (track_missing_files_) { + auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_); + assert(missing_files_iter != cf_to_missing_files_.end()); + cf_to_missing_files_.erase(missing_files_iter); + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(edit.column_family_); + assert(missing_blob_files_high_iter != + cf_to_missing_blob_files_high_.end()); + cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter); + } + ColumnFamilyData* ret = + version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_); + assert(ret != nullptr); + ret->SetDropped(); + ret->UnrefAndTryDelete(); + ret = nullptr; + return ret; +} + +Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, + ColumnFamilyData* cfd, + bool force_create_version) { + assert(cfd->initialized()); + Status s; + if (force_create_version) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + auto* 
builder = builder_iter->second->version_builder(); + auto* v = new Version(cfd, version_set_, version_set_->file_options_, + *cfd->GetLatestMutableCFOptions(), io_tracer_, + version_set_->current_version_number_++); + s = builder->SaveTo(v->storage_info()); + if (s.ok()) { + // Install new version + v->PrepareApply( + *cfd->GetLatestMutableCFOptions(), + !(version_set_->db_options_->skip_stats_update_on_db_open)); + version_set_->AppendVersion(cfd, v); + } else { + delete v; + } + } + return s; +} + +Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load) { + bool skip_load_table_files = skip_load_table_files_; + TEST_SYNC_POINT_CALLBACK( + "VersionEditHandler::LoadTables:skip_load_table_files", + &skip_load_table_files); + if (skip_load_table_files) { + return Status::OK(); + } + assert(cfd != nullptr); + assert(!cfd->IsDropped()); + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + assert(builder_iter->second != nullptr); + VersionBuilder* builder = builder_iter->second->version_builder(); + assert(builder); + Status s = builder->LoadTableHandlers( + cfd->internal_stats(), + version_set_->db_options_->max_file_opening_threads, + prefetch_index_and_filter_in_cache, is_initial_load, + cfd->GetLatestMutableCFOptions()->prefix_extractor, + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { + s = Status::OK(); + } + if (!s.ok() && !version_set_->db_options_->paranoid_checks) { + s = Status::OK(); + } + return s; +} + +Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& edit) { + Status s; + if (edit.has_db_id_) { + version_set_->db_id_ = edit.GetDbId(); + version_edit_params_.SetDBId(edit.db_id_); + } + if (cfd != nullptr) { + if (edit.has_log_number_) { + if (cfd->GetLogNumber() > edit.log_number_) { + ROCKS_LOG_WARN( 
+ version_set_->db_options()->info_log, + "MANIFEST corruption detected, but ignored - Log numbers in " + "records NOT monotonically increasing"); + } else { + cfd->SetLogNumber(edit.log_number_); + version_edit_params_.SetLogNumber(edit.log_number_); + } + } + if (edit.has_comparator_ && + edit.comparator_ != cfd->user_comparator()->Name()) { + if (!cf_to_cmp_names_) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } else { + cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_); + } + } + if (edit.HasFullHistoryTsLow()) { + const std::string& new_ts = edit.GetFullHistoryTsLow(); + cfd->SetFullHistoryTsLow(new_ts); + } + } + + if (s.ok()) { + if (edit.has_prev_log_number_) { + version_edit_params_.SetPrevLogNumber(edit.prev_log_number_); + } + if (edit.has_next_file_number_) { + version_edit_params_.SetNextFile(edit.next_file_number_); + } + if (edit.has_max_column_family_) { + version_edit_params_.SetMaxColumnFamily(edit.max_column_family_); + } + if (edit.has_min_log_number_to_keep_) { + version_edit_params_.min_log_number_to_keep_ = + std::max(version_edit_params_.min_log_number_to_keep_, + edit.min_log_number_to_keep_); + } + if (edit.has_last_sequence_) { + // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This + // is legacy behavior that cannot change without breaking downgrade + // compatibility. 
+ assert(!version_edit_params_.has_last_sequence_ || + version_edit_params_.last_sequence_ <= edit.last_sequence_); + version_edit_params_.SetLastSequence(edit.last_sequence_); + } + if (!version_edit_params_.has_prev_log_number_) { + version_edit_params_.SetPrevLogNumber(0); + } + } + return s; +} + +VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( + bool read_only, std::vector column_families, + VersionSet* version_set, const std::shared_ptr& io_tracer) + : VersionEditHandler(read_only, column_families, version_set, + /*track_missing_files=*/true, + /*no_error_if_files_missing=*/true, io_tracer) {} + +VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { + for (const auto& elem : versions_) { + delete elem.second; + } + versions_.clear(); +} + +void VersionEditHandlerPointInTime::CheckIterationResult( + const log::Reader& reader, Status* s) { + VersionEditHandler::CheckIterationResult(reader, s); + assert(s != nullptr); + if (s->ok()) { + for (auto* cfd : *(version_set_->column_family_set_)) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + auto v_iter = versions_.find(cfd->GetID()); + if (v_iter != versions_.end()) { + assert(v_iter->second != nullptr); + + version_set_->AppendVersion(cfd, v_iter->second); + versions_.erase(v_iter); + } + } + } else { + for (const auto& elem : versions_) { + delete elem.second; + } + versions_.clear(); + } +} + +ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup( + const VersionEdit& edit) { + ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit); + auto v_iter = versions_.find(edit.column_family_); + if (v_iter != versions_.end()) { + delete v_iter->second; + versions_.erase(v_iter); + } + return cfd; +} + +Status VersionEditHandlerPointInTime::MaybeCreateVersion( + const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) { + assert(cfd != nullptr); + if (!force_create_version) { + assert(edit.column_family_ == cfd->GetID()); 
+ } + auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID()); + assert(missing_files_iter != cf_to_missing_files_.end()); + std::unordered_set& missing_files = missing_files_iter->second; + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(cfd->GetID()); + assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end()); + const uint64_t prev_missing_blob_file_high = + missing_blob_files_high_iter->second; + + VersionBuilder* builder = nullptr; + + if (prev_missing_blob_file_high != kInvalidBlobFileNumber) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + } + + // At this point, we have not yet applied the new version edits read from the + // MANIFEST. We check whether we have any missing table and blob files. + const bool prev_has_missing_files = + !missing_files.empty() || + (prev_missing_blob_file_high != kInvalidBlobFileNumber && + prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber()); + + for (const auto& file : edit.GetDeletedFiles()) { + uint64_t file_num = file.second; + auto fiter = missing_files.find(file_num); + if (fiter != missing_files.end()) { + missing_files.erase(fiter); + } + } + + assert(!cfd->ioptions()->cf_paths.empty()); + Status s; + for (const auto& elem : edit.GetNewFiles()) { + const FileMetaData& meta = elem.second; + const FileDescriptor& fd = meta.fd; + uint64_t file_num = fd.GetNumber(); + const std::string fpath = + MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); + s = VerifyFile(fpath, meta); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + missing_files.insert(file_num); + s = Status::OK(); + } else if (!s.ok()) { + break; + } + } + + uint64_t missing_blob_file_num = prev_missing_blob_file_high; + for (const auto& elem : edit.GetBlobFileAdditions()) { + uint64_t file_num = elem.GetBlobFileNumber(); + s = 
VerifyBlobFile(cfd, file_num, elem); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + missing_blob_file_num = std::max(missing_blob_file_num, file_num); + s = Status::OK(); + } else if (!s.ok()) { + break; + } + } + + bool has_missing_blob_files = false; + if (missing_blob_file_num != kInvalidBlobFileNumber && + missing_blob_file_num >= prev_missing_blob_file_high) { + missing_blob_files_high_iter->second = missing_blob_file_num; + has_missing_blob_files = true; + } else if (missing_blob_file_num < prev_missing_blob_file_high) { + assert(false); + } + + // We still have not applied the new version edit, but have tried to add new + // table and blob files after verifying their presence and consistency. + // Therefore, we know whether we will see new missing table and blob files + // later after actually applying the version edit. We perform the check here + // and record the result. + const bool has_missing_files = + !missing_files.empty() || has_missing_blob_files; + + bool missing_info = !version_edit_params_.has_log_number_ || + !version_edit_params_.has_next_file_number_ || + !version_edit_params_.has_last_sequence_; + + // Create version before apply edit. The version will represent the state + // before applying the version edit. + // A new version will created if: + // 1) no error has occurred so far, and + // 2) log_number_, next_file_number_ and last_sequence_ are known, and + // 3) any of the following: + // a) no missing file before, but will have missing file(s) after applying + // this version edit. + // b) no missing file after applying the version edit, and the caller + // explicitly request that a new version be created. 
+ if (s.ok() && !missing_info && + ((has_missing_files && !prev_has_missing_files) || + (!has_missing_files && force_create_version))) { + if (!builder) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder); + } + + auto* version = new Version(cfd, version_set_, version_set_->file_options_, + *cfd->GetLatestMutableCFOptions(), io_tracer_, + version_set_->current_version_number_++); + s = builder->SaveTo(version->storage_info()); + if (s.ok()) { + version->PrepareApply( + *cfd->GetLatestMutableCFOptions(), + !version_set_->db_options_->skip_stats_update_on_db_open); + auto v_iter = versions_.find(cfd->GetID()); + if (v_iter != versions_.end()) { + delete v_iter->second; + v_iter->second = version; + } else { + versions_.emplace(cfd->GetID(), version); + } + } else { + delete version; + } + } + return s; +} + +Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + return version_set_->VerifyFileMetadata(fpath, fmeta); +} + +Status VersionEditHandlerPointInTime::VerifyBlobFile( + ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition) { + BlobFileCache* blob_file_cache = cfd->blob_file_cache(); + assert(blob_file_cache); + CacheHandleGuard blob_file_reader; + Status s = + blob_file_cache->GetBlobFileReader(blob_file_num, &blob_file_reader); + if (!s.ok()) { + return s; + } + // TODO: verify checksum + (void)blob_addition; + return s; +} + +Status ManifestTailer::Initialize() { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::Initialize(); + } + assert(Mode::kCatchUp == mode_); + Status s; + if (!initialized_) { + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* default_cfd = cfd_set->GetDefault(); + assert(default_cfd); + auto builder_iter = builders_.find(default_cfd->GetID()); + assert(builder_iter != 
builders_.end()); + + Version* dummy_version = default_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(default_cfd, base_version)); + builder_iter->second = std::move(new_builder); + + initialized_ = true; + } + return s; +} + +Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) { + Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd); + if (s.ok()) { + assert(cfd); + if (*cfd) { + cfds_changed_.insert(*cfd); + } + } + return s; +} + +Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit, + ColumnFamilyData** cfd) { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::OnColumnFamilyAdd(edit, cfd); + } + assert(Mode::kCatchUp == mode_); + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily()); + assert(cfd); + *cfd = tmp_cfd; + if (!tmp_cfd) { + // For now, ignore new column families created after Recover() succeeds. 
+ return Status::OK(); + } + auto builder_iter = builders_.find(edit.GetColumnFamily()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = tmp_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(tmp_cfd, base_version)); + builder_iter->second = std::move(new_builder); + +#ifndef NDEBUG + auto version_iter = versions_.find(edit.GetColumnFamily()); + assert(version_iter == versions_.end()); +#endif // !NDEBUG + return Status::OK(); +} + +void ManifestTailer::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandlerPointInTime::CheckIterationResult(reader, s); + assert(s); + if (s->ok()) { + if (Mode::kRecovery == mode_) { + mode_ = Mode::kCatchUp; + } else { + assert(Mode::kCatchUp == mode_); + } + } +} + +Status ManifestTailer::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta); + // TODO: Open file or create hard link to prevent the file from being + // deleted. 
+ return s; +} + +void DumpManifestHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandler::CheckIterationResult(reader, s); + if (!s->ok()) { + fprintf(stdout, "%s\n", s->ToString().c_str()); + return; + } + assert(cf_to_cmp_names_); + for (auto* cfd : *(version_set_->column_family_set_)) { + fprintf(stdout, + "--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber()); + auto it = cf_to_cmp_names_->find(cfd->GetID()); + if (it != cf_to_cmp_names_->end()) { + fprintf(stdout, + "comparator: <%s>, but the comparator object is not available.\n", + it->second.c_str()); + } else { + fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name()); + } + assert(cfd->current()); + + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char), + cfd->current()->DebugString(hex_).size(), stdout); + } + fprintf(stdout, + "next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep %" PRIu64 "\n", + version_set_->current_next_file_number(), + version_set_->LastSequence(), version_set_->prev_log_number(), + version_set_->column_family_set_->GetMaxColumnFamily(), + version_set_->min_log_number_to_keep()); +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_handler.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_handler.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,309 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/version_builder.h" +#include "db/version_edit.h" +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +struct FileMetaData; + +class VersionEditHandlerBase { + public: + explicit VersionEditHandlerBase() + : max_manifest_read_size_(std::numeric_limits::max()) {} + + virtual ~VersionEditHandlerBase() {} + + void Iterate(log::Reader& reader, Status* log_read_status); + + const Status& status() const { return status_; } + + AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; } + + protected: + explicit VersionEditHandlerBase(uint64_t max_read_size) + : max_manifest_read_size_(max_read_size) {} + virtual Status Initialize() { return Status::OK(); } + + virtual Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) = 0; + + virtual void CheckIterationResult(const log::Reader& /*reader*/, + Status* /*s*/) {} + + void ClearReadBuffer() { read_buffer_.Clear(); } + + Status status_; + + private: + AtomicGroupReadBuffer read_buffer_; + const uint64_t max_manifest_read_size_; +}; + +class ListColumnFamiliesHandler : public VersionEditHandlerBase { + public: + ListColumnFamiliesHandler() : VersionEditHandlerBase() {} + + ~ListColumnFamiliesHandler() override {} + + const std::map GetColumnFamilyNames() const { + return column_family_names_; + } + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + // default column family is always implicitly there + std::map column_family_names_{ + {0, kDefaultColumnFamilyName}}; +}; + +class 
FileChecksumRetriever : public VersionEditHandlerBase { + public: + FileChecksumRetriever(uint64_t max_read_size, + FileChecksumList& file_checksum_list) + : VersionEditHandlerBase(max_read_size), + file_checksum_list_(file_checksum_list) {} + + ~FileChecksumRetriever() override {} + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + FileChecksumList& file_checksum_list_; +}; + +using VersionBuilderUPtr = std::unique_ptr; + +// A class used for scanning MANIFEST file. +// VersionEditHandler reads a MANIFEST file, parses the version edits, and +// builds the version set's in-memory state, e.g. the version storage info for +// the versions of column families. +// To use this class and its subclasses, +// 1. Create an object of VersionEditHandler or its subclasses. +// VersionEditHandler handler(read_only, column_families, version_set, +// track_missing_files, +// no_error_if_files_missing); +// 2. Status s = handler.Iterate(reader, &db_id); +// 3. Check s and handle possible errors. +// +// Not thread-safe, external synchronization is necessary if an object of +// VersionEditHandler is shared by multiple threads. 
+class VersionEditHandler : public VersionEditHandlerBase { + public: + explicit VersionEditHandler( + bool read_only, + const std::vector& column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer) + : VersionEditHandler(read_only, column_families, version_set, + track_missing_files, no_error_if_files_missing, + io_tracer, /*skip_load_table_files=*/false) {} + + ~VersionEditHandler() override {} + + const VersionEditParams& GetVersionEditParams() const { + return version_edit_params_; + } + + bool HasMissingFiles() const; + + void GetDbId(std::string* db_id) const { + if (db_id && version_edit_params_.has_db_id_) { + *db_id = version_edit_params_.db_id_; + } + } + + protected: + explicit VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer, bool skip_load_table_files); + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd); + + Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd); + + Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd); + + Status OnWalAddition(VersionEdit& edit); + + Status OnWalDeletion(VersionEdit& edit); + + Status Initialize() override; + + void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found, + bool* cf_in_builders) const; + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options, + const VersionEdit& edit); + + virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit); + + virtual Status MaybeCreateVersion(const VersionEdit& edit, + ColumnFamilyData* cfd, + bool force_create_version); + + Status LoadTables(ColumnFamilyData* cfd, + bool 
prefetch_index_and_filter_in_cache, + bool is_initial_load); + + virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } + + const bool read_only_; + std::vector column_families_; + VersionSet* version_set_; + std::unordered_map builders_; + std::unordered_map name_to_options_; + // Keeps track of column families in manifest that were not found in + // column families parameters. if those column families are not dropped + // by subsequent manifest records, Recover() will return failure status. + std::unordered_map column_families_not_found_; + VersionEditParams version_edit_params_; + const bool track_missing_files_; + std::unordered_map> + cf_to_missing_files_; + std::unordered_map cf_to_missing_blob_files_high_; + bool no_error_if_files_missing_; + std::shared_ptr io_tracer_; + bool skip_load_table_files_; + bool initialized_; + std::unique_ptr> cf_to_cmp_names_; + + private: + Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& edit); +}; + +// A class similar to its base class, i.e. VersionEditHandler. +// VersionEditHandlerPointInTime restores the versions to the most recent point +// in time such that at this point, the version does not have missing files. +// +// Not thread-safe, external synchronization is necessary if an object of +// VersionEditHandlerPointInTime is shared by multiple threads. 
+class VersionEditHandlerPointInTime : public VersionEditHandler { + public: + VersionEditHandlerPointInTime( + bool read_only, std::vector column_families, + VersionSet* version_set, const std::shared_ptr& io_tracer); + ~VersionEditHandlerPointInTime() override; + + protected: + void CheckIterationResult(const log::Reader& reader, Status* s) override; + ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; + Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, + bool force_create_version) override; + virtual Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta); + virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition); + + std::unordered_map versions_; +}; + +class ManifestTailer : public VersionEditHandlerPointInTime { + public: + explicit ManifestTailer(std::vector column_families, + VersionSet* version_set, + const std::shared_ptr& io_tracer) + : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, + version_set, io_tracer), + mode_(Mode::kRecovery) {} + + void PrepareToReadNewManifest() { + initialized_ = false; + ClearReadBuffer(); + } + + std::unordered_set& GetUpdatedColumnFamilies() { + return cfds_changed_; + } + + protected: + Status Initialize() override; + + bool MustOpenAllColumnFamilies() const override { return false; } + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override; + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) override; + + enum Mode : uint8_t { + kRecovery = 0, + kCatchUp = 1, + }; + + Mode mode_; + std::unordered_set cfds_changed_; +}; + +class DumpManifestHandler : public VersionEditHandler { + public: + DumpManifestHandler(std::vector column_families, + VersionSet* version_set, + const 
std::shared_ptr& io_tracer, bool verbose, + bool hex, bool json) + : VersionEditHandler( + /*read_only=*/true, column_families, version_set, + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer, + /*skip_load_table_files=*/true), + verbose_(verbose), + hex_(hex), + json_(json), + count_(0) { + cf_to_cmp_names_.reset(new std::unordered_map()); + } + + ~DumpManifestHandler() override {} + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override { + // Write out each individual edit + if (verbose_ && !json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } else if (json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } + ++count_; + return VersionEditHandler::ApplyVersionEdit(edit, cfd); + } + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + private: + const bool verbose_; + const bool hex_; + const bool json_; + int count_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_edit_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_edit_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,9 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_edit.h" + +#include "rocksdb/advanced_options.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -36,8 +40,9 @@ edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), - kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber, - 888, 678, "234", "crc32c"); + kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, + kInvalidBlobFileNumber, 888, 678, "234", "crc32c", "123", + "345"); edit.DeleteFile(4, kBig + 700 + i); } @@ -54,23 +59,27 @@ VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true, kInvalidBlobFileNumber, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", + "234"); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false, kInvalidBlobFileNumber, + kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "345", + "543"); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, - kBig + 602, true, kInvalidBlobFileNumber, 666, 888, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, + 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + "456", "567"); edit.AddFile(5, 303, 0, 
100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, - kBig + 603, true, 1001, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + kBig + 603, true, Temperature::kUnknown, 1001, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "678", + "789"); ; edit.DeleteFile(4, 700); @@ -102,6 +111,14 @@ ASSERT_EQ(kInvalidBlobFileNumber, new_files[2].second.oldest_blob_file_number); ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number); + ASSERT_EQ("123", new_files[0].second.min_timestamp); + ASSERT_EQ("234", new_files[0].second.max_timestamp); + ASSERT_EQ("345", new_files[1].second.min_timestamp); + ASSERT_EQ("543", new_files[1].second.max_timestamp); + ASSERT_EQ("456", new_files[2].second.min_timestamp); + ASSERT_EQ("567", new_files[2].second.max_timestamp); + ASSERT_EQ("678", new_files[3].second.min_timestamp); + ASSERT_EQ("789", new_files[3].second.max_timestamp); } TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { @@ -109,13 +126,15 @@ VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true, kInvalidBlobFileNumber, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", + "234"); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false, kInvalidBlobFileNumber, 686, 868, "234", - "crc32c"); + kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, + 686, 868, "234", "crc32c", kDisableUserTimestamp, + kDisableUserTimestamp); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -154,6 
+173,10 @@ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); ASSERT_EQ(1u, parsed.GetDeletedFiles().size()); + ASSERT_EQ("123", new_files[0].second.min_timestamp); + ASSERT_EQ("234", new_files[0].second.max_timestamp); + ASSERT_EQ(kDisableUserTimestamp, new_files[1].second.min_timestamp); + ASSERT_EQ(kDisableUserTimestamp, new_files[1].second.max_timestamp); } TEST_F(VersionEditTest, NewFile4NotSupportedField) { @@ -161,9 +184,10 @@ VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true, kInvalidBlobFileNumber, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -191,9 +215,10 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { VersionEdit edit; edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } @@ -278,6 +303,314 @@ TestEncodeDecode(edit); } +TEST_F(VersionEditTest, BlobFileAdditionAndGarbage) { + VersionEdit edit; + + const std::string checksum_method_prefix = "Hash"; + const std::string checksum_value_prefix = "Value"; + + for (uint64_t blob_file_number = 1; blob_file_number <= 10; + ++blob_file_number) { + const uint64_t total_blob_count = blob_file_number << 10; + const uint64_t total_blob_bytes = 
blob_file_number << 20; + + std::string checksum_method(checksum_method_prefix); + AppendNumberTo(&checksum_method, blob_file_number); + + std::string checksum_value(checksum_value_prefix); + AppendNumberTo(&checksum_value, blob_file_number); + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + const uint64_t garbage_blob_count = total_blob_count >> 2; + const uint64_t garbage_blob_bytes = total_blob_bytes >> 1; + + edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count, + garbage_blob_bytes); + } + + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, AddWalEncodeDecode) { + VersionEdit edit; + for (uint64_t log_number = 1; log_number <= 20; log_number++) { + WalMetadata meta; + bool has_size = rand() % 2 == 0; + if (has_size) { + meta.SetSyncedSizeInBytes(rand() % 1000); + } + edit.AddWal(log_number, meta); + } + TestEncodeDecode(edit); +} + +static std::string PrefixEncodedWalAdditionWithLength( + const std::string& encoded) { + std::string ret; + PutVarint32(&ret, Tag::kWalAddition2); + PutLengthPrefixedSlice(&ret, encoded); + return ret; +} + +TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { + std::string encoded; + + { + // No log number. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != + std::string::npos) + << s.ToString(); + } + + { + // log number should be varint64, + // but we only encode 128 which is not a valid representation of varint64. 
+ char c = 0; + unsigned char* ptr = reinterpret_cast(&c); + *ptr = 128; + encoded.append(1, c); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionEditTest, AddWalDecodeBadTag) { + constexpr WalNumber kLogNumber = 100; + constexpr uint64_t kSizeInBytes = 100; + + std::string encoded; + PutVarint64(&encoded, kLogNumber); + + { + // No tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) + << s.ToString(); + } + + { + // Only has size tag, no terminate tag. + std::string encoded_with_size = encoded; + PutVarint32(&encoded_with_size, + static_cast(WalAdditionTag::kSyncedSize)); + PutVarint64(&encoded_with_size, kSizeInBytes); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_size); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) + << s.ToString(); + } + + { + // Only has terminate tag. 
+ std::string encoded_with_terminate = encoded; + PutVarint32(&encoded_with_terminate, + static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_terminate); + VersionEdit edit; + ASSERT_OK(edit.DecodeFrom(encoded_edit)); + auto& wal_addition = edit.GetWalAdditions()[0]; + ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber); + ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize()); + } +} + +TEST_F(VersionEditTest, AddWalDecodeNoSize) { + constexpr WalNumber kLogNumber = 100; + + std::string encoded; + PutVarint64(&encoded, kLogNumber); + PutVarint32(&encoded, static_cast(WalAdditionTag::kSyncedSize)); + // No real size after the size tag. + + { + // Without terminate tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") != + std::string::npos) + << s.ToString(); + } + + { + // With terminate tag. + PutVarint32(&encoded, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); + VersionEdit edit; + Status s = edit.DecodeFrom(encoded_edit); + ASSERT_TRUE(s.IsCorruption()); + // The terminate tag is misunderstood as the size. 
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionEditTest, AddWalDebug) { + constexpr int n = 2; + constexpr std::array kLogNumbers{{10, 20}}; + constexpr std::array kSizeInBytes{{100, 200}}; + + VersionEdit edit; + for (int i = 0; i < n; i++) { + edit.AddWal(kLogNumbers[i], WalMetadata(kSizeInBytes[i])); + } + + const WalAdditions& wals = edit.GetWalAdditions(); + + ASSERT_TRUE(edit.IsWalAddition()); + ASSERT_EQ(wals.size(), n); + for (int i = 0; i < n; i++) { + const WalAddition& wal = wals[i]; + ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]); + ASSERT_EQ(wal.GetMetadata().GetSyncedSizeInBytes(), kSizeInBytes[i]); + } + + std::string expected_str = "VersionEdit {\n"; + for (int i = 0; i < n; i++) { + std::stringstream ss; + ss << " WalAddition: log_number: " << kLogNumbers[i] + << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n"; + expected_str += ss.str(); + } + expected_str += " ColumnFamily: 0\n}\n"; + ASSERT_EQ(edit.DebugString(true), expected_str); + + std::string expected_json = "{\"EditNumber\": 4, \"WalAdditions\": ["; + for (int i = 0; i < n; i++) { + std::stringstream ss; + ss << "{\"LogNumber\": " << kLogNumbers[i] << ", " + << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}"; + if (i < n - 1) ss << ", "; + expected_json += ss.str(); + } + expected_json += "], \"ColumnFamily\": 0}"; + ASSERT_EQ(edit.DebugJSON(4, true), expected_json); +} + +TEST_F(VersionEditTest, DeleteWalEncodeDecode) { + VersionEdit edit; + edit.DeleteWalsBefore(rand() % 100); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, DeleteWalDebug) { + constexpr int n = 2; + constexpr std::array kLogNumbers{{10, 20}}; + + VersionEdit edit; + edit.DeleteWalsBefore(kLogNumbers[n - 1]); + + const WalDeletion& wal = edit.GetWalDeletion(); + + ASSERT_TRUE(edit.IsWalDeletion()); + ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[n - 1]); + + std::string expected_str = "VersionEdit {\n"; + { + std::stringstream ss; + ss << 
" WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n"; + expected_str += ss.str(); + } + expected_str += " ColumnFamily: 0\n}\n"; + ASSERT_EQ(edit.DebugString(true), expected_str); + + std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": "; + { + std::stringstream ss; + ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}"; + expected_json += ss.str(); + } + expected_json += ", \"ColumnFamily\": 0}"; + ASSERT_EQ(edit.DebugJSON(4, true), expected_json); +} + +TEST_F(VersionEditTest, FullHistoryTsLow) { + VersionEdit edit; + ASSERT_FALSE(edit.HasFullHistoryTsLow()); + std::string ts = test::EncodeInt(0); + edit.SetFullHistoryTsLow(ts); + TestEncodeDecode(edit); +} + +// Tests that if RocksDB is downgraded, the new types of VersionEdits +// that have a tag larger than kTagSafeIgnoreMask can be safely ignored. +TEST_F(VersionEditTest, IgnorableTags) { + SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) { + bool* ignore = static_cast(arg); + *ignore = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t kPrevLogNumber = 100; + constexpr uint64_t kLogNumber = 200; + constexpr uint64_t kNextFileNumber = 300; + constexpr uint64_t kColumnFamilyId = 400; + + VersionEdit edit; + // Add some ignorable entries. + for (int i = 0; i < 2; i++) { + edit.AddWal(i + 1, WalMetadata(i + 2)); + } + edit.SetDBId("db_id"); + // Add unignorable entries. + edit.SetPrevLogNumber(kPrevLogNumber); + edit.SetLogNumber(kLogNumber); + // Add more ignorable entries. + edit.DeleteWalsBefore(100); + // Add unignorable entry. + edit.SetNextFile(kNextFileNumber); + // Add more ignorable entries. + edit.SetFullHistoryTsLow("ts"); + // Add unignorable entry. + edit.SetColumnFamily(kColumnFamilyId); + + std::string encoded; + ASSERT_TRUE(edit.EncodeTo(&encoded)); + + VersionEdit decoded; + ASSERT_OK(decoded.DecodeFrom(encoded)); + + // Check that all ignorable entries are ignored. 
+ ASSERT_FALSE(decoded.HasDbId()); + ASSERT_FALSE(decoded.HasFullHistoryTsLow()); + ASSERT_FALSE(decoded.IsWalAddition()); + ASSERT_FALSE(decoded.IsWalDeletion()); + ASSERT_TRUE(decoded.GetWalAdditions().empty()); + ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty()); + + // Check that unignorable entries are still present. + ASSERT_EQ(edit.GetPrevLogNumber(), kPrevLogNumber); + ASSERT_EQ(edit.GetLogNumber(), kLogNumber); + ASSERT_EQ(edit.GetNextFile(), kNextFileNumber); + ASSERT_EQ(edit.GetColumnFamily(), kColumnFamilyId); + + SyncPoint::GetInstance()->DisableProcessing(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,17 +9,24 @@ #include "db/version_set.h" -#include #include #include #include +#include #include #include #include #include #include #include -#include "compaction/compaction.h" + +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_index.h" +#include "db/blob/blob_log_format.h" +#include "db/compaction/compaction.h" +#include "db/compaction/file_pri.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -29,13 +36,16 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "db/version_edit_handler.h" #include "file/filename.h" #include "file/random_access_file_reader.h" #include "file/read_write_util.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/persistent_stats_history.h" +#include 
"options/options_helper.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -49,6 +59,7 @@ #include "table/table_reader.h" #include "table/two_level_iterator.h" #include "test_util/sync_point.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -88,9 +99,9 @@ *overlap = false; if (iter->Valid()) { ParsedInternalKey seek_result; - if (!ParseInternalKey(iter->key(), &seek_result)) { - return Status::Corruption("DB have corrupted keys"); - } + Status s = ParseInternalKey(iter->key(), &seek_result, + false /* log_err_key */); // TODO + if (!s.ok()) return s; if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= 0) { @@ -109,10 +120,9 @@ // are MergeInProgress). class FilePicker { public: - FilePicker(std::vector* files, const Slice& user_key, - const Slice& ikey, autovector* file_levels, - unsigned int num_levels, FileIndexer* file_indexer, - const Comparator* user_comparator, + FilePicker(const Slice& user_key, const Slice& ikey, + autovector* file_levels, unsigned int num_levels, + FileIndexer* file_indexer, const Comparator* user_comparator, const InternalKeyComparator* internal_comparator) : num_levels_(num_levels), curr_level_(static_cast(-1)), @@ -120,9 +130,6 @@ hit_file_level_(static_cast(-1)), search_left_bound_(0), search_right_bound_(FileIndexer::kLevelMaxIndex), -#ifndef NDEBUG - files_(files), -#endif level_files_brief_(file_levels), is_hit_file_last_in_level_(false), curr_file_level_(nullptr), @@ -131,9 +138,6 @@ file_indexer_(file_indexer), user_comparator_(user_comparator), internal_comparator_(internal_comparator) { -#ifdef NDEBUG - (void)files; -#endif // Setup member variables to search first level. 
search_ended_ = !PrepareNextLevel(); if (!search_ended_) { @@ -203,23 +207,7 @@ } } } -#ifndef NDEBUG - // Sanity check to make sure that the files are correctly sorted - if (prev_file_) { - if (curr_level_ != 0) { - int comp_sign = internal_comparator_->Compare( - prev_file_->largest_key, f->smallest_key); - assert(comp_sign < 0); - } else { - // level == 0, the current file cannot be newer than the previous - // one. Use compressed data structure, has no attribute seqNo - assert(curr_index_in_curr_level_ > 0); - assert(!NewestFirstBySeqNo(files_[0][curr_index_in_curr_level_], - files_[0][curr_index_in_curr_level_-1])); - } - } - prev_file_ = f; -#endif + returned_file_level_ = curr_level_; if (curr_level_ > 0 && cmp_largest < 0) { // No more files to search in this level. @@ -251,9 +239,6 @@ unsigned int hit_file_level_; int32_t search_left_bound_; int32_t search_right_bound_; -#ifndef NDEBUG - std::vector* files_; -#endif autovector* level_files_brief_; bool search_ended_; bool is_hit_file_last_in_level_; @@ -265,9 +250,6 @@ FileIndexer* file_indexer_; const Comparator* user_comparator_; const InternalKeyComparator* internal_comparator_; -#ifndef NDEBUG - FdWithKeyRange* prev_file_; -#endif // Setup local variables to search next level. // Returns false if there are no more levels to search. @@ -337,9 +319,7 @@ } start_index_in_curr_level_ = start_index; curr_index_in_curr_level_ = start_index; -#ifndef NDEBUG - prev_file_ = nullptr; -#endif + return true; } // curr_level_ = num_levels_. So, no more levels to search. 
@@ -364,6 +344,7 @@ range_(range), batch_iter_(range->begin()), batch_iter_prev_(range->begin()), + upper_key_(range->begin()), maybe_repeat_key_(false), current_level_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()), @@ -400,7 +381,7 @@ int GetCurrentLevel() const { return curr_level_; } // Iterates through files in the current level until it finds a file that - // contains atleast one key from the MultiGet batch + // contains at least one key from the MultiGet batch bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range, size_t* file_index, FdWithKeyRange** fd, bool* is_last_key_in_file) { @@ -432,7 +413,7 @@ !file_hit)) { struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level]; - Slice& user_key = batch_iter_->ukey; + Slice& user_key = batch_iter_->ukey_without_ts; // Do key range filtering of files or/and fractional cascading if: // (1) not all the files are in level 0, or @@ -446,17 +427,17 @@ // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->smallest_key), true); + assert(curr_level_ == 0 || fp_ctx.curr_index_in_curr_level == fp_ctx.start_index_in_curr_level || - user_comparator_->Compare(user_key, - ExtractUserKey(f->smallest_key)) <= 0); + cmp_smallest <= 0); - int cmp_smallest = user_comparator_->Compare( - user_key, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare( - user_key, ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->largest_key), true); } else { cmp_largest = -1; } @@ -480,9 +461,20 @@ } if (cmp_largest == 0) { // cmp_largest is 0, which means the next key will not be in this - // file, so stop looking further. Also don't increment megt_iter_ - // as we may have to look for this key in the next file if we don't - // find it in this one + // file, so stop looking further. However, its possible there are + // duplicates in the batch, so find the upper bound for the batch + // in this file (upper_key_) by skipping past the duplicates. 
We + // leave batch_iter_ as is since we may have to pick up from there + // for the next file, if this file has a merge value rather than + // final value + upper_key_ = batch_iter_; + ++upper_key_; + while (upper_key_ != current_level_range_.end() && + user_comparator_->CompareWithoutTimestamp( + batch_iter_->ukey_without_ts, false, + upper_key_->ukey_without_ts, false) == 0) { + ++upper_key_; + } break; } else { if (curr_level_ == 0) { @@ -502,6 +494,12 @@ *fd = f; *file_index = curr_file_index; *is_last_key_in_file = cmp_largest == 0; + if (!*is_last_key_in_file) { + // If the largest key in the batch overlapping the file is not the + // largest key in the file, upper_ley_ would not have been updated so + // update it here + upper_key_ = batch_iter_; + } return file_hit; } @@ -523,7 +521,7 @@ // file regardless for all keys not found yet if (current_level_range_.CheckKeyDone(batch_iter_) || curr_level_ == 0) { - ++batch_iter_; + batch_iter_ = upper_key_; } } // batch_iter_prev_ will become the start key for the next file @@ -543,18 +541,20 @@ &is_last_key_in_file)) { search_ended_ = !PrepareNextLevel(); } else { - MultiGetRange::Iterator upper_key = batch_iter_; if (is_last_key_in_file) { // Since cmp_largest is 0, batch_iter_ still points to the last key // that falls in this file, instead of the next one. 
Increment - // upper_key so we can set the range properly for SST MultiGet - ++upper_key; - ++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level); + // the file index for all keys between batch_iter_ and upper_key_ + auto tmp_iter = batch_iter_; + while (tmp_iter != upper_key_) { + ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level); + ++tmp_iter; + } maybe_repeat_key_ = true; } // Set the range for this file current_file_range_ = - MultiGetRange(next_file_range, batch_iter_prev_, upper_key); + MultiGetRange(next_file_range, batch_iter_prev_, upper_key_); returned_file_level_ = curr_level_; hit_file_level_ = curr_level_; is_hit_file_last_in_level_ = @@ -606,6 +606,7 @@ // key found in the previous SST file, in order to serve as the start of // the batch key range for the next SST file MultiGetRange::Iterator batch_iter_prev_; + MultiGetRange::Iterator upper_key_; bool maybe_repeat_key_; MultiGetRange current_level_range_; MultiGetRange current_file_range_; @@ -625,7 +626,7 @@ if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < curr_file_level_->num_files) { batch_iter_prev_ = current_level_range_.begin(); - batch_iter_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); return true; } } @@ -720,7 +721,7 @@ } if (level_contains_keys) { batch_iter_prev_ = current_level_range_.begin(); - batch_iter_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); return true; } curr_level_++; @@ -852,15 +853,18 @@ class LevelIterator final : public InternalIterator { public: + // @param read_options Must outlive this iterator. 
LevelIterator(TableCache* table_cache, const ReadOptions& read_options, const FileOptions& file_options, const InternalKeyComparator& icomparator, const LevelFilesBrief* flevel, - const SliceTransform* prefix_extractor, bool should_sample, - HistogramImpl* file_read_hist, TableReaderCaller caller, - bool skip_filters, int level, RangeDelAggregator* range_del_agg, + const std::shared_ptr& prefix_extractor, + bool should_sample, HistogramImpl* file_read_hist, + TableReaderCaller caller, bool skip_filters, int level, + RangeDelAggregator* range_del_agg, const std::vector* - compaction_boundaries = nullptr) + compaction_boundaries = nullptr, + bool allow_unprepared_value = false) : table_cache_(table_cache), read_options_(read_options), file_options_(file_options), @@ -872,11 +876,13 @@ should_sample_(should_sample), caller_(caller), skip_filters_(skip_filters), + allow_unprepared_value_(allow_unprepared_value), file_index_(flevel_->num_files), level_(level), range_del_agg_(range_del_agg), pinned_iters_mgr_(nullptr), - compaction_boundaries_(compaction_boundaries) { + compaction_boundaries_(compaction_boundaries), + is_next_read_sequential_(false) { // Empty level is not supported. assert(flevel_ != nullptr && flevel_->num_files > 0); } @@ -906,14 +912,21 @@ return file_iter_.iter() ? 
file_iter_.status() : Status::OK(); } + bool PrepareValue() override { + return file_iter_.PrepareValue(); + } + inline bool MayBeOutOfLowerBound() override { assert(Valid()); return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); } - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return file_iter_.MayBeOutOfUpperBound(); + inline IterBoundCheck UpperBoundCheckResult() override { + if (Valid()) { + return file_iter_.UpperBoundCheckResult(); + } else { + return IterBoundCheck::kUnknown; + } } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { @@ -940,13 +953,6 @@ void SetFileIterator(InternalIterator* iter); void InitFileIterator(size_t new_file_index); - // Called by both of Next() and NextAndGetResult(). Force inline. - void NextImpl() { - assert(Valid()); - file_iter_.Next(); - SkipEmptyFileForward(); - } - const Slice& file_smallest_key(size_t file_index) { assert(file_index < flevel_->num_files); return flevel_->files[file_index].smallest_key; @@ -955,8 +961,8 @@ bool KeyReachedUpperBound(const Slice& internal_key) { return read_options_.iterate_upper_bound != nullptr && user_comparator_.CompareWithoutTimestamp( - ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + ExtractUserKey(internal_key), /*a_has_ts=*/true, + *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0; } InternalIterator* NewFileIterator() { @@ -977,8 +983,9 @@ read_options_, file_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, nullptr /* don't need reference to table */, file_read_hist_, caller_, - /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key, - largest_compaction_key); + /*arena=*/nullptr, skip_filters_, level_, + /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, + largest_compaction_key, allow_unprepared_value_); } // Check if current file being fully within iterate_lower_bound. 
@@ -989,14 +996,14 @@ if (read_options_.iterate_lower_bound != nullptr && file_index_ < flevel_->num_files) { may_be_out_of_lower_bound_ = - user_comparator_.Compare( - ExtractUserKey(file_smallest_key(file_index_)), - *read_options_.iterate_lower_bound) < 0; + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true, + *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0; } } TableCache* table_cache_; - const ReadOptions read_options_; + const ReadOptions& read_options_; const FileOptions& file_options_; const InternalKeyComparator& icomparator_; const UserComparatorWrapper user_comparator_; @@ -1005,12 +1012,13 @@ // `prefix_extractor_` may be non-null even for total order seek. Checking // this variable is not the right way to identify whether prefix iterator // is used. - const SliceTransform* prefix_extractor_; + const std::shared_ptr& prefix_extractor_; HistogramImpl* file_read_hist_; bool should_sample_; TableReaderCaller caller_; bool skip_filters_; + bool allow_unprepared_value_; bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; @@ -1021,6 +1029,8 @@ // To be propagated to RangeDelAggregator in order to safely truncate range // tombstones. const std::vector* compaction_boundaries_; + + bool is_next_read_sequential_; }; void LevelIterator::Seek(const Slice& target) { @@ -1063,13 +1073,17 @@ // next key after the prefix, or make the iterator invalid. // A side benefit will be that it invalidates the iterator earlier so that // the upper level merging iterator can merge fewer child iterators. 
- Slice target_user_key = ExtractUserKey(target); - Slice file_user_key = ExtractUserKey(file_iter_.key()); - if (prefix_extractor_->InDomain(target_user_key) && - (!prefix_extractor_->InDomain(file_user_key) || - user_comparator_.Compare( - prefix_extractor_->Transform(target_user_key), - prefix_extractor_->Transform(file_user_key)) != 0)) { + size_t ts_sz = user_comparator_.timestamp_size(); + Slice target_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(target, ts_sz); + Slice file_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(file_iter_.key(), ts_sz); + if (prefix_extractor_->InDomain(target_user_key_without_ts) && + (!prefix_extractor_->InDomain(file_user_key_without_ts) || + user_comparator_.CompareWithoutTimestamp( + prefix_extractor_->Transform(target_user_key_without_ts), false, + prefix_extractor_->Transform(file_user_key_without_ts), + false) != 0)) { SetFileIterator(nullptr); } } @@ -1108,14 +1122,28 @@ CheckMayBeOutOfLowerBound(); } -void LevelIterator::Next() { NextImpl(); } +void LevelIterator::Next() { + assert(Valid()); + file_iter_.Next(); + SkipEmptyFileForward(); +} bool LevelIterator::NextAndGetResult(IterateResult* result) { - NextImpl(); - bool is_valid = Valid(); - if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + assert(Valid()); + bool is_valid = file_iter_.NextAndGetResult(result); + if (!is_valid) { + is_next_read_sequential_ = true; + SkipEmptyFileForward(); + is_next_read_sequential_ = false; + is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = file_iter_.UpperBoundCheckResult(); + // Ideally, we should return the real file_iter_.value_prepared but the + // information is not here. It would casue an extra PrepareValue() + // for the first key of a file. 
+ result->value_prepared = !allow_unprepared_value_; + } } return is_valid; } @@ -1130,7 +1158,8 @@ bool seen_empty_file = false; while (file_iter_.iter() == nullptr || (!file_iter_.Valid() && file_iter_.status().ok() && - !file_iter_.iter()->IsOutOfBound())) { + file_iter_.iter()->UpperBoundCheckResult() != + IterBoundCheck::kOutOfBound)) { seen_empty_file = true; // Move to next file if (file_index_ >= flevel_->num_files - 1) { @@ -1172,6 +1201,12 @@ } InternalIterator* old_iter = file_iter_.Set(iter); + + // Update the read pattern for PrefetchBuffer. + if (is_next_read_sequential_) { + file_iter_.UpdateReadaheadState(old_iter); + } + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { pinned_iters_mgr_->PinIterator(old_iter); } else { @@ -1202,28 +1237,6 @@ } } // anonymous namespace -// A wrapper of version builder which references the current version in -// constructor and unref it in the destructor. -// Both of the constructor and destructor need to be called inside DB Mutex. 
-class BaseReferencedVersionBuilder { - public: - explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) - : version_builder_(new VersionBuilder( - cfd->current()->version_set()->file_options(), cfd->table_cache(), - cfd->current()->storage_info(), cfd->ioptions()->info_log)), - version_(cfd->current()) { - version_->Ref(); - } - ~BaseReferencedVersionBuilder() { - version_->Unref(); - } - VersionBuilder* version_builder() { return version_builder_.get(); } - - private: - std::unique_ptr version_builder_; - Version* version_; -}; - Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) const { @@ -1231,7 +1244,7 @@ auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( file_options_, cfd_->internal_comparator(), file_meta->fd, tp, - mutable_cf_options_.prefix_extractor.get(), true /* no io */); + mutable_cf_options_.prefix_extractor, true /* no io */); if (s.ok()) { return s; } @@ -1259,24 +1272,23 @@ return s; } - TableProperties* raw_table_properties; - // By setting the magic number to kInvalidTableMagicNumber, we can by - // pass the magic number check in the footer. + // By setting the magic number to kNullTableMagicNumber, we can bypass + // the magic number check in the footer. 
std::unique_ptr file_reader( new RandomAccessFileReader( - std::move(file), file_name, nullptr /* env */, nullptr /* stats */, - 0 /* hist_type */, nullptr /* file_read_hist */, + std::move(file), file_name, nullptr /* env */, io_tracer_, + nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, nullptr /* rate_limiter */, ioptions->listeners)); + std::unique_ptr props; s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), - Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, - &raw_table_properties, false /* compression_type_missing */); + Footer::kNullTableMagicNumber /* table's magic number */, *ioptions, + &props); if (!s.ok()) { return s; } - RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); - - *tp = std::shared_ptr(raw_table_properties); + *tp = std::move(props); + RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); return s; } @@ -1425,7 +1437,7 @@ for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( file_options_, cfd_->internal_comparator(), file_level.files[i].fd, - mutable_cf_options_.prefix_extractor.get()); + mutable_cf_options_.prefix_extractor); } } return total_usage; @@ -1440,6 +1452,10 @@ cf_meta->file_count = 0; cf_meta->levels.clear(); + cf_meta->blob_file_size = 0; + cf_meta->blob_file_count = 0; + cf_meta->blob_files.clear(); + auto* ioptions = cfd_->ioptions(); auto* vstorage = storage_info(); @@ -1457,15 +1473,16 @@ file_path = ioptions->cf_paths.back().path; } const uint64_t file_number = file->fd.GetNumber(); - files.emplace_back(SstFileMetaData{ + files.emplace_back( MakeTableFileName("", file_number), file_number, file_path, static_cast(file->fd.GetFileSize()), file->fd.smallest_seqno, file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted, 
file->oldest_blob_file_number, - file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(), - file->file_checksum, file->file_checksum_func_name}); + file->being_compacted, file->temperature, + file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), + file->TryGetFileCreationTime(), file->file_checksum, + file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); @@ -1474,6 +1491,17 @@ level, level_size, std::move(files)); cf_meta->size += level_size; } + for (const auto& iter : vstorage->GetBlobFiles()) { + const auto meta = iter.second.get(); + cf_meta->blob_files.emplace_back( + meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()), + ioptions->cf_paths.front().path, meta->GetBlobFileSize(), + meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(), + meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(), + meta->GetChecksumMethod(), meta->GetChecksumValue()); + cf_meta->blob_file_count++; + cf_meta->blob_file_size += meta->GetBlobFileSize(); + } } uint64_t Version::GetSstFilesSize() { @@ -1554,12 +1582,13 @@ void Version::AddIterators(const ReadOptions& read_options, const FileOptions& soptions, MergeIteratorBuilder* merge_iter_builder, - RangeDelAggregator* range_del_agg) { + RangeDelAggregator* range_del_agg, + bool allow_unprepared_value) { assert(storage_info_.finalized_); for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level, - range_del_agg); + range_del_agg, allow_unprepared_value); } } @@ -1567,7 +1596,8 @@ const FileOptions& soptions, MergeIteratorBuilder* merge_iter_builder, int level, - RangeDelAggregator* range_del_agg) { + RangeDelAggregator* range_del_agg, + bool allow_unprepared_value) { assert(storage_info_.finalized_); if (level >= storage_info_.num_non_empty_levels()) { // This is an empty level @@ -1587,12 
+1617,12 @@ merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), *file.file_metadata, range_del_agg, - mutable_cf_options_.prefix_extractor.get(), nullptr, + mutable_cf_options_.prefix_extractor, nullptr, cfd_->internal_stats()->GetFileReadHist(0), TableReaderCaller::kUserIterator, arena, - /*skip_filters=*/false, /*level=*/0, + /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value)); } if (should_sample) { // Count ones for every L0 files. This is done per iterator creation @@ -1611,10 +1641,11 @@ merge_iter_builder->AddIterator(new (mem) LevelIterator( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), + mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - range_del_agg, /*largest_compaction_key=*/nullptr)); + range_del_agg, + /*compaction_boundaries=*/nullptr, allow_unprepared_value)); } } @@ -1645,12 +1676,13 @@ ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( read_options, file_options, cfd_->internal_comparator(), *file->file_metadata, &range_del_agg, - mutable_cf_options_.prefix_extractor.get(), nullptr, + mutable_cf_options_.prefix_extractor, nullptr, cfd_->internal_stats()->GetFileReadHist(0), TableReaderCaller::kUserIterator, &arena, - /*skip_filters=*/false, /*level=*/0, + /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr)); + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, 
iter.get(), overlap); if (!status.ok() || *overlap) { @@ -1662,7 +1694,7 @@ ScopedArenaIterator iter(new (mem) LevelIterator( cfd_->table_cache(), read_options, file_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), + mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, &range_del_agg)); @@ -1726,15 +1758,17 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, const FileOptions& file_opt, const MutableCFOptions mutable_cf_options, + const std::shared_ptr& io_tracer, uint64_t version_number) : env_(vset->env_), + clock_(vset->clock_), cfd_(column_family_data), - info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log), - db_statistics_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->statistics), + info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger), + db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats), table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), - merge_operator_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->merge_operator), + blob_file_cache_(cfd_ ? cfd_->blob_file_cache() : nullptr), + merge_operator_( + (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()), storage_info_( (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), (cfd_ == nullptr) ? 
nullptr : cfd_->user_comparator(), @@ -1751,12 +1785,188 @@ refs_(0), file_options_(file_opt), mutable_cf_options_(mutable_cf_options), - version_number_(version_number) {} + max_file_size_for_l0_meta_pin_( + MaxFileSizeForL0MetaPin(mutable_cf_options_)), + version_number_(version_number), + io_tracer_(io_tracer) {} + +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* value, uint64_t* bytes_read) const { + BlobIndex blob_index; + + { + Status s = blob_index.DecodeFrom(blob_index_slice); + if (!s.ok()) { + return s; + } + } + + return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value, + bytes_read); +} + +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, + PinnableSlice* value, uint64_t* bytes_read) const { + assert(value); + + if (read_options.read_tier == kBlockCacheTier) { + return Status::Incomplete("Cannot read blob: no disk I/O allowed"); + } + + if (blob_index.HasTTL() || blob_index.IsInlined()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + const auto& blob_files = storage_info_.GetBlobFiles(); + + const uint64_t blob_file_number = blob_index.file_number(); + + const auto it = blob_files.find(blob_file_number); + if (it == blob_files.end()) { + return Status::Corruption("Invalid blob file number"); + } + + CacheHandleGuard blob_file_reader; + + { + assert(blob_file_cache_); + const Status s = blob_file_cache_->GetBlobFileReader(blob_file_number, + &blob_file_reader); + if (!s.ok()) { + return s; + } + } + + assert(blob_file_reader.GetValue()); + const Status s = blob_file_reader.GetValue()->GetBlob( + read_options, user_key, blob_index.offset(), blob_index.size(), + blob_index.compression(), prefetch_buffer, value, bytes_read); + + return s; +} + +void Version::MultiGetBlob( + const ReadOptions& 
read_options, MultiGetRange& range, + std::unordered_map& blob_rqs) { + if (read_options.read_tier == kBlockCacheTier) { + Status s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed"); + for (const auto& elem : blob_rqs) { + for (const auto& blob_rq : elem.second) { + const KeyContext& key_context = blob_rq.second; + assert(key_context.s); + assert(key_context.s->ok()); + *(key_context.s) = s; + assert(key_context.get_context); + auto& get_context = *(key_context.get_context); + get_context.MarkKeyMayExist(); + } + } + return; + } + + assert(!blob_rqs.empty()); + Status status; + const auto& blob_files = storage_info_.GetBlobFiles(); + for (auto& elem : blob_rqs) { + uint64_t blob_file_number = elem.first; + if (blob_files.find(blob_file_number) == blob_files.end()) { + auto& blobs_in_file = elem.second; + for (const auto& blob : blobs_in_file) { + const KeyContext& key_context = blob.second; + *(key_context.s) = Status::Corruption("Invalid blob file number"); + } + continue; + } + CacheHandleGuard blob_file_reader; + assert(blob_file_cache_); + status = blob_file_cache_->GetBlobFileReader(blob_file_number, + &blob_file_reader); + assert(!status.ok() || blob_file_reader.GetValue()); + + auto& blobs_in_file = elem.second; + if (!status.ok()) { + for (const auto& blob : blobs_in_file) { + const KeyContext& key_context = blob.second; + *(key_context.s) = status; + } + continue; + } + + assert(blob_file_reader.GetValue()); + const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize(); + const CompressionType compression = + blob_file_reader.GetValue()->GetCompressionType(); + + // sort blobs_in_file by file offset. 
+ std::sort( + blobs_in_file.begin(), blobs_in_file.end(), + [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool { + assert(lhs.first.file_number() == rhs.first.file_number()); + return lhs.first.offset() < rhs.first.offset(); + }); + + autovector> blob_read_key_contexts; + autovector> user_keys; + autovector offsets; + autovector value_sizes; + autovector statuses; + autovector values; + for (const auto& blob : blobs_in_file) { + const auto& blob_index = blob.first; + const KeyContext& key_context = blob.second; + if (blob_index.HasTTL() || blob_index.IsInlined()) { + *(key_context.s) = + Status::Corruption("Unexpected TTL/inlined blob index"); + continue; + } + const uint64_t key_size = key_context.ukey_with_ts.size(); + const uint64_t offset = blob_index.offset(); + const uint64_t value_size = blob_index.size(); + if (!IsValidBlobOffset(offset, key_size, value_size, file_size)) { + *(key_context.s) = Status::Corruption("Invalid blob offset"); + continue; + } + if (blob_index.compression() != compression) { + *(key_context.s) = + Status::Corruption("Compression type mismatch when reading a blob"); + continue; + } + blob_read_key_contexts.emplace_back(std::cref(key_context)); + user_keys.emplace_back(std::cref(key_context.ukey_with_ts)); + offsets.push_back(blob_index.offset()); + value_sizes.push_back(blob_index.size()); + statuses.push_back(key_context.s); + values.push_back(key_context.value); + } + blob_file_reader.GetValue()->MultiGetBlob(read_options, user_keys, offsets, + value_sizes, statuses, values, + /*bytes_read=*/nullptr); + size_t num = blob_read_key_contexts.size(); + assert(num == user_keys.size()); + assert(num == offsets.size()); + assert(num == value_sizes.size()); + assert(num == statuses.size()); + assert(num == values.size()); + for (size_t i = 0; i < num; ++i) { + if (statuses[i]->ok()) { + range.AddValueSize(blob_read_key_contexts[i].get().value->size()); + if (range.GetValueSize() > read_options.value_size_soft_limit) { + 
*(blob_read_key_contexts[i].get().s) = Status::Aborted(); + } + } + } + } +} void Version::Get(const ReadOptions& read_options, const LookupKey& k, - PinnableSlice* value, Status* status, + PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, - SequenceNumber* max_covering_tombstone_seq, bool* value_found, + SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); @@ -1769,29 +1979,37 @@ *key_exists = true; } - PinnedIteratorsManager pinned_iters_mgr; uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; if (vset_ && vset_->block_cache_tracer_ && vset_->block_cache_tracer_->is_tracing_enabled()) { tracing_get_id = vset_->block_cache_tracer_->NextGetId(); } + + // Note: the old StackableDB-based BlobDB passes in + // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we + // need to provide it here. + bool is_blob_index = false; + bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; + BlobFetcher blob_fetcher(this, read_options); + + assert(pinned_iters_mgr); GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - do_merge ? value : nullptr, value_found, merge_context, do_merge, - max_covering_tombstone_seq, this->env_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_get_id); + do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found, + merge_context, do_merge, max_covering_tombstone_seq, clock_, seq, + merge_operator_ ? 
pinned_iters_mgr : nullptr, callback, is_blob_to_use, + tracing_get_id, &blob_fetcher); // Pin blocks that we read to hold merge operands if (merge_operator_) { - pinned_iters_mgr.StartPinning(); + pinned_iters_mgr->StartPinning(); } - FilePicker fp( - storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_, - storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, - user_comparator(), internal_comparator()); + FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_, + storage_info_.num_non_empty_levels_, + &storage_info_.file_indexer_, user_comparator(), + internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { @@ -1807,20 +2025,23 @@ bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); + StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, - &get_context, mutable_cf_options_.prefix_extractor.get(), + &get_context, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), - fp.GetCurrentLevel()); + fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_); // TODO: examine the behavior for corrupted key if (timer_enabled) { PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), - fp.GetCurrentLevel()); + fp.GetHitFileLevel()); } if (!status->ok()) { + if (db_statistics_ != nullptr) { + get_context.ReportCounters(); + } return; } @@ -1845,8 +2066,26 @@ } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + + if (is_blob_index) { + if (do_merge && value) { + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr 
uint64_t* bytes_read = nullptr; + + *status = GetBlob(read_options, user_key, *value, prefetch_buffer, + value, bytes_read); + if (!status->ok()) { + if (status->IsIncomplete()) { + get_context.MarkKeyMayExist(); + } + return; + } + } + } + return; case GetContext::kDeleted: // Use empty error message for speed @@ -1855,7 +2094,7 @@ case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. Please open DB with " @@ -1882,7 +2121,7 @@ std::string* str_value = value != nullptr ? value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, merge_context->GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(value != nullptr)) { value->PinSelf(); @@ -1896,7 +2135,7 @@ } void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { PinnedIteratorsManager pinned_iters_mgr; // Pin blocks that we read to hold merge operands @@ -1913,15 +2152,16 @@ // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive autovector get_ctx; + BlobFetcher blob_fetcher(this, read_options); for (auto iter = range->begin(); iter != range->end(); ++iter) { assert(iter->s->ok() || iter->s->IsMergeInProgress()); get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, - iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, nullptr, &(iter->merge_context), true, - &iter->max_covering_tombstone_seq, this->env_, nullptr, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_mget_id); + iter->s->ok() ? 
GetContext::kNotFound : GetContext::kMerge, + iter->ukey_with_ts, iter->value, iter->timestamp, nullptr, + &(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_, + nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, + &iter->is_blob_index, tracing_mget_id, &blob_fetcher); // MergeInProgress status, if set, has been transferred to the get_context // state, so we set status to ok here. From now on, the iter status will // be used for IO errors, and get_context state will be used for any @@ -1940,24 +2180,52 @@ &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); + Status s; + uint64_t num_index_read = 0; + uint64_t num_filter_read = 0; + uint64_t num_data_read = 0; + uint64_t num_sst_read = 0; + + MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end()); + // blob_file => [[blob_idx, it], ...] + std::unordered_map blob_rqs; + int level = -1; while (f != nullptr) { MultiGetRange file_range = fp.CurrentFileRange(); bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); - Status s = table_cache_->MultiGet( + + // Report MultiGet stats per level. + if (level >= 0 && level != (int)fp.GetHitFileLevel()) { + // Dump the stats if the search has moved to the next level and + // reset for next level. 
+ RecordInHistogram(db_statistics_, + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + num_index_read + num_filter_read); + RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL, + num_data_read); + RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read); + num_filter_read = 0; + num_index_read = 0; + num_data_read = 0; + num_sst_read = 0; + level = fp.GetHitFileLevel(); + } + + StopWatchNano timer(clock_, timer_enabled /* auto_start */); + s = table_cache_->MultiGet( read_options, *internal_comparator(), *f->file_metadata, &file_range, - mutable_cf_options_.prefix_extractor.get(), + mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), - fp.GetCurrentLevel()); + fp.GetHitFileLevel()); // TODO: examine the behavior for corrupted key if (timer_enabled) { PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), - fp.GetCurrentLevel()); + fp.GetHitFileLevel()); } if (!s.ok()) { // TODO: Set status for individual keys appropriately @@ -1968,7 +2236,8 @@ return; } uint64_t batch_size = 0; - for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) { + for (auto iter = file_range.begin(); s.ok() && iter != file_range.end(); + ++iter) { GetContext& get_context = *iter->get_context; Status* status = iter->s; // The Status in the KeyContext takes precedence over GetContext state @@ -1985,6 +2254,16 @@ sample_file_read_inc(f->file_metadata); } batch_size++; + num_index_read += get_context.get_context_stats_.num_index_read; + num_filter_read += get_context.get_context_stats_.num_filter_read; + num_data_read += get_context.get_context_stats_.num_data_read; + num_sst_read += get_context.get_context_stats_.num_sst_read; + // Reset these stats since they're specific to a level + get_context.get_context_stats_.num_index_read = 0; + get_context.get_context_stats_.num_filter_read = 0; + 
get_context.get_context_stats_.num_data_read = 0; + get_context.get_context_stats_.num_sst_read = 0; + // report the counters before returning if (get_context.State() != GetContext::kNotFound && get_context.State() != GetContext::kMerge && @@ -2012,9 +2291,33 @@ } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + file_range.MarkKeyDone(iter); + + if (iter->is_blob_index) { + if (iter->value) { + const Slice& blob_index_slice = *(iter->value); + BlobIndex blob_index; + Status tmp_s = blob_index.DecodeFrom(blob_index_slice); + if (tmp_s.ok()) { + const uint64_t blob_file_num = blob_index.file_number(); + blob_rqs[blob_file_num].emplace_back( + std::make_pair(blob_index, std::cref(*iter))); + } else { + *(iter->s) = tmp_s; + } + } + } else { + file_range.AddValueSize(iter->value->size()); + if (file_range.GetValueSize() > + read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; + } + } continue; case GetContext::kDeleted: // Use empty error message for speed @@ -2026,7 +2329,7 @@ Status::Corruption("corrupted key for ", iter->lkey->user_key()); file_range.MarkKeyDone(iter); continue; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. 
Please open DB with " @@ -2035,15 +2338,27 @@ continue; } } + RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size); - if (file_picker_range.empty()) { + if (!s.ok() || file_picker_range.empty()) { break; } f = fp.GetNextFile(); } + // Dump stats for most recent level + RecordInHistogram(db_statistics_, NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + num_index_read + num_filter_read); + RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL, + num_data_read); + RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read); + + if (s.ok() && !blob_rqs.empty()) { + MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs); + } + // Process any left over keys - for (auto iter = range->begin(); iter != range->end(); ++iter) { + for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) { GetContext& get_context = *iter->get_context; Status* status = iter->s; Slice user_key = iter->lkey->user_key(); @@ -2064,16 +2379,27 @@ iter->value != nullptr ? iter->value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(iter->value != nullptr)) { iter->value->PinSelf(); + range->AddValueSize(iter->value->size()); + range->MarkKeyDone(iter); + if (range->GetValueSize() > read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; + } } } else { range->MarkKeyDone(iter); *status = Status::NotFound(); // Use an empty error message for speed } } + + for (auto iter = range->begin(); iter != range->end(); ++iter) { + range->MarkKeyDone(iter); + *(iter->s) = s; + } } bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) { @@ -2095,10 +2421,14 @@ void Version::PrepareApply( const MutableCFOptions& mutable_cf_options, bool update_stats) { + TEST_SYNC_POINT_CALLBACK( + 
"Version::PrepareApply:forced_check", + reinterpret_cast(&storage_info_.force_consistency_checks_)); UpdateAccumulatedStats(update_stats); storage_info_.UpdateNumNonEmptyLevels(); storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options); - storage_info_.UpdateFilesByCompactionPri(cfd_->ioptions()->compaction_pri); + storage_info_.UpdateFilesByCompactionPri(*cfd_->ioptions(), + mutable_cf_options); storage_info_.GenerateFileIndexer(); storage_info_.GenerateLevelFilesBrief(); storage_info_.GenerateLevel0NonOverlapping(); @@ -2343,13 +2673,13 @@ } namespace { -uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, +uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const std::vector& files) { uint32_t ttl_expired_files_count = 0; int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (status.ok()) { const uint64_t current_time = static_cast(_current_time); for (FileMetaData* f : files) { @@ -2367,7 +2697,7 @@ } // anonymous namespace void VersionStorageInfo::ComputeCompactionScore( - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options) { for (int level = 0; level <= MaxInputLevel(); level++) { double score; @@ -2396,6 +2726,11 @@ // compaction score for the whole DB. Adding other levels as if // they are L0 files. for (int i = 1; i < num_levels(); i++) { + // Its possible that a subset of the files in a level may be in a + // compaction, due to delete triggered compaction or trivial move. + // In that case, the below check may not catch a level being + // compacted as it only checks the first file. The worst that can + // happen is a scheduled compaction thread will find nothing to do. 
if (!files_[i].empty() && !files_[i][0]->being_compacted) { num_sorted_runs++; } @@ -2405,7 +2740,12 @@ if (compaction_style_ == kCompactionStyleFIFO) { score = static_cast(total_size) / mutable_cf_options.compaction_options_fifo.max_table_files_size; - if (mutable_cf_options.compaction_options_fifo.allow_compaction) { + if (mutable_cf_options.compaction_options_fifo.allow_compaction || + mutable_cf_options.compaction_options_fifo.age_for_warm > 0) { + // Warm tier move can happen at any time. It's too expensive to + // check very file's timestamp now. For now, just trigger it + // slightly more frequently than FIFO compaction so that this + // happens first. score = std::max( static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger, @@ -2414,10 +2754,9 @@ if (mutable_cf_options.ttl > 0) { score = std::max( static_cast(GetExpiredTtlFilesCount( - immutable_cf_options, mutable_cf_options, files_[level])), + immutable_options, mutable_cf_options, files_[level])), score); } - } else { score = static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger; @@ -2425,9 +2764,21 @@ // Level-based involves L0->L0 compactions that can lead to oversized // L0 files. Take into account size as well to avoid later giant // compactions to the base level. - score = std::max( - score, static_cast(total_size) / - mutable_cf_options.max_bytes_for_level_base); + uint64_t l0_target_size = mutable_cf_options.max_bytes_for_level_base; + if (immutable_options.level_compaction_dynamic_level_bytes && + level_multiplier_ != 0.0) { + // Prevent L0 to Lbase fanout from growing larger than + // `level_multiplier_`. This prevents us from getting stuck picking + // L0 forever even when it is hurting write-amp. That could happen + // in dynamic level compaction's write-burst mode where the base + // level's target size can grow to be enormous. 
+ l0_target_size = + std::max(l0_target_size, + static_cast(level_max_bytes_[base_level_] / + level_multiplier_)); + } + score = + std::max(score, static_cast(total_size) / l0_target_size); } } } else { @@ -2462,12 +2813,21 @@ ComputeFilesMarkedForCompaction(); ComputeBottommostFilesMarkedForCompaction(); if (mutable_cf_options.ttl > 0) { - ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl); + ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); } if (mutable_cf_options.periodic_compaction_seconds > 0) { ComputeFilesMarkedForPeriodicCompaction( - immutable_cf_options, mutable_cf_options.periodic_compaction_seconds); + immutable_options, mutable_cf_options.periodic_compaction_seconds); } + + if (mutable_cf_options.enable_blob_garbage_collection && + mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 && + mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) { + ComputeFilesMarkedForForcedBlobGC( + mutable_cf_options.blob_garbage_collection_age_cutoff, + mutable_cf_options.blob_garbage_collection_force_threshold); + } + EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -2495,13 +2855,13 @@ } void VersionStorageInfo::ComputeExpiredTtlFiles( - const ImmutableCFOptions& ioptions, const uint64_t ttl) { + const ImmutableOptions& ioptions, const uint64_t ttl) { assert(ttl > 0); expired_ttl_files_.clear(); int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (!status.ok()) { return; } @@ -2521,14 +2881,14 @@ } void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds) { assert(periodic_compaction_seconds > 0); files_marked_for_periodic_compaction_.clear(); int64_t temp_current_time; - auto status = ioptions.env->GetCurrentTime(&temp_current_time); + auto status = 
ioptions.clock->GetCurrentTime(&temp_current_time); if (!status.ok()) { return; } @@ -2562,7 +2922,7 @@ status = ioptions.env->GetFileModificationTime( file_path, &file_modification_time); if (!status.ok()) { - ROCKS_LOG_WARN(ioptions.info_log, + ROCKS_LOG_WARN(ioptions.logger, "Can't get file modification time: %s: %s", file_path.c_str(), status.ToString().c_str()); continue; @@ -2577,6 +2937,106 @@ } } +void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold) { + files_marked_for_forced_blob_gc_.clear(); + + if (blob_files_.empty()) { + return; + } + + // Number of blob files eligible for GC based on age + const size_t cutoff_count = static_cast( + blob_garbage_collection_age_cutoff * blob_files_.size()); + if (!cutoff_count) { + return; + } + + // Compute the sum of total and garbage bytes over the oldest batch of blob + // files. The oldest batch is defined as the set of blob files which are + // kept alive by the same SSTs as the very oldest one. Here is a toy example. + // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11, + // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and + // potentially some higher-numbered ones, while SST 3 relies on blob file 12 + // and potentially some higher-numbered ones. Then, the SST to oldest blob + // file mapping is as follows: + // + // SST file number Oldest blob file number + // 1 10 + // 2 10 + // 3 12 + // + // This is what the same thing looks like from the blob files' POV. (Note that + // the linked SSTs simply denote the inverse mapping of the above.) + // + // Blob file number Linked SST set + // 10 {1, 2} + // 11 {} + // 12 {3} + // 13 {} + // + // Then, the oldest batch of blob files consists of blob files 10 and 11, + // and we can get rid of them by forcing the compaction of SSTs 1 and 2. 
+ // + // Note that the overall ratio of garbage computed for the batch has to exceed + // blob_garbage_collection_force_threshold and the entire batch has to be + // eligible for GC according to blob_garbage_collection_age_cutoff in order + // for us to schedule any compactions. + const auto oldest_it = blob_files_.begin(); + + const auto& oldest_meta = oldest_it->second; + assert(oldest_meta); + + const auto& linked_ssts = oldest_meta->GetLinkedSsts(); + assert(!linked_ssts.empty()); + + size_t count = 1; + uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes(); + uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes(); + + auto it = oldest_it; + for (++it; it != blob_files_.end(); ++it) { + const auto& meta = it->second; + assert(meta); + + if (!meta->GetLinkedSsts().empty()) { + break; + } + + if (++count > cutoff_count) { + return; + } + + sum_total_blob_bytes += meta->GetTotalBlobBytes(); + sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); + } + + if (sum_garbage_blob_bytes < + blob_garbage_collection_force_threshold * sum_total_blob_bytes) { + return; + } + + for (uint64_t sst_file_number : linked_ssts) { + const FileLocation location = GetFileLocation(sst_file_number); + assert(location.IsValid()); + + const int level = location.GetLevel(); + assert(level >= 0); + + const size_t pos = location.GetPosition(); + + FileMetaData* const sst_meta = files_[level][pos]; + assert(sst_meta); + + if (sst_meta->being_compacted) { + continue; + } + + files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta); + } +} + namespace { // used to sort files by size @@ -2585,7 +3045,7 @@ FileMetaData* file; }; -// Compator that is used to sort files based on their size +// Comparator that is used to sort files based on their size // In normal mode: descending size bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > @@ -2593,31 +3053,29 @@ } } // anonymous namespace -void 
VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { - auto* level_files = &files_[level]; - // Must not overlap -#ifndef NDEBUG - if (level > 0 && !level_files->empty() && - internal_comparator_->Compare( - (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) { - auto* f2 = (*level_files)[level_files->size() - 1]; - if (info_log != nullptr) { - Error(info_log, "Adding new file %" PRIu64 - " range (%s, %s) to level %d but overlapping " - "with existing file %" PRIu64 " %s %s", - f->fd.GetNumber(), f->smallest.DebugString(true).c_str(), - f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(), - f2->smallest.DebugString(true).c_str(), - f2->largest.DebugString(true).c_str()); - LogFlush(info_log); - } - assert(false); - } -#else - (void)info_log; -#endif +void VersionStorageInfo::AddFile(int level, FileMetaData* f) { + auto& level_files = files_[level]; + level_files.push_back(f); + f->refs++; - level_files->push_back(f); + + const uint64_t file_number = f->fd.GetNumber(); + + assert(file_locations_.find(file_number) == file_locations_.end()); + file_locations_.emplace(file_number, + FileLocation(level, level_files.size() - 1)); +} + +void VersionStorageInfo::AddBlobFile( + std::shared_ptr blob_file_meta) { + assert(blob_file_meta); + + const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber(); + + auto it = blob_files_.lower_bound(blob_file_number); + assert(it == blob_files_.end() || it->first != blob_file_number); + + blob_files_.emplace_hint(it, blob_file_number, std::move(blob_file_meta)); } // Version::PrepareApply() need to be called before calling the function, or @@ -2681,11 +3139,22 @@ // Sort `temp` based on ratio of overlapping size over file size void SortFileByOverlappingRatio( const InternalKeyComparator& icmp, const std::vector& files, - const std::vector& next_level_files, + const std::vector& next_level_files, SystemClock* clock, + int level, int num_non_empty_levels, uint64_t ttl, 
std::vector* temp) { std::unordered_map file_to_order; auto next_level_it = next_level_files.begin(); + int64_t curr_time; + Status status = clock->GetCurrentTime(&curr_time); + if (!status.ok()) { + // If we can't get time, disable TTL. + ttl = 0; + } + + FileTtlBooster ttl_booster(static_cast(curr_time), ttl, + num_non_empty_levels, level); + for (auto& file : files) { uint64_t overlapping_bytes = 0; // Skip files in next level that is smaller than current file @@ -2705,9 +3174,12 @@ next_level_it++; } + uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1; + assert(ttl_boost_score > 0); assert(file->compensated_file_size != 0); - file_to_order[file->fd.GetNumber()] = - overlapping_bytes * 1024u / file->compensated_file_size; + file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U / + file->compensated_file_size / + ttl_boost_score; } std::sort(temp->begin(), temp->end(), @@ -2719,7 +3191,7 @@ } // namespace void VersionStorageInfo::UpdateFilesByCompactionPri( - CompactionPri compaction_pri) { + const ImmutableOptions& ioptions, const MutableCFOptions& options) { if (compaction_style_ == kCompactionStyleNone || compaction_style_ == kCompactionStyleFIFO || compaction_style_ == kCompactionStyleUniversal) { @@ -2744,7 +3216,7 @@ if (num > temp.size()) { num = temp.size(); } - switch (compaction_pri) { + switch (ioptions.compaction_pri) { case kByCompensatedSize: std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), CompareCompensatedSizeDescending); @@ -2765,7 +3237,8 @@ break; case kMinOverlappingRatio: SortFileByOverlappingRatio(*internal_comparator_, files_[level], - files_[level + 1], &temp); + files_[level + 1], ioptions.clock, level, + num_non_empty_levels_, options.ttl, &temp); break; default: assert(false); @@ -2846,8 +3319,7 @@ bottommost_files_mark_threshold_ = kMaxSequenceNumber; for (auto& level_and_file : bottommost_files_) { if (!level_and_file.second->being_compacted && - 
level_and_file.second->fd.largest_seqno != 0 && - level_and_file.second->num_deletions > 1) { + level_and_file.second->fd.largest_seqno != 0) { // largest_seqno might be nonzero due to containing the final key in an // earlier compaction, whose seqnum we didn't zero out. Multiple deletions // ensures the file really contains deleted or overwritten keys. @@ -3006,7 +3478,7 @@ // specified range. From that file, iterate backwards and // forwards to find all overlapping files. // if within_range is set, then only store the maximum clean inputs -// within range [begin, end]. "clean" means there is a boudnary +// within range [begin, end]. "clean" means there is a boundary // between the files in "*inputs" and the surrounding files void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int level, const InternalKey* begin, const InternalKey* end, @@ -3173,7 +3645,7 @@ return scratch->buffer; } -int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { +uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; for (int level = 1; level < num_levels() - 1; level++) { @@ -3196,7 +3668,7 @@ return level_max_bytes_[level]; } -void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, +void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options) { // Special logic to set number of sorted runs. // It is to match the previous behavior when all files are in L0. @@ -3286,7 +3758,7 @@ // base_bytes_min. We set it be base_bytes_min. base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_INFO(ioptions.info_log, + ROCKS_LOG_INFO(ioptions.logger, "More existing levels in DB than needed. " "max_bytes_for_level_multiplier may not be guaranteed."); } else { @@ -3317,7 +3789,7 @@ // 1. the L0 size is larger than level size base, or // 2. 
number of L0 files reaches twice the L0->L1 compaction trigger // We don't do this otherwise to keep the LSM-tree structure stable - // unless the L0 compation is backlogged. + // unless the L0 compaction is backlogged. base_level_size = l0_size; if (base_level_ == num_levels_ - 1) { level_multiplier_ = 1.0; @@ -3345,22 +3817,23 @@ } uint64_t VersionStorageInfo::EstimateLiveDataSize() const { - // Estimate the live data size by adding up the size of the last level for all - // key ranges. Note: Estimate depends on the ordering of files in level 0 - // because files in level 0 can be overlapping. + // Estimate the live data size by adding up the size of a maximal set of + // sst files with no range overlap in same or higher level. The less + // compacted, the more optimistic (smaller) this estimate is. Also, + // for multiple sorted runs within a level, file order will matter. uint64_t size = 0; auto ikey_lt = [this](InternalKey* x, InternalKey* y) { return internal_comparator_->Compare(*x, *y) < 0; }; - // (Ordered) map of largest keys in non-overlapping files + // (Ordered) map of largest keys in files being included in size estimate std::map ranges(ikey_lt); for (int l = num_levels_ - 1; l >= 0; l--) { bool found_end = false; for (auto file : files_[l]) { - // Find the first file where the largest key is larger than the smallest - // key of the current file. If this file does not overlap with the + // Find the first file already included with largest key is larger than + // the smallest key of `file`. If that file does not overlap with the // current file, none of the files in the map does. If there is // no potential overlap, we can safely insert the rest of this level // (if the level is not 0) into the map without checking again because @@ -3375,6 +3848,14 @@ } } } + // For BlobDB, the result also includes the exact value of live bytes in the + // blob files of the version. 
+ const auto& blobFiles = GetBlobFiles(); + for (const auto& pair : blobFiles) { + const auto& meta = pair.second; + size += meta->GetTotalBlobBytes(); + size -= meta->GetGarbageBlobBytes(); + } return size; } @@ -3409,13 +3890,27 @@ return false; } -void Version::AddLiveFiles(std::vector* live) { - for (int level = 0; level < storage_info_.num_levels(); level++) { - const std::vector& files = storage_info_.files_[level]; - for (const auto& file : files) { - live->push_back(file->fd); +void Version::AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const { + assert(live_table_files); + assert(live_blob_files); + + for (int level = 0; level < storage_info_.num_levels(); ++level) { + const auto& level_files = storage_info_.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + + live_table_files->emplace_back(meta->fd.GetNumber()); } } + + const auto& blob_files = storage_info_.GetBlobFiles(); + for (const auto& pair : blob_files) { + const auto& meta = pair.second; + assert(meta); + + live_blob_files->emplace_back(meta->GetBlobFileNumber()); + } } std::string Version::DebugString(bool hex, bool print_stats) const { @@ -3462,6 +3957,21 @@ r.append("\n"); } } + + const auto& blob_files = storage_info_.GetBlobFiles(); + if (!blob_files.empty()) { + r.append("--- blob files --- version# "); + AppendNumberTo(&r, version_number_); + r.append(" ---\n"); + for (const auto& pair : blob_files) { + const auto& blob_file_meta = pair.second; + assert(blob_file_meta); + + r.append(blob_file_meta->DebugString()); + r.push_back('\n'); + } + } + return r; } @@ -3473,15 +3983,30 @@ ColumnFamilyData* cfd; const MutableCFOptions mutable_cf_options; const autovector& edit_list; + const std::function manifest_write_callback; - explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, - const MutableCFOptions& cf_options, - const autovector& e) + explicit ManifestWriter( + InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const 
MutableCFOptions& cf_options, const autovector& e, + const std::function& manifest_wcb) : done(false), cv(mu), cfd(_cfd), mutable_cf_options(cf_options), - edit_list(e) {} + edit_list(e), + manifest_write_callback(manifest_wcb) {} + ~ManifestWriter() { status.PermitUncheckedError(); } + + bool IsAllWalEdits() const { + bool all_wal_edits = true; + for (const auto& e : edit_list) { + if (!e->IsWalManipulation()) { + all_wal_edits = false; + break; + } + } + return all_wal_edits; + } }; Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { @@ -3534,17 +4059,23 @@ const FileOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer) - : column_family_set_(new ColumnFamilySet( - dbname, _db_options, storage_options, table_cache, - write_buffer_manager, write_controller, block_cache_tracer)), + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id) + : column_family_set_( + new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, + block_cache_tracer, io_tracer, db_session_id)), + table_cache_(table_cache), env_(_db_options->env), - fs_(_db_options->fs.get()), + fs_(_db_options->fs, io_tracer), + clock_(_db_options->clock), dbname_(dbname), db_options_(_db_options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() options_file_number_(0), + options_file_size_(0), pending_manifest_file_number_(0), last_sequence_(0), last_allocated_sequence_(0), @@ -3553,21 +4084,50 @@ current_version_number_(0), manifest_file_size_(0), file_options_(storage_options), - block_cache_tracer_(block_cache_tracer) {} + block_cache_tracer_(block_cache_tracer), + io_tracer_(io_tracer), + db_session_id_(db_session_id) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet - Cache* 
table_cache = column_family_set_->get_table_cache(); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { - table_cache->Release(file.metadata->table_reader_handle); - TableCache::Evict(table_cache, file.metadata->fd.GetNumber()); + table_cache_->Release(file.metadata->table_reader_handle); + TableCache::Evict(table_cache_, file.metadata->fd.GetNumber()); } file.DeleteMetadata(); } obsolete_files_.clear(); + io_status_.PermitUncheckedError(); +} + +void VersionSet::Reset() { + if (column_family_set_) { + WriteBufferManager* wbm = column_family_set_->write_buffer_manager(); + WriteController* wc = column_family_set_->write_controller(); + column_family_set_.reset(new ColumnFamilySet( + dbname_, db_options_, file_options_, table_cache_, wbm, wc, + block_cache_tracer_, io_tracer_, db_session_id_)); + } + db_id_.clear(); + next_file_number_.store(2); + min_log_number_to_keep_.store(0); + manifest_file_number_ = 0; + options_file_number_ = 0; + pending_manifest_file_number_ = 0; + last_sequence_.store(0); + last_allocated_sequence_.store(0); + last_published_sequence_.store(0); + prev_log_number_ = 0; + descriptor_log_.reset(); + current_version_number_ = 0; + manifest_writers_.clear(); + manifest_file_size_ = 0; + obsolete_files_.clear(); + obsolete_manifests_.clear(); + wals_.Reset(); } void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, @@ -3600,8 +4160,9 @@ Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, - Directory* db_directory, bool new_descriptor_log, + FSDirectory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options) { + mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); ManifestWriter* last_writer = &first_writer; @@ -3614,9 +4175,16 @@ autovector mutable_cf_options_ptrs; std::vector> builder_guards; + // Tracking `max_last_sequence` is needed to ensure we write + // 
`VersionEdit::last_sequence_`s in non-decreasing order according to the + // recovery code's requirement. It also allows us to defer updating + // `descriptor_last_sequence_` until the apply phase, after the log phase + // succeeds. + SequenceNumber max_last_sequence = descriptor_last_sequence_; + if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) { // No group commits for column family add or drop - LogAndApplyCFHelper(first_writer.edit_list.front()); + LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence); batch_edits.push_back(first_writer.edit_list.front()); } else { auto it = manifest_writers_.cbegin(); @@ -3678,16 +4246,22 @@ } } if (version == nullptr) { - version = new Version(last_writer->cfd, this, file_options_, - last_writer->mutable_cf_options, - current_version_number_++); - versions.push_back(version); - mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); - builder_guards.emplace_back( - new BaseReferencedVersionBuilder(last_writer->cfd)); - builder = builder_guards.back()->version_builder(); + // WAL manipulations do not need to be applied to versions. 
+ if (!last_writer->IsAllWalEdits()) { + version = new Version(last_writer->cfd, this, file_options_, + last_writer->mutable_cf_options, io_tracer_, + current_version_number_++); + versions.push_back(version); + mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); + builder_guards.emplace_back( + new BaseReferencedVersionBuilder(last_writer->cfd)); + builder = builder_guards.back()->version_builder(); + } + assert(last_writer->IsAllWalEdits() || builder); + assert(last_writer->IsAllWalEdits() || version); + TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion", + version); } - assert(builder != nullptr); // make checker happy for (const auto& e : last_writer->edit_list) { if (e->is_in_atomic_group_) { if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || @@ -3698,7 +4272,8 @@ } else if (group_start != std::numeric_limits::max()) { group_start = std::numeric_limits::max(); } - Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu); + Status s = LogAndApplyHelper(last_writer->cfd, builder, e, + &max_last_sequence, mu); if (!s.ok()) { // free up the allocated memory for (auto v : versions) { @@ -3760,9 +4335,6 @@ } #endif // NDEBUG - uint64_t new_manifest_file_size = 0; - Status s; - assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || manifest_file_size_ > db_options_->max_manifest_file_size) { @@ -3776,6 +4348,7 @@ // reads its content after releasing db mutex to avoid race with // SwitchMemtable(). 
std::unordered_map curr_state; + VersionEdit wal_additions; if (new_descriptor_log) { pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_.load()); @@ -3788,15 +4361,25 @@ } for (const auto* cfd : *column_family_set_) { assert(curr_state.find(cfd->GetID()) == curr_state.end()); - curr_state[cfd->GetID()] = {cfd->GetLogNumber()}; + curr_state.emplace(std::make_pair( + cfd->GetID(), + MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow()))); + } + + for (const auto& wal : wals_.GetWals()) { + wal_additions.AddWal(wal.first, wal.second); } } + uint64_t new_manifest_file_size = 0; + Status s; + IOStatus io_s; + IOStatus manifest_io_status; { FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); mu->Unlock(); - - TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart"); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr); if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { for (int i = 0; i < static_cast(versions.size()); ++i) { assert(!builder_guards.empty() && @@ -3805,10 +4388,11 @@ builder_guards.size() == versions.size()); ColumnFamilyData* cfd = versions[i]->cfd_; s = builder_guards[i]->version_builder()->LoadTableHandlers( - cfd->internal_stats(), cfd->ioptions()->optimize_filters_for_hits, + cfd->internal_stats(), 1 /* max_threads */, true /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, - mutable_cf_options_ptrs[i]->prefix_extractor.get()); + mutable_cf_options_ptrs[i]->prefix_extractor, + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i])); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -3827,18 +4411,24 @@ std::string descriptor_fname = DescriptorFileName(dbname_, pending_manifest_file_number_); std::unique_ptr descriptor_file; - s = NewWritableFile(fs_, descriptor_fname, &descriptor_file, - opt_file_opts); - if (s.ok()) { + io_s = 
NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file, + opt_file_opts); + if (io_s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - + FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( - std::move(descriptor_file), descriptor_fname, opt_file_opts, env_, - nullptr, db_options_->listeners)); + std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, + io_tracer_, nullptr, db_options_->listeners, nullptr, + tmp_set.Contains(FileType::kDescriptorFile), + tmp_set.Contains(FileType::kDescriptorFile))); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get()); + s = WriteCurrentStateToManifest(curr_state, wal_additions, + descriptor_log_.get(), io_s); + } else { + manifest_io_status = io_s; + s = io_s; } } @@ -3860,8 +4450,8 @@ e->DebugString(true)); break; } - TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord", + REDUCE_ODDS2); #ifndef NDEBUG if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { TEST_SYNC_POINT_CALLBACK( @@ -3872,15 +4462,21 @@ } ++idx; #endif /* !NDEBUG */ - s = descriptor_log_->AddRecord(record); - if (!s.ok()) { + io_s = descriptor_log_->AddRecord(record); + if (!io_s.ok()) { + s = io_s; + manifest_io_status = io_s; break; } } if (s.ok()) { - s = SyncManifest(env_, db_options_, descriptor_log_->file()); + io_s = SyncManifest(db_options_, descriptor_log_->file()); + manifest_io_status = io_s; + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); } - if (!s.ok()) { + if (!io_s.ok()) { + s = io_s; ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n", s.ToString().c_str()); } @@ -3888,10 +4484,15 @@ // If we just created a new descriptor file, install it by writing a 
// new CURRENT file that points to it. + if (s.ok()) { + assert(manifest_io_status.ok()); + } if (s.ok() && new_descriptor_log) { - s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, - db_directory); - TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest"); + io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_, + db_directory); + if (!io_s.ok()) { + s = io_s; + } } if (s.ok()) { @@ -3910,6 +4511,28 @@ mu->Lock(); } + if (s.ok()) { + // Apply WAL edits, DB mutex must be held. + for (auto& e : batch_edits) { + if (e->IsWalAddition()) { + s = wals_.AddWals(e->GetWalAdditions()); + } else if (e->IsWalDeletion()) { + s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber()); + } + if (!s.ok()) { + break; + } + } + } + + if (!io_s.ok()) { + if (io_status_.ok()) { + io_status_ = io_s; + } + } else if (!io_status_.ok()) { + io_status_ = io_s; + } + // Append the old manifest file to the obsolete_manifest_ list to be deleted // by PurgeObsoleteFiles later. if (s.ok() && new_descriptor_log) { @@ -3922,32 +4545,34 @@ if (first_writer.edit_list.front()->is_column_family_add_) { assert(batch_edits.size() == 1); assert(new_cf_options != nullptr); + assert(max_last_sequence == descriptor_last_sequence_); CreateColumnFamily(*new_cf_options, first_writer.edit_list.front()); } else if (first_writer.edit_list.front()->is_column_family_drop_) { assert(batch_edits.size() == 1); + assert(max_last_sequence == descriptor_last_sequence_); first_writer.cfd->SetDropped(); first_writer.cfd->UnrefAndTryDelete(); } else { // Each version in versions corresponds to a column family. // For each column family, update its log number indicating that logs // with number smaller than this should be ignored. 
- for (const auto version : versions) { - uint64_t max_log_number_in_batch = 0; - uint32_t cf_id = version->cfd_->GetID(); - for (const auto& e : batch_edits) { - if (e->has_log_number_ && e->column_family_ == cf_id) { - max_log_number_in_batch = - std::max(max_log_number_in_batch, e->log_number_); + uint64_t last_min_log_number_to_keep = 0; + for (const auto& e : batch_edits) { + ColumnFamilyData* cfd = nullptr; + if (!e->IsColumnFamilyManipulation()) { + cfd = column_family_set_->GetColumnFamily(e->column_family_); + // e would not have been added to batch_edits if its corresponding + // column family is dropped. + assert(cfd); + } + if (cfd) { + if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) { + cfd->SetLogNumber(e->log_number_); + } + if (e->HasFullHistoryTsLow()) { + cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow()); } } - if (max_log_number_in_batch != 0) { - assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch); - version->cfd_->SetLogNumber(max_log_number_in_batch); - } - } - - uint64_t last_min_log_number_to_keep = 0; - for (auto& e : batch_edits) { if (e->has_min_log_number_to_keep_) { last_min_log_number_to_keep = std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_); @@ -3955,8 +4580,7 @@ } if (last_min_log_number_to_keep != 0) { - // Should only be set in 2PC mode. 
- MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep); + MarkMinLogNumberToKeep(last_min_log_number_to_keep); } for (int i = 0; i < static_cast(versions.size()); ++i) { @@ -3964,6 +4588,8 @@ AppendVersion(cfd, versions[i]); } } + assert(max_last_sequence >= descriptor_last_sequence_); + descriptor_last_sequence_ = max_last_sequence; manifest_file_number_ = pending_manifest_file_number_; manifest_file_size_ = new_manifest_file_size; prev_log_number_ = first_writer.edit_list.front()->prev_log_number_; @@ -3978,22 +4604,75 @@ for (auto v : versions) { delete v; } + if (manifest_io_status.ok()) { + manifest_file_number_ = pending_manifest_file_number_; + manifest_file_size_ = new_manifest_file_size; + } // If manifest append failed for whatever reason, the file could be // corrupted. So we need to force the next version update to start a // new manifest file. descriptor_log_.reset(); - if (new_descriptor_log) { + // If manifest operations failed, then we know the CURRENT file still + // points to the original MANIFEST. Therefore, we can safely delete the + // new MANIFEST. + // If manifest operations succeeded, and we are here, then it is possible + // that renaming tmp file to CURRENT failed. + // + // On local POSIX-compliant FS, the CURRENT must point to the original + // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also + // keep it. Future recovery will ignore this MANIFEST. It's also ok for the + // process not to crash and continue using the db. Any future LogAndApply() + // call will switch to a new MANIFEST and update CURRENT, still ignoring + // this one. + // + // On non-local FS, it is + // possible that the rename operation succeeded on the server (remote) + // side, but the client somehow returns a non-ok status to RocksDB. Note + // that this does not violate atomicity. Should we delete the new MANIFEST + // successfully, a subsequent recovery attempt will likely see the CURRENT + // pointing to the new MANIFEST, thus fail. 
We will not be able to open the + // DB again. Therefore, if manifest operations succeed, we should keep the + // the new MANIFEST. If the process proceeds, any future LogAndApply() call + // will switch to a new MANIFEST and update CURRENT. If user tries to + // re-open the DB, + // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present. + // b) CURRENT points to the original MANIFEST, and the original MANIFEST + // also exists. + if (new_descriptor_log && !manifest_io_status.ok()) { ROCKS_LOG_INFO(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", - manifest_file_number_, pending_manifest_file_number_); - env_->DeleteFile( + pending_manifest_file_number_, manifest_file_number_); + Status manifest_del_status = env_->DeleteFile( DescriptorFileName(dbname_, pending_manifest_file_number_)); + if (!manifest_del_status.ok()) { + ROCKS_LOG_WARN(db_options_->info_log, + "Failed to delete manifest %" PRIu64 ": %s", + pending_manifest_file_number_, + manifest_del_status.ToString().c_str()); + } } } pending_manifest_file_number_ = 0; +#ifndef NDEBUG + // This is here kind of awkwardly because there's no other consistency + // checks on `VersionSet`'s updates for the new `Version`s. We might want + // to move it to a dedicated function, or remove it if we gain enough + // confidence in `descriptor_last_sequence_`. 
+ if (s.ok()) { + for (const auto* v : versions) { + const auto* vstorage = v->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + for (const auto& file : vstorage->LevelFiles(level)) { + assert(file->fd.largest_seqno <= descriptor_last_sequence_); + } + } + } + } +#endif // NDEBUG + // wake up all the waiting writers while (true) { ManifestWriter* ready = manifest_writers_.front(); @@ -4007,6 +4686,9 @@ } ready->status = s; ready->done = true; + if (ready->manifest_write_callback) { + (ready->manifest_write_callback)(s); + } if (need_signal) { ready->cv.Signal(); } @@ -4020,14 +4702,23 @@ return s; } -// 'datas' is gramatically incorrect. We still use this notation to indicate +void VersionSet::WakeUpWaitingManifestWriters() { + // wake up all the waiting writers + // Notify new head of manifest write queue. + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } +} + +// 'datas' is grammatically incorrect. We still use this notation to indicate // that this variable represents a collection of column_family_data. Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, const autovector>& edit_lists, - InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options, + const std::vector>& manifest_wcbs) { mu->AssertHeld(); int num_edits = 0; for (const auto& elist : edit_lists) { @@ -4057,12 +4748,16 @@ assert(static_cast(num_cfds) == edit_lists.size()); } for (int i = 0; i < num_cfds; ++i) { + const auto wcb = + manifest_wcbs.empty() ? 
[](const Status&) {} : manifest_wcbs[i]; writers.emplace_back(mu, column_family_datas[i], - *mutable_cf_options_list[i], edit_lists[i]); + *mutable_cf_options_list[i], edit_lists[i], wcb); manifest_writers_.push_back(&writers[i]); } assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting", + nullptr); while (!first_writer.done && &first_writer != manifest_writers_.front()) { first_writer.cv.Wait(); } @@ -4074,6 +4769,7 @@ for (const auto& writer : writers) { assert(writer.done); } + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu); #endif /* !NDEBUG */ return first_writer.status; } @@ -4100,16 +4796,13 @@ new_cf_options); } -void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { +void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, + SequenceNumber* max_last_sequence) { + assert(max_last_sequence != nullptr); assert(edit->IsColumnFamilyManipulation()); edit->SetNextFile(next_file_number_.load()); - // The log might have data that is not visible to memtbale and hence have not - // updated the last_sequence_ yet. It is also possible that the log has is - // expecting some new data that is not written yet. Since LastSequence is an - // upper bound on the sequence, it is ok to record - // last_allocated_sequence_ as the last sequence. - edit->SetLastSequence(db_options_->two_write_queues ? 
last_allocated_sequence_ - : last_sequence_); + assert(!edit->HasLastSequence()); + edit->SetLastSequence(*max_last_sequence); if (edit->is_column_family_drop_) { // if we drop column family, we have to make sure to save max column family, // so that we don't reuse existing ID @@ -4119,12 +4812,14 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* builder, VersionEdit* edit, + SequenceNumber* max_last_sequence, InstrumentedMutex* mu) { #ifdef NDEBUG (void)cfd; #endif mu->AssertHeld(); assert(!edit->IsColumnFamilyManipulation()); + assert(max_last_sequence != nullptr); if (edit->has_log_number_) { assert(edit->log_number_ >= cfd->GetLogNumber()); @@ -4135,161 +4830,17 @@ edit->SetPrevLogNumber(prev_log_number_); } edit->SetNextFile(next_file_number_.load()); - // The log might have data that is not visible to memtbale and hence have not - // updated the last_sequence_ yet. It is also possible that the log has is - // expecting some new data that is not written yet. Since LastSequence is an - // upper bound on the sequence, it is ok to record - // last_allocated_sequence_ as the last sequence. - edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_ - : last_sequence_); - - Status s = builder->Apply(edit); - - return s; -} - -Status VersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - VersionEditParams* version_edit_params) { - // Not found means that user didn't supply that column - // family option AND we encountered column family add - // record. Once we encounter column family drop record, - // we will delete the column family from - // column_families_not_found. 
- bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) != - column_families_not_found.end()); - // in builders means that user supplied that column family - // option AND that we encountered column family add record - bool cf_in_builders = builders.find(edit.column_family_) != builders.end(); - - // they can't both be true - assert(!(cf_in_not_found && cf_in_builders)); - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders || cf_in_not_found) { - return Status::Corruption( - "Manifest adding the same column family twice: " + - edit.column_family_name_); - } - auto cf_options = name_to_options.find(edit.column_family_name_); - // implicitly add persistent_stats column family without requiring user - // to specify - bool is_persistent_stats_column_family = - edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; - if (cf_options == name_to_options.end() && - !is_persistent_stats_column_family) { - column_families_not_found.insert( - {edit.column_family_, edit.column_family_name_}); - } else { - // recover persistent_stats CF from a DB that already contains it - if (is_persistent_stats_column_family) { - ColumnFamilyOptions cfo; - OptimizeForPersistentStats(&cfo); - cfd = CreateColumnFamily(cfo, &edit); - } else { - cfd = CreateColumnFamily(cf_options->second, &edit); - } - cfd->set_initialized(); - builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } - } else if (edit.is_column_family_drop_) { - if (cf_in_builders) { - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - builders.erase(builder); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } else { - // who else can have reference to cfd!? 
- assert(false); - } - } else if (cf_in_not_found) { - column_families_not_found.erase(edit.column_family_); - } else { - return Status::Corruption( - "Manifest - dropping non-existing column family"); - } - } else if (!cf_in_not_found) { - if (!cf_in_builders) { - return Status::Corruption( - "Manifest record referencing unknown column family"); - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - Status s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - return s; - } - } - return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params); -} - -Status VersionSet::ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& from_edit, - VersionEditParams* version_edit_params) { - if (cfd != nullptr) { - if (from_edit.has_db_id_) { - version_edit_params->SetDBId(from_edit.db_id_); - } - if (from_edit.has_log_number_) { - if (cfd->GetLogNumber() > from_edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - Log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(from_edit.log_number_); - version_edit_params->SetLogNumber(from_edit.log_number_); - } - } - if (from_edit.has_comparator_ && - from_edit.comparator_ != cfd->user_comparator()->Name()) { - return Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + from_edit.comparator_); - } - } - - if (from_edit.has_prev_log_number_) { - version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_); - } - - if (from_edit.has_next_file_number_) { - version_edit_params->SetNextFile(from_edit.next_file_number_); - } - - if 
(from_edit.has_max_column_family_) { - version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_); - } - - if (from_edit.has_min_log_number_to_keep_) { - version_edit_params->min_log_number_to_keep_ = - std::max(version_edit_params->min_log_number_to_keep_, - from_edit.min_log_number_to_keep_); + if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) { + *max_last_sequence = edit->GetLastSequence(); + } else { + edit->SetLastSequence(*max_last_sequence); } - if (from_edit.has_last_sequence_) { - version_edit_params->SetLastSequence(from_edit.last_sequence_); - } - return Status::OK(); + // The builder can be nullptr only if edit is WAL manipulation, + // because WAL edits do not need to be applied to versions, + // we return Status::OK() in this case. + assert(builder || edit->IsWalManipulation()); + return builder ? builder->Apply(edit) : Status::OK(); } Status VersionSet::GetCurrentManifestPath(const std::string& dbname, @@ -4319,91 +4870,16 @@ if (dbname.back() != '/') { manifest_path->push_back('/'); } - *manifest_path += fname; + manifest_path->append(fname); return Status::OK(); } -Status VersionSet::ReadAndRecover( - log::Reader* reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - VersionEditParams* version_edit_params, std::string* db_id) { - assert(reader != nullptr); - assert(read_buffer != nullptr); - Status s; - Slice record; - std::string scratch; - size_t recovered_edits = 0; - while (reader->ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.has_db_id_) { - db_id_ = edit.GetDbId(); - if (db_id != nullptr) { - db_id->assign(edit.GetDbId()); - } - } - s = read_buffer->AddEdit(&edit); - if (!s.ok()) { - break; - } - if (edit.is_in_atomic_group_) { - if (read_buffer->IsFull()) { - // Apply edits in an atomic group when 
we have read all edits in the - // group. - for (auto& e : read_buffer->replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (!s.ok()) { - break; - } - recovered_edits++; - } - if (!s.ok()) { - break; - } - read_buffer->Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (s.ok()) { - recovered_edits++; - } - } - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer->Clear(); - } - TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", - &recovered_edits); - return s; -} - Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id) { - std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.emplace(cf.name, cf.options); - } - // keeps track of column families in manifest that were not found in - // column families parameters. 
if those column families are not dropped - // by subsequent manifest records, Recover() will return failure status - std::unordered_map column_families_not_found; - // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; - Status s = GetCurrentManifestPath(dbname_, fs_, &manifest_path, + Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, &manifest_file_number_); if (!s.ok()) { return s; @@ -4421,140 +4897,34 @@ if (!s.ok()) { return s; } - manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path, - db_options_->log_readahead_size)); - } - - std::unordered_map> - builders; - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. 
- default_cfd->set_initialized(); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + } uint64_t current_manifest_file_size = 0; - VersionEditParams version_edit_params; + uint64_t log_number = 0; { VersionSet::LogReporter reporter; - reporter.status = &s; + Status log_read_status; + reporter.status = &log_read_status; log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - AtomicGroupReadBuffer read_buffer; - s = ReadAndRecover(&reader, &read_buffer, cf_name_to_options, - column_families_not_found, builders, - &version_edit_params, db_id); - current_manifest_file_size = reader.GetReadOffset(); - assert(current_manifest_file_size != 0); - } - - if (s.ok()) { - if (!version_edit_params.has_next_file_number_) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!version_edit_params.has_log_number_) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!version_edit_params.has_last_sequence_) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!version_edit_params.has_prev_log_number_) { - version_edit_params.SetPrevLogNumber(0); - } - - column_family_set_->UpdateMaxColumnFamily( - version_edit_params.max_column_family_); - - // When reading DB generated using old release, min_log_number_to_keep=0. - // All log files will be scanned for potential prepare entries. - MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit_params.prev_log_number_); - MarkFileNumberUsed(version_edit_params.log_number_); - } - - // there were some column families in the MANIFEST that weren't specified - // in the argument. 
This is OK in read_only mode - if (read_only == false && !column_families_not_found.empty()) { - std::string list_of_not_found; - for (const auto& cf : column_families_not_found) { - list_of_not_found += ", " + cf.second; - } - list_of_not_found = list_of_not_found.substr(2); - s = Status::InvalidArgument( - "You have to open all column families. Column families not opened: " + - list_of_not_found); - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto* builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } + VersionEditHandler handler(read_only, column_families, + const_cast(this), + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer_); + handler.Iterate(reader, &log_read_status); + s = handler.status(); + if (s.ok()) { + log_number = handler.GetVersionEditParams().log_number_; + current_manifest_file_size = reader.GetReadOffset(); + assert(current_manifest_file_size != 0); + handler.GetDbId(db_id); } } if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - if (read_only) { - cfd->table_cache()->SetTablesAreImmortal(); - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - // unlimited table cache. Pre-load table handle now. - // Need to do it out of the mutex. 
- s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - if (!s.ok()) { - if (db_options_->paranoid_checks) { - return s; - } - s = Status::OK(); - } - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(v->storage_info()); - - // Install recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } - manifest_file_size_ = current_manifest_file_size; - next_file_number_.store(version_edit_params.next_file_number_ + 1); - last_allocated_sequence_ = version_edit_params.last_sequence_; - last_published_sequence_ = version_edit_params.last_sequence_; - last_sequence_ = version_edit_params.last_sequence_; - prev_log_number_ = version_edit_params.prev_log_number_; - ROCKS_LOG_INFO( db_options_->info_log, "Recovered from manifest file:%s succeeded," @@ -4563,9 +4933,8 @@ ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), - last_sequence_.load(), version_edit_params.log_number_, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { @@ -4581,10 +4950,152 @@ return s; } +namespace { +class ManifestPicker { + public: + explicit ManifestPicker(const std::string& dbname, + const std::vector& files_in_dbname); + // REQUIRES Valid() == true + std::string GetNextManifest(uint64_t* file_number, std::string* file_name); + bool Valid() const { return manifest_file_iter_ != 
manifest_files_.end(); } + + private: + const std::string& dbname_; + // MANIFEST file names(s) + std::vector manifest_files_; + std::vector::const_iterator manifest_file_iter_; +}; + +ManifestPicker::ManifestPicker(const std::string& dbname, + const std::vector& files_in_dbname) + : dbname_(dbname) { + // populate manifest files + assert(!files_in_dbname.empty()); + for (const auto& fname : files_in_dbname) { + uint64_t file_num = 0; + FileType file_type; + bool parse_ok = ParseFileName(fname, &file_num, &file_type); + if (parse_ok && file_type == kDescriptorFile) { + manifest_files_.push_back(fname); + } + } + // seek to first manifest + std::sort(manifest_files_.begin(), manifest_files_.end(), + [](const std::string& lhs, const std::string& rhs) { + uint64_t num1 = 0; + uint64_t num2 = 0; + FileType type1; + FileType type2; + bool parse_ok1 = ParseFileName(lhs, &num1, &type1); + bool parse_ok2 = ParseFileName(rhs, &num2, &type2); +#ifndef NDEBUG + assert(parse_ok1); + assert(parse_ok2); +#else + (void)parse_ok1; + (void)parse_ok2; +#endif + return num1 > num2; + }); + manifest_file_iter_ = manifest_files_.begin(); +} + +std::string ManifestPicker::GetNextManifest(uint64_t* number, + std::string* file_name) { + assert(Valid()); + std::string ret; + if (manifest_file_iter_ != manifest_files_.end()) { + ret.assign(dbname_); + if (ret.back() != kFilePathSeparator) { + ret.push_back(kFilePathSeparator); + } + ret.append(*manifest_file_iter_); + if (number) { + FileType type; + bool parse = ParseFileName(*manifest_file_iter_, number, &type); + assert(type == kDescriptorFile); +#ifndef NDEBUG + assert(parse); +#else + (void)parse; +#endif + } + if (file_name) { + *file_name = *manifest_file_iter_; + } + ++manifest_file_iter_; + } + return ret; +} +} // namespace + +Status VersionSet::TryRecover( + const std::vector& column_families, bool read_only, + const std::vector& files_in_dbname, std::string* db_id, + bool* has_missing_table_file) { + ManifestPicker 
manifest_picker(dbname_, files_in_dbname); + if (!manifest_picker.Valid()) { + return Status::Corruption("Cannot locate MANIFEST file in " + dbname_); + } + Status s; + std::string manifest_path = + manifest_picker.GetNextManifest(&manifest_file_number_, nullptr); + while (!manifest_path.empty()) { + s = TryRecoverFromOneManifest(manifest_path, column_families, read_only, + db_id, has_missing_table_file); + if (s.ok() || !manifest_picker.Valid()) { + break; + } + Reset(); + manifest_path = + manifest_picker.GetNextManifest(&manifest_file_number_, nullptr); + } + return s; +} + +Status VersionSet::TryRecoverFromOneManifest( + const std::string& manifest_path, + const std::vector& column_families, bool read_only, + std::string* db_id, bool* has_missing_table_file) { + ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n", + manifest_path.c_str()); + std::unique_ptr manifest_file_reader; + Status s; + { + std::unique_ptr manifest_file; + s = fs_->NewSequentialFile(manifest_path, + fs_->OptimizeForManifestRead(file_options_), + &manifest_file, nullptr); + if (!s.ok()) { + return s; + } + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + } + + assert(s.ok()); + VersionSet::LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, + /*checksum=*/true, /*log_num=*/0); + VersionEditHandlerPointInTime handler_pit( + read_only, column_families, const_cast(this), io_tracer_); + + handler_pit.Iterate(reader, &s); + + handler_pit.GetDbId(db_id); + + assert(nullptr != has_missing_table_file); + *has_missing_table_file = handler_pit.HasMissingFiles(); + + return handler_pit.status(); +} + Status VersionSet::ListColumnFamilies(std::vector* column_families, const std::string& dbname, FileSystem* fs) { - // these are just for performance reasons, not correcntes, + // these are 
just for performance reasons, not correctness, // so we're fine using the defaults FileOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file @@ -4603,51 +5114,27 @@ if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), manifest_path)); + file_reader.reset(new SequentialFileReader(std::move(file), manifest_path, + nullptr /*IOTracer*/)); } - std::map column_family_names; - // default column family is always implicitly there - column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.is_column_family_add_) { - if (column_family_names.find(edit.column_family_) != - column_family_names.end()) { - s = Status::Corruption("Manifest adding the same column family twice"); - break; - } - column_family_names.insert( - {edit.column_family_, edit.column_family_name_}); - } else if (edit.is_column_family_drop_) { - if (column_family_names.find(edit.column_family_) == - column_family_names.end()) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - column_family_names.erase(edit.column_family_); - } - } + ListColumnFamiliesHandler handler; + handler.Iterate(reader, &s); + + assert(column_families); column_families->clear(); - if (s.ok()) { - for (const auto& iter : column_family_names) { + if (handler.status().ok()) { + for (const auto& iter : handler.GetColumnFamilyNames()) { column_families->push_back(iter.second); } } - return s; + return handler.status(); } #ifndef ROCKSDB_LITE @@ -4667,7 +5154,8 @@ WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); VersionSet 
versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, - /*block_cache_tracer=*/nullptr); + nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, + /*db_session_id*/ ""); Status status; std::vector dummy; @@ -4720,7 +5208,19 @@ } if (first_nonempty_level > 0) { - new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level); + auto& new_last_level = new_files_list[new_levels - 1]; + + new_last_level = vstorage->LevelFiles(first_nonempty_level); + + for (size_t i = 0; i < new_last_level.size(); ++i) { + const FileMetaData* const meta = new_last_level[i]; + assert(meta); + + const uint64_t file_number = meta->fd.GetNumber(); + + vstorage->file_locations_[file_number] = + VersionStorageInfo::FileLocation(new_levels - 1, i); + } } delete[] vstorage -> files_; @@ -4737,14 +5237,16 @@ } // Get the checksum information including the checksum and checksum function -// name of all SST files in VersionSet. Store the information in +// name of all SST and blob files in VersionSet. Store the information in // FileChecksumList which contains a map from file number to its checksum info. // If DB is not running, make sure call VersionSet::Recover() to load the file // metadata from Manifest to VersionSet before calling this function. Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { // Clean the previously stored checksum information if any. 
+ Status s; if (checksum_list == nullptr) { - return Status::InvalidArgument("checksum_list is nullptr"); + s = Status::InvalidArgument("checksum_list is nullptr"); + return s; } checksum_list->reset(); @@ -4752,16 +5254,45 @@ if (cfd->IsDropped() || !cfd->initialized()) { continue; } + /* SST files */ for (int level = 0; level < cfd->NumberLevels(); level++) { for (const auto& file : cfd->current()->storage_info()->LevelFiles(level)) { - checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), - file->file_checksum, - file->file_checksum_func_name); + s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(), + file->file_checksum, + file->file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + } + + /* Blob files */ + const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + + assert(meta); + assert(blob_file_number == meta->GetBlobFileNumber()); + + std::string checksum_value = meta->GetChecksumValue(); + std::string checksum_method = meta->GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (meta->GetChecksumMethod().empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + + s = checksum_list->InsertOneFileChecksum(blob_file_number, checksum_value, + checksum_method); + if (!s.ok()) { + return s; } } } - return Status::OK(); + + return s; } Status VersionSet::DumpManifest(Options& options, std::string& dscname, @@ -4771,205 +5302,31 @@ Status s; { std::unique_ptr file; - s = options.file_system->NewSequentialFile( + const std::shared_ptr& fs = options.env->GetFileSystem(); + s = fs->NewSequentialFile( dscname, - options.file_system->OptimizeForManifestRead(file_options_), &file, + fs->OptimizeForManifestRead(file_options_), &file, nullptr); if (!s.ok()) { return s; } file_reader.reset(new SequentialFileReader( - 
std::move(file), dscname, db_options_->log_readahead_size)); + std::move(file), dscname, db_options_->log_readahead_size, io_tracer_)); } - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t previous_log_number = 0; - int count = 0; - std::unordered_map comparators; - std::unordered_map> - builders; - - // add default column family - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - + std::vector column_families( + 1, ColumnFamilyDescriptor(kDefaultColumnFamilyName, options)); + DumpManifestHandler handler(column_families, this, io_tracer_, verbose, hex, + json); { VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Write out each individual edit - if (verbose && !json) { - printf("%s\n", edit.DebugString(hex).c_str()); - } else if (json) { - printf("%s\n", edit.DebugJSON(count, hex).c_str()); - } - count++; - - bool cf_in_builders = - builders.find(edit.column_family_) != builders.end(); - - if (edit.has_comparator_) { - comparators.insert({edit.column_family_, edit.comparator_}); - } - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders) { - s = Status::Corruption( - "Manifest adding the same column family twice"); - break; - } - cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); - cfd->set_initialized(); - 
builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } else if (edit.is_column_family_drop_) { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - auto builder_iter = builders.find(edit.column_family_); - builders.erase(builder_iter); - comparators.erase(edit.column_family_); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - cfd->UnrefAndTryDelete(); - cfd = nullptr; - } else { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest record referencing unknown column family"); - break; - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - break; - } - } - - if (cfd != nullptr && edit.has_log_number_) { - cfd->SetLogNumber(edit.log_number_); - } - - - if (edit.has_prev_log_number_) { - previous_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - - if (edit.has_max_column_family_) { - column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); - } - - if (edit.has_min_log_number_to_keep_) { - MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_); - } - } - } - file_reader.reset(); - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - printf("no meta-nextfile entry in descriptor"); - } else if 
(!have_last_sequence) { - printf("no last-sequence-number entry in descriptor"); - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!have_prev_log_number) { - previous_log_number = 0; - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(v->storage_info()); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false); - - printf("--------------- Column family \"%s\" (ID %" PRIu32 - ") --------------\n", - cfd->GetName().c_str(), cfd->GetID()); - printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); - auto comparator = comparators.find(cfd->GetID()); - if (comparator != comparators.end()) { - printf("comparator: %s\n", comparator->second.c_str()); - } else { - printf("comparator: \n"); - } - printf("%s \n", v->DebugString(hex).c_str()); - delete v; - } - - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; - - printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 - " prev_log_number %" PRIu64 " max_column_family %" PRIu32 - " min_log_number_to_keep " - "%" PRIu64 "\n", - next_file_number_.load(), last_sequence, previous_log_number, - column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + handler.Iterate(reader, &s); } - return s; + return handler.status(); } #endif // ROCKSDB_LITE @@ -4982,15 +5339,15 @@ } // Called only either from ::LogAndApply which is protected by mutex or during // recovery which is single-threaded. 
-void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { - if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) { - min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed); +void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { + if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) { + min_log_number_to_keep_.store(number, std::memory_order_relaxed); } } Status VersionSet::WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log) { + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { // TODO: Break up into multiple records to reduce memory usage on recovery? // WARNING: This method doesn't hold a mutex!! @@ -4999,6 +5356,7 @@ // LogAndApply. Column family manipulations can only happen within LogAndApply // (the same single thread), so we're safe to iterate. + assert(io_s.ok()); if (db_options_->write_dbid_to_manifest) { VersionEdit edit_for_db_id; assert(!db_id_.empty()); @@ -5008,13 +5366,30 @@ return Status::Corruption("Unable to Encode VersionEdit:" + edit_for_db_id.DebugString(true)); } - Status add_record = log->AddRecord(db_id_record); - if (!add_record.ok()) { - return add_record; + io_s = log->AddRecord(db_id_record); + if (!io_s.ok()) { + return io_s; + } + } + + // Save WALs. 
+ if (!wal_additions.GetWalAdditions().empty()) { + TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal", + const_cast(&wal_additions)); + std::string record; + if (!wal_additions.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_additions.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; } } for (auto cfd : *column_family_set_) { + assert(cfd); + if (cfd->IsDropped()) { continue; } @@ -5035,9 +5410,9 @@ return Status::Corruption( "Unable to Encode VersionEdit:" + edit.DebugString(true)); } - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; } } @@ -5046,29 +5421,69 @@ VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); + assert(cfd->current()); + assert(cfd->current()->storage_info()); + for (int level = 0; level < cfd->NumberLevels(); level++) { for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) { - edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, - f->file_checksum, f->file_checksum_func_name); + edit.AddFile( + level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), + f->smallest, f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, + f->file_checksum_func_name, f->min_timestamp, f->max_timestamp); } } + + const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + + assert(meta); + assert(blob_file_number == meta->GetBlobFileNumber()); + + 
edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(), + meta->GetTotalBlobBytes(), meta->GetChecksumMethod(), + meta->GetChecksumValue()); + if (meta->GetGarbageBlobCount() > 0) { + edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(), + meta->GetGarbageBlobBytes()); + } + } + const auto iter = curr_state.find(cfd->GetID()); assert(iter != curr_state.end()); uint64_t log_number = iter->second.log_number; edit.SetLogNumber(log_number); + + if (cfd->GetID() == 0) { + // min_log_number_to_keep is for the whole db, not for specific column family. + // So it does not need to be set for every column family, just need to be set once. + // Since default CF can never be dropped, we set the min_log to the default CF here. + uint64_t min_log = min_log_number_to_keep(); + if (min_log != 0) { + edit.SetMinLogNumberToKeep(min_log); + } + } + + const std::string& full_history_ts_low = iter->second.full_history_ts_low; + if (!full_history_ts_low.empty()) { + edit.SetFullHistoryTsLow(full_history_ts_low); + } + + edit.SetLastSequence(descriptor_last_sequence_); + std::string record; if (!edit.EncodeTo(&record)) { return Status::Corruption( "Unable to Encode VersionEdit:" + edit.DebugString(true)); } - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; } } } @@ -5193,7 +5608,8 @@ static_cast(total_full_size * margin)) { total_full_size += total_intersecting_size / 2; } else { - // Estimate for all the first files, at each level + // Estimate for all the first files (might also be last files), at each + // level for (const auto file_ptr : first_files) { total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); } @@ -5230,7 +5646,7 @@ if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( key, f.file_metadata->fd, caller, icmp, - v->GetMutableCFOptions().prefix_extractor.get()); + v->GetMutableCFOptions().prefix_extractor); } } return result; @@ 
-5270,64 +5686,82 @@ } return table_cache->ApproximateSize( start, end, f.file_metadata->fd, caller, icmp, - v->GetMutableCFOptions().prefix_extractor.get()); + v->GetMutableCFOptions().prefix_extractor); } -void VersionSet::AddLiveFiles(std::vector* live_list) { +void VersionSet::AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const { + assert(live_table_files); + assert(live_blob_files); + // pre-calculate space requirement - int64_t total_files = 0; + size_t total_table_files = 0; + size_t total_blob_files = 0; + + assert(column_family_set_); for (auto cfd : *column_family_set_) { + assert(cfd); + if (!cfd->initialized()) { continue; } - Version* dummy_versions = cfd->dummy_versions(); + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + assert(v); + const auto* vstorage = v->storage_info(); - for (int level = 0; level < vstorage->num_levels(); level++) { - total_files += vstorage->LevelFiles(level).size(); + assert(vstorage); + + for (int level = 0; level < vstorage->num_levels(); ++level) { + total_table_files += vstorage->LevelFiles(level).size(); } + + total_blob_files += vstorage->GetBlobFiles().size(); } } // just one time extension to the right size - live_list->reserve(live_list->size() + static_cast(total_files)); + live_table_files->reserve(live_table_files->size() + total_table_files); + live_blob_files->reserve(live_blob_files->size() + total_blob_files); + assert(column_family_set_); for (auto cfd : *column_family_set_) { + assert(cfd); if (!cfd->initialized()) { continue; } + auto* current = cfd->current(); bool found_current = false; - Version* dummy_versions = cfd->dummy_versions(); + + Version* const dummy_versions = cfd->dummy_versions(); + assert(dummy_versions); + for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - v->AddLiveFiles(live_list); + 
v->AddLiveFiles(live_table_files, live_blob_files); if (v == current) { found_current = true; } } + if (!found_current && current != nullptr) { // Should never happen unless it is a bug. assert(false); - current->AddLiveFiles(live_list); + current->AddLiveFiles(live_table_files, live_blob_files); } } } InternalIterator* VersionSet::MakeInputIterator( - const Compaction* c, RangeDelAggregator* range_del_agg, + const ReadOptions& read_options, const Compaction* c, + RangeDelAggregator* range_del_agg, const FileOptions& file_options_compactions) { auto cfd = c->column_family_data(); - ReadOptions read_options; - read_options.verify_checksums = true; - read_options.fill_cache = false; - // Compaction iterators shouldn't be confined to a single prefix. - // Compactions use Seek() for - // (a) concurrent compactions, - // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. - read_options.total_order_seek = true; - // Level-0 files have to be merged together. For other levels, // we will make a concatenating iterator per level. 
// TODO(opt): use concatenating iterator for level-0 if there is no overlap @@ -5343,26 +5777,28 @@ for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( read_options, file_options_compactions, - cfd->internal_comparator(), - *flevel->files[i].file_metadata, range_del_agg, - c->mutable_cf_options()->prefix_extractor.get(), + cfd->internal_comparator(), *flevel->files[i].file_metadata, + range_del_agg, c->mutable_cf_options()->prefix_extractor, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, /*arena=*/nullptr, - /*skip_filters=*/false, /*level=*/static_cast(which), + /*skip_filters=*/false, + /*level=*/static_cast(c->level(which)), + MaxFileSizeForL0MetaPin(*c->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr); + /*largest_compaction_key=*/nullptr, + /*allow_unprepared_value=*/false); } } else { // Create concatenating iterator for the files from this level list[num++] = new LevelIterator( cfd->table_cache(), read_options, file_options_compactions, cfd->internal_comparator(), c->input_levels(which), - c->mutable_cf_options()->prefix_extractor.get(), + c->mutable_cf_options()->prefix_extractor, /*should_sample=*/false, /*no per level latency histogram=*/nullptr, TableReaderCaller::kCompaction, /*skip_filters=*/false, - /*level=*/static_cast(which), range_del_agg, + /*level=*/static_cast(c->level(which)), range_del_agg, c->boundaries(which)); } } @@ -5375,57 +5811,6 @@ return result; } -// verify that the files listed in this compaction are present -// in the current version -bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { -#ifndef NDEBUG - Version* version = c->column_family_data()->current(); - const VersionStorageInfo* vstorage = version->storage_info(); - if (c->input_version() != version) { - ROCKS_LOG_INFO( - db_options_->info_log, - "[%s] compaction output being applied to a different base version from" - " input 
version", - c->column_family_data()->GetName().c_str()); - - if (vstorage->compaction_style_ == kCompactionStyleLevel && - c->start_level() == 0 && c->num_input_levels() > 2U) { - // We are doing a L0->base_level compaction. The assumption is if - // base level is not L1, levels from L1 to base_level - 1 is empty. - // This is ensured by having one compaction from L0 going on at the - // same time in level-based compaction. So that during the time, no - // compaction/flush can put files to those levels. - for (int l = c->start_level() + 1; l < c->output_level(); l++) { - if (vstorage->NumLevelFiles(l) != 0) { - return false; - } - } - } - } - - for (size_t input = 0; input < c->num_input_levels(); ++input) { - int level = c->level(input); - for (size_t i = 0; i < c->num_input_files(input); ++i) { - uint64_t number = c->input(input, i)->fd.GetNumber(); - bool found = false; - for (size_t j = 0; j < vstorage->files_[level].size(); j++) { - FileMetaData* f = vstorage->files_[level][j]; - if (f->fd.GetNumber() == number) { - found = true; - break; - } - } - if (!found) { - return false; // input files non existent in current version - } - } - } -#else - (void)c; -#endif - return true; // everything good -} - Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, FileMetaData** meta, ColumnFamilyData** cfd) { @@ -5483,6 +5868,9 @@ filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; filemetadata.file_checksum = file->file_checksum; filemetadata.file_checksum_func_name = file->file_checksum_func_name; + filemetadata.temperature = file->temperature; + filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); + filemetadata.file_creation_time = file->TryGetFileCreationTime(); metadata->push_back(filemetadata); } } @@ -5490,28 +5878,46 @@ } void VersionSet::GetObsoleteFiles(std::vector* files, + std::vector* blob_files, std::vector* manifest_filenames, uint64_t min_pending_output) { + assert(files); + assert(blob_files); + 
assert(manifest_filenames); + assert(files->empty()); + assert(blob_files->empty()); assert(manifest_filenames->empty()); - obsolete_manifests_.swap(*manifest_filenames); + std::vector pending_files; for (auto& f : obsolete_files_) { if (f.metadata->fd.GetNumber() < min_pending_output) { - files->push_back(std::move(f)); + files->emplace_back(std::move(f)); } else { - pending_files.push_back(std::move(f)); + pending_files.emplace_back(std::move(f)); } } obsolete_files_.swap(pending_files); + + std::vector pending_blob_files; + for (auto& blob_file : obsolete_blob_files_) { + if (blob_file.GetBlobFileNumber() < min_pending_output) { + blob_files->emplace_back(std::move(blob_file)); + } else { + pending_blob_files.emplace_back(std::move(blob_file)); + } + } + obsolete_blob_files_.swap(pending_blob_files); + + obsolete_manifests_.swap(*manifest_filenames); } ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& cf_options, VersionEdit* edit) { + const ColumnFamilyOptions& cf_options, const VersionEdit* edit) { assert(edit->is_column_family_add_); MutableCFOptions dummy_cf_options; Version* dummy_versions = - new Version(nullptr, this, file_options_, dummy_cf_options); + new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_); // Ref() dummy version once so that later we can call Unref() to delete it // by avoiding calling "delete" explicitly (~Version is private) dummy_versions->Ref(); @@ -5520,7 +5926,7 @@ cf_options); Version* v = new Version(new_cfd, this, file_options_, - *new_cfd->GetLatestMutableCFOptions(), + *new_cfd->GetLatestMutableCFOptions(), io_tracer_, current_version_number_++); // Fill level target base information. 
@@ -5561,16 +5967,46 @@ return total_files_size; } -ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, - const ImmutableDBOptions* _db_options, - const FileOptions& _file_options, - Cache* table_cache, - WriteBufferManager* write_buffer_manager, - WriteController* write_controller) +uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { + std::unordered_set unique_blob_files; + uint64_t all_v_blob_file_size = 0; + for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { + // iterate all the versions + auto* vstorage = v->storage_info(); + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& pair : blob_files) { + if (unique_blob_files.find(pair.first) == unique_blob_files.end()) { + // find Blob file that has not been counted + unique_blob_files.insert(pair.first); + const auto& meta = pair.second; + all_v_blob_file_size += meta->GetBlobFileSize(); + } + } + } + return all_v_blob_file_size; +} + +Status VersionSet::VerifyFileMetadata(const std::string& fpath, + const FileMetaData& meta) const { + uint64_t fsize = 0; + Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr); + if (status.ok()) { + if (fsize != meta.fd.GetFileSize()) { + status = Status::Corruption("File size mismatch: " + fpath); + } + } + return status; +} + +ReactiveVersionSet::ReactiveVersionSet( + const std::string& dbname, const ImmutableDBOptions* _db_options, + const FileOptions& _file_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, WriteController* write_controller, + const std::shared_ptr& io_tracer) : VersionSet(dbname, _db_options, _file_options, table_cache, write_buffer_manager, write_controller, - /*block_cache_tracer=*/nullptr), - number_of_edits_to_skip_(0) {} + /*block_cache_tracer=*/nullptr, io_tracer, + /*db_session_id*/ "") {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -5583,423 +6019,124 @@ assert(manifest_reporter != nullptr); assert(manifest_reader_status != nullptr); - 
std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.insert({cf.name, cf.options}); - } - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. - default_cfd->set_initialized(); - std::unordered_map> - builders; - std::unordered_map column_families_not_found; - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - manifest_reader_status->reset(new Status()); manifest_reporter->reset(new LogReporter()); - static_cast(manifest_reporter->get())->status = + static_cast_with_check(manifest_reporter->get())->status = manifest_reader_status->get(); Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); - log::Reader* reader = manifest_reader->get(); - - int retry = 0; - VersionEdit version_edit; - while (s.ok() && retry < 1) { - assert(reader != nullptr); - Slice record; - std::string scratch; - s = ReadAndRecover(reader, &read_buffer_, cf_name_to_options, - column_families_not_found, builders, &version_edit); - if (s.ok()) { - bool enough = version_edit.has_next_file_number_ && - version_edit.has_log_number_ && - version_edit.has_last_sequence_; - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - if (cfd == nullptr) { - enough = false; - break; - } - } - } - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - 
assert(cfd != nullptr); - if (!cfd->IsDropped()) { - auto builder_iter = builders.find(cfd->GetID()); - assert(builder_iter != builders.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - if (!s.ok()) { - enough = false; - if (s.IsPathNotFound()) { - s = Status::OK(); - } - break; - } - } - } - } - if (enough) { - break; - } - } - ++retry; + if (!s.ok()) { + return s; } + log::Reader* reader = manifest_reader->get(); + assert(reader); - if (s.ok()) { - if (!version_edit.has_prev_log_number_) { - version_edit.prev_log_number_ = 0; - } - column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_); - - MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit.prev_log_number_); - MarkFileNumberUsed(version_edit.log_number_); + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_)); - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } + manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->version_builder(); - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(v->storage_info()); - - // Install 
recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } - next_file_number_.store(version_edit.next_file_number_ + 1); - last_allocated_sequence_ = version_edit.last_sequence_; - last_published_sequence_ = version_edit.last_sequence_; - last_sequence_ = version_edit.last_sequence_; - prev_log_number_ = version_edit.prev_log_number_; - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - ROCKS_LOG_INFO(db_options_->info_log, - "Column family [%s] (ID %u), log number is %" PRIu64 "\n", - cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); - } - } - return s; + return manifest_tailer_->status(); } Status ReactiveVersionSet::ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed) { assert(manifest_reader != nullptr); assert(cfds_changed != nullptr); mu->AssertHeld(); Status s; - uint64_t applied_edits = 0; - while (s.ok()) { - Slice record; - std::string scratch; - log::Reader* reader = manifest_reader->get(); - std::string old_manifest_path = reader->file()->file_name(); - while (reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Skip the first VersionEdits of each MANIFEST generated by - // VersionSet::WriteCurrentStatetoManifest. - if (number_of_edits_to_skip_ > 0) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - if (cfd != nullptr && !cfd->IsDropped()) { - --number_of_edits_to_skip_; - } - continue; - } - - s = read_buffer_.AddEdit(&edit); - if (!s.ok()) { - break; - } - VersionEdit temp_edit; - if (edit.is_in_atomic_group_) { - if (read_buffer_.IsFull()) { - // Apply edits in an atomic group when we have read all edits in the - // group. 
- for (auto& e : read_buffer_.replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit); - if (!s.ok()) { - break; - } - applied_edits++; - } - if (!s.ok()) { - break; - } - read_buffer_.Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit); - if (s.ok()) { - applied_edits++; - } - } - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer_.Clear(); - } - // It's possible that: - // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. - // 2) we have finished reading the current MANIFEST. - // 3) we have encountered an IOError reading the current MANIFEST. - // We need to look for the next MANIFEST and start from there. If we cannot - // find the next MANIFEST, we should exit the loop. - s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); - reader = manifest_reader->get(); - if (s.ok()) { - if (reader->file()->file_name() == old_manifest_path) { - // Still processing the same MANIFEST, thus no need to continue this - // loop since no record is available if we have reached here. - break; - } else { - // We have switched to a new MANIFEST whose first records have been - // generated by VersionSet::WriteCurrentStatetoManifest. Since the - // secondary instance has already finished recovering upon start, there - // is no need for the secondary to process these records. Actually, if - // the secondary were to replay these records, the secondary may end up - // adding the same SST files AGAIN to each column family, causing - // consistency checks done by VersionBuilder to fail. Therefore, we - // record the number of records to skip at the beginning of the new - // MANIFEST and ignore them. 
- number_of_edits_to_skip_ = 0; - for (auto* cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - // Increase number_of_edits_to_skip by 2 because - // WriteCurrentStatetoManifest() writes 2 version edits for each - // column family at the beginning of the newly-generated MANIFEST. - // TODO(yanqin) remove hard-coded value. - if (db_options_->write_dbid_to_manifest) { - number_of_edits_to_skip_ += 3; - } else { - number_of_edits_to_skip_ += 2; - } - } - } - } + log::Reader* reader = manifest_reader->get(); + assert(reader); + s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); + if (!s.ok()) { + return s; } - + manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status); + s = manifest_tailer_->status(); if (s.ok()) { - for (auto cfd : *column_family_set_) { - auto builder_iter = active_version_builders_.find(cfd->GetID()); - if (builder_iter == active_version_builders_.end()) { - continue; - } - auto builder = builder_iter->second->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } + *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies()); } - TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", - &applied_edits); + return s; } -Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, std::unordered_set* cfds_changed, - VersionEdit* version_edit) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. It is also possible that the secondary instance opens only a subset - // of column families. Ignore it for now. 
- if (nullptr == cfd) { - return Status::OK(); +Status ReactiveVersionSet::MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader) { + assert(manifest_reader != nullptr); + Status s; + std::string manifest_path; + s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, + &manifest_file_number_); + if (!s.ok()) { + return s; } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end() && - !cfd->IsDropped()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - - if (edit.is_column_family_add_) { - // TODO (yanqin) for now the secondary ignores column families created - // after Open. This also simplifies handling of switching to a new MANIFEST - // and processing the snapshot of the system at the beginning of the + std::unique_ptr manifest_file; + if (manifest_reader->get() != nullptr && + manifest_reader->get()->file()->file_name() == manifest_path) { + // CURRENT points to the same MANIFEST as before, no need to switch // MANIFEST. - } else if (edit.is_column_family_drop_) { - // Drop the column family by setting it to be 'dropped' without destroying - // the column family handle. - // TODO (haoyu) figure out how to handle column faimly drop for - // secondary instance. (Is it possible that the ref count for cfd is 0 but - // the ref count for its versions is higher than 0?) 
- cfd->SetDropped(); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } - active_version_builders_.erase(builder_iter); - } else { - Status s = builder->Apply(&edit); - if (!s.ok()) { - return s; - } - } - Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit); - if (!s.ok()) { return s; } - - if (cfd != nullptr && !cfd->IsDropped()) { - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" - "AfterLoadTableHandlers", - &s); - - if (s.ok()) { - auto version = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); - } - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } - // Some other error has occurred during LoadTableHandlers. 
- } - - if (version_edit->HasNextFile()) { - next_file_number_.store(version_edit->next_file_number_ + 1); - } - if (version_edit->has_last_sequence_) { - last_allocated_sequence_ = version_edit->last_sequence_; - last_published_sequence_ = version_edit->last_sequence_; - last_sequence_ = version_edit->last_sequence_; - } - if (version_edit->has_prev_log_number_) { - prev_log_number_ = version_edit->prev_log_number_; - MarkFileNumberUsed(version_edit->prev_log_number_); + assert(nullptr == manifest_reader->get() || + manifest_reader->get()->file()->file_name() != manifest_path); + s = fs_->FileExists(manifest_path, IOOptions(), nullptr); + if (s.IsNotFound()) { + return Status::TryAgain( + "The primary may have switched to a new MANIFEST and deleted the old " + "one."); + } else if (!s.ok()) { + return s; } - if (version_edit->has_log_number_) { - MarkFileNumberUsed(version_edit->log_number_); + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:0"); + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:1"); + // The primary can also delete the MANIFEST while the secondary is reading + // it. This is OK on POSIX. For other file systems, maybe create a hard link + // to MANIFEST. The hard link should be cleaned up later by the secondary. 
+ s = fs_->NewSequentialFile(manifest_path, + fs_->OptimizeForManifestRead(file_options_), + &manifest_file, nullptr); + std::unique_ptr manifest_file_reader; + if (s.ok()) { + manifest_file_reader.reset(new SequentialFileReader( + std::move(manifest_file), manifest_path, + db_options_->log_readahead_size, io_tracer_, db_options_->listeners)); + manifest_reader->reset(new log::FragmentBufferedReader( + nullptr, std::move(manifest_file_reader), reporter, true /* checksum */, + 0 /* log_number */)); + ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", + manifest_path.c_str()); + if (manifest_tailer_) { + manifest_tailer_->PrepareToReadNewManifest(); + } + } else if (s.IsPathNotFound()) { + // This can happen if the primary switches to a new MANIFEST after the + // secondary reads the CURRENT file but before the secondary actually tries + // to open the MANIFEST. + s = Status::TryAgain( + "The primary may have switched to a new MANIFEST and deleted the old " + "one."); } - column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_); - MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_); return s; } -Status ReactiveVersionSet::MaybeSwitchManifest( - log::Reader::Reporter* reporter, - std::unique_ptr* manifest_reader) { - assert(manifest_reader != nullptr); - Status s; - do { - std::string manifest_path; - s = GetCurrentManifestPath(dbname_, fs_, &manifest_path, - &manifest_file_number_); - std::unique_ptr manifest_file; - if (s.ok()) { - if (nullptr == manifest_reader->get() || - manifest_reader->get()->file()->file_name() != manifest_path) { - TEST_SYNC_POINT( - "ReactiveVersionSet::MaybeSwitchManifest:" - "AfterGetCurrentManifestPath:0"); - TEST_SYNC_POINT( - "ReactiveVersionSet::MaybeSwitchManifest:" - "AfterGetCurrentManifestPath:1"); - s = fs_->NewSequentialFile(manifest_path, - env_->OptimizeForManifestRead(file_options_), - &manifest_file, nullptr); - } else { - // No need to switch manifest. 
- break; - } - } - std::unique_ptr manifest_file_reader; - if (s.ok()) { - manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path, - db_options_->log_readahead_size)); - manifest_reader->reset(new log::FragmentBufferedReader( - nullptr, std::move(manifest_file_reader), reporter, - true /* checksum */, 0 /* log_number */)); - ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", - manifest_path.c_str()); - // TODO (yanqin) every time we switch to a new MANIFEST, we clear the - // active_version_builders_ map because we choose to construct the - // versions from scratch, thanks to the first part of each MANIFEST - // written by VersionSet::WriteCurrentStatetoManifest. This is not - // necessary, but we choose this at present for the sake of simplicity. - active_version_builders_.clear(); - } - } while (s.IsPathNotFound()); - return s; +#ifndef NDEBUG +uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group(); +} +#endif // !NDEBUG + +std::vector& ReactiveVersionSet::replay_buffer() { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().replay_buffer(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,8 +11,9 @@ // newest version is called "current". Older versions may be kept // around to provide a consistent view to live iterators. // -// Each Version keeps track of a set of Table files per level. The -// entire set of versions is maintained in a VersionSet. +// Each Version keeps track of a set of table files per level, as well as a +// set of blob files. 
The entire set of versions is maintained in a +// VersionSet. // // Version,VersionSet are thread-compatible, but require external // synchronization on all accesses. @@ -25,9 +26,12 @@ #include #include #include +#include #include #include +#include "cache/cache_helpers.h" +#include "db/blob/blob_file_meta.h" #include "db/column_family.h" #include "db/compaction/compaction.h" #include "db/compaction/compaction_picker.h" @@ -40,6 +44,7 @@ #include "db/version_builder.h" #include "db/version_edit.h" #include "db/write_controller.h" +#include "env/file_system_tracer.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -55,6 +60,7 @@ class Writer; } +class BlobIndex; class Compaction; class LogBuffer; class LookupKey; @@ -65,6 +71,8 @@ class MergeContext; class ColumnFamilySet; class MergeIteratorBuilder; +class SystemClock; +class ManifestTailer; // VersionEdit is always supposed to be valid and it is used to point at // entries in Manifest. Ideally it should not be used as a container to @@ -102,7 +110,7 @@ // Information of the storage associated with each Version, including number of // levels of LSM tree, files information at each level, files marked for -// compaction, etc. +// compaction, blob files, etc. class VersionStorageInfo { public: VersionStorageInfo(const InternalKeyComparator* internal_comparator, @@ -117,7 +125,9 @@ void Reserve(int level, size_t size) { files_[level].reserve(size); } - void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr); + void AddFile(int level, FileMetaData* f); + + void AddBlobFile(std::shared_ptr blob_file_meta); void SetFinalized(); @@ -140,7 +150,7 @@ // We use compaction scores to figure out which compaction to do next // REQUIRES: db_mutex held!! // TODO find a better way to pass compaction_options_fifo. 
- void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options, + void ComputeCompactionScore(const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options); // Estimate est_comp_needed_bytes_ @@ -153,13 +163,13 @@ // This computes ttl_expired_files_ and is called by // ComputeCompactionScore() - void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions, + void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions, const uint64_t ttl); // This computes files_marked_for_periodic_compaction_ and is called by // ComputeCompactionScore() void ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds); // This computes bottommost_files_marked_for_compaction_ and is called by @@ -174,12 +184,21 @@ // REQUIRES: DB mutex held void ComputeBottommostFilesMarkedForCompaction(); + // This computes files_marked_for_forced_blob_gc_ and is called by + // ComputeCompactionScore() + // + // REQUIRES: DB mutex held + void ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold); + // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); // Sort all files for this version based on their file size and // record results in files_by_compaction_pri_. The largest files are listed // first. 
- void UpdateFilesByCompactionPri(CompactionPri compaction_pri); + void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options); void GenerateLevel0NonOverlapping(); bool level0_non_overlapping() const { @@ -279,6 +298,75 @@ return files_[level]; } + class FileLocation { + public: + FileLocation() = default; + FileLocation(int level, size_t position) + : level_(level), position_(position) {} + + int GetLevel() const { return level_; } + size_t GetPosition() const { return position_; } + + bool IsValid() const { return level_ >= 0; } + + bool operator==(const FileLocation& rhs) const { + return level_ == rhs.level_ && position_ == rhs.position_; + } + + bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); } + + static FileLocation Invalid() { return FileLocation(); } + + private: + int level_ = -1; + size_t position_ = 0; + }; + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + FileLocation GetFileLocation(uint64_t file_number) const { + const auto it = file_locations_.find(file_number); + + if (it == file_locations_.end()) { + return FileLocation::Invalid(); + } + + assert(it->second.GetLevel() < num_levels_); + assert(it->second.GetPosition() < files_[it->second.GetLevel()].size()); + assert(files_[it->second.GetLevel()][it->second.GetPosition()]); + assert(files_[it->second.GetLevel()][it->second.GetPosition()] + ->fd.GetNumber() == file_number); + + return it->second; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const { + auto location = GetFileLocation(file_number); + + if (!location.IsValid()) { + return nullptr; + } + + return files_[location.GetLevel()][location.GetPosition()]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + using BlobFiles = std::map>; + const BlobFiles& GetBlobFiles() const { return blob_files_; } + + uint64_t 
GetTotalBlobFileSize() const { + uint64_t total_blob_bytes = 0; + + for (const auto& pair : blob_files_) { + const auto& meta = pair.second; + assert(meta); + + total_blob_bytes += meta->GetBlobFileSize(); + } + + return total_blob_bytes; + } + const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const { assert(level < static_cast(level_files_brief_.size())); return level_files_brief_[level]; @@ -325,6 +413,14 @@ return bottommost_files_marked_for_compaction_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& FilesMarkedForForcedBlobGC() + const { + assert(finalized_); + return files_marked_for_forced_blob_gc_; + } + int base_level() const { return base_level_; } double level_multiplier() const { return level_multiplier_; } @@ -368,7 +464,7 @@ // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); + uint64_t MaxNextLevelOverlappingBytes(); // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; @@ -395,7 +491,7 @@ next_file_to_compact_by_size_[level] = 0; } - const InternalKeyComparator* InternalComparator() { + const InternalKeyComparator* InternalComparator() const { return internal_comparator_; } @@ -403,7 +499,7 @@ uint64_t MaxBytesForLevel(int level) const; // Must be called after any change to MutableCFOptions. - void CalculateBaseBytes(const ImmutableCFOptions& ioptions, + void CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options); // Returns an estimate of the amount of live data in bytes. @@ -453,6 +549,14 @@ // in increasing order of keys std::vector* files_; + // Map of all table files in version. Maps file number to (level, position on + // level). + using FileLocations = std::unordered_map; + FileLocations file_locations_; + + // Map of blob files in version by number. 
+ BlobFiles blob_files_; + // Level that L0 data should be compacted to. All levels < base_level_ should // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; @@ -499,6 +603,8 @@ autovector> bottommost_files_marked_for_compaction_; + autovector> files_marked_for_forced_blob_gc_; + // Threshold for needing to mark another bottommost file. Maintain it so we // can quickly check when releasing a snapshot whether more bottommost files // became eligible for compaction. It's defined as the min of the max nonzero @@ -553,20 +659,28 @@ }; using MultiGetRange = MultiGetContext::Range; -// A column family's version consists of the SST files owned by the column -// family at a certain point in time. +// A column family's version consists of the table and blob files owned by +// the column family at a certain point in time. class Version { public: // Append to *iters a sequence of iterators that will // yield the contents of this Version when merged together. - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const FileOptions& soptions, + // @param read_options Must outlive any iterator built by + // `merger_iter_builder`. + // REQUIRES: This version has been saved (see VersionSet::SaveTo). + void AddIterators(const ReadOptions& read_options, + const FileOptions& soptions, MergeIteratorBuilder* merger_iter_builder, - RangeDelAggregator* range_del_agg); + RangeDelAggregator* range_del_agg, + bool allow_unprepared_value); - void AddIteratorsForLevel(const ReadOptions&, const FileOptions& soptions, + // @param read_options Must outlive any iterator built by + // `merger_iter_builder`. 
+ void AddIteratorsForLevel(const ReadOptions& read_options, + const FileOptions& soptions, MergeIteratorBuilder* merger_iter_builder, - int level, RangeDelAggregator* range_del_agg); + int level, RangeDelAggregator* range_del_agg, + bool allow_unprepared_value); Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&, const Slice& smallest_user_key, @@ -594,15 +708,39 @@ // If the key has any merge operands then store them in // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held + // REQUIRES: pinned_iters_mgr != nullptr void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, - Status* status, MergeContext* merge_context, + std::string* timestamp, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, + PinnedIteratorsManager* pinned_iters_mgr, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, - ReadCallback* callback = nullptr, bool* is_blob = nullptr); + ReadCallback* callback = nullptr); + + // Interprets blob_index_slice as a blob reference, and (assuming the + // corresponding blob file is part of this Version) retrieves the blob and + // saves it in *value. + // REQUIRES: blob_index_slice stores an encoded blob reference + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + // Retrieves a blob using a blob reference and saves it in *value, + // assuming the corresponding blob file is part of this Version. 
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, + FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, + uint64_t* bytes_read) const; + + using BlobReadRequest = + std::pair>; + using BlobReadRequests = std::vector; + void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range, + std::unordered_map& blob_rqs); // Loads some stats information from files. Call without mutex held. It needs // to be called before applying the version to the version set. @@ -616,8 +754,10 @@ // and return true. Otherwise, return false. bool Unref(); - // Add all files listed in the current version to *live. - void AddLiveFiles(std::vector* live); + // Add all files listed in the current version to *live_table_files and + // *live_blob_files. + void AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const; // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false, bool print_stats = false) const; @@ -662,14 +802,13 @@ ColumnFamilyData* cfd() const { return cfd_; } - // Return the next Version in the linked list. Used for debug only - Version* TEST_Next() const { - return next_; - } + // Return the next Version in the linked list. 
+ Version* Next() const { return next_; } int TEST_refs() const { return refs_; } VersionStorageInfo* storage_info() { return &storage_info_; } + const VersionStorageInfo* storage_info() const { return &storage_info_; } VersionSet* version_set() { return vset_; } @@ -685,9 +824,12 @@ private: Env* env_; - FileSystem* fs_; + SystemClock* clock_; + friend class ReactiveVersionSet; friend class VersionSet; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; const InternalKeyComparator* internal_comparator() const { return storage_info_.internal_comparator_; @@ -696,10 +838,6 @@ return storage_info_.user_comparator_; } - bool PrefixMayMatch(const ReadOptions& read_options, - InternalIterator* level_iter, - const Slice& internal_prefix) const; - // Returns true if the filter blocks in the specified level will not be // checked during read operations. In certain cases (trivial move or preload), // the filter block may already be cached, but we still do not access it such @@ -715,15 +853,11 @@ // This accumulated stats will be used in compaction. void UpdateAccumulatedStats(bool update_stats); - // Sort all files for this version based on their file size and - // record results in files_by_compaction_pri_. The largest files are listed - // first. - void UpdateFilesByCompactionPri(); - ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs Logger* info_log_; Statistics* db_statistics_; TableCache* table_cache_; + BlobFileCache* blob_file_cache_; const MergeOperator* merge_operator_; VersionStorageInfo storage_info_; @@ -733,13 +867,18 @@ int refs_; // Number of live refs to this version const FileOptions file_options_; const MutableCFOptions mutable_cf_options_; + // Cached value to avoid recomputing it on every read. + const size_t max_file_size_for_l0_meta_pin_; // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. 
uint64_t version_number_; + std::shared_ptr io_tracer_; Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, - MutableCFOptions mutable_cf_options, uint64_t version_number = 0); + MutableCFOptions mutable_cf_options, + const std::shared_ptr& io_tracer, + uint64_t version_number = 0); ~Version(); @@ -778,10 +917,24 @@ } }; +class ObsoleteBlobFileInfo { + public: + ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path) + : blob_file_number_(blob_file_number), path_(std::move(path)) {} + + uint64_t GetBlobFileNumber() const { return blob_file_number_; } + const std::string& GetPath() const { return path_; } + + private: + uint64_t blob_file_number_; + std::string path_; +}; + class BaseReferencedVersionBuilder; class AtomicGroupReadBuffer { public: + AtomicGroupReadBuffer() = default; Status AddEdit(VersionEdit* edit); void Clear(); bool IsFull() const; @@ -806,13 +959,26 @@ const FileOptions& file_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer); + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, + const std::string& db_session_id); // No copying allowed VersionSet(const VersionSet&) = delete; void operator=(const VersionSet&) = delete; virtual ~VersionSet(); + Status LogAndApplyToDefaultColumnFamily( + VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr) { + ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); + const MutableCFOptions* cf_options = + default_cf->GetLatestMutableCFOptions(); + return LogAndApply(default_cf, *cf_options, edit, mu, db_directory, + new_descriptor_log, column_family_options); + } + // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new // current version. 
Will release *mu while actually writing to the file. @@ -822,7 +988,7 @@ Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, VersionEdit* edit, - InstrumentedMutex* mu, Directory* db_directory = nullptr, + InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { autovector cfds; @@ -842,8 +1008,9 @@ ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, const autovector& edit_list, InstrumentedMutex* mu, - Directory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = nullptr) { + FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr, + const std::function& manifest_wcb = {}) { autovector cfds; cfds.emplace_back(column_family_data); autovector mutable_cf_options_list; @@ -851,7 +1018,8 @@ autovector> edit_lists; edit_lists.emplace_back(edit_list); return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - db_directory, new_descriptor_log, column_family_options); + db_directory, new_descriptor_log, column_family_options, + {manifest_wcb}); } // The across-multi-cf batch version. 
If edit_lists contain more than @@ -861,14 +1029,17 @@ const autovector& cfds, const autovector& mutable_cf_options_list, const autovector>& edit_lists, - InstrumentedMutex* mu, Directory* db_directory = nullptr, + InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* new_cf_options = nullptr); + const ColumnFamilyOptions* new_cf_options = nullptr, + const std::vector>& manifest_wcbs = + {}); static Status GetCurrentManifestPath(const std::string& dbname, FileSystem* fs, std::string* manifest_filename, uint64_t* manifest_file_number); + void WakeUpWaitingManifestWriters(); // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families @@ -876,6 +1047,18 @@ Status Recover(const std::vector& column_families, bool read_only = false, std::string* db_id = nullptr); + Status TryRecover(const std::vector& column_families, + bool read_only, + const std::vector& files_in_dbname, + std::string* db_id, bool* has_missing_table_file); + + // Try to recover the version set to the most recent consistent state + // recorded in the specified manifest. + Status TryRecoverFromOneManifest( + const std::string& manifest_path, + const std::vector& column_families, + bool read_only, std::string* db_id, bool* has_missing_table_file); + // Reads a manifest file and returns a list of column families in // column_families. 
static Status ListColumnFamilies(std::vector* column_families, @@ -905,6 +1088,8 @@ #endif // ROCKSDB_LITE + const std::string& DbSessionId() const { return db_session_id_; } + // Return the current manifest file number uint64_t manifest_file_number() const { return manifest_file_number_; } @@ -916,8 +1101,8 @@ uint64_t current_next_file_number() const { return next_file_number_.load(); } - uint64_t min_log_number_to_keep_2pc() const { - return min_log_number_to_keep_2pc_.load(); + uint64_t min_log_number_to_keep() const { + return min_log_number_to_keep_.load(); } // Allocate and return a new file number @@ -975,7 +1160,7 @@ // Mark the specified log number as deleted // REQUIRED: this is only called during single-threaded recovery or repair, or // from ::LogAndApply where the global mutex is held. - void MarkMinLogNumberToKeep2PC(uint64_t number); + void MarkMinLogNumberToKeep(uint64_t number); // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. @@ -984,15 +1169,35 @@ // Returns the minimum log number which still has data not flushed to any SST // file. // In non-2PC mode, all the log numbers smaller than this number can be safely - // deleted. + // deleted, although we still use `min_log_number_to_keep_` to determine when + // to delete a WAL file. uint64_t MinLogNumberWithUnflushedData() const { return PreComputeMinLogNumberWithUnflushedData(nullptr); } + + // Returns the minimum log number which still has data not flushed to any SST + // file. + // Empty column families' log number is considered to be + // new_log_number_for_empty_cf. + uint64_t PreComputeMinLogNumberWithUnflushedData( + uint64_t new_log_number_for_empty_cf) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + uint64_t num = + cfd->IsEmpty() ? 
new_log_number_for_empty_cf : cfd->GetLogNumber(); + if (min_log_num > num && !cfd->IsDropped()) { + min_log_num = num; + } + } + return min_log_num; + } // Returns the minimum log number which still has data not flushed to any SST // file, except data from `cfd_to_skip`. uint64_t PreComputeMinLogNumberWithUnflushedData( const ColumnFamilyData* cfd_to_skip) const { - uint64_t min_log_num = std::numeric_limits::max(); + uint64_t min_log_num = port::kMaxUint64; for (auto cfd : *column_family_set_) { if (cfd == cfd_to_skip) { continue; @@ -1005,15 +1210,36 @@ } return min_log_num; } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfds_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const std::unordered_set& cfds_to_skip) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + if (cfds_to_skip.count(cfd)) { + continue; + } + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. + // @param read_options Must outlive the returned iterator. InternalIterator* MakeInputIterator( - const Compaction* c, RangeDelAggregator* range_del_agg, + const ReadOptions& read_options, const Compaction* c, + RangeDelAggregator* range_del_agg, const FileOptions& file_options_compactions); - // Add all files listed in any live version to *live. - void AddLiveFiles(std::vector* live_list); + // Add all files listed in any live version to *live_table_files and + // *live_blob_files. Note that these lists may contain duplicates. 
+ void AddLiveFiles(std::vector* live_table_files, + std::vector* live_blob_files) const; // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). If end_level == -1 it will search @@ -1026,23 +1252,30 @@ // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } - // verify that the files that we started with for a compaction - // still exist in the current version and in the same original level. - // This ensures that a concurrent compaction did not erroneously - // pick the same files to compact. - bool VerifyCompactionFileConsistency(Compaction* c); - Status GetMetadataForFile(uint64_t number, int* filelevel, FileMetaData** metadata, ColumnFamilyData** cfd); // This function doesn't support leveldb SST filenames void GetLiveFilesMetaData(std::vector *metadata); + void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) { + assert(table_cache_); + + table_cache_->Erase(GetSlice(&blob_file_number)); + + obsolete_blob_files_.emplace_back(blob_file_number, std::move(path)); + } + void GetObsoleteFiles(std::vector* files, + std::vector* blob_files, std::vector* manifest_filenames, uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + RefedColumnFamilySet GetRefedColumnFamilySet() { + return RefedColumnFamilySet(GetColumnFamilySet()); + } + const FileOptions& file_options() { return file_options_; } void ChangeFileOptions(const MutableDBOptions& new_options) { file_options_.writable_file_max_buffer_size = @@ -1055,20 +1288,51 @@ static uint64_t GetTotalSstFilesSize(Version* dummy_versions); + static uint64_t GetTotalBlobFileSize(Version* dummy_versions); + + // Get the IO Status returned by written Manifest. + const IOStatus& io_status() const { return io_status_; } + + // The returned WalSet needs to be accessed with DB mutex held. 
+ const WalSet& GetWalSet() const { return wals_; } + + void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) { + assert(cfd); + + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + Version* const version = + new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); + + constexpr bool update_stats = false; + version->PrepareApply(mutable_cf_options, update_stats); + AppendVersion(cfd, version); + } + protected: + using VersionBuilderMap = + std::unordered_map>; + struct ManifestWriter; friend class Version; + friend class VersionEditHandler; + friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class DBImpl; friend class DBImplReadOnly; struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t /*bytes*/, const Status& s) override { - if (this->status->ok()) *this->status = s; + if (status->ok()) { + *status = s; + } } }; + void Reset(); + // Returns approximated offset of a key in a file for a given version. 
uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, const Slice& key, TableReaderCaller caller); @@ -1081,59 +1345,52 @@ struct MutableCFState { uint64_t log_number; + std::string full_history_ts_low; + + explicit MutableCFState() = default; + explicit MutableCFState(uint64_t _log_number, std::string ts_low) + : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {} }; // Save current contents to *log Status WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log); + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s); void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, - VersionEdit* edit); + const VersionEdit* edit); - Status ReadAndRecover( - log::Reader* reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& - name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - VersionEditParams* version_edit, std::string* db_id = nullptr); + Status VerifyFileMetadata(const std::string& fpath, + const FileMetaData& meta) const; - // REQUIRES db mutex - Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_opts, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - VersionEditParams* version_edit); - - Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, - const VersionEdit& from_edit, - VersionEditParams* version_edit_params); + // Protected by DB mutex. 
+ WalSet wals_; std::unique_ptr column_family_set_; - + Cache* table_cache_; Env* const env_; - FileSystem* const fs_; + FileSystemPtr const fs_; + SystemClock* const clock_; const std::string dbname_; std::string db_id_; const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; - // Any log number equal or lower than this should be ignored during recovery, - // and is qualified for being deleted in 2PC mode. In non-2PC mode, this - // number is ignored. - std::atomic min_log_number_to_keep_2pc_ = {0}; + // Any WAL number smaller than this should be ignored during recovery, + // and is qualified for being deleted. + std::atomic min_log_number_to_keep_ = {0}; uint64_t manifest_file_number_; uint64_t options_file_number_; + uint64_t options_file_size_; uint64_t pending_manifest_file_number_; // The last seq visible to reads. It normally indicates the last sequence in // the memtable but when using two write queues it could also indicate the // last sequence in the WAL visible to reads. std::atomic last_sequence_; + // The last sequence number of data committed to the descriptor (manifest + // file). + SequenceNumber descriptor_last_sequence_ = 0; // The last seq that is already allocated. It is applicable only when we have // two write queues. In that case seq might or might not have appreated in // memtable but it is expected to appear in the WAL. @@ -1160,6 +1417,7 @@ uint64_t manifest_file_size_; std::vector obsolete_files_; + std::vector obsolete_blob_files_; std::vector obsolete_manifests_; // env options for all reads and writes except compactions @@ -1167,16 +1425,25 @@ BlockCacheTracer* const block_cache_tracer_; + // Store the IO status when Manifest is written + IOStatus io_status_; + + std::shared_ptr io_tracer_; + + std::string db_session_id_; + private: // REQUIRES db mutex at beginning. 
may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, - InstrumentedMutex* mu, Directory* db_directory, + InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options); - void LogAndApplyCFHelper(VersionEdit* edit); + void LogAndApplyCFHelper(VersionEdit* edit, + SequenceNumber* max_last_sequence); Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, - VersionEdit* edit, InstrumentedMutex* mu); + VersionEdit* edit, SequenceNumber* max_last_sequence, + InstrumentedMutex* mu); }; // ReactiveVersionSet represents a collection of versions of the column @@ -1189,30 +1456,28 @@ const ImmutableDBOptions* _db_options, const FileOptions& _file_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + const std::shared_ptr& io_tracer); ~ReactiveVersionSet() override; Status ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed); Status Recover(const std::vector& column_families, std::unique_ptr* manifest_reader, std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); +#ifndef NDEBUG + uint64_t TEST_read_edits_in_atomic_group() const; +#endif //! 
NDEBUG - uint64_t TEST_read_edits_in_atomic_group() const { - return read_buffer_.TEST_read_edits_in_atomic_group(); - } - std::vector& replay_buffer() { - return read_buffer_.replay_buffer(); - } + std::vector& replay_buffer(); protected: - using VersionSet::ApplyOneVersionEditToBuilder; - // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, std::unordered_set* cfds_changed, @@ -1223,12 +1488,7 @@ std::unique_ptr* manifest_reader); private: - std::unordered_map> - active_version_builders_; - AtomicGroupReadBuffer read_buffer_; - // Number of version edits to skip by ReadAndApply at the beginning of a new - // MANIFEST created by primary. - int number_of_edits_to_skip_; + std::unique_ptr manifest_tailer_; using VersionSet::LogAndApply; using VersionSet::Recover; @@ -1237,9 +1497,10 @@ const autovector& /*cfds*/, const autovector& /*mutable_cf_options_list*/, const autovector>& /*edit_lists*/, - InstrumentedMutex* /*mu*/, Directory* /*db_directory*/, - bool /*new_descriptor_log*/, - const ColumnFamilyOptions* /*new_cf_option*/) override { + InstrumentedMutex* /*mu*/, FSDirectory* /*db_directory*/, + bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, + const std::vector>& /*manifest_wcbs*/) + override { return Status::NotSupported("not supported in reactive mode"); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/version_set_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/version_set_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,9 +8,15 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_set.h" + +#include + #include "db/db_impl/db_impl.h" #include "db/log_writer.h" -#include "logging/logging.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -39,9 +45,11 @@ files_.size() + 1, 0, 0, InternalKey(smallest, smallest_seq, kTypeValue), InternalKey(largest, largest_seq, kTypeValue), smallest_seq, - largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); + largest_seq, /* marked_for_compact */ false, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp); files_.push_back(f); } @@ -95,13 +103,13 @@ return opt; } -class VersionStorageInfoTest : public testing::Test { +class VersionStorageInfoTestBase : public testing::Test { public: const Comparator* ucmp_; InternalKeyComparator icmp_; std::shared_ptr logger_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; @@ -110,17 +118,19 @@ return InternalKey(ukey, smallest_seq, kTypeValue); } - VersionStorageInfoTest() - : ucmp_(BytewiseComparator()), + explicit VersionStorageInfoTestBase(const Comparator* ucmp) + : ucmp_(ucmp), icmp_(ucmp_), logger_(new CountingLogger()), options_(GetOptionsWithNumLevels(6, logger_)), ioptions_(options_), mutable_cf_options_(options_), - vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, nullptr, false) {} + vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, + /*src_vstorage=*/nullptr, + /*_force_consistency_checks=*/false) {} - ~VersionStorageInfoTest() override { - for (int i = 
0; i < vstorage_.num_levels(); i++) { + ~VersionStorageInfoTestBase() override { + for (int i = 0; i < vstorage_.num_levels(); ++i) { for (auto* f : vstorage_.LevelFiles(i)) { if (--f->refs == 0) { delete f; @@ -130,31 +140,56 @@ } void Add(int level, uint32_t file_number, const char* smallest, - const char* largest, uint64_t file_size = 0) { - assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData( - file_number, 0, file_size, GetInternalKey(smallest, 0), - GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, - /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); - f->compensated_file_size = file_size; - vstorage_.AddFile(level, f); + const char* largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + constexpr SequenceNumber dummy_seq = 0; + + Add(level, file_number, GetInternalKey(smallest, dummy_seq), + GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number); } void Add(int level, uint32_t file_number, const InternalKey& smallest, - const InternalKey& largest, uint64_t file_size = 0) { + const InternalKey& largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, /* largest_seq */ 0, /* marked_for_compact */ false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); + Temperature::kUnknown, oldest_blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } + void AddBlob(uint64_t blob_file_number, uint64_t 
total_blob_count, + uint64_t total_blob_bytes, + BlobFileMetaData::LinkedSsts linked_ssts, + uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) { + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + auto meta = + BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes); + + vstorage_.AddBlobFile(std::move(meta)); + } + + void Finalize() { + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + vstorage_.UpdateFilesByCompactionPri(ioptions_, mutable_cf_options_); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); + vstorage_.GenerateLevel0NonOverlapping(); + vstorage_.GenerateBottommostFiles(); + + vstorage_.SetFinalized(); + } + std::string GetOverlappingFiles(int level, const InternalKey& begin, const InternalKey& end) { std::vector inputs; @@ -171,6 +206,13 @@ } }; +class VersionStorageInfoTest : public VersionStorageInfoTestBase { + public: + VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {} + + ~VersionStorageInfoTest() override {} +}; + TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) { ioptions_.level_compaction_dynamic_level_bytes = false; mutable_cf_options_.max_bytes_for_level_base = 10; @@ -362,19 +404,19 @@ Add(2, 3U, "6", "8", 1U); // Partial overlap with last level Add(3, 4U, "1", "9", 1U); // Contains range of last level Add(4, 5U, "4", "5", 1U); // Inside range of last level - Add(4, 5U, "6", "7", 1U); // Inside range of last level - Add(5, 6U, "4", "7", 10U); + Add(4, 6U, "6", "7", 1U); // Inside range of last level + Add(5, 7U, "4", "7", 10U); ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize()); } TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered - Add(0, 1U, "5", "6", 1U); // Ignored because 
of [5,6] in l1 - Add(1, 1U, "1", "2", 1U); // Ignored because of [2,3] in l2 - Add(1, 2U, "3", "4", 1U); // Ignored because of [2,3] in l2 - Add(1, 3U, "5", "6", 1U); - Add(2, 4U, "2", "3", 1U); - Add(3, 5U, "7", "8", 1U); + Add(0, 2U, "5", "6", 1U); // Ignored because of [5,6] in l1 + Add(1, 3U, "1", "2", 1U); // Ignored because of [2,3] in l2 + Add(1, 4U, "3", "4", 1U); // Ignored because of [2,3] in l2 + Add(1, 5U, "5", "6", 1U); + Add(2, 6U, "2", "3", 1U); + Add(3, 7U, "7", "8", 1U); ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize()); } @@ -411,6 +453,244 @@ 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); } +TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) { + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + + Add(2, 7U, "1", "2", 8000U); + + ASSERT_EQ(vstorage_.GetFileLocation(11U), + VersionStorageInfo::FileLocation(0, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(11U), nullptr); + + ASSERT_EQ(vstorage_.GetFileLocation(12U), + VersionStorageInfo::FileLocation(0, 1)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(12U), nullptr); + + ASSERT_EQ(vstorage_.GetFileLocation(7U), + VersionStorageInfo::FileLocation(2, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(7U), nullptr); + + ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid()); + ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) { + // No SST or blob files in VersionStorageInfo + Finalize(); + + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.75; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGC) { + // Add three L0 SSTs (1, 2, and 3) and four blob files (10, 11, 12, and 13). 
+ // The first two SSTs have the same oldest blob file, namely, the very oldest + // one (10), while the third SST's oldest blob file reference points to the + // third blob file (12). Thus, the oldest batch of blob files contains the + // first two blob files 10 and 11, and assuming they are eligible for GC based + // on the age cutoff, compacting away the SSTs 1 and 2 will eliminate them. + + constexpr int level = 0; + + constexpr uint64_t first_sst = 1; + constexpr uint64_t second_sst = 2; + constexpr uint64_t third_sst = 3; + + constexpr uint64_t first_blob = 10; + constexpr uint64_t second_blob = 11; + constexpr uint64_t third_blob = 12; + constexpr uint64_t fourth_blob = 13; + + { + constexpr char smallest[] = "bar1"; + constexpr char largest[] = "foo1"; + constexpr uint64_t file_size = 1000; + + Add(level, first_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar2"; + constexpr char largest[] = "foo2"; + constexpr uint64_t file_size = 2000; + + Add(level, second_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar3"; + constexpr char largest[] = "foo3"; + constexpr uint64_t file_size = 3000; + + Add(level, third_sst, smallest, largest, file_size, third_blob); + } + + { + constexpr uint64_t total_blob_count = 10; + constexpr uint64_t total_blob_bytes = 100000; + constexpr uint64_t garbage_blob_count = 2; + constexpr uint64_t garbage_blob_bytes = 15000; + + AddBlob(first_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{first_sst, second_sst}, + garbage_blob_count, garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 4; + constexpr uint64_t total_blob_bytes = 400000; + constexpr uint64_t garbage_blob_count = 3; + constexpr uint64_t garbage_blob_bytes = 235000; + + AddBlob(second_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t 
total_blob_count = 20; + constexpr uint64_t total_blob_bytes = 1000000; + constexpr uint64_t garbage_blob_count = 8; + constexpr uint64_t garbage_blob_bytes = 123456; + + AddBlob(third_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 128; + constexpr uint64_t total_blob_bytes = 789012345; + constexpr uint64_t garbage_blob_count = 67; + constexpr uint64_t garbage_blob_bytes = 88888888; + + AddBlob(fourth_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + Finalize(); + + assert(vstorage_.num_levels() > 0); + const auto& level_files = vstorage_.LevelFiles(level); + + assert(level_files.size() == 3); + assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst); + assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst); + assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst); + + // No blob files eligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.1; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Part of the oldest batch of blob files (specifically, the second file) is + // ineligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.25; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff but its overall garbage ratio + // is below threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.6; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + 
ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff and its overall garbage ratio + // meets threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.5; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); + ASSERT_EQ(ssts_to_be_compacted.size(), 2); + + std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(), + [](const std::pair& lhs, + const std::pair& rhs) { + assert(lhs.second); + assert(rhs.second); + return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber(); + }); + + const autovector> + expected_ssts_to_be_compacted{{level, level_files[0]}, + {level, level_files[1]}}; + + ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); + ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); + } +} + +class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase { + public: + VersionStorageInfoTimestampTest() + : VersionStorageInfoTestBase(test::ComparatorWithU64Ts()) {} + ~VersionStorageInfoTimestampTest() override {} + std::string Timestamp(uint64_t ts) const { + std::string ret; + PutFixed64(&ret, ts); + return ret; + } + std::string PackUserKeyAndTimestamp(const Slice& ukey, uint64_t ts) const { + std::string ret; + ret.assign(ukey.data(), ukey.size()); + PutFixed64(&ret, ts); + return ret; + } +}; + +TEST_F(VersionStorageInfoTimestampTest, GetOverlappingInputs) { + Add(/*level=*/1, /*file_number=*/1, /*smallest=*/ + {PackUserKeyAndTimestamp("a", /*ts=*/9), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("a", /*ts=*/8), /*s=*/0, kTypeValue}, + /*file_size=*/100); + Add(/*level=*/1, /*file_number=*/2, /*smallest=*/ + {PackUserKeyAndTimestamp("a", /*ts=*/5), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("b", /*ts=*/10), /*s=*/0, kTypeValue}, + /*file_size=*/100); + Add(/*level=*/1, 
/*file_number=*/3, /*smallest=*/ + {PackUserKeyAndTimestamp("c", /*ts=*/12), /*s=*/0, kTypeValue}, + /*largest=*/ + {PackUserKeyAndTimestamp("d", /*ts=*/1), /*s=*/0, kTypeValue}, + /*file_size=*/100); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateLevelFilesBrief(); + ASSERT_EQ( + "1,2", + GetOverlappingFiles( + /*level=*/1, + {PackUserKeyAndTimestamp("a", /*ts=*/12), /*s=*/0, kTypeValue}, + {PackUserKeyAndTimestamp("a", /*ts=*/11), /*s=*/0, kTypeValue})); + ASSERT_EQ("3", + GetOverlappingFiles( + /*level=*/1, + {PackUserKeyAndTimestamp("c", /*ts=*/15), /*s=*/0, kTypeValue}, + {PackUserKeyAndTimestamp("c", /*ts=*/2), /*s=*/0, kTypeValue})); +} class FindLevelFileTest : public testing::Test { public: @@ -611,40 +891,69 @@ const static std::string kColumnFamilyName3; int num_initial_edits_; - VersionSetTestBase() - : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("version_set_test")), - db_options_(), + explicit VersionSetTestBase(const std::string& name) + : env_(nullptr), + dbname_(test::PerThreadDBPath(name)), + options_(), + db_options_(options_), + cf_options_(options_), + immutable_options_(db_options_, cf_options_), mutable_cf_options_(cf_options_), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), mock_table_factory_(std::make_shared()) { - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); + if (env_ == Env::Default() && getenv("MEM_ENV")) { + env_guard_.reset(NewMemEnv(Env::Default())); + env_ = env_guard_.get(); + } + EXPECT_NE(nullptr, env_); + fs_ = env_->GetFileSystem(); + EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr)); + + options_.env = env_; db_options_.env = env_; db_options_.fs = fs_; - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - 
/*block_cache_tracer=*/nullptr)), - reactive_versions_ = std::make_shared( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_); + immutable_options_.env = env_; + immutable_options_.fs = fs_; + immutable_options_.clock = env_->GetSystemClock().get(); + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + reactive_versions_ = std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, nullptr); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); } - void PrepareManifest(std::vector* column_families, - SequenceNumber* last_seqno, - std::unique_ptr* log_writer) { + virtual ~VersionSetTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "DB is still at %s\n", dbname_.c_str()); + } else { + Options options; + options.env = env_; + EXPECT_OK(DestroyDB(dbname_, options)); + } + } + + protected: + virtual void PrepareManifest( + std::vector* column_families, + SequenceNumber* last_seqno, std::unique_ptr* log_writer) { assert(column_families != nullptr); assert(last_seqno != nullptr); assert(log_writer != nullptr); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); @@ -671,13 +980,13 @@ } *last_seqno = last_seq; num_initial_edits_ = static_cast(new_cfs.size() + 1); + std::unique_ptr file_writer; const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = 
env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; @@ -700,27 +1009,104 @@ // Create DB with 3 column families. void NewDB() { - std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; SetIdentityFile(env_, dbname_); - PrepareManifest(&column_families, &last_seqno, &log_writer); + PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. - Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); } + void ReopenDB() { + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families_, false)); + } + + void VerifyManifest(std::string* manifest_path) const { + assert(manifest_path != nullptr); + uint64_t manifest_file_number = 0; + Status s = versions_->GetCurrentManifestPath( + dbname_, fs_.get(), manifest_path, &manifest_file_number); + ASSERT_OK(s); + ASSERT_EQ(1, manifest_file_number); + } + + Status LogAndApplyToDefaultCF(VersionEdit& edit) { + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + return s; + } + 
+ Status LogAndApplyToDefaultCF( + const autovector>& edits) { + autovector vedits; + for (auto& e : edits) { + vedits.push_back(e.get()); + } + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, vedits, &mutex_); + mutex_.Unlock(); + return s; + } + + void CreateNewManifest() { + constexpr FSDirectory* db_directory = nullptr; + constexpr bool new_descriptor_log = true; + mutex_.Lock(); + VersionEdit dummy; + ASSERT_OK(versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + &dummy, &mutex_, db_directory, new_descriptor_log)); + mutex_.Unlock(); + } + + ColumnFamilyData* CreateColumnFamily(const std::string& cf_name, + const ColumnFamilyOptions& cf_options) { + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + new_cf.SetColumnFamily(new_id); + new_cf.SetLogNumber(0); + new_cf.SetComparatorName(cf_options.comparator->Name()); + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(/*column_family_data=*/nullptr, + MutableCFOptions(cf_options), &new_cf, &mutex_, + /*db_directory=*/nullptr, + /*new_descriptor_log=*/false, &cf_options); + mutex_.Unlock(); + EXPECT_OK(s); + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(cf_name); + EXPECT_NE(nullptr, cfd); + return cfd; + } + + Env* mem_env_; Env* env_; + std::shared_ptr env_guard_; std::shared_ptr fs_; const std::string dbname_; EnvOptions env_options_; + Options options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; + ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; std::shared_ptr table_cache_; WriteController write_controller_; @@ -730,6 +1116,7 @@ InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; + std::vector column_families_; }; const std::string VersionSetTestBase::kColumnFamilyName1 = "alice"; @@ -738,7 
+1125,7 @@ class VersionSetTest : public VersionSetTestBase, public testing::Test { public: - VersionSetTest() : VersionSetTestBase() {} + VersionSetTest() : VersionSetTestBase("version_set_test") {} }; TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { @@ -777,10 +1164,849 @@ EXPECT_EQ(kGroupSize - 1, count); } +TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) { + // Initialize the database and add a couple of blob files, one with some + // garbage in it, and one without any garbage. + NewDB(); + + assert(versions_); + assert(versions_->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const version = cfd->current(); + assert(version); + + VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + { + constexpr uint64_t blob_file_number = 123; + constexpr uint64_t total_blob_count = 456; + constexpr uint64_t total_blob_bytes = 77777777; + constexpr char checksum_method[] = "SHA1"; + constexpr char checksum_value[] = + "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c" + "\x52\x5c\xbd"; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + constexpr uint64_t garbage_blob_count = 89; + constexpr uint64_t garbage_blob_bytes = 1000000; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + } + + { + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, checksum_method, + checksum_value); + + constexpr 
uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + auto meta = BlobFileMetaData::Create( + std::move(shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + storage_info->AddBlobFile(std::move(meta)); + } + + // Force the creation of a new manifest file and make sure metadata for + // the blob files is re-persisted. + size_t addition_encoded = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileAddition::EncodeTo::CustomFields", + [&](void* /* arg */) { ++addition_encoded; }); + + size_t garbage_encoded = 0; + SyncPoint::GetInstance()->SetCallBack( + "BlobFileGarbage::EncodeTo::CustomFields", + [&](void* /* arg */) { ++garbage_encoded; }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateNewManifest(); + + ASSERT_EQ(addition_encoded, 2); + ASSERT_EQ(garbage_encoded, 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(VersionSetTest, AddLiveBlobFiles) { + // Initialize the database and add a blob file. 
+ NewDB(); + + assert(versions_); + assert(versions_->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const first_version = cfd->current(); + assert(first_version); + + VersionStorageInfo* const first_storage_info = first_version->storage_info(); + assert(first_storage_info); + + constexpr uint64_t first_blob_file_number = 234; + constexpr uint64_t first_total_blob_count = 555; + constexpr uint64_t first_total_blob_bytes = 66666; + constexpr char first_checksum_method[] = "CRC32"; + constexpr char first_checksum_value[] = "\x3d\x87\xff\x57"; + + auto first_shared_meta = SharedBlobFileMetaData::Create( + first_blob_file_number, first_total_blob_count, first_total_blob_bytes, + first_checksum_method, first_checksum_value); + + constexpr uint64_t garbage_blob_count = 0; + constexpr uint64_t garbage_blob_bytes = 0; + + auto first_meta = BlobFileMetaData::Create( + std::move(first_shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + first_storage_info->AddBlobFile(first_meta); + + // Reference the version so it stays alive even after the following version + // edit. + first_version->Ref(); + + // Get live files directly from version. + std::vector version_table_files; + std::vector version_blob_files; + + first_version->AddLiveFiles(&version_table_files, &version_blob_files); + + ASSERT_EQ(version_blob_files.size(), 1); + ASSERT_EQ(version_blob_files[0], first_blob_file_number); + + // Create a new version containing an additional blob file. 
+ versions_->TEST_CreateAndAppendVersion(cfd); + + Version* const second_version = cfd->current(); + assert(second_version); + assert(second_version != first_version); + + VersionStorageInfo* const second_storage_info = + second_version->storage_info(); + assert(second_storage_info); + + constexpr uint64_t second_blob_file_number = 456; + constexpr uint64_t second_total_blob_count = 100; + constexpr uint64_t second_total_blob_bytes = 2000000; + constexpr char second_checksum_method[] = "CRC32B"; + constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a"; + + auto second_shared_meta = SharedBlobFileMetaData::Create( + second_blob_file_number, second_total_blob_count, second_total_blob_bytes, + second_checksum_method, second_checksum_value); + + auto second_meta = BlobFileMetaData::Create( + std::move(second_shared_meta), BlobFileMetaData::LinkedSsts(), + garbage_blob_count, garbage_blob_bytes); + + second_storage_info->AddBlobFile(std::move(first_meta)); + second_storage_info->AddBlobFile(std::move(second_meta)); + + // Get all live files from version set. Note that the result contains + // duplicates. + std::vector all_table_files; + std::vector all_blob_files; + + versions_->AddLiveFiles(&all_table_files, &all_blob_files); + + ASSERT_EQ(all_blob_files.size(), 3); + ASSERT_EQ(all_blob_files[0], first_blob_file_number); + ASSERT_EQ(all_blob_files[1], first_blob_file_number); + ASSERT_EQ(all_blob_files[2], second_blob_file_number); + + // Clean up previous version. + first_version->Unref(); +} + +TEST_F(VersionSetTest, ObsoleteBlobFile) { + // Initialize the database and add a blob file that is entirely garbage + // and thus can immediately be marked obsolete. 
+ NewDB(); + + VersionEdit edit; + + constexpr uint64_t blob_file_number = 234; + constexpr uint64_t total_blob_count = 555; + constexpr uint64_t total_blob_bytes = 66666; + constexpr char checksum_method[] = "CRC32"; + constexpr char checksum_value[] = "\x3d\x87\xff\x57"; + + edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, + checksum_method, checksum_value); + + edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes); + + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + + ASSERT_OK(s); + + // Make sure blob files from the pending number range are not returned + // as obsolete. + { + std::vector table_files; + std::vector blob_files; + std::vector manifest_files; + constexpr uint64_t min_pending_output = blob_file_number; + + versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files, + min_pending_output); + + ASSERT_TRUE(blob_files.empty()); + } + + // Make sure the blob file is returned as obsolete if it's not in the pending + // range. + { + std::vector table_files; + std::vector blob_files; + std::vector manifest_files; + constexpr uint64_t min_pending_output = blob_file_number + 1; + + versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files, + min_pending_output); + + ASSERT_EQ(blob_files.size(), 1); + ASSERT_EQ(blob_files[0].GetBlobFileNumber(), blob_file_number); + } + + // Make sure it's not returned a second time. + { + std::vector table_files; + std::vector blob_files; + std::vector manifest_files; + constexpr uint64_t min_pending_output = blob_file_number + 1; + + versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files, + min_pending_output); + + ASSERT_TRUE(blob_files.empty()); + } +} + +TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) { + NewDB(); + + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. 
+ for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. + ASSERT_EQ(versions.size(), 1); + ASSERT_EQ(versions[0], nullptr); +} + +// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit. +TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) { + NewDB(); + + const std::string kDBId = "db_db"; + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. 
+ ASSERT_EQ(versions.size(), 1); + ASSERT_NE(versions[0], nullptr); +} + +TEST_F(VersionSetTest, WalAddition) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced for several times before closing. + { + for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) { + uint64_t size = kSizeInBytes - size_delta; + WalMetadata wal(size); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size); + } + } + + // The WAL is closed. + { + WalMetadata wal(kSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Recover a new VersionSet. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalCloseWithoutSync) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced before closing. + { + WalMetadata wal(kSyncedSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } + + // A new WAL with larger log number is created, + // implicitly marking the current WAL closed. 
+ { + VersionEdit edit; + edit.AddWal(kLogNumber + 1); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize()); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalDeletion) { + NewDB(); + + constexpr WalNumber kClosedLogNumber = 10; + constexpr WalNumber kNonClosedLogNumber = 20; + constexpr uint64_t kSizeInBytes = 111; + + // Add a non-closed and a closed WAL. + { + VersionEdit edit; + edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes)); + edit.AddWal(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Delete the closed WAL. 
+ { + VersionEdit edit; + edit.DeleteWalsBefore(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Recover a new VersionSet, only the non-closed WAL should show up. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Force the creation of a new MANIFEST file, + // only the non-closed WAL should be written to the new MANIFEST. + { + std::vector wal_additions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) { + VersionEdit* edit = reinterpret_cast(arg); + ASSERT_TRUE(edit->IsWalAddition()); + for (auto& addition : edit->GetWalAdditions()) { + wal_additions.push_back(addition); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateNewManifest(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(wal_additions.size(), 1); + ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber); + ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize()); + } + + // Recover from the new MANIFEST, only the non-closed WAL should show up. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } +} + +TEST_F(VersionSetTest, WalCreateTwice) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); +} + +TEST_F(VersionSetTest, WalCreateAfterClose) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + edit.AddWal(kLogNumber); + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Create the same WAL again. + VersionEdit edit; + edit.AddWal(kLogNumber); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, AddWalWithSmallerSize) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Add the same WAL with smaller synced size. 
+ VersionEdit edit; + WalMetadata wal(kSizeInBytes / 2); + edit.AddWal(kLogNumber, wal); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + s.ToString().find( + "WAL 10 must not have smaller synced size than previous one") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { + NewDB(); + + constexpr WalNumber kLogNumber0 = 10; + constexpr WalNumber kLogNumber1 = 20; + constexpr WalNumber kNonExistingNumber = 15; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add closed WALs. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber0, wal); + edit.AddWal(kLogNumber1, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Delete WALs before a non-existing WAL. + VersionEdit edit; + edit.DeleteWalsBefore(kNonExistingNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, WAL0 is deleted, WAL1 is not. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber1) != wals.end()); + } +} + +TEST_F(VersionSetTest, DeleteAllWals) { + NewDB(); + + constexpr WalNumber kMaxLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kMaxLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + VersionEdit edit; + edit.DeleteWalsBefore(kMaxLogNumber + 10); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, all WALs are deleted. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 0); + } +} + +TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { + NewDB(); + + constexpr int kAtomicGroupSize = 7; + constexpr uint64_t kNumWals = 5; + const std::string kDBId = "db_db"; + + int remaining = kAtomicGroupSize; + autovector> edits; + // Add 5 WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + edits.back()->MarkAtomicGroup(--remaining); + } + // One edit with the min log number set. + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + edits.back()->MarkAtomicGroup(--remaining); + // Delete the first added 4 WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals); + edits.back()->MarkAtomicGroup(--remaining); + ASSERT_EQ(remaining, 0); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + // Recover a new VersionSet, the min log number and the last WAL should be + // kept. 
+ { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + std::string db_id; + ASSERT_OK( + new_versions->Recover(column_families_, /*read_only=*/false, &db_id)); + + ASSERT_EQ(db_id, kDBId); + + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNumWals) != wals.end()); + ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize()); + ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals); + } +} + +class VersionSetWithTimestampTest : public VersionSetTest { + public: + static const std::string kNewCfName; + + explicit VersionSetWithTimestampTest() : VersionSetTest() {} + + void SetUp() override { + NewDB(); + Options options; + options.comparator = test::ComparatorWithU64Ts(); + cfd_ = CreateColumnFamily(kNewCfName, options); + EXPECT_NE(nullptr, cfd_); + EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions()); + column_families_.emplace_back(kNewCfName, options); + } + + void TearDown() override { + for (auto* e : edits_) { + delete e; + } + edits_.clear(); + } + + void GenVersionEditsToSetFullHistoryTsLow( + const std::vector& ts_lbs) { + for (const auto ts_lb : ts_lbs) { + VersionEdit* edit = new VersionEdit; + edit->SetColumnFamily(cfd_->GetID()); + std::string ts_str = test::EncodeInt(ts_lb); + edit->SetFullHistoryTsLow(ts_str); + edits_.emplace_back(edit); + } + } + + void VerifyFullHistoryTsLow(uint64_t expected_ts_low) { + std::unique_ptr vset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false, + /*db_id=*/nullptr)); + for (auto* cfd : *(vset->GetColumnFamilySet())) { + ASSERT_NE(nullptr, cfd); + if 
(cfd->GetName() == kNewCfName) { + ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow()); + } else { + ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty()); + } + } + } + + void DoTest(const std::vector& ts_lbs) { + if (ts_lbs.empty()) { + return; + } + + GenVersionEditsToSetFullHistoryTsLow(ts_lbs); + + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), + edits_, &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); + } + + protected: + ColumnFamilyData* cfd_{nullptr}; + // edits_ must contain and own pointers to heap-alloc VersionEdit objects. + autovector edits_; +}; + +const std::string VersionSetWithTimestampTest::kNewCfName("new_cf"); + +TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) { + constexpr uint64_t kTsLow = 100; + DoTest({kTsLow}); +} + +// Simulate the application increasing full_history_ts_low. +TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) { + const std::vector ts_lbs = {100, 101, 102, 103}; + DoTest(ts_lbs); +} + +// Simulate the application trying to decrease full_history_ts_low +// unsuccessfully. If the application calls public API sequentially to +// decrease the lower bound ts, RocksDB will return an InvalidArgument +// status before involving VersionSet. Only when multiple threads trying +// to decrease the lower bound concurrently will this case ever happen. Even +// so, the lower bound cannot be decreased. The application will be notified +// via return value of the API. 
+TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) { + const std::vector ts_lbs = {103, 102, 101, 100}; + DoTest(ts_lbs); +} + class VersionSetAtomicGroupTest : public VersionSetTestBase, public testing::Test { public: - VersionSetAtomicGroupTest() : VersionSetTestBase() {} + VersionSetAtomicGroupTest() + : VersionSetTestBase("version_set_atomic_group_test") {} void SetUp() override { PrepareManifest(&column_families_, &last_seqno_, &log_writer_); @@ -796,7 +2022,7 @@ edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { @@ -808,7 +2034,7 @@ edits_[i].MarkAtomicGroup(--remaining); edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupCorruptedAtomicGroup(int atomic_group_size) { @@ -822,7 +2048,7 @@ } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupIncorrectAtomicGroup(int atomic_group_size) { @@ -838,7 +2064,7 @@ } edits_[i].SetLastSequence(last_seqno_++); } - ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr)); } void SetupTestSyncPoints() { @@ -860,13 +2086,10 @@ last_in_atomic_group_ = true; }); SyncPoint::GetInstance()->SetCallBack( - "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { - num_recovered_edits_ = *reinterpret_cast(arg); + "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) { + num_recovered_edits_ = *reinterpret_cast(arg); }); SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AppliedEdits", - [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); - 
SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:AtomicGroup", [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); SyncPoint::GetInstance()->SetCallBack( @@ -904,8 +2127,7 @@ bool first_in_atomic_group_ = false; bool last_in_atomic_group_ = false; int num_edits_in_atomic_group_ = 0; - int num_recovered_edits_ = 0; - int num_applied_edits_ = 0; + size_t num_recovered_edits_ = 0; VersionEdit corrupted_edit_; VersionEdit edit_with_incorrect_group_size_; std::unique_ptr log_writer_; @@ -921,7 +2143,6 @@ EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -943,7 +2164,6 @@ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -956,20 +2176,20 @@ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, &manifest_reporter, &manifest_reader_status)); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); AddNewEditsToLog(kAtomicGroupSize); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); // The recover should clean up the replay buffer. 
EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); + EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -985,7 +2205,6 @@ EXPECT_FALSE(last_in_atomic_group_); EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1017,14 +2236,13 @@ InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); // Reactive version set should be empty now. EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1041,13 +2259,14 @@ &manifest_reader_status)); EXPECT_EQ(column_families_.size(), reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); // Write a few edits in an atomic group. 
AddNewEditsToLog(kNumberOfPersistedVersionEdits); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_FALSE(last_in_atomic_group_); @@ -1056,8 +2275,6 @@ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == kNumberOfPersistedVersionEdits); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1104,8 +2321,8 @@ // Write the corrupted edits. AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), corrupted_edit_.DebugString()); @@ -1154,8 +2371,8 @@ &manifest_reader_status)); AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[1].DebugString(), edit_with_incorrect_group_size_.DebugString()); @@ -1164,7 +2381,8 @@ class VersionSetTestDropOneCF : public VersionSetTestBase, public testing::TestWithParam { public: - VersionSetTestDropOneCF() : VersionSetTestBase() {} + VersionSetTestDropOneCF() + : VersionSetTestBase("version_set_test_drop_one_cf") {} }; // This test simulates the following execution sequence @@ -1189,7 +2407,7 @@ SequenceNumber last_seqno; std::unique_ptr log_writer; PrepareManifest(&column_families, &last_seqno, &log_writer); 
- Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); @@ -1268,10 +2486,7 @@ mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); - if (cfd_to_drop->Unref()) { - delete cfd_to_drop; - cfd_to_drop = nullptr; - } + cfd_to_drop->UnrefAndTryDelete(); } INSTANTIATE_TEST_CASE_P( @@ -1279,6 +2494,737 @@ testing::Values(VersionSetTestBase::kColumnFamilyName1, VersionSetTestBase::kColumnFamilyName2, VersionSetTestBase::kColumnFamilyName3)); + +class EmptyDefaultCfNewManifest : public VersionSetTestBase, + public testing::Test { + public: + EmptyDefaultCfNewManifest() : VersionSetTestBase("version_set_new_db_test") {} + // Emulate DBImpl::NewDB() + void PrepareManifest(std::vector* /*column_families*/, + SequenceNumber* /*last_seqno*/, + std::unique_ptr* log_writer) override { + assert(log_writer != nullptr); + VersionEdit new_db; + new_db.SetLogNumber(0); + const std::string manifest_path = DescriptorFileName(dbname_, 1); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); + ASSERT_OK(s); + log_writer->reset(new log::Writer(std::move(file_writer), 0, true)); + std::string record; + ASSERT_TRUE(new_db.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + // Create new column family + VersionEdit new_cf; + new_cf.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1); + new_cf.SetColumnFamily(1); + new_cf.SetLastSequence(2); + new_cf.SetNextFile(2); + record.clear(); + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + + protected: + bool write_dbid_to_manifest_ = false; + std::unique_ptr log_writer_; +}; + +// Create db, create column family. Cf creation will switch to a new MANIFEST. 
+// Then reopen db, trying to recover. +TEST_F(EmptyDefaultCfNewManifest, Recover) { + PrepareManifest(nullptr, nullptr, &log_writer_); + log_writer_.reset(); + Status s = + SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + std::string manifest_path; + VerifyManifest(&manifest_path); + std::vector column_families; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + column_families.emplace_back(VersionSetTestBase::kColumnFamilyName1, + cf_options_); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest( + manifest_path, column_families, false, &db_id, &has_missing_table_file); + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); +} + +class VersionSetTestEmptyDb + : public VersionSetTestBase, + public testing::TestWithParam< + std::tuple>> { + public: + static const std::string kUnknownColumnFamilyName; + VersionSetTestEmptyDb() : VersionSetTestBase("version_set_test_empty_db") {} + + protected: + void PrepareManifest(std::vector* /*column_families*/, + SequenceNumber* /*last_seqno*/, + std::unique_ptr* log_writer) override { + assert(nullptr != log_writer); + VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } + const std::string manifest_path = DescriptorFileName(dbname_, 1); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); + ASSERT_OK(s); + { + log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); + std::string record; + new_db.EncodeTo(&record); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + } + + std::unique_ptr log_writer_; +}; + +const 
std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown"; + +TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + log_writer_.reset(); + Status s = + SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector cf_names = std::get<2>(GetParam()); + + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Only a subset of column families in the MANIFEST. 
+ VersionEdit new_cf1; + new_cf1.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1); + new_cf1.SetColumnFamily(1); + Status s; + { + std::string record; + new_cf1.EncodeTo(&record); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Write all column families but no log_number, next_file_number and + // last_sequence. 
+ const std::vector all_cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; + Status s; + for (size_t i = 1; i != all_cf_names.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(all_cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Write all column families but no log_number, next_file_number and + // last_sequence. 
+ const std::vector all_cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; + Status s; + for (size_t i = 1; i != all_cf_names.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(all_cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + { + VersionEdit tmp_edit; + tmp_edit.SetColumnFamily(4); + tmp_edit.SetLogNumber(0); + tmp_edit.SetNextFile(2); + tmp_edit.SetLastSequence(0); + std::string record; + ASSERT_TRUE(tmp_edit.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_NE(s.ToString().find(manifest_path), std::string::npos); + ASSERT_TRUE(s.IsCorruption()); + } +} + +TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) { + db_options_.write_dbid_to_manifest = std::get<0>(GetParam()); + PrepareManifest(nullptr, nullptr, &log_writer_); + // Write all column families but no log_number, next_file_number and + // last_sequence. 
+ const std::vector all_cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; + Status s; + for (size_t i = 1; i != all_cf_names.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(all_cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + { + VersionEdit tmp_edit; + tmp_edit.SetLogNumber(0); + tmp_edit.SetNextFile(2); + tmp_edit.SetLastSequence(0); + std::string record; + ASSERT_TRUE(tmp_edit.EncodeTo(&record)); + s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + log_writer_.reset(); + s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr); + ASSERT_OK(s); + + std::string manifest_path; + VerifyManifest(&manifest_path); + + bool read_only = std::get<1>(GetParam()); + const std::vector& cf_names = std::get<2>(GetParam()); + std::vector column_families; + for (const auto& cf_name : cf_names) { + column_families.emplace_back(cf_name, cf_options_); + } + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families, + read_only, &db_id, + &has_missing_table_file); + auto iter = + std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName); + if (iter == cf_names.end()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else if (read_only) { + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + } else if (cf_names.size() == all_cf_names.size()) { + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + } else if (cf_names.size() < all_cf_names.size()) { + ASSERT_TRUE(s.IsInvalidArgument()); + } else { + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily( + kUnknownColumnFamilyName); + ASSERT_EQ(nullptr, cfd); + } +} + +INSTANTIATE_TEST_CASE_P( + BestEffortRecovery, VersionSetTestEmptyDb, + 
testing::Combine( + /*write_dbid_to_manifest=*/testing::Bool(), + /*read_only=*/testing::Bool(), + /*cf_names=*/ + testing::Values( + std::vector(), + std::vector({kDefaultColumnFamilyName}), + std::vector({VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3}), + std::vector({kDefaultColumnFamilyName, + VersionSetTestBase::kColumnFamilyName1}), + std::vector({kDefaultColumnFamilyName, + VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3}), + std::vector( + {kDefaultColumnFamilyName, + VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3, + VersionSetTestEmptyDb::kUnknownColumnFamilyName})))); + +class VersionSetTestMissingFiles : public VersionSetTestBase, + public testing::Test { + public: + VersionSetTestMissingFiles() + : VersionSetTestBase("version_set_test_missing_files"), + block_based_table_options_(), + table_factory_(std::make_shared( + block_based_table_options_)), + internal_comparator_( + std::make_shared(options_.comparator)) {} + + protected: + void PrepareManifest(std::vector* column_families, + SequenceNumber* last_seqno, + std::unique_ptr* log_writer) override { + assert(column_families != nullptr); + assert(last_seqno != nullptr); + assert(log_writer != nullptr); + const std::string manifest = DescriptorFileName(dbname_, 1); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); + ASSERT_OK(s); + log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); + VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); + std::string db_id; + 
impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } + { + std::string record; + ASSERT_TRUE(new_db.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + const std::vector cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + uint32_t cf_id = 1; // default cf id is 0 + cf_options_.table_factory = table_factory_; + for (const auto& cf_name : cf_names) { + column_families->emplace_back(cf_name, cf_options_); + if (cf_name == kDefaultColumnFamilyName) { + continue; + } + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_name); + new_cf.SetColumnFamily(cf_id); + std::string record; + ASSERT_TRUE(new_cf.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + + VersionEdit cf_files; + cf_files.SetColumnFamily(cf_id); + cf_files.SetLogNumber(0); + record.clear(); + ASSERT_TRUE(cf_files.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + ++cf_id; + } + SequenceNumber seq = 2; + { + VersionEdit edit; + edit.SetNextFile(7); + edit.SetLastSequence(seq); + std::string record; + ASSERT_TRUE(edit.EncodeTo(&record)); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + *last_seqno = seq + 1; + } + + struct SstInfo { + uint64_t file_number; + std::string column_family; + std::string key; // the only key + int level = 0; + SstInfo(uint64_t file_num, const std::string& cf_name, + const std::string& _key) + : SstInfo(file_num, cf_name, _key, 0) {} + SstInfo(uint64_t file_num, const std::string& cf_name, + const std::string& _key, int lvl) + : file_number(file_num), + column_family(cf_name), + key(_key), + level(lvl) {} + }; + + // Create dummy sst, return their metadata. Note that only file name and size + // are used. 
+ void CreateDummyTableFiles(const std::vector& file_infos, + std::vector* file_metas) { + assert(file_metas != nullptr); + for (const auto& info : file_infos) { + uint64_t file_num = info.file_number; + std::string fname = MakeTableFileName(dbname_, file_num); + std::unique_ptr file; + Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr); + ASSERT_OK(s); + std::unique_ptr fwriter(new WritableFileWriter( + std::move(file), fname, FileOptions(), env_->GetSystemClock().get())); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(table_factory_->NewTableBuilder( + TableBuilderOptions( + immutable_options_, mutable_cf_options_, *internal_comparator_, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + info.column_family, info.level), + fwriter.get())); + InternalKey ikey(info.key, 0, ValueType::kTypeValue); + builder->Add(ikey.Encode(), "value"); + ASSERT_OK(builder->Finish()); + fwriter->Flush(); + uint64_t file_size = 0; + s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); + ASSERT_OK(s); + ASSERT_NE(0, file_size); + file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, + ikey, 0, 0, false, Temperature::kUnknown, 0, 0, + 0, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + } + } + + // This method updates last_sequence_. 
+ void WriteFileAdditionAndDeletionToManifest( + uint32_t cf, const std::vector>& added_files, + const std::vector>& deleted_files) { + VersionEdit edit; + edit.SetColumnFamily(cf); + for (const auto& elem : added_files) { + int level = elem.first; + edit.AddFile(level, elem.second); + } + for (const auto& elem : deleted_files) { + int level = elem.first; + edit.DeleteFile(level, elem.second); + } + edit.SetLastSequence(last_seqno_); + ++last_seqno_; + assert(log_writer_.get() != nullptr); + std::string record; + ASSERT_TRUE(edit.EncodeTo(&record)); + Status s = log_writer_->AddRecord(record); + ASSERT_OK(s); + } + + BlockBasedTableOptions block_based_table_options_; + std::shared_ptr table_factory_; + std::shared_ptr internal_comparator_; + std::vector column_families_; + SequenceNumber last_seqno_; + std::unique_ptr log_writer_; +}; + +TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { + std::vector existing_files = { + SstInfo(100, kDefaultColumnFamilyName, "a"), + SstInfo(102, kDefaultColumnFamilyName, "b"), + SstInfo(103, kDefaultColumnFamilyName, "c"), + SstInfo(107, kDefaultColumnFamilyName, "d"), + SstInfo(110, kDefaultColumnFamilyName, "e")}; + std::vector file_metas; + CreateDummyTableFiles(existing_files, &file_metas); + + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + std::vector> added_files; + for (uint64_t file_num = 10; file_num < 15; ++file_num) { + std::string smallest_ukey = "a"; + std::string largest_ukey = "b"; + InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue); + InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue); + FileMetaData meta = FileMetaData( + file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, + largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kDisableUserTimestamp, kDisableUserTimestamp); + added_files.emplace_back(0, meta); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, 
std::vector>()); + std::vector> deleted_files; + deleted_files.emplace_back(0, 10); + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, std::vector>(), deleted_files); + log_writer_.reset(); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + std::string manifest_path; + VerifyManifest(&manifest_path); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, + /*read_only=*/false, &db_id, + &has_missing_table_file); + ASSERT_OK(s); + ASSERT_TRUE(has_missing_table_file); + for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + const std::vector& files = vstorage->LevelFiles(0); + ASSERT_TRUE(files.empty()); + } +} + +TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { + std::vector existing_files = { + SstInfo(100, kDefaultColumnFamilyName, "a"), + SstInfo(102, kDefaultColumnFamilyName, "b"), + SstInfo(103, kDefaultColumnFamilyName, "c"), + SstInfo(107, kDefaultColumnFamilyName, "d"), + SstInfo(110, kDefaultColumnFamilyName, "e")}; + std::vector file_metas; + CreateDummyTableFiles(existing_files, &file_metas); + + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + std::vector> added_files; + for (size_t i = 3; i != 5; ++i) { + added_files.emplace_back(0, file_metas[i]); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, std::vector>()); + + added_files.clear(); + for (uint64_t file_num = 120; file_num < 130; ++file_num) { + std::string smallest_ukey = "a"; + std::string largest_ukey = "b"; + InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue); + InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue); + FileMetaData meta = FileMetaData( + file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, + largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + 
kDisableUserTimestamp, kDisableUserTimestamp); + added_files.emplace_back(0, meta); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, std::vector>()); + log_writer_.reset(); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + std::string manifest_path; + VerifyManifest(&manifest_path); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, + /*read_only=*/false, &db_id, + &has_missing_table_file); + ASSERT_OK(s); + ASSERT_TRUE(has_missing_table_file); + for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + const std::vector& files = vstorage->LevelFiles(0); + if (cfd->GetName() == kDefaultColumnFamilyName) { + ASSERT_EQ(2, files.size()); + for (const auto* fmeta : files) { + if (fmeta->fd.GetNumber() != 107 && fmeta->fd.GetNumber() != 110) { + ASSERT_FALSE(true); + } + } + } else { + ASSERT_TRUE(files.empty()); + } + } +} + +TEST_F(VersionSetTestMissingFiles, NoFileMissing) { + std::vector existing_files = { + SstInfo(100, kDefaultColumnFamilyName, "a"), + SstInfo(102, kDefaultColumnFamilyName, "b"), + SstInfo(103, kDefaultColumnFamilyName, "c"), + SstInfo(107, kDefaultColumnFamilyName, "d"), + SstInfo(110, kDefaultColumnFamilyName, "e")}; + std::vector file_metas; + CreateDummyTableFiles(existing_files, &file_metas); + + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + std::vector> added_files; + for (const auto& meta : file_metas) { + added_files.emplace_back(0, meta); + } + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, added_files, std::vector>()); + std::vector> deleted_files; + deleted_files.emplace_back(/*level=*/0, 100); + WriteFileAdditionAndDeletionToManifest( + /*cf=*/0, std::vector>(), deleted_files); + log_writer_.reset(); + Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); + ASSERT_OK(s); + std::string 
manifest_path; + VerifyManifest(&manifest_path); + std::string db_id; + bool has_missing_table_file = false; + s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_, + /*read_only=*/false, &db_id, + &has_missing_table_file); + ASSERT_OK(s); + ASSERT_FALSE(has_missing_table_file); + for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) { + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + const std::vector& files = vstorage->LevelFiles(0); + if (cfd->GetName() == kDefaultColumnFamilyName) { + ASSERT_EQ(existing_files.size() - deleted_files.size(), files.size()); + bool has_deleted_file = false; + for (const auto* fmeta : files) { + if (fmeta->fd.GetNumber() == 100) { + has_deleted_file = true; + break; + } + } + ASSERT_FALSE(has_deleted_file); + } else { + ASSERT_TRUE(files.empty()); + } + } +} + +TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { + db_options_.allow_2pc = true; + NewDB(); + + SstInfo sst(100, kDefaultColumnFamilyName, "a"); + std::vector file_metas; + CreateDummyTableFiles({sst}, &file_metas); + + constexpr WalNumber kMinWalNumberToKeep2PC = 10; + VersionEdit edit; + edit.AddFile(0, file_metas[0]); + edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC); + + for (int i = 0; i < 3; i++) { + CreateNewManifest(); + ReopenDB(); + ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,204 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wal_edit.h" + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +void WalAddition::EncodeTo(std::string* dst) const { + PutVarint64(dst, number_); + + if (metadata_.HasSyncedSize()) { + PutVarint32(dst, static_cast(WalAdditionTag::kSyncedSize)); + PutVarint64(dst, metadata_.GetSyncedSizeInBytes()); + } + + PutVarint32(dst, static_cast(WalAdditionTag::kTerminate)); +} + +Status WalAddition::DecodeFrom(Slice* src) { + constexpr char class_name[] = "WalAddition"; + + if (!GetVarint64(src, &number_)) { + return Status::Corruption(class_name, "Error decoding WAL log number"); + } + + while (true) { + uint32_t tag_value = 0; + if (!GetVarint32(src, &tag_value)) { + return Status::Corruption(class_name, "Error decoding tag"); + } + WalAdditionTag tag = static_cast(tag_value); + switch (tag) { + case WalAdditionTag::kSyncedSize: { + uint64_t size = 0; + if (!GetVarint64(src, &size)) { + return Status::Corruption(class_name, "Error decoding WAL file size"); + } + metadata_.SetSyncedSizeInBytes(size); + break; + } + // TODO: process future tags such as checksum. 
+ case WalAdditionTag::kTerminate: + return Status::OK(); + default: { + std::stringstream ss; + ss << "Unknown tag " << tag_value; + return Status::Corruption(class_name, ss.str()); + } + } + } +} + +JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) { + jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes" + << wal.GetMetadata().GetSyncedSizeInBytes(); + return jw; +} + +std::ostream& operator<<(std::ostream& os, const WalAddition& wal) { + os << "log_number: " << wal.GetLogNumber() + << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes(); + return os; +} + +std::string WalAddition::DebugString() const { + std::ostringstream oss; + oss << *this; + return oss.str(); +} + +void WalDeletion::EncodeTo(std::string* dst) const { + PutVarint64(dst, number_); +} + +Status WalDeletion::DecodeFrom(Slice* src) { + constexpr char class_name[] = "WalDeletion"; + + if (!GetVarint64(src, &number_)) { + return Status::Corruption(class_name, "Error decoding WAL log number"); + } + + return Status::OK(); +} + +JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal) { + jw << "LogNumber" << wal.GetLogNumber(); + return jw; +} + +std::ostream& operator<<(std::ostream& os, const WalDeletion& wal) { + os << "log_number: " << wal.GetLogNumber(); + return os; +} + +std::string WalDeletion::DebugString() const { + std::ostringstream oss; + oss << *this; + return oss.str(); +} + +Status WalSet::AddWal(const WalAddition& wal) { + if (wal.GetLogNumber() < min_wal_number_to_keep_) { + // The WAL has been obsolete, ignore it. + return Status::OK(); + } + + auto it = wals_.lower_bound(wal.GetLogNumber()); + bool existing = it != wals_.end() && it->first == wal.GetLogNumber(); + if (existing && !wal.GetMetadata().HasSyncedSize()) { + std::stringstream ss; + ss << "WAL " << wal.GetLogNumber() << " is created more than once"; + return Status::Corruption("WalSet::AddWal", ss.str()); + } + // If the WAL has synced size, it must >= the previous size. 
+ if (wal.GetMetadata().HasSyncedSize() && existing && + it->second.HasSyncedSize() && + wal.GetMetadata().GetSyncedSizeInBytes() < + it->second.GetSyncedSizeInBytes()) { + std::stringstream ss; + ss << "WAL " << wal.GetLogNumber() + << " must not have smaller synced size than previous one"; + return Status::Corruption("WalSet::AddWal", ss.str()); + } + if (existing) { + it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes()); + } else { + wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()}); + } + return Status::OK(); +} + +Status WalSet::AddWals(const WalAdditions& wals) { + Status s; + for (const WalAddition& wal : wals) { + s = AddWal(wal); + if (!s.ok()) { + break; + } + } + return s; +} + +Status WalSet::DeleteWalsBefore(WalNumber wal) { + if (wal > min_wal_number_to_keep_) { + min_wal_number_to_keep_ = wal; + wals_.erase(wals_.begin(), wals_.lower_bound(wal)); + } + return Status::OK(); +} + +void WalSet::Reset() { + wals_.clear(); + min_wal_number_to_keep_ = 0; +} + +Status WalSet::CheckWals( + Env* env, + const std::unordered_map& logs_on_disk) const { + assert(env != nullptr); + + Status s; + for (const auto& wal : wals_) { + const uint64_t log_number = wal.first; + const WalMetadata& wal_meta = wal.second; + + if (!wal_meta.HasSyncedSize()) { + // The WAL and WAL directory is not even synced, + // so the WAL's inode may not be persisted, + // then the WAL might not show up when listing WAL directory. 
+ continue; + } + + if (logs_on_disk.find(log_number) == logs_on_disk.end()) { + std::stringstream ss; + ss << "Missing WAL with log number: " << log_number << "."; + s = Status::Corruption(ss.str()); + break; + } + + uint64_t log_file_size = 0; + s = env->GetFileSize(logs_on_disk.at(log_number), &log_file_size); + if (!s.ok()) { + break; + } + if (log_file_size < wal_meta.GetSyncedSizeInBytes()) { + std::stringstream ss; + ss << "Size mismatch: WAL (log number: " << log_number + << ") in MANIFEST is " << wal_meta.GetSyncedSizeInBytes() + << " bytes , but actually is " << log_file_size << " bytes on disk."; + s = Status::Corruption(ss.str()); + break; + } + } + + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// WAL related classes used in VersionEdit and VersionSet. +// Modifications to WalAddition and WalDeletion may need to update +// VersionEdit and its related tests. + +#pragma once + +#include +#include +#include +#include +#include + +#include "logging/event_logger.h" +#include "port/port.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class JSONWriter; +class Slice; +class Status; + +using WalNumber = uint64_t; + +// Metadata of a WAL. 
+class WalMetadata { + public: + WalMetadata() = default; + + explicit WalMetadata(uint64_t synced_size_bytes) + : synced_size_bytes_(synced_size_bytes) {} + + bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; } + + void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; } + + uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; } + + private: + // The size of WAL is unknown, used when the WAL is not synced yet or is + // empty. + constexpr static uint64_t kUnknownWalSize = port::kMaxUint64; + + // Size of the most recently synced WAL in bytes. + uint64_t synced_size_bytes_ = kUnknownWalSize; +}; + +// These tags are persisted to MANIFEST, so it's part of the user API. +enum class WalAdditionTag : uint32_t { + // Indicates that there are no more tags. + kTerminate = 1, + // Synced Size in bytes. + kSyncedSize = 2, + // Add tags in the future, such as checksum? +}; + +// Records the event of adding a WAL in VersionEdit. +class WalAddition { + public: + WalAddition() : number_(0), metadata_() {} + + explicit WalAddition(WalNumber number) : number_(number), metadata_() {} + + WalAddition(WalNumber number, WalMetadata meta) + : number_(number), metadata_(std::move(meta)) {} + + WalNumber GetLogNumber() const { return number_; } + + const WalMetadata& GetMetadata() const { return metadata_; } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* src); + + std::string DebugString() const; + + private: + WalNumber number_; + WalMetadata metadata_; +}; + +std::ostream& operator<<(std::ostream& os, const WalAddition& wal); +JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal); + +using WalAdditions = std::vector; + +// Records the event of deleting WALs before the specified log number. 
+class WalDeletion { + public: + WalDeletion() : number_(kEmpty) {} + + explicit WalDeletion(WalNumber number) : number_(number) {} + + WalNumber GetLogNumber() const { return number_; } + + void EncodeTo(std::string* dst) const; + + Status DecodeFrom(Slice* src); + + std::string DebugString() const; + + bool IsEmpty() const { return number_ == kEmpty; } + + void Reset() { number_ = kEmpty; } + + private: + static constexpr WalNumber kEmpty = 0; + + WalNumber number_; +}; + +std::ostream& operator<<(std::ostream& os, const WalDeletion& wal); +JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal); + +// Used in VersionSet to keep the current set of WALs. +// +// When a WAL is synced or becomes obsoleted, +// a VersionEdit is logged to MANIFEST and +// the WAL is added to or deleted from WalSet. +// +// Not thread safe, needs external synchronization such as holding DB mutex. +class WalSet { + public: + // Add WAL(s). + // If the WAL is closed, + // then there must be an existing unclosed WAL, + // otherwise, return Status::Corruption. + // Can happen when applying a VersionEdit or recovering from MANIFEST. + Status AddWal(const WalAddition& wal); + Status AddWals(const WalAdditions& wals); + + // Delete WALs with log number smaller than the specified wal number. + // Can happen when applying a VersionEdit or recovering from MANIFEST. + Status DeleteWalsBefore(WalNumber wal); + + // Resets the internal state. + void Reset(); + + // WALs with number less than MinWalNumberToKeep should not exist in WalSet. + WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; } + + const std::map& GetWals() const { return wals_; } + + // Checks whether there are missing or corrupted WALs. + // Returns Status::OK if there is no missing nor corrupted WAL, + // otherwise returns Status::Corruption. + // logs_on_disk is a map from log number to the log filename. 
+ // Note that logs_on_disk may contain logs that is obsolete but + // haven't been deleted from disk. + Status CheckWals( + Env* env, + const std::unordered_map& logs_on_disk) const; + + private: + std::map wals_; + // WAL number < min_wal_number_to_keep_ should not exist in wals_. + // It's monotonically increasing, in-memory only, not written to MANIFEST. + WalNumber min_wal_number_to_keep_ = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_edit_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_edit_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,214 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wal_edit.h" + +#include "db/db_test_util.h" +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(WalSet, AddDeleteReset) { + WalSet wals; + ASSERT_TRUE(wals.GetWals().empty()); + + // Create WAL 1 - 10. + for (WalNumber log_number = 1; log_number <= 10; log_number++) { + wals.AddWal(WalAddition(log_number)); + } + ASSERT_EQ(wals.GetWals().size(), 10); + + // Delete WAL 1 - 5. 
+ wals.DeleteWalsBefore(6); + ASSERT_EQ(wals.GetWals().size(), 5); + + WalNumber expected_log_number = 6; + for (auto it : wals.GetWals()) { + WalNumber log_number = it.first; + ASSERT_EQ(log_number, expected_log_number++); + } + + wals.Reset(); + ASSERT_TRUE(wals.GetWals().empty()); +} + +TEST(WalSet, Overwrite) { + constexpr WalNumber kNumber = 100; + constexpr uint64_t kBytes = 200; + WalSet wals; + wals.AddWal(WalAddition(kNumber)); + ASSERT_FALSE(wals.GetWals().at(kNumber).HasSyncedSize()); + wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes))); + ASSERT_TRUE(wals.GetWals().at(kNumber).HasSyncedSize()); + ASSERT_EQ(wals.GetWals().at(kNumber).GetSyncedSizeInBytes(), kBytes); +} + +TEST(WalSet, SmallerSyncedSize) { + constexpr WalNumber kNumber = 100; + constexpr uint64_t kBytes = 100; + WalSet wals; + ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes)))); + Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0))); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + s.ToString().find( + "WAL 100 must not have smaller synced size than previous one") != + std::string::npos); +} + +TEST(WalSet, CreateTwice) { + constexpr WalNumber kNumber = 100; + WalSet wals; + ASSERT_OK(wals.AddWal(WalAddition(kNumber))); + Status s = wals.AddWal(WalAddition(kNumber)); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") != + std::string::npos); +} + +TEST(WalSet, DeleteAllWals) { + constexpr WalNumber kMaxWalNumber = 10; + WalSet wals; + for (WalNumber i = 1; i <= kMaxWalNumber; i++) { + wals.AddWal(WalAddition(i)); + } + ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1)); +} + +TEST(WalSet, AddObsoleteWal) { + constexpr WalNumber kNumber = 100; + WalSet wals; + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); + ASSERT_OK(wals.AddWal(WalAddition(kNumber))); + ASSERT_TRUE(wals.GetWals().empty()); +} + +TEST(WalSet, MinWalNumberToKeep) { + constexpr WalNumber kNumber = 100; + WalSet wals; + 
ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0); + ASSERT_OK(wals.DeleteWalsBefore(kNumber)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1); +} + +class WalSetTest : public DBTestBase { + public: + WalSetTest() : DBTestBase("WalSetTest", /* env_do_fsync */ true) {} + + void SetUp() override { + test_dir_ = test::PerThreadDBPath("wal_set_test"); + ASSERT_OK(env_->CreateDir(test_dir_)); + } + + void TearDown() override { + EXPECT_OK(DestroyDir(env_, test_dir_)); + logs_on_disk_.clear(); + wals_.Reset(); + } + + void CreateWalOnDisk(WalNumber number, const std::string& fname, + uint64_t size_bytes) { + std::unique_ptr f; + std::string fpath = Path(fname); + ASSERT_OK(env_->NewWritableFile(fpath, &f, EnvOptions())); + std::string content(size_bytes, '0'); + ASSERT_OK(f->Append(content)); + ASSERT_OK(f->Close()); + + logs_on_disk_[number] = fpath; + } + + void AddWalToWalSet(WalNumber number, uint64_t size_bytes) { + // Create WAL. + ASSERT_OK(wals_.AddWal(WalAddition(number))); + // Close WAL. + WalMetadata wal(size_bytes); + ASSERT_OK(wals_.AddWal(WalAddition(number, wal))); + } + + Status CheckWals() const { return wals_.CheckWals(env_, logs_on_disk_); } + + private: + std::string test_dir_; + std::unordered_map logs_on_disk_; + WalSet wals_; + + std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; } +}; + +TEST_F(WalSetTest, CheckEmptyWals) { ASSERT_OK(CheckWals()); } + +TEST_F(WalSetTest, CheckWals) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100; + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, size); + // log 0 - 5 are obsolete. 
+ if (number > 5) { + AddWalToWalSet(number, size); + } + } + ASSERT_OK(CheckWals()); +} + +TEST_F(WalSetTest, CheckMissingWals) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100; + AddWalToWalSet(number, size); + // logs with even number are missing from disk. + if (number % 2) { + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, size); + } + } + + Status s = CheckWals(); + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + // The first log with even number is missing. + std::stringstream expected_err; + expected_err << "Missing WAL with log number: " << 2; + ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos) + << s.ToString(); +} + +TEST_F(WalSetTest, CheckWalsWithShrinkedSize) { + for (int number = 1; number < 10; number++) { + uint64_t size = rand() % 100 + 1; + AddWalToWalSet(number, size); + // logs with even number have shrinked size. + std::stringstream ss; + ss << "log" << number; + std::string fname = ss.str(); + CreateWalOnDisk(number, fname, (number % 2) ? size : size - 1); + } + + Status s = CheckWals(); + ASSERT_TRUE(s.IsCorruption()) << s.ToString(); + // The first log with even number has wrong size. 
+ std::stringstream expected_err; + expected_err << "Size mismatch: WAL (log number: " << 2 << ")"; + ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos) + << s.ToString(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -37,7 +37,7 @@ #ifndef ROCKSDB_LITE Status WalManager::DeleteFile(const std::string& fname, uint64_t number) { - auto s = env_->DeleteFile(db_options_.wal_dir + "/" + fname); + auto s = env_->DeleteFile(wal_dir_ + "/" + fname); if (s.ok()) { MutexLock l(&read_first_record_cache_mutex_); read_first_record_cache_.erase(number); @@ -52,7 +52,7 @@ Status s; // list wal files in main db dir. VectorLogPtr logs; - s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); + s = GetSortedWalsOfType(wal_dir_, logs, kAliveLogFile); if (!s.ok()) { return s; } @@ -65,7 +65,7 @@ files.clear(); // list wal files in archive dir. - std::string archivedir = ArchivalDirectory(db_options_.wal_dir); + std::string archivedir = ArchivalDirectory(wal_dir_); Status exists = env_->FileExists(archivedir); if (exists.ok()) { s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); @@ -120,8 +120,8 @@ return s; } iter->reset(new TransactionLogIteratorImpl( - db_options_.wal_dir, &db_options_, read_options, file_options_, seq, - std::move(wal_files), version_set, seq_per_batch_)); + wal_dir_, &db_options_, read_options, file_options_, seq, + std::move(wal_files), version_set, seq_per_batch_, io_tracer_)); return (*iter)->status(); } @@ -134,14 +134,14 @@ // b. 
get sorted non-empty archived logs // c. delete what should be deleted void WalManager::PurgeObsoleteWALFiles() { - bool const ttl_enabled = db_options_.wal_ttl_seconds > 0; - bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0; + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } - int64_t current_time; - Status s = env_->GetCurrentTime(¤t_time); + int64_t current_time = 0; + Status s = db_options_.clock->GetCurrentTime(¤t_time); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s", s.ToString().c_str()); @@ -150,7 +150,7 @@ } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) - ? db_options_.wal_ttl_seconds / 2 + ? db_options_.WAL_ttl_seconds / 2 : kDefaultIntervalToDeleteObsoleteWAL; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { @@ -159,7 +159,7 @@ purge_wal_files_last_run_ = now_seconds; - std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); + std::string archival_dir = ArchivalDirectory(wal_dir_); std::vector files; s = env_->GetChildren(archival_dir, &files); if (!s.ok()) { @@ -171,11 +171,10 @@ size_t log_files_num = 0; uint64_t log_file_size = 0; - for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = archival_dir + "/" + f; if (ttl_enabled) { uint64_t file_m_time; @@ -186,7 +185,7 @@ s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { s = DeleteDBFile(&db_options_, file_path, archival_dir, false, /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { @@ -235,17 +234,21 @@ return; } - size_t const files_keep_num = - 
static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); + size_t const files_keep_num = static_cast( + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size); if (log_files_num <= files_keep_num) { return; } size_t files_del_num = log_files_num - files_keep_num; VectorLogPtr archived_logs; - GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); - - if (files_del_num > archived_logs.size()) { + s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to get archived WALs from: %s: %s", + archival_dir.c_str(), s.ToString().c_str()); + files_del_num = 0; + } else if (files_del_num > archived_logs.size()) { ROCKS_LOG_WARN(db_options_.info_log, "Trying to delete more archived log files than " "exist. Deleting all"); @@ -254,8 +257,7 @@ for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); - s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, - db_options_.wal_dir, false, + s = DeleteDBFile(&db_options_, wal_dir_ + "/" + file_path, wal_dir_, false, /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", @@ -269,7 +271,7 @@ } void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { - auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); + auto archived_log_name = ArchivedLogFileName(wal_dir_, number); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); @@ -292,7 +294,7 @@ for (const auto& f : all_files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { SequenceNumber sequence; Status s = ReadFirstRecord(log_type, number, &sequence); if (!s.ok()) { @@ 
-334,10 +336,8 @@ std::sort( log_files.begin(), log_files.end(), [](const std::unique_ptr& a, const std::unique_ptr& b) { - LogFileImpl* a_impl = - static_cast_with_check(a.get()); - LogFileImpl* b_impl = - static_cast_with_check(b.get()); + LogFileImpl* a_impl = static_cast_with_check(a.get()); + LogFileImpl* b_impl = static_cast_with_check(b.get()); return *a_impl < *b_impl; }); return status; @@ -387,7 +387,7 @@ } Status s; if (type == kAliveLogFile) { - std::string fname = LogFileName(db_options_.wal_dir, number); + std::string fname = LogFileName(wal_dir_, number); s = ReadFirstLine(fname, number, sequence); if (!s.ok() && env_->FileExists(fname).ok()) { // return any error that is not caused by non-existing file @@ -397,8 +397,7 @@ if (type == kArchivedLogFile || !s.ok()) { // check if the file got moved to archive. - std::string archived_file = - ArchivedLogFileName(db_options_.wal_dir, number); + std::string archived_file = ArchivedLogFileName(wal_dir_, number); s = ReadFirstLine(archived_file, number, sequence); // maybe the file was deleted from archive dir. If that's the case, return // Status::OK(). The caller with identify this as empty file because @@ -428,7 +427,7 @@ Status s; uint64_t size_bytes; - s = env_->GetFileSize(LogFileName(db_options_.wal_dir, number), &size_bytes); + s = env_->GetFileSize(LogFileName(wal_dir_, number), &size_bytes); if (!s.ok()) { return s; @@ -469,7 +468,7 @@ fs_->OptimizeForLogRead(file_options_), &file, nullptr); std::unique_ptr file_reader( - new SequentialFileReader(std::move(file), fname)); + new SequentialFileReader(std::move(file), fname, io_tracer_)); if (!status.ok()) { return status; @@ -494,14 +493,19 @@ // TODO read record's till the first no corrupt entry? 
} else { WriteBatch batch; - WriteBatchInternal::SetContents(&batch, record); - *sequence = WriteBatchInternal::Sequence(&batch); - return Status::OK(); + // We can overwrite an existing non-OK Status since it'd only reach here + // with `paranoid_checks == false`. + status = WriteBatchInternal::SetContents(&batch, record); + if (status.ok()) { + *sequence = WriteBatchInternal::Sequence(&batch); + return status; + } } } - // ReadRecord returns false on EOF, which means that the log file is empty. we - // return status.ok() in that case and set sequence number to 0 + // ReadRecord might have returned false on EOF, which means that the log file + // is empty. Or, a failure may have occurred while processing the first entry. + // In any case, return status and set sequence number to 0. *sequence = 0; return status; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -36,14 +36,18 @@ class WalManager { public: WalManager(const ImmutableDBOptions& db_options, - const FileOptions& file_options, const bool seq_per_batch = false) + const FileOptions& file_options, + const std::shared_ptr& io_tracer, + const bool seq_per_batch = false) : db_options_(db_options), file_options_(file_options), env_(db_options.env), - fs_(db_options.fs.get()), + fs_(db_options.fs, io_tracer), purge_wal_files_last_run_(0), seq_per_batch_(seq_per_batch), - wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {} + wal_dir_(db_options_.GetWalDir()), + wal_in_db_path_(db_options_.IsWalDirSameAsDBPath()), + io_tracer_(io_tracer) {} Status GetSortedWalFiles(VectorLogPtr& files); @@ -91,7 +95,7 @@ const ImmutableDBOptions& db_options_; const FileOptions file_options_; Env* env_; - FileSystem* fs_; + const FileSystemPtr fs_; // ------- 
WalManager state ------- // cache for ReadFirstRecord() calls @@ -103,11 +107,15 @@ bool seq_per_batch_; + const std::string& wal_dir_; + bool wal_in_db_path_; // obsolete files will be deleted every this seconds if ttl deletion is // enabled and archive size_limit is disabled. static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; + + std::shared_ptr io_tracer_; }; #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/wal_manager_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/wal_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,20 +5,21 @@ #ifndef ROCKSDB_LITE +#include "db/wal_manager.h" + #include #include -#include "rocksdb/cache.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/write_buffer_manager.h" - #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" -#include "db/wal_manager.h" #include "env/mock_env.h" #include "file/writable_file_writer.h" +#include "rocksdb/cache.h" +#include "rocksdb/file_system.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -31,13 +32,12 @@ class WalManagerTest : public testing::Test { public: WalManagerTest() - : env_(new MockEnv(Env::Default())), - dbname_(test::PerThreadDBPath("wal_manager_test")), + : dbname_(test::PerThreadDBPath("wal_manager_test")), db_options_(), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), current_log_number_(0) { - DestroyDB(dbname_, Options()); + env_.reset(MockEnv::Create(Env::Default())), DestroyDB(dbname_, Options()); } void Init() { @@ -47,19 +47,22 @@ std::numeric_limits::max()); db_options_.wal_dir = dbname_; db_options_.env = 
env_.get(); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - db_options_.fs = fs_; + db_options_.fs = env_->GetFileSystem(); + db_options_.clock = env_->GetSystemClock().get(); - versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_buffer_manager_, - &write_controller_, - /*block_cache_tracer=*/nullptr)); + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); - wal_manager_.reset(new WalManager(db_options_, env_options_)); + wal_manager_.reset( + new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); } void Reopen() { - wal_manager_.reset(new WalManager(db_options_, env_options_)); + wal_manager_.reset( + new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); } // NOT thread safe @@ -67,9 +70,10 @@ assert(current_log_writer_.get() != nullptr); uint64_t seq = versions_->LastSequence() + 1; WriteBatch batch; - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK( + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch))); versions_->SetLastAllocatedSequence(seq); versions_->SetLastPublishedSequence(seq); versions_->SetLastSequence(seq); @@ -79,10 +83,10 @@ void RollTheLog(bool /*archived*/) { current_log_number_++; std::string fname = ArchivedLogFileName(dbname_, current_log_number_); - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer, + nullptr)); current_log_writer_.reset(new 
log::Writer(std::move(file_writer), 0, false)); } @@ -113,7 +117,6 @@ WriteBufferManager write_buffer_manager_; std::unique_ptr versions_; std::unique_ptr wal_manager_; - std::shared_ptr fs_; std::unique_ptr current_log_writer_; uint64_t current_log_number_; @@ -122,8 +125,9 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { Init(); std::string path = dbname_ + "/000001.log"; - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); + std::unique_ptr file; + ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file, + nullptr)); SequenceNumber s; ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s)); @@ -133,14 +137,14 @@ wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s)); ASSERT_EQ(s, 0U); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), path, EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), path, FileOptions())); log::Writer writer(std::move(file_writer), 1, db_options_.recycle_log_file_num > 0); WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); WriteBatchInternal::SetSequence(&batch, 10); - writer.AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. 
// Waiting for lei to finish with db_test @@ -165,14 +169,14 @@ uint64_t GetLogDirSize(std::string dir_path, Env* env) { uint64_t dir_size = 0; std::vector files; - env->GetChildren(dir_path, &files); + EXPECT_OK(env->GetChildren(dir_path, &files)); for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = dir_path + "/" + f; uint64_t file_size; - env->GetFileSize(file_path, &file_size); + EXPECT_OK(env->GetFileSize(file_path, &file_size)); dir_size += file_size; } } @@ -182,9 +186,9 @@ Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; std::vector file_numbers; - env->GetChildren(path, &files); uint64_t number; FileType type; + EXPECT_OK(env->GetChildren(path, &files)); for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { @@ -207,13 +211,14 @@ EXPECT_OK(iter->status()); iter->Next(); } + EXPECT_OK(iter->status()); return count; } } // namespace TEST_F(WalManagerTest, WALArchivalSizeLimit) { - db_options_.wal_ttl_seconds = 0; - db_options_.wal_size_limit_mb = 1000; + db_options_.WAL_ttl_seconds = 0; + db_options_.WAL_size_limit_MB = 1000; Init(); // TEST : Create WalManager with huge size limit and no ttl. @@ -221,7 +226,7 @@ // Count the archived log files that survived. // Assert that all of them did. // Change size limit. Re-open WalManager. - // Assert that archive is not greater than wal_size_limit_mb after + // Assert that archive is not greater than WAL_size_limit_MB after // PurgeObsoleteWALFiles() // Set ttl and time_to_check_ to small values. Re-open db. // Assert that there are no archived logs left. 
@@ -230,27 +235,27 @@ CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_EQ(log_files.size(), 20U); - db_options_.wal_size_limit_mb = 8; + db_options_.WAL_size_limit_MB = 8; Reopen(); wal_manager_->PurgeObsoleteWALFiles(); uint64_t archive_size = GetLogDirSize(archive_dir, env_.get()); - ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024); + ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024); - db_options_.wal_ttl_seconds = 1; - env_->FakeSleepForMicroseconds(2 * 1000 * 1000); + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(2 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } TEST_F(WalManagerTest, WALArchivalTtl) { - db_options_.wal_ttl_seconds = 1000; + db_options_.WAL_ttl_seconds = 1000; Init(); // TEST : Create WalManager with a ttl and no size limit. 
@@ -263,15 +268,15 @@ CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_GT(log_files.size(), 0U); - db_options_.wal_ttl_seconds = 1; - env_->FakeSleepForMicroseconds(3 * 1000 * 1000); + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(3 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch.cc 2025-05-19 16:14:27.000000000 +0000 @@ -46,6 +46,7 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" @@ -53,13 +54,14 @@ #include "db/write_batch_internal.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" +#include "port/lang.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "util/autovector.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/duplicate_detector.h" #include "util/string_util.h" -#include "util/util.h" namespace ROCKSDB_NAMESPACE { @@ -132,110 +134,16 @@ return Status::OK(); } - Status MarkRollback(const Slice&) override { - content_flags |= ContentFlags::HAS_ROLLBACK; - return Status::OK(); - } -}; - -class TimestampAssigner : public WriteBatch::Handler { - public: - explicit TimestampAssigner(const Slice& ts) - : timestamp_(ts), timestamps_(kEmptyTimestampList) {} - explicit TimestampAssigner(const std::vector& ts_list) - : 
timestamps_(ts_list) { - SanityCheck(); - } - ~TimestampAssigner() override {} - - Status PutCF(uint32_t, const Slice& key, const Slice&) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status DeleteCF(uint32_t, const Slice& key) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status SingleDeleteCF(uint32_t, const Slice& key) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status DeleteRangeCF(uint32_t, const Slice& begin_key, - const Slice& end_key) override { - AssignTimestamp(begin_key); - AssignTimestamp(end_key); - ++idx_; - return Status::OK(); - } - - Status MergeCF(uint32_t, const Slice& key, const Slice&) override { - AssignTimestamp(key); - ++idx_; - return Status::OK(); - } - - Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override { - // TODO (yanqin): support blob db in the future. - return Status::OK(); - } - - Status MarkBeginPrepare(bool) override { - // TODO (yanqin): support in the future. - return Status::OK(); - } - - Status MarkEndPrepare(const Slice&) override { - // TODO (yanqin): support in the future. - return Status::OK(); - } - - Status MarkCommit(const Slice&) override { - // TODO (yanqin): support in the future. + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_COMMIT; return Status::OK(); } Status MarkRollback(const Slice&) override { - // TODO (yanqin): support in the future. + content_flags |= ContentFlags::HAS_ROLLBACK; return Status::OK(); } - - private: - void SanityCheck() const { - assert(!timestamps_.empty()); -#ifndef NDEBUG - const size_t ts_sz = timestamps_[0].size(); - for (size_t i = 1; i != timestamps_.size(); ++i) { - assert(ts_sz == timestamps_[i].size()); - } -#endif // !NDEBUG - } - - void AssignTimestamp(const Slice& key) { - assert(timestamps_.empty() || idx_ < timestamps_.size()); - const Slice& ts = timestamps_.empty() ? 
timestamp_ : timestamps_[idx_]; - size_t ts_sz = ts.size(); - char* ptr = const_cast(key.data() + key.size() - ts_sz); - memcpy(ptr, ts.data(), ts_sz); - } - - static const std::vector kEmptyTimestampList; - const Slice timestamp_; - const std::vector& timestamps_; - size_t idx_ = 0; - - // No copy or move. - TimestampAssigner(const TimestampAssigner&) = delete; - TimestampAssigner(TimestampAssigner&&) = delete; - TimestampAssigner& operator=(const TimestampAssigner&) = delete; - TimestampAssigner&& operator=(TimestampAssigner&&) = delete; }; -const std::vector TimestampAssigner::kEmptyTimestampList; } // anon namespace @@ -244,42 +152,49 @@ }; WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes) - : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) { + : content_flags_(0), max_bytes_(max_bytes), rep_() { rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ? reserved_bytes : WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader); } -WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz) - : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) { - rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ? - reserved_bytes : WriteBatchInternal::kHeader); +WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, + size_t protection_bytes_per_key) + : content_flags_(0), max_bytes_(max_bytes), rep_() { + // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per + // entry. + assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8); + if (protection_bytes_per_key != 0) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + } + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) + ? 
reserved_bytes + : WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader); } WriteBatch::WriteBatch(const std::string& rep) - : content_flags_(ContentFlags::DEFERRED), - max_bytes_(0), - rep_(rep), - timestamp_size_(0) {} + : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {} WriteBatch::WriteBatch(std::string&& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), - rep_(std::move(rep)), - timestamp_size_(0) {} + rep_(std::move(rep)) {} WriteBatch::WriteBatch(const WriteBatch& src) : wal_term_point_(src.wal_term_point_), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(src.rep_), - timestamp_size_(src.timestamp_size_) { + rep_(src.rep_) { if (src.save_points_ != nullptr) { save_points_.reset(new SavePoints()); save_points_->stack = src.save_points_->stack; } + if (src.prot_info_ != nullptr) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + prot_info_->entries_ = src.prot_info_->entries_; + } } WriteBatch::WriteBatch(WriteBatch&& src) noexcept @@ -287,8 +202,8 @@ wal_term_point_(std::move(src.wal_term_point_)), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(std::move(src.rep_)), - timestamp_size_(src.timestamp_size_) {} + prot_info_(std::move(src.prot_info_)), + rep_(std::move(src.rep_)) {} WriteBatch& WriteBatch::operator=(const WriteBatch& src) { if (&src != this) { @@ -331,6 +246,9 @@ } } + if (prot_info_ != nullptr) { + prot_info_->entries_.clear(); + } wal_term_point_.clear(); } @@ -340,7 +258,8 @@ auto rv = content_flags_.load(std::memory_order_relaxed); if ((rv & ContentFlags::DEFERRED) != 0) { BatchContentClassifier classifier; - Iterate(&classifier); + // Should we handle status here? 
+ Iterate(&classifier).PermitUncheckedError(); rv = classifier.content_flags; // this method is conceptually const, because it is performing a lazy @@ -358,6 +277,13 @@ wal_term_point_.content_flags = content_flags_; } +size_t WriteBatch::GetProtectionBytesPerKey() const { + if (prot_info_ != nullptr) { + return prot_info_->GetBytesPerKey(); + } + return 0; +} + bool WriteBatch::HasPut() const { return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; } @@ -495,6 +421,11 @@ return Status::Corruption("bad EndPrepare XID"); } break; + case kTypeCommitXIDAndTimestamp: + if (!GetLengthPrefixedSlice(input, key)) { + return Status::Corruption("bad commit timestamp"); + } + FALLTHROUGH_INTENDED; case kTypeCommitXID: if (!GetLengthPrefixedSlice(input, xid)) { return Status::Corruption("bad Commit XID"); @@ -639,7 +570,8 @@ case kTypeBeginPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (!handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -658,7 +590,8 @@ case kTypeBeginPersistedPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -671,7 +604,8 @@ case kTypeBeginUnprepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); - handler->MarkBeginPrepare(true /* unprepared */); + s = handler->MarkBeginPrepare(true /* unprepared */); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -690,23 +624,37 @@ case kTypeEndPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED 
| ContentFlags::HAS_END_PREPARE)); - handler->MarkEndPrepare(xid); + s = handler->MarkEndPrepare(xid); + assert(s.ok()); empty_batch = true; break; case kTypeCommitXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); - handler->MarkCommit(xid); + s = handler->MarkCommit(xid); + assert(s.ok()); empty_batch = true; break; + case kTypeCommitXIDAndTimestamp: + assert(wb->content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); + // key stores the commit timestamp. + assert(!key.empty()); + s = handler->MarkCommitWithTimestamp(xid, key); + if (LIKELY(s.ok())) { + empty_batch = true; + } + break; case kTypeRollbackXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); - handler->MarkRollback(xid); + s = handler->MarkRollback(xid); + assert(s.ok()); empty_batch = true; break; case kTypeNoop: - handler->MarkNoop(empty_batch); + s = handler->MarkNoop(empty_batch); + assert(s.ok()); empty_batch = true; break; default: @@ -728,7 +676,7 @@ return b->is_latest_persistent_state_; } -void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) { +void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) { b->is_latest_persistent_state_ = true; } @@ -769,18 +717,22 @@ b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - if (0 == b->timestamp_size_) { - PutLengthPrefixedSlice(&b->rep_, key); - } else { - PutVarint32(&b->rep_, - static_cast(key.size() + b->timestamp_size_)); - b->rep_.append(key.data(), key.size()); - b->rep_.append(b->timestamp_size_, '\0'); - } + PutLengthPrefixedSlice(&b->rep_, key); PutLengthPrefixedSlice(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // Technically the optype could've been 
`kTypeColumnFamilyValue` with the + // CF ID encoded in the `WriteBatch`. That distinction is unimportant + // however since we verify CF ID is correct, as well as all other fields + // (a missing/extra encoded CF ID would corrupt another field). It is + // convenient to consolidate on `kTypeValue` here as that is what will be + // inserted into memtable. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -825,15 +777,18 @@ b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - if (0 == b->timestamp_size_) { - PutLengthPrefixedSliceParts(&b->rep_, key); - } else { - PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_); - } + PutLengthPrefixedSliceParts(&b->rep_, key); PutLengthPrefixedSliceParts(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeValue) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -889,6 +844,19 @@ return Status::OK(); } +Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b, + const Slice& xid, + const Slice& commit_ts) { + assert(!commit_ts.empty()); + b->rep_.push_back(static_cast(kTypeCommitXIDAndTimestamp)); + PutLengthPrefixedSlice(&b->rep_, commit_ts); + PutLengthPrefixedSlice(&b->rep_, xid); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_COMMIT, + std::memory_order_relaxed); + return Status::OK(); +} + Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) { b->rep_.push_back(static_cast(kTypeRollbackXID)); PutLengthPrefixedSlice(&b->rep_, xid); @@ -912,6 +880,14 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, "" /* value */, kTypeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -934,6 +910,16 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, + SliceParts(nullptr /* _parts */, 0 /* _num_parts */), + kTypeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -958,6 +944,14 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, "" /* value */, kTypeSingleDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -982,6 +976,17 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, + SliceParts(nullptr /* _parts */, + 0 /* _num_parts */) /* value */, + kTypeSingleDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1007,6 +1012,15 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + // In `DeleteRange()`, the end key is treated as the value. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(begin_key, end_key, kTypeRangeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1032,6 +1046,15 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(begin_key, end_key, kTypeRangeDeletion) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1064,6 +1087,13 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeMerge) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1094,6 +1124,13 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVO(key, value, kTypeMerge) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1119,6 +1156,14 @@ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_BLOB_INDEX, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVO()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVO(key, value, kTypeBlobIndex) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1157,6 +1202,9 @@ Clear(); } else { rep_.resize(savepoint.size); + if (prot_info_ != nullptr) { + prot_info_->entries_.resize(savepoint.count); + } WriteBatchInternal::SetCount(this, savepoint.count); content_flags_.store(savepoint.content_flags, std::memory_order_relaxed); } @@ -1175,13 +1223,17 @@ return Status::OK(); } -Status WriteBatch::AssignTimestamp(const Slice& ts) { - TimestampAssigner ts_assigner(ts); +Status WriteBatch::AssignTimestamp( + const Slice& ts, std::function checker) { + TimestampAssigner ts_assigner(prot_info_.get(), std::move(checker), ts); return Iterate(&ts_assigner); } -Status WriteBatch::AssignTimestamps(const std::vector& ts_list) { - TimestampAssigner ts_assigner(ts_list); +Status WriteBatch::AssignTimestamps( + const std::vector& ts_list, + std::function checker) { + SimpleListTimestampAssigner ts_assigner(prot_info_.get(), std::move(checker), + ts_list); return Iterate(&ts_assigner); } @@ -1198,6 +1250,8 @@ DBImpl* db_; const bool concurrent_memtable_writes_; bool post_info_created_; + const WriteBatch::ProtectionInfo* prot_info_; + size_t prot_info_idx_; bool* has_valid_writes_; // On some (!) 
platforms just default creating @@ -1260,6 +1314,16 @@ (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_); } + const ProtectionInfoKVOC64* NextProtectionInfo() { + const ProtectionInfoKVOC64* res = nullptr; + if (prot_info_ != nullptr) { + assert(prot_info_idx_ < prot_info_->entries_.size()); + res = &prot_info_->entries_[prot_info_idx_]; + ++prot_info_idx_; + } + return res; + } + protected: bool WriteBeforePrepare() const override { return write_before_prepare_; } bool WriteAfterCommit() const override { return write_after_commit_; } @@ -1272,6 +1336,7 @@ bool ignore_missing_column_families, uint64_t recovering_log_number, DB* db, bool concurrent_memtable_writes, + const WriteBatch::ProtectionInfo* prot_info, bool* has_valid_writes = nullptr, bool seq_per_batch = false, bool batch_per_txn = true, bool hint_per_batch = false) : sequence_(_sequence), @@ -1281,9 +1346,11 @@ ignore_missing_column_families_(ignore_missing_column_families), recovering_log_number_(recovering_log_number), log_number_ref_(0), - db_(static_cast_with_check(db)), + db_(static_cast_with_check(db)), concurrent_memtable_writes_(concurrent_memtable_writes), post_info_created_(false), + prot_info_(prot_info), + prot_info_idx_(0), has_valid_writes_(has_valid_writes), rebuilding_trx_(nullptr), rebuilding_trx_seq_(0), @@ -1341,6 +1408,10 @@ } void set_log_number_ref(uint64_t log) { log_number_ref_ = log; } + void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) { + prot_info_ = prot_info; + prot_info_idx_ = 0; + } SequenceNumber sequence() const { return sequence_; } @@ -1396,28 +1467,34 @@ } Status PutCFImpl(uint32_t column_family_id, const Slice& key, - const Slice& value, ValueType value_type) { + const Slice& value, ValueType value_type, + const ProtectionInfoKVOS64* kv_prot_info) { // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - 
return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } - Status ret_status; + assert(ret_status.ok()); MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); @@ -1425,23 +1502,17 @@ // any kind of transactions including the ones that use seq_per_batch assert(!seq_per_batch_ || !moptions->inplace_update_support); if (!moptions->inplace_update_support) { - bool mem_res = - mem->Add(sequence_, value_type, key, value, + ret_status = + mem->Add(sequence_, value_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? 
&GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); - } } else if (moptions->inplace_callback == nullptr) { assert(!concurrent_memtable_writes_); - mem->Update(sequence_, key, value); + ret_status = mem->Update(sequence_, key, value, kv_prot_info); } else { assert(!concurrent_memtable_writes_); - if (mem->UpdateCallback(sequence_, key, value)) { - } else { + ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info); + if (ret_status.IsNotFound()) { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; @@ -1455,223 +1526,354 @@ std::string merged_value; auto cf_handle = cf_mems_->GetColumnFamilyHandle(); - Status s = Status::NotSupported(); + Status get_status = Status::NotSupported(); if (db_ != nullptr && recovering_log_number_ == 0) { if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - s = db_->Get(ropts, cf_handle, key, &prev_value); + get_status = db_->Get(ropts, cf_handle, key, &prev_value); } - - char* prev_buffer = const_cast(prev_value.c_str()); - uint32_t prev_size = static_cast(prev_value.size()); - auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); - if (status == UpdateStatus::UPDATED_INPLACE) { - // prev_value is updated in-place with final value. - bool mem_res __attribute__((__unused__)); - mem_res = mem->Add( - sequence_, value_type, key, Slice(prev_buffer, prev_size)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); - } else if (status == UpdateStatus::UPDATED) { - // merged_value contains the final value. 
- bool mem_res __attribute__((__unused__)); - mem_res = - mem->Add(sequence_, value_type, key, Slice(merged_value)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + // Intentionally overwrites the `NotFound` in `ret_status`. + if (!get_status.ok() && !get_status.IsNotFound()) { + ret_status = get_status; + } else { + ret_status = Status::OK(); + } + if (ret_status.ok()) { + UpdateStatus update_status; + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = static_cast(prev_value.size()); + if (get_status.ok()) { + update_status = moptions->inplace_callback(prev_buffer, &prev_size, + value, &merged_value); + } else { + update_status = moptions->inplace_callback( + nullptr /* existing_value */, nullptr /* existing_value_size */, + value, &merged_value); + } + if (update_status == UpdateStatus::UPDATED_INPLACE) { + assert(get_status.ok()); + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, + Slice(prev_buffer, prev_size)); + // prev_value is updated in-place with final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + &updated_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } else if (update_status == UpdateStatus::UPDATED) { + if (kv_prot_info != nullptr) { + ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, merged_value); + // merged_value contains the final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(merged_value), &updated_kv_prot_info); + } else { + // merged_value contains the final value. 
+ ret_status = + mem->Add(sequence_, value_type, key, Slice(merged_value), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } } } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - } - // Since all Puts are logged in transaction logs (if enabled), always bump - // sequence number. Even if the update eventually fails and does not result - // in memtable add/update. - MaybeAdvanceSeq(); - CheckMemtableFull(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); + } return ret_status; } Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - return PutCFImpl(column_family_id, key, value, kTypeValue); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + return PutCFImpl(column_family_id, key, value, kTypeValue, + &mem_kv_prot_info); + } + return PutCFImpl(column_family_id, key, value, kTypeValue, + nullptr /* kv_prot_info */); } Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, - const Slice& value, ValueType delete_type) { + const Slice& value, ValueType delete_type, + const ProtectionInfoKVOS64* kv_prot_info) { Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); - bool mem_res = - mem->Add(sequence_, delete_type, key, value, + ret_status = + mem->Add(sequence_, delete_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? &GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { + if (UNLIKELY(ret_status.IsTryAgain())) { assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } - auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion); + ColumnFamilyData* cfd = cf_mems_->current(); + assert(!cfd || cfd->user_comparator()); + const size_t ts_sz = (cfd && cfd->user_comparator()) + ? cfd->user_comparator()->timestamp_size() + : 0; + const ValueType delete_type = + (0 == ts_sz) ? 
kTypeDeletion : kTypeDeletionWithTimestamp; + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type); + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); } return ret_status; } Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, + key); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, - key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } + assert(ret_status.ok()); - auto ret_status = - DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. 
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); } return ret_status; } Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, const Slice& end_key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - // TODO(myabandeh): when transactional DeleteRange support is added, - // check if end_key must also be added. - batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } + assert(ret_status.ok()); + if (db_ != nullptr) { auto cf_handle = cf_mems_->GetColumnFamilyHandle(); if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - auto* cfd = reinterpret_cast(cf_handle)->cfd(); + auto* cfd = + static_cast_with_check(cf_handle)->cfd(); if (!cfd->is_delete_range_supported()) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); return Status::NotSupported( std::string("DeleteRange not supported for table type ") + cfd->ioptions()->table_factory->Name() + " in CF " + cfd->GetName()); } + int cmp = cfd->user_comparator()->Compare(begin_key, end_key); + if (cmp > 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); + // It's an empty range where endpoints appear mistaken. Don't bother + // applying it to the DB, and return an error to the user. + return Status::InvalidArgument("end key comes before start key"); + } else if (cmp == 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); + // It's an empty range. Don't bother applying it to the DB. 
+ return Status::OK(); + } } - auto ret_status = - DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); } return ret_status; } Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. 
+ return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } - Status seek_status; - if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + Status ret_status; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, - value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, + column_family_id, key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); - return seek_status; + return ret_status; } + assert(ret_status.ok()); - Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); + if (moptions->merge_operator == nullptr) { + return Status::InvalidArgument( + "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`"); + } bool perform_merge = false; assert(!concurrent_memtable_writes_ || moptions->max_successive_merges == 0); @@ -1709,65 +1911,97 @@ if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - db_->Get(read_options, cf_handle, key, &get_value); - Slice get_value_slice = Slice(get_value); - - // 2) Apply this merge - auto merge_operator = moptions->merge_operator; - assert(merge_operator); - - std::string new_value; - - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator, key, &get_value_slice, {value}, &new_value, - 
moptions->info_log, moptions->statistics, Env::Default()); - - if (!merge_status.ok()) { - // Failed to merge! - // Store the delta in memtable + Status get_status = db_->Get(read_options, cf_handle, key, &get_value); + if (!get_status.ok()) { + // Failed to read a key we know exists. Store the delta in memtable. perform_merge = false; } else { - // 3) Add value to memtable - assert(!concurrent_memtable_writes_); - bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = moptions->merge_operator; + assert(merge_operator); + + std::string new_value; + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, &get_value_slice, {value}, &new_value, + moptions->info_log, moptions->statistics, + SystemClock::Default().get()); + + if (!merge_status.ok()) { + // Failed to merge! 
+ // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + assert(!concurrent_memtable_writes_); + if (kv_prot_info != nullptr) { + auto merged_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + merged_kv_prot_info.UpdateV(value, new_value); + merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue); + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + &merged_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + nullptr /* kv_prot_info */); + } } } } if (!perform_merge) { - // Add merge operator to memtable - bool mem_res = - mem->Add(sequence_, kTypeMerge, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + assert(ret_status.ok()); + // Add merge operand to memtable + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = + mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem)); + } else { + ret_status = mem->Add( + sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */, + concurrent_memtable_writes_, get_post_process_info(mem)); } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. 
So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); + // TODO(ajkr): propagate `ProtectionInfoKVOS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, + key, value); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - // Same as PutCF except for value type. - return PutCFImpl(column_family_id, key, value, kTypeBlobIndex); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + // Same as PutCF except for value type. + return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + &mem_kv_prot_info); + } else { + return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + nullptr /* kv_prot_info */); + } } void CheckMemtableFull() { @@ -1799,8 +2033,8 @@ const MemTable* const mem = cfd->mem(); assert(mem); - if (mem->ApproximateMemoryUsageFast() + - imm->ApproximateMemoryUsageExcludingLast() >= + if (mem->MemoryAllocatedBytes() + + imm->MemoryAllocatedBytesExcludingLast() >= size_to_maintain && imm->MarkTrimHistoryNeeded()) { trim_history_scheduler_->ScheduleWork(cfd); @@ -1885,6 +2119,8 @@ Status s; if (recovering_log_number_ != 0) { + // We must hold db mutex in recovery. + db_->mutex()->AssertHeld(); // in recovery when we encounter a commit marker // we lookup this transaction in our set of rebuilt transactions // and commit. 
@@ -1927,6 +2163,76 @@ return s; } + Status MarkCommitWithTimestamp(const Slice& name, + const Slice& commit_ts) override { + assert(db_); + + Status s; + + if (recovering_log_number_ != 0) { + // In recovery, db mutex must be held. + db_->mutex()->AssertHeld(); + // in recovery when we encounter a commit marker + // we lookup this transaction in our set of rebuilt transactions + // and commit. + auto trx = db_->GetRecoveredTransaction(name.ToString()); + // the log containing the prepared section may have + // been released in the last incarnation because the + // data was flushed to L0 + if (trx) { + // at this point individual CF lognumbers will prevent + // duplicate re-insertion of values. + assert(0 == log_number_ref_); + if (write_after_commit_) { + // write_after_commit_ can only have one batch in trx. + assert(trx->batches_.size() == 1); + const auto& batch_info = trx->batches_.begin()->second; + // all inserts must reference this trx log number + log_number_ref_ = batch_info.log_number_; + const auto checker = [this](uint32_t cf, size_t& ts_sz) { + assert(db_); + VersionSet* const vset = db_->GetVersionSet(); + assert(vset); + ColumnFamilySet* const cf_set = vset->GetColumnFamilySet(); + assert(cf_set); + ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf); + assert(cfd); + const auto* const ucmp = cfd->user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() == 0) { + ts_sz = 0; + } else if (ucmp->timestamp_size() != ts_sz) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + }; + s = batch_info.batch_->AssignTimestamp(commit_ts, checker); + if (s.ok()) { + s = batch_info.batch_->Iterate(this); + log_number_ref_ = 0; + } + } + // else the values are already inserted before the commit + + if (s.ok()) { + db_->DeleteRecoveredTransaction(name.ToString()); + } + if (has_valid_writes_) { + *has_valid_writes_ = true; + } + } + } else { + // When writes are not delayed until commit, there is no connection + // 
between a memtable write and the WAL that supports it. So the commit + // need not reference any log as the only log to which it depends. + assert(!write_after_commit_ || log_number_ref_ > 0); + } + constexpr bool batch_boundary = true; + MaybeAdvanceSeq(batch_boundary); + + return s; + } + Status MarkRollback(const Slice& name) override { assert(db_); @@ -1973,8 +2279,8 @@ MemTableInserter inserter( sequence, memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, recovery_log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn); + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); for (auto w : write_group) { if (w->CallbackFailed()) { continue; @@ -1987,6 +2293,7 @@ } SetSequence(w->batch, inserter.sequence()); inserter.set_log_number_ref(w->log_ref); + inserter.set_prot_info(w->batch->prot_info_.get()); w->status = w->batch->Iterate(&inserter); if (!w->status.ok()) { return w->status; @@ -2008,13 +2315,15 @@ (void)batch_cnt; #endif assert(writer->ShouldWriteToMemtable()); - MemTableInserter inserter( - sequence, memtables, flush_scheduler, trim_history_scheduler, - ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn, hint_per_batch); + MemTableInserter inserter(sequence, memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); SetSequence(writer->batch, sequence); inserter.set_log_number_ref(writer->log_ref); + inserter.set_prot_info(writer->batch->prot_info_.get()); Status s = writer->batch->Iterate(&inserter); assert(!seq_per_batch || batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); @@ -2034,8 +2343,8 @@ 
MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, has_valid_writes, - seq_per_batch, batch_per_txn); + concurrent_memtable_writes, batch->prot_info_.get(), + has_valid_writes, seq_per_batch, batch_per_txn); Status s = batch->Iterate(&inserter); if (next_seq != nullptr) { *next_seq = inserter.sequence(); @@ -2048,6 +2357,7 @@ Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { assert(contents.size() >= WriteBatchInternal::kHeader); + assert(b->prot_info_ == nullptr); b->rep_.assign(contents.data(), contents.size()); b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed); return Status::OK(); @@ -2055,6 +2365,8 @@ Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src, const bool wal_only) { + assert(dst->Count() == 0 || + (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr)); size_t src_len; int src_count; uint32_t src_flags; @@ -2071,6 +2383,13 @@ src_flags = src->content_flags_.load(std::memory_order_relaxed); } + if (dst->prot_info_ != nullptr) { + std::copy(src->prot_info_->entries_.begin(), + src->prot_info_->entries_.begin() + src_count, + std::back_inserter(dst->prot_info_->entries_)); + } else if (src->prot_info_ != nullptr) { + dst->prot_info_.reset(new WriteBatch::ProtectionInfo(*src->prot_info_)); + } SetCount(dst, Count(dst) + src_count); assert(src->rep_.size() >= WriteBatchInternal::kHeader); dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_internal.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_internal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_internal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_internal.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,8 +8,11 @@ // found in the LICENSE 
file. See the AUTHORS file for names of contributors. #pragma once +#include #include + #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/trim_history_scheduler.h" #include "db/write_thread.h" #include "rocksdb/db.h" @@ -17,6 +20,7 @@ #include "rocksdb/types.h" #include "rocksdb/write_batch.h" #include "util/autovector.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -61,6 +65,14 @@ MemTable* mem_; }; +struct WriteBatch::ProtectionInfo { + // `WriteBatch` usually doesn't contain a huge number of keys so protecting + // with a fixed, non-configurable eight bytes per key may work well enough. + autovector entries_; + + size_t GetBytesPerKey() const { return 8; } +}; + // WriteBatchInternal provides static methods for manipulating a // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { @@ -112,6 +124,9 @@ static Status MarkCommit(WriteBatch* batch, const Slice& xid); + static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid, + const Slice& commit_ts); + static Status InsertNoop(WriteBatch* batch); // Return the number of entries in the batch. @@ -204,7 +219,7 @@ // This write batch includes the latest state that should be persisted. Such // state meant to be used only during recovery. 
- static void SetAsLastestPersistentState(WriteBatch* b); + static void SetAsLatestPersistentState(WriteBatch* b); static bool IsLatestPersistentState(const WriteBatch* b); }; @@ -232,6 +247,9 @@ if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) { batch_->rep_.resize(savepoint_.size); WriteBatchInternal::SetCount(batch_, savepoint_.count); + if (batch_->prot_info_ != nullptr) { + batch_->prot_info_->entries_.resize(savepoint_.count); + } batch_->content_flags_.store(savepoint_.content_flags, std::memory_order_relaxed); return Status::MemoryLimit(); @@ -247,4 +265,165 @@ #endif }; +template +class TimestampAssignerBase : public WriteBatch::Handler { + public: + explicit TimestampAssignerBase( + WriteBatch::ProtectionInfo* prot_info, + std::function&& checker) + : prot_info_(prot_info), checker_(std::move(checker)) {} + + ~TimestampAssignerBase() override {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { + return AssignTimestamp(cf, key); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return AssignTimestamp(cf, key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return AssignTimestamp(cf, key); + } + + Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, + const Slice&) override { + return AssignTimestamp(cf, begin_key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { + return AssignTimestamp(cf, key); + } + + Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override { + return AssignTimestamp(cf, key); + } + + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkNoop(bool 
/*empty_batch*/) override { return Status::OK(); } + + protected: + Status AssignTimestamp(uint32_t cf, const Slice& key) { + Status s = static_cast_with_check(this)->AssignTimestampImpl( + cf, key, idx_); + ++idx_; + return s; + } + + Status CheckTimestampSize(uint32_t cf, size_t& ts_sz) { + return checker_(cf, ts_sz); + } + + Status UpdateTimestampIfNeeded(size_t ts_sz, const Slice& key, + const Slice& ts) { + if (ts_sz > 0) { + assert(ts_sz == ts.size()); + UpdateProtectionInformationIfNeeded(key, ts); + UpdateTimestamp(key, ts); + } + return Status::OK(); + } + + void UpdateProtectionInformationIfNeeded(const Slice& key, const Slice& ts) { + if (prot_info_ != nullptr) { + const size_t ts_sz = ts.size(); + SliceParts old_key(&key, 1); + Slice key_no_ts(key.data(), key.size() - ts_sz); + std::array new_key_cmpts{{key_no_ts, ts}}; + SliceParts new_key(new_key_cmpts.data(), 2); + prot_info_->entries_[idx_].UpdateK(old_key, new_key); + } + } + + void UpdateTimestamp(const Slice& key, const Slice& ts) { + const size_t ts_sz = ts.size(); + char* ptr = const_cast(key.data() + key.size() - ts_sz); + assert(ptr); + memcpy(ptr, ts.data(), ts_sz); + } + + // No copy or move. 
+ TimestampAssignerBase(const TimestampAssignerBase&) = delete; + TimestampAssignerBase(TimestampAssignerBase&&) = delete; + TimestampAssignerBase& operator=(const TimestampAssignerBase&) = delete; + TimestampAssignerBase& operator=(TimestampAssignerBase&&) = delete; + + WriteBatch::ProtectionInfo* const prot_info_ = nullptr; + const std::function checker_{}; + size_t idx_ = 0; +}; + +class SimpleListTimestampAssigner + : public TimestampAssignerBase { + public: + explicit SimpleListTimestampAssigner( + WriteBatch::ProtectionInfo* prot_info, + std::function&& checker, + const std::vector& timestamps) + : TimestampAssignerBase(prot_info, + std::move(checker)), + timestamps_(timestamps) {} + + ~SimpleListTimestampAssigner() override {} + + private: + friend class TimestampAssignerBase; + + Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t idx) { + if (idx >= timestamps_.size()) { + return Status::InvalidArgument("Need more timestamps for the assignment"); + } + const Slice& ts = timestamps_[idx]; + size_t ts_sz = ts.size(); + const Status s = this->CheckTimestampSize(cf, ts_sz); + if (!s.ok()) { + return s; + } + return this->UpdateTimestampIfNeeded(ts_sz, key, ts); + } + + const std::vector& timestamps_; +}; + +class TimestampAssigner : public TimestampAssignerBase { + public: + explicit TimestampAssigner(WriteBatch::ProtectionInfo* prot_info, + std::function&& checker, + const Slice& ts) + : TimestampAssignerBase(prot_info, std::move(checker)), + timestamp_(ts) { + assert(!timestamp_.empty()); + } + ~TimestampAssigner() override {} + + private: + friend class TimestampAssignerBase; + + Status AssignTimestampImpl(uint32_t cf, const Slice& key, size_t /*idx*/) { + if (timestamp_.empty()) { + return Status::InvalidArgument("Timestamp is empty"); + } + size_t ts_sz = timestamp_.size(); + const Status s = this->CheckTimestampSize(cf, ts_sz); + if (!s.ok()) { + return s; + } + return this->UpdateTimestampIfNeeded(ts_sz, key, timestamp_); + } + + const 
Slice timestamp_; +}; + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_batch_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_batch_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,28 +7,35 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "rocksdb/db.h" - #include + #include "db/column_family.h" +#include "db/db_test_util.h" #include "db/memtable.h" #include "db/write_batch_internal.h" +#include "rocksdb/comparator.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -static std::string PrintContents(WriteBatch* b) { +static std::string PrintContents(WriteBatch* b, + bool merge_operator_supported = true) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + if (merge_operator_supported) { + options.merge_operator.reset(new TestPutOperator()); + } + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); @@ -59,10 +66,11 @@ if (iter == nullptr) { continue; } + EXPECT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; ikey.clear(); - EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey)); + 
EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); switch (ikey.type) { case kTypeValue: state.append("Put("); @@ -110,18 +118,21 @@ break; } state.append("@"); - state.append(NumberToString(ikey.sequence)); + state.append(ToString(ikey.sequence)); } + EXPECT_OK(iter->status()); } - EXPECT_EQ(b->HasPut(), put_count > 0); - EXPECT_EQ(b->HasDelete(), delete_count > 0); - EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); - EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); - EXPECT_EQ(b->HasMerge(), merge_count > 0); - if (!s.ok()) { + if (s.ok()) { + EXPECT_EQ(b->HasPut(), put_count > 0); + EXPECT_EQ(b->HasDelete(), delete_count > 0); + EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); + EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); + EXPECT_EQ(b->HasMerge(), merge_count > 0); + if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + } else { state.append(s.ToString()); - } else if (count != WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); } delete mem->Unref(); return state; @@ -138,10 +149,10 @@ TEST_F(WriteBatchTest, Multiple) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.DeleteRange(Slice("bar"), Slice("foo")); - batch.Put(Slice("baz"), Slice("boo")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); + ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo"))); + ASSERT_OK(batch.Put(Slice("baz"), Slice("boo"))); WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); @@ -156,12 +167,12 @@ TEST_F(WriteBatchTest, Corruption) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); WriteBatchInternal::SetSequence(&batch, 200); Slice contents = 
WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); + ASSERT_OK(WriteBatchInternal::SetContents( + &batch, Slice(contents.data(), contents.size() - 1))); ASSERT_EQ("Put(foo, bar)@200" "Corruption: bad WriteBatch Delete", PrintContents(&batch)); @@ -171,24 +182,24 @@ WriteBatch b1, b2; WriteBatchInternal::SetSequence(&b1, 200); WriteBatchInternal::SetSequence(&b2, 300); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("", PrintContents(&b1)); ASSERT_EQ(0u, b1.Count()); - b2.Put("a", "va"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("a", "va")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); ASSERT_EQ(1u, b1.Count()); b2.Clear(); - b2.Put("b", "vb"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("b", "vb")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@201", PrintContents(&b1)); ASSERT_EQ(2u, b1.Count()); - b2.Delete("foo"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Delete("foo")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@202" "Put(b, vb)@201" @@ -196,11 +207,11 @@ PrintContents(&b1)); ASSERT_EQ(4u, b1.Count()); b2.Clear(); - b2.Put("c", "cc"); - b2.Put("d", "dd"); + ASSERT_OK(b2.Put("c", "cc")); + ASSERT_OK(b2.Put("d", "dd")); b2.MarkWalTerminationPoint(); - b2.Put("e", "ee"); - WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true); + ASSERT_OK(b2.Put("e", "ee")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true)); ASSERT_EQ( "Put(a, va)@200" "Put(b, vb)@202" @@ -223,10 +234,10 @@ WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ("", PrintContents(&batch)); ASSERT_EQ(0u, batch.Count()); - batch.Put("a", "va"); + ASSERT_OK(batch.Put("a", "va")); ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); ASSERT_EQ(1u, batch.Count()); - 
batch.SingleDelete("a"); + ASSERT_OK(batch.SingleDelete("a")); ASSERT_EQ( "SingleDelete(a)@101" "Put(a, va)@100", @@ -307,6 +318,11 @@ seen += "MarkCommit(" + xid.ToString() + ")"; return Status::OK(); } + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override { + seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " + + ts.ToString(true) + ")"; + return Status::OK(); + } Status MarkRollback(const Slice& xid) override { seen += "MarkRollback(" + xid.ToString() + ")"; return Status::OK(); @@ -316,7 +332,7 @@ TEST_F(WriteBatchTest, PutNotImplemented) { WriteBatch batch; - batch.Put(Slice("k1"), Slice("v1")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); @@ -326,7 +342,7 @@ TEST_F(WriteBatchTest, DeleteNotImplemented) { WriteBatch batch; - batch.Delete(Slice("k2")); + ASSERT_OK(batch.Delete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); @@ -336,7 +352,7 @@ TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { WriteBatch batch; - batch.SingleDelete(Slice("k2")); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); @@ -346,7 +362,7 @@ TEST_F(WriteBatchTest, MergeNotImplemented) { WriteBatch batch; - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); @@ -354,16 +370,26 @@ ASSERT_OK(batch.Iterate(&handler)); } +TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) { + WriteBatch batch; + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ( + "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator " + "!= nullptr`", + PrintContents(&batch, false /* merge_operator_supported */)); +} + TEST_F(WriteBatchTest, Blob) { WriteBatch batch; - 
batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.Put(Slice("k3"), Slice("v3")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k2")); - batch.SingleDelete(Slice("k3")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.Put(Slice("k3"), Slice("v3"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k2"))); + ASSERT_OK(batch.SingleDelete(Slice("k3"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(6u, batch.Count()); ASSERT_EQ( "Merge(foo, bar)@5" @@ -375,7 +401,7 @@ PrintContents(&batch)); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -390,19 +416,19 @@ TEST_F(WriteBatchTest, PrepareCommit) { WriteBatch batch; - WriteBatchInternal::InsertNoop(&batch); - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); + ASSERT_OK(WriteBatchInternal::InsertNoop(&batch)); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); batch.SetSavePoint(); - WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"))); Status s = batch.RollbackToSavePoint(); ASSERT_EQ(s, Status::NotFound()); - WriteBatchInternal::MarkCommit(&batch, Slice("xid1")); - WriteBatchInternal::MarkRollback(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1"))); + ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1"))); ASSERT_EQ(2u, batch.Count()); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "MarkBeginPrepare(false)" "Put(k1, v1)" @@ -419,7 +445,7 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { // Insert key and 
value of 3GB and push total batch size to 12GB. static const size_t kKeyValueSize = 4u; - static const uint32_t kNumUpdates = uint32_t(3 << 30); + static const uint32_t kNumUpdates = uint32_t{3} << 30; std::string raw(kKeyValueSize, 'A'); WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u); char c = 'A'; @@ -430,7 +456,7 @@ raw[0] = c; raw[raw.length() - 1] = c; c++; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(kNumUpdates, batch.Count()); @@ -472,7 +498,7 @@ bool Continue() override { return num_seen < kNumUpdates; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(kNumUpdates, handler.num_seen); } @@ -486,7 +512,7 @@ for (char i = 0; i < 2; i++) { raw[0] = 'A' + i; raw[raw.length() - 1] = 'A' - i; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(2u, batch.Count()); @@ -523,7 +549,7 @@ bool Continue() override { return num_seen < 2; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(2, handler.num_seen); } @@ -558,14 +584,14 @@ bool Continue() override { return num_seen < 5; } } handler; - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k1")); - batch.SingleDelete(Slice("k2")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); - batch.Iterate(&handler); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k1"))); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -577,22 +603,22 @@ TEST_F(WriteBatchTest, PutGatherSlices) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("foo"), 
Slice("bar"))); { // Try a write where the key is one slice but the value is two Slice key_slice("baz"); Slice value_slices[2] = { Slice("header"), Slice("payload") }; - batch.Put(SliceParts(&key_slice, 1), - SliceParts(value_slices, 2)); + ASSERT_OK( + batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2))); } { // One where the key is composite but the value is a single slice Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; Slice value_slice("value"); - batch.Put(SliceParts(key_slices, 3), - SliceParts(&value_slice, 1)); + ASSERT_OK( + batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1))); } WriteBatchInternal::SetSequence(&batch, 100); @@ -608,31 +634,34 @@ public: explicit ColumnFamilyHandleImplDummy(int id) : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} + explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + id_(id), + ucmp_(ucmp) {} uint32_t GetID() const override { return id_; } - const Comparator* GetComparator() const override { - return BytewiseComparator(); - } + const Comparator* GetComparator() const override { return ucmp_; } private: uint32_t id_; + const Comparator* const ucmp_ = BytewiseComparator(); }; } // namespace anonymous TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { WriteBatch batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), 
Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -650,14 +679,14 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { WriteBatchWithIndex batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); std::unique_ptr iter; @@ -736,7 +765,7 @@ ASSERT_TRUE(!iter->Valid()); TestHandler handler; - batch.GetWriteBatch()->Iterate(&handler); + ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -755,12 +784,12 @@ WriteBatch batch; batch.SetSavePoint(); - batch.Put("A", "a"); - batch.Put("B", "b"); + 
ASSERT_OK(batch.Put("A", "a")); + ASSERT_OK(batch.Put("B", "b")); batch.SetSavePoint(); - batch.Put("C", "c"); - batch.Delete("A"); + ASSERT_OK(batch.Put("C", "c")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); batch.SetSavePoint(); @@ -779,8 +808,8 @@ "Put(B, b)@1", PrintContents(&batch)); - batch.Delete("A"); - batch.Put("B", "bb"); + ASSERT_OK(batch.Delete("A")); + ASSERT_OK(batch.Put("B", "bb")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ("", PrintContents(&batch)); @@ -789,12 +818,12 @@ ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch)); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); - batch.Put("A", "aaa"); + ASSERT_OK(batch.Put("A", "aaa")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -804,8 +833,8 @@ batch.SetSavePoint(); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -826,7 +855,7 @@ ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch2)); - batch2.Delete("A"); + ASSERT_OK(batch2.Delete("A")); batch2.SetSavePoint(); s = batch2.RollbackToSavePoint(); @@ -838,7 +867,7 @@ batch2.SetSavePoint(); - batch2.Delete("B"); + ASSERT_OK(batch2.Delete("B")); ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); batch2.SetSavePoint(); @@ -861,7 +890,7 @@ ASSERT_EQ("", PrintContents(&batch3)); batch3.SetSavePoint(); - batch3.Delete("A"); + ASSERT_OK(batch3.Delete("A")); s = batch3.PopSavePoint(); ASSERT_OK(s); @@ -880,6 +909,173 @@ ASSERT_TRUE(s.IsMemoryLimit()); } +namespace { +class TimestampChecker : public WriteBatch::Handler { + public: + explicit TimestampChecker( + std::unordered_map cf_to_ucmps, Slice ts) + : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {} + Status PutCF(uint32_t cf, const Slice& key, const Slice& /*value*/) override { + auto cf_iter = cf_to_ucmps_.find(cf); + if (cf_iter == 
cf_to_ucmps_.end()) { + return Status::Corruption(); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return Status::OK(); + } + if (key.size() < ts_sz) { + return Status::Corruption(); + } + Slice ts = ExtractTimestampFromUserKey(key, ts_sz); + if (ts.compare(timestamp_) != 0) { + return Status::Corruption(); + } + return Status::OK(); + } + + private: + std::unordered_map cf_to_ucmps_; + Slice timestamp_; +}; + +Status CheckTimestampsInWriteBatch( + WriteBatch& wb, Slice timestamp, + std::unordered_map cf_to_ucmps) { + TimestampChecker ts_checker(cf_to_ucmps, timestamp); + return wb.Iterate(&ts_checker); +} +} // namespace + +TEST_F(WriteBatchTest, AssignTimestamps) { + // We assume the last eight bytes of each key is reserved for timestamps. + // Therefore, we must make sure each key is longer than eight bytes. + constexpr size_t key_size = 16; + constexpr size_t num_of_keys = 10; + std::vector key_strs(num_of_keys, std::string(key_size, '\0')); + + ColumnFamilyHandleImplDummy cf0(0); + ColumnFamilyHandleImplDummy cf4(4, test::ComparatorWithU64Ts()); + ColumnFamilyHandleImplDummy cf5(5, test::ComparatorWithU64Ts()); + + const std::unordered_map cf_to_ucmps = { + {0, cf0.GetComparator()}, + {4, cf4.GetComparator()}, + {5, cf5.GetComparator()}}; + + WriteBatch batch; + // Write to the batch. We will assign timestamps later. 
+ for (const auto& key_str : key_strs) { + ASSERT_OK(batch.Put(&cf0, key_str, "value")); + ASSERT_OK(batch.Put(&cf4, key_str, "value")); + ASSERT_OK(batch.Put(&cf5, key_str, "value")); + } + + static constexpr size_t timestamp_size = sizeof(uint64_t); + const auto checker1 = [](uint32_t cf, size_t& ts_sz) { + if (cf == 4 || cf == 5) { + if (ts_sz != timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + } else if (cf == 0) { + ts_sz = 0; + return Status::OK(); + } else { + return Status::Corruption("Invalid cf"); + } + return Status::OK(); + }; + ASSERT_OK( + batch.AssignTimestamp(std::string(timestamp_size, '\xfe'), checker1)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps)); + + // We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to + // simulate the case in which a transaction enables indexing for some writes + // while disables indexing for other writes. A transaction uses a + // WriteBatchWithIndex object to buffer writes (we consider Write-committed + // policy only). If indexing is enabled, then writes go through + // WriteBatchWithIndex API populating a WBWI internal data structure, i.e. a + // mapping from cf to user comparators. If indexing is disabled, a transaction + // writes directly to the underlying raw WriteBatch. We will need to track the + // comparator information for the column families to which un-indexed writes + // are performed. When calling AssignTimestamp(s) API of WriteBatch, we need + // indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and timestamp_size to perform + // checking. 
+ std::unordered_map indexed_cf_to_ucmps = { + {0, cf0.GetComparator()}, {4, cf4.GetComparator()}}; + std::unordered_set non_indexed_cfs_with_ts = {cf5.GetID()}; + const auto checker2 = [&indexed_cf_to_ucmps, &non_indexed_cfs_with_ts]( + uint32_t cf, size_t& ts_sz) { + if (non_indexed_cfs_with_ts.count(cf) > 0) { + if (ts_sz != timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + } + auto cf_iter = indexed_cf_to_ucmps.find(cf); + if (cf_iter == indexed_cf_to_ucmps.end()) { + return Status::Corruption("Unknown cf"); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + if (ucmp->timestamp_size() == 0) { + ts_sz = 0; + } else if (ts_sz != ucmp->timestamp_size()) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + }; + ASSERT_OK( + batch.AssignTimestamp(std::string(timestamp_size, '\xef'), checker2)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xef'), cf_to_ucmps)); + + std::vector ts_strs; + for (size_t i = 0; i < 3 * key_strs.size(); ++i) { + if (0 == (i % 3)) { + ts_strs.emplace_back(); + } else { + ts_strs.emplace_back(std::string(timestamp_size, '\xee')); + } + } + std::vector ts_vec(ts_strs.size()); + for (size_t i = 0; i < ts_vec.size(); ++i) { + ts_vec[i] = ts_strs[i]; + } + const auto checker3 = [&cf_to_ucmps](uint32_t cf, size_t& ts_sz) { + auto cf_iter = cf_to_ucmps.find(cf); + if (cf_iter == cf_to_ucmps.end()) { + return Status::Corruption("Invalid cf"); + } + const Comparator* const ucmp = cf_iter->second; + assert(ucmp); + if (ucmp->timestamp_size() != ts_sz) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + return Status::OK(); + }; + ASSERT_OK(batch.AssignTimestamps(ts_vec, checker3)); + ASSERT_OK(CheckTimestampsInWriteBatch( + batch, std::string(timestamp_size, '\xee'), cf_to_ucmps)); +} + +TEST_F(WriteBatchTest, CommitWithTimestamp) { + WriteBatch wb; + const std::string txn_name = 
"xid1"; + std::string ts; + constexpr uint64_t commit_ts = 23; + PutFixed64(&ts, commit_ts); + ASSERT_OK(WriteBatchInternal::MarkCommitWithTimestamp(&wb, txn_name, ts)); + TestHandler handler; + ASSERT_OK(wb.Iterate(&handler)); + ASSERT_EQ("MarkCommitWithTimestamp(" + txn_name + ", " + + Slice(ts).ToString(true) + ")", + handler.seen); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_callback_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_callback_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_callback_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_callback_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -84,13 +84,35 @@ bool AllowWriteBatching() override { return allow_batching_; } }; -TEST_F(WriteCallbackTest, WriteWithCallbackTest) { +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +class WriteCallbackPTest + : public WriteCallbackTest, + public ::testing::WithParamInterface< + std::tuple> { + public: + WriteCallbackPTest() { + std::tie(unordered_write_, seq_per_batch_, two_queues_, allow_parallel_, + allow_batching_, enable_WAL_, enable_pipelined_write_) = + GetParam(); + } + + protected: + bool unordered_write_; + bool seq_per_batch_; + bool two_queues_; + bool allow_parallel_; + bool allow_batching_; + bool enable_WAL_; + bool enable_pipelined_write_; +}; + +TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { struct WriteOP { WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; } void Put(const string& key, const string& val) { kvs_.push_back(std::make_pair(key, val)); - write_batch_.Put(key, val); + ASSERT_OK(write_batch_.Put(key, val)); } void Clear() { @@ -124,254 +146,239 @@ {false, false, true, false, true}, }; - for (auto& unordered_write : {true, false}) { - for (auto& seq_per_batch : {true, false}) { - for (auto& two_queues : {true, false}) { - for 
(auto& allow_parallel : {true, false}) { - for (auto& allow_batching : {true, false}) { - for (auto& enable_WAL : {true, false}) { - for (auto& enable_pipelined_write : {true, false}) { - for (auto& write_group : write_scenarios) { - Options options; - options.create_if_missing = true; - options.unordered_write = unordered_write; - options.allow_concurrent_memtable_write = allow_parallel; - options.enable_pipelined_write = enable_pipelined_write; - options.two_write_queues = two_queues; - // Skip unsupported combinations - if (options.enable_pipelined_write && seq_per_batch) { - continue; - } - if (options.enable_pipelined_write && options.two_write_queues) { - continue; - } - if (options.unordered_write && - !options.allow_concurrent_memtable_write) { - continue; - } - if (options.unordered_write && options.enable_pipelined_write) { - continue; - } - - ReadOptions read_options; - DB* db; - DBImpl* db_impl; - - DestroyDB(dbname, options); - - DBOptions db_options(options); - ColumnFamilyOptions cf_options(options); - std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); - std::vector handles; - auto open_s = - DBImpl::Open(db_options, dbname, column_families, &handles, - &db, seq_per_batch, true /* batch_per_txn */); - ASSERT_OK(open_s); - assert(handles.size() == 1); - delete handles[0]; - - db_impl = dynamic_cast(db); - ASSERT_TRUE(db_impl); - - // Writers that have called JoinBatchGroup. - std::atomic threads_joining(0); - // Writers that have linked to the queue - std::atomic threads_linked(0); - // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point. 
- std::atomic threads_verified(0); - - std::atomic seq(db_impl->GetLatestSequenceNumber()); - ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WriteThread::JoinBatchGroup:Start", [&](void*) { - uint64_t cur_threads_joining = threads_joining.fetch_add(1); - // Wait for the last joined writer to link to the queue. - // In this way the writers link to the queue one by one. - // This allows us to confidently detect the first writer - // who increases threads_linked as the leader. - while (threads_linked.load() < cur_threads_joining) { - } - }); - - // Verification once writers call JoinBatchGroup. - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { - uint64_t cur_threads_linked = threads_linked.fetch_add(1); - bool is_leader = false; - bool is_last = false; - - // who am i - is_leader = (cur_threads_linked == 0); - is_last = (cur_threads_linked == write_group.size() - 1); - - // check my state - auto* writer = reinterpret_cast(arg); - - if (is_leader) { - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_GROUP_LEADER); - } else { - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_INIT); - } - - // (meta test) the first WriteOP should indeed be the first - // and the last should be the last (all others can be out of - // order) - if (is_leader) { - ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == - !write_group.front().callback_.should_fail_); - } else if (is_last) { - ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == - !write_group.back().callback_.should_fail_); - } - - threads_verified.fetch_add(1); - // Wait here until all verification in this sync-point - // callback finish for all writers. 
- while (threads_verified.load() < write_group.size()) { - } - }); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) { - // check my state - auto* writer = reinterpret_cast(arg); - - if (!allow_batching) { - // no batching so everyone should be a leader - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_GROUP_LEADER); - } else if (!allow_parallel) { - ASSERT_TRUE(writer->state == - WriteThread::State::STATE_COMPLETED || - (enable_pipelined_write && - writer->state == - WriteThread::State:: - STATE_MEMTABLE_WRITER_LEADER)); - } - }); - - std::atomic thread_num(0); - std::atomic dummy_key(0); - - // Each write thread create a random write batch and write to DB - // with a write callback. - std::function write_with_callback_func = [&]() { - uint32_t i = thread_num.fetch_add(1); - Random rnd(i); - - // leaders gotta lead - while (i > 0 && threads_verified.load() < 1) { - } - - // loser has to lose - while (i == write_group.size() - 1 && - threads_verified.load() < write_group.size() - 1) { - } - - auto& write_op = write_group.at(i); - write_op.Clear(); - write_op.callback_.allow_batching_ = allow_batching; - - // insert some keys - for (uint32_t j = 0; j < rnd.Next() % 50; j++) { - // grab unique key - char my_key = dummy_key.fetch_add(1); - - string skey(5, my_key); - string sval(10, my_key); - write_op.Put(skey, sval); - - if (!write_op.callback_.should_fail_ && !seq_per_batch) { - seq.fetch_add(1); - } - } - if (!write_op.callback_.should_fail_ && seq_per_batch) { - seq.fetch_add(1); - } - - WriteOptions woptions; - woptions.disableWAL = !enable_WAL; - woptions.sync = enable_WAL; - Status s; - if (seq_per_batch) { - class PublishSeqCallback : public PreReleaseCallback { - public: - PublishSeqCallback(DBImpl* db_impl_in) - : db_impl_(db_impl_in) {} - Status Callback(SequenceNumber last_seq, bool /*not used*/, - uint64_t, size_t /*index*/, - size_t /*total*/) override { - 
db_impl_->SetLastPublishedSequence(last_seq); - return Status::OK(); - } - DBImpl* db_impl_; - } publish_seq_callback(db_impl); - // seq_per_batch requires a natural batch separator or Noop - WriteBatchInternal::InsertNoop(&write_op.write_batch_); - const size_t ONE_BATCH = 1; - s = db_impl->WriteImpl( - woptions, &write_op.write_batch_, &write_op.callback_, - nullptr, 0, false, nullptr, ONE_BATCH, - two_queues ? &publish_seq_callback : nullptr); - } else { - s = db_impl->WriteWithCallback( - woptions, &write_op.write_batch_, &write_op.callback_); - } - - if (write_op.callback_.should_fail_) { - ASSERT_TRUE(s.IsBusy()); - } else { - ASSERT_OK(s); - } - }; - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - // do all the writes - std::vector threads; - for (uint32_t i = 0; i < write_group.size(); i++) { - threads.emplace_back(write_with_callback_func); - } - for (auto& t : threads) { - t.join(); - } - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - - // check for keys - string value; - for (auto& w : write_group) { - ASSERT_TRUE(w.callback_.was_called_.load()); - for (auto& kvp : w.kvs_) { - if (w.callback_.should_fail_) { - ASSERT_TRUE( - db->Get(read_options, kvp.first, &value).IsNotFound()); - } else { - ASSERT_OK(db->Get(read_options, kvp.first, &value)); - ASSERT_EQ(value, kvp.second); - } - } - } - - ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence()); - - delete db; - DestroyDB(dbname, options); - } + for (auto& write_group : write_scenarios) { + Options options; + options.create_if_missing = true; + options.unordered_write = unordered_write_; + options.allow_concurrent_memtable_write = allow_parallel_; + options.enable_pipelined_write = enable_pipelined_write_; + options.two_write_queues = two_queues_; + // Skip unsupported combinations + if (options.enable_pipelined_write && seq_per_batch_) { + continue; + } + if (options.enable_pipelined_write && options.two_write_queues) { + continue; + } + if 
(options.unordered_write && !options.allow_concurrent_memtable_write) { + continue; + } + if (options.unordered_write && options.enable_pipelined_write) { + continue; + } + + ReadOptions read_options; + DB* db; + DBImpl* db_impl; + + DestroyDB(dbname, options); + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles, + &db, seq_per_batch_, true /* batch_per_txn */); + ASSERT_OK(open_s); + assert(handles.size() == 1); + delete handles[0]; + + db_impl = dynamic_cast(db); + ASSERT_TRUE(db_impl); + + // Writers that have called JoinBatchGroup. + std::atomic threads_joining(0); + // Writers that have linked to the queue + std::atomic threads_linked(0); + // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point. + std::atomic threads_verified(0); + + std::atomic seq(db_impl->GetLatestSequenceNumber()); + ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Start", [&](void*) { + uint64_t cur_threads_joining = threads_joining.fetch_add(1); + // Wait for the last joined writer to link to the queue. + // In this way the writers link to the queue one by one. + // This allows us to confidently detect the first writer + // who increases threads_linked as the leader. + while (threads_linked.load() < cur_threads_joining) { + } + }); + + // Verification once writers call JoinBatchGroup. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + uint64_t cur_threads_linked = threads_linked.fetch_add(1); + bool is_leader = false; + bool is_last = false; + + // who am i + is_leader = (cur_threads_linked == 0); + is_last = (cur_threads_linked == write_group.size() - 1); + + // check my state + auto* writer = reinterpret_cast(arg); + + if (is_leader) { + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_GROUP_LEADER); + } else { + ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT); + } + + // (meta test) the first WriteOP should indeed be the first + // and the last should be the last (all others can be out of + // order) + if (is_leader) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.front().callback_.should_fail_); + } else if (is_last) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.back().callback_.should_fail_); + } + + threads_verified.fetch_add(1); + // Wait here until all verification in this sync-point + // callback finish for all writers. + while (threads_verified.load() < write_group.size()) { + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) { + // check my state + auto* writer = reinterpret_cast(arg); + + if (!allow_batching_) { + // no batching so everyone should be a leader + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_GROUP_LEADER); + } else if (!allow_parallel_) { + ASSERT_TRUE(writer->state == WriteThread::State::STATE_COMPLETED || + (enable_pipelined_write_ && + writer->state == + WriteThread::State::STATE_MEMTABLE_WRITER_LEADER)); + } + }); + + std::atomic thread_num(0); + std::atomic dummy_key(0); + + // Each write thread create a random write batch and write to DB + // with a write callback. 
+ std::function write_with_callback_func = [&]() { + uint32_t i = thread_num.fetch_add(1); + Random rnd(i); + + // leaders gotta lead + while (i > 0 && threads_verified.load() < 1) { + } + + // loser has to lose + while (i == write_group.size() - 1 && + threads_verified.load() < write_group.size() - 1) { + } + + auto& write_op = write_group.at(i); + write_op.Clear(); + write_op.callback_.allow_batching_ = allow_batching_; + + // insert some keys + for (uint32_t j = 0; j < rnd.Next() % 50; j++) { + // grab unique key + char my_key = dummy_key.fetch_add(1); + + string skey(5, my_key); + string sval(10, my_key); + write_op.Put(skey, sval); + + if (!write_op.callback_.should_fail_ && !seq_per_batch_) { + seq.fetch_add(1); + } + } + if (!write_op.callback_.should_fail_ && seq_per_batch_) { + seq.fetch_add(1); + } + + WriteOptions woptions; + woptions.disableWAL = !enable_WAL_; + woptions.sync = enable_WAL_; + Status s; + if (seq_per_batch_) { + class PublishSeqCallback : public PreReleaseCallback { + public: + PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {} + Status Callback(SequenceNumber last_seq, bool /*not used*/, uint64_t, + size_t /*index*/, size_t /*total*/) override { + db_impl_->SetLastPublishedSequence(last_seq); + return Status::OK(); } + DBImpl* db_impl_; + } publish_seq_callback(db_impl); + // seq_per_batch_ requires a natural batch separator or Noop + ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_)); + const size_t ONE_BATCH = 1; + s = db_impl->WriteImpl(woptions, &write_op.write_batch_, + &write_op.callback_, nullptr, 0, false, nullptr, + ONE_BATCH, + two_queues_ ? 
&publish_seq_callback : nullptr); + } else { + s = db_impl->WriteWithCallback(woptions, &write_op.write_batch_, + &write_op.callback_); + } + + if (write_op.callback_.should_fail_) { + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + }; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // do all the writes + std::vector threads; + for (uint32_t i = 0; i < write_group.size(); i++) { + threads.emplace_back(write_with_callback_func); + } + for (auto& t : threads) { + t.join(); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // check for keys + string value; + for (auto& w : write_group) { + ASSERT_TRUE(w.callback_.was_called_.load()); + for (auto& kvp : w.kvs_) { + if (w.callback_.should_fail_) { + ASSERT_TRUE(db->Get(read_options, kvp.first, &value).IsNotFound()); + } else { + ASSERT_OK(db->Get(read_options, kvp.first, &value)); + ASSERT_EQ(value, kvp.second); } } } - } - } + + ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence()); + + delete db; + DestroyDB(dbname, options); } } +INSTANTIATE_TEST_CASE_P(WriteCallbackPTest, WriteCallbackPTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + TEST_F(WriteCallbackTest, WriteCallBackTest) { Options options; WriteOptions write_options; @@ -391,8 +398,8 @@ WriteBatch wb; - wb.Put("a", "value.a"); - wb.Delete("x"); + ASSERT_OK(wb.Put("a", "value.a")); + ASSERT_OK(wb.Delete("x")); // Test a simple Write s = db->Write(write_options, &wb); @@ -406,7 +413,7 @@ WriteCallbackTestWriteCallback1 callback1; WriteBatch wb2; - wb2.Put("a", "value.a2"); + ASSERT_OK(wb2.Put("a", "value.a2")); s = db_impl->WriteWithCallback(write_options, &wb2, &callback1); ASSERT_OK(s); @@ -420,7 +427,7 @@ WriteCallbackTestWriteCallback2 callback2; WriteBatch wb3; - wb3.Put("a", "value.a3"); 
+ ASSERT_OK(wb3.Put("a", "value.a3")); s = db_impl->WriteWithCallback(write_options, &wb3, &callback2); ASSERT_NOK(s); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,12 @@ #include "db/write_controller.h" +#include #include #include #include -#include "rocksdb/env.h" + +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -19,10 +21,14 @@ std::unique_ptr WriteController::GetDelayToken( uint64_t write_rate) { - total_delayed_++; - // Reset counters. - last_refill_time_ = 0; - bytes_left_ = 0; + if (0 == total_delayed_++) { + // Starting delay, so reset counters. + next_refill_time_ = 0; + credit_in_bytes_ = 0; + } + // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in + // next_refill_time_ will be based on an old rate. This rate will apply + // for subsequent additional debts and for the next refill. set_delayed_write_rate(write_rate); return std::unique_ptr(new DelayWriteToken(this)); } @@ -42,7 +48,7 @@ // If it turns out to be a performance issue, we can redesign the thread // synchronization model here. // The function trust caller will sleep micros returned. -uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) { +uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) { if (total_stopped_.load(std::memory_order_relaxed) > 0) { return 0; } @@ -50,64 +56,51 @@ return 0; } - const uint64_t kMicrosPerSecond = 1000000; - const uint64_t kRefillInterval = 1024U; - - if (bytes_left_ >= num_bytes) { - bytes_left_ -= num_bytes; + if (credit_in_bytes_ >= num_bytes) { + credit_in_bytes_ -= num_bytes; return 0; } // The frequency to get time inside DB mutex is less than one per refill // interval. 
- auto time_now = NowMicrosMonotonic(env); + auto time_now = NowMicrosMonotonic(clock); - uint64_t sleep_debt = 0; - uint64_t time_since_last_refill = 0; - if (last_refill_time_ != 0) { - if (last_refill_time_ > time_now) { - sleep_debt = last_refill_time_ - time_now; - } else { - time_since_last_refill = time_now - last_refill_time_; - bytes_left_ += - static_cast(static_cast(time_since_last_refill) / - kMicrosPerSecond * delayed_write_rate_); - if (time_since_last_refill >= kRefillInterval && - bytes_left_ > num_bytes) { - // If refill interval already passed and we have enough bytes - // return without extra sleeping. - last_refill_time_ = time_now; - bytes_left_ -= num_bytes; - return 0; - } + const uint64_t kMicrosPerSecond = 1000000; + // Refill every 1 ms + const uint64_t kMicrosPerRefill = 1000; + + if (next_refill_time_ == 0) { + // Start with an initial allotment of bytes for one interval + next_refill_time_ = time_now; + } + if (next_refill_time_ <= time_now) { + // Refill based on time interval plus any extra elapsed + uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill; + credit_in_bytes_ += static_cast( + 1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999); + next_refill_time_ = time_now + kMicrosPerRefill; + + if (credit_in_bytes_ >= num_bytes) { + // Avoid delay if possible, to reduce DB mutex release & re-aquire. + credit_in_bytes_ -= num_bytes; + return 0; } } - uint64_t single_refill_amount = - delayed_write_rate_ * kRefillInterval / kMicrosPerSecond; - if (bytes_left_ + single_refill_amount >= num_bytes) { - // Wait until a refill interval - // Never trigger expire for less than one refill interval to avoid to get - // time. - bytes_left_ = bytes_left_ + single_refill_amount - num_bytes; - last_refill_time_ = time_now + kRefillInterval; - return kRefillInterval + sleep_debt; - } - - // Need to refill more than one interval. Need to sleep longer. 
Check - // whether expiration will hit - - // Sleep just until `num_bytes` is allowed. - uint64_t sleep_amount = - static_cast(num_bytes / - static_cast(delayed_write_rate_) * - kMicrosPerSecond) + - sleep_debt; - last_refill_time_ = time_now + sleep_amount; - return sleep_amount; + // We need to delay to avoid exceeding write rate. + assert(num_bytes > credit_in_bytes_); + uint64_t bytes_over_budget = num_bytes - credit_in_bytes_; + uint64_t needed_delay = static_cast( + 1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond); + + credit_in_bytes_ = 0; + next_refill_time_ += needed_delay; + + // Minimum delay of refill interval, to reduce DB mutex contention. + return std::max(next_refill_time_ - time_now, kMicrosPerRefill); } -uint64_t WriteController::NowMicrosMonotonic(Env* env) { - return env->NowNanos() / std::milli::den; +uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) { + return clock->NowNanos() / std::milli::den; } StopWriteToken::~StopWriteToken() { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,7 @@ namespace ROCKSDB_NAMESPACE { -class Env; +class SystemClock; class WriteControllerToken; // WriteController is controlling write stalls in our write code-path. Write @@ -27,8 +27,8 @@ : total_stopped_(0), total_delayed_(0), total_compaction_pressure_(0), - bytes_left_(0), - last_refill_time_(0), + credit_in_bytes_(0), + next_refill_time_(0), low_pri_rate_limiter_( NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) { set_max_delayed_write_rate(_delayed_write_rate); @@ -57,7 +57,7 @@ // return how many microseconds the caller needs to sleep after the call // num_bytes: how many number of bytes to put into the DB. 
// Prerequisite: DB mutex held. - uint64_t GetDelay(Env* env, uint64_t num_bytes); + uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes); void set_delayed_write_rate(uint64_t write_rate) { // avoid divide 0 if (write_rate == 0) { @@ -85,7 +85,7 @@ RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); } private: - uint64_t NowMicrosMonotonic(Env* env); + uint64_t NowMicrosMonotonic(SystemClock* clock); friend class WriteControllerToken; friend class StopWriteToken; @@ -95,11 +95,14 @@ std::atomic total_stopped_; std::atomic total_delayed_; std::atomic total_compaction_pressure_; - uint64_t bytes_left_; - uint64_t last_refill_time_; - // write rate set when initialization or by `DBImpl::SetDBOptions` + + // Number of bytes allowed to write without delay + uint64_t credit_in_bytes_; + // Next time that we can add more credit of bytes + uint64_t next_refill_time_; + // Write rate set when initialization or by `DBImpl::SetDBOptions` uint64_t max_delayed_write_rate_; - // current write rate + // Current write rate (bytes / second) uint64_t delayed_write_rate_; std::unique_ptr low_pri_rate_limiter_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_controller_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_controller_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,128 +3,240 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#include - #include "db/write_controller.h" -#include "rocksdb/env.h" +#include +#include + +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - -class WriteControllerTest : public testing::Test {}; - -class TimeSetEnv : public EnvWrapper { +namespace { +class TimeSetClock : public SystemClockWrapper { public: - explicit TimeSetEnv() : EnvWrapper(nullptr) {} + explicit TimeSetClock() : SystemClockWrapper(nullptr) {} + const char* Name() const override { return "TimeSetClock"; } uint64_t now_micros_ = 6666; uint64_t NowNanos() override { return now_micros_ * std::milli::den; } }; +} // namespace +class WriteControllerTest : public testing::Test { + public: + WriteControllerTest() { clock_ = std::make_shared(); } + std::shared_ptr clock_; +}; + +// Make tests easier to read +#define MILLION *1000000u +#define MB MILLION +#define MBPS MILLION +#define SECS MILLION // in microseconds + +TEST_F(WriteControllerTest, BasicAPI) { + WriteController controller(40 MBPS); // also set max delayed rate + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + // set, get + controller.set_delayed_write_rate(20 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + { + // set with token, get + auto delay_token_0 = controller.GetDelayToken(10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + // test with delay + EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 2 SECS; // pay the "debt" + + auto delay_token_1 = controller.GetDelayToken(2 MBPS); + EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB)); + 
clock_->now_micros_ += 10 SECS; // pay the "debt" + + auto delay_token_2 = controller.GetDelayToken(1 MBPS); + EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 20 SECS; // pay the "debt" + + auto delay_token_3 = controller.GetDelayToken(20 MBPS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 1 SECS; // pay the "debt" + + // 60M is more than the max rate of 40M. Max rate will be used. + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + auto delay_token_4 = + controller.GetDelayToken(controller.delayed_write_rate() * 3); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_EQ(static_cast(0.5 SECS), + controller.GetDelay(clock_.get(), 20 MB)); + + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + + // Test stop tokens + { + auto stop_token_1 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + { + auto stop_token_2 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + // Stop tokens released + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + // pay the previous "debt" + clock_->now_micros_ += static_cast(0.5 SECS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB)); + } -TEST_F(WriteControllerTest, ChangeDelayRateTest) { - TimeSetEnv env; - WriteController controller(40000000u); // also set max delayed rate - controller.set_delayed_write_rate(10000000u); + // Delay tokens released + EXPECT_FALSE(controller.NeedsDelay()); +} + +TEST_F(WriteControllerTest, StartFilled) { + WriteController controller(10 MBPS); + + // Attempt to write two things that combined would be allowed within + // a single 
refill interval auto delay_token_0 = controller.GetDelayToken(controller.delayed_write_rate()); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_1 = controller.GetDelayToken(2000000u); - ASSERT_EQ(static_cast(10000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_2 = controller.GetDelayToken(1000000u); - ASSERT_EQ(static_cast(20000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_3 = controller.GetDelayToken(20000000u); - ASSERT_EQ(static_cast(1000000), - controller.GetDelay(&env, 20000000u)); - // This is more than max rate. Max delayed rate will be used. - auto delay_token_4 = - controller.GetDelayToken(controller.delayed_write_rate() * 3); - ASSERT_EQ(static_cast(500000), - controller.GetDelay(&env, 20000000u)); + + // Verify no delay because write rate has not been exceeded within + // refill interval. + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + + // Allow refill (kMicrosPerRefill) + clock_->now_micros_ += 1000; + + // Again + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + + // Control: something bigger that would exceed write rate within interval + uint64_t delay = controller.GetDelay(clock_.get(), 10 MB); + EXPECT_GT(1.0 * delay, 0.999 SECS); + EXPECT_LT(1.0 * delay, 1.001 SECS); +} + +TEST_F(WriteControllerTest, DebtAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Accumulate a time delay debt with no passage of time, like many column + // families delaying writes simultaneously. (Old versions of WriteController + // would reset the debt on every GetDelayToken.) 
+ uint64_t debt = 0; + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; + + // Now accumulate debt with some passage of time. + for (unsigned i = 0; i < tokens.size(); ++i) { + // Debt is accumulated in time, not in bytes, so this new write + // limit is not applied to prior requested delays, even it they are + // in progress. + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + uint64_t credit = debt / 2; + clock_->now_micros_ += credit; + debt -= credit; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; // consistent state + (void)debt; // appease clang-analyze + + // Verify paid down + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + + // Accumulate another debt, without accounting, and releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + // Big and small are delayed + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB)); + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + tokens[i].reset(); + } + // All tokens released. + // Verify that releasing all tokens pays down debt, even with no time passage. 
+ tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); } -TEST_F(WriteControllerTest, SanityTest) { - WriteController controller(10000000u); - auto stop_token_1 = controller.GetStopToken(); - auto stop_token_2 = controller.GetStopToken(); - - ASSERT_TRUE(controller.IsStopped()); - stop_token_1.reset(); - ASSERT_TRUE(controller.IsStopped()); - stop_token_2.reset(); - ASSERT_FALSE(controller.IsStopped()); - - TimeSetEnv env; - - auto delay_token_1 = controller.GetDelayToken(10000000u); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 - - auto delay_token_2 = controller.GetDelayToken(10000000u); - // Rate reset after changing the token. - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 - - // One refill: 10240 bytes allowed, 1000 used, 9240 left - ASSERT_EQ(static_cast(1124), controller.GetDelay(&env, 1000u)); - env.now_micros_ += 1124u; // sleep debt 0 - - delay_token_2.reset(); - // 1000 used, 8240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 100 - // 1000 used, 7240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 200 - // One refill: 10240 fileed, sleep credit generates 2000. 8000 used - // 7240 + 10240 + 2000 - 8000 = 11480 left - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 8000u)); - - env.now_micros_ += 200u; // sleep debt 824 - // 1000 used, 10480 left. - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 200u; // sleep debt 624 - // Out of bound sleep, still 10480 left - ASSERT_EQ(static_cast(3000624u), - controller.GetDelay(&env, 30000000u)); - - env.now_micros_ += 3000724u; // sleep credit 100 - // 6000 used, 4480 left. 
- ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 6000u)); - - env.now_micros_ += 200u; // sleep credit 300 - // One refill, credit 4480 balance + 3000 credit + 10240 refill - // Use 8000, 9720 left - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 8000u)); - - env.now_micros_ += 3024u; // sleep credit 2000 - - // 1720 left - ASSERT_EQ(static_cast(0u), controller.GetDelay(&env, 8000u)); - - // 1720 balance + 20000 credit = 20170 left - // Use 8000, 12170 left - ASSERT_EQ(static_cast(0u), controller.GetDelay(&env, 8000u)); - - // 4170 left - ASSERT_EQ(static_cast(0u), controller.GetDelay(&env, 8000u)); - - // Need a refill - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 9000u)); - - delay_token_1.reset(); - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 30000000u)); - delay_token_1.reset(); - ASSERT_FALSE(controller.IsStopped()); +// This may or may not be a "good" feature, but it's an old feature +TEST_F(WriteControllerTest, CreditAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Ensure started + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit + uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */; + clock_->now_micros_ += credit; + + // Spend some credit (burst of I/O) + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB)); + // In WriteController, credit is accumulated in bytes, not in time. + // After an "unnecessary" delay, all of our time credit will be + // translated to bytes on the next operation, in this case with + // setting 1 MBPS. So regardless of the rate at delay time, we just + // account for the bytes. 
+ credit -= 63 MB; + } + // Spend remaining credit + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit)); + // Verify + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit, no accounting + clock_->now_micros_ += 1000 SECS; + + // Spend a small amount, releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB)); + tokens[i].reset(); + } + + // All tokens released. + // Verify credit is wiped away on new delay. + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.cc 2025-05-19 16:14:27.000000000 +0000 @@ -208,6 +208,7 @@ } void WriteThread::SetState(Writer* w, uint8_t new_state) { + assert(w); auto state = w->state.load(std::memory_order_acquire); if (state == STATE_LOCKED_WAITING || !w->state.compare_exchange_strong(state, new_state)) { @@ -240,6 +241,7 @@ MutexLock lock(&stall_mu_); writers = newest_writer->load(std::memory_order_relaxed); if (writers == &write_stall_dummy_) { + TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w); stall_cv_.Wait(); // Load newest_writers_ again since it may have changed writers = newest_writer->load(std::memory_order_relaxed); @@ -344,7 +346,13 @@ prev->link_older = w->link_older; w->status = Status::Incomplete("Write stall"); SetState(w, STATE_COMPLETED); - if (prev->link_older) { + // Only update `link_newer` if it's already set. 
+ // `CreateMissingNewerLinks()` will update the nullptr `link_newer` later, + // which assumes the the first non-nullptr `link_newer` is the last + // nullptr link in the writer list. + // If `link_newer` is set here, `CreateMissingNewerLinks()` may stop + // updating the whole list when it sees the first non nullptr link. + if (prev->link_older && prev->link_older->link_newer) { prev->link_older->link_newer = prev; } w = prev->link_older; @@ -438,6 +446,7 @@ // (newest_writer) is inclusive. Iteration goes from old to new. Writer* w = leader; while (w != newest_writer) { + assert(w->link_newer); w = w->link_newer; if (w->sync && !leader->sync) { @@ -457,6 +466,11 @@ break; } + if (w->protection_bytes_per_key != leader->protection_bytes_per_key) { + // Do not mix writes with different levels of integrity protection. + break; + } + if (w->batch == nullptr) { // Do not include those writes with nullptr batch. Those are not writes, // those are something else. They want to be alone @@ -464,7 +478,7 @@ } if (w->callback != nullptr && !w->callback->AllowWriteBatching()) { - // dont batch writes that don't want to be batched + // don't batch writes that don't want to be batched break; } @@ -512,6 +526,7 @@ Writer* w = leader; while (w != newest_writer) { + assert(w->link_newer); w = w->link_newer; if (w->batch == nullptr) { @@ -568,6 +583,7 @@ if (w == last_writer) { break; } + assert(next); w = next; } // Note that leader has to exit last, since it owns the write group. @@ -599,6 +615,8 @@ } // else we're the last parallel worker and should perform exit duties. w->status = write_group->status; + // Callers of this function must ensure w->status is checked. 
+ write_group->status.PermitUncheckedError(); return true; } @@ -615,11 +633,17 @@ static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader"); void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, - Status status) { + Status& status) { Writer* leader = write_group.leader; Writer* last_writer = write_group.last_writer; assert(leader->link_older == nullptr); + // If status is non-ok already, then write_group.status won't have the chance + // of being propagated to caller. + if (!status.ok()) { + write_group.status.PermitUncheckedError(); + } + // Propagate memtable write error to the whole group. if (status.ok() && !write_group.status.ok()) { status = write_group.status; @@ -721,6 +745,7 @@ // leader now while (last_writer != leader) { + assert(last_writer); last_writer->status = status; // we need to read link_older before calling SetState, because as soon // as it is marked committed the other thread's Await may return and diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.h mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db/write_thread.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db/write_thread.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,11 +5,11 @@ #pragma once -#include -#include #include +#include #include #include +#include #include #include #include @@ -36,7 +36,7 @@ // non-parallel informs a follower that its writes have been committed // (-> STATE_COMPLETED), or when a leader that has chosen to perform // updates in parallel and needs this Writer to apply its batch (-> - // STATE_PARALLEL_FOLLOWER). + // STATE_PARALLEL_MEMTABLE_WRITER). 
STATE_INIT = 1, // The state used to inform a waiting Writer that it has become the @@ -119,6 +119,7 @@ bool disable_wal; bool disable_memtable; size_t batch_cnt; // if non-zero, number of sub-batches in the write batch + size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference @@ -128,7 +129,7 @@ WriteGroup* write_group; SequenceNumber sequence; // the sequence number to use for the first key Status status; - Status callback_status; // status returned by callback->Callback() + Status callback_status; // status returned by callback->Callback() std::aligned_storage::type state_mutex_bytes; std::aligned_storage::type state_cv_bytes; @@ -142,6 +143,7 @@ disable_wal(false), disable_memtable(false), batch_cnt(0), + protection_bytes_per_key(0), pre_release_callback(nullptr), log_used(0), log_ref(0), @@ -163,6 +165,7 @@ disable_wal(write_options.disableWAL), disable_memtable(_disable_memtable), batch_cnt(_batch_cnt), + protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), log_used(0), log_ref(_log_ref), @@ -179,6 +182,8 @@ StateMutex().~mutex(); StateCV().~condition_variable(); } + status.PermitUncheckedError(); + callback_status.PermitUncheckedError(); } bool CheckCallback(DB* db) { @@ -241,7 +246,7 @@ std::condition_variable& StateCV() { assert(made_waitable); return *static_cast( - static_cast(&state_cv_bytes)); + static_cast(&state_cv_bytes)); } }; @@ -268,7 +273,7 @@ // STATE_GROUP_LEADER. If w has been made part of a sequential batch // group and the leader has performed the write, returns STATE_DONE. // If w has been made part of a parallel batch group and is responsible - // for updating the memtable, returns STATE_PARALLEL_FOLLOWER. + // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER. 
// // The db mutex SHOULD NOT be held when calling this function, because // it will block. @@ -289,7 +294,7 @@ // // WriteGroup* write_group: the write group // Status status: Status of write operation - void ExitAsBatchGroupLeader(WriteGroup& write_group, Status status); + void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status); // Exit batch group on behalf of batch group leader. void ExitAsBatchGroupFollower(Writer* w); @@ -305,8 +310,8 @@ // the next leader if needed. void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group); - // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the - // non-leader members of this write batch group. Sets Writer::sequence + // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of + // the non-leader members of this write batch group. Sets Writer::sequence // before waking them up. // // WriteGroup* write_group: Extra state used to coordinate the parallel add diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -1,14 +1,17 @@ add_executable(db_stress${ARTIFACT_SUFFIX} - db_stress.cc - db_stress_tool.cc batched_ops_stress.cc cf_consistency_stress.cc + db_stress.cc db_stress_common.cc db_stress_driver.cc - db_stress_test_base.cc - db_stress_shared_state.cc db_stress_gflags.cc + db_stress_listener.cc + db_stress_shared_state.cc + db_stress_stat.cc + db_stress_test_base.cc db_stress_tool.cc + expected_state.cc + multi_ops_txns_stress.cc no_batched_ops_stress.cc) -target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB}) +target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) list(APPEND tool_deps db_stress) diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,6 +16,8 @@ BatchedOpsStressTest() {} virtual ~BatchedOpsStressTest() {} + bool IsStateTracked() const override { return false; } + // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ... // ("9"+K, "9"+V) in DB atomically i.e in a single batch. // Also refer BatchedOpsStressTest::TestGet @@ -31,7 +33,8 @@ std::string keys[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; std::string values[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; Slice value_slices[10]; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); @@ -66,7 +69,8 @@ std::unique_ptr& /* lock */) override { std::string keys[10] = {"9", "7", "5", "3", "1", "8", "6", "4", "2", "0"}; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); @@ -228,7 +232,8 @@ for (size_t i = 1; i < num_prefixes; i++) { if (values[i] != values[0]) { fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", - key_str[i].c_str(), StringToHex(values[0].ToString()).c_str(), + StringToHex(key_str[i]).c_str(), + StringToHex(values[0].ToString()).c_str(), StringToHex(values[i].ToString()).c_str()); // we continue after error rather than exiting so that we can // find more errors if any diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#include "file/file_util.h" namespace ROCKSDB_NAMESPACE { class CfConsistencyStressTest : public StressTest { @@ -17,6 +18,8 @@ ~CfConsistencyStressTest() override {} + bool IsStateTracked() const override { return false; } + Status TestPut(ThreadState* thread, WriteOptions& write_opts, const ReadOptions& /* read_opts */, const std::vector& rand_column_families, @@ -282,70 +285,6 @@ return column_families_[thread->rand.Next() % column_families_.size()]; } -#ifdef ROCKSDB_LITE - Status TestCheckpoint(ThreadState* /* thread */, - const std::vector& /* rand_column_families */, - const std::vector& /* rand_keys */) override { - assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestCheckpoint\n"); - std::terminate(); - } -#else - Status TestCheckpoint(ThreadState* thread, - const std::vector& /* rand_column_families */, - const std::vector& /* rand_keys */) override { - std::string checkpoint_dir = - FLAGS_db + "/.checkpoint" + ToString(thread->tid); - - // We need to clear DB including manifest files, so make a copy - Options opt_copy = options_; - opt_copy.env = db_stress_env->target(); - DestroyDB(checkpoint_dir, opt_copy); - - Checkpoint* checkpoint = nullptr; - Status s = Checkpoint::Create(db_, &checkpoint); - if (s.ok()) { - s = checkpoint->CreateCheckpoint(checkpoint_dir); - } - std::vector cf_handles; - DB* checkpoint_db = nullptr; - if (s.ok()) { - delete checkpoint; - checkpoint = nullptr; - Options options(options_); - options.listeners.clear(); - std::vector 
cf_descs; - // TODO(ajkr): `column_family_names_` is not safe to access here when - // `clear_column_family_one_in != 0`. But we can't easily switch to - // `ListColumnFamilies` to get names because it won't necessarily give - // the same order as `column_family_names_`. - if (FLAGS_clear_column_family_one_in == 0) { - for (const auto& name : column_family_names_) { - cf_descs.emplace_back(name, ColumnFamilyOptions(options)); - } - s = DB::OpenForReadOnly(DBOptions(options), checkpoint_dir, cf_descs, - &cf_handles, &checkpoint_db); - } - } - if (checkpoint_db != nullptr) { - for (auto cfh : cf_handles) { - delete cfh; - } - cf_handles.clear(); - delete checkpoint_db; - checkpoint_db = nullptr; - } - DestroyDB(checkpoint_dir, opt_copy); - if (!s.ok()) { - fprintf(stderr, "A checkpoint operation failed with: %s\n", - s.ToString().c_str()); - } - return s; - } -#endif // !ROCKSDB_LITE - void VerifyDb(ThreadState* thread) const override { ReadOptions options(FLAGS_verify_checksum, true); // We must set total_order_seek to true because we are doing a SeekToFirst diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,7 @@ return 1; } #else -#include +#include "rocksdb/db_stress_tool.h" int main(int argc, char** argv) { return ROCKSDB_NAMESPACE::db_stress_tool(argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc 2025-05-19 16:14:27.000000000 
+0000 @@ -10,9 +10,18 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" + #include -ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env = nullptr; +#include "util/file_checksum_helper.h" +#include "util/xxhash.h" + +ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr; +#ifndef NDEBUG +// If non-null, injects read error at a rate specified by the +// read_fault_one_in or write_fault_one_in flag +std::shared_ptr fault_fs_guard; +#endif // NDEBUG enum ROCKSDB_NAMESPACE::CompressionType compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e = @@ -21,7 +30,7 @@ ROCKSDB_NAMESPACE::kCRC32c; enum RepFactory FLAGS_rep_factory = kSkipList; std::vector sum_probs(100001); -int64_t zipf_sum_size = 100000; +constexpr int64_t zipf_sum_size = 100000; namespace ROCKSDB_NAMESPACE { @@ -151,8 +160,10 @@ snprintf(buf, 4, "%X", value[i]); tmp.append(buf); } - fprintf(stdout, "[CF %d] %" PRIi64 " == > (%" ROCKSDB_PRIszt ") %s\n", cf, - key, sz, tmp.c_str()); + auto key_str = Key(key); + Slice key_slice = key_str; + fprintf(stdout, "[CF %d] %s (%" PRIi64 ") == > (%" ROCKSDB_PRIszt ") %s\n", + cf, key_slice.ToString(true).c_str(), key, sz, tmp.c_str()); } // Note that if hot_key_alpha != 0, it generates the key based on Zipfian @@ -214,12 +225,129 @@ ((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult; assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t)); (void)max_sz; - *((uint32_t*)v) = rand; + PutUnaligned(reinterpret_cast(v), rand); for (size_t i = sizeof(uint32_t); i < value_sz; i++) { v[i] = (char)(rand ^ i); } v[value_sz] = '\0'; return value_sz; // the size of the value set. 
} + +uint32_t GetValueBase(Slice s) { + assert(s.size() >= sizeof(uint32_t)); + uint32_t res; + GetUnaligned(reinterpret_cast(s.data()), &res); + return res; +} + +std::string NowNanosStr() { + uint64_t t = db_stress_env->NowNanos(); + std::string ret; + PutFixed64(&ret, t); + return ret; +} + +std::string GenerateTimestampForRead() { return NowNanosStr(); } + +namespace { + +class MyXXH64Checksum : public FileChecksumGenerator { + public: + explicit MyXXH64Checksum(bool big) : big_(big) { + state_ = XXH64_createState(); + XXH64_reset(state_, 0); + } + + virtual ~MyXXH64Checksum() override { XXH64_freeState(state_); } + + void Update(const char* data, size_t n) override { + XXH64_update(state_, data, n); + } + + void Finalize() override { + assert(str_.empty()); + uint64_t digest = XXH64_digest(state_); + // Store as little endian raw bytes + PutFixed64(&str_, digest); + if (big_) { + // Throw in some more data for stress testing (448 bits total) + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + PutFixed64(&str_, GetSliceHash64(str_)); + } + } + + std::string GetChecksum() const override { + assert(!str_.empty()); + return str_; + } + + const char* Name() const override { + return big_ ? 
"MyBigChecksum" : "MyXXH64Checksum"; + } + + private: + bool big_; + XXH64_state_t* state_; + std::string str_; +}; + +class DbStressChecksumGenFactory : public FileChecksumGenFactory { + std::string default_func_name_; + + std::unique_ptr CreateFromFuncName( + const std::string& func_name) { + std::unique_ptr rv; + if (func_name == "FileChecksumCrc32c") { + rv.reset(new FileChecksumGenCrc32c(FileChecksumGenContext())); + } else if (func_name == "MyXXH64Checksum") { + rv.reset(new MyXXH64Checksum(false /* big */)); + } else if (func_name == "MyBigChecksum") { + rv.reset(new MyXXH64Checksum(true /* big */)); + } else { + // Should be a recognized function when we get here + assert(false); + } + return rv; + } + + public: + explicit DbStressChecksumGenFactory(const std::string& default_func_name) + : default_func_name_(default_func_name) {} + + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + if (context.requested_checksum_func_name.empty()) { + return CreateFromFuncName(default_func_name_); + } else { + return CreateFromFuncName(context.requested_checksum_func_name); + } + } + + const char* Name() const override { return "FileChecksumGenCrc32cFactory"; } +}; + +} // namespace + +std::shared_ptr GetFileChecksumImpl( + const std::string& name) { + // Translate from friendly names to internal names + std::string internal_name; + if (name == "crc32c") { + internal_name = "FileChecksumCrc32c"; + } else if (name == "xxh64") { + internal_name = "MyXXH64Checksum"; + } else if (name == "big") { + internal_name = "MyBigChecksum"; + } else { + assert(name.empty() || name == "none"); + return nullptr; + } + return std::make_shared(internal_name); +} + } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_common.h 2025-05-19 16:14:27.000000000 +0000 @@ -26,6 +26,7 @@ #include #include #include + #include #include #include @@ -58,6 +59,7 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/write_batch.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" @@ -66,12 +68,6 @@ #include "util/random.h" #include "util/string_util.h" #include "utilities/blob_db/blob_db.h" -// SyncPoint is not supported in Released Windows Mode. -#if !(defined NDEBUG) || !defined(OS_WIN) -#include "test_util/sync_point.h" -#endif // !(defined NDEBUG) || !defined(OS_WIN) -#include "test_util/testutil.h" - #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; @@ -91,6 +87,7 @@ DECLARE_bool(test_batches_snapshots); DECLARE_bool(atomic_flush); DECLARE_bool(test_cf_consistency); +DECLARE_bool(test_multi_ops_txns); DECLARE_int32(threads); DECLARE_int32(ttl); DECLARE_int32(value_size_mult); @@ -112,6 +109,7 @@ DECLARE_int32(open_files); DECLARE_int64(compressed_cache_size); DECLARE_int32(compaction_style); +DECLARE_int32(num_levels); DECLARE_int32(level0_file_num_compaction_trigger); DECLARE_int32(level0_slowdown_writes_trigger); DECLARE_int32(level0_stop_writes_trigger); @@ -128,36 +126,46 @@ DECLARE_int32(universal_max_merge_width); DECLARE_int32(universal_max_size_amplification_percent); DECLARE_int32(clear_column_family_one_in); -DECLARE_int32(get_live_files_and_wal_files_one_in); +DECLARE_int32(get_live_files_one_in); +DECLARE_int32(get_sorted_wal_files_one_in); +DECLARE_int32(get_current_wal_file_one_in); DECLARE_int32(set_options_one_in); DECLARE_int32(set_in_place_one_in); DECLARE_int64(cache_size); +DECLARE_int32(cache_numshardbits); 
DECLARE_bool(cache_index_and_filter_blocks); +DECLARE_int32(top_level_index_pinning); +DECLARE_int32(partition_pinning); +DECLARE_int32(unpartitioned_pinning); DECLARE_bool(use_clock_cache); DECLARE_uint64(subcompactions); DECLARE_uint64(periodic_compaction_seconds); DECLARE_uint64(compaction_ttl); DECLARE_bool(allow_concurrent_memtable_write); +DECLARE_double(experimental_mempurge_threshold); DECLARE_bool(enable_write_thread_adaptive_yield); DECLARE_int32(reopen); DECLARE_double(bloom_bits); DECLARE_bool(use_block_based_filter); +DECLARE_int32(ribbon_starting_level); DECLARE_bool(partition_filters); +DECLARE_bool(optimize_filters_for_memory); DECLARE_int32(index_type); DECLARE_string(db); DECLARE_string(secondaries_base); DECLARE_bool(test_secondary); -DECLARE_string(expected_values_path); +DECLARE_string(expected_values_dir); DECLARE_bool(verify_checksum); DECLARE_bool(mmap_read); DECLARE_bool(mmap_write); DECLARE_bool(use_direct_reads); DECLARE_bool(use_direct_io_for_flush_and_compaction); +DECLARE_bool(mock_direct_io); DECLARE_bool(statistics); DECLARE_bool(sync); DECLARE_bool(use_fsync); DECLARE_int32(kill_random_test); -DECLARE_string(kill_prefix_blacklist); +DECLARE_string(kill_exclude_prefixes); DECLARE_bool(disable_wal); DECLARE_uint64(recycle_log_file_num); DECLARE_int64(target_file_size_base); @@ -167,15 +175,19 @@ DECLARE_int32(range_deletion_width); DECLARE_uint64(rate_limiter_bytes_per_sec); DECLARE_bool(rate_limit_bg_reads); +DECLARE_uint64(sst_file_manager_bytes_per_sec); +DECLARE_uint64(sst_file_manager_bytes_per_truncate); DECLARE_bool(use_txn); DECLARE_uint64(txn_write_policy); DECLARE_bool(unordered_write); DECLARE_int32(backup_one_in); +DECLARE_uint64(backup_max_size); DECLARE_int32(checkpoint_one_in); DECLARE_int32(ingest_external_file_one_in); DECLARE_int32(ingest_external_file_width); DECLARE_int32(compact_files_one_in); DECLARE_int32(compact_range_one_in); +DECLARE_int32(mark_for_compaction_one_file_in); DECLARE_int32(flush_one_in); 
DECLARE_int32(pause_background_one_in); DECLARE_int32(compact_range_width); @@ -192,13 +204,17 @@ DECLARE_int32(nooverwritepercent); DECLARE_int32(iterpercent); DECLARE_uint64(num_iterations); +DECLARE_int32(customopspercent); DECLARE_string(compression_type); DECLARE_string(bottommost_compression_type); DECLARE_int32(compression_max_dict_bytes); DECLARE_int32(compression_zstd_max_train_bytes); +DECLARE_int32(compression_parallel_threads); +DECLARE_uint64(compression_max_dict_buffer_bytes); DECLARE_string(checksum_type); DECLARE_string(hdfs); DECLARE_string(env_uri); +DECLARE_string(fs_uri); DECLARE_uint64(ops_per_thread); DECLARE_uint64(log2_keys_per_lock); DECLARE_uint64(max_manifest_file_size); @@ -211,13 +227,17 @@ DECLARE_int32(sync_wal_one_in); DECLARE_bool(avoid_unnecessary_blocking_io); DECLARE_bool(write_dbid_to_manifest); +DECLARE_bool(avoid_flush_during_recovery); DECLARE_uint64(max_write_batch_group_size_bytes); DECLARE_bool(level_compaction_dynamic_level_bytes); DECLARE_int32(verify_checksum_one_in); DECLARE_int32(verify_db_one_in); DECLARE_int32(continuous_verification_interval); +DECLARE_int32(get_property_one_in); +DECLARE_string(file_checksum_impl); #ifndef ROCKSDB_LITE +// Options for StackableDB-based BlobDB DECLARE_bool(use_blob_db); DECLARE_uint64(blob_db_min_blob_size); DECLARE_uint64(blob_db_bytes_per_sync); @@ -225,14 +245,46 @@ DECLARE_bool(blob_db_enable_gc); DECLARE_double(blob_db_gc_cutoff); #endif // !ROCKSDB_LITE + +// Options for integrated BlobDB +DECLARE_bool(allow_setting_blob_options_dynamically); +DECLARE_bool(enable_blob_files); +DECLARE_uint64(min_blob_size); +DECLARE_uint64(blob_file_size); +DECLARE_string(blob_compression_type); +DECLARE_bool(enable_blob_garbage_collection); +DECLARE_double(blob_garbage_collection_age_cutoff); +DECLARE_double(blob_garbage_collection_force_threshold); +DECLARE_uint64(blob_compaction_readahead_size); + DECLARE_int32(approximate_size_one_in); +DECLARE_bool(sync_fault_injection); -const long KB = 
1024; -const int kRandomValueMaxFactor = 3; -const int kValueMaxLen = 100; +DECLARE_bool(best_efforts_recovery); +DECLARE_bool(skip_verifydb); +DECLARE_bool(enable_compaction_filter); +DECLARE_bool(paranoid_file_checks); +DECLARE_bool(fail_if_options_file_error); +DECLARE_uint64(batch_protection_bytes_per_key); + +DECLARE_uint64(user_timestamp_size); +DECLARE_string(secondary_cache_uri); +DECLARE_int32(secondary_cache_fault_one_in); + +DECLARE_int32(prepopulate_block_cache); + +constexpr long KB = 1024; +constexpr int kRandomValueMaxFactor = 3; +constexpr int kValueMaxLen = 100; // wrapped posix or hdfs environment -extern ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env; +extern ROCKSDB_NAMESPACE::Env* db_stress_env; +#ifndef NDEBUG +namespace ROCKSDB_NAMESPACE { +class FaultInjectionTestFS; +} // namespace ROCKSDB_NAMESPACE +extern std::shared_ptr fault_fs_guard; +#endif extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e; extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e; @@ -424,19 +476,10 @@ assert(size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t)); - // Pad with zeros to make it a multiple of 8. 
This function may be called - // with a prefix, in which case we return the first index that falls - // inside or outside that prefix, dependeing on whether the prefix is - // the start of upper bound of a scan - unsigned int pad = sizeof(uint64_t) - (size_key % sizeof(uint64_t)); - if (pad < sizeof(uint64_t)) { - big_endian_key.append(pad, '\0'); - size_key += pad; - } - std::string little_endian_key; little_endian_key.resize(size_key); - for (size_t start = 0; start < size_key; start += sizeof(uint64_t)) { + for (size_t start = 0; start + sizeof(uint64_t) <= size_key; + start += sizeof(uint64_t)) { size_t end = start + sizeof(uint64_t); for (size_t i = 0; i < sizeof(uint64_t); ++i) { little_endian_key[start + i] = big_endian_key[end - 1 - i]; @@ -455,17 +498,40 @@ uint64_t pfx = prefixes[i]; key += (pfx / key_gen_ctx.weights[i]) * key_gen_ctx.window + pfx % key_gen_ctx.weights[i]; + if (i < prefixes.size() - 1) { + // The encoding writes a `key_gen_ctx.weights[i] - 1` that counts for + // `key_gen_ctx.weights[i]` when there are more prefixes to come. So we + // need to add back the one here as we're at a non-last prefix. + ++key; + } } *key_p = key; return true; } +// Given a string prefix, map it to the first corresponding index in the +// expected values buffer. +inline bool GetFirstIntValInPrefix(std::string big_endian_prefix, + uint64_t* key_p) { + size_t size_key = big_endian_prefix.size(); + // Pad with zeros to make it a multiple of 8. 
This function may be called + // with a prefix, in which case we return the first index that falls + // inside or outside that prefix, dependeing on whether the prefix is + // the start of upper bound of a scan + unsigned int pad = sizeof(uint64_t) - (size_key % sizeof(uint64_t)); + if (pad < sizeof(uint64_t)) { + big_endian_prefix.append(pad, '\0'); + } + return GetIntVal(std::move(big_endian_prefix), key_p); +} + extern inline uint64_t GetPrefixKeyCount(const std::string& prefix, const std::string& ub) { uint64_t start = 0; uint64_t end = 0; - if (!GetIntVal(prefix, &start) || !GetIntVal(ub, &end)) { + if (!GetFirstIntValInPrefix(prefix, &start) || + !GetFirstIntValInPrefix(ub, &end)) { return 0; } @@ -501,11 +567,20 @@ uint64_t iteration); extern size_t GenerateValue(uint32_t rand, char* v, size_t max_sz); +extern uint32_t GetValueBase(Slice s); extern StressTest* CreateCfConsistencyStressTest(); extern StressTest* CreateBatchedOpsStressTest(); extern StressTest* CreateNonBatchedOpsStressTest(); +extern StressTest* CreateMultiOpsTxnsStressTest(); +extern void CheckAndSetOptionsForMultiOpsTxnStressTest(); extern void InitializeHotKeyGenerator(double alpha); extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key); + +extern std::string GenerateTimestampForRead(); +extern std::string NowNanosStr(); + +std::shared_ptr GetFileChecksumImpl( + const std::string& name); } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db_stress_tool/db_stress_common.h" +#include "db_stress_tool/db_stress_shared_state.h" +#include "rocksdb/compaction_filter.h" + +namespace ROCKSDB_NAMESPACE { + +// DbStressCompactionFilter is safe to use with db_stress as it does not perform +// any mutation. It only makes `kRemove` decisions for keys that are already +// non-existent according to the `SharedState`. +class DbStressCompactionFilter : public CompactionFilter { + public: + DbStressCompactionFilter(SharedState* state, int cf_id) + : state_(state), cf_id_(cf_id) {} + + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (state_ == nullptr) { + return Decision::kKeep; + } + if (key.empty() || ('0' <= key[0] && key[0] <= '9')) { + // It is likely leftover from a test_batches_snapshots run. Below this + // conditional, the test_batches_snapshots key format is not handled + // properly. Just keep it to be safe. + return Decision::kKeep; + } + uint64_t key_num = 0; + bool ok = GetIntVal(key.ToString(), &key_num); + assert(ok); + (void)ok; + port::Mutex* key_mutex = state_->GetMutexForKey(cf_id_, key_num); + if (!key_mutex->TryLock()) { + return Decision::kKeep; + } + // Reaching here means we acquired the lock. 
+ + bool key_exists = state_->Exists(cf_id_, key_num); + + key_mutex->Unlock(); + + if (!key_exists) { + return Decision::kRemove; + } + return Decision::kKeep; + } + + const char* Name() const override { return "DbStressCompactionFilter"; } + + private: + SharedState* const state_; + const int cf_id_; +}; + +class DbStressCompactionFilterFactory : public CompactionFilterFactory { + public: + DbStressCompactionFilterFactory() : state_(nullptr) {} + + void SetSharedState(SharedState* state) { + MutexLock state_mutex_guard(&state_mutex_); + state_ = state; + } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + MutexLock state_mutex_guard(&state_mutex_); + return std::unique_ptr( + new DbStressCompactionFilter(state_, context.column_family_id)); + } + + const char* Name() const override { + return "DbStressCompactionFilterFactory"; + } + + private: + port::Mutex state_mutex_; + SharedState* state_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,13 +10,14 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { void ThreadBody(void* v) { ThreadState* thread = reinterpret_cast(v); SharedState* shared = thread->shared; - if (shared->ShouldVerifyAtBeginning()) { + if (!FLAGS_skip_verifydb && shared->ShouldVerifyAtBeginning()) { thread->shared->GetStressTest()->VerifyDb(thread); } { @@ -42,7 +43,9 @@ } } - thread->shared->GetStressTest()->VerifyDb(thread); + if (!FLAGS_skip_verifydb) { + thread->shared->GetStressTest()->VerifyDb(thread); + } { MutexLock 
l(shared->GetMutex()); @@ -54,31 +57,42 @@ } bool RunStressTest(StressTest* stress) { + SystemClock* clock = db_stress_env->GetSystemClock().get(); stress->InitDb(); - SharedState shared(db_stress_env, stress); - if (FLAGS_read_only) { - stress->InitReadonlyDb(&shared); - } + stress->FinishInitDb(&shared); - uint32_t n = shared.GetNumThreads(); +#ifndef NDEBUG + if (FLAGS_sync_fault_injection) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } +#endif - uint64_t now = db_stress_env->NowMicros(); + uint32_t n = FLAGS_threads; + uint64_t now = clock->NowMicros(); fprintf(stdout, "%s Initializing worker threads\n", - db_stress_env->TimeToString(now / 1000000).c_str()); - std::vector threads(n); - for (uint32_t i = 0; i < n; i++) { - threads[i] = new ThreadState(i, &shared); - db_stress_env->StartThread(ThreadBody, threads[i]); - } + clock->TimeToString(now / 1000000).c_str()); + ThreadState bg_thread(0, &shared); - if (FLAGS_compaction_thread_pool_adjust_interval > 0) { - db_stress_env->StartThread(PoolSizeChangeThread, &bg_thread); - } ThreadState continuous_verification_thread(0, &shared); - if (FLAGS_continuous_verification_interval > 0) { - db_stress_env->StartThread(DbVerificationThread, - &continuous_verification_thread); + std::vector threads(n); + { + MutexLock l(shared.GetMutex()); + + for (uint32_t i = 0; i < n; i++) { + shared.IncThreads(); + threads[i] = new ThreadState(i, &shared); + db_stress_env->StartThread(ThreadBody, threads[i]); + } + if (FLAGS_compaction_thread_pool_adjust_interval > 0) { + shared.IncBgThreads(); + db_stress_env->StartThread(PoolSizeChangeThread, &bg_thread); + } + if (FLAGS_continuous_verification_interval > 0) { + shared.IncBgThreads(); + db_stress_env->StartThread(DbVerificationThread, + &continuous_verification_thread); + } } // Each thread goes through the following states: @@ -98,9 +112,9 @@ } } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); fprintf(stdout, "%s Starting database operations\n", - 
db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); shared.SetStart(); shared.GetCondVar()->SignalAll(); @@ -108,13 +122,16 @@ shared.GetCondVar()->Wait(); } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); if (FLAGS_test_batches_snapshots) { fprintf(stdout, "%s Limited verification already done during gets\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); + } else if (FLAGS_skip_verifydb) { + fprintf(stdout, "%s Verification skipped\n", + clock->TimeToString((uint64_t)now / 1000000).c_str()); } else { fprintf(stdout, "%s Starting verification\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } shared.SetStartVerify(); @@ -133,10 +150,11 @@ delete threads[i]; threads[i] = nullptr; } - now = db_stress_env->NowMicros(); - if (!FLAGS_test_batches_snapshots && !shared.HasVerificationFailedYet()) { + now = clock->NowMicros(); + if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots && + !shared.HasVerificationFailedYet()) { fprintf(stdout, "%s Verification successful\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); } stress->PrintStatistics(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_env_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,18 +15,24 @@ class DbStressEnvWrapper : public EnvWrapper { public: explicit DbStressEnvWrapper(Env* t) : EnvWrapper(t) {} + static const char* kClassName() { return "DbStressEnv"; } + const char* Name() const override { return kClassName(); } Status 
DeleteFile(const std::string& f) override { // We determine whether it is a manifest file by searching a strong, // so that there will be false positive if the directory path contains the // keyword but it is unlikely. - // Checkpoint directory needs to be exempted. + // Checkpoint, backup, and restore directories needs to be exempted. if (!if_preserve_all_manifests || f.find("MANIFEST-") == std::string::npos || - f.find("checkpoint") != std::string::npos) { + f.find("checkpoint") != std::string::npos || + f.find(".backup") != std::string::npos || + f.find(".restore") != std::string::npos) { return target()->DeleteFile(f); } - return Status::OK(); + // Rename the file instead of deletion to keep the history, and + // at the same time it is not visible to RocksDB. + return target()->RenameFile(f, f + "_renamed_"); } // If true, all manifest files will not be delted in DeleteFile(). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,7 +19,10 @@ return true; } -DEFINE_uint64(seed, 2341234, "Seed for PRNG"); +DEFINE_uint64(seed, 2341234, + "Seed for PRNG. When --nooverwritepercent is " + "nonzero and --expected_values_dir is nonempty, this value " + "must be fixed across invocations."); static const bool FLAGS_seed_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); @@ -87,6 +90,11 @@ "multiple column families are consistent. 
Setting this implies " "`atomic_flush=true` is set true if `disable_wal=false`.\n"); +DEFINE_bool(test_multi_ops_txns, false, + "If set, runs stress test dedicated to verifying multi-ops " + "transactions on a simple relational table with primary and " + "secondary index."); + DEFINE_int32(threads, 32, "Number of concurrent threads to run."); DEFINE_int32(ttl, -1, @@ -186,6 +194,9 @@ DEFINE_int32(compaction_style, ROCKSDB_NAMESPACE::Options().compaction_style, ""); +DEFINE_int32(num_levels, ROCKSDB_NAMESPACE::Options().num_levels, + "Number of levels in the DB"); + DEFINE_int32(level0_file_num_compaction_trigger, ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger, "Level0 compaction start trigger"); @@ -256,10 +267,21 @@ "it again. If N == 0, never drop/create column families. " "When test_batches_snapshots is true, this flag has no effect"); -DEFINE_int32(get_live_files_and_wal_files_one_in, 1000000, - "With a chance of 1/N, call GetLiveFiles, GetSortedWalFiles " - "and GetCurrentWalFile to verify if it returns correctly. If " - "N == 0, never call the three interfaces."); +DEFINE_int32(get_live_files_one_in, 1000000, + "With a chance of 1/N, call GetLiveFiles to verify if it returns " + "correctly. If N == 0, do not call the interface."); + +DEFINE_int32( + get_sorted_wal_files_one_in, 1000000, + "With a chance of 1/N, call GetSortedWalFiles to verify if it returns " + "correctly. (Note that this API may legitimately return an error.) If N == " + "0, do not call the interface."); + +DEFINE_int32( + get_current_wal_file_one_in, 1000000, + "With a chance of 1/N, call GetCurrentWalFile to verify if it returns " + "correctly. (Note that this API may legitimately return an error.) 
If N == " + "0, do not call the interface."); DEFINE_int32(set_options_one_in, 0, "With a chance of 1/N, change some random options"); @@ -270,9 +292,32 @@ DEFINE_int64(cache_size, 2LL * KB * KB * KB, "Number of bytes to use as a cache of uncompressed data."); +DEFINE_int32(cache_numshardbits, 6, + "Number of shards for the block cache" + " is 2 ** cache_numshardbits. Negative means use default settings." + " This is applied only if FLAGS_cache_size is non-negative."); + DEFINE_bool(cache_index_and_filter_blocks, false, "True if indexes/filters should be cached in block cache."); +DEFINE_int32( + top_level_index_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for top-level indexes into metadata partitions (see " + "`enum PinningTier` in table.h)"); + +DEFINE_int32( + partition_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for metadata partitions (see `enum PinningTier` in " + "table.h)"); + +DEFINE_int32( + unpartitioned_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for unpartitioned metadata blocks (see `enum PinningTier` " + "in table.h)"); + DEFINE_bool(use_clock_cache, false, "Replace default LRU block cache with clock cache."); @@ -289,37 +334,87 @@ DEFINE_bool(allow_concurrent_memtable_write, false, "Allow multi-writers to update mem tables in parallel."); +DEFINE_double(experimental_mempurge_threshold, 0.0, + "Maximum estimated useful payload that triggers a " + "mempurge process to collect memtable garbage bytes."); + DEFINE_bool(enable_write_thread_adaptive_yield, true, "Use a yielding spin loop for brief writer thread waits."); #ifndef ROCKSDB_LITE -// BlobDB Options -DEFINE_bool(use_blob_db, false, "Use BlobDB."); +// Options for StackableDB-based BlobDB +DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB."); -DEFINE_uint64(blob_db_min_blob_size, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, - "Smallest blob to 
store in a file. Blobs smaller than this " - "will be inlined with the key in the LSM tree."); - -DEFINE_uint64(blob_db_bytes_per_sync, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, - "Sync blob files once per every N bytes written."); +DEFINE_uint64( + blob_db_min_blob_size, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, + "[Stacked BlobDB] Smallest blob to store in a file. Blobs " + "smaller than this will be inlined with the key in the LSM tree."); + +DEFINE_uint64( + blob_db_bytes_per_sync, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, + "[Stacked BlobDB] Sync blob files once per every N bytes written."); DEFINE_uint64(blob_db_file_size, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size, - "Target size of each blob file."); + "[Stacked BlobDB] Target size of each blob file."); DEFINE_bool( blob_db_enable_gc, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection, - "Enable BlobDB garbage collection."); + "[Stacked BlobDB] Enable BlobDB garbage collection."); DEFINE_double( blob_db_gc_cutoff, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, - "Cutoff ratio for BlobDB garbage collection."); + "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); #endif // !ROCKSDB_LITE +// Options for integrated BlobDB +DEFINE_bool(allow_setting_blob_options_dynamically, false, + "[Integrated BlobDB] Allow setting blob options dynamically."); + +DEFINE_bool( + enable_blob_files, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files, + "[Integrated BlobDB] Enable writing large values to separate blob files."); + +DEFINE_uint64(min_blob_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size, + "[Integrated BlobDB] The size of the smallest value to be stored " + "separately in a blob file."); + +DEFINE_uint64(blob_file_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size, + "[Integrated BlobDB] The size limit for blob files."); + 
+DEFINE_string(blob_compression_type, "none", + "[Integrated BlobDB] The compression algorithm to use for large " + "values stored in blob files."); + +DEFINE_bool(enable_blob_garbage_collection, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .enable_blob_garbage_collection, + "[Integrated BlobDB] Enable blob garbage collection."); + +DEFINE_double(blob_garbage_collection_age_cutoff, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_age_cutoff, + "[Integrated BlobDB] The cutoff in terms of blob file age for " + "garbage collection."); + +DEFINE_double(blob_garbage_collection_force_threshold, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_force_threshold, + "[Integrated BlobDB] The threshold for the ratio of garbage in " + "the oldest blob files for forcing garbage collection."); + +DEFINE_uint64(blob_compaction_readahead_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_compaction_readahead_size, + "[Integrated BlobDB] Compaction readahead for blob files."); + static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); @@ -343,10 +438,21 @@ "use block based filter" "instead of full filter for block based table"); +DEFINE_int32( + ribbon_starting_level, 999, + "Use Bloom filter on levels below specified and Ribbon beginning on level " + "specified. Flush is considered level -1. 999 or more -> always Bloom. 0 " + "-> Ribbon except Bloom for flush. 
-1 -> always Ribbon."); + DEFINE_bool(partition_filters, false, "use partitioned filters " "for block-based table"); +DEFINE_bool( + optimize_filters_for_memory, + ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory, + "Minimize memory footprint of filters"); + DEFINE_int32( index_type, static_cast( @@ -361,12 +467,14 @@ DEFINE_bool(test_secondary, false, "Test secondary instance."); DEFINE_string( - expected_values_path, "", - "File where the array of expected uint32_t values will be stored. If " - "provided and non-empty, the DB state will be verified against these " - "values after recovery. --max_key and --column_family must be kept the " - "same across invocations of this program that use the same " - "--expected_values_path."); + expected_values_dir, "", + "Dir where files containing info about the latest/historical values will " + "be stored. If provided and non-empty, the DB state will be verified " + "against values from these files after recovery. --max_key and " + "--column_family must be kept the same across invocations of this program " + "that use the same --expected_values_dir. Currently historical values are " + "only tracked when --sync_fault_injection is set. 
See --seed and " + "--nooverwritepercent for further requirements."); DEFINE_bool(verify_checksum, false, "Verify checksum for every block read from storage"); @@ -384,6 +492,9 @@ ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction, "Use O_DIRECT for writing data"); +DEFINE_bool(mock_direct_io, false, + "Mock direct IO by not using O_DIRECT for direct IO read"); + DEFINE_bool(statistics, false, "Create database statistics"); DEFINE_bool(sync, false, "Sync all writes to disk"); @@ -395,12 +506,11 @@ "probability 1/this"); static const bool FLAGS_kill_random_test_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive); -extern int rocksdb_kill_odds; -DEFINE_string(kill_prefix_blacklist, "", +DEFINE_string(kill_exclude_prefixes, "", "If non-empty, kill points with prefix in the list given will be" " skipped. Items are comma-separated."); -extern std::vector rocksdb_kill_prefix_blacklist; +extern std::vector rocksdb_kill_exclude_prefixes; DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); @@ -430,6 +540,14 @@ DEFINE_bool(rate_limit_bg_reads, false, "Use options.rate_limiter on compaction reads"); +DEFINE_uint64(sst_file_manager_bytes_per_sec, 0, + "Set `Options::sst_file_manager` to delete at this rate. By " + "default the deletion rate is unbounded."); + +DEFINE_uint64(sst_file_manager_bytes_per_truncate, 0, + "Set `Options::sst_file_manager` to delete in chunks of this " + "many bytes. By default whole files will be deleted."); + DEFINE_bool(use_txn, false, "Use TransactionDB. Currently the default write policy is " "TxnDBWritePolicy::WRITE_PREPARED"); @@ -449,6 +567,10 @@ "every N operations on average. 
0 indicates CreateNewBackup() " "is disabled."); +DEFINE_uint64(backup_max_size, 100 * 1024 * 1024, + "If non-zero, skip checking backup/restore when DB size in " + "bytes exceeds this setting."); + DEFINE_int32(checkpoint_one_in, 0, "If non-zero, then CreateCheckpoint() will be called once for " "every N operations on average. 0 indicates CreateCheckpoint() " @@ -470,6 +592,12 @@ "If non-zero, then CompactRange() will be called once for every N " "operations on average. 0 indicates CompactRange() is disabled."); +DEFINE_int32(mark_for_compaction_one_file_in, 0, + "A `TablePropertiesCollectorFactory` will be registered, which " + "creates a `TablePropertiesCollector` with `NeedCompact()` " + "returning true once for every N files on average. 0 or negative " + "mean `NeedCompact()` always returns false."); + DEFINE_int32(flush_one_in, 0, "If non-zero, then Flush() will be called once for every N ops " "on average. 0 indicates calls to Flush() are disabled."); @@ -537,7 +665,8 @@ DEFINE_int32(nooverwritepercent, 60, "Ratio of keys without overwrite to total workload (expressed as " - " a percentage)"); + "a percentage). 
When --expected_values_dir is nonempty, must " + "keep this value constant across invocations."); static const bool FLAGS_nooverwritepercent_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_nooverwritepercent, &ValidateInt32Percent); @@ -551,6 +680,10 @@ static const bool FLAGS_num_iterations_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); +DEFINE_int32( + customopspercent, 0, + "Ratio of custom operations to total workload (expressed as a percentage)"); + DEFINE_string(compression_type, "snappy", "Algorithm to use to compress the database"); @@ -562,16 +695,31 @@ "Maximum size of training data passed to zstd's dictionary " "trainer."); +DEFINE_int32(compression_parallel_threads, 1, + "Number of threads for parallel compression."); + +DEFINE_uint64(compression_max_dict_buffer_bytes, 0, + "Buffering limit for SST file data to sample for dictionary " + "compression."); + DEFINE_string(bottommost_compression_type, "disable", "Algorithm to use to compress bottommost level of the database. " "\"disable\" means disabling the feature"); DEFINE_string(checksum_type, "kCRC32c", "Algorithm to use to checksum blocks"); -DEFINE_string(hdfs, "", "Name of hdfs environment"); +DEFINE_string(hdfs, "", + "Name of hdfs environment. Mutually exclusive with" + " --env_uri and --fs_uri."); + +DEFINE_string( + env_uri, "", + "URI for env lookup. Mutually exclusive with --hdfs and --fs_uri"); -DEFINE_string(env_uri, "", - "URI for env lookup. Mutually exclusive with --hdfs"); +DEFINE_string(fs_uri, "", + "URI for registry Filesystem lookup. Mutually exclusive" + " with --hdfs and --env_uri." 
+ " Creates a default environment with the specified filesystem."); DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); static const bool FLAGS_ops_per_thread_dummy __attribute__((__unused__)) = @@ -627,6 +775,10 @@ ROCKSDB_NAMESPACE::Options().write_dbid_to_manifest, "Write DB_ID to manifest"); +DEFINE_bool(avoid_flush_during_recovery, + ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery, + "Avoid flush during recovery"); + DEFINE_uint64(max_write_batch_group_size_bytes, ROCKSDB_NAMESPACE::Options().max_write_batch_group_size_bytes, "Max write batch group size"); @@ -652,4 +804,79 @@ DEFINE_int32(approximate_size_one_in, 64, "If non-zero, DB::GetApproximateSizes() will be called against" " random key ranges."); + +DEFINE_int32(read_fault_one_in, 1000, + "On non-zero, enables fault injection on read"); + +DEFINE_int32(get_property_one_in, 1000, + "If non-zero, then DB::GetProperty() will be called to get various" + " properties for every N ops on average. 0 indicates that" + " GetProperty() will be not be called."); + +DEFINE_bool(sync_fault_injection, false, + "If true, FaultInjectionTestFS will be used for write operations, " + "and unsynced data in DB will lost after crash. 
In such a case we " + "track DB changes in a trace file (\"*.trace\") in " + "--expected_values_dir for verifying there are no holes in the " + "recovered data."); + +DEFINE_bool(best_efforts_recovery, false, + "If true, use best efforts recovery."); +DEFINE_bool(skip_verifydb, false, "If true, skip VerifyDb() calls."); + +DEFINE_bool(enable_compaction_filter, false, + "If true, configures a compaction filter that returns a kRemove " + "decision for deleted keys."); + +DEFINE_bool(paranoid_file_checks, true, + "After writing every SST file, reopen it and read all the keys " + "and validate checksums"); + +DEFINE_bool(fail_if_options_file_error, false, + "Fail operations that fail to detect or properly persist options " + "file."); + +DEFINE_uint64(batch_protection_bytes_per_key, 0, + "If nonzero, enables integrity protection in `WriteBatch` at the " + "specified number of bytes per key. Currently the only supported " + "nonzero value is eight."); + +DEFINE_string(file_checksum_impl, "none", + "Name of an implementation for file_checksum_gen_factory, or " + "\"none\" for null."); + +DEFINE_int32(write_fault_one_in, 0, + "On non-zero, enables fault injection on write"); + +DEFINE_uint64(user_timestamp_size, 0, + "Number of bytes for a user-defined timestamp. 
Currently, only " + "8-byte is supported"); + +DEFINE_int32(open_metadata_write_fault_one_in, 0, + "On non-zero, enables fault injection on file metadata write " + "during DB reopen."); + +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a customized secondary cache object"); +DEFINE_int32(secondary_cache_fault_one_in, 0, + "On non-zero, enables fault injection in secondary cache inserts" + " and lookups"); +#endif // ROCKSDB_LITE +DEFINE_int32(open_write_fault_one_in, 0, + "On non-zero, enables fault injection on file writes " + "during DB reopen."); +DEFINE_int32(open_read_fault_one_in, 0, + "On non-zero, enables fault injection on file reads " + "during DB reopen."); +DEFINE_int32(injest_error_severity, 1, + "The severity of the injested IO Error. 1 is soft error (e.g. " + "retryable error), 2 is fatal error, and the default is " + "retryable error."); +DEFINE_int32(prepopulate_block_cache, + static_cast(ROCKSDB_NAMESPACE::BlockBasedTableOptions:: + PrepopulateBlockCache::kDisable), + "Options related to cache warming (see `enum " + "PrepopulateBlockCache` in table.h)"); + #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,148 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db_stress_tool/db_stress_listener.h" + +#include + +#include "rocksdb/file_system.h" +#include "util/coding_lean.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef GFLAGS +#ifndef ROCKSDB_LITE + +// TODO: consider using expected_values_dir instead, but this is more +// convenient for now. +UniqueIdVerifier::UniqueIdVerifier(const std::string& db_name) + : path_(db_name + "/.unique_ids") { + // We expect such a small number of files generated during this test + // (thousands?), checking full 192-bit IDs for uniqueness is a very + // weak check. For a stronger check, we pick a specific 64-bit + // subsequence from the ID to check for uniqueness. All bits of the + // ID should be high quality, and 64 bits should be unique with + // very good probability for the quantities in this test. + offset_ = Random::GetTLSInstance()->Uniform(17); // 0 to 16 + + // Use default FileSystem to avoid fault injection, etc. + FileSystem& fs = *FileSystem::Default(); + IOOptions opts; + + Status st = fs.CreateDirIfMissing(db_name, opts, nullptr); + if (!st.ok()) { + fprintf(stderr, "Failed to create directory %s: %s\n", db_name.c_str(), + st.ToString().c_str()); + exit(1); + } + + { + std::unique_ptr reader; + Status s = + fs.NewSequentialFile(path_, FileOptions(), &reader, /*dbg*/ nullptr); + if (s.ok()) { + // Load from file + std::string id(24U, '\0'); + Slice result; + for (;;) { + s = reader->Read(id.size(), opts, &result, &id[0], /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error reading unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + if (result.size() < id.size()) { + // EOF + if (result.size() != 0) { + // Corrupt file. Not a DB bug but could happen if OS doesn't provide + // good guarantees on process crash. 
+ fprintf(stdout, "Warning: clearing corrupt unique id file\n"); + id_set_.clear(); + reader.reset(); + s = fs.DeleteFile(path_, opts, /*dbg*/ nullptr); + assert(s.ok()); + } + break; + } + VerifyNoWrite(id); + } + } else { + // Newly created is ok. + // But FileSystem doesn't tell us whether non-existence was the cause of + // the failure. (Issue #9021) + Status s2 = fs.FileExists(path_, opts, /*dbg*/ nullptr); + if (!s2.IsNotFound()) { + fprintf(stderr, "Error opening unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + } + } + fprintf(stdout, "(Re-)verified %zu unique IDs\n", id_set_.size()); + Status s = fs.ReopenWritableFile(path_, FileOptions(), &data_file_writer_, + /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error opening unique id file for append: %s\n", + s.ToString().c_str()); + assert(false); + } +} + +UniqueIdVerifier::~UniqueIdVerifier() { + data_file_writer_->Close(IOOptions(), /*dbg*/ nullptr); +} + +void UniqueIdVerifier::VerifyNoWrite(const std::string& id) { + assert(id.size() == 24); + bool is_new = id_set_.insert(DecodeFixed64(&id[offset_])).second; + if (!is_new) { + fprintf(stderr, + "Duplicate partial unique ID found (offset=%zu, count=%zu)\n", + offset_, id_set_.size()); + assert(false); + } +} + +void UniqueIdVerifier::Verify(const std::string& id) { + assert(id.size() == 24); + std::lock_guard lock(mutex_); + // If we accumulate more than ~4 million IDs, there would be > 1 in 1M + // natural chance of collision. Thus, simply stop checking at that point. 
+ if (id_set_.size() >= 4294967) { + return; + } + IOStatus s = + data_file_writer_->Append(Slice(id), IOOptions(), /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error writing to unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + s = data_file_writer_->Flush(IOOptions(), /*dbg*/ nullptr); + if (!s.ok()) { + fprintf(stderr, "Error flushing unique id file: %s\n", + s.ToString().c_str()); + assert(false); + } + VerifyNoWrite(id); +} + +void DbStressListener::VerifyTableFileUniqueId( + const TableProperties& new_file_properties, const std::string& file_path) { + // Verify unique ID + std::string id; + Status s = GetUniqueIdFromTableProperties(new_file_properties, &id); + if (!s.ok()) { + fprintf(stderr, "Error getting SST unique id for %s: %s\n", + file_path.c_str(), s.ToString().c_str()); + assert(false); + } + unique_ids_.Verify(id); +} + +#endif // !ROCKSDB_LITE +#endif // GFLAGS + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,12 +6,45 @@ #ifdef GFLAGS #pragma once +#include +#include + +#include "file/filename.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/listener.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/unique_id.h" #include "util/gflags_compat.h" +#include "util/random.h" DECLARE_int32(compact_files_one_in); namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE +// Verify across process executions that all seen IDs are unique +class UniqueIdVerifier { + public: + explicit UniqueIdVerifier(const std::string& db_name); + ~UniqueIdVerifier(); + + void Verify(const std::string& id); + + private: + void 
VerifyNoWrite(const std::string& id); + + private: + std::mutex mutex_; + // IDs persisted to a hidden file inside DB dir + std::string path_; + std::unique_ptr data_file_writer_; + // Starting byte for which 8 bytes to check in memory within 24 byte ID + size_t offset_; + // Working copy of the set of 8 byte pieces + std::unordered_set id_set_; +}; + class DbStressListener : public EventListener { public: DbStressListener(const std::string& db_name, @@ -20,8 +53,12 @@ : db_name_(db_name), db_paths_(db_paths), column_families_(column_families), - num_pending_file_creations_(0) {} -#ifndef ROCKSDB_LITE + num_pending_file_creations_(0), + unique_ids_(db_name) {} + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "DBStressListener"; } + ~DbStressListener() override { assert(num_pending_file_creations_ == 0); } void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { assert(IsValidColumnFamilyName(info.cf_name)); @@ -64,15 +101,15 @@ void OnTableFileCreated(const TableFileCreationInfo& info) override { assert(info.db_name == db_name_); assert(IsValidColumnFamilyName(info.cf_name)); - if (info.file_size) { - VerifyFilePath(info.file_path); - } assert(info.job_id > 0 || FLAGS_compact_files_one_in > 0); - if (info.status.ok() && info.file_size > 0) { + if (info.status.ok()) { + assert(info.file_size > 0); + VerifyFilePath(info.file_path); assert(info.table_properties.data_size > 0 || info.table_properties.num_range_deletions > 0); assert(info.table_properties.raw_key_size > 0); assert(info.table_properties.num_entries > 0); + VerifyTableFileUniqueId(info.table_properties, info.file_path); } --num_pending_file_creations_; } @@ -86,9 +123,12 @@ RandomSleep(); } - void OnExternalFileIngested( - DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) override { + void OnExternalFileIngested(DB* /*db*/, + const ExternalFileIngestionInfo& info) override { RandomSleep(); + // Here we assume that each generated 
external file is ingested + // exactly once (or thrown away in case of crash) + VerifyTableFileUniqueId(info.table_properties, info.internal_file_path); } void OnBackgroundError(BackgroundErrorReason /* reason */, @@ -206,17 +246,23 @@ #endif // !NDEBUG } + // Unique id is verified using the TableProperties. file_path is only used + // for reporting. + void VerifyTableFileUniqueId(const TableProperties& new_file_properties, + const std::string& file_path); + void RandomSleep() { std::this_thread::sleep_for( std::chrono::microseconds(Random::GetTLSInstance()->Uniform(5000))); } -#endif // !ROCKSDB_LITE private: std::string db_name_; std::vector db_paths_; std::vector column_families_; std::atomic num_pending_file_creations_; + UniqueIdVerifier unique_ids_; }; +#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,5 +14,14 @@ namespace ROCKSDB_NAMESPACE { const uint32_t SharedState::UNKNOWN_SENTINEL = 0xfffffffe; const uint32_t SharedState::DELETION_SENTINEL = 0xffffffff; +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(OS_SOLARIS) +__thread bool SharedState::ignore_read_error; +#else +thread_local bool SharedState::ignore_read_error; +#endif // OS_SOLARIS +#else +bool SharedState::ignore_read_error; +#endif // ROCKSDB_SUPPORT_THREAD_LOCAL } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,11 @@ #pragma once #include "db_stress_tool/db_stress_stat.h" +#include "db_stress_tool/expected_state.h" +// SyncPoint is not supported in Released Windows Mode. +#if !(defined NDEBUG) || !defined(OS_WIN) +#include "test_util/sync_point.h" +#endif // !(defined NDEBUG) || !defined(OS_WIN) #include "util/gflags_compat.h" DECLARE_uint64(seed); @@ -19,11 +24,18 @@ DECLARE_int32(threads); DECLARE_int32(column_families); DECLARE_int32(nooverwritepercent); -DECLARE_string(expected_values_path); +DECLARE_string(expected_values_dir); DECLARE_int32(clear_column_family_one_in); DECLARE_bool(test_batches_snapshots); DECLARE_int32(compaction_thread_pool_adjust_interval); DECLARE_int32(continuous_verification_interval); +DECLARE_int32(read_fault_one_in); +DECLARE_int32(write_fault_one_in); +DECLARE_int32(open_metadata_write_fault_one_in); +DECLARE_int32(open_write_fault_one_in); +DECLARE_int32(open_read_fault_one_in); + +DECLARE_int32(injest_error_severity); namespace ROCKSDB_NAMESPACE { class StressTest; @@ -37,12 +49,26 @@ // indicates a key should definitely be deleted static const uint32_t DELETION_SENTINEL; - SharedState(Env* env, StressTest* stress_test) + // Errors when reading filter blocks are ignored, so we use a thread + // local variable updated via sync points to keep track of errors injected + // while reading filter blocks in order to ignore the Get/MultiGet result + // for those calls +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(OS_SOLARIS) + static __thread bool ignore_read_error; +#else + static thread_local bool ignore_read_error; +#endif // OS_SOLARIS +#else + static bool ignore_read_error; +#endif // ROCKSDB_SUPPORT_THREAD_LOCAL + + SharedState(Env* /*env*/, StressTest* stress_test) : cv_(&mu_), 
seed_(static_cast(FLAGS_seed)), max_key_(FLAGS_max_key), log2_keys_per_lock_(static_cast(FLAGS_log2_keys_per_lock)), - num_threads_(FLAGS_threads), + num_threads_(0), num_initialized_(0), num_populated_(0), vote_reopen_(0), @@ -56,7 +82,7 @@ verification_failure_(false), should_stop_test_(false), no_overwrite_ids_(FLAGS_column_families), - values_(nullptr), + expected_state_manager_(nullptr), printing_verification_results_(false) { // Pick random keys in each column family that will not experience // overwrite @@ -85,64 +111,38 @@ } delete[] permutation; - size_t expected_values_size = - sizeof(std::atomic) * FLAGS_column_families * max_key_; - bool values_init_needed = false; Status status; - if (!FLAGS_expected_values_path.empty()) { + // TODO: We should introduce a way to explicitly disable verification + // during shutdown. When that is disabled and FLAGS_expected_values_dir + // is empty (disabling verification at startup), we can skip tracking + // expected state. Only then should we permit bypassing the below feature + // compatibility checks. 
+ if (!FLAGS_expected_values_dir.empty()) { if (!std::atomic{}.is_lock_free()) { status = Status::InvalidArgument( - "Cannot use --expected_values_path on platforms without lock-free " + "Cannot use --expected_values_dir on platforms without lock-free " "std::atomic"); } if (status.ok() && FLAGS_clear_column_family_one_in > 0) { status = Status::InvalidArgument( - "Cannot use --expected_values_path on when " + "Cannot use --expected_values_dir on when " "--clear_column_family_one_in is greater than zero."); } - uint64_t size = 0; - if (status.ok()) { - status = env->GetFileSize(FLAGS_expected_values_path, &size); - } - std::unique_ptr wfile; - if (status.ok() && size == 0) { - const EnvOptions soptions; - status = - env->NewWritableFile(FLAGS_expected_values_path, &wfile, soptions); - } - if (status.ok() && size == 0) { - std::string buf(expected_values_size, '\0'); - status = wfile->Append(buf); - values_init_needed = true; - } - if (status.ok()) { - status = env->NewMemoryMappedFileBuffer(FLAGS_expected_values_path, - &expected_mmap_buffer_); - } - if (status.ok()) { - assert(expected_mmap_buffer_->GetLen() == expected_values_size); - values_ = static_cast*>( - expected_mmap_buffer_->GetBase()); - assert(values_ != nullptr); + } + if (status.ok()) { + if (FLAGS_expected_values_dir.empty()) { + expected_state_manager_.reset( + new AnonExpectedStateManager(FLAGS_max_key, FLAGS_column_families)); } else { - fprintf(stderr, "Failed opening shared file '%s' with error: %s\n", - FLAGS_expected_values_path.c_str(), status.ToString().c_str()); - assert(values_ == nullptr); + expected_state_manager_.reset(new FileExpectedStateManager( + FLAGS_max_key, FLAGS_column_families, FLAGS_expected_values_dir)); } + status = expected_state_manager_->Open(); } - if (values_ == nullptr) { - values_allocation_.reset( - new std::atomic[FLAGS_column_families * max_key_]); - values_ = &values_allocation_[0]; - values_init_needed = true; - } - assert(values_ != nullptr); - if 
(values_init_needed) { - for (int i = 0; i < FLAGS_column_families; ++i) { - for (int j = 0; j < max_key_; ++j) { - Delete(i, j, false /* pending */); - } - } + if (!status.ok()) { + fprintf(stderr, "Failed setting up expected state with error: %s\n", + status.ToString().c_str()); + exit(1); } if (FLAGS_test_batches_snapshots) { @@ -163,18 +163,24 @@ ptr.reset(new port::Mutex); } } - if (FLAGS_compaction_thread_pool_adjust_interval > 0) { - ++num_bg_threads_; - fprintf(stdout, "Starting compaction_thread_pool_adjust_thread\n"); - } - if (FLAGS_continuous_verification_interval > 0) { - ++num_bg_threads_; - fprintf(stdout, "Starting continuous_verification_thread\n"); +#ifndef NDEBUG + if (FLAGS_read_fault_one_in) { + SyncPoint::GetInstance()->SetCallBack("FaultInjectionIgnoreError", + IgnoreReadErrorCallback); + SyncPoint::GetInstance()->EnableProcessing(); + } +#endif // NDEBUG + } + + ~SharedState() { +#ifndef NDEBUG + if (FLAGS_read_fault_one_in) { + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); } +#endif } - ~SharedState() {} - port::Mutex* GetMutex() { return &mu_; } port::CondVar* GetCondVar() { return &cv_; } @@ -185,6 +191,8 @@ uint32_t GetNumThreads() const { return num_threads_; } + void IncThreads() { num_threads_++; } + void IncInitialized() { num_initialized_++; } void IncOperated() { num_populated_++; } @@ -217,89 +225,84 @@ bool ShouldStopTest() const { return should_stop_test_.load(); } + // Returns a lock covering `key` in `cf`. port::Mutex* GetMutexForKey(int cf, int64_t key) { return key_locks_[cf][key >> log2_keys_per_lock_].get(); } + // Acquires locks for all keys in `cf`. void LockColumnFamily(int cf) { for (auto& mutex : key_locks_[cf]) { mutex->Lock(); } } + // Releases locks for all keys in `cf`. 
void UnlockColumnFamily(int cf) { for (auto& mutex : key_locks_[cf]) { mutex->Unlock(); } } - std::atomic& Value(int cf, int64_t key) const { - return values_[cf * max_key_ + key]; + Status SaveAtAndAfter(DB* db) { + return expected_state_manager_->SaveAtAndAfter(db); } + bool HasHistory() { return expected_state_manager_->HasHistory(); } + + Status Restore(DB* db) { return expected_state_manager_->Restore(db); } + + // Requires external locking covering all keys in `cf`. void ClearColumnFamily(int cf) { - std::fill(&Value(cf, 0 /* key */), &Value(cf + 1, 0 /* key */), - DELETION_SENTINEL); + return expected_state_manager_->ClearColumnFamily(cf); } // @param pending True if the update may have started but is not yet // guaranteed finished. This is useful for crash-recovery testing when the // process may crash before updating the expected values array. + // + // Requires external locking covering `key` in `cf`. void Put(int cf, int64_t key, uint32_t value_base, bool pending) { - if (!pending) { - // prevent expected-value update from reordering before Write - std::atomic_thread_fence(std::memory_order_release); - } - Value(cf, key).store(pending ? UNKNOWN_SENTINEL : value_base, - std::memory_order_relaxed); - if (pending) { - // prevent Write from reordering before expected-value update - std::atomic_thread_fence(std::memory_order_release); - } + return expected_state_manager_->Put(cf, key, value_base, pending); } - uint32_t Get(int cf, int64_t key) const { return Value(cf, key); } + // Requires external locking covering `key` in `cf`. + uint32_t Get(int cf, int64_t key) const { + return expected_state_manager_->Get(cf, key); + } // @param pending See comment above Put() // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. 
bool Delete(int cf, int64_t key, bool pending) { - if (Value(cf, key) == DELETION_SENTINEL) { - return false; - } - Put(cf, key, DELETION_SENTINEL, pending); - return true; + return expected_state_manager_->Delete(cf, key, pending); } // @param pending See comment above Put() // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. bool SingleDelete(int cf, int64_t key, bool pending) { - return Delete(cf, key, pending); + return expected_state_manager_->Delete(cf, key, pending); } // @param pending See comment above Put() // Returns number of keys deleted by the call. + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) { - int covered = 0; - for (int64_t key = begin_key; key < end_key; ++key) { - if (Delete(cf, key, pending)) { - ++covered; - } - } - return covered; + return expected_state_manager_->DeleteRange(cf, begin_key, end_key, + pending); } bool AllowsOverwrite(int64_t key) { return no_overwrite_ids_.find(key) == no_overwrite_ids_.end(); } + // Requires external locking covering `key` in `cf`. bool Exists(int cf, int64_t key) { - // UNKNOWN_SENTINEL counts as exists. That assures a key for which overwrite - // is disallowed can't be accidentally added a second time, in which case - // SingleDelete wouldn't be able to properly delete the key. It does allow - // the case where a SingleDelete might be added which covers nothing, but - // that's not a correctness issue. 
- uint32_t expected_value = Value(cf, key).load(); - return expected_value != DELETION_SENTINEL; + return expected_state_manager_->Exists(cf, key); } uint32_t GetSeed() const { return seed_; } @@ -308,6 +311,8 @@ bool ShouldStopBgThread() { return should_stop_bg_thread_; } + void IncBgThreads() { ++num_bg_threads_; } + void IncBgThreadsFinished() { ++bg_thread_finished_; } bool BgThreadsFinished() const { @@ -315,7 +320,7 @@ } bool ShouldVerifyAtBeginning() const { - return expected_mmap_buffer_.get() != nullptr; + return !FLAGS_expected_values_dir.empty(); } bool PrintingVerificationResults() { @@ -329,12 +334,16 @@ } private: + static void IgnoreReadErrorCallback(void*) { + ignore_read_error = true; + } + port::Mutex mu_; port::CondVar cv_; const uint32_t seed_; const int64_t max_key_; const uint32_t log2_keys_per_lock_; - const int num_threads_; + int num_threads_; long num_initialized_; long num_populated_; long vote_reopen_; @@ -351,12 +360,10 @@ // Keys that should not be overwritten std::unordered_set no_overwrite_ids_; - std::atomic* values_; - std::unique_ptr[]> values_allocation_; + std::unique_ptr expected_state_manager_; // Has to make it owned by a smart ptr as port::Mutex is not copyable // and storing it in the container may require copying depending on the impl. 
std::vector>> key_locks_; - std::unique_ptr expected_mmap_buffer_; std::atomic printing_verification_results_; }; @@ -380,6 +387,8 @@ std::string value; // optional state of all keys in the db std::vector* key_vec; + + std::string timestamp; }; std::queue> snapshot_queue; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,17 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#include "db_stress_tool/db_stress_stat.h" + +namespace ROCKSDB_NAMESPACE { + +std::shared_ptr dbstats; +std::shared_ptr dbstats_secondaries; + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_stat.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,9 +11,9 @@ #include "monitoring/histogram.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/snapshot.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "util/gflags_compat.h" #include "util/random.h" @@ -21,9 +21,10 @@ DECLARE_bool(progress_reports); namespace ROCKSDB_NAMESPACE { + // Database statistics -static std::shared_ptr dbstats; -static std::shared_ptr dbstats_secondaries; +extern std::shared_ptr 
dbstats; +extern std::shared_ptr dbstats_secondaries; class Stats { private: @@ -42,6 +43,7 @@ long range_deletions_; long covered_by_range_deletions_; long errors_; + long verified_errors_; long num_compact_files_succeed_; long num_compact_files_failed_; int next_report_; @@ -67,11 +69,12 @@ range_deletions_ = 0; covered_by_range_deletions_ = 0; errors_ = 0; + verified_errors_ = 0; bytes_ = 0; seconds_ = 0; num_compact_files_succeed_ = 0; num_compact_files_failed_ = 0; - start_ = Env::Default()->NowMicros(); + start_ = SystemClock::Default()->NowMicros(); last_op_finish_ = start_; finish_ = start_; } @@ -90,6 +93,7 @@ range_deletions_ += other.range_deletions_; covered_by_range_deletions_ = other.covered_by_range_deletions_; errors_ += other.errors_; + verified_errors_ += other.verified_errors_; bytes_ += other.bytes_; seconds_ += other.seconds_; num_compact_files_succeed_ += other.num_compact_files_succeed_; @@ -99,13 +103,13 @@ } void Stop() { - finish_ = Env::Default()->NowMicros(); + finish_ = SystemClock::Default()->NowMicros(); seconds_ = (finish_ - start_) * 1e-6; } void FinishedSingleOp() { if (FLAGS_histogram) { - auto now = Env::Default()->NowMicros(); + auto now = SystemClock::Default()->NowMicros(); auto micros = now - last_op_finish_; hist_.Add(micros); if (micros > 20000) { @@ -163,6 +167,8 @@ void AddErrors(long n) { errors_ += n; } + void AddVerifiedErrors(long n) { verified_errors_ += n; } + void AddNumCompactFilesSucceed(long n) { num_compact_files_succeed_ += n; } void AddNumCompactFilesFailed(long n) { num_compact_files_failed_ += n; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_table_properties_collector.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,65 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table.h" +#include "util/gflags_compat.h" +#include "util/random.h" + +DECLARE_int32(mark_for_compaction_one_file_in); + +namespace ROCKSDB_NAMESPACE { + +// A `DbStressTablePropertiesCollector` ignores what keys/values were added to +// the table, adds no properties to the table, and decides at random whether the +// table will be marked for compaction according to +// `FLAGS_mark_for_compaction_one_file_in`. +class DbStressTablePropertiesCollector : public TablePropertiesCollector { + public: + DbStressTablePropertiesCollector() + : need_compact_(Random::GetTLSInstance()->OneInOpt( + FLAGS_mark_for_compaction_one_file_in)) {} + + virtual Status AddUserKey(const Slice& /* key */, const Slice& /* value */, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + virtual Status Finish(UserCollectedProperties* /* properties */) override { + return Status::OK(); + } + + virtual UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + virtual const char* Name() const override { + return "DbStressTablePropertiesCollector"; + } + + virtual bool NeedCompact() const override { return need_compact_; } + + private: + const bool need_compact_; +}; + +// A `DbStressTablePropertiesCollectorFactory` creates +// `DbStressTablePropertiesCollectorFactory`s. 
+class DbStressTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new DbStressTablePropertiesCollector(); + } + + virtual const char* Name() const override { + return "DbStressTablePropertiesCollectorFactory"; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,26 +10,64 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#include "db_stress_tool/db_stress_compaction_filter.h" #include "db_stress_tool/db_stress_driver.h" +#include "db_stress_tool/db_stress_table_properties_collector.h" #include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/object_registry.h" +#include "util/cast_util.h" +#include "utilities/backupable/backupable_db_impl.h" +#include "utilities/fault_injection_fs.h" +#include "utilities/fault_injection_secondary_cache.h" namespace ROCKSDB_NAMESPACE { + +namespace { + +std::shared_ptr CreateFilterPolicy() { + if (FLAGS_bloom_bits < 0) { + return BlockBasedTableOptions().filter_policy; + } + const FilterPolicy* new_policy; + if (FLAGS_use_block_based_filter) { + if (FLAGS_ribbon_starting_level < 999) { + fprintf( + stderr, + "Cannot combine use_block_based_filter and ribbon_starting_level\n"); + exit(1); + } else { + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, true); + } + } else if (FLAGS_ribbon_starting_level 
>= 999) { + // Use Bloom API + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false); + } else { + new_policy = NewRibbonFilterPolicy( + FLAGS_bloom_bits, /* bloom_before_level */ FLAGS_ribbon_starting_level); + } + return std::shared_ptr(new_policy); +} + +} // namespace + StressTest::StressTest() - : cache_(NewCache(FLAGS_cache_size)), + : cache_(NewCache(FLAGS_cache_size, FLAGS_cache_numshardbits)), compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size)), - filter_policy_(FLAGS_bloom_bits >= 0 - ? FLAGS_use_block_based_filter - ? NewBloomFilterPolicy(FLAGS_bloom_bits, true) - : NewBloomFilterPolicy(FLAGS_bloom_bits, false) - : nullptr), + filter_policy_(CreateFilterPolicy()), db_(nullptr), #ifndef ROCKSDB_LITE txn_db_(nullptr), #endif + clock_(db_stress_env->GetSystemClock().get()), new_column_family_name_(1), num_times_reopened_(0), db_preload_finished_(false), - cmp_db_(nullptr) { + cmp_db_(nullptr), + is_db_stopped_(false) { if (FLAGS_destroy_db_initially) { std::vector files; db_stress_env->GetChildren(FLAGS_db, &files); @@ -40,6 +78,7 @@ } Options options; + options.env = db_stress_env; // Remove files without preserving manfiest files #ifndef ROCKSDB_LITE const Status s = !FLAGS_use_blob_db @@ -82,7 +121,9 @@ delete cmp_db_; } -std::shared_ptr StressTest::NewCache(size_t capacity) { +std::shared_ptr StressTest::NewCache(size_t capacity, + int32_t num_shard_bits) { + ConfigOptions config_options; if (capacity <= 0) { return nullptr; } @@ -94,8 +135,46 @@ } return cache; } else { - return NewLRUCache((size_t)capacity); + LRUCacheOptions opts; + opts.capacity = capacity; + opts.num_shard_bits = num_shard_bits; +#ifndef ROCKSDB_LITE + std::shared_ptr secondary_cache; + if (!FLAGS_secondary_cache_uri.empty()) { + Status s = SecondaryCache::CreateFromString( + config_options, FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf(stderr, + "No secondary cache registered matching string: %s status=%s\n", + 
FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + if (FLAGS_secondary_cache_fault_one_in > 0) { + secondary_cache = std::make_shared( + secondary_cache, static_cast(FLAGS_seed), + FLAGS_secondary_cache_fault_one_in); + } + opts.secondary_cache = secondary_cache; + } +#endif + return NewLRUCache(opts); + } +} + +std::vector StressTest::GetBlobCompressionTags() { + std::vector compression_tags{"kNoCompression"}; + + if (Snappy_Supported()) { + compression_tags.emplace_back("kSnappyCompression"); + } + if (LZ4_Supported()) { + compression_tags.emplace_back("kLZ4Compression"); } + if (ZSTD_Supported()) { + compression_tags.emplace_back("kZSTD"); + } + + return compression_tags; } bool StressTest::BuildOptionsTable() { @@ -176,6 +255,25 @@ {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, }; + if (FLAGS_allow_setting_blob_options_dynamically) { + options_tbl.emplace("enable_blob_files", + std::vector{"false", "true"}); + options_tbl.emplace("min_blob_size", + std::vector{"0", "8", "16"}); + options_tbl.emplace("blob_file_size", + std::vector{"1M", "16M", "256M", "1G"}); + options_tbl.emplace("blob_compression_type", GetBlobCompressionTags()); + options_tbl.emplace("enable_blob_garbage_collection", + std::vector{"false", "true"}); + options_tbl.emplace( + "blob_garbage_collection_age_cutoff", + std::vector{"0.0", "0.25", "0.5", "0.75", "1.0"}); + options_tbl.emplace("blob_garbage_collection_force_threshold", + std::vector{"0.5", "0.75", "1.0"}); + options_tbl.emplace("blob_compaction_readahead_size", + std::vector{"0", "1M", "4M"}); + } + options_table_ = std::move(options_tbl); for (const auto& iter : options_table_) { @@ -185,28 +283,64 @@ } void StressTest::InitDb() { - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock_->NowMicros(); fprintf(stdout, "%s Initializing db_stress\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); PrintEnv(); Open(); BuildOptionsTable(); 
} -void StressTest::InitReadonlyDb(SharedState* shared) { - uint64_t now = db_stress_env->NowMicros(); - fprintf(stdout, "%s Preloading db with %" PRIu64 " KVs\n", - db_stress_env->TimeToString(now / 1000000).c_str(), FLAGS_max_key); - PreloadDbAndReopenAsReadOnly(FLAGS_max_key, shared); +void StressTest::FinishInitDb(SharedState* shared) { + if (FLAGS_read_only) { + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Preloading db with %" PRIu64 " KVs\n", + clock_->TimeToString(now / 1000000).c_str(), FLAGS_max_key); + PreloadDbAndReopenAsReadOnly(FLAGS_max_key, shared); + } + + if (shared->HasHistory()) { + // The way it works right now is, if there's any history, that means the + // previous run mutating the DB had all its operations traced, in which case + // we should always be able to `Restore()` the expected values to match the + // `db_`'s current seqno. + Status s = shared->Restore(db_); + if (!s.ok()) { + fprintf(stderr, "Error restoring historical expected values: %s\n", + s.ToString().c_str()); + exit(1); + } + } + + if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) { + Status s = shared->SaveAtAndAfter(db_); + if (!s.ok()) { + fprintf(stderr, "Error enabling history tracing: %s\n", + s.ToString().c_str()); + exit(1); + } + } + + if (FLAGS_enable_compaction_filter) { + auto* compaction_filter_factory = + reinterpret_cast( + options_.compaction_filter_factory.get()); + assert(compaction_filter_factory); + // This must be called only after any potential `SharedState::Restore()` has + // completed in order for the `compaction_filter_factory` to operate on the + // correct latest values file. 
+ compaction_filter_factory->SetSharedState(shared); + fprintf(stdout, "Compaction filter factory: %s\n", + compaction_filter_factory->Name()); + } } bool StressTest::VerifySecondaries() { #ifndef ROCKSDB_LITE if (FLAGS_test_secondary) { - uint64_t now = db_stress_env->NowMicros(); - fprintf( - stdout, "%s Start to verify secondaries against primary\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Start to verify secondaries against primary\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } for (size_t k = 0; k != secondaries_.size(); ++k) { Status s = secondaries_[k]->TryCatchUpWithPrimary(); @@ -248,10 +382,9 @@ } } if (FLAGS_test_secondary) { - uint64_t now = db_stress_env->NowMicros(); - fprintf( - stdout, "%s Verification of secondaries succeeded\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Verification of secondaries succeeded\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } #endif // ROCKSDB_LITE return true; @@ -265,6 +398,11 @@ } ReadOptions ropt; ropt.snapshot = snap_state.snapshot; + Slice ts; + if (!snap_state.timestamp.empty()) { + ts = snap_state.timestamp; + ropt.timestamp = &ts; + } PinnableSlice exp_v(&snap_state.value); exp_v.PinSelf(); PinnableSlice v; @@ -316,9 +454,11 @@ void StressTest::VerificationAbort(SharedState* shared, std::string msg, int cf, int64_t key) const { + auto key_str = Key(key); + Slice key_slice = key_str; fprintf(stderr, - "Verification failed for column family %d key %" PRIi64 ": %s\n", cf, - key, msg.c_str()); + "Verification failed for column family %d key %s (%" PRIi64 "): %s\n", + cf, key_slice.ToString(true).c_str(), key, msg.c_str()); shared->SetVerificationFailure(); } @@ -368,6 +508,13 @@ } } else { if (!FLAGS_use_txn) { + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = 
NowNanosStr(); + ts = ts_str; + write_opts.timestamp = &ts; + } s = db_->Put(write_opts, cfh, key, v); } else { #ifndef ROCKSDB_LITE @@ -408,9 +555,9 @@ #endif db_preload_finished_.store(true); - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database in read-only\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); // Reopen as read-only, can ignore all options related to updates Open(); } else { @@ -453,6 +600,8 @@ } static std::atomic txn_id = {0}; TransactionOptions txn_options; + txn_options.lock_timeout = 600000; // 10 min + txn_options.deadlock_detect = true; *txn = txn_db_->BeginTransaction(write_opts, txn_options); auto istr = std::to_string(txn_id.fetch_add(1)); Status s = (*txn)->SetName("xid" + istr); @@ -493,13 +642,40 @@ write_opts.sync = true; } write_opts.disableWAL = FLAGS_disable_wal; - const int prefixBound = static_cast(FLAGS_readpercent) + - static_cast(FLAGS_prefixpercent); - const int writeBound = prefixBound + static_cast(FLAGS_writepercent); - const int delBound = writeBound + static_cast(FLAGS_delpercent); - const int delRangeBound = delBound + static_cast(FLAGS_delrangepercent); + const int prefix_bound = static_cast(FLAGS_readpercent) + + static_cast(FLAGS_prefixpercent); + const int write_bound = prefix_bound + static_cast(FLAGS_writepercent); + const int del_bound = write_bound + static_cast(FLAGS_delpercent); + const int delrange_bound = + del_bound + static_cast(FLAGS_delrangepercent); + const int iterate_bound = + delrange_bound + static_cast(FLAGS_iterpercent); + const uint64_t ops_per_open = FLAGS_ops_per_thread / (FLAGS_reopen + 1); +#ifndef NDEBUG + if (FLAGS_read_fault_one_in) { + fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(), + FLAGS_read_fault_one_in); + } + if (FLAGS_write_fault_one_in) { + IOStatus error_msg; + if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) { + 
error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + } else if (FLAGS_injest_error_severity == 2) { + // Ingest the fatal error + error_msg = IOStatus::IOError("Fatal IO Error"); + error_msg.SetDataLoss(true); + } + std::vector types = {FileType::kTableFile, + FileType::kDescriptorFile, + FileType::kCurrentFile}; + fault_fs_guard->SetRandomWriteError( + thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, + /*inject_for_all_file_types=*/false, types); + } +#endif // NDEBUG thread->stats.Start(); for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) { if (thread->shared->HasVerificationFailedYet() || @@ -591,13 +767,29 @@ } #ifndef ROCKSDB_LITE - // Every 1 in N verify the one of the following: 1) GetLiveFiles - // 2) GetSortedWalFiles 3) GetCurrentWalFile. Each time, randomly select - // one of them to run the test. - if (thread->rand.OneInOpt(FLAGS_get_live_files_and_wal_files_one_in)) { - Status status = VerifyGetLiveAndWalFiles(thread); + // Verify GetLiveFiles with a 1 in N chance. + if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in) && + !FLAGS_write_fault_one_in) { + Status status = VerifyGetLiveFiles(); + if (!status.ok()) { + VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status); + } + } + + // Verify GetSortedWalFiles with a 1 in N chance. + if (thread->rand.OneInOpt(FLAGS_get_sorted_wal_files_one_in)) { + Status status = VerifyGetSortedWalFiles(); + if (!status.ok()) { + VerificationAbort(shared, "VerifyGetSortedWalFiles status not OK", + status); + } + } + + // Verify GetCurrentWalFile with a 1 in N chance. 
+ if (thread->rand.OneInOpt(FLAGS_get_current_wal_file_one_in)) { + Status status = VerifyGetCurrentWalFile(); if (!status.ok()) { - VerificationAbort(shared, "VerifyGetLiveAndWalFiles status not OK", + VerificationAbort(shared, "VerifyGetCurrentWalFile status not OK", status); } } @@ -618,6 +810,10 @@ VerificationAbort(shared, "VerifyChecksum status not OK", status); } } + + if (thread->rand.OneInOpt(FLAGS_get_property_one_in)) { + TestGetProperty(thread); + } #endif std::vector rand_keys = GenerateKeys(rand_key); @@ -627,10 +823,23 @@ } if (thread->rand.OneInOpt(FLAGS_backup_one_in)) { - Status s = TestBackupRestore(thread, rand_column_families, rand_keys); - if (!s.ok()) { - VerificationAbort(shared, "Backup/restore gave inconsistent state", - s); + // Beyond a certain DB size threshold, this test becomes heavier than + // it's worth. + uint64_t total_size = 0; + if (FLAGS_backup_max_size > 0) { + std::vector files; + db_stress_env->GetChildrenFileAttributes(FLAGS_db, &files); + for (auto& file : files) { + total_size += file.size_bytes; + } + } + + if (total_size <= FLAGS_backup_max_size) { + Status s = TestBackupRestore(thread, rand_column_families, rand_keys); + if (!s.ok()) { + VerificationAbort(shared, "Backup/restore gave inconsistent state", + s); + } } } @@ -661,6 +870,20 @@ } } + // Assign timestamps if necessary. + std::string read_ts_str; + std::string write_ts_str; + Slice read_ts; + Slice write_ts; + if (ShouldAcquireMutexOnKey() && FLAGS_user_timestamp_size > 0) { + read_ts_str = GenerateTimestampForRead(); + read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } + int prob_op = thread->rand.Uniform(100); // Reset this in case we pick something other than a read op. 
We don't // want to use a stale value when deciding at the beginning of the loop @@ -683,7 +906,7 @@ } else { TestGet(thread, read_opts, rand_column_families, rand_keys); } - } else if (prob_op < prefixBound) { + } else if (prob_op < prefix_bound) { assert(static_cast(FLAGS_readpercent) <= prob_op); // OPERATION prefix scan // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are @@ -691,22 +914,22 @@ // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same // prefix TestPrefixScan(thread, read_opts, rand_column_families, rand_keys); - } else if (prob_op < writeBound) { - assert(prefixBound <= prob_op); + } else if (prob_op < write_bound) { + assert(prefix_bound <= prob_op); // OPERATION write TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys, value, lock); - } else if (prob_op < delBound) { - assert(writeBound <= prob_op); + } else if (prob_op < del_bound) { + assert(write_bound <= prob_op); // OPERATION delete TestDelete(thread, write_opts, rand_column_families, rand_keys, lock); - } else if (prob_op < delRangeBound) { - assert(delBound <= prob_op); + } else if (prob_op < delrange_bound) { + assert(del_bound <= prob_op); // OPERATION delete range TestDeleteRange(thread, write_opts, rand_column_families, rand_keys, lock); - } else { - assert(delRangeBound <= prob_op); + } else if (prob_op < iterate_bound) { + assert(delrange_bound <= prob_op); // OPERATION iterate int num_seeks = static_cast( std::min(static_cast(thread->rand.Uniform(4)), @@ -714,6 +937,9 @@ rand_keys = GenerateNKeys(thread, num_seeks, i); i += num_seeks - 1; TestIterate(thread, read_opts, rand_column_families, rand_keys); + } else { + assert(iterate_bound <= prob_op); + TestCustomOperations(thread, rand_column_families); } thread->stats.FinishedSingleOp(); #ifndef ROCKSDB_LITE @@ -751,8 +977,16 @@ std::vector boundaries; for (const LevelMetaData& lmd : cfmd.levels) { for (const SstFileMetaData& sfmd : lmd.files) { - 
boundaries.push_back(sfmd.smallestkey); - boundaries.push_back(sfmd.largestkey); + // If FLAGS_user_timestamp_size > 0, then both smallestkey and largestkey + // have timestamps. + const auto& skey = sfmd.smallestkey; + const auto& lkey = sfmd.largestkey; + assert(skey.size() >= FLAGS_user_timestamp_size); + assert(lkey.size() >= FLAGS_user_timestamp_size); + boundaries.push_back( + skey.substr(0, skey.size() - FLAGS_user_timestamp_size)); + boundaries.push_back( + lkey.substr(0, lkey.size() - FLAGS_user_timestamp_size)); } } if (boundaries.empty()) { @@ -902,6 +1136,7 @@ // iterators with the same set-up, and it doesn't hurt to check them // to be equal. ReadOptions cmp_ro; + cmp_ro.timestamp = readoptionscopy.timestamp; cmp_ro.snapshot = snapshot; cmp_ro.total_order_seek = true; ColumnFamilyHandle* cmp_cfh = @@ -976,28 +1211,23 @@ } #ifndef ROCKSDB_LITE -// Test the return status of GetLiveFiles, GetSortedWalFiles, and -// GetCurrentWalFile. Each time, randomly select one of them to run -// and return the status. -Status StressTest::VerifyGetLiveAndWalFiles(ThreadState* thread) { - int case_num = thread->rand.Uniform(3); - if (case_num == 0) { - std::vector live_file; - uint64_t manifest_size; - return db_->GetLiveFiles(live_file, &manifest_size); - } - - if (case_num == 1) { - VectorLogPtr log_ptr; - return db_->GetSortedWalFiles(log_ptr); - } - - if (case_num == 2) { - std::unique_ptr cur_wal_file; - return db_->GetCurrentWalFile(&cur_wal_file); - } - assert(false); - return Status::Corruption("Undefined case happens!"); +// Test the return status of GetLiveFiles. +Status StressTest::VerifyGetLiveFiles() const { + std::vector live_file; + uint64_t manifest_size = 0; + return db_->GetLiveFiles(live_file, &manifest_size); +} + +// Test the return status of GetSortedWalFiles. +Status StressTest::VerifyGetSortedWalFiles() const { + VectorLogPtr log_ptr; + return db_->GetSortedWalFiles(log_ptr); +} + +// Test the return status of GetCurrentWalFile. 
+Status StressTest::VerifyGetCurrentWalFile() const { + std::unique_ptr cur_wal_file; + return db_->GetCurrentWalFile(&cur_wal_file); } #endif // !ROCKSDB_LITE @@ -1026,21 +1256,25 @@ *diverged = true; return; } else if (op == kLastOpSeek && ro.iterate_lower_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_lower_bound, seek_key) >= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) >= 0 || (ro.iterate_upper_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts*/ false) >= 0))) { // Lower bound behavior is not well defined if it is larger than // seek key or upper bound. Disable the check for now. *diverged = true; return; } else if (op == kLastOpSeekForPrev && ro.iterate_upper_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_upper_bound, seek_key) <= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_upper_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) <= 0 || (ro.iterate_lower_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts=*/false) >= 0))) { // Uppder bound behavior is not well defined if it is smaller than // seek key or lower bound. Disable the check for now. 
*diverged = true; @@ -1109,9 +1343,13 @@ if ((iter->Valid() && iter->key() != cmp_iter->key()) || (!iter->Valid() && (ro.iterate_upper_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_upper_bound) < 0) && + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_upper_bound, + /*b_has_ts=*/false) < 0) && (ro.iterate_lower_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_lower_bound) > 0))) { + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_lower_bound, + /*b_has_ts=*/false) > 0))) { fprintf(stderr, "Iterator diverged from control iterator which" " has value %s %s\n", @@ -1169,35 +1407,137 @@ Status StressTest::TestBackupRestore( ThreadState* thread, const std::vector& rand_column_families, const std::vector& rand_keys) { - // Note the column families chosen by `rand_column_families` cannot be - // dropped while the locks for `rand_keys` are held. So we should not have - // to worry about accessing those column families throughout this function. 
- assert(rand_column_families.size() == rand_keys.size()); std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid); std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid); BackupableDBOptions backup_opts(backup_dir); + // For debugging, get info_log from live options + backup_opts.info_log = db_->GetDBOptions().info_log.get(); + if (thread->rand.OneIn(10)) { + backup_opts.share_table_files = false; + } else { + backup_opts.share_table_files = true; + if (thread->rand.OneIn(5)) { + backup_opts.share_files_with_checksum = false; + } else { + backup_opts.share_files_with_checksum = true; + if (thread->rand.OneIn(2)) { + // old + backup_opts.share_files_with_checksum_naming = + BackupableDBOptions::kLegacyCrc32cAndFileSize; + } else { + // new + backup_opts.share_files_with_checksum_naming = + BackupableDBOptions::kUseDbSessionId; + } + if (thread->rand.OneIn(2)) { + backup_opts.share_files_with_checksum_naming = + backup_opts.share_files_with_checksum_naming | + BackupableDBOptions::kFlagIncludeFileSize; + } + } + } BackupEngine* backup_engine = nullptr; + std::string from = "a backup/restore operation"; Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine); + if (!s.ok()) { + from = "BackupEngine::Open"; + } if (s.ok()) { - s = backup_engine->CreateNewBackup(db_); + if (thread->rand.OneIn(2)) { + TEST_FutureSchemaVersion2Options test_opts; + test_opts.crc32c_checksums = thread->rand.OneIn(2) == 0; + test_opts.file_sizes = thread->rand.OneIn(2) == 0; + TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts); + } + CreateBackupOptions create_opts; + if (FLAGS_disable_wal) { + // The verification can only work when latest value of `key` is backed up, + // which requires flushing in case of WAL disabled. + // + // Note this triggers a flush with a key lock held. Meanwhile, operations + // like flush/compaction may attempt to grab key locks like in + // `DbStressCompactionFilter`. 
The philosophy around preventing deadlock + // is the background operation key lock acquisition only tries but does + // not wait for the lock. So here in the foreground it is OK to hold the + // lock and wait on a background operation (flush). + create_opts.flush_before_backup = true; + } + s = backup_engine->CreateNewBackup(create_opts, db_); + if (!s.ok()) { + from = "BackupEngine::CreateNewBackup"; + } } if (s.ok()) { delete backup_engine; backup_engine = nullptr; s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine); + if (!s.ok()) { + from = "BackupEngine::Open (again)"; + } } + std::vector backup_info; + // If inplace_not_restore, we verify the backup by opening it as a + // read-only DB. If !inplace_not_restore, we restore it to a temporary + // directory for verification. + bool inplace_not_restore = thread->rand.OneIn(3); if (s.ok()) { - s = backup_engine->RestoreDBFromLatestBackup(restore_dir /* db_dir */, - restore_dir /* wal_dir */); + backup_engine->GetBackupInfo(&backup_info, + /*include_file_details*/ inplace_not_restore); + if (backup_info.empty()) { + s = Status::NotFound("no backups found"); + from = "BackupEngine::GetBackupInfo"; + } } - if (s.ok()) { - s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */); + if (s.ok() && thread->rand.OneIn(2)) { + s = backup_engine->VerifyBackup( + backup_info.front().backup_id, + thread->rand.OneIn(2) /* verify_with_checksum */); + if (!s.ok()) { + from = "BackupEngine::VerifyBackup"; + } + } + const bool allow_persistent = thread->tid == 0; // not too many + bool from_latest = false; + int count = static_cast(backup_info.size()); + if (s.ok() && !inplace_not_restore) { + if (count > 1) { + s = backup_engine->RestoreDBFromBackup( + RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id, + restore_dir /* db_dir */, restore_dir /* wal_dir */); + if (!s.ok()) { + from = "BackupEngine::RestoreDBFromBackup"; + } + } else { + from_latest = true; + s = 
backup_engine->RestoreDBFromLatestBackup(RestoreOptions(), + restore_dir /* db_dir */, + restore_dir /* wal_dir */); + if (!s.ok()) { + from = "BackupEngine::RestoreDBFromLatestBackup"; + } + } + } + if (s.ok() && !inplace_not_restore) { + // Purge early if restoring, to ensure the restored directory doesn't + // have some secret dependency on the backup directory. + uint32_t to_keep = 0; + if (allow_persistent) { + // allow one thread to keep up to 2 backups + to_keep = thread->rand.Uniform(3); + } + s = backup_engine->PurgeOldBackups(to_keep); + if (!s.ok()) { + from = "BackupEngine::PurgeOldBackups"; + } } DB* restored_db = nullptr; std::vector restored_cf_handles; - if (s.ok()) { + // Not yet implemented: opening restored BlobDB or TransactionDB + if (s.ok() && !FLAGS_use_txn && !FLAGS_use_blob_db) { Options restore_options(options_); restore_options.listeners.clear(); + // Avoid dangling/shared file descriptors, for reliable destroy + restore_options.sst_file_manager = nullptr; std::vector cf_descriptors; // TODO(ajkr): `column_family_names_` is not safe to access here when // `clear_column_family_one_in != 0`. 
But we can't easily switch to @@ -1207,35 +1547,61 @@ for (auto name : column_family_names_) { cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options)); } - s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, - &restored_cf_handles, &restored_db); + if (inplace_not_restore) { + BackupInfo& info = backup_info[thread->rand.Uniform(count)]; + restore_options.env = info.env_for_open.get(); + s = DB::OpenForReadOnly(DBOptions(restore_options), info.name_for_open, + cf_descriptors, &restored_cf_handles, + &restored_db); + if (!s.ok()) { + from = "DB::OpenForReadOnly in backup/restore"; + } + } else { + s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, + &restored_cf_handles, &restored_db); + if (!s.ok()) { + from = "DB::Open in backup/restore"; + } + } } - // for simplicity, currently only verifies existence/non-existence of a few - // keys - for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) { - std::string key_str = Key(rand_keys[i]); + // Note the column families chosen by `rand_column_families` cannot be + // dropped while the locks for `rand_keys` are held. So we should not have + // to worry about accessing those column families throughout this function. 
+ // + // For simplicity, currently only verifies existence/non-existence of a + // single key + for (size_t i = 0; restored_db && s.ok() && i < rand_column_families.size(); + ++i) { + std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string restored_value; + ReadOptions read_opts; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + read_opts.timestamp = &ts; + } Status get_status = restored_db->Get( - ReadOptions(), restored_cf_handles[rand_column_families[i]], key, + read_opts, restored_cf_handles[rand_column_families[i]], key, &restored_value); - bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[i]); + bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]); if (get_status.ok()) { - if (!exists) { + if (!exists && from_latest && ShouldAcquireMutexOnKey()) { s = Status::Corruption("key exists in restore but not in original db"); } } else if (get_status.IsNotFound()) { - if (exists) { + if (exists && from_latest && ShouldAcquireMutexOnKey()) { s = Status::Corruption("key exists in original db but not in restore"); } } else { s = get_status; + if (!s.ok()) { + from = "DB::Get in backup/restore"; + } } } - if (backup_engine != nullptr) { - delete backup_engine; - backup_engine = nullptr; - } if (restored_db != nullptr) { for (auto* cf_handle : restored_cf_handles) { restored_db->DestroyColumnFamilyHandle(cf_handle); @@ -1243,14 +1609,44 @@ delete restored_db; restored_db = nullptr; } + if (s.ok() && inplace_not_restore) { + // Purge late if inplace open read-only + uint32_t to_keep = 0; + if (allow_persistent) { + // allow one thread to keep up to 2 backups + to_keep = thread->rand.Uniform(3); + } + s = backup_engine->PurgeOldBackups(to_keep); + if (!s.ok()) { + from = "BackupEngine::PurgeOldBackups"; + } + } + if (backup_engine != nullptr) { + delete backup_engine; + backup_engine = nullptr; + } + if (s.ok()) { + // Preserve 
directories on failure, or allowed persistent backup + if (!allow_persistent) { + s = DestroyDir(db_stress_env, backup_dir); + if (!s.ok()) { + from = "Destroy backup dir"; + } + } + } + if (s.ok()) { + s = DestroyDir(db_stress_env, restore_dir); + if (!s.ok()) { + from = "Destroy restore dir"; + } + } if (!s.ok()) { - fprintf(stderr, "A backup/restore operation failed with: %s\n", + fprintf(stderr, "Failure in %s with: %s\n", from.c_str(), s.ToString().c_str()); } return s; } -#ifndef ROCKSDB_LITE Status StressTest::TestApproximateSize( ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, @@ -1292,33 +1688,52 @@ return db_->GetApproximateSizes( sao, column_families_[rand_column_families[0]], &range, 1, &result); } -#endif // ROCKSDB_LITE Status StressTest::TestCheckpoint(ThreadState* thread, const std::vector& rand_column_families, const std::vector& rand_keys) { - // Note the column families chosen by `rand_column_families` cannot be - // dropped while the locks for `rand_keys` are held. So we should not have - // to worry about accessing those column families throughout this function. - assert(rand_column_families.size() == rand_keys.size()); std::string checkpoint_dir = FLAGS_db + "/.checkpoint" + ToString(thread->tid); Options tmp_opts(options_); tmp_opts.listeners.clear(); - tmp_opts.env = db_stress_env->target(); + tmp_opts.env = db_stress_env; DestroyDB(checkpoint_dir, tmp_opts); + if (db_stress_env->FileExists(checkpoint_dir).ok()) { + // If the directory might still exist, try to delete the files one by one. + // Likely a trash file is still there. 
+ Status my_s = DestroyDir(db_stress_env, checkpoint_dir); + if (!my_s.ok()) { + fprintf(stderr, "Fail to destory directory before checkpoint: %s", + my_s.ToString().c_str()); + } + } + Checkpoint* checkpoint = nullptr; Status s = Checkpoint::Create(db_, &checkpoint); if (s.ok()) { s = checkpoint->CreateCheckpoint(checkpoint_dir); + if (!s.ok()) { + fprintf(stderr, "Fail to create checkpoint to %s\n", + checkpoint_dir.c_str()); + std::vector files; + Status my_s = db_stress_env->GetChildren(checkpoint_dir, &files); + if (my_s.ok()) { + for (const auto& f : files) { + fprintf(stderr, " %s\n", f.c_str()); + } + } else { + fprintf(stderr, "Fail to get files under the directory to %s\n", + my_s.ToString().c_str()); + } + } } + delete checkpoint; + checkpoint = nullptr; std::vector cf_handles; DB* checkpoint_db = nullptr; if (s.ok()) { - delete checkpoint; - checkpoint = nullptr; Options options(options_); options.listeners.clear(); std::vector cf_descs; @@ -1326,6 +1741,7 @@ // `clear_column_family_one_in != 0`. But we can't easily switch to // `ListColumnFamilies` to get names because it won't necessarily give // the same order as `column_family_names_`. + assert(FLAGS_clear_column_family_one_in == 0); if (FLAGS_clear_column_family_one_in == 0) { for (const auto& name : column_family_names_) { cf_descs.emplace_back(name, ColumnFamilyOptions(options)); @@ -1335,21 +1751,24 @@ } } if (checkpoint_db != nullptr) { + // Note the column families chosen by `rand_column_families` cannot be + // dropped while the locks for `rand_keys` are held. So we should not have + // to worry about accessing those column families throughout this function. 
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) { - std::string key_str = Key(rand_keys[i]); + std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string value; Status get_status = checkpoint_db->Get( ReadOptions(), cf_handles[rand_column_families[i]], key, &value); bool exists = - thread->shared->Exists(rand_column_families[i], rand_keys[i]); + thread->shared->Exists(rand_column_families[i], rand_keys[0]); if (get_status.ok()) { - if (!exists) { + if (!exists && ShouldAcquireMutexOnKey()) { s = Status::Corruption( "key exists in checkpoint but not in original db"); } } else if (get_status.IsNotFound()) { - if (exists) { + if (exists && ShouldAcquireMutexOnKey()) { s = Status::Corruption( "key exists in original db but not in checkpoint"); } @@ -1365,20 +1784,92 @@ checkpoint_db = nullptr; } - DestroyDB(checkpoint_dir, tmp_opts); - if (!s.ok()) { fprintf(stderr, "A checkpoint operation failed with: %s\n", s.ToString().c_str()); + } else { + DestroyDB(checkpoint_dir, tmp_opts); } return s; } +void StressTest::TestGetProperty(ThreadState* thread) const { + std::unordered_set levelPropertyNames = { + DB::Properties::kAggregatedTablePropertiesAtLevel, + DB::Properties::kCompressionRatioAtLevelPrefix, + DB::Properties::kNumFilesAtLevelPrefix, + }; + std::unordered_set unknownPropertyNames = { + DB::Properties::kEstimateOldestKeyTime, + DB::Properties::kOptionsStatistics, + DB::Properties:: + kLiveSstFilesSizeAtTemperature, // similar to levelPropertyNames, it + // requires a number suffix + }; + unknownPropertyNames.insert(levelPropertyNames.begin(), + levelPropertyNames.end()); + + std::string prop; + for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) { + bool res = db_->GetProperty(ppt_name_and_info.first, &prop); + if (unknownPropertyNames.find(ppt_name_and_info.first) == + unknownPropertyNames.end()) { + if (!res) { + fprintf(stderr, "Failed to get DB property: %s\n", + ppt_name_and_info.first.c_str()); + 
thread->shared->SetVerificationFailure(); + } + if (ppt_name_and_info.second.handle_int != nullptr) { + uint64_t prop_int; + if (!db_->GetIntProperty(ppt_name_and_info.first, &prop_int)) { + fprintf(stderr, "Failed to get Int property: %s\n", + ppt_name_and_info.first.c_str()); + thread->shared->SetVerificationFailure(); + } + } + if (ppt_name_and_info.second.handle_map != nullptr) { + std::map prop_map; + if (!db_->GetMapProperty(ppt_name_and_info.first, &prop_map)) { + fprintf(stderr, "Failed to get Map property: %s\n", + ppt_name_and_info.first.c_str()); + thread->shared->SetVerificationFailure(); + } + } + } + } + + ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data; + db_->GetColumnFamilyMetaData(&cf_meta_data); + int level_size = static_cast(cf_meta_data.levels.size()); + for (int level = 0; level < level_size; level++) { + for (const auto& ppt_name : levelPropertyNames) { + bool res = db_->GetProperty(ppt_name + std::to_string(level), &prop); + if (!res) { + fprintf(stderr, "Failed to get DB property: %s\n", + (ppt_name + std::to_string(level)).c_str()); + thread->shared->SetVerificationFailure(); + } + } + } + + // Test for an invalid property name + if (thread->rand.OneIn(100)) { + if (db_->GetProperty("rocksdb.invalid_property_name", &prop)) { + fprintf(stderr, "Failed to return false for invalid property name\n"); + thread->shared->SetVerificationFailure(); + } + } +} + void StressTest::TestCompactFiles(ThreadState* thread, ColumnFamilyHandle* column_family) { ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data; db_->GetColumnFamilyMetaData(column_family, &cf_meta_data); + if (cf_meta_data.levels.empty()) { + return; + } + // Randomly compact up to three consecutive files from a level const int kMaxRetry = 3; for (int attempt = 0; attempt < kMaxRetry; ++attempt) { @@ -1424,6 +1915,9 @@ Status StressTest::TestFlush(const std::vector& rand_column_families) { FlushOptions flush_opts; + if (FLAGS_atomic_flush) { + return db_->Flush(flush_opts, 
column_families_); + } std::vector cfhs; std::for_each(rand_column_families.begin(), rand_column_families.end(), [this, &cfhs](int k) { cfhs.push_back(column_families_[k]); }); @@ -1442,7 +1936,7 @@ // 1 chance in 625 of pausing full 16s.) int pwr2_micros = std::min(thread->rand.Uniform(25), thread->rand.Uniform(25)); - db_stress_env->SleepForMicroseconds(1 << pwr2_micros); + clock_->SleepForMicroseconds(1 << pwr2_micros); return db_->ContinueBackgroundWork(); } @@ -1451,8 +1945,9 @@ const std::string& keystr, uint64_t i) { Slice key = keystr; ColumnFamilyHandle* column_family = column_families_[rand_column_family]; + ReadOptions ropt; #ifndef ROCKSDB_LITE - auto db_impl = reinterpret_cast(db_->GetRootDB()); + auto db_impl = static_cast_with_check(db_->GetRootDB()); const bool ww_snapshot = thread->rand.OneIn(10); const Snapshot* snapshot = ww_snapshot ? db_impl->GetSnapshotForWriteConflictBoundary() @@ -1460,8 +1955,19 @@ #else const Snapshot* snapshot = db_->GetSnapshot(); #endif // !ROCKSDB_LITE - ReadOptions ropt; ropt.snapshot = snapshot; + + // Ideally, we want snapshot taking and timestamp generation to be atomic + // here, so that the snapshot corresponds to the timestamp. However, it is + // not possible with current GetSnapshot() API. + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ropt.timestamp = &ts; + } + std::string value_at; // When taking a snapshot, we also read a key from that snapshot. 
We // will later read the same key before releasing the snapshot and @@ -1483,10 +1989,14 @@ } } - ThreadState::SnapshotState snap_state = { - snapshot, rand_column_family, column_family->GetName(), - keystr, status_at, value_at, - key_vec}; + ThreadState::SnapshotState snap_state = {snapshot, + rand_column_family, + column_family->GetName(), + keystr, + status_at, + value_at, + key_vec, + ts_str}; uint64_t hold_for = FLAGS_snapshot_hold_ops; if (FLAGS_long_running_snapshots) { // Hold 10% of snapshots for 10x more @@ -1591,6 +2101,13 @@ ReadOptions ro; ro.snapshot = snapshot; ro.total_order_seek = true; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ro.timestamp = &ts; + } std::unique_ptr it(db_->NewIterator(ro, column_family)); for (it->Seek(start_key); it->Valid() && options_.comparator->Compare(it->key(), end_key) <= 0; @@ -1617,7 +2134,7 @@ fprintf(stdout, "TransactionDB : %s\n", FLAGS_use_txn ? "true" : "false"); #ifndef ROCKSDB_LITE - fprintf(stdout, "BlobDB : %s\n", + fprintf(stdout, "Stacked BlobDB : %s\n", FLAGS_use_blob_db ? 
"true" : "false"); #endif // !ROCKSDB_LITE fprintf(stdout, "Read only mode : %s\n", @@ -1634,7 +2151,7 @@ (unsigned long)FLAGS_ops_per_thread); std::string ttl_state("unused"); if (FLAGS_ttl > 0) { - ttl_state = NumberToString(FLAGS_ttl); + ttl_state = ToString(FLAGS_ttl); } fprintf(stdout, "Time to live(sec) : %s\n", ttl_state.c_str()); fprintf(stdout, "Read percentage : %d%%\n", FLAGS_readpercent); @@ -1645,6 +2162,7 @@ fprintf(stdout, "No overwrite percentage : %d%%\n", FLAGS_nooverwritepercent); fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); + fprintf(stdout, "Custom ops percentage : %d%%\n", FLAGS_customopspercent); fprintf(stdout, "DB-write-buffer-size : %" PRIu64 "\n", FLAGS_db_write_buffer_size); fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); @@ -1668,6 +2186,8 @@ bottommost_compression.c_str()); std::string checksum = ChecksumTypeToString(checksum_type_e); fprintf(stdout, "Checksum type : %s\n", checksum.c_str()); + fprintf(stdout, "File checksum impl : %s\n", + FLAGS_file_checksum_impl.c_str()); fprintf(stdout, "Bloom bits / key : %s\n", FormatDoubleParam(FLAGS_bloom_bits).c_str()); fprintf(stdout, "Max subcompactions : %" PRIu64 "\n", @@ -1690,13 +2210,16 @@ fprintf(stdout, "Memtablerep : %s\n", memtablerep); - fprintf(stdout, "Test kill odd : %d\n", rocksdb_kill_odds); - if (!rocksdb_kill_prefix_blacklist.empty()) { +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + fprintf(stdout, "Test kill odd : %d\n", kp->rocksdb_kill_odds); + if (!kp->rocksdb_kill_exclude_prefixes.empty()) { fprintf(stdout, "Skipping kill points prefixes:\n"); - for (auto& p : rocksdb_kill_prefix_blacklist) { + for (auto& p : kp->rocksdb_kill_exclude_prefixes) { fprintf(stdout, " %s\n", p.c_str()); } } +#endif fprintf(stdout, "Periodic Compaction Secs : %" PRIu64 "\n", FLAGS_periodic_compaction_seconds); fprintf(stdout, "Compaction TTL : %" PRIu64 "\n", @@ -1709,6 +2232,18 @@ FLAGS_max_write_batch_group_size_bytes); 
fprintf(stdout, "Use dynamic level : %d\n", static_cast(FLAGS_level_compaction_dynamic_level_bytes)); + fprintf(stdout, "Read fault one in : %d\n", FLAGS_read_fault_one_in); + fprintf(stdout, "Write fault one in : %d\n", FLAGS_write_fault_one_in); + fprintf(stdout, "Open metadata write fault one in:\n"); + fprintf(stdout, " %d\n", + FLAGS_open_metadata_write_fault_one_in); + fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection); + fprintf(stdout, "Best efforts recovery : %d\n", + static_cast(FLAGS_best_efforts_recovery)); + fprintf(stdout, "Fail if OPTIONS file error: %d\n", + static_cast(FLAGS_fail_if_options_file_error)); + fprintf(stdout, "User timestamp size bytes : %d\n", + static_cast(FLAGS_user_timestamp_size)); fprintf(stdout, "------------------------------------------------\n"); } @@ -1723,6 +2258,12 @@ block_based_options.block_cache = cache_; block_based_options.cache_index_and_filter_blocks = FLAGS_cache_index_and_filter_blocks; + block_based_options.metadata_cache_options.top_level_index_pinning = + static_cast(FLAGS_top_level_index_pinning); + block_based_options.metadata_cache_options.partition_pinning = + static_cast(FLAGS_partition_pinning); + block_based_options.metadata_cache_options.unpartitioned_pinning = + static_cast(FLAGS_unpartitioned_pinning); block_based_options.block_cache_compressed = compressed_cache_; block_based_options.checksum = checksum_type_e; block_based_options.block_size = FLAGS_block_size; @@ -1732,8 +2273,13 @@ static_cast(FLAGS_index_block_restart_interval); block_based_options.filter_policy = filter_policy_; block_based_options.partition_filters = FLAGS_partition_filters; + block_based_options.optimize_filters_for_memory = + FLAGS_optimize_filters_for_memory; block_based_options.index_type = static_cast(FLAGS_index_type); + block_based_options.prepopulate_block_cache = + static_cast( + FLAGS_prepopulate_block_cache); options_.table_factory.reset( NewBlockBasedTableFactory(block_based_options)); 
options_.db_write_buffer_size = FLAGS_db_write_buffer_size; @@ -1783,12 +2329,18 @@ options_.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes; options_.compression_opts.zstd_max_train_bytes = FLAGS_compression_zstd_max_train_bytes; + options_.compression_opts.parallel_threads = + FLAGS_compression_parallel_threads; + options_.compression_opts.max_dict_buffer_bytes = + FLAGS_compression_max_dict_buffer_bytes; options_.create_if_missing = true; options_.max_manifest_file_size = FLAGS_max_manifest_file_size; options_.inplace_update_support = FLAGS_in_place_update; options_.max_subcompactions = static_cast(FLAGS_subcompactions); options_.allow_concurrent_memtable_write = FLAGS_allow_concurrent_memtable_write; + options_.experimental_mempurge_threshold = + FLAGS_experimental_mempurge_threshold; options_.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds; options_.ttl = FLAGS_compaction_ttl; options_.enable_pipelined_write = FLAGS_enable_pipelined_write; @@ -1806,10 +2358,29 @@ options_.avoid_unnecessary_blocking_io = FLAGS_avoid_unnecessary_blocking_io; options_.write_dbid_to_manifest = FLAGS_write_dbid_to_manifest; + options_.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery; options_.max_write_batch_group_size_bytes = FLAGS_max_write_batch_group_size_bytes; options_.level_compaction_dynamic_level_bytes = FLAGS_level_compaction_dynamic_level_bytes; + options_.file_checksum_gen_factory = + GetFileChecksumImpl(FLAGS_file_checksum_impl); + options_.track_and_verify_wals_in_manifest = true; + + // Integrated BlobDB + options_.enable_blob_files = FLAGS_enable_blob_files; + options_.min_blob_size = FLAGS_min_blob_size; + options_.blob_file_size = FLAGS_blob_file_size; + options_.blob_compression_type = + StringToCompressionType(FLAGS_blob_compression_type.c_str()); + options_.enable_blob_garbage_collection = + FLAGS_enable_blob_garbage_collection; + options_.blob_garbage_collection_age_cutoff = + 
FLAGS_blob_garbage_collection_age_cutoff; + options_.blob_garbage_collection_force_threshold = + FLAGS_blob_garbage_collection_force_threshold; + options_.blob_compaction_readahead_size = + FLAGS_blob_compaction_readahead_size; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file not supported in lite mode\n"); @@ -1839,6 +2410,21 @@ options_.new_table_reader_for_compaction_inputs = true; } } + if (FLAGS_sst_file_manager_bytes_per_sec > 0 || + FLAGS_sst_file_manager_bytes_per_truncate > 0) { + Status status; + options_.sst_file_manager.reset(NewSstFileManager( + db_stress_env, options_.info_log, "" /* trash_dir */, + static_cast(FLAGS_sst_file_manager_bytes_per_sec), + true /* delete_existing_trash */, &status, + 0.25 /* max_trash_db_ratio */, + FLAGS_sst_file_manager_bytes_per_truncate)); + if (!status.ok()) { + fprintf(stderr, "SstFileManager creation failed: %s\n", + status.ToString().c_str()); + exit(1); + } + } if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) { fprintf(stderr, @@ -1874,10 +2460,47 @@ } else { options_.merge_operator = MergeOperators::CreatePutOperator(); } + if (FLAGS_enable_compaction_filter) { + options_.compaction_filter_factory = + std::make_shared(); + } + options_.table_properties_collector_factories.emplace_back( + std::make_shared()); + + options_.best_efforts_recovery = FLAGS_best_efforts_recovery; + options_.paranoid_file_checks = FLAGS_paranoid_file_checks; + options_.fail_if_options_file_error = FLAGS_fail_if_options_file_error; + + if ((options_.enable_blob_files || options_.enable_blob_garbage_collection || + FLAGS_allow_setting_blob_options_dynamically) && + FLAGS_best_efforts_recovery) { + fprintf(stderr, + "Integrated BlobDB is currently incompatible with best-effort " + "recovery\n"); + exit(1); + } + + fprintf(stdout, + "Integrated BlobDB: blob files enabled %d, min blob size %" PRIu64 + ", blob file size %" PRIu64 + ", blob compression type %s, blob GC enabled %d, cutoff %f, force " + "threshold %f, 
blob compaction readahead size %" PRIu64 "\n", + options_.enable_blob_files, options_.min_blob_size, + options_.blob_file_size, + CompressionTypeToString(options_.blob_compression_type).c_str(), + options_.enable_blob_garbage_collection, + options_.blob_garbage_collection_age_cutoff, + options_.blob_garbage_collection_force_threshold, + options_.blob_compaction_readahead_size); fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); Status s; + + if (FLAGS_user_timestamp_size > 0) { + CheckAndSetOptionsForUserTimestamp(); + } + if (FLAGS_ttl == -1) { std::vector existing_column_families; s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db, @@ -1927,36 +2550,130 @@ column_family_names_.push_back(name); } options_.listeners.clear(); +#ifndef ROCKSDB_LITE options_.listeners.emplace_back( new DbStressListener(FLAGS_db, options_.db_paths, cf_descriptors)); +#endif // !ROCKSDB_LITE options_.create_missing_column_families = true; if (!FLAGS_use_txn) { -#ifndef ROCKSDB_LITE - if (FLAGS_use_blob_db) { - blob_db::BlobDBOptions blob_db_options; - blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; - blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; - blob_db_options.blob_file_size = FLAGS_blob_db_file_size; - blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; - blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; - - blob_db::BlobDB* blob_db = nullptr; - s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, - cf_descriptors, &column_families_, &blob_db); - if (s.ok()) { - db_ = blob_db; - } - } else +#ifndef NDEBUG + // Determine whether we need to ingest file metadata write failures + // during DB reopen. If it does, enable it. + // Only ingest metadata error if it is reopening, as initial open + // failure doesn't need to be handled. + // TODO cover transaction DB is not covered in this fault test too. 
+ bool ingest_meta_error = false; + bool ingest_write_error = false; + bool ingest_read_error = false; + if ((FLAGS_open_metadata_write_fault_one_in || + FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) && + fault_fs_guard + ->FileExists(FLAGS_db + "/CURRENT", IOOptions(), nullptr) + .ok()) { + if (!FLAGS_sync) { + // When DB Stress is not sync mode, we expect all WAL writes to + // WAL is durable. Buffering unsynced writes will cause false + // positive in crash tests. Before we figure out a way to + // solve it, skip WAL from failure injection. + fault_fs_guard->SetSkipDirectWritableTypes({kWalFile}); + } + ingest_meta_error = FLAGS_open_metadata_write_fault_one_in; + ingest_write_error = FLAGS_open_write_fault_one_in; + ingest_read_error = FLAGS_open_read_fault_one_in; + if (ingest_meta_error) { + fault_fs_guard->EnableMetadataWriteErrorInjection(); + fault_fs_guard->SetRandomMetadataWriteError( + FLAGS_open_metadata_write_fault_one_in); + } + if (ingest_write_error) { + fault_fs_guard->SetFilesystemDirectWritable(false); + fault_fs_guard->EnableWriteErrorInjection(); + fault_fs_guard->SetRandomWriteError( + static_cast(FLAGS_seed), FLAGS_open_write_fault_one_in, + IOStatus::IOError("Injected Open Error"), + /*inject_for_all_file_types=*/true, /*types=*/{}); + } + if (ingest_read_error) { + fault_fs_guard->SetRandomReadError(FLAGS_open_read_fault_one_in); + } + } + while (true) { +#endif // NDEBUG +#ifndef ROCKSDB_LITE + // StackableDB-based BlobDB + if (FLAGS_use_blob_db) { + blob_db::BlobDBOptions blob_db_options; + blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; + blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; + blob_db_options.blob_file_size = FLAGS_blob_db_file_size; + blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; + blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; + + blob_db::BlobDB* blob_db = nullptr; + s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, + 
cf_descriptors, &column_families_, + &blob_db); + if (s.ok()) { + db_ = blob_db; + } + } else #endif // !ROCKSDB_LITE - { - if (db_preload_finished_.load() && FLAGS_read_only) { - s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); - } else { - s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); + { + if (db_preload_finished_.load() && FLAGS_read_only) { + s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, + cf_descriptors, &column_families_, &db_); + } else { + s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, + &column_families_, &db_); + } + } + +#ifndef NDEBUG + if (ingest_meta_error || ingest_write_error || ingest_read_error) { + fault_fs_guard->SetFilesystemDirectWritable(true); + fault_fs_guard->DisableMetadataWriteErrorInjection(); + fault_fs_guard->DisableWriteErrorInjection(); + fault_fs_guard->SetSkipDirectWritableTypes({}); + fault_fs_guard->SetRandomReadError(0); + if (s.ok()) { + // Ingested errors might happen in background compactions. We + // wait for all compactions to finish to make sure DB is in + // clean state before executing queries. + s = static_cast_with_check(db_->GetRootDB()) + ->TEST_WaitForCompact(true); + if (!s.ok()) { + for (auto cf : column_families_) { + delete cf; + } + column_families_.clear(); + delete db_; + db_ = nullptr; + } + } + if (!s.ok()) { + // After failure to opening a DB due to IO error, retry should + // successfully open the DB with correct data if no IO error shows + // up. 
+ ingest_meta_error = false; + ingest_write_error = false; + ingest_read_error = false; + + Random rand(static_cast(FLAGS_seed)); + if (rand.OneIn(2)) { + fault_fs_guard->DeleteFilesCreatedAfterLastDirSync(IOOptions(), + nullptr); + } + if (rand.OneIn(3)) { + fault_fs_guard->DropUnsyncedFileData(); + } else if (rand.OneIn(2)) { + fault_fs_guard->DropRandomUnsyncedFileData(&rand); + } + continue; + } } + break; } +#endif // NDEBUG } else { #ifndef ROCKSDB_LITE TransactionDBOptions txn_db_options; @@ -2000,7 +2717,7 @@ assert(!s.ok() || column_families_.size() == static_cast(FLAGS_column_families)); - if (FLAGS_test_secondary) { + if (s.ok() && FLAGS_test_secondary) { #ifndef ROCKSDB_LITE secondaries_.resize(FLAGS_threads); std::fill(secondaries_.begin(), secondaries_.end(), nullptr); @@ -2021,13 +2738,12 @@ break; } } - assert(s.ok()); #else fprintf(stderr, "Secondary is not supported in RocksDBLite\n"); exit(1); #endif } - if (FLAGS_continuous_verification_interval > 0 && !cmp_db_) { + if (s.ok() && FLAGS_continuous_verification_interval > 0 && !cmp_db_) { Options tmp_opts; // TODO(yanqin) support max_open_files != -1 for secondary instance. tmp_opts.max_open_files = -1; @@ -2077,7 +2793,7 @@ // the db via a callbac ii) they hold on to a snapshot and the upcoming // ::Close would complain about it. 
const bool write_prepared = FLAGS_use_txn && FLAGS_txn_write_policy != 0; - bool bg_canceled = false; + bool bg_canceled __attribute__((unused)) = false; if (write_prepared || thread->rand.OneIn(2)) { const bool wait = write_prepared || static_cast(thread->rand.OneIn(2)); @@ -2085,7 +2801,6 @@ bg_canceled = wait; } assert(!write_prepared || bg_canceled); - (void) bg_canceled; #else (void) thread; #endif @@ -2123,11 +2838,80 @@ secondaries_.clear(); num_times_reopened_++; - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database for the %dth time\n", - db_stress_env->TimeToString(now / 1000000).c_str(), - num_times_reopened_); + clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_); Open(); + + if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) { + Status s = thread->shared->SaveAtAndAfter(db_); + if (!s.ok()) { + fprintf(stderr, "Error enabling history tracing: %s\n", + s.ToString().c_str()); + exit(1); + } + } +} + +void StressTest::CheckAndSetOptionsForUserTimestamp() { + assert(FLAGS_user_timestamp_size > 0); + const Comparator* const cmp = test::ComparatorWithU64Ts(); + assert(cmp); + if (FLAGS_user_timestamp_size != cmp->timestamp_size()) { + fprintf(stderr, + "Only -user_timestamp_size=%d is supported in stress test.\n", + static_cast(cmp->timestamp_size())); + exit(1); + } + if (FLAGS_use_merge || FLAGS_use_full_merge_v1) { + fprintf(stderr, "Merge does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_delrangepercent > 0) { + fprintf(stderr, "DeleteRange does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_use_txn) { + fprintf(stderr, "TransactionDB does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_read_only) { + fprintf(stderr, "When opened as read-only, timestamp not supported.\n"); + exit(1); + } + if (FLAGS_test_secondary || FLAGS_secondary_catch_up_one_in > 0 || + FLAGS_continuous_verification_interval > 0) { + fprintf(stderr, 
"Secondary instance does not support timestamp.\n"); + exit(1); + } + if (FLAGS_checkpoint_one_in > 0) { + fprintf(stderr, + "-checkpoint_one_in=%d requires " + "DBImplReadOnly, which is not supported with timestamp\n", + FLAGS_checkpoint_one_in); + exit(1); + } +#ifndef ROCKSDB_LITE + if (FLAGS_enable_blob_files || FLAGS_use_blob_db) { + fprintf(stderr, "BlobDB not supported with timestamp.\n"); + exit(1); + } +#endif // !ROCKSDB_LITE + if (FLAGS_enable_compaction_filter) { + fprintf(stderr, "CompactionFilter not supported with timestamp.\n"); + exit(1); + } + if (FLAGS_test_cf_consistency || FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Due to per-key ts-seq ordering constraint, only the (default) " + "non-batched test is supported with timestamp.\n"); + exit(1); + } + if (FLAGS_ingest_external_file_one_in > 0) { + fprintf(stderr, "Bulk loading may not support timestamp yet.\n"); + exit(1); + } + options_.comparator = cmp; } } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include "db_stress_tool/db_stress_shared_state.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class Transaction; class TransactionDB; @@ -22,12 +23,16 @@ virtual ~StressTest(); - std::shared_ptr NewCache(size_t capacity); + std::shared_ptr NewCache(size_t capacity, int32_t num_shard_bits); + + static std::vector GetBlobCompressionTags(); bool BuildOptionsTable(); void InitDb(); - void InitReadonlyDb(SharedState*); + // The initialization work is split into two parts to avoid a circular + // dependency with `SharedState`. 
+ virtual void FinishInitDb(SharedState*); // Return false if verification fails. bool VerifySecondaries(); @@ -60,6 +65,9 @@ virtual bool ShouldAcquireMutexOnKey() const { return false; } + // Returns true if DB state is tracked by the stress test. + virtual bool IsStateTracked() const = 0; + virtual std::vector GenerateColumnFamilies( const int /* num_column_families */, int rand_column_family) const { return {rand_column_family}; @@ -184,13 +192,23 @@ Status MaybeReleaseSnapshots(ThreadState* thread, uint64_t i); #ifndef ROCKSDB_LITE - Status VerifyGetLiveAndWalFiles(ThreadState* thread); + Status VerifyGetLiveFiles() const; + Status VerifyGetSortedWalFiles() const; + Status VerifyGetCurrentWalFile() const; + void TestGetProperty(ThreadState* thread) const; + virtual Status TestApproximateSize( ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, const std::vector& rand_keys); #endif // !ROCKSDB_LITE + virtual Status TestCustomOperations( + ThreadState* /*thread*/, + const std::vector& /*rand_column_families*/) { + return Status::NotSupported("TestCustomOperations() must be overridden"); + } + void VerificationAbort(SharedState* shared, std::string msg, Status s) const; void VerificationAbort(SharedState* shared, std::string msg, int cf, @@ -202,6 +220,8 @@ void Reopen(ThreadState* thread); + void CheckAndSetOptionsForUserTimestamp(); + std::shared_ptr cache_; std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; @@ -210,6 +230,7 @@ TransactionDB* txn_db_; #endif Options options_; + SystemClock* clock_; std::vector column_families_; std::vector column_family_names_; std::atomic new_column_family_name_; @@ -225,6 +246,7 @@ // Fields used for continuous verification from another thread DB* cmp_db_; std::vector cmp_cfhs_; + bool is_db_stopped_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc 2025-05-19 16:14:27.000000000 +0000 @@ -23,11 +23,16 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_driver.h" +#include "rocksdb/convenience.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif namespace ROCKSDB_NAMESPACE { namespace { static std::shared_ptr env_guard; static std::shared_ptr env_wrapper_guard; +static std::shared_ptr fault_env_guard; } // namespace KeyGenContext key_gen_ctx; @@ -41,6 +46,11 @@ SanitizeDoubleParam(&FLAGS_memtable_prefix_bloom_size_ratio); SanitizeDoubleParam(&FLAGS_max_bytes_for_level_multiplier); +#ifndef NDEBUG + if (FLAGS_mock_direct_io) { + SetupSyncPointsToMockDirectIO(); + } +#endif if (FLAGS_statistics) { dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics(); if (FLAGS_test_secondary) { @@ -54,24 +64,64 @@ Env* raw_env; + int env_opts = + !FLAGS_hdfs.empty() + !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty(); + if (env_opts > 1) { + fprintf(stderr, + "Error: --hdfs, --env_uri and --fs_uri are mutually exclusive\n"); + exit(1); + } + if (!FLAGS_hdfs.empty()) { - if (!FLAGS_env_uri.empty()) { - fprintf(stderr, "Cannot specify both --hdfs and --env_uri.\n"); - exit(1); - } raw_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs); - } else if (!FLAGS_env_uri.empty()) { - Status s = Env::LoadEnv(FLAGS_env_uri, &raw_env, &env_guard); - if (raw_env == nullptr) { - fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str()); + } else { + Status s = Env::CreateFromUri(ConfigOptions(), FLAGS_env_uri, FLAGS_fs_uri, + &raw_env, &env_guard); + if (!s.ok()) { + fprintf(stderr, "Error Creating Env URI: %s: %s\n", FLAGS_env_uri.c_str(), + s.ToString().c_str()); exit(1); } - } else { - raw_env = Env::Default(); 
} + +#ifndef NDEBUG + if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection || + FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in || + FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) { + FaultInjectionTestFS* fs = + new FaultInjectionTestFS(raw_env->GetFileSystem()); + fault_fs_guard.reset(fs); + if (FLAGS_write_fault_one_in) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } else { + fault_fs_guard->SetFilesystemDirectWritable(true); + } + fault_env_guard = + std::make_shared(raw_env, fault_fs_guard); + raw_env = fault_env_guard.get(); + } + if (FLAGS_write_fault_one_in) { + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_guard->EnableWriteErrorInjection(); }); + SyncPoint::GetInstance()->EnableProcessing(); + } +#endif + env_wrapper_guard = std::make_shared(raw_env); db_stress_env = env_wrapper_guard.get(); +#ifndef NDEBUG + if (FLAGS_write_fault_one_in) { + // In the write injection case, we need to use the FS interface and returns + // the IOStatus with different error and flags. Therefore, + // DbStressEnvWrapper cannot be used which will swallow the FS + // implementations. We should directly use the raw_env which is the + // CompositeEnvWrapper of env and fault_fs. 
+ db_stress_env = raw_env; + } +#endif + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); // The number of background threads should be at least as much the @@ -92,17 +142,26 @@ "test_batches_snapshots test!\n"); exit(1); } - if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0) { + if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0 && + !FLAGS_memtable_whole_key_filtering) { fprintf(stderr, - "Error: please specify positive prefix_size in order to use " - "memtable_prefix_bloom_size_ratio\n"); + "Error: please specify positive prefix_size or enable whole key " + "filtering in order to use memtable_prefix_bloom_size_ratio\n"); exit(1); } if ((FLAGS_readpercent + FLAGS_prefixpercent + FLAGS_writepercent + - FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent) != 100) { - fprintf(stderr, - "Error: Read+Prefix+Write+Delete+DeleteRange+Iterate percents != " - "100!\n"); + FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent + + FLAGS_customopspercent) != 100) { + fprintf( + stderr, + "Error: " + "Read(-readpercent=%d)+Prefix(-prefixpercent=%d)+Write(-writepercent=%" + "d)+Delete(-delpercent=%d)+DeleteRange(-delrangepercent=%d)" + "+Iterate(-iterpercent=%d)+CustomOps(-customopspercent=%d) percents != " + "100!\n", + FLAGS_readpercent, FLAGS_prefixpercent, FLAGS_writepercent, + FLAGS_delpercent, FLAGS_delrangepercent, FLAGS_iterpercent, + FLAGS_customopspercent); exit(1); } if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) { @@ -195,9 +254,52 @@ "Must set -test_secondary=true if secondary_catch_up_one_in > 0.\n"); exit(1); } + if (FLAGS_best_efforts_recovery && !FLAGS_skip_verifydb && + !FLAGS_disable_wal) { + fprintf(stderr, + "With best-efforts recovery, either skip_verifydb or disable_wal " + "should be set to true.\n"); + exit(1); + } + if (FLAGS_skip_verifydb) { + if (FLAGS_verify_db_one_in > 0) { + fprintf(stderr, + "Must set -verify_db_one_in=0 if skip_verifydb is true.\n"); + exit(1); + 
} + if (FLAGS_continuous_verification_interval > 0) { + fprintf(stderr, + "Must set -continuous_verification_interval=0 if skip_verifydb " + "is true.\n"); + exit(1); + } + } + if (FLAGS_enable_compaction_filter && + (FLAGS_acquire_snapshot_one_in > 0 || FLAGS_compact_range_one_in > 0 || + FLAGS_iterpercent > 0 || FLAGS_test_batches_snapshots || + FLAGS_test_cf_consistency)) { + fprintf( + stderr, + "Error: acquire_snapshot_one_in, compact_range_one_in, iterpercent, " + "test_batches_snapshots must all be 0 when using compaction filter\n"); + exit(1); + } + if (FLAGS_batch_protection_bytes_per_key > 0 && + !FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Error: test_batches_snapshots must be enabled when " + "batch_protection_bytes_per_key > 0\n"); + exit(1); + } + if (FLAGS_test_multi_ops_txns) { + CheckAndSetOptionsForMultiOpsTxnStressTest(); + } - rocksdb_kill_odds = FLAGS_kill_random_test; - rocksdb_kill_prefix_blacklist = SplitString(FLAGS_kill_prefix_blacklist); +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + kp->rocksdb_kill_odds = FLAGS_kill_random_test; + kp->rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes); +#endif unsigned int levels = FLAGS_max_key_len; std::vector weights; @@ -224,7 +326,7 @@ } } else { uint64_t keys_per_level = key_gen_ctx.window / levels; - for (unsigned int level = 0; level < levels - 1; ++level) { + for (unsigned int level = 0; level + 1 < levels; ++level) { key_gen_ctx.weights.emplace_back(keys_per_level); } key_gen_ctx.weights.emplace_back(key_gen_ctx.window - @@ -236,6 +338,8 @@ stress.reset(CreateCfConsistencyStressTest()); } else if (FLAGS_test_batches_snapshots) { stress.reset(CreateBatchedOpsStressTest()); + } else if (FLAGS_test_multi_ops_txns) { + stress.reset(CreateMultiOpsTxnsStressTest()); } else { stress.reset(CreateNonBatchedOpsStressTest()); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,616 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#include "db_stress_tool/expected_state.h" + +#include "db_stress_tool/db_stress_common.h" +#include "db_stress_tool/db_stress_shared_state.h" +#include "rocksdb/trace_reader_writer.h" +#include "rocksdb/trace_record_result.h" + +namespace ROCKSDB_NAMESPACE { + +ExpectedState::ExpectedState(size_t max_key, size_t num_column_families) + : max_key_(max_key), + num_column_families_(num_column_families), + values_(nullptr) {} + +void ExpectedState::ClearColumnFamily(int cf) { + std::fill(&Value(cf, 0 /* key */), &Value(cf + 1, 0 /* key */), + SharedState::DELETION_SENTINEL); +} + +void ExpectedState::Put(int cf, int64_t key, uint32_t value_base, + bool pending) { + if (!pending) { + // prevent expected-value update from reordering before Write + std::atomic_thread_fence(std::memory_order_release); + } + Value(cf, key).store(pending ? 
SharedState::UNKNOWN_SENTINEL : value_base, + std::memory_order_relaxed); + if (pending) { + // prevent Write from reordering before expected-value update + std::atomic_thread_fence(std::memory_order_release); + } +} + +uint32_t ExpectedState::Get(int cf, int64_t key) const { + return Value(cf, key); +} + +bool ExpectedState::Delete(int cf, int64_t key, bool pending) { + if (Value(cf, key) == SharedState::DELETION_SENTINEL) { + return false; + } + Put(cf, key, SharedState::DELETION_SENTINEL, pending); + return true; +} + +bool ExpectedState::SingleDelete(int cf, int64_t key, bool pending) { + return Delete(cf, key, pending); +} + +int ExpectedState::DeleteRange(int cf, int64_t begin_key, int64_t end_key, + bool pending) { + int covered = 0; + for (int64_t key = begin_key; key < end_key; ++key) { + if (Delete(cf, key, pending)) { + ++covered; + } + } + return covered; +} + +bool ExpectedState::Exists(int cf, int64_t key) { + // UNKNOWN_SENTINEL counts as exists. That assures a key for which overwrite + // is disallowed can't be accidentally added a second time, in which case + // SingleDelete wouldn't be able to properly delete the key. It does allow + // the case where a SingleDelete might be added which covers nothing, but + // that's not a correctness issue. 
+ uint32_t expected_value = Value(cf, key).load(); + return expected_value != SharedState::DELETION_SENTINEL; +} + +void ExpectedState::Reset() { + for (size_t i = 0; i < num_column_families_; ++i) { + for (size_t j = 0; j < max_key_; ++j) { + Delete(static_cast(i), j, false /* pending */); + } + } +} + +FileExpectedState::FileExpectedState(std::string expected_state_file_path, + size_t max_key, size_t num_column_families) + : ExpectedState(max_key, num_column_families), + expected_state_file_path_(expected_state_file_path) {} + +Status FileExpectedState::Open(bool create) { + size_t expected_values_size = GetValuesLen(); + + Env* default_env = Env::Default(); + + Status status; + if (create) { + std::unique_ptr wfile; + const EnvOptions soptions; + status = default_env->NewWritableFile(expected_state_file_path_, &wfile, + soptions); + if (status.ok()) { + std::string buf(expected_values_size, '\0'); + status = wfile->Append(buf); + } + } + if (status.ok()) { + status = default_env->NewMemoryMappedFileBuffer( + expected_state_file_path_, &expected_state_mmap_buffer_); + } + if (status.ok()) { + assert(expected_state_mmap_buffer_->GetLen() == expected_values_size); + values_ = static_cast*>( + expected_state_mmap_buffer_->GetBase()); + assert(values_ != nullptr); + if (create) { + Reset(); + } + } else { + assert(values_ == nullptr); + } + return status; +} + +AnonExpectedState::AnonExpectedState(size_t max_key, size_t num_column_families) + : ExpectedState(max_key, num_column_families) {} + +#ifndef NDEBUG +Status AnonExpectedState::Open(bool create) { +#else +Status AnonExpectedState::Open(bool /* create */) { +#endif + // AnonExpectedState only supports being freshly created. 
+ assert(create); + values_allocation_.reset( + new std::atomic[GetValuesLen() / + sizeof(std::atomic)]); + values_ = &values_allocation_[0]; + Reset(); + return Status::OK(); +} + +ExpectedStateManager::ExpectedStateManager(size_t max_key, + size_t num_column_families) + : max_key_(max_key), + num_column_families_(num_column_families), + latest_(nullptr) {} + +ExpectedStateManager::~ExpectedStateManager() {} + +const std::string FileExpectedStateManager::kLatestBasename = "LATEST"; +const std::string FileExpectedStateManager::kStateFilenameSuffix = ".state"; +const std::string FileExpectedStateManager::kTraceFilenameSuffix = ".trace"; +const std::string FileExpectedStateManager::kTempFilenamePrefix = "."; +const std::string FileExpectedStateManager::kTempFilenameSuffix = ".tmp"; + +FileExpectedStateManager::FileExpectedStateManager( + size_t max_key, size_t num_column_families, + std::string expected_state_dir_path) + : ExpectedStateManager(max_key, num_column_families), + expected_state_dir_path_(std::move(expected_state_dir_path)) { + assert(!expected_state_dir_path_.empty()); +} + +Status FileExpectedStateManager::Open() { + // Before doing anything, sync directory state with ours. That is, determine + // `saved_seqno_`, and create any necessary missing files. 
+ std::vector expected_state_dir_children; + Status s = Env::Default()->GetChildren(expected_state_dir_path_, + &expected_state_dir_children); + bool found_trace = false; + if (s.ok()) { + for (size_t i = 0; i < expected_state_dir_children.size(); ++i) { + const auto& filename = expected_state_dir_children[i]; + if (filename.size() >= kStateFilenameSuffix.size() && + filename.rfind(kStateFilenameSuffix) == + filename.size() - kStateFilenameSuffix.size() && + filename.rfind(kLatestBasename, 0) == std::string::npos) { + SequenceNumber found_seqno = ParseUint64( + filename.substr(0, filename.size() - kStateFilenameSuffix.size())); + if (saved_seqno_ == kMaxSequenceNumber || found_seqno > saved_seqno_) { + saved_seqno_ = found_seqno; + } + } + } + // Check if crash happened after creating state file but before creating + // trace file. + if (saved_seqno_ != kMaxSequenceNumber) { + std::string saved_seqno_trace_path = + GetPathForFilename(ToString(saved_seqno_) + kTraceFilenameSuffix); + Status exists_status = Env::Default()->FileExists(saved_seqno_trace_path); + if (exists_status.ok()) { + found_trace = true; + } else if (exists_status.IsNotFound()) { + found_trace = false; + } else { + s = exists_status; + } + } + } + if (s.ok() && saved_seqno_ != kMaxSequenceNumber && !found_trace) { + // Create an empty trace file so later logic does not need to distinguish + // missing vs. empty trace file. 
+ std::unique_ptr wfile; + const EnvOptions soptions; + std::string saved_seqno_trace_path = + GetPathForFilename(ToString(saved_seqno_) + kTraceFilenameSuffix); + s = Env::Default()->NewWritableFile(saved_seqno_trace_path, &wfile, + soptions); + } + + if (s.ok()) { + s = Clean(); + } + + std::string expected_state_file_path = + GetPathForFilename(kLatestBasename + kStateFilenameSuffix); + bool found = false; + if (s.ok()) { + Status exists_status = Env::Default()->FileExists(expected_state_file_path); + if (exists_status.ok()) { + found = true; + } else if (exists_status.IsNotFound()) { + found = false; + } else { + s = exists_status; + } + } + + if (!found) { + // Initialize the file in a temp path and then rename it. That way, in case + // this process is killed during setup, `Clean()` will take care of removing + // the incomplete expected values file. + std::string temp_expected_state_file_path = + GetTempPathForFilename(kLatestBasename + kStateFilenameSuffix); + FileExpectedState temp_expected_state(temp_expected_state_file_path, + max_key_, num_column_families_); + if (s.ok()) { + s = temp_expected_state.Open(true /* create */); + } + if (s.ok()) { + s = Env::Default()->RenameFile(temp_expected_state_file_path, + expected_state_file_path); + } + } + + if (s.ok()) { + latest_.reset(new FileExpectedState(std::move(expected_state_file_path), + max_key_, num_column_families_)); + s = latest_->Open(false /* create */); + } + return s; +} + +#ifndef ROCKSDB_LITE +Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { + SequenceNumber seqno = db->GetLatestSequenceNumber(); + + std::string state_filename = ToString(seqno) + kStateFilenameSuffix; + std::string state_file_temp_path = GetTempPathForFilename(state_filename); + std::string state_file_path = GetPathForFilename(state_filename); + + std::string latest_file_path = + GetPathForFilename(kLatestBasename + kStateFilenameSuffix); + + std::string trace_filename = ToString(seqno) + kTraceFilenameSuffix; + 
std::string trace_file_path = GetPathForFilename(trace_filename); + + // Populate a tempfile and then rename it to atomically create ".state" + // with contents from "LATEST.state" + Status s = + CopyFile(FileSystem::Default(), latest_file_path, state_file_temp_path, + 0 /* size */, false /* use_fsync */); + if (s.ok()) { + s = FileSystem::Default()->RenameFile(state_file_temp_path, state_file_path, + IOOptions(), nullptr /* dbg */); + } + SequenceNumber old_saved_seqno = 0; + if (s.ok()) { + old_saved_seqno = saved_seqno_; + saved_seqno_ = seqno; + } + + // If there is a crash now, i.e., after ".state" was created but before + // ".trace" is created, it will be treated as if ".trace" were + // present but empty. + + // Create ".trace" directly. It is initially empty so no need for + // tempfile. + std::unique_ptr trace_writer; + if (s.ok()) { + EnvOptions soptions; + // Disable buffering so traces will not get stuck in application buffer. + soptions.writable_file_max_buffer_size = 0; + s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path, + &trace_writer); + } + if (s.ok()) { + TraceOptions trace_opts; + trace_opts.filter |= kTraceFilterGet; + trace_opts.filter |= kTraceFilterMultiGet; + trace_opts.filter |= kTraceFilterIteratorSeek; + trace_opts.filter |= kTraceFilterIteratorSeekForPrev; + trace_opts.preserve_write_order = true; + s = db->StartTrace(trace_opts, std::move(trace_writer)); + } + + // Delete old state/trace files. Deletion order does not matter since we only + // delete after successfully saving new files, so old files will never be used + // again, even if we crash. 
+ if (s.ok() && old_saved_seqno != kMaxSequenceNumber && + old_saved_seqno != saved_seqno_) { + s = Env::Default()->DeleteFile( + GetPathForFilename(ToString(old_saved_seqno) + kStateFilenameSuffix)); + } + if (s.ok() && old_saved_seqno != kMaxSequenceNumber && + old_saved_seqno != saved_seqno_) { + s = Env::Default()->DeleteFile( + GetPathForFilename(ToString(old_saved_seqno) + kTraceFilenameSuffix)); + } + return s; +} +#else // ROCKSDB_LITE +Status FileExpectedStateManager::SaveAtAndAfter(DB* /* db */) { + return Status::NotSupported(); +} +#endif // ROCKSDB_LITE + +bool FileExpectedStateManager::HasHistory() { + return saved_seqno_ != kMaxSequenceNumber; +} + +#ifndef ROCKSDB_LITE + +namespace { + +// An `ExpectedStateTraceRecordHandler` applies a configurable number of +// write operation trace records to the configured expected state. It is used in +// `FileExpectedStateManager::Restore()` to sync the expected state with the +// DB's post-recovery state. +class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, + public WriteBatch::Handler { + public: + ExpectedStateTraceRecordHandler(uint64_t max_write_ops, ExpectedState* state) + : max_write_ops_(max_write_ops), state_(state) {} + + ~ExpectedStateTraceRecordHandler() { assert(IsDone()); } + + // True if we have already reached the limit on write operations to apply. + bool IsDone() { return num_write_ops_ == max_write_ops_; } + + Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr* /* result */) override { + if (IsDone()) { + return Status::OK(); + } + WriteBatch batch(record.GetWriteBatchRep().ToString()); + return batch.Iterate(this); + } + + // Ignore reads. + Status Handle(const GetQueryTraceRecord& /* record */, + std::unique_ptr* /* result */) override { + return Status::OK(); + } + + // Ignore reads. + Status Handle(const IteratorSeekQueryTraceRecord& /* record */, + std::unique_ptr* /* result */) override { + return Status::OK(); + } + + // Ignore reads. 
+ Status Handle(const MultiGetQueryTraceRecord& /* record */, + std::unique_ptr* /* result */) override { + return Status::OK(); + } + + // Below are the WriteBatch::Handler overrides. We could use a separate + // object, but it's convenient and works to share state with the + // `TraceRecord::Handler`. + + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + uint64_t key_id; + if (!GetIntVal(key.ToString(), &key_id)) { + return Status::Corruption("unable to parse key", key.ToString()); + } + uint32_t value_id = GetValueBase(value); + + state_->Put(column_family_id, static_cast(key_id), value_id, + false /* pending */); + ++num_write_ops_; + return Status::OK(); + } + + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + uint64_t key_id; + if (!GetIntVal(key.ToString(), &key_id)) { + return Status::Corruption("unable to parse key", key.ToString()); + } + + state_->Delete(column_family_id, static_cast(key_id), + false /* pending */); + ++num_write_ops_; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + return DeleteCF(column_family_id, key); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + uint64_t begin_key_id, end_key_id; + if (!GetIntVal(begin_key.ToString(), &begin_key_id)) { + return Status::Corruption("unable to parse begin key", + begin_key.ToString()); + } + if (!GetIntVal(end_key.ToString(), &end_key_id)) { + return Status::Corruption("unable to parse end key", end_key.ToString()); + } + + state_->DeleteRange(column_family_id, static_cast(begin_key_id), + static_cast(end_key_id), false /* pending */); + ++num_write_ops_; + return Status::OK(); + } + + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + return PutCF(column_family_id, key, value); + } + + private: + uint64_t num_write_ops_ = 0; + uint64_t max_write_ops_; + 
ExpectedState* state_; +}; + +} // anonymous namespace + +Status FileExpectedStateManager::Restore(DB* db) { + assert(HasHistory()); + SequenceNumber seqno = db->GetLatestSequenceNumber(); + if (seqno < saved_seqno_) { + return Status::Corruption("DB is older than any restorable expected state"); + } + + std::string state_filename = ToString(saved_seqno_) + kStateFilenameSuffix; + std::string state_file_path = GetPathForFilename(state_filename); + + std::string latest_file_temp_path = + GetTempPathForFilename(kLatestBasename + kStateFilenameSuffix); + std::string latest_file_path = + GetPathForFilename(kLatestBasename + kStateFilenameSuffix); + + std::string trace_filename = ToString(saved_seqno_) + kTraceFilenameSuffix; + std::string trace_file_path = GetPathForFilename(trace_filename); + + std::unique_ptr trace_reader; + Status s = NewFileTraceReader(Env::Default(), EnvOptions(), trace_file_path, + &trace_reader); + + if (s.ok()) { + // We are going to replay on top of "`seqno`.state" to create a new + // "LATEST.state". Start off by creating a tempfile so we can later make the + // new "LATEST.state" appear atomically using `RenameFile()`. + s = CopyFile(FileSystem::Default(), state_file_path, latest_file_temp_path, + 0 /* size */, false /* use_fsync */); + } + + { + std::unique_ptr replayer; + std::unique_ptr state; + std::unique_ptr handler; + if (s.ok()) { + state.reset(new FileExpectedState(latest_file_temp_path, max_key_, + num_column_families_)); + s = state->Open(false /* create */); + } + if (s.ok()) { + handler.reset(new ExpectedStateTraceRecordHandler(seqno - saved_seqno_, + state.get())); + // TODO(ajkr): An API limitation requires we provide `handles` although + // they will be unused since we only use the replayer for reading records. + // Just give a default CFH for now to satisfy the requirement. 
+ s = db->NewDefaultReplayer({db->DefaultColumnFamily()} /* handles */, + std::move(trace_reader), &replayer); + } + + if (s.ok()) { + s = replayer->Prepare(); + } + for (;;) { + std::unique_ptr record; + s = replayer->Next(&record); + if (!s.ok()) { + break; + } + std::unique_ptr res; + record->Accept(handler.get(), &res); + } + if (s.IsCorruption() && handler->IsDone()) { + // There could be a corruption reading the tail record of the trace due to + // `db_stress` crashing while writing it. It shouldn't matter as long as + // we already found all the write ops we need to catch up the expected + // state. + s = Status::OK(); + } + if (s.IsIncomplete()) { + // OK because `Status::Incomplete` is expected upon finishing all the + // trace records. + s = Status::OK(); + } + } + + if (s.ok()) { + s = FileSystem::Default()->RenameFile(latest_file_temp_path, + latest_file_path, IOOptions(), + nullptr /* dbg */); + } + if (s.ok()) { + latest_.reset(new FileExpectedState(latest_file_path, max_key_, + num_column_families_)); + s = latest_->Open(false /* create */); + } + + // Delete old state/trace files. We must delete the state file first. + // Otherwise, a crash-recovery immediately after deleting the trace file could + // lead to `Restore()` unable to replay to `seqno`. + if (s.ok()) { + s = Env::Default()->DeleteFile(state_file_path); + } + if (s.ok()) { + saved_seqno_ = kMaxSequenceNumber; + s = Env::Default()->DeleteFile(trace_file_path); + } + return s; +} +#else // ROCKSDB_LITE +Status FileExpectedStateManager::Restore(DB* /* db */) { + return Status::NotSupported(); +} +#endif // ROCKSDB_LITE + +Status FileExpectedStateManager::Clean() { + std::vector expected_state_dir_children; + Status s = Env::Default()->GetChildren(expected_state_dir_path_, + &expected_state_dir_children); + // An incomplete `Open()` or incomplete `SaveAtAndAfter()` could have left + // behind invalid temporary files. 
An incomplete `SaveAtAndAfter()` could have + // also left behind stale state/trace files. An incomplete `Restore()` could + // have left behind stale trace files. + for (size_t i = 0; s.ok() && i < expected_state_dir_children.size(); ++i) { + const auto& filename = expected_state_dir_children[i]; + if (filename.rfind(kTempFilenamePrefix, 0 /* pos */) == 0 && + filename.size() >= kTempFilenameSuffix.size() && + filename.rfind(kTempFilenameSuffix) == + filename.size() - kTempFilenameSuffix.size()) { + // Delete all temp files. + s = Env::Default()->DeleteFile(GetPathForFilename(filename)); + } else if (filename.size() >= kStateFilenameSuffix.size() && + filename.rfind(kStateFilenameSuffix) == + filename.size() - kStateFilenameSuffix.size() && + filename.rfind(kLatestBasename, 0) == std::string::npos && + ParseUint64(filename.substr( + 0, filename.size() - kStateFilenameSuffix.size())) < + saved_seqno_) { + assert(saved_seqno_ != kMaxSequenceNumber); + // Delete stale state files. + s = Env::Default()->DeleteFile(GetPathForFilename(filename)); + } else if (filename.size() >= kTraceFilenameSuffix.size() && + filename.rfind(kTraceFilenameSuffix) == + filename.size() - kTraceFilenameSuffix.size() && + ParseUint64(filename.substr( + 0, filename.size() - kTraceFilenameSuffix.size())) < + saved_seqno_) { + // Delete stale trace files. + s = Env::Default()->DeleteFile(GetPathForFilename(filename)); + } + } + return s; +} + +std::string FileExpectedStateManager::GetTempPathForFilename( + const std::string& filename) { + assert(!expected_state_dir_path_.empty()); + std::string expected_state_dir_path_slash = + expected_state_dir_path_.back() == '/' ? 
expected_state_dir_path_ + : expected_state_dir_path_ + "/"; + return expected_state_dir_path_slash + kTempFilenamePrefix + filename + + kTempFilenameSuffix; +} + +std::string FileExpectedStateManager::GetPathForFilename( + const std::string& filename) { + assert(!expected_state_dir_path_.empty()); + std::string expected_state_dir_path_slash = + expected_state_dir_path_.back() == '/' ? expected_state_dir_path_ + : expected_state_dir_path_ + "/"; + return expected_state_dir_path_slash + filename; +} + +AnonExpectedStateManager::AnonExpectedStateManager(size_t max_key, + size_t num_column_families) + : ExpectedStateManager(max_key, num_column_families) {} + +Status AnonExpectedStateManager::Open() { + latest_.reset(new AnonExpectedState(max_key_, num_column_families_)); + return latest_->Open(true /* create */); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/expected_state.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,287 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#pragma once + +#include + +#include +#include + +#include "db/dbformat.h" +#include "file/file_util.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +// An `ExpectedState` provides read/write access to expected values for every +// key. 
+class ExpectedState { + public: + explicit ExpectedState(size_t max_key, size_t num_column_families); + + virtual ~ExpectedState() {} + + // Requires external locking preventing concurrent execution with any other + // member function. + virtual Status Open(bool create) = 0; + + // Requires external locking covering all keys in `cf`. + void ClearColumnFamily(int cf); + + // @param pending True if the update may have started but is not yet + // guaranteed finished. This is useful for crash-recovery testing when the + // process may crash before updating the expected values array. + // + // Requires external locking covering `key` in `cf`. + void Put(int cf, int64_t key, uint32_t value_base, bool pending); + + // Requires external locking covering `key` in `cf`. + uint32_t Get(int cf, int64_t key) const; + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. + bool Delete(int cf, int64_t key, bool pending); + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. + bool SingleDelete(int cf, int64_t key, bool pending); + + // @param pending See comment above Put() + // Returns number of keys deleted by the call. + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. + int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending); + + // Requires external locking covering `key` in `cf`. + bool Exists(int cf, int64_t key); + + private: + // Requires external locking covering `key` in `cf`. 
+ std::atomic& Value(int cf, int64_t key) const { + return values_[cf * max_key_ + key]; + } + + const size_t max_key_; + const size_t num_column_families_; + + protected: + size_t GetValuesLen() const { + return sizeof(std::atomic) * num_column_families_ * max_key_; + } + + // Requires external locking preventing concurrent execution with any other + // member function. + void Reset(); + + std::atomic* values_; +}; + +// A `FileExpectedState` implements `ExpectedState` backed by a file. +class FileExpectedState : public ExpectedState { + public: + explicit FileExpectedState(std::string expected_state_file_path, + size_t max_key, size_t num_column_families); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open(bool create) override; + + private: + const std::string expected_state_file_path_; + std::unique_ptr expected_state_mmap_buffer_; +}; + +// An `AnonExpectedState` implements `ExpectedState` backed by a memory +// allocation. +class AnonExpectedState : public ExpectedState { + public: + explicit AnonExpectedState(size_t max_key, size_t num_column_families); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open(bool create) override; + + private: + std::unique_ptr[]> values_allocation_; +}; + +// An `ExpectedStateManager` manages data about the expected state of the +// database. It exposes operations for reading and modifying the latest +// expected state. +class ExpectedStateManager { + public: + explicit ExpectedStateManager(size_t max_key, size_t num_column_families); + + virtual ~ExpectedStateManager(); + + // Requires external locking preventing concurrent execution with any other + // member function. + virtual Status Open() = 0; + + // Saves expected values for the current state of `db` and begins tracking + // changes. 
Following a successful `SaveAtAndAfter()`, `Restore()` can be + // called on the same DB, as long as its state does not roll back to before + // its current state. + // + // Requires external locking preventing concurrent execution with any other + // member function. Furthermore, `db` must not be mutated while this function + // is executing. + virtual Status SaveAtAndAfter(DB* db) = 0; + + // Returns true if at least one state of historical expected values can be + // restored. + // + // Requires external locking preventing concurrent execution with any other + // member function. + virtual bool HasHistory() = 0; + + // Restores expected values according to the current state of `db`. See + // `SaveAtAndAfter()` for conditions where this can be called. + // + // Requires external locking preventing concurrent execution with any other + // member function. Furthermore, `db` must not be mutated while this function + // is executing. + virtual Status Restore(DB* db) = 0; + + // Requires external locking covering all keys in `cf`. + void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); } + + // @param pending True if the update may have started but is not yet + // guaranteed finished. This is useful for crash-recovery testing when the + // process may crash before updating the expected values array. + // + // Requires external locking covering `key` in `cf`. + void Put(int cf, int64_t key, uint32_t value_base, bool pending) { + return latest_->Put(cf, key, value_base, pending); + } + + // Requires external locking covering `key` in `cf`. + uint32_t Get(int cf, int64_t key) const { return latest_->Get(cf, key); } + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. 
+ bool Delete(int cf, int64_t key, bool pending) { + return latest_->Delete(cf, key, pending); + } + + // @param pending See comment above Put() + // Returns true if the key was not yet deleted. + // + // Requires external locking covering `key` in `cf`. + bool SingleDelete(int cf, int64_t key, bool pending) { + return latest_->SingleDelete(cf, key, pending); + } + + // @param pending See comment above Put() + // Returns number of keys deleted by the call. + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. + int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) { + return latest_->DeleteRange(cf, begin_key, end_key, pending); + } + + // Requires external locking covering `key` in `cf`. + bool Exists(int cf, int64_t key) { return latest_->Exists(cf, key); } + + protected: + const size_t max_key_; + const size_t num_column_families_; + std::unique_ptr latest_; +}; + +// A `FileExpectedStateManager` implements an `ExpectedStateManager` backed by +// a directory of files containing data about the expected state of the +// database. +class FileExpectedStateManager : public ExpectedStateManager { + public: + explicit FileExpectedStateManager(size_t max_key, size_t num_column_families, + std::string expected_state_dir_path); + + // Requires external locking preventing concurrent execution with any other + // member function. + Status Open() override; + + // See `ExpectedStateManager::SaveAtAndAfter()` API doc. + // + // This implementation makes a copy of "LATEST.state" into + // ".state", and starts a trace in ".trace". + // Due to using external files, a following `Restore()` can happen even + // from a different process. + Status SaveAtAndAfter(DB* db) override; + + // See `ExpectedStateManager::HasHistory()` API doc. + bool HasHistory() override; + + // See `ExpectedStateManager::Restore()` API doc. + // + // Say `db->GetLatestSequenceNumber()` was `a` last time `SaveAtAndAfter()` + // was called and now it is `b`. 
Then this function replays `b - a` write + // operations from "`a`.trace" onto "`a`.state", and then copies the resulting + // file into "LATEST.state". + Status Restore(DB* db) override; + + private: + // Requires external locking preventing concurrent execution with any other + // member function. + Status Clean(); + + std::string GetTempPathForFilename(const std::string& filename); + std::string GetPathForFilename(const std::string& filename); + + static const std::string kLatestBasename; + static const std::string kStateFilenameSuffix; + static const std::string kTraceFilenameSuffix; + static const std::string kTempFilenamePrefix; + static const std::string kTempFilenameSuffix; + + const std::string expected_state_dir_path_; + SequenceNumber saved_seqno_ = kMaxSequenceNumber; +}; + +// An `AnonExpectedStateManager` implements an `ExpectedStateManager` backed by +// a memory allocation containing data about the expected state of the database. +class AnonExpectedStateManager : public ExpectedStateManager { + public: + explicit AnonExpectedStateManager(size_t max_key, size_t num_column_families); + + // See `ExpectedStateManager::SaveAtAndAfter()` API doc. + // + // This implementation returns `Status::NotSupported` since we do not + // currently have a need to keep history of expected state within a process. + Status SaveAtAndAfter(DB* /* db */) override { + return Status::NotSupported(); + } + + // See `ExpectedStateManager::HasHistory()` API doc. + bool HasHistory() override { return false; } + + // See `ExpectedStateManager::Restore()` API doc. + // + // This implementation returns `Status::NotSupported` since we do not + // currently have a need to keep history of expected state within a process. + Status Restore(DB* /* db */) override { return Status::NotSupported(); } + + // Requires external locking preventing concurrent execution with any other + // member function. 
+ Status Open() override; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1037 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef GFLAGS +#include "db_stress_tool/multi_ops_txns_stress.h" + +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/defer.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif // NDEBUG + +namespace ROCKSDB_NAMESPACE { + +// TODO: move these to gflags. 
+static constexpr uint32_t kInitNumC = 1000; +#ifndef ROCKSDB_LITE +static constexpr uint32_t kInitialCARatio = 3; +#endif // ROCKSDB_LITE +static constexpr bool kDoPreload = true; + +std::string MultiOpsTxnsStressTest::Record::EncodePrimaryKey(uint32_t a) { + char buf[8]; + EncodeFixed32(buf, kPrimaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, a); + std::reverse(buf + 4, buf + 8); + return std::string(buf, sizeof(buf)); +} + +std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c) { + char buf[8]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c); + std::reverse(buf + 4, buf + 8); + return std::string(buf, sizeof(buf)); +} + +std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey(uint32_t c, + uint32_t a) { + char buf[12]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c); + EncodeFixed32(buf + 8, a); + std::reverse(buf + 4, buf + 8); + std::reverse(buf + 8, buf + 12); + return std::string(buf, sizeof(buf)); +} + +std::tuple +MultiOpsTxnsStressTest::Record::DecodePrimaryIndexValue( + Slice primary_index_value) { + if (primary_index_value.size() != 8) { + return std::tuple{Status::Corruption(""), 0, 0}; + } + uint32_t b = 0; + uint32_t c = 0; + if (!GetFixed32(&primary_index_value, &b) || + !GetFixed32(&primary_index_value, &c)) { + assert(false); + return std::tuple{Status::Corruption(""), 0, 0}; + } + return std::tuple{Status::OK(), b, c}; +} + +std::pair +MultiOpsTxnsStressTest::Record::DecodeSecondaryIndexValue( + Slice secondary_index_value) { + if (secondary_index_value.size() != 4) { + return std::make_pair(Status::Corruption(""), 0); + } + uint32_t crc = 0; + bool result __attribute__((unused)) = + GetFixed32(&secondary_index_value, &crc); + assert(result); + return std::make_pair(Status::OK(), crc); +} + +std::pair +MultiOpsTxnsStressTest::Record::EncodePrimaryIndexEntry() const { + std::string primary_index_key 
= EncodePrimaryKey(); + std::string primary_index_value = EncodePrimaryIndexValue(); + return std::make_pair(primary_index_key, primary_index_value); +} + +std::string MultiOpsTxnsStressTest::Record::EncodePrimaryKey() const { + return EncodePrimaryKey(a_); +} + +std::string MultiOpsTxnsStressTest::Record::EncodePrimaryIndexValue() const { + char buf[8]; + EncodeFixed32(buf, b_); + EncodeFixed32(buf + 4, c_); + return std::string(buf, sizeof(buf)); +} + +std::pair +MultiOpsTxnsStressTest::Record::EncodeSecondaryIndexEntry() const { + std::string secondary_index_key; + char buf[12]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c_); + EncodeFixed32(buf + 8, a_); + std::reverse(buf + 4, buf + 8); + std::reverse(buf + 8, buf + 12); + secondary_index_key.assign(buf, sizeof(buf)); + + // Secondary index value is always 4-byte crc32 of the secondary key + std::string secondary_index_value; + uint32_t crc = crc32c::Value(buf, sizeof(buf)); + PutFixed32(&secondary_index_value, crc); + return std::make_pair(secondary_index_key, secondary_index_value); +} + +std::string MultiOpsTxnsStressTest::Record::EncodeSecondaryKey() const { + char buf[12]; + EncodeFixed32(buf, kSecondaryIndexId); + std::reverse(buf, buf + 4); + EncodeFixed32(buf + 4, c_); + EncodeFixed32(buf + 8, a_); + std::reverse(buf + 4, buf + 8); + std::reverse(buf + 8, buf + 12); + return std::string(buf, sizeof(buf)); +} + +Status MultiOpsTxnsStressTest::Record::DecodePrimaryIndexEntry( + Slice primary_index_key, Slice primary_index_value) { + if (primary_index_key.size() != 8) { + assert(false); + return Status::Corruption("Primary index key length is not 8"); + } + + const char* const index_id_buf = primary_index_key.data(); + uint32_t index_id = + static_cast(static_cast(index_id_buf[0])) << 24; + index_id += static_cast(static_cast(index_id_buf[1])) + << 16; + index_id += static_cast(static_cast(index_id_buf[2])) + << 8; + index_id += + 
static_cast(static_cast(index_id_buf[3])); + primary_index_key.remove_prefix(sizeof(uint32_t)); + if (index_id != kPrimaryIndexId) { + std::ostringstream oss; + oss << "Unexpected primary index id: " << index_id; + return Status::Corruption(oss.str()); + } + + const char* const buf = primary_index_key.data(); + a_ = static_cast(static_cast(buf[0])) << 24; + a_ += static_cast(static_cast(buf[1])) << 16; + a_ += static_cast(static_cast(buf[2])) << 8; + a_ += static_cast(static_cast(buf[3])); + + if (primary_index_value.size() != 8) { + return Status::Corruption("Primary index value length is not 8"); + } + GetFixed32(&primary_index_value, &b_); + GetFixed32(&primary_index_value, &c_); + return Status::OK(); +} + +Status MultiOpsTxnsStressTest::Record::DecodeSecondaryIndexEntry( + Slice secondary_index_key, Slice secondary_index_value) { + if (secondary_index_key.size() != 12) { + return Status::Corruption("Secondary index key length is not 12"); + } + uint32_t crc = + crc32c::Value(secondary_index_key.data(), secondary_index_key.size()); + + const char* const index_id_buf = secondary_index_key.data(); + uint32_t index_id = + static_cast(static_cast(index_id_buf[0])) << 24; + index_id += static_cast(static_cast(index_id_buf[1])) + << 16; + index_id += static_cast(static_cast(index_id_buf[2])) + << 8; + index_id += + static_cast(static_cast(index_id_buf[3])); + secondary_index_key.remove_prefix(sizeof(uint32_t)); + if (index_id != kSecondaryIndexId) { + std::ostringstream oss; + oss << "Unexpected secondary index id: " << index_id; + return Status::Corruption(oss.str()); + } + + const char* const buf = secondary_index_key.data(); + assert(secondary_index_key.size() == 8); + c_ = static_cast(static_cast(buf[0])) << 24; + c_ += static_cast(static_cast(buf[1])) << 16; + c_ += static_cast(static_cast(buf[2])) << 8; + c_ += static_cast(static_cast(buf[3])); + + a_ = static_cast(static_cast(buf[4])) << 24; + a_ += static_cast(static_cast(buf[5])) << 16; + a_ += 
static_cast(static_cast(buf[6])) << 8; + a_ += static_cast(static_cast(buf[7])); + + if (secondary_index_value.size() != 4) { + return Status::Corruption("Secondary index value length is not 4"); + } + uint32_t val = 0; + GetFixed32(&secondary_index_value, &val); + if (val != crc) { + std::ostringstream oss; + oss << "Secondary index key checksum mismatch, stored: " << val + << ", recomputed: " << crc; + return Status::Corruption(oss.str()); + } + return Status::OK(); +} + +void MultiOpsTxnsStressTest::FinishInitDb(SharedState* shared) { + if (FLAGS_enable_compaction_filter) { + // TODO (yanqin) enable compaction filter + } + if (kDoPreload) { + ReopenAndPreloadDb(shared); + } +} + +void MultiOpsTxnsStressTest::ReopenAndPreloadDb(SharedState* shared) { + (void)shared; +#ifndef ROCKSDB_LITE + std::vector cf_descs; + for (const auto* handle : column_families_) { + cf_descs.emplace_back(handle->GetName(), ColumnFamilyOptions(options_)); + } + CancelAllBackgroundWork(db_, /*wait=*/true); + for (auto* handle : column_families_) { + delete handle; + } + column_families_.clear(); + delete db_; + db_ = nullptr; + txn_db_ = nullptr; + + TransactionDBOptions txn_db_opts; + txn_db_opts.skip_concurrency_control = true; // speed-up preloading + Status s = TransactionDB::Open(options_, txn_db_opts, FLAGS_db, cf_descs, + &column_families_, &txn_db_); + if (s.ok()) { + db_ = txn_db_; + } else { + fprintf(stderr, "Failed to open db: %s\n", s.ToString().c_str()); + exit(1); + } + + PreloadDb(shared, kInitNumC); + + // Reopen + CancelAllBackgroundWork(db_, /*wait=*/true); + for (auto* handle : column_families_) { + delete handle; + } + column_families_.clear(); + s = db_->Close(); + if (!s.ok()) { + fprintf(stderr, "Error during closing db: %s\n", s.ToString().c_str()); + exit(1); + } + delete db_; + db_ = nullptr; + txn_db_ = nullptr; + + Open(); +#endif // !ROCKSDB_LITE +} + +// Used for point-lookup transaction +Status MultiOpsTxnsStressTest::TestGet( + ThreadState* thread, const 
ReadOptions& read_opts, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/) { + uint32_t a = ChooseA(thread); + return PointLookupTxn(thread, read_opts, a); +} + +// Not used. +std::vector MultiOpsTxnsStressTest::TestMultiGet( + ThreadState* /*thread*/, const ReadOptions& /*read_opts*/, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/) { + return std::vector{Status::NotSupported()}; +} + +Status MultiOpsTxnsStressTest::TestPrefixScan( + ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) { + (void)thread; + (void)read_opts; + (void)rand_column_families; + (void)rand_keys; + return Status::OK(); +} + +// Given a key K, this creates an iterator which scans to K and then +// does a random sequence of Next/Prev operations. +Status MultiOpsTxnsStressTest::TestIterate( + ThreadState* thread, const ReadOptions& read_opts, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/) { + uint32_t c = thread->rand.Next() % kInitNumC; + return RangeScanTxn(thread, read_opts, c); +} + +// Not intended for use. +Status MultiOpsTxnsStressTest::TestPut(ThreadState* /*thread*/, + WriteOptions& /*write_opts*/, + const ReadOptions& /*read_opts*/, + const std::vector& /*cf_ids*/, + const std::vector& /*keys*/, + char (&value)[100], + std::unique_ptr& /*lock*/) { + (void)value; + return Status::NotSupported(); +} + +// Not intended for use. +Status MultiOpsTxnsStressTest::TestDelete( + ThreadState* /*thread*/, WriteOptions& /*write_opts*/, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/, + std::unique_ptr& /*lock*/) { + return Status::NotSupported(); +} + +// Not intended for use. 
+Status MultiOpsTxnsStressTest::TestDeleteRange( + ThreadState* /*thread*/, WriteOptions& /*write_opts*/, + const std::vector& /*rand_column_families*/, + const std::vector& /*rand_keys*/, + std::unique_ptr& /*lock*/) { + return Status::NotSupported(); +} + +void MultiOpsTxnsStressTest::TestIngestExternalFile( + ThreadState* thread, const std::vector& rand_column_families, + const std::vector& /*rand_keys*/, + std::unique_ptr& /*lock*/) { + // TODO (yanqin) + (void)thread; + (void)rand_column_families; +} + +void MultiOpsTxnsStressTest::TestCompactRange( + ThreadState* thread, int64_t /*rand_key*/, const Slice& /*start_key*/, + ColumnFamilyHandle* column_family) { + // TODO (yanqin). + // May use GetRangeHash() for validation before and after DB::CompactRange() + // completes. + (void)thread; + (void)column_family; +} + +Status MultiOpsTxnsStressTest::TestBackupRestore( + ThreadState* thread, const std::vector& rand_column_families, + const std::vector& /*rand_keys*/) { + // TODO (yanqin) + (void)thread; + (void)rand_column_families; + return Status::OK(); +} + +Status MultiOpsTxnsStressTest::TestCheckpoint( + ThreadState* thread, const std::vector& rand_column_families, + const std::vector& /*rand_keys*/) { + // TODO (yanqin) + (void)thread; + (void)rand_column_families; + return Status::OK(); +} + +#ifndef ROCKSDB_LITE +Status MultiOpsTxnsStressTest::TestApproximateSize( + ThreadState* thread, uint64_t iteration, + const std::vector& rand_column_families, + const std::vector& /*rand_keys*/) { + // TODO (yanqin) + (void)thread; + (void)iteration; + (void)rand_column_families; + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +Status MultiOpsTxnsStressTest::TestCustomOperations( + ThreadState* thread, const std::vector& rand_column_families) { + (void)rand_column_families; + // Randomly choose from 0, 1, and 2. + // TODO (yanqin) allow user to configure probability of each operation. 
+ uint32_t rand = thread->rand.Uniform(3); + Status s; + if (0 == rand) { + // Update primary key. + uint32_t old_a = ChooseA(thread); + uint32_t new_a = GenerateNextA(); + s = PrimaryKeyUpdateTxn(thread, old_a, new_a); + } else if (1 == rand) { + // Update secondary key. + uint32_t old_c = thread->rand.Next() % kInitNumC; + int count = 0; + uint32_t new_c = 0; + do { + ++count; + new_c = thread->rand.Next() % kInitNumC; + } while (count < 100 && new_c == old_c); + if (count >= 100) { + // If we reach here, it means our random number generator has a serious + // problem, or kInitNumC is chosen poorly. + std::terminate(); + } + s = SecondaryKeyUpdateTxn(thread, old_c, new_c); + } else if (2 == rand) { + // Update primary index value. + uint32_t a = ChooseA(thread); + s = UpdatePrimaryIndexValueTxn(thread, a, /*b_delta=*/1); + } else { + // Should never reach here. + assert(false); + } + return s; +} + +Status MultiOpsTxnsStressTest::PrimaryKeyUpdateTxn(ThreadState* thread, + uint32_t old_a, + uint32_t new_a) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)old_a; + (void)new_a; + return Status::NotSupported(); +#else + std::string old_pk = Record::EncodePrimaryKey(old_a); + std::string new_pk = Record::EncodePrimaryKey(new_a); + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + txn->SetSnapshotOnNextOperation(/*notifier=*/nullptr); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + // Two gets, one for existing pk, one for locking potential new pk. + thread->stats.AddGets(/*ngets=*/2, /*nfounds=*/1); + thread->stats.AddDeletes(1); + thread->stats.AddBytesForWrites( + /*nwrites=*/2, + Record::kPrimaryIndexEntrySize + Record::kSecondaryIndexEntrySize); + thread->stats.AddSingleDeletes(1); + return; + } + if (s.IsNotFound()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0); + } else if (s.IsBusy()) { + // ignore. 
+ } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + + ReadOptions ropts; + std::string value; + s = txn->GetForUpdate(ropts, old_pk, &value); + if (!s.ok()) { + return s; + } + std::string empty_value; + s = txn->GetForUpdate(ropts, new_pk, &empty_value); + if (s.ok()) { + assert(!empty_value.empty()); + s = Status::Busy(); + return s; + } + + auto result = Record::DecodePrimaryIndexValue(value); + s = std::get<0>(result); + if (!s.ok()) { + return s; + } + uint32_t b = std::get<1>(result); + uint32_t c = std::get<2>(result); + + ColumnFamilyHandle* cf = db_->DefaultColumnFamily(); + s = txn->Delete(cf, old_pk, /*assume_tracked=*/true); + if (!s.ok()) { + return s; + } + s = txn->Put(cf, new_pk, value, /*assume_tracked=*/true); + if (!s.ok()) { + return s; + } + + auto* wb = txn->GetWriteBatch(); + assert(wb); + + std::string old_sk = Record::EncodeSecondaryKey(c, old_a); + s = wb->SingleDelete(old_sk); + if (!s.ok()) { + return s; + } + + Record record(new_a, b, c); + std::string new_sk; + std::string new_crc; + std::tie(new_sk, new_crc) = record.EncodeSecondaryIndexEntry(); + s = wb->Put(new_sk, new_crc); + if (!s.ok()) { + return s; + } + + s = CommitTxn(txn); + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::SecondaryKeyUpdateTxn(ThreadState* thread, + uint32_t old_c, + uint32_t new_c) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)old_c; + (void)new_c; + return Status::NotSupported(); +#else + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + Iterator* it = nullptr; + long iterations = 0; + const Defer cleanup([&s, thread, &it, txn, this, &iterations]() { + delete it; + if (s.ok()) { + thread->stats.AddIterations(iterations); + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); + thread->stats.AddSingleDeletes(1); + thread->stats.AddBytesForWrites( + 
/*nwrites=*/2, + Record::kPrimaryIndexEntrySize + Record::kSecondaryIndexEntrySize); + return; + } else if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() || + s.IsMergeInProgress()) { + // ww-conflict detected, or + // lock cannot be acquired, or + // memtable history is not large enough for conflict checking, or + // Merge operation cannot be resolved. + // TODO (yanqin) add stats for other cases? + } else if (s.IsNotFound()) { + // ignore. + } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + + // TODO (yanqin) try SetSnapshotOnNextOperation(). We currently need to take + // a snapshot here because we will later verify that point lookup in the + // primary index using GetForUpdate() returns the same value for 'c' as the + // iterator. The iterator does not need a snapshot though, because it will be + // assigned the current latest (published) sequence in the db, which will be + // no smaller than the snapshot created here. The GetForUpdate will perform + // ww conflict checking to ensure GetForUpdate() (using the snapshot) sees + // the same data as this iterator. + txn->SetSnapshot(); + std::string old_sk_prefix = Record::EncodeSecondaryKey(old_c); + std::string iter_ub_str = Record::EncodeSecondaryKey(old_c + 1); + Slice iter_ub = iter_ub_str; + ReadOptions ropts; + if (thread->rand.OneIn(2)) { + ropts.snapshot = txn->GetSnapshot(); + } + ropts.total_order_seek = true; + ropts.iterate_upper_bound = &iter_ub; + it = txn->GetIterator(ropts); + + assert(it); + it->Seek(old_sk_prefix); + if (!it->Valid()) { + s = Status::NotFound(); + return s; + } + auto* wb = txn->GetWriteBatch(); + assert(wb); + + do { + ++iterations; + Record record; + s = record.DecodeSecondaryIndexEntry(it->key(), it->value()); + if (!s.ok()) { + VerificationAbort(thread->shared, "Cannot decode secondary key", s); + break; + } + // At this point, record.b is not known yet, thus we need to access + // primary index. 
+ std::string pk = Record::EncodePrimaryKey(record.a_value()); + std::string value; + ReadOptions read_opts; + read_opts.snapshot = txn->GetSnapshot(); + s = txn->GetForUpdate(read_opts, pk, &value); + if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() || + s.IsMergeInProgress()) { + // Write conflict, or cannot acquire lock, or memtable size is not large + // enough, or merge cannot be resolved. + break; + } else if (!s.ok()) { + // We can also fail verification here. + VerificationAbort(thread->shared, "pk should exist, but does not", s); + break; + } + auto result = Record::DecodePrimaryIndexValue(value); + s = std::get<0>(result); + if (!s.ok()) { + VerificationAbort(thread->shared, "Cannot decode primary index value", s); + break; + } + uint32_t b = std::get<1>(result); + uint32_t c = std::get<2>(result); + if (c != old_c) { + std::ostringstream oss; + oss << "c in primary index does not match secondary index: " << c + << " != " << old_c; + s = Status::Corruption(); + VerificationAbort(thread->shared, oss.str(), s); + break; + } + Record new_rec(record.a_value(), b, new_c); + std::string new_primary_index_value = new_rec.EncodePrimaryIndexValue(); + ColumnFamilyHandle* cf = db_->DefaultColumnFamily(); + s = txn->Put(cf, pk, new_primary_index_value, /*assume_tracked=*/true); + if (!s.ok()) { + break; + } + std::string old_sk = it->key().ToString(/*hex=*/false); + std::string new_sk; + std::string new_crc; + std::tie(new_sk, new_crc) = new_rec.EncodeSecondaryIndexEntry(); + s = wb->SingleDelete(old_sk); + if (!s.ok()) { + break; + } + s = wb->Put(new_sk, new_crc); + if (!s.ok()) { + break; + } + + it->Next(); + } while (it->Valid()); + + if (!s.ok()) { + return s; + } + + s = CommitTxn(txn); + + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::UpdatePrimaryIndexValueTxn(ThreadState* thread, + uint32_t a, + uint32_t b_delta) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)a; + (void)b_delta; + return Status::NotSupported(); +#else + 
std::string pk_str = Record::EncodePrimaryKey(a); + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); + thread->stats.AddBytesForWrites( + /*nwrites=*/1, /*nbytes=*/Record::kPrimaryIndexEntrySize); + return; + } + if (s.IsNotFound()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0); + } else if (s.IsInvalidArgument()) { + // ignored. + } else if (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain() || + s.IsMergeInProgress()) { + // ignored. + } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + ReadOptions ropts; + std::string value; + s = txn->GetForUpdate(ropts, pk_str, &value); + if (!s.ok()) { + return s; + } + auto result = Record::DecodePrimaryIndexValue(value); + if (!std::get<0>(result).ok()) { + return s; + } + uint32_t b = std::get<1>(result) + b_delta; + uint32_t c = std::get<2>(result); + Record record(a, b, c); + std::string primary_index_value = record.EncodePrimaryIndexValue(); + ColumnFamilyHandle* cf = db_->DefaultColumnFamily(); + s = txn->Put(cf, pk_str, primary_index_value, /*assume_tracked=*/true); + if (!s.ok()) { + return s; + } + s = CommitTxn(txn); + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::PointLookupTxn(ThreadState* thread, + ReadOptions ropts, uint32_t a) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)ropts; + (void)a; + return Status::NotSupported(); +#else + std::string pk_str = Record::EncodePrimaryKey(a); + // pk may or may not exist + PinnableSlice value; + + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + 
thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); + return; + } else if (s.IsNotFound()) { + thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/0); + } else { + thread->stats.AddErrors(1); + } + RollbackTxn(txn).PermitUncheckedError(); + }); + + s = txn->Get(ropts, db_->DefaultColumnFamily(), pk_str, &value); + if (s.ok()) { + s = txn->Commit(); + } + return s; +#endif // !ROCKSDB_LITE +} + +Status MultiOpsTxnsStressTest::RangeScanTxn(ThreadState* thread, + ReadOptions ropts, uint32_t c) { +#ifdef ROCKSDB_LITE + (void)thread; + (void)ropts; + (void)c; + return Status::NotSupported(); +#else + std::string sk = Record::EncodeSecondaryKey(c); + + Transaction* txn = nullptr; + WriteOptions wopts; + Status s = NewTxn(wopts, &txn); + if (!s.ok()) { + assert(!txn); + thread->stats.AddErrors(1); + return s; + } + + assert(txn); + + const Defer cleanup([&s, thread, txn, this]() { + if (s.ok()) { + thread->stats.AddIterations(1); + return; + } + thread->stats.AddErrors(1); + RollbackTxn(txn).PermitUncheckedError(); + }); + std::unique_ptr iter(txn->GetIterator(ropts)); + iter->Seek(sk); + if (iter->status().ok()) { + s = txn->Commit(); + } else { + s = iter->status(); + } + // TODO (yanqin) more Seek/SeekForPrev/Next/Prev/SeekToFirst/SeekToLast + return s; +#endif // !ROCKSDB_LITE +} + +void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { + if (thread->shared->HasVerificationFailedYet()) { + return; + } + const Snapshot* const snapshot = db_->GetSnapshot(); + assert(snapshot); + ManagedSnapshot snapshot_guard(db_, snapshot); + + // TODO (yanqin) with a probability, we can use either forward or backward + // iterator in subsequent checks. We can also use more advanced features in + // range scan. For now, let's just use simple forward iteration with + // total_order_seek = true. + + // First, iterate primary index. 
+ size_t primary_index_entries_count = 0; + { + char buf[4]; + EncodeFixed32(buf, Record::kPrimaryIndexId + 1); + std::reverse(buf, buf + sizeof(buf)); + std::string iter_ub_str(buf, sizeof(buf)); + Slice iter_ub = iter_ub_str; + + ReadOptions ropts; + ropts.snapshot = snapshot; + ropts.total_order_seek = true; + ropts.iterate_upper_bound = &iter_ub; + + std::unique_ptr it(db_->NewIterator(ropts)); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + ++primary_index_entries_count; + } + } + + // Second, iterate secondary index. + size_t secondary_index_entries_count = 0; + { + char buf[4]; + EncodeFixed32(buf, Record::kSecondaryIndexId); + std::reverse(buf, buf + sizeof(buf)); + const std::string start_key(buf, sizeof(buf)); + + ReadOptions ropts; + ropts.snapshot = snapshot; + ropts.total_order_seek = true; + + std::unique_ptr it(db_->NewIterator(ropts)); + for (it->Seek(start_key); it->Valid(); it->Next()) { + ++secondary_index_entries_count; + Record record; + Status s = record.DecodeSecondaryIndexEntry(it->key(), it->value()); + if (!s.ok()) { + VerificationAbort(thread->shared, "Cannot decode secondary index entry", + s); + return; + } + // After decoding secondary index entry, we know a and c. Crc is verified + // in decoding phase. + // + // Form a primary key and search in the primary index. + std::string pk = Record::EncodePrimaryKey(record.a_value()); + std::string value; + s = db_->Get(ropts, pk, &value); + if (!s.ok()) { + std::ostringstream oss; + oss << "Error searching pk " << Slice(pk).ToString(true) << ". " + << s.ToString(); + VerificationAbort(thread->shared, oss.str(), s); + return; + } + auto result = Record::DecodePrimaryIndexValue(value); + s = std::get<0>(result); + if (!s.ok()) { + std::ostringstream oss; + oss << "Error decoding primary index value " + << Slice(value).ToString(true) << ". 
" << s.ToString(); + VerificationAbort(thread->shared, oss.str(), s); + } + uint32_t c_in_primary = std::get<2>(result); + if (c_in_primary != record.c_value()) { + std::ostringstream oss; + oss << "Pk/sk mismatch. pk: (c=" << c_in_primary + << "), sk: (c=" << record.c_value() << ")"; + VerificationAbort(thread->shared, oss.str(), s); + } + } + } + + if (secondary_index_entries_count != primary_index_entries_count) { + std::ostringstream oss; + oss << "Pk/sk mismatch: primary index has " << primary_index_entries_count + << " entries. Secondary index has " << secondary_index_entries_count + << " entries."; + VerificationAbort(thread->shared, oss.str(), Status::OK()); + } +} + +uint32_t MultiOpsTxnsStressTest::ChooseA(ThreadState* thread) { + uint32_t rnd = thread->rand.Uniform(5); + uint32_t next_a_low = next_a_.load(std::memory_order_relaxed); + assert(next_a_low != 0); + if (rnd == 0) { + return next_a_low - 1; + } + + uint32_t result = 0; + result = thread->rand.Next() % next_a_low; + if (thread->rand.OneIn(3)) { + return result; + } + uint32_t next_a_high = next_a_.load(std::memory_order_relaxed); + // A higher chance that this a still exists. + return next_a_low + (next_a_high - next_a_low) / 2; +} + +uint32_t MultiOpsTxnsStressTest::GenerateNextA() { + return next_a_.fetch_add(1, std::memory_order_relaxed); +} + +void MultiOpsTxnsStressTest::PreloadDb(SharedState* shared, size_t num_c) { +#ifdef ROCKSDB_LITE + (void)shared; + (void)num_c; +#else + // TODO (yanqin) maybe parallelize. Currently execute in single thread. 
+ WriteOptions wopts; + wopts.disableWAL = true; + wopts.sync = false; + Random rnd(shared->GetSeed()); + assert(txn_db_); + for (uint32_t c = 0; c < static_cast(num_c); ++c) { + for (uint32_t a = c * kInitialCARatio; a < ((c + 1) * kInitialCARatio); + ++a) { + Record record(a, /*_b=*/rnd.Next(), c); + WriteBatch wb; + const auto primary_index_entry = record.EncodePrimaryIndexEntry(); + Status s = wb.Put(primary_index_entry.first, primary_index_entry.second); + assert(s.ok()); + const auto secondary_index_entry = record.EncodeSecondaryIndexEntry(); + s = wb.Put(secondary_index_entry.first, secondary_index_entry.second); + assert(s.ok()); + s = txn_db_->Write(wopts, &wb); + assert(s.ok()); + + // TODO (yanqin): make the following check optional, especially when data + // size is large. + Record tmp_rec; + tmp_rec.SetB(record.b_value()); + s = tmp_rec.DecodeSecondaryIndexEntry(secondary_index_entry.first, + secondary_index_entry.second); + assert(s.ok()); + assert(tmp_rec == record); + } + } + Status s = db_->Flush(FlushOptions()); + assert(s.ok()); + next_a_.store(static_cast((num_c + 1) * kInitialCARatio)); + fprintf(stdout, "DB preloaded with %d entries\n", + static_cast(num_c * kInitialCARatio)); +#endif // !ROCKSDB_LITE +} + +StressTest* CreateMultiOpsTxnsStressTest() { + return new MultiOpsTxnsStressTest(); +} + +void CheckAndSetOptionsForMultiOpsTxnStressTest() { +#ifndef ROCKSDB_LITE + if (FLAGS_test_batches_snapshots || FLAGS_test_cf_consistency) { + fprintf(stderr, + "-test_multi_ops_txns is not compatible with " + "-test_bathces_snapshots and -test_cf_consistency\n"); + exit(1); + } + if (!FLAGS_use_txn) { + fprintf(stderr, "-use_txn must be true if -test_multi_ops_txns\n"); + exit(1); + } + if (FLAGS_clear_column_family_one_in > 0) { + fprintf(stderr, + "-test_multi_ops_txns is not compatible with clearing column " + "families\n"); + exit(1); + } + if (FLAGS_column_families > 1) { + // TODO (yanqin) support separating primary index and secondary index in 
+ // different column families. + fprintf(stderr, + "-test_multi_ops_txns currently does not use more than one column " + "family\n"); + exit(1); + } + if (FLAGS_writepercent > 0 || FLAGS_delpercent > 0 || + FLAGS_delrangepercent > 0) { + fprintf(stderr, + "-test_multi_ops_txns requires that -writepercent, -delpercent and " + "-delrangepercent be 0\n"); + exit(1); + } +#else + fprintf(stderr, "-test_multi_ops_txns not supported in ROCKSDB_LITE mode\n"); + exit(1); +#endif // !ROCKSDB_LITE +} +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,302 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef GFLAGS +#include "db_stress_tool/db_stress_common.h" + +namespace ROCKSDB_NAMESPACE { + +// This file defines MultiOpsTxnsStress so that we can stress test RocksDB +// transactions on a simple, emulated relational table. +// +// The record format is similar to the example found at +// https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format. 
+// +// The table is created by +// ``` +// create table t1 ( +// a int primary key, +// b int, +// c int, +// key(c), +// ) +// ``` +// +// (For simplicity, we use uint32_t for int here.) +// +// For this table, there is a primary index using `a`, as well as a secondary +// index using `c` and `a`. +// +// Primary key format: +// | index id | M(a) | +// Primary index value: +// | b | c | +// M(a) represents the big-endian format of a. +// +// Secondary key format: +// | index id | M(c) | M(a) | +// Secondary index value: +// | crc32 | +// Similarly to M(a), M(c) is the big-endian format of c. +// +// The in-memory representation of a record is defined in class +// MultiOpsTxnsStress:Record that includes a number of helper methods to +// encode/decode primary index keys, primary index values, secondary index keys, +// secondary index values, etc. +// +// Sometimes primary index and secondary index reside on different column +// families, but sometimes they colocate in the same column family. Current +// implementation puts them in the same (default) column family, and this is +// subject to future change if we find it interesting to test the other case. +// +// Class MultiOpsTxnsStressTest has the following transactions for testing. +// +// 1. Primary key update +// UPDATE t1 SET a = 3 WHERE a = 2; +// ``` +// tx->GetForUpdate(primary key a=2) +// tx->GetForUpdate(primary key a=3) +// tx->Delete(primary key a=2) +// tx->Put(primary key a=3, value) +// tx->batch->SingleDelete(secondary key a=2) +// tx->batch->Put(secondary key a=3, value) +// tx->Prepare() +// Tx->Commit() +// ``` +// +// 2. Secondary key update +// UPDATE t1 SET c = 3 WHERE c = 2; +// ``` +// iter->Seek(secondary key) +// // Get corresponding primary key value(s) from iterator +// tx->GetForUpdate(primary key) +// tx->Put(primary key, value c=3) +// tx->batch->SingleDelete(secondary key c=2) +// tx->batch->Put(secondary key c=3) +// tx->Prepare() +// tx->Commit() +// ``` +// +// 3. 
Primary index value update +// UPDATE t1 SET b = b + 1 WHERE a = 2; +// ``` +// tx->GetForUpdate(primary key a=2) +// tx->Put(primary key a=2, value b=b+1) +// tx->Prepare() +// tx->Commit() +// ``` +// +// 4. Point lookup +// SELECT * FROM t1 WHERE a = 3; +// ``` +// tx->Get(primary key a=3) +// tx->Commit() +// ``` +// +// 5. Range scan +// SELECT * FROM t1 WHERE c = 2; +// ``` +// it = tx->GetIterator() +// it->Seek(secondary key c=2) +// tx->Commit() +// ``` + +class MultiOpsTxnsStressTest : public StressTest { + public: + class Record { + public: + static constexpr uint32_t kPrimaryIndexId = 1; + static constexpr uint32_t kSecondaryIndexId = 2; + + static constexpr size_t kPrimaryIndexEntrySize = 8 + 8; + static constexpr size_t kSecondaryIndexEntrySize = 12 + 4; + + static_assert(kPrimaryIndexId < kSecondaryIndexId, + "kPrimaryIndexId must be smaller than kSecondaryIndexId"); + + static_assert(sizeof(kPrimaryIndexId) == sizeof(uint32_t), + "kPrimaryIndexId must be 4 bytes"); + static_assert(sizeof(kSecondaryIndexId) == sizeof(uint32_t), + "kSecondaryIndexId must be 4 bytes"); + + // Used for generating search key to probe primary index. + static std::string EncodePrimaryKey(uint32_t a); + // Used for generating search prefix to probe secondary index. + static std::string EncodeSecondaryKey(uint32_t c); + // Used for generating search key to probe secondary index. 
+ static std::string EncodeSecondaryKey(uint32_t c, uint32_t a); + + static std::tuple DecodePrimaryIndexValue( + Slice primary_index_value); + + static std::pair DecodeSecondaryIndexValue( + Slice secondary_index_value); + + Record() = default; + Record(uint32_t _a, uint32_t _b, uint32_t _c) : a_(_a), b_(_b), c_(_c) {} + + bool operator==(const Record& other) const { + return a_ == other.a_ && b_ == other.b_ && c_ == other.c_; + } + + bool operator!=(const Record& other) const { return !(*this == other); } + + std::pair EncodePrimaryIndexEntry() const; + + std::string EncodePrimaryKey() const; + + std::string EncodePrimaryIndexValue() const; + + std::pair EncodeSecondaryIndexEntry() const; + + std::string EncodeSecondaryKey() const; + + Status DecodePrimaryIndexEntry(Slice primary_index_key, + Slice primary_index_value); + + Status DecodeSecondaryIndexEntry(Slice secondary_index_key, + Slice secondary_index_value); + + uint32_t a_value() const { return a_; } + uint32_t b_value() const { return b_; } + uint32_t c_value() const { return c_; } + + void SetA(uint32_t _a) { a_ = _a; } + void SetB(uint32_t _b) { b_ = _b; } + void SetC(uint32_t _c) { c_ = _c; } + + std::string ToString() const { + std::string ret("("); + ret.append(std::to_string(a_)); + ret.append(","); + ret.append(std::to_string(b_)); + ret.append(","); + ret.append(std::to_string(c_)); + ret.append(")"); + return ret; + } + + private: + friend class InvariantChecker; + + uint32_t a_{0}; + uint32_t b_{0}; + uint32_t c_{0}; + }; + + MultiOpsTxnsStressTest() {} + + ~MultiOpsTxnsStressTest() override {} + + void FinishInitDb(SharedState*) override; + + void ReopenAndPreloadDb(SharedState* shared); + + bool IsStateTracked() const override { return false; } + + Status TestGet(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + std::vector TestMultiGet( + ThreadState* thread, const ReadOptions& read_opts, + const 
std::vector& rand_column_families, + const std::vector& rand_keys) override; + + Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + // Given a key K, this creates an iterator which scans to K and then + // does a random sequence of Next/Prev operations. + Status TestIterate(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + Status TestPut(ThreadState* thread, WriteOptions& write_opts, + const ReadOptions& read_opts, const std::vector& cf_ids, + const std::vector& keys, char (&value)[100], + std::unique_ptr& lock) override; + + Status TestDelete(ThreadState* thread, WriteOptions& write_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& lock) override; + + Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& lock) override; + + void TestIngestExternalFile(ThreadState* thread, + const std::vector& rand_column_families, + const std::vector& rand_keys, + std::unique_ptr& lock) override; + + void TestCompactRange(ThreadState* thread, int64_t rand_key, + const Slice& start_key, + ColumnFamilyHandle* column_family) override; + + Status TestBackupRestore(ThreadState* thread, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + Status TestCheckpoint(ThreadState* thread, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + +#ifndef ROCKSDB_LITE + Status TestApproximateSize(ThreadState* thread, uint64_t iteration, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; +#endif // !ROCKSDB_LITE + + Status TestCustomOperations( + ThreadState* thread, + const std::vector& rand_column_families) override; + + Status 
PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a, + uint32_t new_a); + + Status SecondaryKeyUpdateTxn(ThreadState* thread, uint32_t old_c, + uint32_t new_c); + + Status UpdatePrimaryIndexValueTxn(ThreadState* thread, uint32_t a, + uint32_t b_delta); + + Status PointLookupTxn(ThreadState* thread, ReadOptions ropts, uint32_t a); + + Status RangeScanTxn(ThreadState* thread, ReadOptions ropts, uint32_t c); + + void VerifyDb(ThreadState* thread) const override; + + protected: + uint32_t ChooseA(ThreadState* thread); + + uint32_t GenerateNextA(); + + private: + void PreloadDb(SharedState* shared, size_t num_c); + + // TODO (yanqin) encapsulate the selection of keys a separate class. + std::atomic next_a_{0}; +}; + +class InvariantChecker { + public: + static_assert(sizeof(MultiOpsTxnsStressTest::Record().a_) == sizeof(uint32_t), + "MultiOpsTxnsStressTest::Record::a_ must be 4 bytes"); + static_assert(sizeof(MultiOpsTxnsStressTest::Record().b_) == sizeof(uint32_t), + "MultiOpsTxnsStressTest::Record::b_ must be 4 bytes"); + static_assert(sizeof(MultiOpsTxnsStressTest::Record().c_) == sizeof(uint32_t), + "MultiOpsTxnsStressTest::Record::c_ must be 4 bytes"); +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // GFLAGS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,9 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" +#ifndef NDEBUG +#include "utilities/fault_injection_fs.h" +#endif // NDEBUG namespace ROCKSDB_NAMESPACE { class NonBatchedOpsStressTest : public StressTest { @@ -19,6 +22,13 @@ void VerifyDb(ThreadState* thread) const override { ReadOptions options(FLAGS_verify_checksum, 
true); + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + options.timestamp = &ts; + } auto shared = thread->shared; const int64_t max_key = shared->GetMaxKey(); const int64_t keys_per_thread = max_key / shared->GetNumThreads(); @@ -33,8 +43,8 @@ if (thread->shared->HasVerificationFailedYet()) { break; } - if (!thread->rand.OneIn(2)) { - // Use iterator to verify this range + if (thread->rand.OneIn(3)) { + // 1/3 chance use iterator to verify this range Slice prefix; std::string seek_key = Key(start); std::unique_ptr iter( @@ -79,8 +89,8 @@ from_db.data(), from_db.length()); } } - } else { - // Use Get to verify this range + } else if (thread->rand.OneIn(2)) { + // 1/3 chance use Get to verify this range for (auto i = start; i < end; i++) { if (thread->shared->HasVerificationFailedYet()) { break; @@ -96,6 +106,38 @@ from_db.data(), from_db.length()); } } + } else { + // 1/3 chance use MultiGet to verify this range + for (auto i = start; i < end;) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + // Keep the batch size to some reasonable value + size_t batch_size = thread->rand.Uniform(128) + 1; + batch_size = std::min(batch_size, end - i); + std::vector keystrs(batch_size); + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + for (size_t j = 0; j < batch_size; ++j) { + keystrs[j] = Key(i + j); + keys[j] = Slice(keystrs[j].data(), keystrs[j].length()); + } + db_->MultiGet(options, column_families_[cf], batch_size, keys.data(), + values.data(), statuses.data()); + for (size_t j = 0; j < batch_size; ++j) { + Status s = statuses[j]; + std::string from_db = values[j].ToString(); + VerifyValue(static_cast(cf), i + j, options, shared, from_db, + s, true); + if (from_db.length()) { + PrintKeyValue(static_cast(cf), static_cast(i + j), + from_db.data(), from_db.length()); + } + } + + i += batch_size; + } } } } @@ -137,6 +179,8 @@ 
bool ShouldAcquireMutexOnKey() const override { return true; } + bool IsStateTracked() const override { return true; } + Status TestGet(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, const std::vector& rand_keys) override { @@ -144,18 +188,52 @@ std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string from_db; + int error_count = 0; + +#ifndef NDEBUG + if (fault_fs_guard) { + fault_fs_guard->EnableErrorInjection(); + SharedState::ignore_read_error = false; + } +#endif // NDEBUG Status s = db_->Get(read_opts, cfh, key, &from_db); +#ifndef NDEBUG + if (fault_fs_guard) { + error_count = fault_fs_guard->GetAndResetErrorCount(); + } +#endif // NDEBUG if (s.ok()) { +#ifndef NDEBUG + if (fault_fs_guard) { + if (error_count && !SharedState::ignore_read_error) { + // Grab mutex so multiple thread don't try to print the + // stack trace at the same time + MutexLock l(thread->shared->GetMutex()); + fprintf(stderr, "Didn't get expected error from Get\n"); + fprintf(stderr, "Callstack that injected the fault\n"); + fault_fs_guard->PrintFaultBacktrace(); + std::terminate(); + } + } +#endif // NDEBUG // found case thread->stats.AddGets(1, 1); } else if (s.IsNotFound()) { // not found case thread->stats.AddGets(1, 0); } else { - // errors case - fprintf(stderr, "TestGet error: %s\n", s.ToString().c_str()); - thread->stats.AddErrors(1); + if (error_count == 0) { + // errors case + thread->stats.AddErrors(1); + } else { + thread->stats.AddVerifiedErrors(1); + } + } +#ifndef NDEBUG + if (fault_fs_guard) { + fault_fs_guard->DisableErrorInjection(); } +#endif // NDEBUG return s; } @@ -171,6 +249,15 @@ std::vector values(num_keys); std::vector statuses(num_keys); ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; + int error_count = 0; + // Do a consistency check between Get and MultiGet. 
Don't do it too + // often as it will slow db_stress down + bool do_consistency_check = thread->rand.OneIn(4); + + ReadOptions readoptionscopy = read_opts; + if (do_consistency_check) { + readoptionscopy.snapshot = db_->GetSnapshot(); + } // To appease clang analyzer const bool use_txn = FLAGS_use_txn; @@ -231,18 +318,98 @@ } if (!use_txn) { - db_->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), +#ifndef NDEBUG + if (fault_fs_guard) { + fault_fs_guard->EnableErrorInjection(); + SharedState::ignore_read_error = false; + } +#endif // NDEBUG + db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); +#ifndef NDEBUG + if (fault_fs_guard) { + error_count = fault_fs_guard->GetAndResetErrorCount(); + } +#endif // NDEBUG } else { #ifndef ROCKSDB_LITE - txn->MultiGet(read_opts, cfh, num_keys, keys.data(), values.data(), + txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); - RollbackTxn(txn); #endif } - for (const auto& s : statuses) { - if (s.ok()) { +#ifndef NDEBUG + if (fault_fs_guard && error_count && !SharedState::ignore_read_error) { + int stat_nok = 0; + for (const auto& s : statuses) { + if (!s.ok() && !s.IsNotFound()) { + stat_nok++; + } + } + + if (stat_nok < error_count) { + // Grab mutex so multiple thread don't try to print the + // stack trace at the same time + MutexLock l(thread->shared->GetMutex()); + fprintf(stderr, "Didn't get expected error from MultiGet. 
\n"); + fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n", num_keys, + error_count, stat_nok); + fprintf(stderr, "Callstack that injected the fault\n"); + fault_fs_guard->PrintFaultBacktrace(); + std::terminate(); + } + } + if (fault_fs_guard) { + fault_fs_guard->DisableErrorInjection(); + } +#endif // NDEBUG + + for (size_t i = 0; i < statuses.size(); ++i) { + Status s = statuses[i]; + bool is_consistent = true; + // Only do the consistency check if no error was injected and MultiGet + // didn't return an unexpected error + if (do_consistency_check && !error_count && (s.ok() || s.IsNotFound())) { + Status tmp_s; + std::string value; + + if (use_txn) { +#ifndef ROCKSDB_LITE + tmp_s = txn->Get(readoptionscopy, cfh, keys[i], &value); +#endif // ROCKSDB_LITE + } else { + tmp_s = db_->Get(readoptionscopy, cfh, keys[i], &value); + } + if (!tmp_s.ok() && !tmp_s.IsNotFound()) { + fprintf(stderr, "Get error: %s\n", s.ToString().c_str()); + is_consistent = false; + } else if (!s.ok() && tmp_s.ok()) { + fprintf(stderr, "MultiGet returned different results with key %s\n", + keys[i].ToString(true).c_str()); + fprintf(stderr, "Get returned ok, MultiGet returned not found\n"); + is_consistent = false; + } else if (s.ok() && tmp_s.IsNotFound()) { + fprintf(stderr, "MultiGet returned different results with key %s\n", + keys[i].ToString(true).c_str()); + fprintf(stderr, "MultiGet returned ok, Get returned not found\n"); + is_consistent = false; + } else if (s.ok() && value != values[i].ToString()) { + fprintf(stderr, "MultiGet returned different results with key %s\n", + keys[i].ToString(true).c_str()); + fprintf(stderr, "MultiGet returned value %s\n", + values[i].ToString(true).c_str()); + fprintf(stderr, "Get returned value %s\n", value.c_str()); + is_consistent = false; + } + } + + if (!is_consistent) { + fprintf(stderr, "TestMultiGet error: is_consistent is false\n"); + thread->stats.AddErrors(1); + // Fail fast to preserve the DB state + 
thread->shared->SetVerificationFailure(); + break; + } else if (s.ok()) { // found case thread->stats.AddGets(1, 1); } else if (s.IsNotFound()) { @@ -252,11 +419,24 @@ // With txn this is sometimes expected. thread->stats.AddGets(1, 1); } else { - // errors case - fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str()); - thread->stats.AddErrors(1); + if (error_count == 0) { + // errors case + fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + thread->stats.AddVerifiedErrors(1); + } } } + + if (readoptionscopy.snapshot) { + db_->ReleaseSnapshot(readoptionscopy.snapshot); + } + if (use_txn) { +#ifndef ROCKSDB_LITE + RollbackTxn(txn); +#endif + } return statuses; } @@ -308,6 +488,8 @@ int64_t max_key = shared->GetMaxKey(); int64_t rand_key = rand_keys[0]; int rand_column_family = rand_column_families[0]; + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && (FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) { lock.reset(); @@ -315,6 +497,11 @@ rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); @@ -369,8 +556,18 @@ } shared->Put(rand_column_family, rand_key, value_base, false /* pending */); if (!s.ok()) { - fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); + std::terminate(); + } + } else { + fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); + 
std::terminate(); + } } thread->stats.AddBytesForWrites(1, sz); PrintKeyValue(rand_column_family, static_cast(rand_key), value, @@ -390,6 +587,8 @@ // OPERATION delete // If the chosen key does not allow overwrite and it does not exist, // choose another key. + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && !shared->Exists(rand_column_family, rand_key)) { lock.reset(); @@ -397,6 +596,11 @@ rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); @@ -425,8 +629,19 @@ shared->Delete(rand_column_family, rand_key, false /* pending */); thread->stats.AddDeletes(1); if (!s.ok()) { - fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && + s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } + } else { + fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } } } else { shared->SingleDelete(rand_column_family, rand_key, true /* pending */); @@ -447,8 +662,19 @@ shared->SingleDelete(rand_column_family, rand_key, false /* pending */); thread->stats.AddSingleDeletes(1); if (!s.ok()) { - fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && + s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); + 
std::terminate(); + } + } else { + fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } } } return s; @@ -494,8 +720,18 @@ Slice end_key = end_keystr; Status s = db_->DeleteRange(write_opts, cfh, key, end_key); if (!s.ok()) { - fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); - std::terminate(); + if (FLAGS_injest_error_severity >= 2) { + if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { + is_db_stopped_ = true; + } else if (!is_db_stopped_ || + s.severity() < Status::Severity::kFatalError) { + fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); + std::terminate(); + } + } else { + fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); + std::terminate(); + } } int covered = shared->DeleteRange(rand_column_family, rand_key, rand_key + FLAGS_range_deletion_width, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/defs.bzl mariadb-10.11.13/storage/rocksdb/rocksdb/defs.bzl --- mariadb-10.11.11/storage/rocksdb/rocksdb/defs.bzl 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/defs.bzl 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,7 @@ # defs.bzl - Definitions for Facebook-specific buck build integration # in TARGETS +load("@fbcode_macros//build_defs:coverage.bzl", "coverage") load("@fbcode_macros//build_defs:cpp_binary.bzl", "cpp_binary") load("@fbcode_macros//build_defs:custom_unittest.bzl", "custom_unittest") @@ -35,8 +36,21 @@ external_deps = rocksdb_external_deps, ) + binary_path = "$(location :{})".format(test_bin) + + base_path = native.package_name() + tags = [] + if coverage.is_coverage_enabled(base_path): + # This tag instructs testpilot to use + # the lower-memory coverage runner + # (e.g. 
it tells testpilot that the binary + # is actually instrumented with coverage info) + tags = ["coverage"] + custom_unittest( name = test_name, - command = [TEST_RUNNER, "$(location :{})".format(test_bin)], + command = [TEST_RUNNER, binary_path], type = ttype, + env = {"BUCK_BASE_BINARY": binary_path}, + tags = tags, ) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile 2025-05-19 16:14:27.000000000 +0000 @@ -1,2 +1,4 @@ source 'https://rubygems.org' -gem 'github-pages', '~> 104' +gem 'github-pages', '~> 209' + +gem "webrick", "~> 1.7" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile.lock mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile.lock --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/Gemfile.lock 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/Gemfile.lock 2025-05-19 16:14:27.000000000 +0000 @@ -1,146 +1,267 @@ GEM remote: https://rubygems.org/ specs: - activesupport (4.2.7) - i18n (~> 0.7) - json (~> 1.7, >= 1.7.7) + activesupport (6.0.3.4) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) - addressable (2.4.0) + zeitwerk (~> 2.2, >= 2.2.2) + addressable (2.8.0) + public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) coffee-script-source execjs - coffee-script-source (1.12.2) + coffee-script-source (1.11.1) colorator (1.1.0) - concurrent-ruby (1.0.5) - ethon (0.11.0) + commonmarker (0.17.13) + ruby-enum (~> 0.5) + concurrent-ruby (1.1.7) + dnsruby (1.61.5) + simpleidn (~> 0.1) + em-websocket (0.5.2) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0.6.0) + ethon (0.12.0) ffi (>= 1.3.0) + eventmachine (1.2.7) execjs (2.7.0) - faraday (0.15.2) + faraday (1.3.0) + faraday-net_http (~> 1.0) multipart-post (>= 1.2, < 3) 
- ffi (1.9.25) + ruby2_keywords + faraday-net_http (1.0.0) + ffi (1.14.2) forwardable-extended (2.6.0) - gemoji (2.1.0) - github-pages (104) - activesupport (= 4.2.7) - github-pages-health-check (= 1.2.0) - jekyll (>= 3.8.4) - jekyll-avatar (= 0.4.2) - jekyll-coffeescript (= 1.0.1) - jekyll-feed (= 0.8.0) - jekyll-gist (= 1.4.0) - jekyll-github-metadata (= 2.2.0) - jekyll-mentions (= 1.2.0) + gemoji (3.0.1) + github-pages (209) + github-pages-health-check (= 1.16.1) + jekyll (= 3.9.0) + jekyll-avatar (= 0.7.0) + jekyll-coffeescript (= 1.1.1) + jekyll-commonmark-ghpages (= 0.1.6) + jekyll-default-layout (= 0.1.4) + jekyll-feed (= 0.15.1) + jekyll-gist (= 1.5.0) + jekyll-github-metadata (= 2.13.0) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) - jekyll-redirect-from (= 0.11.0) - jekyll-sass-converter (= 1.3.0) - jekyll-seo-tag (= 2.1.0) - jekyll-sitemap (= 0.12.0) - jekyll-swiss (= 0.4.0) - jemoji (= 0.7.0) - kramdown (= 1.11.1) - liquid (= 3.0.6) - listen (= 3.0.6) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.2) + jekyll-sass-converter (= 1.5.2) + jekyll-seo-tag (= 2.6.1) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.1.1) + jekyll-theme-cayman (= 0.1.1) + jekyll-theme-dinky (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) + jekyll-theme-leap-day (= 0.1.1) + jekyll-theme-merlot (= 0.1.1) + jekyll-theme-midnight (= 0.1.1) + jekyll-theme-minimal (= 0.1.1) + jekyll-theme-modernist (= 0.1.1) + jekyll-theme-primer (= 0.5.4) + jekyll-theme-slate (= 0.1.1) + jekyll-theme-tactile (= 0.1.1) + jekyll-theme-time-machine (= 0.1.1) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.1) + kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.3) mercenary (~> 0.3) - minima (= 2.0.0) - rouge (= 1.11.1) + minima (= 2.5.1) + nokogiri (>= 1.10.4, < 2.0) + rouge (= 3.23.0) terminal-table (~> 1.4) - 
github-pages-health-check (1.2.0) + github-pages-health-check (1.16.1) addressable (~> 2.3) - net-dns (~> 0.8) + dnsruby (~> 1.60) octokit (~> 4.0) - public_suffix (~> 1.4) - typhoeus (~> 0.7) - html-pipeline (2.4.2) + public_suffix (~> 3.0) + typhoeus (~> 1.3) + html-pipeline (2.14.0) activesupport (>= 2) - nokogiri (~> 1.8.2) - i18n (0.7.0) - jekyll (3.8.4) + nokogiri (>= 1.4) + http_parser.rb (0.6.0) + i18n (0.9.5) + concurrent-ruby (~> 1.0) + jekyll (3.9.0) addressable (~> 2.4) colorator (~> 1.0) + em-websocket (~> 0.5) + i18n (~> 0.7) jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 1.1) - kramdown (~> 1.3) - liquid (~> 3.0) + jekyll-watch (~> 2.0) + kramdown (>= 1.17, < 3) + liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) - rouge (~> 1.7) + rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.4.2) - jekyll (~> 3.0) - jekyll-coffeescript (1.0.1) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) + jekyll-coffeescript (1.1.1) coffee-script (~> 2.2) - jekyll-feed (0.8.0) - jekyll (~> 3.3) - jekyll-gist (1.4.0) + coffee-script-source (~> 1.11.1) + jekyll-commonmark (1.3.1) + commonmarker (~> 0.14) + jekyll (>= 3.7, < 5.0) + jekyll-commonmark-ghpages (0.1.6) + commonmarker (~> 0.17.6) + jekyll-commonmark (~> 1.2) + rouge (>= 2.0, < 4.0) + jekyll-default-layout (0.1.4) + jekyll (~> 3.0) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) + jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.2.0) - jekyll (~> 3.1) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.2.0) - activesupport (~> 4.0) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) - jekyll (~> 3.0) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) jekyll-paginate (1.1.0) - jekyll-redirect-from (0.11.0) - jekyll (>= 2.0) - jekyll-sass-converter (1.3.0) - sass (~> 3.2) - jekyll-seo-tag (2.1.0) - jekyll (~> 3.3) - jekyll-sitemap (0.12.0) - jekyll (~> 3.3) - jekyll-swiss (0.4.0) - jekyll-watch 
(1.5.0) - listen (~> 3.0, < 3.1) - jemoji (0.7.0) - activesupport (~> 4.0) - gemoji (~> 2.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.2) + addressable (~> 2.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) + jekyll-sass-converter (1.5.2) + sass (~> 3.4) + jekyll-seo-tag (2.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-theme-architect (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-cayman (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-dinky (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) + jekyll-seo-tag (~> 2.0) + jekyll-theme-leap-day (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-merlot (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-midnight (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-minimal (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-modernist (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-primer (0.5.4) + jekyll (> 3.5, < 5.0) + jekyll-github-metadata (~> 2.9) + jekyll-seo-tag (~> 2.0) + jekyll-theme-slate (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-tactile (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-time-machine (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + jemoji (0.12.0) + gemoji (~> 3.0) html-pipeline (~> 2.2) - jekyll (>= 3.0) - json (1.8.3) - kramdown (1.11.1) - liquid (3.0.6) - listen (3.0.6) - rb-fsevent (>= 0.9.3) - rb-inotify (>= 0.9.7) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.1) + rexml + kramdown-parser-gfm (1.1.0) + 
kramdown (~> 2.0) + liquid (4.0.3) + listen (3.4.0) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.3.0) - minima (2.0.0) - minitest (5.9.1) - multipart-post (2.0.0) - net-dns (0.8.0) - nokogiri (~> 1.8.2) - mini_portile2 (~> 2.3.0) - octokit (4.4.1) - sawyer (~> 0.7.0, >= 0.5.3) - pathutil (0.14.0) + mini_portile2 (2.6.1) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) + jekyll-feed (~> 0.9) + jekyll-seo-tag (~> 2.1) + minitest (5.14.3) + multipart-post (2.1.1) + nokogiri (1.12.5) + mini_portile2 (~> 2.6.1) + racc (~> 1.4) + octokit (4.20.0) + faraday (>= 0.9) + sawyer (~> 0.8.0, >= 0.5.3) + pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (1.5.3) - rb-fsevent (0.9.8) - rb-inotify (0.9.7) - ffi (>= 0.5.0) - rouge (1.11.1) - safe_yaml (1.0.4) - sass (3.4.22) - sawyer (0.7.0) - addressable (>= 2.3.5, < 2.5) - faraday (~> 0.8, < 0.10) - terminal-table (1.7.3) - unicode-display_width (~> 1.1.1) - thread_safe (0.3.5) - typhoeus (0.8.0) - ethon (>= 0.8.0) - tzinfo (1.2.2) + public_suffix (3.1.1) + racc (1.5.2) + rb-fsevent (0.10.4) + rb-inotify (0.10.1) + ffi (~> 1.0) + rexml (3.2.5) + rouge (3.23.0) + ruby-enum (0.8.0) + i18n + ruby2_keywords (0.0.2) + rubyzip (2.3.0) + safe_yaml (1.0.5) + sass (3.7.4) + sass-listen (~> 4.0.0) + sass-listen (4.0.0) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + sawyer (0.8.2) + addressable (>= 2.3.5) + faraday (> 0.8, < 2.0) + simpleidn (0.1.1) + unf (~> 0.1.4) + terminal-table (1.8.0) + unicode-display_width (~> 1.1, >= 1.1.1) + thread_safe (0.3.6) + typhoeus (1.4.0) + ethon (>= 0.9.0) + tzinfo (1.2.9) thread_safe (~> 0.1) - unicode-display_width (1.1.1) + unf (0.1.4) + unf_ext + unf_ext (0.0.7.7) + unicode-display_width (1.7.0) + webrick (1.7.0) + zeitwerk (2.4.2) PLATFORMS ruby DEPENDENCIES - github-pages (~> 104) + github-pages (~> 209) + webrick (~> 1.7) BUNDLED WITH - 1.13.1 + 2.2.3 diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_config.yml mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_config.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_config.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_config.yml 2025-05-19 16:14:27.000000000 +0000 @@ -81,5 +81,5 @@ redcarpet: extensions: [with_toc_data] -gems: +plugins: - jekyll-redirect-from diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/authors.yml mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/authors.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/authors.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/authors.yml 2025-05-19 16:14:27.000000000 +0000 @@ -68,3 +68,6 @@ fgwu: full_name: Fenggang Wu fbid: 100002297362180 + +ltamasi: + full_name: Levi Tamasi diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/nav.yml mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/nav.yml --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_data/nav.yml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_data/nav.yml 2025-05-19 16:14:27.000000000 +0000 @@ -7,11 +7,11 @@ category: external - title: API (C++) - href: https://github.com/facebook/rocksdb/tree/master/include/rocksdb + href: https://github.com/facebook/rocksdb/tree/main/include/rocksdb category: external - title: API (Java) - href: https://github.com/facebook/rocksdb/tree/master/java/src/main/java/org/rocksdb + href: https://github.com/facebook/rocksdb/tree/main/java/src/main/java/org/rocksdb category: external - title: Support diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_docs/getting-started.md mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_docs/getting-started.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_docs/getting-started.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_docs/getting-started.md 2025-05-19 
16:14:27.000000000 +0000 @@ -73,6 +73,6 @@ Here are some specific details about the RocksDB implementation: -- [Architecture Guide](https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide) -- [Format of an immutable Table file](https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format) -- [Format of a log file](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format) +- [RocksDB Overview](https://github.com/facebook/rocksdb/wiki/RocksDB-Overview) +- [Immutable BlockBased Table file format](https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format) +- [Log file format](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_includes/doc.html mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_includes/doc.html --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_includes/doc.html 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_includes/doc.html 2025-05-19 16:14:27.000000000 +0000 @@ -18,7 +18,7 @@ {% else %} {{ content }} -

Edit on GitHub

+

Edit on GitHub

{% endif %} {% include doc_paging.html %} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ The usual Mapnik workflow is to load the map data into a SQL-based database and then define map layers with SQL statements. To render a tile, Mapnik needs to execute a couple of SQL queries. The benefit of this approach is that you don't need to reload your database when you change your map style. You can just change your SQL query and Mapnik picks it up. In our model, we decided to precompute the features we need for each tile. We need to know the map style before we create the database. However, when rendering the map tile, we only fetch the features that we need to render. -We haven't open sourced the RocksDB Mapnik plugin or the database loading pipeline. However, the spatial indexing is available in RocksDB under a name [SpatialDB](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/utilities/spatial_db.h). The API is focused on map rendering use-case, but we hope that it can also be used for other spatial-based applications. +We haven't open sourced the RocksDB Mapnik plugin or the database loading pipeline. However, the spatial indexing is available in RocksDB under a name [SpatialDB](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/utilities/spatial_db.h). The API is focused on map rendering use-case, but we hope that it can also be used for other spatial-based applications. Let's take a tour of the API. 
When you create a spatial database, you specify the spatial indexes that need to be built. Each spatial index is defined by a bounding box and granularity. For map rendering, we create a spatial index for each zoom levels. Higher zoom levels have more granularity. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -138,7 +138,7 @@ ## The API -The GetThreadList API is defined in [include/rocksdb/env.h](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/env.h#L317-L318), which is an Env +The GetThreadList API is defined in [include/rocksdb/env.h](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/env.h#L317-L318), which is an Env function: ```c++ @@ -151,7 +151,7 @@ The `GetThreadList()` API simply returns a vector of `ThreadStatus`, each describes the current status of a thread. The `ThreadStatus` structure, defined in -[include/rocksdb/thread_status.h](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/thread_status.h), contains the following information: +[include/rocksdb/thread_status.h](https://github.com/facebook/rocksdb/blob/main/include/rocksdb/thread_status.h), contains the following information: ```c++ // An unique ID for the thread. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -9,14 +9,14 @@ ## 4.8.0 (5/2/2016) -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-change-1)Public API Change +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-change-1)Public API Change * Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes. * Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see [https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F) * Expose estimate of per-level compression ratio via DB property: "rocksdb.compression-ratio-at-levelN". * Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status. -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#new-features-2)New Features +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#new-features-2)New Features * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size. 
@@ -24,25 +24,25 @@ -## [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#470-482016)4.7.0 (4/8/2016) +## [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#470-482016)4.7.0 (4/8/2016) -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-change-2)Public API Change +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-change-2)Public API Change * rename options compaction_measure_io_stats to report_bg_io_stats and include flush too. * Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See [https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File](https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File) for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options.
-## [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#460-3102016)4.6.0 (3/10/2016) +## [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#460-3102016)4.6.0 (3/10/2016) -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-changes-1)Public API Changes +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#public-api-changes-1)Public API Changes * Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier * Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly. * Tickers [NUMBER_DB_NEXT, NUMBER_DB_PREV, NUMBER_DB_NEXT_FOUND, NUMBER_DB_PREV_FOUND, ITER_BYTES_READ] are not updated immediately. The are updated when the Iterator is deleted. * Add monotonically increasing counter (DB property "rocksdb.current-super-version-number") that increments upon any change to the LSM tree. -### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#new-features-3)New Features +### [](https://github.com/facebook/rocksdb/blob/main/HISTORY.md#new-features-3)New Features * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. 
* Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -34,4 +34,4 @@ } ``` -You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string in the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/master/examples/simple_example.cc) demonstrates that with more examples. +You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string in the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/main/examples/simple_example.cc) demonstrates that with more examples. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2017-08-25-flushwal.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -21,6 +21,6 @@ ### Success story: MyRocks -Some applications that use RocksDB, already have other machinsims in place to provide reliability. MySQL for example uses 2PC (two-phase commit) to write to both binlog as well as the storage engine such as InnoDB and MyRocks. 
The group commit logic in MySQL allows the 1st phase (Prepare) to be run in parallel but after a commit group is formed performs the 2nd phase (Commit) in a serial manner. This makes low commit latency in the storage engine essential for acheiving high throughput. The commit in MyRocks includes writing to the RocksDB WAL, which as explaiend above, by default incures the latency of flushing the WAL new appends to the OS buffer. +Some applications that use RocksDB, already have other machinsims in place to provide reliability. MySQL for example uses 2PC (two-phase commit) to write to both binlog as well as the storage engine such as InnoDB and MyRocks. The group commit logic in MySQL allows the 1st phase (Prepare) to be run in parallel but after a commit group is formed performs the 2nd phase (Commit) in a serial manner. This makes low commit latency in the storage engine essential for achieving high throughput. The commit in MyRocks includes writing to the RocksDB WAL, which as explaiend above, by default incures the latency of flushing the WAL new appends to the OS buffer. Since binlog helps in recovering from some failure scenarios, MySQL can provide reliability without however needing a storage WAL flush after each individual commit. MyRocks benefits from this property, disables automatic WAL flush in RocksDB, and manually calls `::FlushWAL` when requested by MySQL. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -15,17 +15,17 @@ ### Overview Experts share their wisdom as rules comprising of conditions and suggestions in the INI format (refer -[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)). +[rules.ini](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rules.ini)). Users provide the Rocksdb configuration that they want to improve upon (as the familiar Rocksdb OPTIONS file — -[example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)) +[example](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini)) and the path of the file which contains Rocksdb logs and statistics. -The [Advisor](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser_example.py) +The [Advisor](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser_example.py) creates appropriate DataSource objects (for Rocksdb -[logs](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_log_parser.py), -[options](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_options_parser.py), -[statistics](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_stats_fetcher.py) etc.) -and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser.py). 
+[logs](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_log_parser.py), +[options](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_options_parser.py), +[statistics](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/db_stats_fetcher.py) etc.) +and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/main/tools/advisor/advisor/rule_parser.py). The Rules uses rules from experts to parse data-sources and trigger appropriate rules. The Advisor's output gives information about which rules were triggered, why they were triggered and what each of them suggests. Each suggestion @@ -55,4 +55,4 @@ ### Read more -For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/master/tools/advisor/README.md). +For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/main/tools/advisor/README.md). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-04-12-universal-improvements.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +--- +title: (Call For Contribution) Make Universal Compaction More Incremental +layout: post +author: sdong +category: blog +--- + +### Motivation + +Universal Compaction is an important compaction style, but few changes were made after we made the structure multi-leveled. Yet the major restriction of always compacting full sorted run is not relaxed. Compared to Leveled Compaction, where we usually only compile several SST files together, in universal compaction, we frequently compact GBs of data. Two issues with this gap: 1. 
it makes it harder to unify universal and leveled compaction; 2. periodically data is fully compacted, and in the mean time space is doubled. To ease the problem, we can break the restriction and do similar as leveled compaction, and bring it closer to unified compaction. + +We call for help for making following improvements. + + +### How Universal Compaction Works + +In universal, whole levels are compacted together to satisfy two conditions (See [wiki page](https://github.com/facebook/rocksdb/wiki/Universal-Compaction) for more details): + +1. total size / bottommost level size > a threshold, or +2. total number of sorted runs (non-0 levels + L0 files) is within a threshold + +1 is to limit extra space overhead used for dead data and 2 is for read performance. + +If 1 is triggered, likely a full compaction will be triggered. If 2 is triggered, RocksDB compact some sorted runs to bring the number down. It does it by using a simple heuristic so that less writes needed for that purpose over time: it starts from compacting smaller files, but if total size to compact is similar to or larger than size of the next level, it will take that level together, as soon on (whether it is the best heuristic is another question and we’ve never seriously looked at it). + +### How We Can Improve? + +Let’s start from condition 1. Here we do full compaction but is not necessary. A simple optimization would be to compact so that just enough files are merged into the bottommost level (Lmax) to satisfy condition 1. It would work if we only need to pick some files from Lmax-1, or if it is cheaper over time, we can pick some files from other levels too. + +Then condition 2. If we finish condition 1, there might be holes in some ranges in older levels. These holes might make it possible that only by compacting some sub ranges, we can fix the LSM-tree for condition 2. RocksDB can take single files into consideration and apply more sophisticated heuristic. 
+ +This new approach makes universal compaction closer to leveled compaction. The operation for 1 is closer to how Leveled compaction triggeres Lmax-1 to Lmax compaction. And 2 can potentially be implemented as something similar to level picking in Leveled Compaction. In fact, all those file picking can co-existing in one single compaction style and there isn’t fundamental conflicts to that. + +### Limitation + +There are two limitations: + +* Periodic automatic full compaction is unpleasant but at the same time is pleasant in another way. Some users might uses it to reason that everything is periodically collapsed so dead data is gone and old data is rewritten. We need to make sure periodic compaction works to continue with that. +* L0 to the first non-L0 level compaction is the first time data is partitioned in LSM-tree so that incremental compaction by range is possible. We might need to do more of these compactions in order to make incremental possible, which will increase compaction slightly. +* Compacting subset of a level would introduce some extra overhead for unaligned files, just as in leveled compaction. More SST boundary cutting heuristic can reduce this overhead but it will be there. + +But I believe the benefits would outweight the limitations. Reducing temporary space doubling and moving towards to unified compaction would be important achievements. + +### Interested in Help? + +Compaction is the core of LSM-tree, but its improvements are far overdue. If you are a user of universal compaction and would be able to benefit from those improvements, we will be happy to work with you on speeding up the project and bring them to RocksDB sooner. Feel free to communicate with us in [this issue](https://github.com/facebook/rocksdb/issues/8181). 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-integrated-blob-db.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,101 @@ +--- +title: Integrated BlobDB +layout: post +author: ltamasi +category: blog +--- +## Background + +BlobDB is essentially RocksDB for large-value use cases. The basic idea, which was proposed in the [WiscKey paper](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf), is key-value separation: by storing large values in dedicated blob files and storing only small pointers to them in the LSM tree, we avoid copying the values over and over again during compaction, thus reducing write amplification. Historically, BlobDB supported only FIFO and TTL based use cases that can tolerate some data loss. In addition, it was incompatible with many widely used RocksDB features, and required users to adopt a custom API. In 2020, we decided to rearchitect BlobDB from the ground up, taking the lessons learned from WiscKey and the original BlobDB but also drawing inspiration and incorporating ideas from other similar systems. Our goals were to eliminate the above limitations and to create a new integrated version that enables customers to use the well-known RocksDB API, has feature parity with the core of RocksDB, and offers better performance. 
This new implementation is now available and provides the following improvements over the original: + +* **API.** In contrast with the legacy BlobDB implementation, which had its own `StackableDB`-based interface (`rocksdb::blob_db::BlobDB`), the new version can be used via the well-known `rocksdb::DB` API, and can be configured simply by using a few column family options. +* **Consistency.** With the integrated BlobDB implementation, RocksDB’s consistency guarantees and various write options (like using the WAL or synchronous writes) now apply to blobs as well. Moreover, the new BlobDB keeps track of blob files in the RocksDB MANIFEST. +* **Write performance.** When using the old BlobDB, blobs are extracted and immediately written to blob files by the BlobDB layer *in the application thread*. This has multiple drawbacks from a performance perspective: first, it requires synchronization; second, it means that expensive operations like compression are performed in the application thread; and finally, it involves flushing the blob file after each blob. The new code takes a completely different approach by *offloading blob file building to RocksDB’s background jobs*, i.e. flushes and compactions. This means that similarly to SSTs, any given blob file is now written by a single background thread, eliminating the need for locking, flushing, or performing compression in the foreground. Note that this approach is also a better fit for network-based file systems where small writes might be expensive and opens up the possibility of file format optimizations that involve buffering (like dictionary compression). +* **Read performance.** The old code relies on each read (i.e. `Get`, `MultiGet`, or iterator) taking a snapshot and uses those snapshots when deciding which obsolete blob files can be removed. 
The new BlobDB improves this by generalizing RocksDB’s Version concept, which historically referred to the set of live SST files at a given point in time, to include the set of live blob files as well. This has performance benefits like [making the read path mostly lock-free by utilizing thread-local storage](https://rocksdb.org/blog/2014/06/27/avoid-expensive-locks-in-get.html). We have also introduced a blob file cache that can be utilized to keep frequently accessed blob files open. +* **Garbage collection.** Key-value separation means that if a key pointing to a blob gets overwritten or deleted, the blob becomes unreferenced garbage. To be able to reclaim this space, BlobDB now has garbage collection capabilities. GC is integrated into the compaction process and works by relocating valid blobs residing in old blob files as they are encountered during compaction. Blob files can be marked obsolete (and eventually deleted in one shot) once they contain nothing but garbage. This is more efficient than the method used by WiscKey, which involves performing a `Get` operation to find out whether a blob is still referenced followed by a `Put` to update the reference, which in turn results in garbage collection competing and potentially conflicting with the application’s writes. +* **Feature parity with the RocksDB core.** The new BlobDB supports way more features than the original and is near feature parity with vanilla RocksDB. In particular, we support all basic read/write APIs (with the exception of `Merge`, which is coming soon), recovery, compression, atomic flush, column families, compaction filters, checkpoints, backup/restore, transactions, per-file checksums, and the SST file manager. In addition, the new BlobDB’s options can be dynamically adjusted using the `SetOptions` interface. 
+ +## API + +The new BlobDB can be configured (on a per-column family basis if needed) simply by using the following options: + +* `enable_blob_files`: set it to `true` to enable key-value separation. +* `min_blob_size`: values at or above this threshold will be written to blob files during flush or compaction. +* `blob_file_size`: the size limit for blob files. +* `blob_compression_type`: the compression type to use for blob files. All blobs in the same file are compressed using the same algorithm. +* `enable_blob_garbage_collection`: set this to `true` to make BlobDB actively relocate valid blobs from the oldest blob files as they are encountered during compaction. +* `blob_garbage_collection_age_cutoff`: the threshold that the GC logic uses to determine which blob files should be considered “old.†For example, the default value of 0.25 signals to RocksDB that blobs residing in the oldest 25% of blob files should be relocated by GC. This parameter can be tuned to adjust the trade-off between write amplification and space amplification. + +The above options are all dynamically adjustable via the `SetOptions` API; changing them will affect subsequent flushes and compactions but not ones that are already in progress. + +In terms of compaction styles, we recommend using leveled compaction with BlobDB. The rationale behind universal compaction in general is to provide lower write amplification at the expense of higher read amplification; however, as we will see later in the Performance section, BlobDB can provide very low write amp and good read performance with leveled compaction. Therefore, there is really no reason to take the hit in read performance that comes with universal compaction. + +In addition to the above, consider tuning the following non-BlobDB specific options: + +* `write_buffer_size`: this is the memtable size. You might want to increase it for large-value workloads to ensure that SST and blob files contain a decent number of keys. 
+* `target_file_size_base`: the target size of SST files. Note that even when using BlobDB, it is important to have an LSM tree with a “nice†shape and multiple levels and files per level to prevent heavy compactions. Since BlobDB extracts and writes large values to blob files, it makes sense to make this parameter significantly smaller than the memtable size. One guideline is to set `blob_file_size` to the same value as `write_buffer_size` (adjusted for compression if needed) and make `target_file_size_base` proportionally smaller based on the ratio of key size to value size. +* `max_bytes_for_level_base`: consider setting this to a multiple (e.g. 8x or 10x) of `target_file_size_base`. + +As mentioned above, the new BlobDB now also supports compaction filters. Key-value separation actually enables an optimization here: if the compaction filter of an application can make a decision about a key-value solely based on the key, it is unnecessary to read the value from the blob file. Applications can take advantage of this optimization by implementing the new `FilterBlobByKey` method of the `CompactionFilter` interface. This method gets called by RocksDB first whenever it encounters a key-value where the value is stored in a blob file. If this method returns a “final†decision like `kKeep`, `kRemove`, `kChangeValue`, or `kRemoveAndSkipUntil`, RocksDB will honor that decision; on the other hand, if the method returns `kUndetermined`, RocksDB will read the blob from the blob file and call `FilterV2` with the value in the usual fashion. + +## Performance + +We tested the performance of the new BlobDB for six different value sizes between 1 KB and 1 MB using a customized version of our [standard benchmark suite](https://github.com/facebook/rocksdb/wiki/Performance-Benchmarks) on a box with an 18-core Skylake DE CPU (running at 1.6 GHz, with hyperthreading enabled), 64 GB RAM, a 512 GB boot SSD, and two 1.88 TB M.2 SSDs in a RAID0 configuration for data. 
The RocksDB version used was equivalent to 6.18.1, with some benchmarking and statistics related enhancements. Leveled and universal compaction without key-value separation were used as reference points. Note that for simplicity, we use “leveled compaction†and “universal compaction†as shorthand for leveled and universal compaction without key-value separation, respectively, and “BlobDB†for BlobDB with leveled compaction. + +Our benchmarks cycled through six different workloads: two write-only ones (initial load and overwrite), two read/write ones (point lookup/write mix and range scan/write mix), and finally two read-only ones (point lookups and range scans). The first two phases performed a fixed amount of work (see below), while the final four were run for a fixed amount of time, namely 30 minutes each. Each phase other than the first one started with the database state left behind by the previous one. Here’s a brief description of the workloads: + +* **Initial load**: this workload has two distinct stages, a single-threaded random write stage during which compactions are disabled (so all data is flushed to L0, where it remains for the rest of the stage), followed by a full manual compaction. The random writes are performed with load-optimized settings, namely using the vector memtable implementation and with concurrent memtable writes and WAL disabled. This stage was used to populate the database with 1 TB worth of raw values, e.g. 2^30 (~1 billion) 1 KB values or 2^20 (~1 million) 1 MB values. +* **Overwrite**: this is a multi-threaded random write workload using the usual skiplist memtable, with compactions, WAL, and concurrent memtable writes enabled. In our tests, 16 writer threads were used. The total number of writes was set to the same number as in the initial load stage and split up evenly between the writer threads. 
For instance, for the 1 MB value size, we had 2^20 writes divided up between the 16 threads, resulting in each thread performing 2^16 write operations. At the end of this phase, a “wait for compactions†step was added to prevent this workload from exhibiting artificially low write amp or conversely, the next phase showing inflated write amp. +* **Point lookup/write mix**: a single writer thread performing random writes while N (in our case, 16) threads perform random point lookups. WAL is enabled and all writes are synced. +* **Range scan/write mix**: similar to the above, with one writer thread and N reader threads (where N was again set to 16 in our tests). The reader threads perform random range scans, with 10 `Next` calls per `Seek`. Again, WAL is enabled, and sync writes are used. +* **Point lookups (read-only)**: N=16 threads perform random point lookups. +* **Range scans (read-only)**: N=16 threads execute random range scans, with 10 `Next`s per `Seek` like above. + +With that out of the way, let’s see how the new BlobDB performs against traditional leveled and universal compaction. In the next few sections, we’ll be looking at write amplification as well as read and write performance. We’ll also briefly compare the write performance of the new BlobDB with the legacy implementation. + +### Write amplification + +Reducing write amp is the original motivation for key-value separation. Here, we follow RocksDB’s definition of write amplification (as used in compaction statistics and the info log). That is, we define write amp as the total amount of data written by flushes and compactions divided by the amount of data written by flushes, where “data written†includes SST files and blob files as well (if applicable). The following charts show that BlobDB significantly reduces write amplification for all of our (non-read only) workloads. 
+ +For the initial load, where due to the nature of the workload both leveled and universal already have a low write amp factor of 1.6, BlobDB has a write amp close to the theoretical minimum of 1.0, namely in the 1.0..1.02 range, depending on value size. How is this possible? Well, the trick is that when key-value separation is used, the full compaction step only has to sort the keys but not the values. This results in a write amp that is about **36% lower** than the already low write amp you get with either leveled or universal. + +In the case of the overwrite workload, BlobDB had a write amp between 1.4 and 1.7 depending on value size. This is around **75-78% lower** than the write amp of leveled compaction (6.1 to 6.8) and **70-77% lower** than universal (5.7 to 6.2); for this workload, there wasn’t a huge difference between the performance of leveled and universal. + +When it comes to the point lookup/write mix workload, BlobDB had a write amp between 1.4 and 1.8. This is **83-88% lower** than the write amp of leveled compaction, which had values between 10.8 and 12.5. Universal fared much better than leveled under this workload, and had write amp in the 2.2..6.6 range; however, BlobDB still provided significant gains for all value sizes we tested: namely, write amp was **18-77% lower** than that of universal, depending on value size. + +As for the range scan/write mix workload, BlobDB again had a write amp between 1.4 and 1.8, while leveled had values between 13.6 and 14.9, and universal was between 2.8 and 5.0. In other words, BlobDB’s write amp was **88-90% lower** than that of leveled, and **46-70% lower** than that of universal. + +![Write amplification](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Write performance + +In terms of write performance, there are other factors to consider besides write amplification. 
The following charts show some interesting metrics for the two write-only workloads (initial load and overwrite). As discussed earlier, these two workloads perform a fixed amount of work; the two charts in the top row show how long it took BlobDB, leveled, and universal to complete that work. Note that each bar is broken down into two, corresponding to the two stages of each workload (random write and full compaction for initial load, and random write and waiting for compactions for overwrite). + +For initial load, note that the random write stage takes the same amount of time regardless of which algorithm is used. This is not surprising considering the fact that compactions are disabled during this stage and thus RocksDB is simply writing L0 files (and in BlobDB’s case, blob files) as fast as it can. The second stage, on the other hand, is very different: as mentioned above, BlobDB essentially only needs to read, sort, and rewrite the keys during compaction, which can be done much much faster (with 1 MB values, more than a hundred times faster) than doing the same for large key-values. Due to this, initial load completed **2.3x to 4.7x faster** overall when using BlobDB. + +As for the overwrite workload, BlobDB performs much better during both stages. The two charts in the bottom row help explain why. In the case of both leveled and universal compaction, compactions can’t keep up with the write rate, which eventually leads to back pressure in the form of write stalls. As shown in the chart below, both leveled and universal stall between ~40% and ~70% of the time; on the other hand, BlobDB is stall-free except for the largest value size tested (1 MB). This naturally leads to higher throughput, namely **2.1x to 3.5x higher** throughput compared to leveled, and **1.6x to 3.0x higher** throughput compared to universal. 
The overwrite time chart also shows that the catch-up stage that waits for all compactions to finish is much shorter (and in fact, at larger value sizes, negligible) with BlobDB. + +![Write performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Read/write and read-only performance + +The charts below show the read performance (in terms of operations per second) of BlobDB versus leveled and universal compaction under the two read/write workloads and the two read-only workloads. BlobDB meets or exceeds the read performance of leveled compaction, except for workloads involving range scans at the two smallest value sizes tested (1 KB and 4 KB). It also provides better (in some cases, much better) read performance than universal across the board. In particular, BlobDB provides up **1.4x higher** read performance than leveled (for larger values), and up to **5.6x higher** than universal. + +![Read-write and read-only performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Comparing the two BlobDB implementations + +To compare the write performance of the new BlobDB with the legacy implementation, we ran two versions of the first (single-threaded random write) stage of the initial load benchmark using 1 KB values: one with WAL disabled, and one with WAL enabled. The new implementation completed the load **4.6x faster** than the old one without WAL, and **2.3x faster** with WAL. + +![Comparing the two BlobDB implementations](/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Future work + +There are a few remaining features that are not yet supported by the new BlobDB. 
The most important one is `Merge` (and the related `GetMergeOperands` API); in addition, we don’t currently support the `EventListener` interface, the `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` APIs, secondary instances, and ingestion of blob files. We will continue to work on closing this gap. + +We also have further plans when it comes to performance. These include optimizing garbage collection, introducing a dedicated cache for blobs, improving iterator and `MultiGet` performance, and evolving the blob file format amongst others. + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-26-online-validation.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,17 @@ +--- +title: Online Validation +layout: post +author: sdong +category: blog +--- +To prevent or mitigate data corruption in RocksDB when some software or hardware issues happen, we keep adding online consistency checks and improving existing ones. + +We improved ColumnFamilyOptions::force_consistency_checks and enabled it by default. The option does some basic consistency checks on the LSM-tree, e.g., files in one level are not overlapping. The DB will be frozen from new writes if a violation is detected. Previously, the feature’s check was too limited and didn’t always freeze the DB in a timely manner. Last year, we made the checking stricter so that it can [catch much more corrupted LSM-tree structures](https://github.com/facebook/rocksdb/pull/6901). We also fixed several issues where the checking failure was swallowed without freezing the DB. After making force_consistency_checks more reliable, we changed the default value to be on. 
+ +ColumnFamilyOptions::paranoid_file_checks does some more expensive extra checking when generating a new SST file. Last year, we expanded the coverage of this feature: after every SST file is generated, the file is read back key by key and two things are checked: (1) the keys are in comparator order (also available and enabled by default during file write via ColumnFamilyOptions::check_flush_compaction_key_order); (2) the hash of all the KVs is the same as calculated when we add KVs into it. These checks detect certain corruptions so we can prevent the corrupt files from being applied to the DB. We suggest users turn it on at least in shadow environments, and consider running it in production too if you can afford the overheads. + +A recently added feature checks the count of entries added into the memtable while flushing it into an SST file. This feature provides some online coverage for memtable corruption, caused by either a software bug or a hardware issue. This feature will be released in the coming release (6.21) and will be on by default. In the future, we will check more memtable counters, e.g. number of puts or number of deletes. + +We also improved the reporting of online validation errors to improve debuggability. For example, failure to parse a corrupt key now reports details about the corrupt key. Since we did not want to expose key data in logs, error messages, etc., by default, this reporting is opt-in via DBOptions::allow_data_in_errors. + +More online checking features are planned and some are more sophisticated, including key/value checksums and sample-based query validation. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,195 @@ +--- +title: RocksDB Secondary Cache +layout: post +author: anand1976 +category: blog +--- +## Introduction + +The RocksDB team is implementing support for a block cache on non-volatile media, such as a local flash device or NVM/SCM. It can be viewed as an extension of RocksDB’s current volatile block cache (LRUCache or ClockCache). The non-volatile block cache acts as a second tier cache that contains blocks evicted from the volatile cache. Those blocks are then promoted to the volatile cache as they become hotter due to access. + +This feature is meant for cases where the DB is located on remote storage or cloud storage. The non-volatile cache is officially referred to in RocksDB as the SecondaryCache. By maintaining a SecondaryCache that’s an order of magnitude larger than DRAM, fewer reads would be required from remote storage, thus reducing read latency as well as network bandwidth consumption. + +From the user point of view, the local flash cache will support the following requirements - + +1. Provide a pointer to a secondary cache when opening a DB +2. Be able to share the secondary cache across DBs in the same process +3. Have multiple secondary caches on a host +4. 
Support persisting the cache across process restarts and reboots by ensuring repeatability of the cache key + +![Architecture](/static/images/rocksdb-secondary-cache/arch_diagram.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Design + +When designing the API for a SecondaryCache, we had a choice between making it visible to the RocksDB code (table reader) or hiding it behind the RocksDB block cache. There are several advantages of hiding it behind the block cache - + +* Allows flexibility in insertion of blocks into the secondary cache. A block can be inserted on eviction from the RAM tier, or it could be eagerly inserted. +* It makes the rest of the RocksDB code less complex by providing a uniform interface regardless of whether a secondary cache is configured or not +* Makes parallel reads, peeking in the cache for prefetching, failure handling etc. easier +* Makes it easier to extend to compressed data if needed, and allows other persistent media, such as PM, to be added as an additional tier + + +We decided to make the secondary cache transparent to the rest of RocksDB code by hiding it behind the block cache. A key issue that we needed to address was the allocation and ownership of memory of the cached items - insertion into the secondary cache may require that memory be allocated by the same. This means that parts of the cached object that can be transferred to the secondary cache needs to be copied out (referred to as **unpacking**), and on a lookup the data stored in the secondary cache needs to be provided to the object constructor (referred to as **packing**). For RocksDB cached objects such as data blocks, index and filter blocks, and compression dictionaries, unpacking involves copying out the raw uncompressed BlockContents of the block, and packing involves constructing the corresponding block/index/filter/dictionary object using the raw uncompressed data. 
+ +Another alternative we considered was the existing PersistentCache interface. However, we decided to not pursue it and eventually deprecate it for the following reasons - +* It is exposed directly to the table reader code, which makes it more difficult to implement different policies such as inclusive/exclusive cache, as well as extending it to more sophisticated admission control policies +* The interface does not allow for custom memory allocation and object packing/unpacking, so new APIs would have to be defined anyway +* The current PersistentCache implementation is very simple and does not have any admission control policies + +## API + +The interface between RocksDB’s block cache and the secondary cache is designed to allow pluggable implementations. For FB internal usage, we plan to use Cachelib with a wrapper to provide the plug-in implementation and use folly and other fbcode libraries, which cannot be used directly by RocksDB, to efficiently implement the cache operations. The following diagrams show the flow of insertion and lookup of a block. + +![Insert flow](/static/images/rocksdb-secondary-cache/insert_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Lookup flow](/static/images/rocksdb-secondary-cache/lookup_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +An item in the secondary cache is referenced by a SecondaryCacheHandle. The handle may not be immediately ready or have a valid value. The caller can call IsReady() to determine if its ready, and can call Wait() in order to block until it becomes ready. The caller must call Value() after it becomes ready to determine if the item was successfully read. Value() must return nullptr on failure. 
+ +``` +class SecondaryCacheHandle { + public: + virtual ~SecondaryCacheHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; +``` + +The user of the secondary cache (for example, BlockBasedTableReader indirectly through LRUCache) must implement the callbacks defined in CacheItemHelper, in order to facilitate the unpacking/packing of objects for saving to and restoring from the secondary cache. The CreateCallback must be implemented to construct a cacheable object from the raw data in secondary cache. + +``` + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. 
+ struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + // typedef std::function + // CreateCallback; + using CreateCallback = std::function; +``` + +The secondary cache provider must provide a concrete implementation of the SecondaryCache abstract class. + +``` +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +class SecondaryCache { + public: + virtual ~SecondaryCache() {} + + virtual std::string Name() = 0; + + static const std::string Type() { return "SecondaryCache"; } + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. 
The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready. This would be used + // by MultiGet, for example, to read multiple data blocks in parallel + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const = 0; +}; +``` + +A SecondaryCache is configured by the user by providing a pointer to it in LRUCacheOptions - +``` +struct LRUCacheOptions { + ... + // A SecondaryCache instance to use as an additional cache tier + std::shared_ptr secondary_cache; + ... +}; +``` + +## Current Status + +The initial RocksDB support for the secondary cache has been merged into the main branch, and will be available in the 6.21 release. This includes providing a way for the user to configure a secondary cache when instantiating RocksDB’s LRU cache (volatile block cache), spilling blocks evicted from the LRU cache to the flash cache, promoting a block read from the SecondaryCache to the LRU cache, and updating tools such as cache_bench and db_bench to specify a flash cache. The relevant PRs are [#8271](https://github.com/facebook/rocksdb/pull/8271), [#8191](https://github.com/facebook/rocksdb/pull/8191), and [#8312](https://github.com/facebook/rocksdb/pull/8312). + +We prototyped an end-to-end solution, with the above PRs as well as a Cachelib-based implementation of the SecondaryCache. We ran a mixgraph benchmark to simulate a realistic read/write workload. The results showed a 15% gain with the local flash cache over no local cache, and a ~25-30% reduction in network reads with a corresponding decrease in cache misses. 
+ +![Throughput](/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Hit Rate](/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Future Work + +In the short term, we plan to do the following in order to fully integrate the SecondaryCache with RocksDB - + +1. Use DB session ID as the cache key prefix to ensure uniqueness and repeatability +2. Optimize flash cache usage of MultiGet and iterator workloads +3. Stress testing +4. More benchmarking + +Longer term, we plan to deploy this in production at Facebook. + +## Call to Action + +We are hoping for a community contribution of a secondary cache implementation, which would make this feature usable by the broader RocksDB userbase. If you are interested in contributing, please reach out to us in [this issue](https://github.com/facebook/rocksdb/issues/8347). + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-05-31-dictionary-compression.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,157 @@ +--- +title: Preset Dictionary Compression +layout: post +author: ajkr +category: blog +--- + +## Summary + +Compression algorithms relying on an adaptive dictionary, such as LZ4, zstd, and zlib, struggle to achieve good compression ratios on small inputs when using the basic compress API. +With the basic compress API, the compressor starts with an empty dictionary. +With small inputs, not much content gets added to the dictionary during the compression. 
+Combined, these factors suggest the dictionary will never have enough contents to achieve great compression ratios. + +RocksDB groups key-value pairs into data blocks before storing them in files. +For use cases that are heavy on random accesses, smaller data block size is sometimes desirable for reducing I/O and CPU spent reading blocks. +However, as explained above, smaller data block size comes with the downside of worse compression ratio when using the basic compress API. + +Fortunately, zstd and other libraries offer advanced compress APIs that preset the dictionary. +A preset dictionary makes it possible for the compressor to start from a useful state instead of from an empty one, making compression immediately effective. + +RocksDB now optionally takes advantage of these dictionary presetting APIs. +The challenges in integrating this feature into the storage engine were more substantial than apparent on the surface. +First, we need to target a preset dictionary to the relevant data. +Second, preset dictionaries need to be trained from data samples, which need to be gathered. +Third, preset dictionaries need to be persisted since they are needed at decompression time. +Fourth, overhead in accessing the preset dictionary must be minimized to prevent regression in critical code paths. +Fifth, we need easy-to-use measurement to evaluate candidate use cases and production impact. + +In production, we have deployed dictionary presetting to save space in multiple RocksDB use cases with data block size 8KB or smaller. +We have measured meaningful benefit to compression ratio in use cases with data block size up to 16KB. +We have also measured a use case that can save both CPU and space by reducing data block size and turning on dictionary presetting at the same time. + +## Feature design +#### Targeting + +Over time we have considered a few possibilities for the scope of a dictionary. 
+ +- Subcompaction +- SST file +- Column family + +The original choice was subcompaction scope. +This enabled an approach with minimal buffering overhead because we could collect samples while generating the first output SST file. +The dictionary could then be trained and applied to subsequent SST files in the same subcompaction. + +However, we found a large use case where the proximity of data in the keyspace was more correlated with its similarity than we had predicted. +In particular, the approach of training a dictionary on an adjacent file yielded substantially worse ratios than training the dictionary on the same file it would be used to compress. +In response to this finding, we changed the preset dictionary scope to per SST file. + +With this change in approach, we had to face the problem we had hoped to avoid: how can we compress all of an SST file's data blocks with the same preset dictionary while that dictionary can only be trained after many data blocks have been sampled? +The solutions we considered both involved a new overhead. +We could read the input more than once and introduce I/O overhead, or we could buffer the uncompressed output file data blocks until a dictionary is trained, introducing memory overhead. +We chose to take the hit on memory overhead. + +Another approach that we considered was associating multiple dictionaries with a column family. +For example, in MyRocks there could be a dictionary trained on data from each large table. +When compressing a data block, we would look at the table to which its data belongs and pick the corresponding dictionary. +However, this approach would introduce many challenges. +RocksDB would need to be aware of the key schema to know where are the table boundaries. +RocksDB would also need to periodically update the dictionaries to account for changes in data pattern. +It would need somewhere to store dictionaries at column family scope. 
+Overall, we thought these challenges were too difficult to pursue the approach. + +#### Training + +![](/static/images/dictcmp/dictcmp_raw_sampled.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+Raw samples mode (`zstd_max_train_bytes == 0`) +

+ +As mentioned earlier, the approach we took is to build the dictionary from buffered uncompressed data blocks. +The first row of data blocks in these diagrams illustrate this buffering. +The second row illustrates training samples selected from the buffered blocks. +In raw samples mode (above), the final dictionary is simply the concatenation of these samples. +Whereas, in zstd training mode (below), these samples will be passed to the trainer to produce the final dictionary. + +![](/static/images/dictcmp/dictcmp_zstd_trained.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+zstd training mode (`zstd_max_train_bytes > 0`) +

+ +#### Compression path + +Once the preset dictionary is generated by the above process, we apply it to the buffered data blocks and write them to the output file. +Thereafter, newly generated data blocks are immediately compressed and written out. + +One optimization here is available to zstd v0.7.0+ users. +Instead of deserializing the dictionary on each compress invocation, we can do that work once and reuse it. +A `ZSTD_CDict` holds this digested dictionary state and is passed to the compress API. + +#### Persistence + +When an SST file's data blocks are compressed using a preset dictionary, that dictionary is stored inside the file for later use in decompression. + +![](/static/images/dictcmp/dictcmp_sst_blocks.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+SST file layout with the preset dictionary in its own (uncompressed) block +

+ +#### Decompression path + +To decompress, we need to provide both the data block and the dictionary used to compress it. +Since dictionaries are just blocks in a file, we access them through block cache. +However this additional load on block cache can be problematic. +It can be alleviated by pinning the dictionaries to avoid going through the LRU locks. + +An optimization analogous to the digested dictionary exists for certain zstd users (see User API section for details). +When enabled, the block cache stores the digested dictionary state for decompression (`ZSTD_DDict`) instead of the block contents. +In some cases we have seen decompression CPU decrease overall when enabling dictionary thanks to this optimization. + +#### Measurement + +Typically our first step in evaluating a candidate use case is an offline analysis of the data. +This gives us a quick idea whether presetting dictionary will be beneficial without any code, config, or data changes. +Our `sst_dump` tool reports what size SST files would have been using specified compression libraries and options. +We can select random SST files and compare the size with vs. without dictionary. + +When that goes well, the next step is to see how it works in a live DB, like a production shadow or canary. +There we can observe how it affects application/system metrics. + +Even after dictionary is enabled, there is the question of how much space was finally saved. +We provide a way to A/B test size with vs. without dictionary while running in production. +This feature picks a sample of data blocks to compress in multiple ways -- one of the outputs is stored, while the other outputs are thrown away after counting their size. +Due to API limitations, the stored output always has to be the dictionary-compressed one, so this feature can only be used after enabling dictionary. +The size with and without dictionary are stored in the SST file as table properties. 
+These properties can be aggregated across all SST files in a DB (and across all DBs in a tier) to learn the final space saving. + +## User API + +RocksDB allows presetting compression dictionary for users of LZ4, zstd, and zlib. +The most advanced capabilities are available to zstd v1.1.4+ users who statically link (see below). +Newer versions of zstd (v1.3.6+) have internal changes to the dictionary trainer and digested dictionary management, which significantly improve memory and CPU efficiency. + +Run-time settings: + +- `CompressionOptions::max_dict_bytes`: Limit on per-SST file dictionary size. Increasing this causes dictionaries to consume more space and memory for the possibility of better data block compression. A typical value we use is 16KB. +- (**zstd only**) `CompressionOptions::zstd_max_train_bytes`: Limit on training data passed to zstd dictionary trainer. Larger values cause the training to consume more CPU (and take longer) while generating more effective dictionaries. The starting point guidance we received from zstd team is to set it to 100x `CompressionOptions::max_dict_bytes`. +- `CompressionOptions::max_dict_buffer_bytes`: Limit on data buffering from which training samples are gathered. By default we buffer up to the target file size per ongoing background job. If this amount of memory is concerning, this option can constrain the buffering with the downside that training samples will cover a smaller portion of the SST file. Work is ongoing to charge this memory usage to block cache so it will not need to be accounted for separately. +- `BlockBasedTableOptions::cache_index_and_filter_blocks`: Controls whether metadata blocks including dictionary are accessed through block cache or held in table reader memory (yes, its name is outdated). +- `BlockBasedTableOptions::metadata_cache_options`: Controls what metadata blocks are pinned in block cache. Pinning avoids LRU contention at the risk of cold blocks holding memory. 
+- `ColumnFamilyOptions::sample_for_compression`: Controls frequency of measuring extra compressions on data blocks using various libraries with default settings (i.e., without preset dictionary). + +Compile-time setting: + +- (**zstd only**) `EXTRA_CXXFLAGS=-DZSTD_STATIC_LINKING_ONLY`: Hold digested dictionaries in block cache to save repetitive deserialization overhead. This saves a lot of CPU for read-heavy workloads. This compiler flag is necessary because one of the digested dictionary APIs we use is marked as experimental. We still use it in production, however. + +Function: + +- `DB::GetPropertiesOfAllTables()`: The properties `kSlowCompressionEstimatedDataSize` and `kFastCompressionEstimatedDataSize` estimate what the data block size (`kDataSize`) would have been if the corresponding compression library had been used. These properties are only present when `ColumnFamilyOptions::sample_for_compression` causes one or more samples to be measured, and they become more accurate with higher sampling frequency. + +Tool: + +- `sst_dump --command=recompress`: Offline analysis tool that reports what the SST file size would have been using the specified compression library and options. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_posts/2021-12-29-ribbon-filter.markdown 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,281 @@ +--- +title: Ribbon Filter +layout: post +author: pdillinger +category: blog +--- + +## Summary +Since version 6.15 last year, RocksDB supports Ribbon filters, a new +alternative to Bloom filters that save space, especially memory, at +the cost of more CPU usage, mostly in constructing the filters in the +background. 
Most applications with long-lived data (many hours or +longer) will likely benefit from adopting a Ribbon+Bloom hybrid filter +policy. Here we explain why and how. + +[Ribbon filter on RocksDB wiki](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#ribbon-filter) + +[Ribbon filter paper](https://arxiv.org/abs/2103.02515) + +## Problem & background +Bloom filters play a critical role in optimizing point queries and +some range queries in LSM-tree storage systems like RocksDB. Very +large DBs can use 10% or more of their RAM memory for (Bloom) filters, +so that (average case) read performance can be very good despite high +(worst case) read amplification, [which is useful for lowering write +and/or space +amplification](http://smalldatum.blogspot.com/2015/11/read-write-space-amplification-pick-2_23.html). +Although the `format_version=5` Bloom filter in RocksDB is extremely +fast, all Bloom filters use around 50% more space than is +theoretically possible for a hashed structure configured for the same +false positive (FP) rate and number of keys added. What would it take +to save that significant share of “wasted†filter memory, and when +does it make sense to use such a Bloom alternative? + +A number of alternatives to Bloom filters were known, especially for +static filters (not modified after construction), but all the +previously known structures were unsatisfying for SSTs because of some +combination of +* Not enough space savings for CPU increase. For example, [Xor + filters](https://arxiv.org/abs/1912.08258) use 3-4x more CPU than + Bloom but only save 15-20% of + space. [GOV](https://arxiv.org/pdf/1603.04330.pdf) can save around + 30% space but requires around 10x more CPU than Bloom. +* Inconsistent space savings. 
[Cuckoo + filters](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) + and Xor+ filters offer significant space savings for very low FP + rates (high bits per key) but little or no savings for higher FP + rates (low bits per key). ([Higher FP rates are considered best for + largest levels of + LSM.](https://stratos.seas.harvard.edu/files/stratos/files/monkeykeyvaluestore.pdf)) + [Spatially-coupled Xor + filters](https://arxiv.org/pdf/2001.10500.pdf) require very large + number of keys per filter for large space savings. +* Inflexible configuration. No published alternatives offered the same + continuous configurability of Bloom filters, where any FP rate and + any fractional bits per key could be chosen. This flexibility + improves memory efficiency with the `optimize_filters_for_memory` + option that minimizes internal fragmentation on filters. + +## Ribbon filter development and implementation +The Ribbon filter came about when I developed a faster, simpler, and +more adaptable algorithm for constructing a little-known [Xor-based +structure from Dietzfelbinger and +Walzer](https://arxiv.org/pdf/1907.04750.pdf). It has very good space +usage for required CPU time (~30% space savings for 3-4x CPU) and, +with some engineering, Bloom-like configurability. The complications +were managable for use in RocksDB: +* Ribbon space efficiency does not naturally scale to very large + number of keys in a single filter (whole SST file or partition), but + with the current 128-bit Ribbon implementation in RocksDB, even 100 + million keys in one filter saves 27% space vs. Bloom rather than 30% + for 100,000 keys in a filter. +* More temporary memory is required during construction, ~230 bits per + key for 128-bit Ribbon vs. ~75 bits per key for Bloom filter. A + quick calculation shows that if you are saving 3 bits per key on the + generated filter, you only need about 50 generated filters in memory + to offset this temporary memory usage. 
(Thousands of filters in + memory is typical.) Starting in RocksDB version 6.27, this temporary + memory can be accounted for under block cache using + `BlockBasedTableOptions::reserve_table_builder_memory`. +* Ribbon filter queries use relatively more CPU for lower FP rates + (but still O(1) relative to number of keys added to filter). This + should be OK because lower FP rates are only appropriate when then + cost of a false positive is very high (worth extra query time) or + memory is not so constrained (can use Bloom instead). + +Future: data in [the paper](https://arxiv.org/abs/2103.02515) suggests +that 32-bit Balanced Ribbon (new name: [Bump-Once +Ribbon](https://arxiv.org/pdf/2109.01892.pdf)) would improve all of +these issues and be better all around (except for code complexity). + +## Ribbon vs. Bloom in RocksDB configuration +Different applications and hardware configurations have different +constraints, but we can use hardware costs to examine and better +understand the trade-off between Bloom and Ribbon. + +### Same FP rate, RAM vs. CPU hardware cost +Under ideal conditions where we can adjust our hardware to suit the +application, in terms of dollars, how much does it cost to construct, +query, and keep in memory a Bloom filter vs. a Ribbon filter? The +Ribbon filter costs more for CPU but less for RAM. Importantly, the +RAM cost directly depends on how long the filter is kept in memory, +which in RocksDB is essentially the lifetime of the filter. +(Temporary RAM during construction is so short-lived that it is +ignored.) Using some consumer hardware and electricity prices and a +predicted balance between construction and queries, we can compute a +“break even†duration in memory. To minimize cost, filters with a +lifetime shorter than this should be Bloom and filters with a lifetime +longer than this should be Ribbon. 
(Python code) + +``` +# Commodity prices based roughly on consumer prices and rough guesses +# Upfront cost of a CPU per hardware thread +upfront_dollars_per_cpu_thread = 30.0 + +# CPU average power usage per hardware thread +watts_per_cpu_thread = 3.5 + +# Upfront cost of a GB of RAM +upfront_dollars_per_gb_ram = 8.0 + +# RAM average power usage per GB +# https://www.crucial.com/support/articles-faq-memory/how-much-power-does-memory-use +watts_per_gb_ram = 0.375 + +# Estimated price of power per kilowatt-hour, including overheads like conversion losses and cooling +dollars_per_kwh = 0.35 + +# Assume 3 year hardware lifetime +hours_per_lifetime = 3 * 365 * 24 +seconds_per_lifetime = hours_per_lifetime * 60 * 60 + +# Number of filter queries per key added in filter construction is heavily dependent on workload. +# When replication is in layer above RocksDB, it will be low, likely < 1. When replication is in +# storage layer below RocksDB, it will likely be > 1. Using a rough and general guesstimate. +key_query_per_construct = 1.0 + +#================================== +# Bloom & Ribbon filter performance +typical_bloom_bits_per_key = 10.0 +typical_ribbon_bits_per_key = 7.0 + +# Speeds here are sensitive to many variables, especially query speed because it +# is so dependent on memory latency. Using this benchmark here: +# for IMPL in 2 3; do +# ./filter_bench -impl=$IMPL -quick -m_keys_total_max=200 -use_full_block_reader +# done +# and "Random filter" queries. 
+nanoseconds_per_construct_bloom_key = 32.0 +nanoseconds_per_construct_ribbon_key = 140.0 + +nanoseconds_per_query_bloom_key = 500.0 +nanoseconds_per_query_ribbon_key = 600.0 + +#================================== +# Some constants +kwh_per_watt_lifetime = hours_per_lifetime / 1000.0 +bits_per_gb = 8 * 1024 * 1024 * 1024 + +#================================== +# Crunching the numbers +# on CPU for constructing filters +dollars_per_cpu_thread_lifetime = upfront_dollars_per_cpu_thread + watts_per_cpu_thread * kwh_per_watt_lifetime * dollars_per_kwh +dollars_per_cpu_thread_second = dollars_per_cpu_thread_lifetime / seconds_per_lifetime + +dollars_per_construct_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_bloom_key / 10**9 +dollars_per_construct_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_construct_ribbon_key / 10**9 + +dollars_per_query_bloom_key = dollars_per_cpu_thread_second * nanoseconds_per_query_bloom_key / 10**9 +dollars_per_query_ribbon_key = dollars_per_cpu_thread_second * nanoseconds_per_query_ribbon_key / 10**9 + +dollars_per_bloom_key_cpu = dollars_per_construct_bloom_key + key_query_per_construct * dollars_per_query_bloom_key +dollars_per_ribbon_key_cpu = dollars_per_construct_ribbon_key + key_query_per_construct * dollars_per_query_ribbon_key + +# on holding filters in RAM +dollars_per_gb_ram_lifetime = upfront_dollars_per_gb_ram + watts_per_gb_ram * kwh_per_watt_lifetime * dollars_per_kwh +dollars_per_gb_ram_second = dollars_per_gb_ram_lifetime / seconds_per_lifetime + +dollars_per_bloom_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_bloom_bits_per_key +dollars_per_ribbon_key_in_ram_second = dollars_per_gb_ram_second / bits_per_gb * typical_ribbon_bits_per_key + +#================================== +# How many seconds does it take for the added cost of constructing a ribbon filter instead +# of bloom to be offset by the added cost of holding the bloom filter in memory? 
+break_even_seconds = (dollars_per_ribbon_key_cpu - dollars_per_bloom_key_cpu) / (dollars_per_bloom_key_in_ram_second - dollars_per_ribbon_key_in_ram_second) +print(break_even_seconds) +# -> 3235.1647730256936 +``` + +So roughly speaking, filters that live in memory for more than an hour +should be Ribbon, and filters that live less than an hour should be +Bloom. This is very interesting, but how long do filters live in +RocksDB? + +First let's consider the average case. Write-heavy RocksDB loads are +often backed by flash storage, which has some specified write +endurance for its intended lifetime. This can be expressed as *device +writes per day* (DWPD), and supported DWPD is typically < 10.0 even +for high end devices (excluding NVRAM). Roughly speaking, the DB would +need to be writing at a rate of 20+ DWPD for data to have an average +lifetime of less than one hour. Thus, unless you are prematurely +burning out your flash or massively under-utilizing available storage, +using the Ribbon filter has the better cost profile *on average*. + +### Predictable lifetime +But we can do even better than optimizing for the average case. LSM +levels give us very strong data lifetime hints. Data in L0 might live +for minutes or a small number of hours. Data in Lmax might live for +days or weeks. So even if Ribbon filters weren't the best choice on +average for a workload, they almost certainly make sense for the +larger, longer-lived levels of the LSM. As of RocksDB 6.24, you can +specify a minimum LSM level for Ribbon filters with +`NewRibbonFilterPolicy`, and earlier levels will use Bloom filters. + +### Resident filter memory +The above analysis assumes that nearly all filters for all live SST +files are resident in memory. This is true if using +`cache_index_and_filter_blocks=0` and `max_open_files=-1` (defaults), +but `cache_index_and_filter_blocks=1` is popular. 
In that case, +if you use `optimize_filters_for_hits=1` and non-partitioned filters +(a popular MyRocks configuration), it is also likely that nearly all +live filters are in memory. However, if you don't use +`optimize_filters_for_hits` and use partitioned filters, then +cold data (by age or by key range) can lead to only a portion of +filters being resident in memory. In that case, benefit from Ribbon +filter is not as clear, though because Ribbon filters are smaller, +they are more efficient to read into memory. + +RocksDB version 6.21 and later include a rough feature to determine +block cache usage for data blocks, filter blocks, index blocks, etc. +Data like this is periodically dumped to LOG file +(`stats_dump_period_sec`): + +``` +Block cache entry stats(count,size,portion): DataBlock(441761,6.82 GB,75.765%) FilterBlock(3002,1.27 GB,14.1387%) IndexBlock(17777,887.75 MB,9.63267%) Misc(1,0.00 KB,0%) +Block cache LRUCache@0x7fdd08104290#7004432 capacity: 9.00 GB collections: 2573 last_copies: 10 last_secs: 0.143248 secs_since: 0 +``` + +This indicates that at this moment in time, the block cache object +identified by `LRUCache@0x7fdd08104290#7004432` (potentially used +by multiple DBs) uses roughly 14% of its 9GB, about 1.27 GB, on filter +blocks. This same data is available through `DB::GetMapProperty` with +`DB::Properties::kBlockCacheEntryStats`, and (with some effort) can +be compared to total size of all filters (not necessarily in memory) +using `rocksdb.filter.size` from +`DB::Properties::kAggregatedTableProperties`. + +### Sanity checking lifetime +Can we be sure that using filters even makes sense for such long-lived +data? We can apply [the current 5 minute rule for caching SSD data in +RAM](http://renata.borovica-gajic.com/data/adms2017_5minuterule.pdf). A +4KB filter page holds data for roughly 4K keys. 
If we assume at least +one negative (useful) filter query in its lifetime per added key, it +can satisfy the 5 minute rule with a lifetime of up to about two +weeks. Thus, the lifetime threshold for “no filter†is about 300x +higher than the lifetime threshold for Ribbon filter. + +### What to do with saved memory +The default way to improve overall RocksDB performance with more +available memory is to use more space for caching, which improves +latency, CPU load, read IOs, etc. With +`cache_index_and_filter_blocks=1`, savings in filters will +automatically make room for caching more data blocks in block +cache. With `cache_index_and_filter_blocks=0`, consider increasing +block cache size. + +Using the space savings to lower filter FP rates is also an option, +but there is less evidence for this commonly improving existing +*optimized* configurations. + +## Generic recommendation +If using `NewBloomFilterPolicy(bpk)` for a large persistent DB using +compression, try using `NewRibbonFilterPolicy(bpk)` instead, which +will generate Ribbon filters during compaction and Bloom filters +for flush, both with the same FP rate as the old setting. Once new SST +files are generated under the new policy, this should free up some +memory for more caching without much effect on burst or sustained +write speed. Both kinds of filters can be read under either policy, so +there's always an option to adjust settings or gracefully roll back to +using Bloom filter only (keeping in mind that SST files must be +replaced to see effect of that change). 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_top-level/support.md mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_top-level/support.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/docs/_top-level/support.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/docs/_top-level/support.md 2025-05-19 16:14:27.000000000 +0000 @@ -19,4 +19,4 @@ ### FAQ -Check out a list of [commonly asked questions](/docs/support/faq) about RocksDB. +Check out a list of [commonly asked questions](https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ) about RocksDB. Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_raw_sampled.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_sst_blocks.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/dictcmp/dictcmp_zstd_trained.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png and 
/srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/arch_diagram.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png and 
/srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/insert_flow.png differ Binary files /srv/release.debian.org/tmp/lqSoMGBIg0/mariadb-10.11.11/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png and /srv/release.debian.org/tmp/EHlq1RjOhk/mariadb-10.11.13/storage/rocksdb/rocksdb/docs/static/images/rocksdb-secondary-cache/lookup_flow.png differ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,464 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "env/composite_env_wrapper.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// The CompositeEnvWrapper class provides an interface that is compatible +// with the old monolithic Env API, and an implementation that wraps around +// the new Env that provides threading and other OS related functionality, and +// the new FileSystem API that provides storage functionality. By +// providing the old Env interface, it allows the rest of RocksDB code to +// be agnostic of whether the underlying Env implementation is a monolithic +// Env or an Env + FileSystem. In the former case, the user will specify +// Options::env only, whereas in the latter case, the user will specify +// Options::env and Options::file_system. 
+ +class CompositeSequentialFileWrapper : public SequentialFile { + public: + explicit CompositeSequentialFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(n, io_opts, result, scratch, &dbg); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomAccessFileWrapper : public RandomAccessFile { + public: + explicit CompositeRandomAccessFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + IOOptions io_opts; + IODebugContext dbg; + std::vector fs_reqs; + Status status; + + fs_reqs.resize(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].offset = reqs[i].offset; + fs_reqs[i].len = reqs[i].len; + fs_reqs[i].scratch = reqs[i].scratch; + fs_reqs[i].status = IOStatus::OK(); + } + status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].result = fs_reqs[i].result; + reqs[i].status = fs_reqs[i].status; + } + return status; + } + Status Prefetch(uint64_t offset, 
size_t n) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Prefetch(offset, n, io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((FSRandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + std::unique_ptr target_; +}; + +class CompositeWritableFileWrapper : public WritableFile { + public: + explicit CompositeWritableFileWrapper(std::unique_ptr& t) + : target_(std::move(t)) {} + + Status Append(const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, &dbg); + } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, verification_info, &dbg); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, &dbg); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, verification_info, + &dbg); + } + Status Truncate(uint64_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Truncate(size, io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return 
target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->GetFileSize(io_opts, &dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->RangeSync(offset, nbytes, io_opts, &dbg); + } + + void PrepareWrite(size_t offset, size_t len) override { + IOOptions io_opts; + IODebugContext dbg; + target_->PrepareWrite(offset, len, io_opts, &dbg); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Allocate(offset, len, io_opts, &dbg); + } + + std::unique_ptr* target() { return &target_; } + + private: + std::unique_ptr target_; +}; + +class 
CompositeRandomRWFileWrapper : public RandomRWFile { + public: + explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Write(offset, data, io_opts, &dbg); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeDirectoryWrapper : public Directory { + public: + explicit CompositeDirectoryWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->FsyncWithDirOptions(io_opts, &dbg, DirFsyncOptions()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; +} // namespace + +Status CompositeEnv::NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + 
r->reset(new CompositeSequentialFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeRandomAccessFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewWritableFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file, + &dbg); + if (status.ok()) { + result->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReuseWritableFile(fname, old_fname, + FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); + if (status.ok()) { + result->reset(new CompositeRandomRWFileWrapper(file)); + } + return status; +} + +Status 
CompositeEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + IOOptions io_opts; + IODebugContext dbg; + std::unique_ptr dir; + Status status; + status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); + if (status.ok()) { + result->reset(new CompositeDirectoryWrapper(dir)); + } + return status; +} + +namespace { +static std::unordered_map + composite_env_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + {0, OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kDontSerialize | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto target = static_cast(addr); + return Env::CreateFromString(opts, value, &(target->env), + &(target->guard)); + }, + nullptr, nullptr}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map + composite_fs_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"file_system", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map + composite_clock_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"clock", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +} // namespace + +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs) { + return std::unique_ptr(new CompositeEnvWrapper(Env::Default(), fs)); +} + +CompositeEnvWrapper::CompositeEnvWrapper(Env* env, + const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), target_(env) { + RegisterOptions("", &target_, &composite_env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info); +} + +CompositeEnvWrapper::CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), 
target_(env) { + RegisterOptions("", &target_, &composite_env_wrapper_type_info); + RegisterOptions("", &file_system_, &composite_fs_wrapper_type_info); + RegisterOptions("", &system_clock_, &composite_clock_wrapper_type_info); +} + +Status CompositeEnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + if (file_system_ == nullptr) { + file_system_ = target_.env->GetFileSystem(); + } + if (system_clock_ == nullptr) { + system_clock_ = target_.env->GetSystemClock(); + } + return Env::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string CompositeEnvWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto options = CompositeEnv::SerializeOptions(config_options, header); + if (target_.env != nullptr && target_.env != Env::Default()) { + options.append("target="); + options.append(target_.env->ToString(config_options)); + } + return options; +} +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/composite_env_wrapper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/composite_env_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,1111 +7,366 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" -namespace ROCKSDB_NAMESPACE { - -// The CompositeEnvWrapper class provides an interface that is compatible -// with the old monolithic Env API, and an implementation that wraps around -// the new Env that provides threading and other OS related functionality, and -// the new FileSystem API that provides storage functionality. By -// providing the old Env interface, it allows the rest of RocksDB code to -// be agnostic of whether the underlying Env implementation is a monolithic -// Env or an Env + FileSystem. 
In the former case, the user will specify -// Options::env only, whereas in the latter case, the user will specify -// Options::env and Options::file_system. - -inline IOStatus status_to_io_status(Status&& status) { - if (status.ok()) { - // Fast path - return IOStatus::OK(); - } else { - const char* state = status.getState(); - if (state) { - return IOStatus(status.code(), status.subcode(), - Slice(state, strlen(status.getState()) + 1), - Slice()); - } else { - return IOStatus(status.code(), status.subcode()); - } - } -} - -class CompositeSequentialFileWrapper : public SequentialFile { - public: - explicit CompositeSequentialFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(size_t n, Slice* result, char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(n, io_opts, result, scratch, &dbg); - } - Status Skip(uint64_t n) override { return target_->Skip(n); } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeRandomAccessFileWrapper : public RandomAccessFile { - public: - explicit CompositeRandomAccessFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { - IOOptions io_opts; - 
IODebugContext dbg; - std::vector fs_reqs; - Status status; - - fs_reqs.resize(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].offset = reqs[i].offset; - fs_reqs[i].len = reqs[i].len; - fs_reqs[i].scratch = reqs[i].scratch; - fs_reqs[i].status = IOStatus::OK(); - } - status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); - for (size_t i = 0; i < num_reqs; ++i) { - reqs[i].result = fs_reqs[i].result; - reqs[i].status = fs_reqs[i].status; - } - return status; - } - Status Prefetch(uint64_t offset, size_t n) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Prefetch(offset, n, io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((FSRandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - private: - std::unique_ptr target_; -}; - -class CompositeWritableFileWrapper : public WritableFile { - public: - explicit CompositeWritableFileWrapper(std::unique_ptr& t) - : target_(std::move(t)) {} - - Status Append(const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Append(data, io_opts, &dbg); - } - Status PositionedAppend(const Slice& data, uint64_t offset) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedAppend(data, offset, io_opts, &dbg); - } - Status Truncate(uint64_t size) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Truncate(size, io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - Status Flush() override { 
- IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->GetFileSize(io_opts, &dbg); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } - - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->RangeSync(offset, nbytes, io_opts, &dbg); - } - - void PrepareWrite(size_t offset, size_t len) override { - IOOptions io_opts; - IODebugContext dbg; - target_->PrepareWrite(offset, len, io_opts, &dbg); - } - - Status Allocate(uint64_t offset, uint64_t len) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Allocate(offset, len, io_opts, &dbg); - } - - std::unique_ptr* target() { return &target_; } - - 
private: - std::unique_ptr target_; -}; - -class CompositeRandomRWFileWrapper : public RandomRWFile { - public: - explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status Write(uint64_t offset, const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Write(offset, data, io_opts, &dbg); - } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status Flush() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeDirectoryWrapper : public Directory { - public: - explicit CompositeDirectoryWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif - private: - std::unique_ptr target_; -}; +namespace ROCKSDB_NAMESPACE { -class CompositeEnvWrapper : public Env { +class CompositeEnv : public Env { public: // Initialize a CompositeEnvWrapper that delegates all 
thread/time related // calls to env, and all file operations to fs - explicit CompositeEnvWrapper(Env* env, FileSystem* fs) - : env_target_(env), fs_env_target_(fs) {} - ~CompositeEnvWrapper() {} + explicit CompositeEnv(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : Env(fs, clock) {} - // Return the target to which this Env forwards all calls - Env* env_target() const { return env_target_; } - - FileSystem* fs_env_target() const { return fs_env_target_; } + Status RegisterDbPaths(const std::vector& paths) override { + return file_system_->RegisterDbPaths(paths); + } + Status UnregisterDbPaths(const std::vector& paths) override { + return file_system_->UnregisterDbPaths(paths); + } // The following text is boilerplate that forwards all methods to target() Status NewSequentialFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - fs_env_target_->NewSequentialFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeSequentialFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = fs_env_target_->NewRandomAccessFile(f, FileOptions(options), &file, - &dbg); - if (status.ok()) { - r->reset(new CompositeRandomAccessFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - fs_env_target_->NewWritableFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status 
ReopenWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = fs_env_target_->ReopenWritableFile(fname, FileOptions(options), - &file, &dbg); - if (status.ok()) { - result->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = fs_env_target_->ReuseWritableFile( - fname, old_fname, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewRandomRWFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = fs_env_target_->NewRandomRWFile(fname, FileOptions(options), &file, - &dbg); - if (status.ok()) { - result->reset(new CompositeRandomRWFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override { - return fs_env_target_->NewMemoryMappedFileBuffer(fname, result); + return file_system_->NewMemoryMappedFileBuffer(fname, result); } + Status NewDirectory(const std::string& name, - std::unique_ptr* result) override { - IOOptions io_opts; - IODebugContext dbg; - std::unique_ptr dir; - Status status; - status = fs_env_target_->NewDirectory(name, io_opts, &dir, &dbg); - if (status.ok()) { - result->reset(new CompositeDirectoryWrapper(dir)); - } - return status; - } + std::unique_ptr* result) override; + Status FileExists(const std::string& f) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->FileExists(f, 
io_opts, &dbg); + return file_system_->FileExists(f, io_opts, &dbg); } Status GetChildren(const std::string& dir, std::vector* r) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetChildren(dir, io_opts, r, &dbg); + return file_system_->GetChildren(dir, io_opts, r, &dbg); } Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetChildrenFileAttributes(dir, io_opts, result, - &dbg); + return file_system_->GetChildrenFileAttributes(dir, io_opts, result, &dbg); } Status DeleteFile(const std::string& f) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->DeleteFile(f, io_opts, &dbg); + return file_system_->DeleteFile(f, io_opts, &dbg); } Status Truncate(const std::string& fname, size_t size) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->Truncate(fname, size, io_opts, &dbg); + return file_system_->Truncate(fname, size, io_opts, &dbg); } Status CreateDir(const std::string& d) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->CreateDir(d, io_opts, &dbg); + return file_system_->CreateDir(d, io_opts, &dbg); } Status CreateDirIfMissing(const std::string& d) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->CreateDirIfMissing(d, io_opts, &dbg); + return file_system_->CreateDirIfMissing(d, io_opts, &dbg); } Status DeleteDir(const std::string& d) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->DeleteDir(d, io_opts, &dbg); + return file_system_->DeleteDir(d, io_opts, &dbg); } Status GetFileSize(const std::string& f, uint64_t* s) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetFileSize(f, io_opts, s, &dbg); + return file_system_->GetFileSize(f, io_opts, s, &dbg); } Status GetFileModificationTime(const std::string& fname, uint64_t* file_mtime) override { IOOptions io_opts; IODebugContext dbg; - 
return fs_env_target_->GetFileModificationTime(fname, io_opts, file_mtime, - &dbg); + return file_system_->GetFileModificationTime(fname, io_opts, file_mtime, + &dbg); } Status RenameFile(const std::string& s, const std::string& t) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->RenameFile(s, t, io_opts, &dbg); + return file_system_->RenameFile(s, t, io_opts, &dbg); } Status LinkFile(const std::string& s, const std::string& t) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->LinkFile(s, t, io_opts, &dbg); + return file_system_->LinkFile(s, t, io_opts, &dbg); } Status NumFileLinks(const std::string& fname, uint64_t* count) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->NumFileLinks(fname, io_opts, count, &dbg); + return file_system_->NumFileLinks(fname, io_opts, count, &dbg); } Status AreFilesSame(const std::string& first, const std::string& second, bool* res) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->AreFilesSame(first, second, io_opts, res, &dbg); + return file_system_->AreFilesSame(first, second, io_opts, res, &dbg); } Status LockFile(const std::string& f, FileLock** l) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->LockFile(f, io_opts, l, &dbg); + return file_system_->LockFile(f, io_opts, l, &dbg); } Status UnlockFile(FileLock* l) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->UnlockFile(l, io_opts, &dbg); + return file_system_->UnlockFile(l, io_opts, &dbg); } Status GetAbsolutePath(const std::string& db_path, std::string* output_path) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetAbsolutePath(db_path, io_opts, output_path, &dbg); + return file_system_->GetAbsolutePath(db_path, io_opts, output_path, &dbg); } -#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) - Status LoadLibrary(const std::string& lib_name, - const std::string& search_path, - 
std::shared_ptr* result) override { - return env_target_->LoadLibrary(lib_name, search_path, result); - } -#endif - - void Schedule(void (*f)(void* arg), void* a, Priority pri, - void* tag = nullptr, void (*u)(void* arg) = nullptr) override { - return env_target_->Schedule(f, a, pri, tag, u); - } - - int UnSchedule(void* tag, Priority pri) override { - return env_target_->UnSchedule(tag, pri); - } - - void StartThread(void (*f)(void*), void* a) override { - return env_target_->StartThread(f, a); - } - void WaitForJoin() override { return env_target_->WaitForJoin(); } - unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { - return env_target_->GetThreadPoolQueueLen(pri); - } - Status GetTestDirectory(std::string* path) override { - return env_target_->GetTestDirectory(path); - } Status NewLogger(const std::string& fname, std::shared_ptr* result) override { - return env_target_->NewLogger(fname, result); - } - uint64_t NowMicros() override { return env_target_->NowMicros(); } - uint64_t NowNanos() override { return env_target_->NowNanos(); } - uint64_t NowCPUNanos() override { return env_target_->NowCPUNanos(); } - - void SleepForMicroseconds(int micros) override { - env_target_->SleepForMicroseconds(micros); - } - Status GetHostName(char* name, uint64_t len) override { - return env_target_->GetHostName(name, len); - } - Status GetCurrentTime(int64_t* unix_time) override { - return env_target_->GetCurrentTime(unix_time); - } - void SetBackgroundThreads(int num, Priority pri) override { - return env_target_->SetBackgroundThreads(num, pri); - } - int GetBackgroundThreads(Priority pri) override { - return env_target_->GetBackgroundThreads(pri); - } - - Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { - return env_target_->SetAllowNonOwnerAccess(allow_non_owner_access); - } - - void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - return env_target_->IncBackgroundThreadsIfNeeded(num, pri); - } - - void 
LowerThreadPoolIOPriority(Priority pool = LOW) override { - env_target_->LowerThreadPoolIOPriority(pool); - } - - void LowerThreadPoolCPUPriority(Priority pool = LOW) override { - env_target_->LowerThreadPoolCPUPriority(pool); - } - - std::string TimeToString(uint64_t time) override { - return env_target_->TimeToString(time); - } - - Status GetThreadList(std::vector* thread_list) override { - return env_target_->GetThreadList(thread_list); + IOOptions io_opts; + IODebugContext dbg; + return file_system_->NewLogger(fname, io_opts, result, &dbg); } - ThreadStatusUpdater* GetThreadStatusUpdater() const override { - return env_target_->GetThreadStatusUpdater(); + Status IsDirectory(const std::string& path, bool* is_dir) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->IsDirectory(path, io_opts, is_dir, &dbg); } - uint64_t GetThreadID() const override { return env_target_->GetThreadID(); } - - std::string GenerateUniqueId() override { - return env_target_->GenerateUniqueId(); + Status GetTestDirectory(std::string* path) override { + IOOptions io_opts; + IODebugContext dbg; + return file_system_->GetTestDirectory(io_opts, path, &dbg); } EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { - return fs_env_target_->OptimizeForLogRead(FileOptions(env_options)); + return file_system_->OptimizeForLogRead(FileOptions(env_options)); } + EnvOptions OptimizeForManifestRead( const EnvOptions& env_options) const override { - return fs_env_target_->OptimizeForManifestRead( - FileOptions(env_options)); + return file_system_->OptimizeForManifestRead(FileOptions(env_options)); } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const override { - return fs_env_target_->OptimizeForLogWrite(FileOptions(env_options), - db_options); + return file_system_->OptimizeForLogWrite(FileOptions(env_options), + db_options); } + EnvOptions OptimizeForManifestWrite( const EnvOptions& env_options) const 
override { - return fs_env_target_->OptimizeForManifestWrite( - FileOptions(env_options)); + return file_system_->OptimizeForManifestWrite(FileOptions(env_options)); } + EnvOptions OptimizeForCompactionTableWrite( const EnvOptions& env_options, const ImmutableDBOptions& immutable_ops) const override { - return fs_env_target_->OptimizeForCompactionTableWrite( - FileOptions(env_options), - immutable_ops); + return file_system_->OptimizeForCompactionTableWrite( + FileOptions(env_options), immutable_ops); } EnvOptions OptimizeForCompactionTableRead( const EnvOptions& env_options, const ImmutableDBOptions& db_options) const override { - return fs_env_target_->OptimizeForCompactionTableRead( - FileOptions(env_options), - db_options); + return file_system_->OptimizeForCompactionTableRead( + FileOptions(env_options), db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForBlobFileRead(FileOptions(env_options), + db_options); } + // This seems to clash with a macro on Windows, so #undef it here +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { IOOptions io_opts; IODebugContext dbg; - return fs_env_target_->GetFreeSpace(path, io_opts, diskfree, &dbg); + return file_system_->GetFreeSpace(path, io_opts, diskfree, &dbg); } + uint64_t NowMicros() override { return system_clock_->NowMicros(); } + uint64_t NowNanos() override { return system_clock_->NowNanos(); } - private: - Env* env_target_; - FileSystem* fs_env_target_; -}; + uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); } -class LegacySequentialFileWrapper : public FSSequentialFile { - public: - explicit LegacySequentialFileWrapper( - std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) 
override { - return status_to_io_status(target_->Read(n, result, scratch)); - } - IOStatus Skip(uint64_t n) override { - return status_to_io_status(target_->Skip(n)); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } - IOStatus PositionedRead(uint64_t offset, size_t n, - const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->PositionedRead(offset, n, result, scratch)); + void SleepForMicroseconds(int micros) override { + system_clock_->SleepForMicroseconds(micros); } - SequentialFile* target() { return target_.get(); } - private: - std::unique_ptr target_; -}; - -class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { - public: - explicit LegacyRandomAccessFileWrapper( - std::unique_ptr&& target) - : target_(std::move(target)) {} - - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); - } - IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - std::vector reqs; - Status status; - - reqs.reserve(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - ReadRequest req; - - req.offset = fs_reqs[i].offset; - req.len = fs_reqs[i].len; - req.scratch = fs_reqs[i].scratch; - req.status = Status::OK(); - - reqs.emplace_back(req); - } - status = target_->MultiRead(reqs.data(), num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].result = reqs[i].result; - fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); - } - return 
status_to_io_status(std::move(status)); - ; - } - IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Prefetch(offset, n)); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((RandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); + Status GetCurrentTime(int64_t* unix_time) override { + return system_clock_->GetCurrentTime(unix_time); } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); + std::string TimeToString(uint64_t time) override { + return system_clock_->TimeToString(time); } - - private: - std::unique_ptr target_; }; -class LegacyWritableFileWrapper : public FSWritableFile { +class CompositeEnvWrapper : public CompositeEnv { public: - explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Append(const Slice& data, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Append(data)); - } - IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->PositionedAppend(data, offset)); - } - IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(size)); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - 
return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return target_->GetFileSize(); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnvWrapper(Env* env) + : CompositeEnvWrapper(env, env->GetFileSystem(), env->GetSystemClock()) {} + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& sc); + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const 
std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} + + explicit CompositeEnvWrapper(const std::shared_ptr& env, + const std::shared_ptr& fs, + const std::shared_ptr& sc); + + static const char* kClassName() { return "CompositeEnv"; } + const char* Name() const override { return kClassName(); } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return CompositeEnv::IsInstanceOf(name); + } } + const Customizable* Inner() const override { return target_.env; } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE - IOStatus RangeSync(uint64_t offset, uint64_t nbytes, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RangeSync(offset, nbytes)); - } + // Return the target to which this Env forwards all calls + Env* env_target() const { return target_.env; } - void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - target_->PrepareWrite(offset, len); +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_.env->LoadLibrary(lib_name, search_path, result); } +#endif - IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Allocate(offset, len)); + void Schedule(void 
(*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return target_.env->Schedule(f, a, pri, tag, u); } - WritableFile* target() { return target_.get(); } - - private: - std::unique_ptr target_; -}; - -class LegacyRandomRWFileWrapper : public FSRandomRWFile { - public: - explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus Write(uint64_t offset, const Slice& data, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Write(offset, data)); - } - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); + int UnSchedule(void* tag, Priority pri) override { + return target_.env->UnSchedule(tag, pri); } - private: - std::unique_ptr target_; -}; - -class LegacyDirectoryWrapper : public FSDirectory { - public: - explicit LegacyDirectoryWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); + 
void StartThread(void (*f)(void*), void* a) override { + return target_.env->StartThread(f, a); } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); + void WaitForJoin() override { return target_.env->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return target_.env->GetThreadPoolQueueLen(pri); } - private: - std::unique_ptr target_; -}; - -class LegacyFileSystemWrapper : public FileSystem { - public: - // Initialize an EnvWrapper that delegates all calls to *t - explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} - ~LegacyFileSystemWrapper() override {} - - const char* Name() const override { return "Legacy File System"; } - - // Return the target to which this Env forwards all calls - Env* target() const { return target_; } - - // The following text is boilerplate that forwards all methods to target() - IOStatus NewSequentialFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewSequentialFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacySequentialFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomAccessFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomAccessFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewWritableFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } 
- IOStatus ReopenWritableFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->ReopenWritableFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomRWFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomRWFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyRandomRWFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) override { - return status_to_io_status( - target_->NewMemoryMappedFileBuffer(fname, result)); - } - IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr dir; - Status s = target_->NewDirectory(name, &dir); - if (s.ok()) { - result->reset(new LegacyDirectoryWrapper(std::move(dir))); - } - return status_to_io_status(std::move(s)); - } - IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->FileExists(f)); - } - IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, - std::vector* r, - IODebugContext* /*dbg*/) override { - 
return status_to_io_status(target_->GetChildren(dir, r)); - } - IOStatus GetChildrenFileAttributes(const std::string& dir, - const IOOptions& /*options*/, - std::vector* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); - } - IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteFile(f)); - } - IOStatus Truncate(const std::string& fname, size_t size, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(fname, size)); - } - IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDir(d)); - } - IOStatus CreateDirIfMissing(const std::string& d, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDirIfMissing(d)); - } - IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteDir(d)); + Status GetHostName(char* name, uint64_t len) override { + return target_.env->GetHostName(name, len); } - IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, - uint64_t* s, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFileSize(f, s)); + void SetBackgroundThreads(int num, Priority pri) override { + return target_.env->SetBackgroundThreads(num, pri); } - - IOStatus GetFileModificationTime(const std::string& fname, - const IOOptions& /*options*/, - uint64_t* file_mtime, - IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->GetFileModificationTime(fname, file_mtime)); + int GetBackgroundThreads(Priority pri) override { + return target_.env->GetBackgroundThreads(pri); } - IOStatus GetAbsolutePath(const std::string& db_path, - 
const IOOptions& /*options*/, - std::string* output_path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access); } - IOStatus RenameFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RenameFile(s, t)); + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return target_.env->IncBackgroundThreadsIfNeeded(num, pri); } - IOStatus LinkFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LinkFile(s, t)); + void LowerThreadPoolIOPriority(Priority pool) override { + target_.env->LowerThreadPoolIOPriority(pool); } - IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, - uint64_t* count, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NumFileLinks(fname, count)); + void LowerThreadPoolCPUPriority(Priority pool) override { + target_.env->LowerThreadPoolCPUPriority(pool); } - IOStatus AreFilesSame(const std::string& first, const std::string& second, - const IOOptions& /*options*/, bool* res, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->AreFilesSame(first, second, res)); + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return target_.env->LowerThreadPoolCPUPriority(pool, pri); } - IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, - FileLock** l, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LockFile(f, l)); + Status GetThreadList(std::vector* thread_list) override { + return target_.env->GetThreadList(thread_list); } - IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, - 
IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->UnlockFile(l)); + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_.env->GetThreadStatusUpdater(); } - IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetTestDirectory(path)); - } - IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, - std::shared_ptr* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NewLogger(fname, result)); - } + uint64_t GetThreadID() const override { return target_.env->GetThreadID(); } - FileOptions OptimizeForLogRead( - const FileOptions& file_options) const override { - return target_->OptimizeForLogRead(file_options); - } - FileOptions OptimizeForManifestRead( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestRead(file_options); - } - FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { - return target_->OptimizeForLogWrite(file_options, db_options); - } - FileOptions OptimizeForManifestWrite( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestWrite(file_options); - } - FileOptions OptimizeForCompactionTableWrite( - const FileOptions& file_options, - const ImmutableDBOptions& immutable_ops) const override { - return target_->OptimizeForCompactionTableWrite(file_options, - immutable_ops); - } - FileOptions OptimizeForCompactionTableRead( - const FileOptions& file_options, - const ImmutableDBOptions& db_options) const override { - return target_->OptimizeForCompactionTableRead(file_options, db_options); - } - IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, - uint64_t* diskfree, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + std::string GenerateUniqueId() 
override { + return target_.env->GenerateUniqueId(); } private: - Env* target_; + EnvWrapper::Target target_; }; - -inline std::unique_ptr NewLegacySequentialFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacySequentialFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyRandomAccessFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacyRandomAccessFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyWritableFileWrapper( - std::unique_ptr&& file) { - return std::unique_ptr( - new LegacyWritableFileWrapper(std::move(file))); -} - } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/emulated_clock.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/emulated_clock.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/emulated_clock.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/emulated_clock.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { +// A SystemClock that can "mock" sleep and counts its operations. 
+class EmulatedSystemClock : public SystemClockWrapper { + private: + // Something to return when mocking current time + const int64_t maybe_starting_time_; + std::atomic sleep_counter_{0}; + std::atomic cpu_counter_{0}; + std::atomic addon_microseconds_{0}; + // Do not modify in the env of a running DB (could cause deadlock) + std::atomic time_elapse_only_sleep_; + bool no_slowdown_; + + public: + explicit EmulatedSystemClock(const std::shared_ptr& base, + bool time_elapse_only_sleep = false); + + static const char* kClassName() { return "TimeEmulatedSystemClock"; } + const char* Name() const override { return kClassName(); } + + virtual void SleepForMicroseconds(int micros) override { + sleep_counter_++; + if (no_slowdown_ || time_elapse_only_sleep_) { + addon_microseconds_.fetch_add(micros); + } + if (!no_slowdown_) { + SystemClockWrapper::SleepForMicroseconds(micros); + } + } + + void MockSleepForMicroseconds(int64_t micros) { + sleep_counter_++; + assert(no_slowdown_); + addon_microseconds_.fetch_add(micros); + } + + void MockSleepForSeconds(int64_t seconds) { + sleep_counter_++; + assert(no_slowdown_); + addon_microseconds_.fetch_add(seconds * 1000000); + } + + void SetTimeElapseOnlySleep(bool enabled) { + // We cannot set these before destroying the last DB because they might + // cause a deadlock or similar without the appropriate options set in + // the DB. 
+ time_elapse_only_sleep_ = enabled; + no_slowdown_ = enabled; + } + + bool IsTimeElapseOnlySleep() const { return time_elapse_only_sleep_.load(); } + void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; } + bool IsMockSleepEnabled() const { return no_slowdown_; } + + int GetSleepCounter() const { return sleep_counter_.load(); } + + virtual Status GetCurrentTime(int64_t* unix_time) override { + Status s; + if (time_elapse_only_sleep_) { + *unix_time = maybe_starting_time_; + } else { + s = SystemClockWrapper::GetCurrentTime(unix_time); + } + if (s.ok()) { + // mock microseconds elapsed to seconds of time + *unix_time += addon_microseconds_.load() / 1000000; + } + return s; + } + + virtual uint64_t CPUNanos() override { + cpu_counter_++; + return SystemClockWrapper::CPUNanos(); + } + + virtual uint64_t CPUMicros() override { + cpu_counter_++; + return SystemClockWrapper::CPUMicros(); + } + + virtual uint64_t NowNanos() override { + return (time_elapse_only_sleep_ ? 0 : SystemClockWrapper::NowNanos()) + + addon_microseconds_.load() * 1000; + } + + virtual uint64_t NowMicros() override { + return (time_elapse_only_sleep_ ? 
0 : SystemClockWrapper::NowMicros()) + + addon_microseconds_.load(); + } + + int GetCpuCounter() const { return cpu_counter_.load(); } + + void ResetCounters() { + cpu_counter_.store(0); + sleep_counter_.store(0); + } +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,17 +10,625 @@ #include "rocksdb/env.h" #include + #include "env/composite_env_wrapper.h" +#include "env/emulated_clock.h" +#include "env/mock_env.h" +#include "env/unique_id_gen.h" #include "logging/env_logger.h" #include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" -#include "port/sys_time.h" +#include "rocksdb/convenience.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "util/autovector.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +namespace { +#ifndef ROCKSDB_LITE +static int RegisterBuiltinEnvs(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory(MockEnv::kClassName(), [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(MockEnv::Create(Env::Default())); + return guard->get(); + }); + library.AddFactory( + CompositeEnvWrapper::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CompositeEnvWrapper(Env::Default())); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +static void RegisterSystemEnvs() { +#ifndef ROCKSDB_LITE + static std::once_flag loaded; + 
std::call_once(loaded, [&]() { + RegisterBuiltinEnvs(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE +} + +class LegacySystemClock : public SystemClock { + private: + Env* env_; + + public: + explicit LegacySystemClock(Env* env) : env_(env) {} + const char* Name() const override { return "LegacySystemClock"; } + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + uint64_t NowMicros() override { return env_->NowMicros(); } + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + uint64_t NowNanos() override { return env_->NowNanos(); } + + uint64_t CPUMicros() override { return CPUNanos() / 1000; } + uint64_t CPUNanos() override { return env_->NowCPUNanos(); } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + void SleepForMicroseconds(int micros) override { + env_->SleepForMicroseconds(micros); + } + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + Status GetCurrentTime(int64_t* unix_time) override { + return env_->GetCurrentTime(unix_time); + } + // Converts seconds-since-Jan-01-1970 to a printable string + std::string TimeToString(uint64_t time) override { + return env_->TimeToString(time); + } + +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& /*config_options*/, + const std::string& /*prefix*/) const override { + // We do not want the LegacySystemClock to appear in the serialized output. + // This clock is an internal class for those who do not implement one and + // would be part of the Env. 
As such, do not serialize it here. + return ""; + } +#endif // ROCKSDB_LITE +}; + +class LegacySequentialFileWrapper : public FSSequentialFile { + public: + explicit LegacySequentialFileWrapper( + std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Read(n, result, scratch)); + } + IOStatus Skip(uint64_t n) override { + return status_to_io_status(target_->Skip(n)); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + IOStatus PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->PositionedRead(offset, n, result, scratch)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { + public: + explicit LegacyRandomAccessFileWrapper( + std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + + IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + std::vector reqs; + Status status; + + reqs.reserve(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest req; + + req.offset = fs_reqs[i].offset; + req.len = fs_reqs[i].len; + req.scratch = fs_reqs[i].scratch; + req.status = Status::OK(); + + reqs.emplace_back(req); + } + status 
= target_->MultiRead(reqs.data(), num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].result = reqs[i].result; + fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); + } + return status_to_io_status(std::move(status)); + } + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Prefetch(offset, n)); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((RandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomRWFileWrapper : public FSRandomRWFile { + public: + explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Write(offset, data)); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* 
/*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + + private: + std::unique_ptr target_; +}; + +class LegacyWritableFileWrapper : public FSWritableFile { + public: + explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(size)); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus 
Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return target_->GetFileSize(); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RangeSync(offset, nbytes)); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + target_->PrepareWrite(offset, len); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Allocate(offset, len)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyDirectoryWrapper : public FSDirectory { + public: + explicit LegacyDirectoryWrapper(std::unique_ptr&& target) + : 
target_(std::move(target)) {} + + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; + +class LegacyFileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} + ~LegacyFileSystemWrapper() override {} + + static const char* kClassName() { return "LegacyFileSystem"; } + const char* Name() const override { return kClassName(); } + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewSequentialFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacySequentialFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomAccessFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewWritableFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReopenWritableFile(const std::string& fname, + 
const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReopenWritableFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomRWFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyRandomRWFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return status_to_io_status( + target_->NewMemoryMappedFileBuffer(fname, result)); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr dir; + Status s = target_->NewDirectory(name, &dir); + if (s.ok()) { + result->reset(new LegacyDirectoryWrapper(std::move(dir))); + } + return status_to_io_status(std::move(s)); + } + IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->FileExists(f)); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, + std::vector* r, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildren(dir, r)); + 
} + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteFile(f)); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(fname, size)); + } + IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDir(d)); + } + IOStatus CreateDirIfMissing(const std::string& d, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDirIfMissing(d)); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteDir(d)); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFileSize(f, s)); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->GetFileModificationTime(fname, file_mtime)); + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RenameFile(s, t)); + } 
+ + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LinkFile(s, t)); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, + uint64_t* count, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NumFileLinks(fname, count)); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& /*options*/, bool* res, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->AreFilesSame(first, second, res)); + } + + IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, + FileLock** l, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LockFile(f, l)); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->UnlockFile(l)); + } + + IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetTestDirectory(path)); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NewLogger(fname, result)); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeEnvOptions(opts); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions 
OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, + uint64_t* diskfree, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->IsDirectory(path, is_dir)); + } + +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& /*config_options*/, + const std::string& /*prefix*/) const override { + // We do not want the LegacyFileSystem to appear in the serialized output. + // This clock is an internal class for those who do not implement one and + // would be part of the Env. As such, do not serialize it here. 
+ return ""; + } +#endif // ROCKSDB_LITE + private: + Env* target_; +}; +} // end anonymous namespace + +Env::Env() : thread_status_updater_(nullptr) { + file_system_ = std::make_shared(this); + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs) + : thread_status_updater_(nullptr), file_system_(fs) { + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} Env::~Env() { } @@ -31,47 +639,99 @@ } Status Env::LoadEnv(const std::string& value, Env** result) { - Env* env = *result; - Status s; -#ifndef ROCKSDB_LITE - s = ObjectRegistry::NewInstance()->NewStaticObject(value, &env); -#else - s = Status::NotSupported("Cannot load environment in LITE mode: ", value); -#endif - if (s.ok()) { - *result = env; + return CreateFromString(ConfigOptions(), value, result); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result) { + Env* base = Env::Default(); + if (value.empty() || base->IsInstanceOf(value)) { + *result = base; + return Status::OK(); + } else { + RegisterSystemEnvs(); + Env* env = *result; + Status s = LoadStaticObject(config_options, value, nullptr, &env); + if (s.ok()) { + *result = env; + } + return s; } - return s; } Status Env::LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard) { + return CreateFromString(ConfigOptions(), value, result, guard); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard) { assert(result); - Status s; -#ifndef ROCKSDB_LITE - Env* env = nullptr; - std::unique_ptr uniq_guard; - std::string err_msg; assert(guard != nullptr); - env = ObjectRegistry::NewInstance()->NewObject(value, &uniq_guard, - &err_msg); - if (!env) { - s = Status::NotFound(std::string("Cannot load ") + Env::Type() + ": " + - value); - env = 
Env::Default(); - } - if (s.ok() && uniq_guard) { - guard->reset(uniq_guard.release()); - *result = guard->get(); - } else { - *result = env; + std::unique_ptr uniq; + + Env* env = *result; + std::string id; + std::unordered_map opt_map; + + Status status = + Customizable::GetOptionsMap(config_options, env, value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; } + Env* base = Env::Default(); + if (id.empty() || base->IsInstanceOf(id)) { + env = base; + status = Status::OK(); + } else { + RegisterSystemEnvs(); +#ifndef ROCKSDB_LITE + std::string errmsg; + env = config_options.registry->NewObject(id, &uniq, &errmsg); + if (!env) { + status = Status::NotSupported( + std::string("Cannot load environment[") + id + "]: ", errmsg); + } #else - (void)result; - (void)guard; - s = Status::NotSupported("Cannot load environment in LITE mode: ", value); + status = + Status::NotSupported("Cannot load environment in LITE mode", value); #endif - return s; + } + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, env, opt_map); + } + if (status.ok()) { + guard->reset(uniq.release()); + *result = env; + } + return status; +} + +Status Env::CreateFromUri(const ConfigOptions& config_options, + const std::string& env_uri, const std::string& fs_uri, + Env** result, std::shared_ptr* guard) { + *result = config_options.env; + if (env_uri.empty() && fs_uri.empty()) { + // Neither specified. Use the default + guard->reset(); + return Status::OK(); + } else if (!env_uri.empty() && !fs_uri.empty()) { + // Both specified. Cannot choose. Return Invalid + return Status::InvalidArgument("cannot specify both fs_uri and env_uri"); + } else if (fs_uri.empty()) { // Only have an ENV URI. 
Create an Env from it + return CreateFromString(config_options, env_uri, result, guard); + } else { + std::shared_ptr fs; + Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs); + if (s.ok()) { + guard->reset(new CompositeEnvWrapper(*result, fs)); + *result = guard->get(); + } + return s; + } } std::string Env::PriorityToString(Env::Priority priority) { @@ -132,6 +792,56 @@ return Status::OK(); } +Status Env::GetHostNameString(std::string* result) { + std::array hostname_buf{}; + Status s = GetHostName(hostname_buf.data(), hostname_buf.size()); + if (s.ok()) { + hostname_buf[hostname_buf.size() - 1] = '\0'; + result->assign(hostname_buf.data()); + } + return s; +} + +std::string Env::GenerateUniqueId() { + std::string result; + bool success = port::GenerateRfcUuid(&result); + if (!success) { + // Fall back on our own way of generating a unique ID and adapt it to + // RFC 4122 variant 1 version 4 (a random ID). + // https://en.wikipedia.org/wiki/Universally_unique_identifier + // We already tried GenerateRfcUuid so no need to try it again in + // GenerateRawUniqueId + constexpr bool exclude_port_uuid = true; + uint64_t upper, lower; + GenerateRawUniqueId(&upper, &lower, exclude_port_uuid); + + // Set 4-bit version to 4 + upper = (upper & (~uint64_t{0xf000})) | 0x4000; + // Set unary-encoded variant to 1 (0b10) + lower = (lower & (~(uint64_t{3} << 62))) | (uint64_t{2} << 62); + + // Use 36 character format of RFC 4122 + result.resize(36U); + char* buf = &result[0]; + PutBaseChars<16>(&buf, 8, upper >> 32, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper >> 16, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, upper, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 4, lower >> 48, /*!uppercase*/ false); + *(buf++) = '-'; + PutBaseChars<16>(&buf, 12, lower, /*!uppercase*/ false); + assert(buf == &result[36]); + + // Verify variant 1 version 4 + assert(result[14] == '4'); + assert(result[19] 
== '8' || result[19] == '9' || result[19] == 'a' || + result[19] == 'b'); + } + return result; +} + SequentialFile::~SequentialFile() { } @@ -200,6 +910,14 @@ kInfoLogLevelNames[log_level], format); Logv(new_format, ap); } + + if (log_level >= InfoLogLevel::WARN_LEVEL && + log_level != InfoLogLevel::HEADER_LEVEL) { + // Log messages with severity of warning or higher should be rare and are + // sometimes followed by an unclean crash. We want to be sure important + // messages are not lost in an application buffer when that happens. + Flush(); + } } static void Logv(const InfoLogLevel log_level, Logger *info_log, const char *format, va_list ap) { @@ -361,30 +1079,74 @@ Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, bool should_sync) { - std::unique_ptr file; - EnvOptions soptions; - Status s = env->NewWritableFile(fname, &file, soptions); - if (!s.ok()) { - return s; - } - s = file->Append(data); - if (s.ok() && should_sync) { - s = file->Sync(); - } - if (!s.ok()) { - env->DeleteFile(fname); - } - return s; + const auto& fs = env->GetFileSystem(); + return WriteStringToFile(fs.get(), data, fname, should_sync); } Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { - LegacyFileSystemWrapper lfsw(env); - return ReadFileToString(&lfsw, fname, data); + const auto& fs = env->GetFileSystem(); + return ReadFileToString(fs.get(), fname, data); +} + +namespace { +static std::unordered_map env_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + {0, OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kDontSerialize | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + EnvWrapper::Target* target = static_cast(addr); + return Env::CreateFromString(opts, value, &(target->env), + &(target->guard)); + }, + nullptr, nullptr}}, +#endif // ROCKSDB_LITE +}; +} // namespace + +EnvWrapper::EnvWrapper(Env* t) : 
target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(std::unique_ptr&& t) : target_(std::move(t)) { + RegisterOptions("", &target_, &env_wrapper_type_info); +} + +EnvWrapper::EnvWrapper(const std::shared_ptr& t) : target_(t) { + RegisterOptions("", &target_, &env_wrapper_type_info); } EnvWrapper::~EnvWrapper() { } +Status EnvWrapper::PrepareOptions(const ConfigOptions& options) { + target_.Prepare(); + return Env::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const { + auto parent = Env::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_.env == nullptr || + target_.env == Env::Default()) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_.env->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + namespace { // anonymous namespace void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) { @@ -445,6 +1207,12 @@ optimized_env_options.use_direct_reads = db_options.use_direct_reads; return optimized_env_options; } +EnvOptions Env::OptimizeForBlobFileRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} EnvOptions::EnvOptions(const DBOptions& options) { AssignEnvOptions(this, options); @@ -457,19 +1225,103 @@ Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result) { - EnvOptions options; + FileOptions options; // TODO: Tune the buffer size. 
options.writable_file_max_buffer_size = 1024 * 1024; - std::unique_ptr writable_file; - const auto status = env->NewWritableFile(fname, &writable_file, options); + std::unique_ptr writable_file; + const auto status = env->GetFileSystem()->NewWritableFile( + fname, options, &writable_file, nullptr); if (!status.ok()) { return status; } - *result = std::make_shared( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, options, - env); + *result = std::make_shared(std::move(writable_file), fname, + options, env); return Status::OK(); } +const std::shared_ptr& Env::GetFileSystem() const { + return file_system_; +} + +const std::shared_ptr& Env::GetSystemClock() const { + return system_clock_; +} +namespace { +static std::unordered_map sc_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +#endif // ROCKSDB_LITE +}; + +} // namespace +SystemClockWrapper::SystemClockWrapper(const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", &target_, &sc_wrapper_type_info); +} + +Status SystemClockWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = SystemClock::Default(); + } + return SystemClock::PrepareOptions(options); +} + +#ifndef ROCKSDB_LITE +std::string SystemClockWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = SystemClock::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(SystemClock::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + 
result.append("target=").append(target_->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinSystemClocks(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + EmulatedSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new EmulatedSystemClock(SystemClock::Default())); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status SystemClock::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + auto clock = SystemClock::Default(); + if (clock->IsInstanceOf(value)) { + *result = clock; + return Status::OK(); + } else { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinSystemClocks(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(config_options, value, nullptr, + result); + } +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_basic_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_basic_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_basic_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_basic_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,95 +4,122 @@ // // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include #include #include #include -#include #include "env/mock_env.h" +#include "file/file_util.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { +namespace { +using CreateEnvFunc = Env*(); -// Normalizes trivial differences across Envs such that these test cases can -// run on all Envs. 
-class NormalizingEnvWrapper : public EnvWrapper { - public: - explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} +// These functions are used to create the various environments under which this +// test can execute. These functions are used to allow the test cases to be +// created without the Env being initialized, thereby eliminating a potential +// static initialization fiasco/race condition when attempting to get a +// custom/configured env prior to main being invoked. + +static Env* GetDefaultEnv() { return Env::Default(); } + +static Env* GetMockEnv() { + static std::unique_ptr mock_env(MockEnv::Create(Env::Default())); + return mock_env.get(); +} +#ifndef ROCKSDB_LITE +static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) { + ConfigOptions config_opts; + config_opts.invoke_prepare_options = false; + + std::shared_ptr provider; + EXPECT_OK(EncryptionProvider::CreateFromString(config_opts, provider_id, + &provider)); + return NewEncryptedEnv(base, provider); +} + +static Env* GetCtrEncryptedEnv() { + static std::unique_ptr ctr_encrypt_env( + NewTestEncryptedEnv(Env::Default(), "CTR://test")); + return ctr_encrypt_env.get(); +} + +static Env* GetMemoryEnv() { + static std::unique_ptr mem_env(NewMemEnv(Env::Default())); + return mem_env.get(); +} - // Removes . and .. from directory listing - Status GetChildren(const std::string& dir, - std::vector* result) override { - Status status = EnvWrapper::GetChildren(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const std::string& s) { - return s == "." 
|| s == ".."; - }), - result->end()); +static Env* GetTestEnv() { + static std::shared_ptr env_guard; + static Env* custom_env = nullptr; + if (custom_env == nullptr) { + const char* uri = getenv("TEST_ENV_URI"); + if (uri != nullptr) { + EXPECT_OK(Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env, + &env_guard)); } - return status; } + EXPECT_NE(custom_env, nullptr); + return custom_env; +} - // Removes . and .. from directory listing - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - Status status = EnvWrapper::GetChildrenFileAttributes(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const FileAttributes& fa) { - return fa.name == "." || fa.name == ".."; - }), - result->end()); +static Env* GetTestFS() { + static std::shared_ptr fs_env_guard; + static Env* fs_env = nullptr; + if (fs_env == nullptr) { + const char* uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + EXPECT_OK( + Env::CreateFromUri(ConfigOptions(), uri, "", &fs_env, &fs_env_guard)); } - return status; } -}; + EXPECT_NE(fs_env, nullptr); + return fs_env; +} +#endif // ROCKSDB_LITE -class EnvBasicTestWithParam : public testing::Test, - public ::testing::WithParamInterface { +} // namespace +class EnvBasicTestWithParam + : public testing::Test, + public ::testing::WithParamInterface { public: Env* env_; const EnvOptions soptions_; std::string test_dir_; - EnvBasicTestWithParam() : env_(GetParam()) { + EnvBasicTestWithParam() : env_(GetParam()()) { test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); } - void SetUp() override { env_->CreateDirIfMissing(test_dir_); } + void SetUp() override { ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); } - void TearDown() override { - std::vector files; - env_->GetChildren(test_dir_, &files); - for (const auto& file : files) { - // don't know whether it's file or directory, try both. 
The tests must - // only create files or empty directories, so one must succeed, else the - // directory's corrupted. - Status s = env_->DeleteFile(test_dir_ + "/" + file); - if (!s.ok()) { - ASSERT_OK(env_->DeleteDir(test_dir_ + "/" + file)); - } - } - } + void TearDown() override { ASSERT_OK(DestroyDir(env_, test_dir_)); } }; class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; -static std::unique_ptr def_env(new NormalizingEnvWrapper(Env::Default())); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvBasicTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(&GetDefaultEnv)); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(&GetDefaultEnv)); -static std::unique_ptr mock_env(new MockEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, - ::testing::Values(mock_env.get())); + ::testing::Values(&GetMockEnv)); + #ifndef ROCKSDB_LITE -static std::unique_ptr mem_env(NewMemEnv(Env::Default())); +// next statements run env test against default encryption code. +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam, + ::testing::Values(&GetCtrEncryptedEnv)); +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam, + ::testing::Values(&GetCtrEncryptedEnv)); + INSTANTIATE_TEST_CASE_P(MemEnv, EnvBasicTestWithParam, - ::testing::Values(mem_env.get())); + ::testing::Values(&GetMemoryEnv)); namespace { @@ -101,20 +128,15 @@ // // The purpose of returning an empty vector (instead of nullptr) is that gtest // ValuesIn() will skip running tests when given an empty collection. 
-std::vector GetCustomEnvs() { - static Env* custom_env; - static bool init = false; - if (!init) { - init = true; - const char* uri = getenv("TEST_ENV_URI"); - if (uri != nullptr) { - Env::LoadEnv(uri, &custom_env); - } +std::vector GetCustomEnvs() { + std::vector res; + const char* uri = getenv("TEST_ENV_URI"); + if (uri != nullptr) { + res.push_back(&GetTestEnv); } - - std::vector res; - if (custom_env != nullptr) { - res.emplace_back(custom_env); + uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + res.push_back(&GetTestFS); } return res; } @@ -126,7 +148,6 @@ INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam, ::testing::ValuesIn(GetCustomEnvs())); - #endif // ROCKSDB_LITE TEST_P(EnvBasicTestWithParam, Basics) { @@ -190,19 +211,18 @@ soptions_) .ok()); ASSERT_TRUE(!seq_file); - ASSERT_TRUE(!env_->NewRandomAccessFile(test_dir_ + "/non_existent", - &rand_file, soptions_) - .ok()); + ASSERT_NOK(env_->NewRandomAccessFile(test_dir_ + "/non_existent", &rand_file, + soptions_)); ASSERT_TRUE(!rand_file); // Check that deleting works. 
- ASSERT_TRUE(!env_->DeleteFile(test_dir_ + "/non_existent").ok()); + ASSERT_NOK(env_->DeleteFile(test_dir_ + "/non_existent")); ASSERT_OK(env_->DeleteFile(test_dir_ + "/g")); ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g")); ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(0U, children.size()); - ASSERT_TRUE( - env_->GetChildren(test_dir_ + "/non_existent", &children).IsNotFound()); + Status s = env_->GetChildren(test_dir_ + "/non_existent", &children); + ASSERT_TRUE(s.IsNotFound()); } TEST_P(EnvBasicTestWithParam, ReadWrite) { @@ -298,7 +318,7 @@ ASSERT_OK(env_->CreateDir(test_dir_ + "/j")); ASSERT_OK(env_->FileExists(test_dir_ + "/j")); std::vector children; - env_->GetChildren(test_dir_, &children); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(1U, children.size()); // fail because file already exists ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok()); @@ -327,14 +347,14 @@ ASSERT_EQ(3U, children.size()); ASSERT_EQ(3U, childAttr.size()); for (auto each : children) { - env_->DeleteDir(test_dir_ + "/" + each); + env_->DeleteDir(test_dir_ + "/" + each).PermitUncheckedError(); } // necessary for default POSIX env // non-exist directory returns IOError ASSERT_OK(env_->DeleteDir(test_dir_)); - ASSERT_TRUE(!env_->FileExists(test_dir_).ok()); - ASSERT_TRUE(!env_->GetChildren(test_dir_, &children).ok()); - ASSERT_TRUE(!env_->GetChildrenFileAttributes(test_dir_, &childAttr).ok()); + ASSERT_NOK(env_->FileExists(test_dir_)); + ASSERT_NOK(env_->GetChildren(test_dir_, &children)); + ASSERT_NOK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); // if dir is a file, returns IOError ASSERT_OK(env_->CreateDir(test_dir_)); @@ -343,10 +363,36 @@ env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_)); ASSERT_OK(writable_file->Close()); writable_file.reset(); - ASSERT_TRUE(!env_->GetChildren(test_dir_ + "/file", &children).ok()); + ASSERT_NOK(env_->GetChildren(test_dir_ + "/file", &children)); ASSERT_EQ(0U, 
children.size()); } +TEST_P(EnvMoreTestWithParam, GetChildrenIgnoresDotAndDotDot) { + auto* env = Env::Default(); + ASSERT_OK(env->CreateDirIfMissing(test_dir_)); + + // Create a single file + std::string path = test_dir_; + const EnvOptions soptions; +#ifdef OS_WIN + path.append("\\test_file"); +#else + path.append("/test_file"); +#endif + std::string data("test data"); + std::unique_ptr file; + ASSERT_OK(env->NewWritableFile(path, &file, soptions)); + ASSERT_OK(file->Append("test data")); + + // get the children + std::vector result; + ASSERT_OK(env->GetChildren(test_dir_, &result)); + + // expect only one file named `test_data`, i.e. no `.` or `..` names + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result.at(0), "test_file"); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,28 +7,41 @@ #include "env/env_chroot.h" -#include -#include -#include -#include - -#include -#include -#include - -#include "rocksdb/status.h" +#include // errno +#include // realpath, free +#include // geteuid + +#include "env/composite_env_wrapper.h" +#include "env/fs_remap.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" // errnoStr namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map chroot_fs_type_info = { + {"chroot_dir", {0, OptionType::kString}}}; +} // namespace +ChrootFileSystem::ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir) + : RemapFileSystem(base), chroot_dir_(chroot_dir) { + RegisterOptions("chroot_dir", &chroot_dir_, &chroot_fs_type_info); +} -class ChrootEnv : public EnvWrapper { - public: - ChrootEnv(Env* 
base_env, const std::string& chroot_dir) - : EnvWrapper(base_env) { +Status ChrootFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystemWrapper::PrepareOptions(options); + if (!s.ok()) { + return s; + } else if (chroot_dir_.empty()) { + s = Status::InvalidArgument("ChRootFileSystem requires a chroot dir"); + } else { + s = target_->FileExists(chroot_dir_, IOOptions(), nullptr); + } + if (s.ok()) { #if defined(OS_AIX) char resolvedName[PATH_MAX]; - char* real_chroot_dir = realpath(chroot_dir.c_str(), resolvedName); + char* real_chroot_dir = realpath(chroot_dir_.c_str(), resolvedName); #else - char* real_chroot_dir = realpath(chroot_dir.c_str(), nullptr); + char* real_chroot_dir = realpath(chroot_dir_.c_str(), nullptr); #endif // chroot_dir must exist so realpath() returns non-nullptr. assert(real_chroot_dir != nullptr); @@ -37,231 +50,32 @@ free(real_chroot_dir); #endif } + return s; +} - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewSequentialFile(status_and_enc_path.second, result, - options); - } - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomAccessFile(status_and_enc_path.second, result, - options); - } - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewWritableFile(status_and_enc_path.second, result, - options); - } - - 
Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - auto status_and_old_enc_path = EncodePath(old_fname); - if (!status_and_old_enc_path.first.ok()) { - return status_and_old_enc_path.first; - } - return EnvWrapper::ReuseWritableFile(status_and_old_enc_path.second, - status_and_old_enc_path.second, result, - options); - } - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomRWFile(status_and_enc_path.second, result, - options); - } - - Status NewDirectory(const std::string& dir, - std::unique_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewDirectory(status_and_enc_path.second, result); - } - - Status FileExists(const std::string& fname) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::FileExists(status_and_enc_path.second); - } - - Status GetChildren(const std::string& dir, - std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetChildren(status_and_enc_path.second, result); - } - - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - 
return EnvWrapper::GetChildrenFileAttributes(status_and_enc_path.second, - result); - } - - Status DeleteFile(const std::string& fname) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteFile(status_and_enc_path.second); - } - - Status CreateDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDir(status_and_enc_path.second); - } - - Status CreateDirIfMissing(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDirIfMissing(status_and_enc_path.second); - } - - Status DeleteDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePath(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteDir(status_and_enc_path.second); - } - - Status GetFileSize(const std::string& fname, uint64_t* file_size) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileSize(status_and_enc_path.second, file_size); - } - - Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileModificationTime(status_and_enc_path.second, - file_mtime); - } - - Status RenameFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if (!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto 
status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::RenameFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LinkFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if (!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::LinkFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LockFile(const std::string& fname, FileLock** lock) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - // FileLock subclasses may store path (e.g., PosixFileLock stores it). We - // can skip stripping the chroot directory from this path because callers - // shouldn't use it. - return EnvWrapper::LockFile(status_and_enc_path.second, lock); - } - - Status GetTestDirectory(std::string* path) override { - // Adapted from PosixEnv's implementation since it doesn't provide a way to - // create directory in the chroot. 
- char buf[256]; - snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast(geteuid())); - *path = buf; - - // Directory may already exist, so ignore return - CreateDir(*path); - return Status::OK(); - } - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewLogger(status_and_enc_path.second, result); - } +IOStatus ChrootFileSystem::GetTestDirectory(const IOOptions& options, + std::string* path, + IODebugContext* dbg) { + // Adapted from PosixEnv's implementation since it doesn't provide a way to + // create directory in the chroot. + char buf[256]; + snprintf(buf, sizeof(buf), "/rocksdbtest-%d", static_cast(geteuid())); + *path = buf; - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override { - auto status_and_enc_path = EncodePath(db_path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetAbsolutePath(status_and_enc_path.second, output_path); - } + // Directory may already exist, so ignore return + return CreateDirIfMissing(*path, options, dbg); +} - private: // Returns status and expanded absolute path including the chroot directory. // Checks whether the provided path breaks out of the chroot. If it returns // non-OK status, the returned path should not be used. 
- std::pair EncodePath(const std::string& path) { - if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; - } - std::pair res; - res.second = chroot_dir_ + path; +std::pair ChrootFileSystem::EncodePath( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + std::pair res; + res.second = chroot_dir_ + path; #if defined(OS_AIX) char resolvedName[PATH_MAX]; char* normalized_path = realpath(res.second.c_str(), resolvedName); @@ -269,51 +83,64 @@ char* normalized_path = realpath(res.second.c_str(), nullptr); #endif if (normalized_path == nullptr) { - res.first = Status::NotFound(res.second, strerror(errno)); + res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); } else if (strlen(normalized_path) < chroot_dir_.size() || strncmp(normalized_path, chroot_dir_.c_str(), chroot_dir_.size()) != 0) { - res.first = Status::IOError(res.second, - "Attempted to access path outside chroot"); + res.first = IOStatus::IOError(res.second, + "Attempted to access path outside chroot"); } else { - res.first = Status::OK(); + res.first = IOStatus::OK(); } #if !defined(OS_AIX) free(normalized_path); #endif return res; - } +} // Similar to EncodePath() except assumes the basename in the path hasn't been // created yet. 
- std::pair EncodePathWithNewBasename( - const std::string& path) { - if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; - } - // Basename may be followed by trailing slashes - size_t final_idx = path.find_last_not_of('/'); - if (final_idx == std::string::npos) { - // It's only slashes so no basename to extract - return EncodePath(path); - } +std::pair ChrootFileSystem::EncodePathWithNewBasename( + const std::string& path) { + if (path.empty() || path[0] != '/') { + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; + } + // Basename may be followed by trailing slashes + size_t final_idx = path.find_last_not_of('/'); + if (final_idx == std::string::npos) { + // It's only slashes so no basename to extract + return EncodePath(path); + } + + // Pull off the basename temporarily since realname(3) (used by + // EncodePath()) requires a path that exists + size_t base_sep = path.rfind('/', final_idx); + auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1)); + status_and_enc_path.second.append(path.substr(base_sep + 1)); + return status_and_enc_path; +} - // Pull off the basename temporarily since realname(3) (used by - // EncodePath()) requires a path that exists - size_t base_sep = path.rfind('/', final_idx); - auto status_and_enc_path = EncodePath(path.substr(0, base_sep + 1)); - status_and_enc_path.second.append(path.substr(base_sep + 1)); - return status_and_enc_path; +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir) { + auto chroot_fs = std::make_shared(base, chroot_dir); + Status s = chroot_fs->PrepareOptions(ConfigOptions()); + if (s.ok()) { + return chroot_fs; + } else { + return nullptr; } - - std::string chroot_dir_; -}; +} Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) { if (!base_env->FileExists(chroot_dir).ok()) { return nullptr; } - return new ChrootEnv(base_env, chroot_dir); + auto chroot_fs = 
NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir); + if (chroot_fs != nullptr) { + return new CompositeEnvWrapper(base_env, chroot_fs); + } else { + return nullptr; + } } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_chroot.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_chroot.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,13 +9,46 @@ #include -#include "rocksdb/env.h" +#include "env/fs_remap.h" +#include "rocksdb/file_system.h" namespace ROCKSDB_NAMESPACE { +class ChrootFileSystem : public RemapFileSystem { + public: + ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir); + + static const char* kClassName() { return "ChrootFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + Status PrepareOptions(const ConfigOptions& options) override; + + protected: + // Returns status and expanded absolute path including the chroot directory. + // Checks whether the provided path breaks out of the chroot. If it returns + // non-OK status, the returned path should not be used. + std::pair EncodePath(const std::string& path) override; + + // Similar to EncodePath() except assumes the basename in the path hasn't been + // created yet. + std::pair EncodePathWithNewBasename( + const std::string& path) override; + + private: + std::string chroot_dir_; +}; // Returns an Env that translates paths such that the root directory appears to // be chroot_dir. chroot_dir should refer to an existing directory. +// +// This class has not been fully analyzed for providing strong security +// guarantees. 
Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir); +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,35 +5,33 @@ #ifndef ROCKSDB_LITE +#include "rocksdb/env_encryption.h" + #include #include #include #include -#include "rocksdb/env_encryption.h" +#include "env/composite_env_wrapper.h" +#include "env/env_encryption_ctr.h" +#include "monitoring/perf_context_imp.h" +#include "rocksdb/convenience.h" +#include "rocksdb/io_status.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" #include "util/aligned_buffer.h" #include "util/coding.h" #include "util/random.h" +#include "util/string_util.h" #endif - namespace ROCKSDB_NAMESPACE { - #ifndef ROCKSDB_LITE - -class EncryptedSequentialFile : public SequentialFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - uint64_t offset_; - size_t prefixLength_; - - public: - // Default ctor. Given underlying sequential file is supposed to be at - // offset == prefixLength. - EncryptedSequentialFile(SequentialFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), offset_(prefixLength), prefixLength_(prefixLength) { - } +std::shared_ptr EncryptionProvider::NewCTRProvider( + const std::shared_ptr& cipher) { + return std::make_shared(cipher); +} // Read up to "n" bytes from the file. "scratch[0..n-1]" may be // written by this routine. Sets "*result" to the data that was @@ -43,76 +41,82 @@ // If an error was encountered, returns a non-OK status. 
// // REQUIRES: External synchronization - Status Read(size_t n, Slice* result, char* scratch) override { - assert(scratch); - Status status = file_->Read(n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset_, (char*)result->data(), result->size()); - offset_ += result->size(); // We've already ready data from disk, so update offset_ even if decryption fails. - return status; +IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + IOStatus io_s = file_->Read(n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset_, (char*)result->data(), result->size())); + } + if (io_s.ok()) { + offset_ += result->size(); // We've already ready data from disk, so update + // offset_ even if decryption fails. } + return io_s; +} - // Skip "n" bytes from the file. This is guaranteed to be no - // slower that reading the same data, but may be faster. - // - // If end of file is reached, skipping will stop at the end of the - // file, and Skip will return OK. - // - // REQUIRES: External synchronization - Status Skip(uint64_t n) override { - auto status = file_->Skip(n); - if (!status.ok()) { - return status; - } - offset_ += n; +// Skip "n" bytes from the file. This is guaranteed to be no +// slower that reading the same data, but may be faster. +// +// If end of file is reached, skipping will stop at the end of the +// file, and Skip will return OK. +// +// REQUIRES: External synchronization +IOStatus EncryptedSequentialFile::Skip(uint64_t n) { + auto status = file_->Skip(n); + if (!status.ok()) { return status; } + offset_ += n; + return status; +} - // Indicates the upper layers if the current SequentialFile implementation - // uses direct IO. 
- bool use_direct_io() const override { return file_->use_direct_io(); } +// Indicates the upper layers if the current SequentialFile implementation +// uses direct IO. +bool EncryptedSequentialFile::use_direct_io() const { + return file_->use_direct_io(); +} - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } +IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override { - assert(scratch); - offset += prefixLength_; // Skip prefix - auto status = file_->PositionedRead(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - offset_ = offset + result->size(); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); - return status; +IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { + assert(scratch); + offset += prefixLength_; // Skip prefix + auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + 
} + offset_ = offset + result->size(); + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } -}; - -// A file abstraction for randomly reading the contents of a file. -class EncryptedRandomAccessFile : public RandomAccessFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; - - public: - EncryptedRandomAccessFile(RandomAccessFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) { } + return io_s; +} // Read up to "n" bytes from the file starting at "offset". // "scratch[0..n-1]" may be written by this routine. Sets "*result" @@ -124,23 +128,31 @@ // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - assert(scratch); - offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); - return status; +IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto io_s = file_->Read(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; + } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } + return io_s; +} // Readahead the file starting from offset by n bytes for caching. 
- Status Prefetch(uint64_t offset, size_t n) override { - //return Status::OK(); - return file_->Prefetch(offset + prefixLength_, n); - } +IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + // return Status::OK(); + return file_->Prefetch(offset + prefixLength_, n, options, dbg); +} // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). @@ -157,343 +169,603 @@ // a single varint. // // Note: these IDs are only valid for the duration of the process. - size_t GetUniqueId(char* id, size_t max_size) const override { - return file_->GetUniqueId(id, max_size); - }; +size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return file_->GetUniqueId(id, max_size); +}; - void Hint(AccessPattern pattern) override { file_->Hint(pattern); } +void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { + file_->Hint(pattern); +} // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. - bool use_direct_io() const override { return file_->use_direct_io(); } +bool EncryptedRandomAccessFile::use_direct_io() const { + return file_->use_direct_io(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. 
- Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } -}; +IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. -class EncryptedWritableFile : public WritableFileWrapper { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; - - public: - // Default ctor. Prefix is assumed to be written already. - EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : WritableFileWrapper(f), file_(f), stream_(s), prefixLength_(prefixLength) { } - - Status Append(const Slice& data) override { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - if (data.size() > 0) { - auto offset = file_->GetFileSize(); // size including prefix - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove - // so that the next two lines can be replaced with buf.Append(). 
- memmove(buf.BufferStart(), data.data(), data.size()); - buf.Size(data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); - if (!status.ok()) { - return status; - } - dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); +IOStatus EncryptedWritableFile::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToAppend(data); + if (data.size() > 0) { + auto offset = file_->GetFileSize(options, dbg); // size including prefix + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - status = file_->Append(dataToAppend); - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } - return status; + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } + return file_->Append(dataToAppend, options, dbg); +} - Status PositionedAppend(const Slice& data, uint64_t offset) override { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - offset += prefixLength_; - if (data.size() > 0) { - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - memmove(buf.BufferStart(), data.data(), data.size()); - buf.Size(data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); - if (!status.ok()) { - return status; - } - dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); +IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; 
+ Slice dataToAppend(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - status = file_->PositionedAppend(dataToAppend, offset); - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } - return status; + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } + return file_->PositionedAppend(dataToAppend, offset, options, dbg); +} - // Indicates the upper layers if the current WritableFile implementation - // uses direct IO. - bool use_direct_io() const override { return file_->use_direct_io(); } +// Indicates the upper layers if the current WritableFile implementation +// uses direct IO. +bool EncryptedWritableFile::use_direct_io() const { + return file_->use_direct_io(); +} + +// true if Sync() and Fsync() are safe to call concurrently with Append() +// and Flush(). +bool EncryptedWritableFile::IsSyncThreadSafe() const { + return file_->IsSyncThreadSafe(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} - /* - * Get the size of valid data in the file. - */ - uint64_t GetFileSize() override { - return file_->GetFileSize() - prefixLength_; - } +/* + * Get the size of valid data in the file. 
+ */ +uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + return file_->GetFileSize(options, dbg) - prefixLength_; +} - // Truncate is necessary to trim the file to the correct size - // before closing. It is not always possible to keep track of the file - // size due to whole pages writes. The behavior is undefined if called - // with other writes to follow. - Status Truncate(uint64_t size) override { - return file_->Truncate(size + prefixLength_); - } +// Truncate is necessary to trim the file to the correct size +// before closing. It is not always possible to keep track of the file +// size due to whole pages writes. The behavior is undefined if called +// with other writes to follow. +IOStatus EncryptedWritableFile::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Truncate(size + prefixLength_, options, dbg); +} - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. - // This call has no effect on dirty pages in the cache. - Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } - - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. - // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - return file_->RangeSync(offset + prefixLength_, nbytes); - } - - // PrepareWrite performs any necessary preparation for a write - // before the write actually occurs. 
This allows for pre-allocation - // of space on devices where it can result in less file - // fragmentation and/or less waste from over-zealous filesystem - // pre-allocation. - void PrepareWrite(size_t offset, size_t len) override { - file_->PrepareWrite(offset + prefixLength_, len); - } - - // Pre-allocates space for a file. - Status Allocate(uint64_t offset, uint64_t len) override { - return file_->Allocate(offset + prefixLength_, len); - } -}; +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. +// This call has no effect on dirty pages in the cache. +IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} -// A file abstraction for random reading and writing. -class EncryptedRandomRWFile : public RandomRWFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; +// Sync a file range with disk. +// offset is the starting byte of the file range to be synchronized. +// nbytes specifies the length of the range to be synchronized. +// This asks the OS to initiate flushing the cached data to disk, +// without waiting for completion. +// Default implementation does nothing. +IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg); +} - public: - EncryptedRandomRWFile(RandomRWFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) {} +// PrepareWrite performs any necessary preparation for a write +// before the write actually occurs. 
This allows for pre-allocation +// of space on devices where it can result in less file +// fragmentation and/or less waste from over-zealous filesystem +// pre-allocation. +void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len, + const IOOptions& options, + IODebugContext* dbg) { + file_->PrepareWrite(offset + prefixLength_, len, options, dbg); +} - // Indicates if the class makes use of direct I/O - // If false you must pass aligned buffer to Write() - bool use_direct_io() const override { return file_->use_direct_io(); } +void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) { + // the size here doesn't need to include prefixLength_, as it's a + // configuration will be use for `PrepareWrite()`. + file_->SetPreallocationBlockSize(size); +} + +void EncryptedWritableFile::GetPreallocationStatus( + size_t* block_size, size_t* last_allocated_block) { + file_->GetPreallocationStatus(block_size, last_allocated_block); +} + +// Pre-allocates space for a file. +IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Allocate(offset + prefixLength_, len, options, dbg); +} + +IOStatus EncryptedWritableFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} + +// A file abstraction for random reading and writing. 
+ +// Indicates if the class makes use of direct I/O +// If false you must pass aligned buffer to Write() +bool EncryptedRandomRWFile::use_direct_io() const { + return file_->use_direct_io(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - Status Write(uint64_t offset, const Slice& data) override { - AlignedBuffer buf; - Status status; - Slice dataToWrite(data); - offset += prefixLength_; - if (data.size() > 0) { - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - memmove(buf.BufferStart(), data.data(), data.size()); - buf.Size(data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); - if (!status.ok()) { - return status; - } - dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); +IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + AlignedBuffer buf; + Slice dataToWrite(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + IOStatus io_s; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - status = file_->Write(offset, dataToWrite); - return status; + if (!io_s.ok()) { + return io_s; + } + dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); } + return file_->Write(offset, dataToWrite, options, dbg); +} 
// Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - assert(scratch); - offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); +IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { + assert(scratch); + offset += prefixLength_; + auto status = file_->Read(offset, n, options, result, scratch, dbg); + if (!status.ok()) { return status; } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); + } + return status; +} - Status Flush() override { return file_->Flush(); } +IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} - Status Sync() override { return file_->Sync(); } +IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Fsync(options, dbg); +} - Status Fsync() override { return file_->Fsync(); } +IOStatus EncryptedRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} - Status Close() override { return file_->Close(); } +namespace { +static std::unordered_map encrypted_fs_type_info = + { + {"provider", + OptionTypeInfo::AsCustomSharedPtr( + 0 /* No offset, whole struct*/, OptionVerificationType::kByName, + OptionTypeFlags::kNone)}, }; +// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption +// to files 
stored on disk. +class EncryptedFileSystemImpl : public EncryptedFileSystem { + public: + const char* Name() const override { + return EncryptedFileSystem::kClassName(); + } + // Returns the raw encryption provider that should be used to write the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetWritableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No WriteProvider specified"); + } + } + + // Returns the raw encryption provider that should be used to read the input + // encrypted file. If there is no such provider, NotFound is returned. + IOStatus GetReadableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return IOStatus::OK(); + } else { + *result = nullptr; + return IOStatus::NotFound("No Provider specified"); + } + } + + // Creates a CipherStream for the underlying file/name using the options + // If a writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. 
+ template + IOStatus CreateWritableCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus status = GetWritableProvider(fname, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + status = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (status.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + status = underlying->Append(prefix, options.io_options, dbg); + } + if (!status.ok()) { + return status; + } + } + // Create cipher stream + status = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return status; + } + + template + IOStatus CreateWritableEncryptedFile(const std::string& fname, + std::unique_ptr& underlying, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + // Create cipher stream + std::unique_ptr stream; + size_t prefix_length; + IOStatus status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } + } + return status; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a writable provider is found and encryption is enabled, uses + // this provider to create a cipher stream. 
+ // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // should be encrypted + // @return OK on success, non-OK on failure. + template + IOStatus CreateRandomWriteCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + IOStatus io_s = GetWritableProvider(fname, &provider); + if (!io_s.ok()) { + return io_s; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + io_s = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (io_s.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + io_s = underlying->Write(0, prefix, options.io_options, dbg); + } + if (!io_s.ok()) { + return io_s; + } + } + // Create cipher stream + io_s = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); + } + return io_s; + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. 
+ // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. + template + IOStatus CreateSequentialCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(*prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } + + // Creates a CipherStream for the underlying file/name using the options + // If a readable provider is found and the file is encrypted, uses + // this provider to create a cipher stream. + // @param fname Name of the writable file + // @param underlying The underlying "raw" file + // @param options Options for creating the file/cipher + // @param prefix_length Returns the length of the encryption prefix used for + // this file + // @param stream Returns the cipher stream to use for this file if it + // is encrypted + // @return OK on success, non-OK on failure. 
+ template + IOStatus CreateRandomReadCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + IOStatus status = underlying->Read(0, *prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); + } -// EncryptedEnv implements an Env wrapper that adds encryption to files stored on disk. -class EncryptedEnv : public EnvWrapper { public: - EncryptedEnv(Env* base_env, EncryptionProvider *provider) - : EnvWrapper(base_env) { + EncryptedFileSystemImpl(const std::shared_ptr& base, + const std::shared_ptr& provider) + : EncryptedFileSystem(base) { provider_ = provider; + RegisterOptions("EncryptionProvider", &provider_, &encrypted_fs_type_info); + } + + Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) override { + return provider_->AddCipher(descriptor, cipher, len, for_write); } // NewSequentialFile opens a file for sequential reading. 
- Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewSequentialFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Read prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Read prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - status = underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); + uint64_t file_size; + status = FileSystemWrapper::GetFileSize(fname, options.io_options, + &file_size, dbg); + if (!status.ok()) { + return status; + } + if (!file_size) { + *result = std::move(underlying); + return status; } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateSequentialCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + result->reset(new EncryptedSequentialFile( + std::move(underlying), std::move(stream), prefix_length)); } - (*result) = std::unique_ptr(new EncryptedSequentialFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return status; } // NewRandomAccessFile opens a file for random read 
access. - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewRandomAccessFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::NewRandomAccessFile(fname, options, + &underlying, dbg); if (!status.ok()) { return status; } - // Read prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Read prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); - } - // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + if (status.ok()) { + if (stream) { + result->reset(new EncryptedRandomAccessFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } } - (*result) = std::unique_ptr(new EncryptedRandomAccessFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return status; } // NewWritableFile opens a file for sequential writing. 
- Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewWritableFile(fname, &underlying, options); - if (!status.ok()) { - return status; - } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } - // Create cipher stream - std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Create an object that writes to a new file with the specified @@ -503,86 +775,42 @@ // returns non-OK. // // The returned file will only be accessed by one thread at a time. 
- Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::ReopenWritableFile(fname, &underlying, options); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } - // Create cipher stream - std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; - } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Reuse an existing file by renaming it and opening it as writable. 
- Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::ReuseWritableFile( + fname, old_fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } - // Create cipher stream - std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; - } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Open `fname` for random read and write, if file doesn't exist the file @@ -590,102 +818,137 @@ // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. 
- Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads || options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Check file exists - bool isNewFile = !FileExists(fname).ok(); + bool isNewFile = !FileExists(fname, options.io_options, dbg).ok(); // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewRandomRWFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - // Read or Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - if (!isNewFile) { - // File already exists, read prefix - status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); - } else { - // File is new, initialize & write prefix - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Write(0, prefixSlice); - if (!status.ok()) { - return status; - } - } - } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length = 0; + if (!isNewFile) { + // File already exists, read prefix + status = 
CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + } else { + status = CreateRandomWriteCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); + } + if (status.ok()) { + if (stream) { + result->reset(new EncryptedRandomRWFile( + std::move(underlying), std::move(stream), prefix_length)); + } else { + result->reset(underlying.release()); + } } - (*result) = std::unique_ptr(new EncryptedRandomRWFile(underlying.release(), stream.release(), prefixLength)); - return Status::OK(); + return status; } - // Store in *result the attributes of the children of the specified directory. - // In case the implementation lists the directory prior to iterating the files - // and files are concurrently deleted, the deleted files will be omitted from + // Store in *result the attributes of the children of the specified + // directory. + // In case the implementation lists the directory prior to iterating the + // files + // and files are concurrently deleted, the deleted files will be omitted + // from // result. // The name attributes are relative to "dir". // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. - // NotFound if "dir" does not exist, the calling process does not have + // NotFound if "dir" does not exist, the calling process does not + // have // permission to access "dir", or if "dir" is invalid. 
// IOError if an IO Error was encountered - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, dbg); if (!status.ok()) { return status; } - size_t prefixLength = provider_->GetPrefixLength(); - for (auto it = std::begin(*result); it!=std::end(*result); ++it) { - assert(it->size_bytes >= prefixLength); - it->size_bytes -= prefixLength; + for (auto it = std::begin(*result); it != std::end(*result); ++it) { + // assert(it->size_bytes >= prefixLength); + // breaks env_basic_test when called on directory containing + // directories + // which makes subtraction of prefixLength worrisome since + // FileAttributes does not identify directories + EncryptionProvider* provider; + status = GetReadableProvider(it->name, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + it->size_bytes -= provider->GetPrefixLength(); + } } - return Status::OK(); + return IOStatus::OK(); } // Store the size of fname in *file_size. 
- Status GetFileSize(const std::string& fname, uint64_t* file_size) override { - auto status = EnvWrapper::GetFileSize(fname, file_size); - if (!status.ok()) { + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetFileSize(fname, options, file_size, dbg); + if (!status.ok() || !(*file_size)) { return status; } - size_t prefixLength = provider_->GetPrefixLength(); - assert(*file_size >= prefixLength); - *file_size -= prefixLength; - return Status::OK(); + EncryptionProvider* provider; + status = GetReadableProvider(fname, &provider); + if (provider != nullptr && status.ok()) { + size_t prefixLength = provider->GetPrefixLength(); + assert(*file_size >= prefixLength); + *file_size -= prefixLength; + } + return status; } private: - EncryptionProvider *provider_; + std::shared_ptr provider_; }; +} // namespace + +Status NewEncryptedFileSystemImpl( + const std::shared_ptr& base, + const std::shared_ptr& provider, + std::unique_ptr* result) { + result->reset(new EncryptedFileSystemImpl(base, provider)); + return Status::OK(); +} +std::shared_ptr NewEncryptedFS( + const std::shared_ptr& base, + const std::shared_ptr& provider) { + std::unique_ptr efs; + Status s = NewEncryptedFileSystemImpl(base, provider, &efs); + if (s.ok()) { + s = efs->PrepareOptions(ConfigOptions()); + } + if (s.ok()) { + std::shared_ptr result(efs.release()); + return result; + } else { + return nullptr; + } +} // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. -Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { - return new EncryptedEnv(base_env, provider); +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr& provider) { + return new CompositeEnvWrapper( + base_env, NewEncryptedFS(base_env->GetFileSystem(), provider)); } // Encrypt one or more (partial) blocks of data at the file offset. 
@@ -786,38 +1049,71 @@ } } -// Encrypt a block of data. -// Length of data is equal to BlockSize(). -Status ROT13BlockCipher::Encrypt(char *data) { - for (size_t i = 0; i < blockSize_; ++i) { +namespace { +static std::unordered_map + rot13_block_cipher_type_info = { + {"block_size", + {0 /* No offset, whole struct*/, OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; +// Implements a BlockCipher using ROT13. +// +// Note: This is a sample implementation of BlockCipher, +// it is NOT considered safe and should NOT be used in production. +class ROT13BlockCipher : public BlockCipher { + private: + size_t blockSize_; + + public: + explicit ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) { + RegisterOptions("ROT13BlockCipherOptions", &blockSize_, + &rot13_block_cipher_type_info); + } + + static const char* kClassName() { return "ROT13"; } + const char* Name() const override { return kClassName(); } + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return blockSize_; } + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + Status Encrypt(char* data) override { + for (size_t i = 0; i < blockSize_; ++i) { data[i] += 13; + } + return Status::OK(); } - return Status::OK(); -} -// Decrypt a block of data. -// Length of data is equal to BlockSize(). -Status ROT13BlockCipher::Decrypt(char *data) { - return Encrypt(data); -} + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + Status Decrypt(char* data) override { return Encrypt(data); } +}; +static const std::unordered_map + ctr_encryption_provider_type_info = { + {"cipher", + OptionTypeInfo::AsCustomSharedPtr( + 0 /* No offset, whole struct*/, OptionVerificationType::kByName, + OptionTypeFlags::kNone)}, +}; +} // anonymous namespace // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. 
void CTRCipherStream::AllocateScratch(std::string& scratch) { - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); scratch.reserve(blockSize); } // Encrypt a block of data at the given block index. // Length of data is equal to BlockSize(); -Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scratch) { - +Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) { // Create nonce + counter - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); // Encrypt nonce+counter - auto status = cipher_.Encrypt(scratch); + auto status = cipher_->Encrypt(scratch); if (!status.ok()) { return status; } @@ -831,22 +1127,44 @@ // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); -Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { +Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) { // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } +CTREncryptionProvider::CTREncryptionProvider( + const std::shared_ptr& c) + : cipher_(c) { + RegisterOptions("Cipher", &cipher_, &ctr_encryption_provider_type_info); +} + // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of // the page size. 
-size_t CTREncryptionProvider::GetPrefixLength() { +size_t CTREncryptionProvider::GetPrefixLength() const { return defaultPrefixLength; } +Status CTREncryptionProvider::AddCipher(const std::string& /*descriptor*/, + const char* cipher, size_t len, + bool /*for_write*/) { + if (cipher_) { + return Status::NotSupported("Cannot add keys to CTREncryptionProvider"); + } else if (strcmp(ROT13BlockCipher::kClassName(), cipher) == 0) { + cipher_.reset(new ROT13BlockCipher(len)); + return Status::OK(); + } else { + return BlockCipher::CreateFromString(ConfigOptions(), std::string(cipher), + &cipher_); + } +} + // decodeCTRParameters decodes the initial counter & IV from the given // (plain text) prefix. -static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t &initialCounter, Slice &iv) { +static void decodeCTRParameters(const char* prefix, size_t blockSize, + uint64_t& initialCounter, Slice& iv) { // First block contains 64-bit initial counter initialCounter = DecodeFixed64(prefix); // Second block contains IV @@ -857,25 +1175,35 @@ // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, - size_t prefixLength) { + size_t prefixLength) const { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } // Create & seed rnd. - Random rnd((uint32_t)Env::Default()->NowMicros()); + Random rnd((uint32_t)SystemClock::Default()->NowMicros()); // Fill entire prefix block with random values. for (size_t i = 0; i < prefixLength; i++) { prefix[i] = rnd.Uniform(256) & 0xFF; } // Take random data to extract initial counter & IV - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); uint64_t initialCounter; Slice prefixIV; decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV); // Now populate the rest of the prefix, starting from the third block. 
- PopulateSecretPrefixPart(prefix + (2 * blockSize), prefixLength - (2 * blockSize), blockSize); + PopulateSecretPrefixPart(prefix + (2 * blockSize), + prefixLength - (2 * blockSize), blockSize); - // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial counter & IV unencrypted) + // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial + // counter & IV unencrypted) CTRCipherStream cipherStream(cipher_, prefixIV.data(), initialCounter); - auto status = cipherStream.Encrypt(0, prefix + (2 * blockSize), prefixLength - (2 * blockSize)); + Status status; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = cipherStream.Encrypt(0, prefix + (2 * blockSize), + prefixLength - (2 * blockSize)); + } if (!status.ok()) { return status; } @@ -886,9 +1214,8 @@ // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. -size_t CTREncryptionProvider::PopulateSecretPrefixPart(char* /*prefix*/, - size_t /*prefixLength*/, - size_t /*blockSize*/) { +size_t CTREncryptionProvider::PopulateSecretPrefixPart( + char* /*prefix*/, size_t /*prefixLength*/, size_t /*blockSize*/) const { // Nothing to do here, put in custom data in override when needed. return 0; } @@ -896,8 +1223,11 @@ Status CTREncryptionProvider::CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, std::unique_ptr* result) { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } // Read plain text part of prefix. 
- auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); uint64_t initialCounter; Slice iv; decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv); @@ -910,19 +1240,26 @@ ": read attempt would read beyond file bounds"); } - // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted) + // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 + // with initial counter & IV are unencrypted) CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter); - auto status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), prefix.size() - (2 * blockSize)); + Status status; + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), + prefix.size() - (2 * blockSize)); + } if (!status.ok()) { return status; } // Create cipher stream - return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); + return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, + prefix, result); } -// CreateCipherStreamFromPrefix creates a block access cipher stream for a file given -// given name and options. The given prefix is already decrypted. +// CreateCipherStreamFromPrefix creates a block access cipher stream for a file +// given given name and options. The given prefix is already decrypted. 
Status CTREncryptionProvider::CreateCipherStreamFromPrefix( const std::string& /*fname*/, const EnvOptions& /*options*/, uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, @@ -932,6 +1269,72 @@ return Status::OK(); } +namespace { +static void RegisterEncryptionBuiltins() { + static std::once_flag once; + std::call_once(once, [&]() { + auto lib = ObjectRegistry::Default()->AddLibrary("encryption"); + // Match "CTR" or "CTR://test" + lib->AddFactory( + ObjectLibrary::PatternEntry(CTREncryptionProvider::kClassName(), true) + .AddSuffix("://test"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + if (EndsWith(uri, "://test")) { + std::shared_ptr cipher = + std::make_shared(32); + guard->reset(new CTREncryptionProvider(cipher)); + } else { + guard->reset(new CTREncryptionProvider()); + } + return guard->get(); + }); + + lib->AddFactory( + "1://test", [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /*errmsg*/) { + std::shared_ptr cipher = + std::make_shared(32); + guard->reset(new CTREncryptionProvider(cipher)); + return guard->get(); + }); + + // Match "ROT13" or "ROT13:[0-9]+" + lib->AddFactory( + ObjectLibrary::PatternEntry(ROT13BlockCipher::kClassName(), true) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + size_t colon = uri.find(':'); + if (colon != std::string::npos) { + size_t block_size = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new ROT13BlockCipher(block_size)); + } else { + guard->reset(new ROT13BlockCipher(32)); + } + + return guard->get(); + }); + }); +} +} // namespace + +Status BlockCipher::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + RegisterEncryptionBuiltins(); + return LoadSharedObject(config_options, value, nullptr, result); +} + +Status EncryptionProvider::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + 
std::shared_ptr* result) { + RegisterEncryptionBuiltins(); + return LoadSharedObject(config_options, value, nullptr, + result); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption_ctr.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption_ctr.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_encryption_ctr.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_encryption_ctr.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,116 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(ROCKSDB_LITE) + +#include "rocksdb/env_encryption.h" + +namespace ROCKSDB_NAMESPACE { +// CTRCipherStream implements BlockAccessCipherStream using an +// Counter operations mode. +// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation +// +// Note: This is a possible implementation of BlockAccessCipherStream, +// it is considered suitable for use. +class CTRCipherStream final : public BlockAccessCipherStream { + private: + std::shared_ptr cipher_; + std::string iv_; + uint64_t initialCounter_; + + public: + CTRCipherStream(const std::shared_ptr& c, const char* iv, + uint64_t initialCounter) + : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){}; + virtual ~CTRCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return cipher_->BlockSize(); } + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + void AllocateScratch(std::string&) override; + + // Encrypt a block of data at the given block index. 
+ // Length of data is equal to BlockSize(); + Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override; +}; + +// This encryption provider uses a CTR cipher stream, with a given block cipher +// and IV. +// +// Note: This is a possible implementation of EncryptionProvider, +// it is considered suitable for use, provided a safe BlockCipher is used. +class CTREncryptionProvider : public EncryptionProvider { + private: + std::shared_ptr cipher_; + + protected: + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. This size is to ensure the first real data byte + // is placed at largest known alignment point for direct io. + const static size_t defaultPrefixLength = 4096; + + public: + explicit CTREncryptionProvider( + const std::shared_ptr& c = nullptr); + virtual ~CTREncryptionProvider() {} + + static const char* kClassName() { return "CTR"; } + const char* Name() const override { return kClassName(); } + + // GetPrefixLength returns the length of the prefix that is added to every + // file + // and used for storing encryption options. + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. + size_t GetPrefixLength() const override; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. + Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) const override; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. 
+ Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) override; + + Status AddCipher(const std::string& descriptor, const char* /*cipher*/, + size_t /*len*/, bool /*for_write*/) override; + protected: + + // PopulateSecretPrefixPart initializes the data into a new prefix block + // that will be encrypted. This function will store the data in plain text. + // It will be encrypted later (before written to disk). + // Returns the amount of space (starting from the start of the prefix) + // that has been initialized. + virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, + size_t blockSize) const; + + // CreateCipherStreamFromPrefix creates a block access cipher stream for a + // file given + // given name and options. The given prefix is already decrypted. + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr* result); +}; + +Status NewEncryptedFileSystemImpl( + const std::shared_ptr& base_fs, + const std::shared_ptr& provider, + std::unique_ptr* fs); + +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_hdfs.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_hdfs.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_hdfs.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_hdfs.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #define ROCKSDB_HDFS_FILE_C #include -#include #include #include #include @@ -38,10 +37,10 @@ // Log error message static Status IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? Status::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? 
Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? Status::PathNotFound(context, errnoStr(err_number).c_str()) + : Status::IOError(context, errnoStr(err_number).c_str()); } // assume that there is one global logger for now. It is not thread-safe, @@ -124,8 +123,9 @@ Status s; ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); - ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset, - (void*)scratch, (tSize)n); + tSize bytes_read = + hdfsPread(fileSys_, hfile_, offset, static_cast(scratch), + static_cast(n)); ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); *result = Slice(scratch, (bytes_read < 0) ? 0 : bytes_read); @@ -213,6 +213,8 @@ } } + using WritableFile::Append; + // If the file was successfully created, then this returns true. // Otherwise returns false. bool isValid() { @@ -609,6 +611,18 @@ return Status::OK(); } +Status HdfsEnv::IsDirectory(const std::string& path, bool* is_dir) { + hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, path.c_str()); + if (pFileInfo != nullptr) { + if (is_dir != nullptr) { + *is_dir = (pFileInfo->mKind == kObjectKindDirectory); + } + hdfsFreeFileInfo(pFileInfo, 1); + return Status::OK(); + } + return IOError(path, errno); +} + // The factory method for creating an HDFS Env Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname) { *hdfs_env = new HdfsEnv(fsname); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,10 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors + +#include "port/lang.h" +#if !defined(OS_WIN) + #include #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION #include @@ -13,9 +17,6 @@ #include #include -#if defined(OS_LINUX) -#include -#endif #if defined(ROCKSDB_IOURING_PRESENT) #include #endif @@ -24,13 +25,10 @@ #include #include #include -#include #include #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include -#include #endif #include #include @@ -39,9 +37,11 @@ #include #endif #include +#include + #include // Get nano time includes -#if defined(OS_LINUX) || defined(OS_FREEBSD) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) #elif defined(__MACH__) #include #include @@ -55,13 +55,14 @@ #include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" +#include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" @@ -126,23 +127,105 @@ }; #endif // !ROCKSDB_NO_DYNAMIC_EXTENSION -class PosixEnv : public CompositeEnvWrapper { +class PosixClock : public SystemClock { public: - PosixEnv(); + static const char* kClassName() { return "PosixClock"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } - ~PosixEnv() override { - for (const auto tid : threads_to_join_) { - pthread_join(tid, nullptr); - } - for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { - thread_pools_[pool_id].JoinAllThreads(); + uint64_t NowMicros() override { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + uint64_t NowNanos() override { +#if defined(OS_LINUX) || 
defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif defined(OS_SOLARIS) + return gethrtime(); +#elif defined(__MACH__) + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#else + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif + } + + uint64_t CPUMicros() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return (static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000; +#endif + return 0; + } + + uint64_t CPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + + void SleepForMicroseconds(int micros) override { usleep(micros); } + + Status GetCurrentTime(int64_t* unix_time) override { + time_t ret = time(nullptr); + if (ret == (time_t)-1) { + return IOError("GetCurrentTime", "", errno); } - // Delete the thread_status_updater_ only when the current Env is not - // Env::Default(). This is to avoid the free-after-use error when - // Env::Default() is destructed while some other child threads are - // still trying to update thread status. 
- if (this != Env::Default()) { - delete thread_status_updater_; + *unix_time = (int64_t)ret; + return Status::OK(); + } + + std::string TimeToString(uint64_t secondsSince1970) override { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + dummy.resize(maxsize); + char* p = &dummy[0]; + localtime_r(&seconds, &t); + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + return dummy; + } +}; + +class PosixEnv : public CompositeEnv { + public: + static const char* kClassName() { return "PosixEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } + + ~PosixEnv() override { + if (this == Env::Default()) { + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].JoinAllThreads(); + } + // Do not delete the thread_status_updater_ in order to avoid the + // free after use when Env::Default() is destructed while some other + // child threads are still trying to update thread status. All + // PosixEnv instances use the same thread_status_updater_, so never + // explicitly delete it. 
} } @@ -163,7 +246,6 @@ // provided by the search path Status LoadLibrary(const std::string& name, const std::string& path, std::shared_ptr* result) override { - Status status; assert(result != nullptr); if (name.empty()) { void* hndl = dlopen(NULL, RTLD_NOW); @@ -220,109 +302,32 @@ unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; - Status GetTestDirectory(std::string* result) override { - const char* env = getenv("TEST_TMPDIR"); - if (env && env[0] != '\0') { - *result = env; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid())); - *result = buf; - } - // Directory may already exist - CreateDir(*result); - return Status::OK(); - } - Status GetThreadList(std::vector* thread_list) override { assert(thread_status_updater_); return thread_status_updater_->GetThreadList(thread_list); } - static uint64_t gettid(pthread_t tid) { + uint64_t GetThreadID() const override { uint64_t thread_id = 0; +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 30) + thread_id = ::gettid(); +#else // __GLIBC_PREREQ(2, 30) + pthread_t tid = pthread_self(); memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); - return thread_id; - } - - static uint64_t gettid() { +#endif // __GLIBC_PREREQ(2, 30) +#else // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) pthread_t tid = pthread_self(); - return gettid(tid); - } - - uint64_t GetThreadID() const override { return gettid(pthread_self()); } - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override { - FILE* f; - { - IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), - "w" -#ifdef __GLIBC_PREREQ -#if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC -#endif -#endif - ); - } - if (f == nullptr) { - result->reset(); - return IOError("when fopen a file for new logger", fname, errno); - } else { - int fd = fileno(f); -#ifdef ROCKSDB_FALLOCATE_PRESENT - fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024); 
-#endif - SetFD_CLOEXEC(fd, nullptr); - result->reset(new PosixLogger(f, &PosixEnv::gettid, this)); - return Status::OK(); - } - } - - uint64_t NowMicros() override { - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - uint64_t NowNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif defined(OS_SOLARIS) - return gethrtime(); -#elif defined(__MACH__) - clock_serv_t cclock; - mach_timespec_t ts; - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &ts); - mach_port_deallocate(mach_task_self(), cclock); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#else - return std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); -#endif - } - - uint64_t NowCPUNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) || \ - (defined(__MACH__) && defined(__MAC_10_12)) - struct timespec ts; - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#endif - return 0; + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + return thread_id; } - void SleepForMicroseconds(int micros) override { usleep(micros); } - Status GetHostName(char* name, uint64_t len) override { int ret = gethostname(name, static_cast(len)); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) { - return Status::InvalidArgument(strerror(errno)); + return Status::InvalidArgument(errnoStr(errno).c_str()); } else { return IOError("GetHostName", name, errno); } @@ -330,15 +335,6 @@ return Status::OK(); } - Status GetCurrentTime(int64_t* unix_time) override { - time_t ret = time(nullptr); - if (ret == (time_t) -1) { - return IOError("GetCurrentTime", "", errno); 
- } - *unix_time = (int64_t) ret; - return Status::OK(); - } - ThreadStatusUpdater* GetThreadStatusUpdater() const override { return Env::GetThreadStatusUpdater(); } @@ -367,7 +363,7 @@ thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); } - void LowerThreadPoolIOPriority(Priority pool = LOW) override { + void LowerThreadPoolIOPriority(Priority pool) override { assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); #ifdef OS_LINUX thread_pools_[pool].LowerIOPriority(); @@ -376,48 +372,46 @@ #endif } - void LowerThreadPoolCPUPriority(Priority pool = LOW) override { + void LowerThreadPoolCPUPriority(Priority pool) override { assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); -#ifdef OS_LINUX - thread_pools_[pool].LowerCPUPriority(); -#else - (void)pool; -#endif + thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow); } - std::string TimeToString(uint64_t secondsSince1970) override { - const time_t seconds = (time_t)secondsSince1970; - struct tm t; - int maxsize = 64; - std::string dummy; - dummy.reserve(maxsize); - dummy.resize(maxsize); - char* p = &dummy[0]; - localtime_r(&seconds, &t); - snprintf(p, maxsize, - "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec); - return dummy; + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); + thread_pools_[pool].LowerCPUPriority(pri); + return Status::OK(); } private: - std::vector thread_pools_; - pthread_mutex_t mu_; - std::vector threads_to_join_; + friend Env* Env::Default(); + // Constructs the default Env, a singleton + PosixEnv(); + + // The below 4 members are only used by the default PosixEnv instance. 
+ // Non-default instances simply maintain references to the backing + // members in te default instance + std::vector thread_pools_storage_; + pthread_mutex_t mu_storage_; + std::vector threads_to_join_storage_; + bool allow_non_owner_access_storage_; + + std::vector& thread_pools_; + pthread_mutex_t& mu_; + std::vector& threads_to_join_; // If true, allow non owner read access for db files. Otherwise, non-owner // has no access to db files. - bool allow_non_owner_access_; + bool& allow_non_owner_access_; }; PosixEnv::PosixEnv() - : CompositeEnvWrapper(this, FileSystem::Default().get()), - thread_pools_(Priority::TOTAL), - allow_non_owner_access_(true) { + : CompositeEnv(FileSystem::Default(), SystemClock::Default()), + thread_pools_storage_(Priority::TOTAL), + allow_non_owner_access_storage_(true), + thread_pools_(thread_pools_storage_), + mu_(mu_storage_), + threads_to_join_(threads_to_join_storage_), + allow_non_owner_access_(allow_non_owner_access_storage_) { ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( @@ -476,31 +470,6 @@ } // namespace -std::string Env::GenerateUniqueId() { - std::string uuid_file = "/proc/sys/kernel/random/uuid"; - - Status s = FileExists(uuid_file); - if (s.ok()) { - std::string uuid; - s = ReadFileToString(this, uuid_file, &uuid); - if (s.ok()) { - return uuid; - } - } - // Could not read uuid_file - generate uuid using "nanos-random" - Random64 r(time(nullptr)); - uint64_t random_uuid_portion = - r.Uniform(std::numeric_limits::max()); - uint64_t nanos_uuid_portion = NowNanos(); - char uuid2[200]; - snprintf(uuid2, - 200, - "%lx-%lx", - (unsigned long)nanos_uuid_portion, - (unsigned long)random_uuid_portion); - return uuid2; -} - // // Default Posix Env // @@ -518,10 +487,19 @@ ThreadLocalPtr::InitSingletons(); CompressionContextCache::InitSingleton(); INIT_SYNC_POINT_SINGLETONS(); + // ~PosixEnv 
must be called on exit static PosixEnv default_env; - static CompositeEnvWrapper composite_env(&default_env, - FileSystem::Default().get()); - return &composite_env; + return &default_env; } +// +// Default Posix SystemClock +// +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr default_clock = + std::make_shared(); + return default_clock; +} } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/env_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/env_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,12 +11,17 @@ #include #endif +#if defined(ROCKSDB_IOURING_PRESENT) +#include +#include +#endif + #include -#include -#include #include #include +#include +#include #ifdef OS_LINUX #include @@ -30,17 +35,36 @@ #include #endif +#include "db/db_impl/db_impl.h" +#include "env/emulated_clock.h" #include "env/env_chroot.h" +#include "env/env_encryption_ctr.h" +#include "env/fs_readonly.h" +#include "env/mock_env.h" +#include "env/unique_id_gen.h" #include "logging/log_buffer.h" +#include "logging/logging.h" #include "port/malloc.h" #include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/crc32c.h" #include "util/mutexlock.h" +#include "util/random.h" #include "util/string_util.h" +#include "utilities/env_timed.h" +#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -86,6 +110,11 @@ Env* env_; bool 
direct_io_; EnvPosixTest() : env_(Env::Default()), direct_io_(false) {} + ~EnvPosixTest() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } }; class EnvPosixTestWithParam @@ -183,7 +212,7 @@ if (::stat(filename.c_str(), &sb) == 0) { ASSERT_EQ(sb.st_mode & 0777, 0644); } - env_->DeleteFile(filename); + ASSERT_OK(env_->DeleteFile(filename)); } env_->SetAllowNonOwnerAccess(false); @@ -196,10 +225,88 @@ if (::stat(filename.c_str(), &sb) == 0) { ASSERT_EQ(sb.st_mode & 0777, 0600); } - env_->DeleteFile(filename); + ASSERT_OK(env_->DeleteFile(filename)); } } } + +TEST_F(EnvPosixTest, LowerThreadPoolCpuPriority) { + std::atomic from_priority(CpuPriority::kNormal); + std::atomic to_priority(CpuPriority::kNormal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::BGThread::BeforeSetCpuPriority", [&](void* pri) { + from_priority.store(*reinterpret_cast(pri)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ThreadPoolImpl::BGThread::AfterSetCpuPriority", [&](void* pri) { + to_priority.store(*reinterpret_cast(pri)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + env_->SetBackgroundThreads(1, Env::BOTTOM); + env_->SetBackgroundThreads(1, Env::HIGH); + + auto RunTask = [&](Env::Priority pool) { + std::atomic called(false); + env_->Schedule(&SetBool, &called, pool); + for (int i = 0; i < kDelayMicros; i++) { + if (called.load()) { + break; + } + Env::Default()->SleepForMicroseconds(1); + } + ASSERT_TRUE(called.load()); + }; + + { + // Same priority, no-op. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, + CpuPriority::kNormal) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kNormal); + } + + { + // Higher priority, no-op. 
+ env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kHigh) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kNormal); + } + + { + // Lower priority from kNormal -> kLow. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kLow) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kLow); + } + + { + // Lower priority from kLow -> kIdle. + env_->LowerThreadPoolCPUPriority(Env::Priority::BOTTOM, CpuPriority::kIdle) + .PermitUncheckedError(); + RunTask(Env::Priority::BOTTOM); + ASSERT_EQ(from_priority, CpuPriority::kLow); + ASSERT_EQ(to_priority, CpuPriority::kIdle); + } + + { + // Lower priority from kNormal -> kIdle for another pool. + env_->LowerThreadPoolCPUPriority(Env::Priority::HIGH, CpuPriority::kIdle) + .PermitUncheckedError(); + RunTask(Env::Priority::HIGH); + ASSERT_EQ(from_priority, CpuPriority::kNormal); + ASSERT_EQ(to_priority, CpuPriority::kIdle); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { @@ -212,7 +319,7 @@ ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); Random rnd(301); - test::RandomString(&rnd, kFileBytes, &expected_data); + expected_data = rnd.RandomString(kFileBytes); ASSERT_OK(wfile->Append(expected_data)); } @@ -325,6 +432,7 @@ // run in any order. The purpose of the test is unclear. #ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RunMany) { + env_->SetBackgroundThreads(1, Env::LOW); std::atomic last_id(0); struct CB { @@ -831,7 +939,7 @@ } else { // mkdtemp failed: diagnose it, but don't give up. fprintf(stderr, "mkdtemp(%s/...) 
failed: %s\n", d.c_str(), - strerror(errno)); + errnoStr(errno).c_str()); } } @@ -929,7 +1037,7 @@ ASSERT_EQ(unique_id2, unique_id3); // Delete the file - env_->DeleteFile(fname); + ASSERT_OK(env_->DeleteFile(fname)); } } #endif // !defined(OS_WIN) @@ -956,7 +1064,8 @@ int err_number = 0; if (alloc_status != 0) { err_number = errno; - fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); + fprintf(stderr, "Warning: fallocate() fails, %s\n", + errnoStr(err_number).c_str()); } close(fd); ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); @@ -1044,7 +1153,7 @@ // Collect and check whether the IDs are unique. std::unordered_set ids; - for (const std::string fname : fnames) { + for (const std::string& fname : fnames) { std::unique_ptr file; std::string unique_id; ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); @@ -1058,7 +1167,7 @@ } // Delete the files - for (const std::string fname : fnames) { + for (const std::string& fname : fnames) { ASSERT_OK(env_->DeleteFile(fname)); } @@ -1066,7 +1175,9 @@ } } -TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { +// TODO: Disable the flaky test, it's a known issue that ext4 may return same +// key after file deletion. The issue is tracked in #7405, #7470. +TEST_P(EnvPosixTestWithParam, DISABLED_RandomAccessUniqueIDDeletes) { if (env_ == Env::Default()) { EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; @@ -1180,6 +1291,213 @@ } } +TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { + // In this test we don't do aligned read, so it doesn't work for + // direct I/O case. + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. 
+ { + std::unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // More attempts to simulate more partial result sequences. + for (uint32_t attempt = 0; attempt < 25; attempt++) { + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. + const int num_reads = rnd.Uniform(512) + 1; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_result", [&](void* arg) { + if (attempt > 5) { + // Improve partial result rates in second half of the run to + // cover the case of repeated partial results. + int odd = (attempt < 15) ? num_reads / 2 : 4; + // No failure in first several attempts. + size_t& bytes_read = *static_cast(arg); + if (rnd.OneIn(odd)) { + bytes_read = 0; + } else if (rnd.OneIn(odd / 2)) { + bytes_read = static_cast( + rnd.Uniform(static_cast(bytes_read))); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Generate (offset, len) pairs + std::set start_offsets; + for (int i = 0; i < num_reads; i++) { + int rnd_off; + // No repeat offsets. + while (start_offsets.find(rnd_off = rnd.Uniform(81920)) != start_offsets.end()) {} + start_offsets.insert(rnd_off); + } + std::vector offsets; + std::vector lens; + // std::set already sorted the offsets. 
+ for (int so: start_offsets) { + offsets.push_back(so); + } + for (size_t i = 0; i + 1 < offsets.size(); i++) { + lens.push_back(static_cast(rnd.Uniform(static_cast(offsets[i + 1] - offsets[i])) + 1)); + } + lens.push_back(static_cast(rnd.Uniform(static_cast(kTotalSize - offsets.back())) + 1)); + ASSERT_EQ(num_reads, lens.size()); + + // Create requests + std::vector scratches; + scratches.reserve(num_reads); + std::vector reqs(num_reads); + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches.emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast(scratches.back().data()); + } + + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + + // Validate results + for (int i = 0; i < num_reads; ++i) { + ASSERT_OK(reqs[i].status); + ASSERT_EQ(Slice(expected_data.data() + offsets[i], lens[i]).ToString(true), + reqs[i].result.ToString(true)); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +#if defined(ROCKSDB_IOURING_PRESENT) +void GenerateFilesAndRequest(Env* env, const std::string& fname, + std::vector* ret_reqs, + std::vector* scratches) { + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. + { + std::unique_ptr wfile; + ASSERT_OK(env->NewWritableFile(fname, &wfile, EnvOptions())); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. 
+ const int num_reads = 3; + std::vector offsets = {10000, 20000, 30000}; + std::vector lens = {3000, 200, 100}; + + // Create requests + scratches->reserve(num_reads); + std::vector& reqs = *ret_reqs; + reqs.resize(num_reads); + for (int i = 0; i < num_reads; ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches->emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast(scratches->back().data()); + } +} + +TEST_F(EnvPosixTest, MultiReadIOUringError) { + // In this test we don't do aligned read, so we can't do direct I/O. + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_wait_cqe_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", + [&](void* arg) { + if (!io_uring_wait_cqe_called) { + io_uring_wait_cqe_called = true; + ssize_t& ret = *(static_cast(arg)); + ret = 1; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_wait_cqe_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(EnvPosixTest, MultiReadIOUringError2) { + // In this test we don't do aligned read, so we can't do direct I/O. 
+ EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_submit_and_wait_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + [&](void* arg) { + io_uring_submit_and_wait_called = true; + ssize_t* ret = static_cast(arg); + (*ret)--; + }); + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + [&](void* arg) { + struct io_uring* iu = static_cast(arg); + struct io_uring_cqe* cqe; + assert(io_uring_wait_cqe(iu, &cqe) == 0); + io_uring_cqe_seen(iu, cqe); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_submit_and_wait_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // ROCKSDB_IOURING_PRESENT + // Only works in linux platforms #ifdef OS_WIN TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) { @@ -1398,7 +1716,7 @@ auto data = NewAligned(kStrSize, 'A'); Slice str(data.get(), kStrSize); srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize); - srcfile->Append(str); + ASSERT_OK(srcfile->Append(str)); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 1UL); @@ -1407,7 +1725,7 @@ auto buf_ptr = NewAligned(block_size, ' '); Slice buf(buf_ptr.get(), block_size); srcfile->PrepareWrite(srcfile->GetFileSize(), block_size); - srcfile->Append(buf); + ASSERT_OK(srcfile->Append(buf)); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); 
ASSERT_EQ(last_allocated_block, 2UL); } @@ -1417,7 +1735,7 @@ auto buf_ptr = NewAligned(block_size * 5, ' '); Slice buf = Slice(buf_ptr.get(), block_size * 5); srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); - srcfile->Append(buf); + ASSERT_OK(srcfile->Append(buf)); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 7UL); } @@ -1433,9 +1751,10 @@ const int kNumChildren = 10; std::string data; + std::string test_base_dir = test::PerThreadDBPath(env_, "env_test_chr_attr"); + env_->CreateDir(test_base_dir).PermitUncheckedError(); for (int i = 0; i < kNumChildren; ++i) { - const std::string path = - test::TmpDir(env_) + "/" + "testfile_" + std::to_string(i); + const std::string path = test_base_dir + "/testfile_" + std::to_string(i); std::unique_ptr file; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) if (soptions.use_direct_writes) { @@ -1449,15 +1768,15 @@ ASSERT_OK(env_->NewWritableFile(path, &file, soptions)); auto buf_ptr = NewAligned(data.size(), 'T'); Slice buf(buf_ptr.get(), data.size()); - file->Append(buf); + ASSERT_OK(file->Append(buf)); data.append(std::string(4096, 'T')); } std::vector file_attrs; - ASSERT_OK(env_->GetChildrenFileAttributes(test::TmpDir(env_), &file_attrs)); + ASSERT_OK(env_->GetChildrenFileAttributes(test_base_dir, &file_attrs)); for (int i = 0; i < kNumChildren; ++i) { const std::string name = "testfile_" + std::to_string(i); - const std::string path = test::TmpDir(env_) + "/" + name; + const std::string path = test_base_dir + "/" + name; auto file_attrs_iter = std::find_if( file_attrs.begin(), file_attrs.end(), @@ -1490,12 +1809,26 @@ return Status::OK(); } + Status Append( + const Slice& /*data*/, + const DataVerificationInfo& /* verification_info */) override { + inc(1); + return Status::OK(); + } + Status PositionedAppend(const Slice& /*data*/, uint64_t /*offset*/) override { inc(2); 
return Status::OK(); } + Status PositionedAppend( + const Slice& /*data*/, uint64_t /*offset*/, + const DataVerificationInfo& /* verification_info */) override { + inc(2); + return Status::OK(); + } + Status Truncate(uint64_t /*size*/) override { inc(3); return Status::OK(); @@ -1600,13 +1933,13 @@ { Base b(&step); Wrapper w(&b); - w.Append(Slice()); - w.PositionedAppend(Slice(), 0); - w.Truncate(0); - w.Close(); - w.Flush(); - w.Sync(); - w.Fsync(); + ASSERT_OK(w.Append(Slice())); + ASSERT_OK(w.PositionedAppend(Slice(), 0)); + ASSERT_OK(w.Truncate(0)); + ASSERT_OK(w.Close()); + ASSERT_OK(w.Flush()); + ASSERT_OK(w.Sync()); + ASSERT_OK(w.Fsync()); w.IsSyncThreadSafe(); w.use_direct_io(); w.GetRequiredBufferAlignment(); @@ -1618,10 +1951,10 @@ w.SetPreallocationBlockSize(0); w.GetPreallocationStatus(nullptr, nullptr); w.GetUniqueId(nullptr, 0); - w.InvalidateCache(0, 0); - w.RangeSync(0, 0); + ASSERT_OK(w.InvalidateCache(0, 0)); + ASSERT_OK(w.RangeSync(0, 0)); w.PrepareWrite(0, 0); - w.Allocate(0, 0); + ASSERT_OK(w.Allocate(0, 0)); } EXPECT_EQ(24, step); @@ -1630,7 +1963,7 @@ TEST_P(EnvPosixTestWithParam, PosixRandomRWFile) { const std::string path = test::PerThreadDBPath(env_, "random_rw_file"); - env_->DeleteFile(path); + env_->DeleteFile(path).PermitUncheckedError(); std::unique_ptr file; @@ -1680,7 +2013,7 @@ ASSERT_EQ(read_res.ToString(), "XXXQ"); // Close file and reopen it - file->Close(); + ASSERT_OK(file->Close()); ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions())); ASSERT_OK(file->Read(0, 9, &read_res, buf)); @@ -1697,7 +2030,7 @@ ASSERT_EQ(read_res.ToString(), "ABXXTTTTTT"); // Clean up - env_->DeleteFile(path); + ASSERT_OK(env_->DeleteFile(path)); } class RandomRWFileWithMirrorString { @@ -1757,7 +2090,7 @@ TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) { const std::string path = test::PerThreadDBPath(env_, "random_rw_file_rand"); - env_->DeleteFile(path); + env_->DeleteFile(path).PermitUncheckedError(); std::unique_ptr file; @@ 
-1779,7 +2112,7 @@ std::string buf; for (int i = 0; i < 10000; i++) { // Genrate random data - test::RandomString(&rnd, 10, &buf); + buf = rnd.RandomString(10); // Pick random offset for write size_t write_off = rnd.Next() % 1000; @@ -1798,35 +2131,36 @@ } // clean up - env_->DeleteFile(path); + ASSERT_OK(env_->DeleteFile(path)); } class TestEnv : public EnvWrapper { public: explicit TestEnv() : EnvWrapper(Env::Default()), close_count(0) { } - - class TestLogger : public Logger { - public: - using Logger::Logv; - TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } - ~TestLogger() override { - if (!closed_) { - CloseHelper(); + const char* Name() const override { return "TestEnv"; } + class TestLogger : public Logger { + public: + using Logger::Logv; + explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + Status s = CloseHelper(); + s.PermitUncheckedError(); + } } - } - void Logv(const char* /*format*/, va_list /*ap*/) override{}; + void Logv(const char* /*format*/, va_list /*ap*/) override {} - protected: - Status CloseImpl() override { return CloseHelper(); } + protected: + Status CloseImpl() override { return CloseHelper(); } - private: - Status CloseHelper() { - env->CloseCountInc();; - return Status::OK(); - } - TestEnv* env; - }; + private: + Status CloseHelper() { + env->CloseCountInc(); + return Status::OK(); + } + TestEnv* env; + }; void CloseCountInc() { close_count++; } @@ -1842,7 +2176,13 @@ int close_count; }; -class EnvTest : public testing::Test {}; +class EnvTest : public testing::Test { + public: + EnvTest() : test_directory_(test::PerThreadDBPath("env_test")) {} + + protected: + const std::string test_directory_; +}; TEST_F(EnvTest, Close) { TestEnv* env = new TestEnv(); @@ -1850,23 +2190,43 @@ Status s; s = env->NewLogger("", &logger); - ASSERT_EQ(s, Status::OK()); - logger.get()->Close(); + ASSERT_OK(s); + ASSERT_OK(logger.get()->Close()); ASSERT_EQ(env->GetCloseCount(), 1); // 
Call Close() again. CloseHelper() should not be called again - logger.get()->Close(); + ASSERT_OK(logger.get()->Close()); ASSERT_EQ(env->GetCloseCount(), 1); logger.reset(); ASSERT_EQ(env->GetCloseCount(), 1); s = env->NewLogger("", &logger); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); logger.reset(); ASSERT_EQ(env->GetCloseCount(), 2); delete env; } +class LogvWithInfoLogLevelLogger : public Logger { + public: + using Logger::Logv; + void Logv(const InfoLogLevel /* log_level */, const char* /* format */, + va_list /* ap */) override {} +}; + +TEST_F(EnvTest, LogvWithInfoLogLevel) { + // Verifies the log functions work on a `Logger` that only overrides the + // `Logv()` overload including `InfoLogLevel`. + const std::string kSampleMessage("sample log message"); + LogvWithInfoLogLevelLogger logger; + ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); +} + INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(Env::Default(), false))); @@ -1877,19 +2237,845 @@ #endif // !defined(ROCKSDB_LITE) #if !defined(ROCKSDB_LITE) && !defined(OS_WIN) -static std::unique_ptr chroot_env( - NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); -INSTANTIATE_TEST_CASE_P( - ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, - ::testing::Values(std::pair(chroot_env.get(), false))); -INSTANTIATE_TEST_CASE_P( - ChrootEnvWithDirectIO, EnvPosixTestWithParam, - ::testing::Values(std::pair(chroot_env.get(), true))); +static Env* GetChrootEnv() { + static std::unique_ptr chroot_env( + NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); + return chroot_env.get(); +} +INSTANTIATE_TEST_CASE_P(ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, + 
::testing::Values(std::pair(GetChrootEnv(), + false))); +INSTANTIATE_TEST_CASE_P(ChrootEnvWithDirectIO, EnvPosixTestWithParam, + ::testing::Values(std::pair(GetChrootEnv(), + true))); #endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) +class EnvFSTestWithParam + : public ::testing::Test, + public ::testing::WithParamInterface> { + public: + EnvFSTestWithParam() { + bool env_non_null = std::get<0>(GetParam()); + bool env_default = std::get<1>(GetParam()); + bool fs_default = std::get<2>(GetParam()); + + env_ = env_non_null ? (env_default ? Env::Default() : nullptr) : nullptr; + fs_ = fs_default + ? FileSystem::Default() + : std::make_shared(FileSystem::Default()); + if (env_non_null && env_default && !fs_default) { + env_ptr_ = NewCompositeEnv(fs_); + } + if (env_non_null && !env_default && fs_default) { + env_ptr_ = std::unique_ptr(new FaultInjectionTestEnv(Env::Default())); + fs_.reset(); + } + if (env_non_null && !env_default && !fs_default) { + env_ptr_.reset(new FaultInjectionTestEnv(Env::Default())); + composite_env_ptr_.reset(new CompositeEnvWrapper(env_ptr_.get(), fs_)); + env_ = composite_env_ptr_.get(); + } else { + env_ = env_ptr_.get(); + } + + dbname1_ = test::PerThreadDBPath("env_fs_test1"); + dbname2_ = test::PerThreadDBPath("env_fs_test2"); + } + + ~EnvFSTestWithParam() = default; + + Env* env_; + std::unique_ptr env_ptr_; + std::unique_ptr composite_env_ptr_; + std::shared_ptr fs_; + std::string dbname1_; + std::string dbname2_; +}; + +TEST_P(EnvFSTestWithParam, OptionsTest) { + Options opts; + opts.env = env_; + opts.create_if_missing = true; + std::string dbname = dbname1_; + + if (env_) { + if (fs_) { + ASSERT_EQ(fs_.get(), env_->GetFileSystem().get()); + } else { + ASSERT_NE(FileSystem::Default().get(), env_->GetFileSystem().get()); + } + } + for (int i = 0; i < 2; ++i) { + DB* db; + Status s = DB::Open(opts, dbname, &db); + ASSERT_OK(s); + + WriteOptions wo; + ASSERT_OK(db->Put(wo, "a", "a")); + ASSERT_OK(db->Flush(FlushOptions())); + 
ASSERT_OK(db->Put(wo, "b", "b")); + ASSERT_OK(db->Flush(FlushOptions())); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::string val; + ASSERT_OK(db->Get(ReadOptions(), "a", &val)); + ASSERT_EQ("a", val); + ASSERT_OK(db->Get(ReadOptions(), "b", &val)); + ASSERT_EQ("b", val); + + ASSERT_OK(db->Close()); + delete db; + ASSERT_OK(DestroyDB(dbname, opts)); + + dbname = dbname2_; + } +} + +// The parameters are as follows - +// 1. True means Options::env is non-null, false means null +// 2. True means use Env::Default, false means custom +// 3. True means use FileSystem::Default, false means custom +INSTANTIATE_TEST_CASE_P( + EnvFSTest, EnvFSTestWithParam, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); +// This test ensures that default Env and those allocated by +// NewCompositeEnv() all share the same threadpool +TEST_F(EnvTest, MultipleCompositeEnv) { + std::shared_ptr fs1 = + std::make_shared(FileSystem::Default()); + std::shared_ptr fs2 = + std::make_shared(FileSystem::Default()); + std::unique_ptr env1 = NewCompositeEnv(fs1); + std::unique_ptr env2 = NewCompositeEnv(fs2); + Env::Default()->SetBackgroundThreads(8, Env::HIGH); + Env::Default()->SetBackgroundThreads(16, Env::LOW); + ASSERT_EQ(env1->GetBackgroundThreads(Env::LOW), 16); + ASSERT_EQ(env1->GetBackgroundThreads(Env::HIGH), 8); + ASSERT_EQ(env2->GetBackgroundThreads(Env::LOW), 16); + ASSERT_EQ(env2->GetBackgroundThreads(Env::HIGH), 8); +} + +TEST_F(EnvTest, IsDirectory) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + ASSERT_OK(s); + const std::string test_sub_dir = test_directory_ + "sub1"; + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(Env::Default()->CreateDirIfMissing(test_sub_dir)); + bool is_dir = false; + ASSERT_OK(Env::Default()->IsDirectory(test_sub_dir, &is_dir)); + ASSERT_TRUE(is_dir); + { + std::unique_ptr wfile; + s = Env::Default()->GetFileSystem()->NewWritableFile( + 
test_file_path, FileOptions(), &wfile, /*dbg=*/nullptr); + ASSERT_OK(s); + std::unique_ptr fwriter; + fwriter.reset(new WritableFileWriter(std::move(wfile), test_file_path, + FileOptions(), + SystemClock::Default().get())); + constexpr char buf[] = "test"; + s = fwriter->Append(buf); + ASSERT_OK(s); + } + ASSERT_OK(Env::Default()->IsDirectory(test_file_path, &is_dir)); + ASSERT_FALSE(is_dir); +} + +TEST_F(EnvTest, EnvWriteVerificationTest) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(s); + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + std::unique_ptr file; + s = fault_fs_env->NewWritableFile(test_file_path, &file, EnvOptions()); + ASSERT_OK(s); + + DataVerificationInfo v_info; + std::string test_data = "test"; + std::string checksum; + uint32_t v_crc32c = crc32c::Extend(0, test_data.c_str(), test_data.size()); + PutFixed32(&checksum, v_crc32c); + v_info.checksum = Slice(checksum); + s = file->Append(Slice(test_data), v_info); + ASSERT_OK(s); +} + +class CreateEnvTest : public testing::Test { + public: + CreateEnvTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = false; + } + ConfigOptions config_options_; +}; + +#ifndef ROCKSDB_LITE +TEST_F(CreateEnvTest, LoadCTRProvider) { + config_options_.invoke_prepare_options = false; + std::string CTR = CTREncryptionProvider::kClassName(); + std::shared_ptr provider; + // Test a provider with no cipher + ASSERT_OK( + EncryptionProvider::CreateFromString(config_options_, CTR, &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_NOK(provider->PrepareOptions(config_options_)); + ASSERT_NOK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + auto cipher = 
provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_EQ(cipher->get(), nullptr); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, + CTR + "://test", &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_OK(provider->PrepareOptions(config_options_)); + ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + cipher = provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "1://test", + &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + ASSERT_OK(provider->PrepareOptions(config_options_)); + ASSERT_OK(provider->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + cipher = provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); + + ASSERT_OK(EncryptionProvider::CreateFromString( + config_options_, "id=" + CTR + "; cipher=ROT13", &provider)); + ASSERT_NE(provider, nullptr); + ASSERT_EQ(provider->Name(), CTR); + cipher = provider->GetOptions>("Cipher"); + ASSERT_NE(cipher, nullptr); + ASSERT_NE(cipher->get(), nullptr); + ASSERT_STREQ(cipher->get()->Name(), "ROT13"); + provider.reset(); +} + +TEST_F(CreateEnvTest, LoadROT13Cipher) { + std::shared_ptr cipher; + // Test a provider with no cipher + ASSERT_OK(BlockCipher::CreateFromString(config_options_, "ROT13", &cipher)); + ASSERT_NE(cipher, nullptr); + ASSERT_STREQ(cipher->Name(), "ROT13"); +} +#endif // ROCKSDB_LITE + +TEST_F(CreateEnvTest, CreateDefaultSystemClock) { + std::shared_ptr clock, copy; + ASSERT_OK(SystemClock::CreateFromString(config_options_, + SystemClock::kDefaultName(), &clock)); + ASSERT_NE(clock, nullptr); + ASSERT_EQ(clock, SystemClock::Default()); +#ifndef 
ROCKSDB_LITE + std::string opts_str = clock->ToString(config_options_); + std::string mismatch; + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(clock->AreEquivalent(config_options_, copy.get(), &mismatch)); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +TEST_F(CreateEnvTest, CreateMockSystemClock) { + std::shared_ptr mock, copy; + + config_options_.registry->AddLibrary("test")->AddFactory( + MockSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSystemClock(nullptr)); + return guard->get(); + }); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, EmulatedSystemClock::kClassName(), &mock)); + ASSERT_NE(mock, nullptr); + ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName()); + ASSERT_EQ(mock->Inner(), SystemClock::Default().get()); + std::string opts_str = mock->ToString(config_options_); + std::string mismatch; + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch)); + + std::string id = std::string("id=") + EmulatedSystemClock::kClassName() + + ";target=" + MockSystemClock::kClassName(); + + ASSERT_OK(SystemClock::CreateFromString(config_options_, id, &mock)); + ASSERT_NE(mock, nullptr); + ASSERT_STREQ(mock->Name(), EmulatedSystemClock::kClassName()); + ASSERT_NE(mock->Inner(), nullptr); + ASSERT_STREQ(mock->Inner()->Name(), MockSystemClock::kClassName()); + ASSERT_EQ(mock->Inner()->Inner(), SystemClock::Default().get()); + opts_str = mock->ToString(config_options_); + ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(mock->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, EmulatedSystemClock::kClassName(), &mock)); +} + +TEST_F(CreateEnvTest, CreateReadOnlyFileSystem) { + std::shared_ptr fs, copy; + + 
ASSERT_OK(FileSystem::CreateFromString( + config_options_, ReadOnlyFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + std::string("id=") + ReadOnlyFileSystem::kClassName() + + "; target=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + opts_str = fs->ToString(config_options_); + ASSERT_STREQ(fs->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CreateEnvTest, CreateTimedFileSystem) { + std::shared_ptr fs, copy; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, + TimedFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + std::string("id=") + TimedFileSystem::kClassName() + + "; target=" + ReadOnlyFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + opts_str = fs->ToString(config_options_); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + 
ASSERT_STREQ(fs->Inner()->Name(), ReadOnlyFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} +#ifndef OS_WIN +TEST_F(CreateEnvTest, CreateChrootFileSystem) { + std::shared_ptr fs, copy; + auto tmp_dir = test::TmpDir(Env::Default()); + // The Chroot FileSystem has a required "chroot_dir" option. + ASSERT_NOK(FileSystem::CreateFromString(config_options_, + ChrootFileSystem::kClassName(), &fs)); + + // ChrootFileSystem fails with an invalid directory + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, + std::string("chroot_dir=/No/Such/Directory; id=") + + ChrootFileSystem::kClassName(), + &fs)); + std::string chroot_opts = std::string("chroot_dir=") + tmp_dir + + std::string("; id=") + + ChrootFileSystem::kClassName(); + + // Create a valid ChrootFileSystem with an inner Default + ASSERT_OK(FileSystem::CreateFromString(config_options_, chroot_opts, &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + // Create a valid ChrootFileSystem with an inner TimedFileSystem + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + chroot_opts + "; target=" + TimedFileSystem::kClassName(), &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), ChrootFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, 
©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + + // Create a TimedFileSystem with an inner ChrootFileSystem + ASSERT_OK(FileSystem::CreateFromString( + config_options_, + "target={" + chroot_opts + "}; id=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), TimedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), ChrootFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} +#endif // OS_WIN + +TEST_F(CreateEnvTest, CreateEncryptedFileSystem) { + std::shared_ptr fs, copy; + + std::string base_opts = + std::string("provider=1://test; id=") + EncryptedFileSystem::kClassName(); + // The EncryptedFileSystem requires a "provider" option. + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, EncryptedFileSystem::kClassName(), &fs)); + + ASSERT_OK(FileSystem::CreateFromString(config_options_, base_opts, &fs)); + + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner(), FileSystem::Default().get()); + std::string opts_str = fs->ToString(config_options_); + std::string mismatch; + ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(FileSystem::CreateFromString( + config_options_, base_opts + "; target=" + TimedFileSystem::kClassName(), + &fs)); + ASSERT_NE(fs, nullptr); + ASSERT_STREQ(fs->Name(), EncryptedFileSystem::kClassName()); + ASSERT_NE(fs->Inner(), nullptr); + ASSERT_STREQ(fs->Inner()->Name(), TimedFileSystem::kClassName()); + ASSERT_EQ(fs->Inner()->Inner(), FileSystem::Default().get()); + opts_str = fs->ToString(config_options_); + 
ASSERT_OK(FileSystem::CreateFromString(config_options_, opts_str, ©)); + ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +#endif // ROCKSDB_LITE + +namespace { + +constexpr size_t kThreads = 8; +constexpr size_t kIdsPerThread = 1000; + +// This is a mini-stress test to check for duplicates in functions like +// GenerateUniqueId() +template > +struct NoDuplicateMiniStressTest { + std::unordered_set ids; + std::mutex mutex; + Env* env; + + NoDuplicateMiniStressTest() { env = Env::Default(); } + + virtual ~NoDuplicateMiniStressTest() {} + + void Run() { + std::array threads; + for (size_t i = 0; i < kThreads; ++i) { + threads[i] = std::thread([&]() { ThreadFn(); }); + } + for (auto& thread : threads) { + thread.join(); + } + // All must be unique + ASSERT_EQ(ids.size(), kThreads * kIdsPerThread); + } + + void ThreadFn() { + std::array my_ids; + // Generate in parallel threads as fast as possible + for (size_t i = 0; i < kIdsPerThread; ++i) { + my_ids[i] = Generate(); + } + // Now collate + std::lock_guard lock(mutex); + for (auto& id : my_ids) { + ids.insert(id); + } + } + + virtual IdType Generate() = 0; +}; + +void VerifyRfcUuids(const std::unordered_set& uuids) { + if (uuids.empty()) { + return; + } +} + +using uint64_pair_t = std::pair; +struct HashUint64Pair { + std::size_t operator()( + std::pair const& u) const noexcept { + // Assume suitable distribution already + return static_cast(u.first ^ u.second); + } +}; + +} // namespace + +TEST_F(EnvTest, GenerateUniqueId) { + struct MyStressTest : public NoDuplicateMiniStressTest { + std::string Generate() override { return env->GenerateUniqueId(); } + }; + + MyStressTest t; + t.Run(); + + // Basically verify RFC-4122 format + for (auto& uuid : t.ids) { + ASSERT_EQ(36U, uuid.size()); + ASSERT_EQ('-', uuid[8]); + ASSERT_EQ('-', uuid[13]); + ASSERT_EQ('-', uuid[18]); + ASSERT_EQ('-', uuid[23]); + } +} + +TEST_F(EnvTest, GenerateDbSessionId) { + struct MyStressTest : public 
NoDuplicateMiniStressTest { + std::string Generate() override { return DBImpl::GenerateDbSessionId(env); } + }; + + MyStressTest t; + t.Run(); + + // Basically verify session ID + for (auto& id : t.ids) { + ASSERT_EQ(20U, id.size()); + } +} + +constexpr bool kRequirePortGenerateRfcUuid = +#if defined(OS_LINUX) || defined(OS_ANDROID) || defined(OS_WIN) + true; +#else + false; +#endif + +TEST_F(EnvTest, PortGenerateRfcUuid) { + if (!kRequirePortGenerateRfcUuid) { + ROCKSDB_GTEST_SKIP("Not supported/expected on this platform"); + return; + } + struct MyStressTest : public NoDuplicateMiniStressTest { + std::string Generate() override { + std::string u; + assert(port::GenerateRfcUuid(&u)); + return u; + } + }; + + MyStressTest t; + t.Run(); + + // Extra verification on versions and variants + VerifyRfcUuids(t.ids); +} + +// Test the atomic, linear generation of GenerateRawUuid +TEST_F(EnvTest, GenerateRawUniqueId) { + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + GenerateRawUniqueId(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +// Test that each entropy source ("track") is at least adequate +TEST_F(EnvTest, GenerateRawUniqueIdTrackPortUuidOnly) { + if (!kRequirePortGenerateRfcUuid) { + ROCKSDB_GTEST_SKIP("Not supported/expected on this platform"); + return; + } + + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, false, true, true); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, GenerateRawUniqueIdTrackEnvDetailsOnly) { + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, true, false, true); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, GenerateRawUniqueIdTrackRandomDeviceOnly) { + struct 
MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + TEST_GenerateRawUniqueId(&p.first, &p.second, true, true, false); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) { + // Must be thread safe and usable as a static + static SemiStructuredUniqueIdGen gen; + + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + gen.GenerateNext(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, FailureToCreateLockFile) { + auto env = Env::Default(); + auto fs = env->GetFileSystem(); + std::string dir = test::PerThreadDBPath(env, "lockdir"); + std::string file = dir + "/lockfile"; + + // Ensure directory doesn't exist + ASSERT_OK(DestroyDir(env, dir)); + + // Make sure that we can acquire a file lock after the first attempt fails + FileLock* lock = nullptr; + ASSERT_NOK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr)); + ASSERT_FALSE(lock); + + ASSERT_OK(fs->CreateDir(dir, IOOptions(), /*dbg*/ nullptr)); + ASSERT_OK(fs->LockFile(file, IOOptions(), &lock, /*dbg*/ nullptr)); + ASSERT_OK(fs->UnlockFile(lock, IOOptions(), /*dbg*/ nullptr)); + + // Clean up + ASSERT_OK(DestroyDir(env, dir)); +} + +TEST_F(EnvTest, CreateDefaultEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + + std::shared_ptr guard; + Env* env = nullptr; + ASSERT_OK(Env::CreateFromString(options, "", &env)); + ASSERT_EQ(env, Env::Default()); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env)); + ASSERT_EQ(env, Env::Default()); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, "", &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, nullptr); + + env = nullptr; + ASSERT_OK(Env::CreateFromString(options, Env::kDefaultName(), &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, 
nullptr); + +#ifndef ROCKSDB_LITE + std::string opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_EQ(env, Env::Default()); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_EQ(env, Env::Default()); + ASSERT_EQ(guard, nullptr); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +namespace { +class WrappedEnv : public EnvWrapper { + public: + explicit WrappedEnv(Env* t) : EnvWrapper(t) {} + explicit WrappedEnv(const std::shared_ptr& t) : EnvWrapper(t) {} + static const char* kClassName() { return "WrappedEnv"; } + const char* Name() const override { return kClassName(); } + static void Register(ObjectLibrary& lib, const std::string& /*arg*/) { + lib.AddFactory( + WrappedEnv::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new WrappedEnv(nullptr)); + return guard->get(); + }); + } +}; +} // namespace +TEST_F(EnvTest, CreateMockEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + std::shared_ptr guard, copy; + std::string opt_str; + + Env* env = nullptr; + ASSERT_NOK(Env::CreateFromString(options, MockEnv::kClassName(), &env)); + ASSERT_OK( + Env::CreateFromString(options, MockEnv::kClassName(), &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + std::string mismatch; + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + guard.reset(MockEnv::Create(Env::Default(), SystemClock::Default())); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + std::unique_ptr wrapped_env(new WrappedEnv(Env::Default())); + guard.reset(MockEnv::Create(wrapped_env.get(), SystemClock::Default())); + opt_str = guard->ToString(options); + 
ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + opt_str = copy->ToString(options); +} + +TEST_F(EnvTest, CreateWrappedEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + Env* env = nullptr; + std::shared_ptr guard, copy; + std::string opt_str; + std::string mismatch; + + ASSERT_NOK(Env::CreateFromString(options, WrappedEnv::kClassName(), &env)); + ASSERT_OK( + Env::CreateFromString(options, WrappedEnv::kClassName(), &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_FALSE(guard->AreEquivalent(options, Env::Default(), &mismatch)); + + opt_str = env->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + guard.reset(new WrappedEnv(std::make_shared(Env::Default()))); + ASSERT_NE(guard.get(), env); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + guard.reset(new WrappedEnv(std::make_shared( + std::make_shared(Env::Default())))); + ASSERT_NE(guard.get(), env); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(copy, guard); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); +} + +TEST_F(EnvTest, CreateCompositeEnv) { + ConfigOptions options; + options.ignore_unsupported_options = false; + std::shared_ptr guard, copy; + Env* env = nullptr; + std::string mismatch, opt_str; + + WrappedEnv::Register(*(options.registry->AddLibrary("test")), ""); + std::unique_ptr base(NewCompositeEnv(FileSystem::Default())); + std::unique_ptr wrapped(new WrappedEnv(Env::Default())); + std::shared_ptr timed_fs = + std::make_shared(FileSystem::Default()); + std::shared_ptr clock = + 
std::make_shared(SystemClock::Default()); + + opt_str = base->ToString(options); + ASSERT_NOK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_EQ(env->GetFileSystem(), FileSystem::Default()); + ASSERT_EQ(env->GetSystemClock(), SystemClock::Default()); + + base = NewCompositeEnv(timed_fs); + opt_str = base->ToString(options); + ASSERT_NOK(Env::CreateFromString(options, opt_str, &env)); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_NE(env->GetFileSystem(), FileSystem::Default()); + ASSERT_EQ(env->GetSystemClock(), SystemClock::Default()); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), clock)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); + + env = nullptr; + guard.reset(new CompositeEnvWrapper(wrapped.get(), timed_fs, clock)); + opt_str = guard->ToString(options); + ASSERT_OK(Env::CreateFromString(options, opt_str, &env, ©)); + ASSERT_NE(env, nullptr); + ASSERT_NE(env, Env::Default()); + ASSERT_TRUE(guard->AreEquivalent(options, copy.get(), &mismatch)); +} +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,20 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "env/composite_env_wrapper.h" #include "rocksdb/file_system.h" + +#include "env/composite_env_wrapper.h" +#include "env/env_chroot.h" +#include "env/env_encryption_ctr.h" +#include "env/fs_readonly.h" +#include "env/mock_env.h" #include "options/db_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" +#include "utilities/env_timed.h" namespace ROCKSDB_NAMESPACE { @@ -16,14 +26,85 @@ Status FileSystem::Load(const std::string& value, std::shared_ptr* result) { - Status s; + return CreateFromString(ConfigOptions(), value, result); +} + #ifndef ROCKSDB_LITE - s = ObjectRegistry::NewInstance()->NewSharedObject(value, result); -#else - (void)result; - s = Status::NotSupported("Cannot load FileSystem in LITE mode: ", value); -#endif - return s; +static int RegisterBuiltinFileSystems(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + TimedFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TimedFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory( + ReadOnlyFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new ReadOnlyFileSystem(nullptr)); + return guard->get(); + }); + library.AddFactory( + EncryptedFileSystem::kClassName(), + [](const 
std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + Status s = NewEncryptedFileSystemImpl(nullptr, nullptr, guard); + if (!s.ok()) { + *errmsg = s.ToString(); + } + return guard->get(); + }); + library.AddFactory( + MockFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new MockFileSystem(SystemClock::Default())); + return guard->get(); + }); +#ifndef OS_WIN + library.AddFactory( + ChrootFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new ChrootFileSystem(nullptr, "")); + return guard->get(); + }); +#endif // OS_WIN + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status FileSystem::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + auto default_fs = FileSystem::Default(); + if (default_fs->IsInstanceOf(value)) { + *result = default_fs; + return Status::OK(); + } else { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinFileSystems(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject(config_options, value, nullptr, result); + } +} + +IOStatus FileSystem::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) { + IOStatus s = RenameFile(old_fname, fname, opts.io_options, dbg); + if (!s.ok()) { + return s; + } + return NewWritableFile(fname, opts, result, dbg); } FileOptions FileSystem::OptimizeForLogRead( @@ -71,12 +152,39 @@ return optimized_file_options; } -Status ReadFileToString(FileSystem* fs, const std::string& fname, - std::string* data) { +FileOptions FileSystem::OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + 
FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + +IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, bool should_sync) { + std::unique_ptr file; + EnvOptions soptions; + IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr); + if (!s.ok()) { + return s; + } + s = file->Append(data, IOOptions(), nullptr); + if (s.ok() && should_sync) { + s = file->Sync(IOOptions(), nullptr); + } + if (!s.ok()) { + fs->DeleteFile(fname, IOOptions(), nullptr); + } + return s; +} + +IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data) { FileOptions soptions; data->clear(); std::unique_ptr file; - Status s = fs->NewSequentialFile(fname, soptions, &file, nullptr); + IOStatus s = status_to_io_status( + fs->NewSequentialFile(fname, soptions, &file, nullptr)); if (!s.ok()) { return s; } @@ -98,13 +206,58 @@ return s; } -#ifdef OS_WIN -std::shared_ptr FileSystem::Default() { - static LegacyFileSystemWrapper default_fs(Env::Default()); - static std::shared_ptr default_fs_ptr( - &default_fs, [](LegacyFileSystemWrapper*) {}); - return default_fs_ptr; +namespace { +static std::unordered_map fs_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, +#endif // ROCKSDB_LITE +}; +} // namespace +FileSystemWrapper::FileSystemWrapper(const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", &target_, &fs_wrapper_type_info); +} + +Status FileSystemWrapper::PrepareOptions(const ConfigOptions& options) { + if (target_ == nullptr) { + target_ = FileSystem::Default(); + } + return FileSystem::PrepareOptions(options); } -#endif +#ifndef ROCKSDB_LITE +std::string FileSystemWrapper::SerializeOptions( + const ConfigOptions& config_options, const std::string& header) const { + auto parent = 
FileSystem::SerializeOptions(config_options, ""); + if (config_options.IsShallow() || target_ == nullptr || + target_->IsInstanceOf(FileSystem::kDefaultName())) { + return parent; + } else { + std::string result = header; + if (!StartsWith(parent, OptionTypeInfo::kIdPropName())) { + result.append(OptionTypeInfo::kIdPropName()).append("="); + } + result.append(parent); + if (!EndsWith(result, config_options.delimiter)) { + result.append(config_options.delimiter); + } + result.append("target=").append(target_->ToString(config_options)); + return result; + } +} +#endif // ROCKSDB_LITE + +DirFsyncOptions::DirFsyncOptions() { reason = kDefault; } + +DirFsyncOptions::DirFsyncOptions(std::string file_renamed_new_name) { + reason = kFileRenamed; + renamed_new_name = file_renamed_new_name; +} + +DirFsyncOptions::DirFsyncOptions(FsyncReason fsync_reason) { + assert(fsync_reason != kFileRenamed); + reason = fsync_reason; +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,519 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "env/file_system_tracer.h" + +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus FileSystemTracingWrapper::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, 
IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& file_opts, std::unique_ptr* result, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewDirectory( + const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewDirectory(name, io_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, 
s.ToString(), + name.substr(name.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir, + const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetChildren(dir, io_opts, r, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dir.substr(dir.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteFile(fname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::CreateDirIfMissing( + const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->CreateDirIfMissing(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), 
TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->DeleteDir(dirname, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->GetFileSize(fname, options, file_size, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, elapsed, + s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname, + size_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(fname, size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1), size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::Read(size_t n, + const IOOptions& options, + Slice* 
result, char* scratch, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + offset); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSSequentialFileTracingWrapper::PositionedRead( + uint64_t offset, size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->PositionedRead(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, 
result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, + size_t num_reqs, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t latency = elapsed; + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + for (size_t i = 0; i < num_reqs; i++) { + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, latency, + reqs[i].status.ToString(), file_name_, reqs[i].len, reqs[i].offset); + io_tracer_->WriteIOOp(io_record, dbg); + } + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Prefetch(offset, n, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << 
IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + static_cast(offset)); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSWritableFileTracingWrapper::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Append(data, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSWritableFileTracingWrapper::PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->PositionedAppend(data, offset, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, size, + 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); + 
return s; +} + +IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + uint64_t file_size = target()->GetFileSize(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, "OK", file_name_, file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return file_size; +} + +IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset, + size_t length) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->InvalidateCache(offset, length); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + static_cast(offset)); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Write(offset, data, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), 
TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Flush(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Sync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, 
s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Fsync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/file_system_tracer.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/file_system_tracer.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,447 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "trace_replay/io_tracer.h" + +namespace ROCKSDB_NAMESPACE { + +// FileSystemTracingWrapper is a wrapper class above FileSystem that forwards +// the call to the underlying storage system. It then invokes IOTracer to record +// file operations and other contextual information in a binary format for +// tracing. It overrides methods we are interested in tracing and extends +// FileSystemWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FileSystemTracingWrapper : public FileSystemWrapper { + public: + FileSystemTracingWrapper(const std::shared_ptr& t, + const std::shared_ptr& io_tracer) + : FileSystemWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()) {} + + ~FileSystemTracingWrapper() override {} + + static const char* kClassName() { return "FileSystemTracing"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& 
dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; +}; + +// The FileSystemPtr is a wrapper class that takes pointer to storage systems +// (such as posix filesystems). It overloads operator -> and returns a pointer +// of either FileSystem or FileSystemTracingWrapper based on whether tracing is +// enabled or not. It is added to bypass FileSystemTracingWrapper when tracing +// is disabled. +class FileSystemPtr { + public: + FileSystemPtr(std::shared_ptr fs, + const std::shared_ptr& io_tracer) + : fs_(fs), io_tracer_(io_tracer) { + fs_tracer_ = std::make_shared(fs_, io_tracer_); + } + + std::shared_ptr operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_; + } else { + return fs_; + } + } + + /* Returns the underlying File System pointer */ + FileSystem* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else { + return fs_.get(); + } + } + + private: + std::shared_ptr fs_; + std::shared_ptr io_tracer_; + std::shared_ptr fs_tracer_; +}; + +// FSSequentialFileTracingWrapper is a wrapper class above FSSequentialFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSSequentialFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSSequentialFileTracingWrapper : public FSSequentialFileOwnerWrapper { + public: + FSSequentialFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSSequentialFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSSequentialFileTracingWrapper() override {} + + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + std::string file_name_; +}; + +// The FSSequentialFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSSequentialFile or FSSequentialFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSSequentialFileTracingWrapper when tracing is disabled. 
+class FSSequentialFilePtr { + public: + FSSequentialFilePtr() = delete; + FSSequentialFilePtr(std::unique_ptr&& fs, + const std::shared_ptr& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSSequentialFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSSequentialFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr io_tracer_; + FSSequentialFileTracingWrapper fs_tracer_; +}; + +// FSRandomAccessFileTracingWrapper is a wrapper class above FSRandomAccessFile +// that forwards the call to the underlying storage system. It then invokes +// IOTracer to record file operations and other contextual information in a +// binary format for tracing. It overrides methods we are interested in tracing +// and extends FSRandomAccessFileWrapper, which forwards all methods that are +// not explicitly overridden. 
+class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper { + public: + FSRandomAccessFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSRandomAccessFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSRandomAccessFileTracingWrapper() override {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSRandomAccessFile or FSRandomAccessFileTracingWrapper +// based on whether tracing is enabled or not. It is added to bypass +// FSRandomAccessFileTracingWrapper when tracing is disabled. 
+class FSRandomAccessFilePtr { + public: + FSRandomAccessFilePtr(std::unique_ptr&& fs, + const std::shared_ptr& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSRandomAccessFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSRandomAccessFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr io_tracer_; + FSRandomAccessFileTracingWrapper fs_tracer_; +}; + +// FSWritableFileTracingWrapper is a wrapper class above FSWritableFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSWritableFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSWritableFileTracingWrapper : public FSWritableFileOwnerWrapper { + public: + FSWritableFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSWritableFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSWritableFileTracingWrapper() override {} + + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* dbg) override { + return Append(data, options, dbg); + } + + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, options, dbg); + } + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus InvalidateCache(size_t offset, size_t length) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSWritableFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSWritableFile or FSWritableFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSWritableFileTracingWrapper when tracing is disabled. 
+class FSWritableFilePtr { + public: + FSWritableFilePtr(std::unique_ptr&& fs, + const std::shared_ptr& io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer) { + fs_tracer_.reset(new FSWritableFileTracingWrapper( + std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */)); + } + + FSWritableFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else { + return fs_tracer_->target(); + } + } + + FSWritableFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return fs_tracer_.get(); + } else if (fs_tracer_) { + return fs_tracer_->target(); + } else { + return nullptr; + } + } + + void reset() { + fs_tracer_.reset(); + io_tracer_ = nullptr; + } + + private: + std::shared_ptr io_tracer_; + std::unique_ptr fs_tracer_; +}; + +// FSRandomRWFileTracingWrapper is a wrapper class above FSRandomRWFile that +// forwards the call to the underlying storage system. It then invokes IOTracer +// to record file operations and other contextual information in a binary format +// for tracing. It overrides methods we are interested in tracing and extends +// FSRandomRWFileWrapper, which forwards all methods that are not explicitly +// overridden. 
+class FSRandomRWFileTracingWrapper : public FSRandomRWFileOwnerWrapper { + public: + FSRandomRWFileTracingWrapper(std::unique_ptr&& t, + std::shared_ptr io_tracer, + const std::string& file_name) + : FSRandomRWFileOwnerWrapper(std::move(t)), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} + + ~FSRandomRWFileTracingWrapper() override {} + + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + private: + std::shared_ptr io_tracer_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; +}; + +// The FSRandomRWFilePtr is a wrapper class that takes pointer to storage +// systems (such as posix filesystems). It overloads operator -> and returns a +// pointer of either FSRandomRWFile or FSRandomRWFileTracingWrapper based on +// whether tracing is enabled or not. It is added to bypass +// FSRandomRWFileTracingWrapper when tracing is disabled. 
+class FSRandomRWFilePtr { + public: + FSRandomRWFilePtr(std::unique_ptr&& fs, + std::shared_ptr io_tracer, + const std::string& file_name) + : io_tracer_(io_tracer), + fs_tracer_(std::move(fs), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} + + FSRandomRWFile* operator->() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + FSRandomRWFile* get() const { + if (io_tracer_ && io_tracer_->is_tracing_enabled()) { + return const_cast(&fs_tracer_); + } else { + return fs_tracer_.target(); + } + } + + private: + std::shared_ptr io_tracer_; + FSRandomRWFileTracingWrapper fs_tracer_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,16 +6,15 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors + +#if !defined(OS_WIN) + #include #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION #include #endif #include #include - -#if defined(OS_LINUX) -#include -#endif #include #include #include @@ -26,13 +25,13 @@ #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include #include #endif #include #include #include #include + #include // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) @@ -47,14 +46,15 @@ #include #include +#include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/utilities/object_registry.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" @@ -73,6 +73,8 @@ #define EXT4_SUPER_MAGIC 0xEF53 #endif +extern "C" bool RocksDbIOUringEnable() __attribute__((__weak__)); + namespace ROCKSDB_NAMESPACE { namespace { @@ -81,9 +83,16 @@ return allow_non_owner_access ? 0644 : 0600; } +static uint64_t gettid() { return Env::Default()->GetThreadID(); } + // list of pathnames that are locked -static std::set lockedFiles; -static port::Mutex mutex_lockedFiles; +// Only used for error message. 
+struct LockHoldingInfo { + int64_t acquire_time; + uint64_t acquiring_thread; +}; +static std::map locked_files; +static port::Mutex mutex_locked_files; static int LockOrUnlock(int fd, bool lock) { errno = 0; @@ -100,8 +109,18 @@ class PosixFileLock : public FileLock { public: - int fd_; + int fd_ = /*invalid*/ -1; std::string filename; + + void Clear() { + fd_ = -1; + filename.clear(); + } + + virtual ~PosixFileLock() override { + // Check for destruction without UnlockFile + assert(fd_ == -1); + } }; int cloexec_flags(int flags, const EnvOptions* options) { @@ -112,6 +131,8 @@ if (options == nullptr || options->set_fd_cloexec) { flags |= O_CLOEXEC; } +#else + (void)options; #endif return flags; } @@ -120,7 +141,9 @@ public: PosixFileSystem(); - const char* Name() const override { return "Posix File System"; } + static const char* kClassName() { return "PosixFileSystem"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } ~PosixFileSystem() override {} @@ -146,6 +169,7 @@ #endif // !ROCKSDB_LITE #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; + TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags); #endif } @@ -178,7 +202,9 @@ errno); } } - result->reset(new PosixSequentialFile(fname, file, fd, options)); + result->reset(new PosixSequentialFile( + fname, file, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd), + options)); return IOStatus::OK(); } @@ -187,7 +213,7 @@ std::unique_ptr* result, IODebugContext* /*dbg*/) override { result->reset(); - IOStatus s; + IOStatus s = IOStatus::OK(); int fd; int flags = cloexec_flags(O_RDONLY, &options); @@ -207,11 +233,12 @@ fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); } while (fd < 0 && errno == EINTR); if (fd < 0) { - return IOError("While open a file for random read", fname, errno); + s = IOError("While open a file for random read", fname, errno); + return 
s; } SetFD_CLOEXEC(fd, &options); - if (options.use_mmap_reads && sizeof(void*) >= 8) { + if (options.use_mmap_reads) { // Use of mmap for random reads has been removed because it // kills performance when storage is fast. // Use mmap when virtual address-space is plentiful. @@ -227,6 +254,8 @@ s = IOError("while mmap file for read", fname, errno); close(fd); } + } else { + close(fd); } } else { if (options.use_direct_reads && !options.use_mmap_reads) { @@ -237,19 +266,20 @@ } #endif } - result->reset(new PosixRandomAccessFile(fname, fd, options + result->reset(new PosixRandomAccessFile( + fname, fd, GetLogicalBlockSizeForReadIfNeeded(options, fname, fd), + options #if defined(ROCKSDB_IOURING_PRESENT) - , - thread_local_io_urings_.get() + , + !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get() #endif - )); + )); } return s; } virtual IOStatus OpenWritableFile(const std::string& fname, - const FileOptions& options, - bool reopen, + const FileOptions& options, bool reopen, std::unique_ptr* result, IODebugContext* /*dbg*/) { result->reset(); @@ -295,14 +325,7 @@ SetFD_CLOEXEC(fd, &options); if (options.use_mmap_writes) { - if (!checkedDiskForMmap_) { - // this will be executed once in the program's lifetime. - // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
- if (!SupportsFastAllocate(fname)) { - forceMmapOff_ = true; - } - checkedDiskForMmap_ = true; - } + MaybeForceDisableMmap(fd); } if (options.use_mmap_writes && !forceMmapOff_) { result->reset(new PosixMmapFile(fname, fd, page_size_, options)); @@ -323,12 +346,18 @@ } } #endif - result->reset(new PosixWritableFile(fname, fd, options)); + result->reset(new PosixWritableFile( + fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), + options)); } else { // disable mmap writes EnvOptions no_mmap_writes_options = options; no_mmap_writes_options.use_mmap_writes = false; - result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); + result->reset( + new PosixWritableFile(fname, fd, + GetLogicalBlockSizeForWriteIfNeeded( + no_mmap_writes_options, fname, fd), + no_mmap_writes_options)); } return s; } @@ -395,14 +424,7 @@ } if (options.use_mmap_writes) { - if (!checkedDiskForMmap_) { - // this will be executed once in the program's lifetime. - // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
- if (!SupportsFastAllocate(fname)) { - forceMmapOff_ = true; - } - checkedDiskForMmap_ = true; - } + MaybeForceDisableMmap(fd); } if (options.use_mmap_writes && !forceMmapOff_) { result->reset(new PosixMmapFile(fname, fd, page_size_, options)); @@ -423,12 +445,18 @@ } } #endif - result->reset(new PosixWritableFile(fname, fd, options)); + result->reset(new PosixWritableFile( + fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd), + options)); } else { // disable mmap writes FileOptions no_mmap_writes_options = options; no_mmap_writes_options.use_mmap_writes = false; - result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); + result->reset( + new PosixWritableFile(fname, fd, + GetLogicalBlockSizeForWriteIfNeeded( + no_mmap_writes_options, fname, fd), + no_mmap_writes_options)); } return s; } @@ -519,10 +547,45 @@ return IOStatus::OK(); } - IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*opts*/, - std::shared_ptr* /*ptr*/, + IOStatus NewLogger(const std::string& fname, const IOOptions& /*opts*/, + std::shared_ptr* result, IODebugContext* /*dbg*/) override { - return IOStatus::NotSupported(); + FILE* f = nullptr; + int fd; + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), + cloexec_flags(O_WRONLY | O_CREAT | O_TRUNC, nullptr), + GetDBFileMode(allow_non_owner_access_)); + if (fd != -1) { + f = fdopen(fd, + "w" +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 7) + "e" // glibc extension to enable O_CLOEXEC +#endif +#endif + ); + } + } + if (fd == -1) { + result->reset(); + return status_to_io_status( + IOError("when open a file for new logger", fname, errno)); + } + if (f == nullptr) { + close(fd); + result->reset(); + return status_to_io_status( + IOError("when fdopen a file for new logger", fname, errno)); + } else { +#ifdef ROCKSDB_FALLOCATE_PRESENT + fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024); +#endif + SetFD_CLOEXEC(fd, nullptr); + result->reset(new PosixLogger(f, &gettid, Env::Default())); + 
return IOStatus::OK(); + } } IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/, @@ -543,7 +606,8 @@ return IOStatus::NotFound(); default: assert(err == EIO || err == ENOMEM); - return IOStatus::IOError("Unexpected error(" + ToString(err) + + return IOStatus::IOError("Unexpected error(" + + ROCKSDB_NAMESPACE::ToString(err) + ") accessing file `" + fname + "' "); } } @@ -552,6 +616,7 @@ std::vector* result, IODebugContext* /*dbg*/) override { result->clear(); + DIR* d = opendir(dir.c_str()); if (d == nullptr) { switch (errno) { @@ -563,11 +628,36 @@ return IOError("While opendir", dir, errno); } } + + // reset errno before calling readdir() + errno = 0; struct dirent* entry; while ((entry = readdir(d)) != nullptr) { - result->push_back(entry->d_name); + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + entry->d_type == DT_DIR && + (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0); + if (!ignore) { + result->push_back(entry->d_name); + } + errno = 0; // reset errno if readdir() success + } + + // always attempt to close the dir + const auto pre_close_errno = errno; // errno may be modified by closedir + const int close_result = closedir(d); + + if (pre_close_errno != 0) { + // error occurred during readdir + return IOError("While readdir", dir, pre_close_errno); + } + + if (close_result != 0) { + // error occurred during closedir + return IOError("While closedir", dir, errno); } - closedir(d); + return IOStatus::OK(); } @@ -582,50 +672,46 @@ IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (mkdir(name.c_str(), 0755) != 0) { - result = IOError("While mkdir", name, errno); + return IOError("While mkdir", name, errno); } - return result; + return IOStatus::OK(); } IOStatus CreateDirIfMissing(const std::string& name, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus 
result; if (mkdir(name.c_str(), 0755) != 0) { if (errno != EEXIST) { - result = IOError("While mkdir if missing", name, errno); + return IOError("While mkdir if missing", name, errno); } else if (!DirExists(name)) { // Check that name is actually a // directory. // Message is taken from mkdir - result = - IOStatus::IOError("`" + name + "' exists but is not a directory"); + return IOStatus::IOError("`" + name + + "' exists but is not a directory"); } } - return result; + return IOStatus::OK(); } IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (rmdir(name.c_str()) != 0) { - result = IOError("file rmdir", name, errno); + return IOError("file rmdir", name, errno); } - return result; + return IOStatus::OK(); } IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/, uint64_t* size, IODebugContext* /*dbg*/) override { - IOStatus s; struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { *size = 0; - s = IOError("while stat a file for size", fname, errno); + return IOError("while stat a file for size", fname, errno); } else { *size = sbuf.st_size; } - return s; + return IOStatus::OK(); } IOStatus GetFileModificationTime(const std::string& fname, @@ -643,24 +729,24 @@ IOStatus RenameFile(const std::string& src, const std::string& target, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (rename(src.c_str(), target.c_str()) != 0) { - result = IOError("While renaming a file to " + target, src, errno); + return IOError("While renaming a file to " + target, src, errno); } - return result; + return IOStatus::OK(); } IOStatus LinkFile(const std::string& src, const std::string& target, const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { - IOStatus result; if (link(src.c_str(), target.c_str()) != 0) { - if (errno == EXDEV) { - return IOStatus::NotSupported("No cross FS links allowed"); + if (errno == EXDEV || errno == ENOTSUP) { + return 
IOStatus::NotSupported(errno == EXDEV + ? "No cross FS links allowed" + : "Links not supported by FS"); } - result = IOError("while link file to " + target, src, errno); + return IOError("while link file to " + target, src, errno); } - return result; + return IOStatus::OK(); } IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/, @@ -697,11 +783,19 @@ IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/, FileLock** lock, IODebugContext* /*dbg*/) override { *lock = nullptr; - IOStatus result; - mutex_lockedFiles.Lock(); - // If it already exists in the lockedFiles set, then it is already locked, - // and fail this lock attempt. Otherwise, insert it into lockedFiles. + LockHoldingInfo lhi; + int64_t current_time = 0; + // Ignore status code as the time is only used for error message. + SystemClock::Default() + ->GetCurrentTime(¤t_time) + .PermitUncheckedError(); + lhi.acquire_time = current_time; + lhi.acquiring_thread = Env::Default()->GetThreadID(); + + mutex_locked_files.Lock(); + // If it already exists in the locked_files set, then it is already locked, + // and fail this lock attempt. Otherwise, insert it into locked_files. // This check is needed because fcntl() does not detect lock conflict // if the fcntl is issued by the same thread that earlier acquired // this lock. @@ -709,12 +803,22 @@ // Otherwise, we will open a new file descriptor. 
Locks are associated with // a process, not a file descriptor and when *any* file descriptor is // closed, all locks the process holds for that *file* are released - if (lockedFiles.insert(fname).second == false) { - mutex_lockedFiles.Unlock(); + const auto it_success = locked_files.insert({fname, lhi}); + if (it_success.second == false) { + LockHoldingInfo prev_info = it_success.first->second; + mutex_locked_files.Unlock(); errno = ENOLCK; - return IOError("lock ", fname, errno); + // Note that the thread ID printed is the same one as the one in + // posix logger, but posix logger prints it hex format. + return IOError( + "lock hold by current process, acquire time " + + ROCKSDB_NAMESPACE::ToString(prev_info.acquire_time) + + " acquiring thread " + + ROCKSDB_NAMESPACE::ToString(prev_info.acquiring_thread), + fname, errno); } + IOStatus result = IOStatus::OK(); int fd; int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr); @@ -725,9 +829,6 @@ if (fd < 0) { result = IOError("while open a file for lock", fname, errno); } else if (LockOrUnlock(fd, true) == -1) { - // if there is an error in locking, then remove the pathname from - // lockedfiles - lockedFiles.erase(fname); result = IOError("While lock file", fname, errno); close(fd); } else { @@ -737,8 +838,14 @@ my_lock->filename = fname; *lock = my_lock; } + if (!result.ok()) { + // If there is an error in locking, then remove the pathname from + // locked_files. (If we got this far, it did not exist in locked_files + // before this call.) + locked_files.erase(fname); + } - mutex_lockedFiles.Unlock(); + mutex_locked_files.Unlock(); return result; } @@ -746,18 +853,19 @@ IODebugContext* /*dbg*/) override { PosixFileLock* my_lock = reinterpret_cast(lock); IOStatus result; - mutex_lockedFiles.Lock(); + mutex_locked_files.Lock(); // If we are unlocking, then verify that we had locked it earlier, - // it should already exist in lockedFiles. Remove it from lockedFiles. 
- if (lockedFiles.erase(my_lock->filename) != 1) { + // it should already exist in locked_files. Remove it from locked_files. + if (locked_files.erase(my_lock->filename) != 1) { errno = ENOLCK; result = IOError("unlock", my_lock->filename, errno); } else if (LockOrUnlock(my_lock->fd_, false) == -1) { result = IOError("unlock", my_lock->filename, errno); } close(my_lock->fd_); + my_lock->Clear(); delete my_lock; - mutex_lockedFiles.Unlock(); + mutex_locked_files.Unlock(); return result; } @@ -772,7 +880,7 @@ char the_path[256]; char* ret = getcwd(the_path, 256); if (ret == nullptr) { - return IOStatus::IOError(strerror(errno)); + return IOStatus::IOError(errnoStr(errno).c_str()); } *output_path = ret; @@ -792,7 +900,7 @@ // Directory may already exist { IOOptions opts; - CreateDir(*result, opts, nullptr); + return CreateDirIfMissing(*result, opts, nullptr); } return IOStatus::OK(); } @@ -806,12 +914,46 @@ return IOError("While doing statvfs", fname, errno); } - *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + // sbuf.bfree is total free space available to root + // sbuf.bavail is total free space available to unprivileged user + // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id + if (geteuid()) { + // non-zero user is unprivileged, or -1 if error. 
take more conservative + // size + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail); + } else { + // root user can access all disk space + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + } return IOStatus::OK(); } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*opts*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + // First open + int fd = -1; + int flags = cloexec_flags(O_RDONLY, nullptr); + { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(path.c_str(), flags); + } + if (fd < 0) { + return IOError("While open for IsDirectory()", path, errno); + } + IOStatus io_s; + struct stat sbuf; + if (fstat(fd, &sbuf) < 0) { + io_s = IOError("While doing stat for IsDirectory()", path, errno); + } + close(fd); + if (io_s.ok() && nullptr != is_dir) { + *is_dir = S_ISDIR(sbuf.st_mode); + } + return io_s; + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { + const DBOptions& db_options) const override { FileOptions optimized = file_options; optimized.use_mmap_writes = false; optimized.use_direct_writes = false; @@ -833,10 +975,17 @@ optimized.fallocate_with_keep_size = true; return optimized; } - +#ifdef OS_LINUX + Status RegisterDbPaths(const std::vector& paths) override { + return logical_block_size_cache_.RefAndCacheLogicalBlockSize(paths); + } + Status UnregisterDbPaths(const std::vector& paths) override { + logical_block_size_cache_.UnrefAndTryRemoveCachedLogicalBlockSize(paths); + return Status::OK(); + } +#endif private: - bool checkedDiskForMmap_; - bool forceMmapOff_; // do we override Env options? + bool forceMmapOff_ = false; // do we override Env options? // Returns true iff the named directory exists and is a directory. 
virtual bool DirExists(const std::string& dname) { @@ -847,10 +996,10 @@ return false; // stat() failed return false } - bool SupportsFastAllocate(const std::string& path) { + bool SupportsFastAllocate(int fd) { #ifdef ROCKSDB_FALLOCATE_PRESENT struct statfs s; - if (statfs(path.c_str(), &s)) { + if (fstatfs(fd, &s)) { return false; } switch (s.f_type) { @@ -864,11 +1013,36 @@ return false; } #else - (void)path; + (void)fd; return false; #endif } + void MaybeForceDisableMmap(int fd) { + static std::once_flag s_check_disk_for_mmap_once; + assert(this == FileSystem::Default().get()); + std::call_once( + s_check_disk_for_mmap_once, + [this](int fdesc) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. + if (!SupportsFastAllocate(fdesc)) { + forceMmapOff_ = true; + } + }, + fd); + } + +#ifdef ROCKSDB_IOURING_PRESENT + bool IsIOUringEnabled() { + if (RocksDbIOUringEnable && RocksDbIOUringEnable()) { + return true; + } else { + return false; + } + } +#endif // ROCKSDB_IOURING_PRESENT + #if defined(ROCKSDB_IOURING_PRESENT) // io_uring instance std::unique_ptr thread_local_io_urings_; @@ -879,11 +1053,50 @@ // If true, allow non owner read access for db files. Otherwise, non-owner // has no access to db files. bool allow_non_owner_access_; + +#ifdef OS_LINUX + static LogicalBlockSizeCache logical_block_size_cache_; +#endif + static size_t GetLogicalBlockSize(const std::string& fname, int fd); + // In non-direct IO mode, this directly returns kDefaultPageSize. + // Otherwise call GetLogicalBlockSize. 
+ static size_t GetLogicalBlockSizeForReadIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); + static size_t GetLogicalBlockSizeForWriteIfNeeded(const EnvOptions& options, + const std::string& fname, + int fd); }; +#ifdef OS_LINUX +LogicalBlockSizeCache PosixFileSystem::logical_block_size_cache_; +#endif + +size_t PosixFileSystem::GetLogicalBlockSize(const std::string& fname, int fd) { +#ifdef OS_LINUX + return logical_block_size_cache_.GetLogicalBlockSize(fname, fd); +#else + (void)fname; + return PosixHelper::GetLogicalBlockSizeOfFd(fd); +#endif +} + +size_t PosixFileSystem::GetLogicalBlockSizeForReadIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_reads + ? PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + +size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded( + const EnvOptions& options, const std::string& fname, int fd) { + return options.use_direct_writes + ? PosixFileSystem::GetLogicalBlockSize(fname, fd) + : kDefaultPageSize; +} + PosixFileSystem::PosixFileSystem() - : checkedDiskForMmap_(false), - forceMmapOff_(false), + : forceMmapOff_(false), page_size_(getpagesize()), allow_non_owner_access_(true) { #if defined(ROCKSDB_IOURING_PRESENT) @@ -910,4 +1123,17 @@ return default_fs_ptr; } +#ifndef ROCKSDB_LITE +static FactoryFunc posix_filesystem_reg = + ObjectLibrary::Default()->AddFactory( + ObjectLibrary::PatternEntry("posix").AddSeparator("://", false), + [](const std::string& /* uri */, std::unique_ptr* f, + std::string* /* errmsg */) { + f->reset(new PosixFileSystem()); + return f->get(); + }); +#endif + } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_readonly.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_readonly.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_readonly.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_readonly.h 2025-05-19 
16:14:27.000000000 +0000 @@ -0,0 +1,107 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// A FileSystem wrapper that only allows read-only operation. +// +// This class has not been fully analyzed for providing strong security +// guarantees. +class ReadOnlyFileSystem : public FileSystemWrapper { + static inline IOStatus FailReadOnly() { + IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem"); + assert(s.GetRetryable() == false); + return s; + } + + public: + explicit ReadOnlyFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + static const char* kClassName() { return "ReadOnlyFileSystem"; } + const char* Name() const override { return kClassName(); } + + IOStatus NewWritableFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus ReuseWritableFile(const std::string& /*fname*/, + const std::string& /*old_fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewDirectory(const std::string& /*dir*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus DeleteFile(const std::string& /*fname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus 
CreateDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override { + // Allow if dir already exists + bool is_dir = false; + IOStatus s = IsDirectory(dirname, options, &is_dir, dbg); + if (s.ok() && is_dir) { + return s; + } else { + return FailReadOnly(); + } + } + IOStatus DeleteDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/, + FileLock** /*lock*/, IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/, + std::shared_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,306 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "env/fs_remap.h" + +namespace ROCKSDB_NAMESPACE { + +RemapFileSystem::RemapFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + +std::pair RemapFileSystem::EncodePathWithNewBasename( + const std::string& path) { + // No difference by default + return EncodePath(path); +} + +Status RemapFileSystem::RegisterDbPaths(const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::RegisterDbPaths(encoded_paths); +} + +Status RemapFileSystem::UnregisterDbPaths( + const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::UnregisterDbPaths(encoded_paths); +} + +IOStatus RemapFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second, + options, 
result, dbg); +} + +IOStatus RemapFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + auto status_and_old_enc_path = EncodePath(old_fname); + if (!status_and_old_enc_path.first.ok()) { + return status_and_old_enc_path.first; + } + return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second, + status_and_old_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::NewDirectory(const std::string& dir, + const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewDirectory(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if 
(!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::FileExists(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::GetChildren(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildren(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildrenFileAttributes( + status_and_enc_path.second, options, result, dbg); +} + +IOStatus RemapFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second, + options, dbg); +} + +IOStatus 
RemapFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options, + file_size, dbg); +} + +IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second, + options, file_mtime, dbg); +} + +IOStatus RemapFileSystem::IsDirectory(const std::string& path, + const IOOptions& options, bool* is_dir, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options, + is_dir, dbg); +} + +IOStatus RemapFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::RenameFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, 
options, + dbg); +} + +IOStatus RemapFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::LinkFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + // FileLock subclasses may store path (e.g., PosixFileLock stores it). We + // can skip stripping the chroot directory from this path because callers + // shouldn't use it. 
+ return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock, + dbg); +} + +IOStatus RemapFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewLogger(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(db_path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options, + output_path, dbg); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/fs_remap.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/fs_remap.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,139 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// An abstract FileSystem wrapper that creates a view of an existing +// FileSystem by remapping names in some way. +// +// This class has not been fully analyzed for providing strong security +// guarantees. 
+class RemapFileSystem : public FileSystemWrapper { + public: + explicit RemapFileSystem(const std::shared_ptr& base); + + protected: + // Returns status and mapped-to path in the wrapped filesystem. + // If it returns non-OK status, the returned path should not be used. + virtual std::pair EncodePath( + const std::string& path) = 0; + + // Similar to EncodePath() except used in cases in which it is OK for + // no file or directory on 'path' to already exist, such as if the + // operation would create one. However, the parent of 'path' is expected + // to exist for the operation to succeed. + // Default implementation: call EncodePath + virtual std::pair EncodePathWithNewBasename( + const std::string& path); + + public: + // Left abstract: + // const char* Name() const override { ... } + static const char* kClassName() { return "RemapFileSystem"; } + bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return FileSystemWrapper::IsInstanceOf(id); + } + } + + Status RegisterDbPaths(const std::vector& paths) override; + + Status UnregisterDbPaths(const std::vector& paths) override; + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& dir, const IOOptions& options, + 
std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LinkFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + + IOStatus NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) override; + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; +}; + +} // namespace 
ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -27,11 +27,11 @@ #include #ifdef OS_LINUX #include -#include #include #endif #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/slice.h" #include "test_util/sync_point.h" #include "util/autovector.h" @@ -45,6 +45,35 @@ namespace ROCKSDB_NAMESPACE { +std::string IOErrorMsg(const std::string& context, + const std::string& file_name) { + if (file_name.empty()) { + return context; + } + return context + ": " + file_name; +} + +// file_name can be left empty if it is not unkown. +IOStatus IOError(const std::string& context, const std::string& file_name, + int err_number) { + switch (err_number) { + case ENOSPC: { + IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + s.SetRetryable(true); + return s; + } + case ESTALE: + return IOStatus::IOError(IOStatus::kStaleFile); + case ENOENT: + return IOStatus::PathNotFound(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + default: + return IOStatus::IOError(IOErrorMsg(context, file_name), + errnoStr(err_number).c_str()); + } +} + // A wrapper for fadvise, if the platform doesn't support fadvise, // it will simply return 0. int Fadvise(int fd, off_t offset, size_t len, int advice) { @@ -112,75 +141,6 @@ return true; } -size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { -#ifdef OS_LINUX - struct stat buf; - int result = fstat(fd, &buf); - if (result == -1) { - return kDefaultPageSize; - } - if (major(buf.st_dev) == 0) { - // Unnamed devices (e.g. non-device mounts), reserved as null device number. 
- // These don't have an entry in /sys/dev/block/. Return a sensible default. - return kDefaultPageSize; - } - - // Reading queue/logical_block_size does not require special permissions. - const int kBufferSize = 100; - char path[kBufferSize]; - char real_path[PATH_MAX + 1]; - snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), - minor(buf.st_dev)); - if (realpath(path, real_path) == nullptr) { - return kDefaultPageSize; - } - std::string device_dir(real_path); - if (!device_dir.empty() && device_dir.back() == '/') { - device_dir.pop_back(); - } - // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda - // and nvme0n1 have it. - // $ ls -al '/sys/dev/block/8:3' - // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> - // ../../block/sda/sda3 - // $ ls -al '/sys/dev/block/259:4' - // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> - // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 - size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); - if (parent_end == std::string::npos) { - return kDefaultPageSize; - } - size_t parent_begin = device_dir.rfind('/', parent_end - 1); - if (parent_begin == std::string::npos) { - return kDefaultPageSize; - } - std::string parent = - device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); - std::string child = device_dir.substr(parent_end + 1, std::string::npos); - if (parent != "block" && - (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { - device_dir = device_dir.substr(0, parent_end); - } - std::string fname = device_dir + "/queue/logical_block_size"; - FILE* fp; - size_t size = 0; - fp = fopen(fname.c_str(), "r"); - if (fp != nullptr) { - char* line = nullptr; - size_t len = 0; - if (getline(&line, &len, fp) != -1) { - sscanf(line, "%zu", &size); - } - free(line); - fclose(fp); - } - if (size != 0 && (size & (size - 1)) == 0) { - return size; - } -#endif - return kDefaultPageSize; 
-} - #ifdef ROCKSDB_RANGESYNC_PRESENT #if !defined(ZFS_SUPER_MAGIC) @@ -190,11 +150,11 @@ #endif bool IsSyncFileRangeSupported(int fd) { - // The approach taken in this function is to build a blacklist of cases where - // we know `sync_file_range` definitely will not work properly despite passing - // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or - // if any of the checks fail in unexpected ways, we allow `sync_file_range` to - // be used. This way should minimize risk of impacting existing use cases. + // This function tracks and checks for cases where we know `sync_file_range` + // definitely will not work properly despite passing the compile-time check + // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks + // fail in unexpected ways, we allow `sync_file_range` to be used. This way + // should minimize risk of impacting existing use cases. struct statfs buf; int ret = fstatfs(fd, &buf); assert(ret == 0); @@ -216,7 +176,7 @@ // ("Function not implemented"). return false; } - // None of the cases on the blacklist matched, so allow `sync_file_range` use. + // None of the known cases matched, so allow `sync_file_range` use. 
return true; } @@ -229,30 +189,31 @@ /* * DirectIOHelper */ -#ifndef NDEBUG namespace { bool IsSectorAligned(const size_t off, size_t sector_size) { - return off % sector_size == 0; + assert((sector_size & (sector_size - 1)) == 0); + return (off & (sector_size - 1)) == 0; } +#ifndef NDEBUG bool IsSectorAligned(const void* ptr, size_t sector_size) { return uintptr_t(ptr) % sector_size == 0; } - -} // namespace #endif +} // namespace /* * PosixSequentialFile */ PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file, - int fd, const EnvOptions& options) + int fd, size_t logical_block_size, + const EnvOptions& options) : filename_(fname), file_(file), fd_(fd), use_direct_io_(options.use_direct_reads), - logical_sector_size_(GetLogicalBufferSize(fd_)) { + logical_sector_size_(logical_block_size) { assert(!options.use_direct_reads || !options.use_mmap_reads); } @@ -273,6 +234,7 @@ IOStatus s; size_t r = 0; do { + clearerr(file_); r = fread_unlocked(scratch, 1, n, file_); } while (r == 0 && ferror(file_) && errno == EINTR); *result = Slice(scratch, r); @@ -314,7 +276,7 @@ ptr += r; offset += r; left -= r; - if (r % static_cast(GetRequiredBufferAlignment()) != 0) { + if (!IsSectorAligned(r, GetRequiredBufferAlignment())) { // Bytes reads don't fill sectors. Should only happen at the end // of the file. 
break; @@ -409,13 +371,178 @@ return static_cast(rid - id); } #endif + +#ifdef OS_LINUX +std::string RemoveTrailingSlash(const std::string& path) { + std::string p = path; + if (p.size() > 1 && p.back() == '/') { + p.pop_back(); + } + return p; +} + +Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize( + const std::vector& directories) { + std::vector dirs; + dirs.reserve(directories.size()); + for (auto& d : directories) { + dirs.emplace_back(RemoveTrailingSlash(d)); + } + + std::map dir_sizes; + { + ReadLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + if (cache_.find(dir) == cache_.end()) { + dir_sizes.emplace(dir, 0); + } + } + } + + Status s; + for (auto& dir_size : dir_sizes) { + s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second); + if (!s.ok()) { + return s; + } + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto& v = cache_[dir]; + v.ref++; + auto dir_size = dir_sizes.find(dir); + if (dir_size != dir_sizes.end()) { + v.size = dir_size->second; + } + } + return s; +} + +void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize( + const std::vector& directories) { + std::vector dirs; + dirs.reserve(directories.size()); + for (auto& dir : directories) { + dirs.emplace_back(RemoveTrailingSlash(dir)); + } + + WriteLock lock(&cache_mutex_); + for (const auto& dir : dirs) { + auto it = cache_.find(dir); + if (it != cache_.end() && !(--(it->second.ref))) { + cache_.erase(it); + } + } +} + +size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname, + int fd) { + std::string dir = fname.substr(0, fname.find_last_of("/")); + if (dir.empty()) { + dir = "/"; + } + { + ReadLock lock(&cache_mutex_); + auto it = cache_.find(dir); + if (it != cache_.end()) { + return it->second.size; + } + } + return get_logical_block_size_of_fd_(fd); +} +#endif + +Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory, + size_t* size) { + int fd = open(directory.c_str(), 
O_DIRECTORY | O_RDONLY); + if (fd == -1) { + close(fd); + return Status::IOError("Cannot open directory " + directory); + } + *size = PosixHelper::GetLogicalBlockSizeOfFd(fd); + close(fd); + return Status::OK(); +} + +size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) { +#ifdef OS_LINUX + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return kDefaultPageSize; + } + if (major(buf.st_dev) == 0) { + // Unnamed devices (e.g. non-device mounts), reserved as null device number. + // These don't have an entry in /sys/dev/block/. Return a sensible default. + return kDefaultPageSize; + } + + // Reading queue/logical_block_size does not require special permissions. + const int kBufferSize = 100; + char path[kBufferSize]; + char real_path[PATH_MAX + 1]; + snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), + minor(buf.st_dev)); + if (realpath(path, real_path) == nullptr) { + return kDefaultPageSize; + } + std::string device_dir(real_path); + if (!device_dir.empty() && device_dir.back() == '/') { + device_dir.pop_back(); + } + // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda + // and nvme0n1 have it. + // $ ls -al '/sys/dev/block/8:3' + // lrwxrwxrwx. 
1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> + // ../../block/sda/sda3 + // $ ls -al '/sys/dev/block/259:4' + // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> + // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 + size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); + if (parent_end == std::string::npos) { + return kDefaultPageSize; + } + size_t parent_begin = device_dir.rfind('/', parent_end - 1); + if (parent_begin == std::string::npos) { + return kDefaultPageSize; + } + std::string parent = + device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); + std::string child = device_dir.substr(parent_end + 1, std::string::npos); + if (parent != "block" && + (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { + device_dir = device_dir.substr(0, parent_end); + } + std::string fname = device_dir + "/queue/logical_block_size"; + FILE* fp; + size_t size = 0; + fp = fopen(fname.c_str(), "r"); + if (fp != nullptr) { + char* line = nullptr; + size_t len = 0; + if (getline(&line, &len, fp) != -1) { + sscanf(line, "%zu", &size); + } + free(line); + fclose(fp); + } + if (size != 0 && (size & (size - 1)) == 0) { + return size; + } +#endif + (void)fd; + return kDefaultPageSize; +} + /* * PosixRandomAccessFile * * pread() based random-access */ PosixRandomAccessFile::PosixRandomAccessFile( - const std::string& fname, int fd, const EnvOptions& options + const std::string& fname, int fd, size_t logical_block_size, + const EnvOptions& options #if defined(ROCKSDB_IOURING_PRESENT) , ThreadLocalPtr* thread_local_io_urings @@ -424,14 +551,14 @@ : filename_(fname), fd_(fd), use_direct_io_(options.use_direct_reads), - logical_sector_size_(GetLogicalBufferSize(fd_)) + logical_sector_size_(logical_block_size) #if defined(ROCKSDB_IOURING_PRESENT) , thread_local_io_urings_(thread_local_io_urings) #endif { assert(!options.use_direct_reads || !options.use_mmap_reads); - 
assert(!options.use_mmap_reads || sizeof(void*) < 8); + assert(!options.use_mmap_reads); } PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } @@ -481,6 +608,14 @@ size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { + if (use_direct_io()) { + for (size_t i = 0; i < num_reqs; i++) { + assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment())); + assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment())); + } + } + #if defined(ROCKSDB_IOURING_PRESENT) struct io_uring* iu = nullptr; if (thread_local_io_urings_) { @@ -499,6 +634,8 @@ return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); } + IOStatus ios = IOStatus::OK(); + struct WrappedReadRequest { FSReadRequest* req; struct iovec iov; @@ -508,6 +645,7 @@ autovector req_wraps; autovector incomplete_rq_list; + std::unordered_set wrap_cache; for (size_t i = 0; i < num_reqs; i++) { req_wraps.emplace_back(&reqs[i]); @@ -540,26 +678,71 @@ sqe, fd_, &rep_to_submit->iov, 1, rep_to_submit->req->offset + rep_to_submit->finished_len); io_uring_sqe_set_data(sqe, rep_to_submit); + wrap_cache.emplace(rep_to_submit); } incomplete_rq_list.clear(); ssize_t ret = io_uring_submit_and_wait(iu, static_cast(this_reqs)); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + &ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + iu); + if (static_cast(ret) != this_reqs) { fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); + // If error happens and we submitted fewer than expected, it is an + // exception case and we don't retry here. We should still consume + // what is is submitted in the ring. 
+ for (ssize_t i = 0; i < ret; i++) { + struct io_uring_cqe* cqe = nullptr; + io_uring_wait_cqe(iu, &cqe); + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + } + return IOStatus::IOError("io_uring_submit_and_wait() requested " + + ToString(this_reqs) + " but returned " + + ToString(ret)); } - assert(static_cast(ret) == this_reqs); for (size_t i = 0; i < this_reqs; i++) { - struct io_uring_cqe* cqe; + struct io_uring_cqe* cqe = nullptr; WrappedReadRequest* req_wrap; // We could use the peek variant here, but this seems safer in terms // of our initial wait not reaping all completions ret = io_uring_wait_cqe(iu, &cqe); - assert(!ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret); + if (ret) { + ios = IOStatus::IOError("io_uring_wait_cqe() returns " + ToString(ret)); + + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + continue; + } req_wrap = static_cast(io_uring_cqe_get_data(cqe)); + // Reset cqe data to catch any stray reuse of it + static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + // Check that we got a valid unique cqe data + auto wrap_check = wrap_cache.find(req_wrap); + if (wrap_check == wrap_cache.end()) { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "Bad cqe data from IO uring - %p\n", + req_wrap); + port::PrintStack(); + ios = IOStatus::IOError("io_uring_cqe_get_data() returned " + + ToString((uint64_t)req_wrap)); + continue; + } + wrap_cache.erase(wrap_check); + FSReadRequest* req = req_wrap->req; if (cqe->res < 0) { req->result = Slice(req->scratch, 0); @@ -576,13 +759,22 @@ // comment // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435 // Fall back to pread in this case. 
- Slice tmp_slice; - req->status = - Read(req->offset + req_wrap->finished_len, - req->len - req_wrap->finished_len, options, &tmp_slice, - req->scratch + req_wrap->finished_len, dbg); - req->result = - Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + if (use_direct_io() && + !IsSectorAligned(req_wrap->finished_len, + GetRequiredBufferAlignment())) { + // Bytes reads don't fill sectors. Should only happen at the end + // of the file. + req->result = Slice(req->scratch, req_wrap->finished_len); + req->status = IOStatus::OK(); + } else { + Slice tmp_slice; + req->status = + Read(req->offset + req_wrap->finished_len, + req->len - req_wrap->finished_len, options, &tmp_slice, + req->scratch + req_wrap->finished_len, dbg); + req->result = + Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + } } else if (bytes_read < req_wrap->iov.iov_len) { assert(bytes_read > 0); assert(bytes_read + req_wrap->finished_len < req->len); @@ -596,8 +788,9 @@ } io_uring_cqe_seen(iu, cqe); } + wrap_cache.clear(); } - return IOStatus::OK(); + return ios; #else return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); #endif @@ -750,7 +943,7 @@ * knows enough to skip zero suffixes. 
*/ IOStatus PosixMmapFile::UnmapCurrentRegion() { - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); if (base_ != nullptr) { int munmap_status = munmap(base_, limit_ - base_); if (munmap_status != 0) { @@ -773,7 +966,7 @@ IOStatus PosixMmapFile::MapNewRegion() { #ifdef ROCKSDB_FALLOCATE_PRESENT assert(base_ == nullptr); - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); // we can't fallocate with FALLOC_FL_KEEP_SIZE here if (allow_fallocate_) { IOSTATS_TIMER_GUARD(allocate_nanos); @@ -784,17 +977,17 @@ } if (alloc_status != 0) { return IOStatus::IOError("Error allocating space to file : " + filename_ + - "Error : " + strerror(alloc_status)); + "Error : " + errnoStr(alloc_status).c_str()); } } - TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:1"); void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, file_offset_); if (ptr == MAP_FAILED) { return IOStatus::IOError("MMap failed on " + filename_); } - TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:2"); base_ = reinterpret_cast(ptr); limit_ = base_ + map_size_; @@ -815,7 +1008,7 @@ size_t p1 = TruncateToPageBoundary(last_sync_ - base_); size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); last_sync_ = dst_; - TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Msync:0"); if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { return IOError("While msync", filename_, errno); } @@ -846,7 +1039,8 @@ PosixMmapFile::~PosixMmapFile() { if (fd_ >= 0) { - PosixMmapFile::Close(IOOptions(), nullptr); + IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); } } @@ -867,7 +1061,7 @@ if (!s.ok()) { return s; } - 
TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:0"); } size_t n = (left <= avail) ? left : avail; @@ -914,9 +1108,15 @@ IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync mmapped file", filename_, errno); } +#endif // HAVE_FULLFSYNC return Msync(); } @@ -926,9 +1126,15 @@ */ IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync mmaped file", filename_, errno); } +#endif // HAVE_FULLFSYNC return Msync(); } @@ -965,7 +1171,7 @@ IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Allocate:0"); int alloc_status = 0; if (allow_fallocate_) { alloc_status = @@ -988,13 +1194,14 @@ * Use posix write to write data to a file. 
*/ PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, + size_t logical_block_size, const EnvOptions& options) : FSWritableFile(options), filename_(fname), use_direct_io_(options.use_direct_writes), fd_(fd), filesize_(0), - logical_sector_size_(GetLogicalBufferSize(fd_)) { + logical_sector_size_(logical_block_size) { #ifdef ROCKSDB_FALLOCATE_PRESENT allow_fallocate_ = options.allow_fallocate; fallocate_with_keep_size_ = options.fallocate_with_keep_size; @@ -1007,7 +1214,8 @@ PosixWritableFile::~PosixWritableFile() { if (fd_ >= 0) { - PosixWritableFile::Close(IOOptions(), nullptr); + IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr); + s.PermitUncheckedError(); } } @@ -1067,6 +1275,7 @@ size_t block_size; size_t last_allocated_block; GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); if (last_allocated_block > 0) { // trim the extra space preallocated at the end of the file // NOTE(ljin): we probably don't want to surface failure as an IOError, @@ -1123,17 +1332,29 @@ IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } @@ -1186,7 +1407,7 @@ IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= 
static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixWritableFile::Allocate:0"); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { @@ -1249,7 +1470,8 @@ PosixRandomRWFile::~PosixRandomRWFile() { if (fd_ >= 0) { - Close(IOOptions(), nullptr); + IOStatus s = Close(IOOptions(), nullptr); + s.PermitUncheckedError(); } } @@ -1305,17 +1527,29 @@ IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync random read/write file", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync random read/write file", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } @@ -1336,17 +1570,71 @@ /* * PosixDirectory */ +#if !defined(BTRFS_SUPER_MAGIC) +// The magic number for BTRFS is fixed, if it's not defined, define it here +#define BTRFS_SUPER_MAGIC 0x9123683E +#endif +PosixDirectory::PosixDirectory(int fd) : fd_(fd) { + is_btrfs_ = false; +#ifdef OS_LINUX + struct statfs buf; + int ret = fstatfs(fd, &buf); + is_btrfs_ = (ret == 0 && buf.f_type == static_cast( + BTRFS_SUPER_MAGIC)); +#endif +} PosixDirectory::~PosixDirectory() { close(fd_); } -IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/, - IODebugContext* /*dbg*/) { +IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) { + return FsyncWithDirOptions(opts, dbg, DirFsyncOptions()); +} + 
+IOStatus PosixDirectory::FsyncWithDirOptions( + const IOOptions& /*opts*/, IODebugContext* /*dbg*/, + const DirFsyncOptions& dir_fsync_options) { + IOStatus s = IOStatus::OK(); #ifndef OS_AIX + if (is_btrfs_) { + // skip dir fsync for new file creation, which is not needed for btrfs + if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) { + return s; + } + // skip dir fsync for renaming file, only need to sync new file + if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) { + std::string new_name = dir_fsync_options.renamed_new_name; + assert(!new_name.empty()); + int fd; + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(new_name.c_str(), O_RDONLY); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError("While open renaming file", new_name, errno); + } else if (fsync(fd) < 0) { + s = IOError("While fsync renaming file", new_name, errno); + } + if (close(fd) < 0) { + s = IOError("While closing file after fsync", new_name, errno); + } + return s; + } + // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted + } +#ifdef HAVE_FULLFSYNC + // btrfs is a Linux file system, while currently F_FULLFSYNC is available on + // Mac OS. 
+ assert(!is_btrfs_); + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) == -1) { - return IOError("While fsync", "a directory", errno); + s = IOError("While fsync", "a directory", errno); } -#endif - return IOStatus::OK(); +#endif // HAVE_FULLFSYNC +#endif // OS_AIX + return s; } } // namespace ROCKSDB_NAMESPACE #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,11 +14,15 @@ #endif #include #include +#include +#include #include +#include "port/port.h" #include "rocksdb/env.h" -#include "util/thread_local.h" #include "rocksdb/file_system.h" #include "rocksdb/io_status.h" +#include "util/mutexlock.h" +#include "util/thread_local.h" // For non linux platform, the following macros are used only as place // holder. @@ -27,43 +31,96 @@ #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ #define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ -#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ +#define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */ #endif namespace ROCKSDB_NAMESPACE { -static std::string IOErrorMsg(const std::string& context, - const std::string& file_name) { - if (file_name.empty()) { - return context; - } - return context + ": " + file_name; -} - +std::string IOErrorMsg(const std::string& context, + const std::string& file_name); // file_name can be left empty if it is not unkown. 
-static IOStatus IOError(const std::string& context, - const std::string& file_name, int err_number) { - switch (err_number) { - case ENOSPC: { - IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), - strerror(err_number)); - s.SetRetryable(true); - return s; - } - case ESTALE: - return IOStatus::IOError(IOStatus::kStaleFile); - case ENOENT: - return IOStatus::PathNotFound(IOErrorMsg(context, file_name), - strerror(err_number)); - default: - return IOStatus::IOError(IOErrorMsg(context, file_name), - strerror(err_number)); - } -} +IOStatus IOError(const std::string& context, const std::string& file_name, + int err_number); class PosixHelper { public: static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size); + static size_t GetLogicalBlockSizeOfFd(int fd); + static Status GetLogicalBlockSizeOfDirectory(const std::string& directory, + size_t* size); +}; + +#ifdef OS_LINUX +// Files under a specific directory have the same logical block size. +// This class caches the logical block size for the specified directories to +// save the CPU cost of computing the size. +// Safe for concurrent access from multiple threads without any external +// synchronization. +class LogicalBlockSizeCache { + public: + LogicalBlockSizeCache( + std::function get_logical_block_size_of_fd = + PosixHelper::GetLogicalBlockSizeOfFd, + std::function + get_logical_block_size_of_directory = + PosixHelper::GetLogicalBlockSizeOfDirectory) + : get_logical_block_size_of_fd_(get_logical_block_size_of_fd), + get_logical_block_size_of_directory_( + get_logical_block_size_of_directory) {} + + // Takes the following actions: + // 1. Increases reference count of the directories; + // 2. If the directory's logical block size is not cached, + // compute the buffer size and cache the result. + Status RefAndCacheLogicalBlockSize( + const std::vector& directories); + + // Takes the following actions: + // 1. Decreases reference count of the directories; + // 2. 
If the reference count of a directory reaches 0, remove the directory + // from the cache. + void UnrefAndTryRemoveCachedLogicalBlockSize( + const std::vector& directories); + + // Returns the logical block size for the file. + // + // If the file is under a cached directory, return the cached size. + // Otherwise, the size is computed. + size_t GetLogicalBlockSize(const std::string& fname, int fd); + + int GetRefCount(const std::string& dir) { + ReadLock lock(&cache_mutex_); + auto it = cache_.find(dir); + if (it == cache_.end()) { + return 0; + } + return it->second.ref; + } + + size_t Size() const { return cache_.size(); } + + bool Contains(const std::string& dir) { + ReadLock lock(&cache_mutex_); + return cache_.find(dir) != cache_.end(); + } + + private: + struct CacheValue { + CacheValue() : size(0), ref(0) {} + + // Logical block size of the directory. + size_t size; + // Reference count of the directory. + int ref; + }; + + std::function get_logical_block_size_of_fd_; + std::function + get_logical_block_size_of_directory_; + + std::map cache_; + port::RWMutex cache_mutex_; }; +#endif class PosixSequentialFile : public FSSequentialFile { private: @@ -75,6 +132,7 @@ public: PosixSequentialFile(const std::string& fname, FILE* file, int fd, + size_t logical_block_size, const EnvOptions& options); virtual ~PosixSequentialFile(); @@ -123,6 +181,7 @@ public: PosixRandomAccessFile(const std::string& fname, int fd, + size_t logical_block_size, const EnvOptions& options #if defined(ROCKSDB_IOURING_PRESENT) , @@ -172,6 +231,7 @@ public: explicit PosixWritableFile(const std::string& fname, int fd, + size_t logical_block_size, const EnvOptions& options); virtual ~PosixWritableFile(); @@ -182,9 +242,20 @@ virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Append(const Slice& data, const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + const 
DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } virtual IOStatus PositionedAppend(const Slice& data, uint64_t offset, const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus PositionedAppend( + const Slice& data, uint64_t offset, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; @@ -271,6 +342,11 @@ virtual IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Append(const Slice& data, const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } virtual IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; @@ -315,12 +391,17 @@ class PosixDirectory : public FSDirectory { public: - explicit PosixDirectory(int fd) : fd_(fd) {} + explicit PosixDirectory(int fd); ~PosixDirectory(); virtual IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; + virtual IOStatus FsyncWithDirOptions( + const IOOptions&, IODebugContext*, + const DirFsyncOptions& dir_fsync_options) override; + private: int fd_; + bool is_btrfs_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/io_posix_test.cc 
1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/io_posix_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,140 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/testharness.h" + +#ifdef ROCKSDB_LIB_IO_POSIX +#include "env/io_posix.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef OS_LINUX +class LogicalBlockSizeCacheTest : public testing::Test {}; + +// Tests the caching behavior. +TEST_F(LogicalBlockSizeCacheTest, Cache) { + int ncall = 0; + auto get_fd_block_size = [&](int fd) { + ncall++; + return fd; + }; + std::map dir_fds{ + {"/", 0}, + {"/db", 1}, + {"/db1", 2}, + {"/db2", 3}, + }; + auto get_dir_block_size = [&](const std::string& dir, size_t* size) { + ncall++; + *size = dir_fds[dir]; + return Status::OK(); + }; + LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size); + ASSERT_EQ(0, ncall); + ASSERT_EQ(0, cache.Size()); + + ASSERT_EQ(6, cache.GetLogicalBlockSize("/sst", 6)); + ASSERT_EQ(1, ncall); + ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(2, ncall); + ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(3, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/", "/db1/", "/db2"})); + ASSERT_EQ(3, cache.Size()); + ASSERT_TRUE(cache.Contains("/")); + ASSERT_TRUE(cache.Contains("/db1")); + ASSERT_TRUE(cache.Contains("/db2")); + ASSERT_EQ(6, ncall); + // Block size for / is cached. + ASSERT_EQ(0, cache.GetLogicalBlockSize("/sst", 6)); + ASSERT_EQ(6, ncall); + // No cached size for /db. + ASSERT_EQ(7, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(7, ncall); + ASSERT_EQ(8, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(8, ncall); + // Block size for /db1 is cached. 
+ ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst1", 4)); + ASSERT_EQ(8, ncall); + ASSERT_EQ(2, cache.GetLogicalBlockSize("/db1/sst2", 5)); + ASSERT_EQ(8, ncall); + // Block size for /db2 is cached. + ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst1", 6)); + ASSERT_EQ(8, ncall); + ASSERT_EQ(3, cache.GetLogicalBlockSize("/db2/sst2", 7)); + ASSERT_EQ(8, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(4, cache.Size()); + ASSERT_TRUE(cache.Contains("/")); + ASSERT_TRUE(cache.Contains("/db1")); + ASSERT_TRUE(cache.Contains("/db2")); + ASSERT_TRUE(cache.Contains("/db")); + + ASSERT_EQ(9, ncall); + // Block size for /db is cached. + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst1", 7)); + ASSERT_EQ(9, ncall); + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst2", 8)); + ASSERT_EQ(9, ncall); +} + +// Tests the reference counting behavior. +TEST_F(LogicalBlockSizeCacheTest, Ref) { + int ncall = 0; + auto get_fd_block_size = [&](int fd) { + ncall++; + return fd; + }; + std::map dir_fds{ + {"/db", 0}, + }; + auto get_dir_block_size = [&](const std::string& dir, size_t* size) { + ncall++; + *size = dir_fds[dir]; + return Status::OK(); + }; + LogicalBlockSizeCache cache(get_fd_block_size, get_dir_block_size); + + ASSERT_EQ(0, ncall); + + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1)); + ASSERT_EQ(1, ncall); + + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(2, ncall); + ASSERT_EQ(1, cache.GetRefCount("/db")); + // Block size for /db is cached. Ref count = 1. + ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst1", 1)); + ASSERT_EQ(2, ncall); + + // Ref count = 2, but won't recompute the cached buffer size. + ASSERT_OK(cache.RefAndCacheLogicalBlockSize({"/db"})); + ASSERT_EQ(2, cache.GetRefCount("/db")); + ASSERT_EQ(2, ncall); + + // Ref count = 1. + cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"}); + ASSERT_EQ(1, cache.GetRefCount("/db")); + // Block size for /db is still cached. 
+ ASSERT_EQ(0, cache.GetLogicalBlockSize("/db/sst2", 1)); + ASSERT_EQ(2, ncall); + + // Ref count = 0 and cached buffer size for /db is removed. + cache.UnrefAndTryRemoveCachedLogicalBlockSize({"/db"}); + ASSERT_EQ(0, cache.Size()); + ASSERT_EQ(1, cache.GetLogicalBlockSize("/db/sst0", 1)); + ASSERT_EQ(3, ncall); +} +#endif + +} // namespace ROCKSDB_NAMESPACE +#endif + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,28 +8,94 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "env/mock_env.h" + #include #include + +#include "env/emulated_clock.h" +#include "file/filename.h" #include "port/sys_time.h" +#include "rocksdb/file_system.h" +#include "rocksdb/utilities/options_type.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" -#include "util/murmurhash.h" +#include "util/hash.h" #include "util/random.h" #include "util/rate_limiter.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +namespace { +int64_t MaybeCurrentTime(const std::shared_ptr& clock) { + int64_t time = 1337346000; // arbitrary fallback default + clock->GetCurrentTime(&time).PermitUncheckedError(); + return time; +} + +static std::unordered_map time_elapse_type_info = { +#ifndef ROCKSDB_LITE + {"time_elapse_only_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast(addr); + clock->SetTimeElapseOnlySleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& 
/*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast(addr); + *value = clock->IsTimeElapseOnlySleep() ? "true" : "false"; + return Status::OK(); + }, + nullptr}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map mock_sleep_type_info = { +#ifndef ROCKSDB_LITE + {"mock_sleep", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto clock = static_cast(addr); + clock->SetMockSleep(ParseBoolean("", value)); + return Status::OK(); + }, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto clock = static_cast(addr); + *value = clock->IsMockSleepEnabled() ? "true" : "false"; + return Status::OK(); + }, + nullptr}}, +#endif // ROCKSDB_LITE +}; +} // namespace + +EmulatedSystemClock::EmulatedSystemClock( + const std::shared_ptr& base, bool time_elapse_only_sleep) + : SystemClockWrapper(base), + maybe_starting_time_(MaybeCurrentTime(base)), + time_elapse_only_sleep_(time_elapse_only_sleep), + no_slowdown_(time_elapse_only_sleep) { + RegisterOptions("", this, &time_elapse_type_info); + RegisterOptions("", this, &mock_sleep_type_info); +} class MemFile { public: - explicit MemFile(Env* env, const std::string& fn, bool _is_lock_file = false) - : env_(env), + explicit MemFile(SystemClock* clock, const std::string& fn, + bool _is_lock_file = false) + : clock_(clock), fn_(fn), refs_(0), is_lock_file_(_is_lock_file), locked_(false), size_(0), modified_time_(Now()), - rnd_(static_cast( - MurmurHash(fn.data(), static_cast(fn.size()), 0))), + rnd_(Lower32of64(GetSliceNPHash64(fn))), fsynced_bytes_(0) {} // No copying allowed. 
MemFile(const MemFile&) = delete; @@ -77,7 +143,8 @@ uint64_t Size() const { return size_; } - void Truncate(size_t size) { + void Truncate(size_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); if (size < size_) { data_.resize(size); @@ -99,7 +166,17 @@ } } - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, IODebugContext* /*dbg*/) const { + { + IOStatus s; + TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s); + if (!s.ok()) { + // with sync point only + *result = Slice(); + return s; + } + } MutexLock lock(&mutex_); const uint64_t available = Size() - std::min(Size(), offset); size_t offset_ = static_cast(offset); @@ -108,7 +185,7 @@ } if (n == 0) { *result = Slice(); - return Status::OK(); + return IOStatus::OK(); } if (scratch) { memcpy(scratch, &(data_[offset_]), n); @@ -116,10 +193,11 @@ } else { *result = Slice(&(data_[offset_]), n); } - return Status::OK(); + return IOStatus::OK(); } - Status Write(uint64_t offset, const Slice& data) { + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); size_t offset_ = static_cast(offset); if (offset + data.size() > data_.size()) { @@ -128,20 +206,21 @@ data_.replace(offset_, data.size(), data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Append(const Slice& data) { + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); data_.append(data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Fsync() { + IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) { fsynced_bytes_ = size_.load(); - return Status::OK(); + return 
IOStatus::OK(); } uint64_t ModifiedTime() const { return modified_time_; } @@ -149,7 +228,7 @@ private: uint64_t Now() { int64_t unix_time = 0; - auto s = env_->GetCurrentTime(&unix_time); + auto s = clock_->GetCurrentTime(&unix_time); assert(s.ok()); return static_cast(unix_time); } @@ -157,7 +236,7 @@ // Private since only Unref() should be used to delete it. ~MemFile() { assert(refs_ == 0); } - Env* env_; + SystemClock* clock_; const std::string fn_; mutable port::Mutex mutex_; int refs_; @@ -176,111 +255,176 @@ namespace { -class MockSequentialFile : public SequentialFile { +class MockSequentialFile : public FSSequentialFile { public: - explicit MockSequentialFile(MemFile* file) : file_(file), pos_(0) { + explicit MockSequentialFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads), + pos_(0) { file_->Ref(); } ~MockSequentialFile() override { file_->Unref(); } - Status Read(size_t n, Slice* result, char* scratch) override { - Status s = file_->Read(pos_, n, result, scratch); + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + IOStatus s = file_->Read(pos_, n, options, result, + (use_mmap_read_) ? 
nullptr : scratch, dbg); if (s.ok()) { pos_ += result->size(); } return s; } - Status Skip(uint64_t n) override { + bool use_direct_io() const override { return use_direct_io_; } + IOStatus Skip(uint64_t n) override { if (pos_ > file_->Size()) { - return Status::IOError("pos_ > file_->Size()"); + return IOStatus::IOError("pos_ > file_->Size()"); } const uint64_t available = file_->Size() - pos_; if (n > available) { n = available; } pos_ += static_cast(n); - return Status::OK(); + return IOStatus::OK(); } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; size_t pos_; }; -class MockRandomAccessFile : public RandomAccessFile { +class MockRandomAccessFile : public FSRandomAccessFile { public: - explicit MockRandomAccessFile(MemFile* file) : file_(file) { file_->Ref(); } + explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads) { + file_->Ref(); + } ~MockRandomAccessFile() override { file_->Unref(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + bool use_direct_io() const override { return use_direct_io_; } + + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + if (use_mmap_read_) { + return file_->Read(offset, n, options, result, nullptr, dbg); + } else { + return file_->Read(offset, n, options, result, scratch, dbg); + } } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; }; -class MockRandomRWFile : public RandomRWFile { +class MockRandomRWFile : public FSRandomRWFile { public: explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); } ~MockRandomRWFile() override { 
file_->Unref(); } - Status Write(uint64_t offset, const Slice& data) override { - return file_->Write(offset, data); + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return file_->Write(offset, data, options, dbg); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return file_->Read(offset, n, options, result, scratch, dbg); } - Status Close() override { return file_->Fsync(); } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } private: MemFile* file_; }; -class MockWritableFile : public WritableFile { +class MockWritableFile : public FSWritableFile { public: - MockWritableFile(MemFile* file, RateLimiter* rate_limiter) - : file_(file), rate_limiter_(rate_limiter) { + MockWritableFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_writes), + rate_limiter_(opts.rate_limiter) { file_->Ref(); } ~MockWritableFile() override { file_->Unref(); } - Status Append(const Slice& data) override { + bool use_direct_io() const override { return false && use_direct_io_; } + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { size_t bytes_written = 0; while (bytes_written < data.size()) { auto bytes = RequestToken(data.size() - bytes_written); - Status s = 
file_->Append(Slice(data.data() + bytes_written, bytes)); + IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes), + options, dbg); if (!s.ok()) { return s; } bytes_written += bytes; } - return Status::OK(); + return IOStatus::OK(); + } + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t /*offset*/, + const IOOptions& options, + IODebugContext* dbg) override { + assert(use_direct_io_); + return Append(data, options, dbg); } - Status Truncate(uint64_t size) override { - file_->Truncate(static_cast(size)); - return Status::OK(); + + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + file_->Truncate(static_cast(size), options, dbg); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); } - Status Close() override { return file_->Fsync(); } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - uint64_t GetFileSize() override { return file_->Size(); } + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return file_->Size(); + } private: inline size_t RequestToken(size_t bytes) { @@ -293,12 +437,16 @@ } MemFile* file_; + bool use_direct_io_; RateLimiter* rate_limiter_; }; -class MockEnvDirectory : public Directory { +class MockEnvDirectory : public FSDirectory { public: - Status Fsync() override { return Status::OK(); } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } }; class MockEnvFileLock : public FileLock { @@ -313,21 +461,26 @@ class TestMemLogger : public Logger { private: - std::unique_ptr 
file_; + std::unique_ptr file_; std::atomic_size_t log_size_; static const uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; + SystemClock* clock_; + IOOptions options_; + IODebugContext* dbg_; std::atomic flush_pending_; public: - TestMemLogger(std::unique_ptr f, Env* env, + TestMemLogger(std::unique_ptr f, SystemClock* clock, + const IOOptions& options, IODebugContext* dbg, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), file_(std::move(f)), log_size_(0), last_flush_micros_(0), - env_(env), + clock_(clock), + options_(options), + dbg_(dbg), flush_pending_(false) {} ~TestMemLogger() override {} @@ -335,7 +488,7 @@ if (flush_pending_) { flush_pending_ = false; } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } using Logger::Logv; @@ -393,9 +546,11 @@ assert(p <= limit); const size_t write_size = p - base; - file_->Append(Slice(base, write_size)); - flush_pending_ = true; - log_size_ += write_size; + Status s = file_->Append(Slice(base, write_size), options_, dbg_); + if (s.ok()) { + flush_pending_ = true; + log_size_ += write_size; + } uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { @@ -411,151 +566,235 @@ size_t GetLogFileSize() const override { return log_size_; } }; -} // Anonymous namespace +static std::unordered_map mock_fs_type_info = { +#ifndef ROCKSDB_LITE + {"supports_direct_io", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +} // namespace -MockEnv::MockEnv(Env* base_env) : EnvWrapper(base_env), fake_sleep_micros_(0) {} +MockFileSystem::MockFileSystem(const std::shared_ptr& clock, + bool supports_direct_io) + : system_clock_(clock), supports_direct_io_(supports_direct_io) { + clock_ = system_clock_.get(); + RegisterOptions("", &supports_direct_io_, &mock_fs_type_info); +} 
-MockEnv::~MockEnv() { - for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i) { +MockFileSystem::~MockFileSystem() { + for (auto i = file_map_.begin(); i != file_map_.end(); ++i) { i->second->Unref(); } } -// Partial implementation of the Env interface. -Status MockEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +Status MockFileSystem::PrepareOptions(const ConfigOptions& options) { + Status s = FileSystem::PrepareOptions(options); + if (s.ok() && system_clock_ == SystemClock::Default()) { + system_clock_ = options.env->GetSystemClock(); + clock_ = system_clock_.get(); + } + return s; +} + +IOStatus MockFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) { + *output_path = NormalizeMockPath(db_path); + if (output_path->at(0) != '/') { + return IOStatus::NotSupported("GetAbsolutePath"); + } else { + return IOStatus::OK(); + } +} + +std::string MockFileSystem::NormalizeMockPath(const std::string& path) { + std::string p = NormalizePath(path); + if (p.back() == kFilePathSeparator && p.size() > 1) { + p.pop_back(); + } + return p; +} + +// Partial implementation of the FileSystem interface. 
+IOStatus MockFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockSequentialFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockSequentialFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockRandomAccessFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockRandomAccessFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = 
NormalizePath(fname); +IOStatus MockFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& /*file_opts*/, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); } result->reset(new MockRandomRWFile(f)); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) { - auto s = RenameFile(old_fname, fname); +IOStatus MockFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto s = RenameFile(old_fname, fname, IOOptions(), dbg); if (!s.ok()) { return s; + } else { + result->reset(); + return NewWritableFile(fname, options, result, dbg); } - result->reset(); - return NewWritableFile(fname, result, options); } -Status MockEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { DeleteFileInternal(fn); } - MemFile* file = new MemFile(this, fn, false); + MemFile* file = new MemFile(clock_, fn, false); file->Ref(); file_map_[fn] = file; + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + 
result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } +} - result->reset(new MockWritableFile(file, env_options.rate_limiter)); - return Status::OK(); +IOStatus MockFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + MemFile* file = nullptr; + if (file_map_.find(fn) == file_map_.end()) { + file = new MemFile(clock_, fn, false); + // Only take a reference when we create the file objectt + file->Ref(); + file_map_[fn] = file; + } else { + file = file_map_[fn]; + } + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } } -Status MockEnv::NewDirectory(const std::string& /*name*/, - std::unique_ptr* result) { +IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/, + const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { result->reset(new MockEnvDirectory()); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::FileExists(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::FileExists(const std::string& fname, + const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { // File exists - return Status::OK(); + return IOStatus::OK(); } // Now also check if fn exists as a dir for (const auto& iter : file_map_) { const std::string& filename = iter.first; if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' && Slice(filename).starts_with(Slice(fn))) { - return Status::OK(); + return IOStatus::OK(); } } - return Status::NotFound(); + return IOStatus::NotFound(); } -Status MockEnv::GetChildren(const std::string& dir, - 
std::vector* result) { - auto d = NormalizePath(dir); +bool MockFileSystem::GetChildrenInternal(const std::string& dir, + std::vector* result) { + auto d = NormalizeMockPath(dir); bool found_dir = false; - { - MutexLock lock(&mutex_); - result->clear(); - for (const auto& iter : file_map_) { - const std::string& filename = iter.first; - - if (filename == d) { - found_dir = true; - } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && - Slice(filename).starts_with(Slice(d))) { - found_dir = true; - size_t next_slash = filename.find('/', d.size() + 1); - if (next_slash != std::string::npos) { - result->push_back( - filename.substr(d.size() + 1, next_slash - d.size() - 1)); - } else { - result->push_back(filename.substr(d.size() + 1)); - } + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename == d) { + found_dir = true; + } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + found_dir = true; + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back( + filename.substr(d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); } } } result->erase(std::unique(result->begin(), result->end()), result->end()); - return found_dir ? Status::OK() : Status::NotFound(); + return found_dir; } -void MockEnv::DeleteFileInternal(const std::string& fname) { - assert(fname == NormalizePath(fname)); +IOStatus MockFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + bool found_dir = GetChildrenInternal(dir, result); + return found_dir ? 
IOStatus::OK() : IOStatus::NotFound(dir); +} + +void MockFileSystem::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizeMockPath(fname)); const auto& pair = file_map_.find(fname); if (pair != file_map_.end()) { pair->second->Unref(); @@ -563,180 +802,222 @@ } } -Status MockEnv::DeleteFile(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } DeleteFileInternal(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::Truncate(const std::string& fname, size_t size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& options, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } - iter->second->Truncate(size); - return Status::OK(); + iter->second->Truncate(size, options, dbg); + return IOStatus::OK(); } -Status MockEnv::CreateDir(const std::string& dirname) { - auto dn = NormalizePath(dirname); +IOStatus MockFileSystem::CreateDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dn = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); if (file_map_.find(dn) == file_map_.end()) { - MemFile* file = new MemFile(this, dn, false); + MemFile* file = new MemFile(clock_, dn, false); file->Ref(); file_map_[dn] = file; } else { - return Status::IOError(); + return IOStatus::IOError(); } - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::CreateDirIfMissing(const std::string& dirname) { - 
CreateDir(dirname); - return Status::OK(); +IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + CreateDir(dirname, options, dbg).PermitUncheckedError(); + return IOStatus::OK(); } -Status MockEnv::DeleteDir(const std::string& dirname) { - return DeleteFile(dirname); +IOStatus MockFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dir = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dir) == file_map_.end()) { + return IOStatus::PathNotFound(dir); + } else { + std::vector children; + if (GetChildrenInternal(dir, &children)) { + for (const auto& child : children) { + DeleteFileInternal(child); + } + } + DeleteFileInternal(dir); + return IOStatus::OK(); + } } -Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_size, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *file_size = iter->second->Size(); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::GetFileModificationTime(const std::string& fname, - uint64_t* time) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* time, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *time = iter->second->ModifiedTime(); - return Status::OK(); + return IOStatus::OK(); } -Status 
MockEnv::RenameFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); - MutexLock lock(&mutex_); - if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); +bool MockFileSystem::RenameFileInternal(const std::string& src, + const std::string& dest) { + if (file_map_.find(src) == file_map_.end()) { + return false; + } else { + std::vector children; + if (GetChildrenInternal(src, &children)) { + for (const auto& child : children) { + RenameFileInternal(src + "/" + child, dest + "/" + child); + } + } + DeleteFileInternal(dest); + file_map_[dest] = file_map_[src]; + file_map_.erase(src); + return true; } +} - DeleteFileInternal(t); - file_map_[t] = file_map_[s]; - file_map_.erase(s); - return Status::OK(); +IOStatus MockFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); + MutexLock lock(&mutex_); + bool found = RenameFileInternal(s, t); + if (!found) { + return IOStatus::PathNotFound(s); + } else { + return IOStatus::OK(); + } } -Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); +IOStatus MockFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); MutexLock lock(&mutex_); if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); + return IOStatus::PathNotFound(s); } DeleteFileInternal(t); file_map_[t] = file_map_[s]; file_map_[t]->Ref(); // Otherwise it might get deleted when noone uses s - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - auto fn = NormalizePath(fname); 
+IOStatus MockFileSystem::NewLogger(const std::string& fname, + const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); MemFile* file = nullptr; if (iter == file_map_.end()) { - file = new MemFile(this, fn, false); + file = new MemFile(clock_, fn, false); file->Ref(); file_map_[fn] = file; } else { file = iter->second; } - std::unique_ptr f(new MockWritableFile(file, nullptr)); - result->reset(new TestMemLogger(std::move(f), this)); - return Status::OK(); + std::unique_ptr f(new MockWritableFile(file, FileOptions())); + result->reset(new TestMemLogger(std::move(f), clock_, io_opts, dbg)); + return IOStatus::OK(); } -Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::LockFile(const std::string& fname, + const IOOptions& /*options*/, + FileLock** flock, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fname, "Not a lock file."); + return IOStatus::InvalidArgument(fname, "Not a lock file."); } if (!file_map_[fn]->Lock()) { - return Status::IOError(fn, "Lock is already held."); + return IOStatus::IOError(fn, "lock is already held."); } } else { - auto* file = new MemFile(this, fn, true); + auto* file = new MemFile(clock_, fn, true); file->Ref(); file->Lock(); file_map_[fn] = file; } } *flock = new MockEnvFileLock(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::UnlockFile(FileLock* flock) { - std::string fn = - static_cast_with_check(flock)->FileName(); +IOStatus MockFileSystem::UnlockFile(FileLock* flock, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + std::string fn = static_cast_with_check(flock)->FileName(); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { 
if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fn, "Not a lock file."); + return IOStatus::InvalidArgument(fn, "Not a lock file."); } file_map_[fn]->Unlock(); } } delete flock; - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::GetTestDirectory(std::string* path) { +IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/, + std::string* path, + IODebugContext* /*dbg*/) { *path = "/test"; - return Status::OK(); -} - -Status MockEnv::GetCurrentTime(int64_t* unix_time) { - auto s = EnvWrapper::GetCurrentTime(unix_time); - if (s.ok()) { - *unix_time += fake_sleep_micros_.load() / (1000 * 1000); - } - return s; -} - -uint64_t MockEnv::NowMicros() { - return EnvWrapper::NowMicros() + fake_sleep_micros_.load(); + return IOStatus::OK(); } -uint64_t MockEnv::NowNanos() { - return EnvWrapper::NowNanos() + fake_sleep_micros_.load() * 1000; -} - -Status MockEnv::CorruptBuffer(const std::string& fname) { - auto fn = NormalizePath(fname); +Status MockFileSystem::CorruptBuffer(const std::string& fname) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { @@ -746,24 +1027,29 @@ return Status::OK(); } -std::string MockEnv::NormalizePath(const std::string path) { - std::string dst; - for (auto c : path) { - if (!dst.empty() && c == '/' && dst.back() == '/') { - continue; - } - dst.push_back(c); - } - return dst; +MockEnv::MockEnv(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& clock) + : CompositeEnvWrapper(env, fs, clock) {} + +MockEnv* MockEnv::Create(Env* env) { + auto clock = + std::make_shared(env->GetSystemClock(), true); + return MockEnv::Create(env, clock); +} + +MockEnv* MockEnv::Create(Env* env, const std::shared_ptr& clock) { + auto fs = std::make_shared(clock); + return new MockEnv(env, fs, clock); } -void MockEnv::FakeSleepForMicroseconds(int64_t micros) { - fake_sleep_micros_.fetch_add(micros); +Status 
MockEnv::CorruptBuffer(const std::string& fname) { + auto mock = static_cast_with_check(GetFileSystem().get()); + return mock->CorruptBuffer(fname); } #ifndef ROCKSDB_LITE // This is to maintain the behavior before swithcing from InMemoryEnv to MockEnv -Env* NewMemEnv(Env* base_env) { return new MockEnv(base_env); } +Env* NewMemEnv(Env* base_env) { return MockEnv::Create(base_env); } #else // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,103 +12,132 @@ #include #include #include + +#include "env/composite_env_wrapper.h" +#include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/status.h" -#include "port/port.h" -#include "util/mutexlock.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { - class MemFile; -class MockEnv : public EnvWrapper { +class MockFileSystem : public FileSystem { public: - explicit MockEnv(Env* base_env); - - virtual ~MockEnv(); - - // Partial implementation of the Env interface. 
- virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override; + explicit MockFileSystem(const std::shared_ptr& clock, + bool supports_direct_io = true); + ~MockFileSystem() override; + + static const char* kClassName() { return "MemoryFileSystem"; } + const char* Name() const override { return kClassName(); } + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override; + IOStatus GetChildren(const std::string& dir, const 
IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. 
+ IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override; + IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& /*options*/, bool* /*is_dir*/, + IODebugContext* /*dgb*/) override { + return IOStatus::NotSupported("IsDirectory"); + } - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) override; - - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - virtual Status FileExists(const std::string& fname) override; - - virtual Status GetChildren(const std::string& dir, - std::vector* result) override; + Status CorruptBuffer(const std::string& fname); + Status PrepareOptions(const ConfigOptions& options) override; + private: + bool RenameFileInternal(const std::string& src, const std::string& dest); void DeleteFileInternal(const std::string& fname); + bool GetChildrenInternal(const std::string& fname, + std::vector* results); - virtual Status DeleteFile(const std::string& fname) override; - - virtual Status Truncate(const std::string& fname, size_t size) override; - - virtual Status CreateDir(const std::string& dirname) override; - - virtual Status CreateDirIfMissing(const std::string& dirname) override; - - virtual Status DeleteDir(const std::string& dirname) override; + std::string NormalizeMockPath(const std::string& path); - virtual Status GetFileSize(const std::string& fname, - uint64_t* file_size) override; - - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* time) override; - - virtual Status RenameFile(const std::string& src, - const std::string& target) override; - - virtual Status LinkFile(const std::string& src, - const std::string& target) override; - - virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; - - virtual Status LockFile(const std::string& fname, FileLock** flock) 
override; - - virtual Status UnlockFile(FileLock* flock) override; + private: + // Map from filenames to MemFile objects, representing a simple file system. + port::Mutex mutex_; + std::map file_map_; // Protected by mutex_. + std::shared_ptr system_clock_; + SystemClock* clock_; + bool supports_direct_io_; +}; - virtual Status GetTestDirectory(std::string* path) override; +class MockEnv : public CompositeEnvWrapper { + public: + static MockEnv* Create(Env* base); + static MockEnv* Create(Env* base, const std::shared_ptr& clock); - // Results of these can be affected by FakeSleepForMicroseconds() - virtual Status GetCurrentTime(int64_t* unix_time) override; - virtual uint64_t NowMicros() override; - virtual uint64_t NowNanos() override; + static const char* kClassName() { return "MockEnv"; } + const char* Name() const override { return kClassName(); } Status CorruptBuffer(const std::string& fname); - - // Doesn't really sleep, just affects output of GetCurrentTime(), NowMicros() - // and NowNanos() - void FakeSleepForMicroseconds(int64_t micros); - private: - std::string NormalizePath(const std::string path); - - // Map from filenames to MemFile objects, representing a simple file system. - typedef std::map FileSystem; - port::Mutex mutex_; - FileSystem file_map_; // Protected by mutex_. 
- - std::atomic fake_sleep_micros_; + MockEnv(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& clock); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/mock_env_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/mock_env_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,9 +19,7 @@ MockEnv* env_; const EnvOptions soptions_; - MockEnvTest() - : env_(new MockEnv(Env::Default())) { - } + MockEnvTest() : env_(MockEnv::Create(Env::Default())) {} ~MockEnvTest() override { delete env_; } }; @@ -68,7 +66,7 @@ int64_t now = 0; auto s = env_->GetCurrentTime(&now); ASSERT_OK(s); - env_->FakeSleepForMicroseconds(3 * 1000 * 1000); + env_->SleepForMicroseconds(3 * 1000 * 1000); int64_t after_sleep = 0; s = env_->GetCurrentTime(&after_sleep); ASSERT_OK(s); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.cc mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,164 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "env/unique_id_gen.h" + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/version.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +struct GenerateRawUniqueIdOpts { + Env* env = Env::Default(); + bool exclude_port_uuid = false; + bool exclude_env_details = false; + bool exclude_random_device = false; +}; + +// Each of these "tracks" below should be sufficient for generating 128 bits +// of entropy, after hashing the raw bytes. The tracks are separable for +// testing purposes, but in production we combine as many tracks as possible +// to ensure quality results even if some environments have degraded +// capabilities or quality in some APIs. +// +// This approach has not been validated for use in cryptography. The goal is +// generating globally unique values with high probability without coordination +// between instances. +// +// Linux performance: EntropyTrackRandomDevice is much faster than +// EntropyTrackEnvDetails, which is much faster than EntropyTrackPortUuid. 
+ +struct EntropyTrackPortUuid { + std::array uuid; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_port_uuid) { + return; + } + std::string s; + port::GenerateRfcUuid(&s); + if (s.size() >= uuid.size()) { + std::copy_n(s.begin(), uuid.size(), uuid.begin()); + } + } +}; + +struct EntropyTrackEnvDetails { + std::array hostname_buf; + int64_t process_id; + uint64_t thread_id; + int64_t unix_time; + uint64_t nano_time; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_env_details) { + return; + } + opts.env->GetHostName(hostname_buf.data(), hostname_buf.size()) + .PermitUncheckedError(); + process_id = port::GetProcessID(); + thread_id = opts.env->GetThreadID(); + opts.env->GetCurrentTime(&unix_time).PermitUncheckedError(); + nano_time = opts.env->NowNanos(); + } +}; + +struct EntropyTrackRandomDevice { + using RandType = std::random_device::result_type; + static constexpr size_t kNumRandVals = + /* generous bits */ 192U / (8U * sizeof(RandType)); + std::array rand_vals; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + if (opts.exclude_random_device) { + return; + } + std::random_device r; + for (auto& val : rand_vals) { + val = r(); + } + } +}; + +struct Entropy { + uint64_t version_identifier; + EntropyTrackRandomDevice et1; + EntropyTrackEnvDetails et2; + EntropyTrackPortUuid et3; + + void Populate(const GenerateRawUniqueIdOpts& opts) { + // If we change the format of what goes into the entropy inputs, it's + // conceivable there could be a physical collision in the hash input + // even though they are logically different. This value should change + // if there's a change to the "schema" here, including byte order. 
+ version_identifier = (uint64_t{ROCKSDB_MAJOR} << 32) + + (uint64_t{ROCKSDB_MINOR} << 16) + + uint64_t{ROCKSDB_PATCH}; + et1.Populate(opts); + et2.Populate(opts); + et3.Populate(opts); + } +}; + +void GenerateRawUniqueIdImpl(uint64_t* a, uint64_t* b, + const GenerateRawUniqueIdOpts& opts) { + Entropy e; + std::memset(&e, 0, sizeof(e)); + e.Populate(opts); + Hash2x64(reinterpret_cast(&e), sizeof(e), a, b); +} + +} // namespace + +void GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid) { + GenerateRawUniqueIdOpts opts; + opts.exclude_port_uuid = exclude_port_uuid; + assert(!opts.exclude_env_details); + assert(!opts.exclude_random_device); + GenerateRawUniqueIdImpl(a, b, opts); +} + +#ifndef NDEBUG +void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, + bool exclude_env_details, + bool exclude_random_device) { + GenerateRawUniqueIdOpts opts; + opts.exclude_port_uuid = exclude_port_uuid; + opts.exclude_env_details = exclude_env_details; + opts.exclude_random_device = exclude_random_device; + GenerateRawUniqueIdImpl(a, b, opts); +} +#endif + +void SemiStructuredUniqueIdGen::Reset() { + saved_process_id_ = port::GetProcessID(); + GenerateRawUniqueId(&base_upper_, &base_lower_); + counter_ = 0; +} + +void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) { + if (port::GetProcessID() == saved_process_id_) { + // Safe to increment the atomic for guaranteed uniqueness within this + // process lifetime. Xor slightly better than +. See + // https://github.com/pdillinger/unique_id + *lower = base_lower_ ^ counter_.fetch_add(1); + *upper = base_upper_; + } else { + // There must have been a fork() or something. Rather than attempting to + // update in a thread-safe way, simply fall back on GenerateRawUniqueId. 
+ GenerateRawUniqueId(upper, lower); + } +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.h mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/env/unique_id_gen.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/env/unique_id_gen.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,71 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// This file is for functions that generate unique identifiers by +// (at least in part) by extracting novel entropy or sources of uniqueness +// from the execution environment. (By contrast, random.h is for algorithmic +// pseudorandomness.) +// +// These functions could eventually migrate to public APIs, such as in Env. + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Generates a new 128-bit identifier that is universally unique +// (with high probability) for each call. The result is split into +// two 64-bit pieces. This function has NOT been validated for use in +// cryptography. +// +// This is used in generating DB session IDs and by Env::GenerateUniqueId +// (used for DB IDENTITY) if the platform does not provide a generator of +// RFC 4122 UUIDs or fails somehow. (Set exclude_port_uuid=true if this +// function is used as a fallback for GenerateRfcUuid, because no need +// trying it again.) 
+void GenerateRawUniqueId(uint64_t* a, uint64_t* b, + bool exclude_port_uuid = false); + +#ifndef NDEBUG +// A version of above with options for challenge testing +void TEST_GenerateRawUniqueId(uint64_t* a, uint64_t* b, bool exclude_port_uuid, + bool exclude_env_details, + bool exclude_random_device); +#endif + +// Generates globally unique ids with lower probability of any collisions +// vs. each unique id being independently random (GenerateRawUniqueId). +// We call this "semi-structured" because between different +// SemiStructuredUniqueIdGen objects, the IDs are separated by random +// intervals (unstructured), but within a single SemiStructuredUniqueIdGen +// object, the generated IDs are trivially related (structured). See +// https://github.com/pdillinger/unique_id for how this improves probability +// of no collision. In short, if we have n SemiStructuredUniqueIdGen +// objects each generating m IDs, the first collision is expected at +// around n = sqrt(2^128 / m), equivalently n * sqrt(m) = 2^64, +// rather than n * m = 2^64 for fully random IDs. +class SemiStructuredUniqueIdGen { + public: + // Initializes with random starting state (from GenerateRawUniqueId) + SemiStructuredUniqueIdGen() { Reset(); } + // Re-initializes, but not thread safe + void Reset(); + + // Assuming no fork(), `lower` is guaranteed unique from one call + // to the next (thread safe). 
+ void GenerateNext(uint64_t* upper, uint64_t* lower); + + private: + uint64_t base_upper_; + uint64_t base_lower_; + std::atomic counter_; + int64_t saved_process_id_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/examples/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,45 @@ +add_executable(simple_example + simple_example.cc) +target_link_libraries(simple_example + ${ROCKSDB_LIB}) + +add_executable(column_families_example + column_families_example.cc) +target_link_libraries(column_families_example + ${ROCKSDB_LIB}) + +add_executable(compact_files_example + compact_files_example.cc) +target_link_libraries(compact_files_example + ${ROCKSDB_LIB}) + +add_executable(c_simple_example + c_simple_example.c) +target_link_libraries(c_simple_example + ${ROCKSDB_LIB}) + +add_executable(optimistic_transaction_example + optimistic_transaction_example.cc) +target_link_libraries(optimistic_transaction_example + ${ROCKSDB_LIB}) + +add_executable(transaction_example + transaction_example.cc) +target_link_libraries(transaction_example + ${ROCKSDB_LIB}) + +add_executable(compaction_filter_example + compaction_filter_example.cc) +target_link_libraries(compaction_filter_example + ${ROCKSDB_LIB}) + +add_executable(options_file_example + options_file_example.cc) +target_link_libraries(options_file_example + ${ROCKSDB_LIB}) + +add_executable(multi_processes_example + EXCLUDE_FROM_ALL + multi_processes_example.cc) +target_link_libraries(multi_processes_example + ${ROCKSDB_LIB}) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/examples/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/Makefile 2025-01-30 11:01:26.000000000 
+0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,8 @@ CXXFLAGS += -fno-rtti endif +CFLAGS += -Wstrict-prototypes + .PHONY: clean librocksdb all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/c_simple_example.c mariadb-10.11.13/storage/rocksdb/rocksdb/examples/c_simple_example.c --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/c_simple_example.c 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/c_simple_example.c 2025-05-19 16:14:27.000000000 +0000 @@ -10,18 +10,35 @@ #include "rocksdb/c.h" +#if defined(OS_WIN) +#include +#else #include // sysconf() - get CPU count +#endif -const char DBPath[] = "/tmp/rocksdb_simple_example"; -const char DBBackupPath[] = "/tmp/rocksdb_simple_example_backup"; +#if defined(OS_WIN) +const char DBPath[] = "C:\\Windows\\TEMP\\rocksdb_c_simple_example"; +const char DBBackupPath[] = + "C:\\Windows\\TEMP\\rocksdb_c_simple_example_backup"; +#else +const char DBPath[] = "/tmp/rocksdb_c_simple_example"; +const char DBBackupPath[] = "/tmp/rocksdb_c_simple_example_backup"; +#endif int main(int argc, char **argv) { rocksdb_t *db; rocksdb_backup_engine_t *be; rocksdb_options_t *options = rocksdb_options_create(); // Optimize RocksDB. This is the easiest way to - // get RocksDB to perform well - long cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores + // get RocksDB to perform well. 
+#if defined(OS_WIN) + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + long cpus = system_info.dwNumberOfProcessors; +#else + long cpus = sysconf(_SC_NPROCESSORS_ONLN); +#endif + // Set # of online cores rocksdb_options_increase_parallelism(options, (int)(cpus)); rocksdb_options_optimize_level_style_compaction(options, 0); // create the DB if it's not already present diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/column_families_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/column_families_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/column_families_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/column_families_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,23 @@ #include "rocksdb/slice.h" #include "rocksdb/options.h" -using namespace ROCKSDB_NAMESPACE; - +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_column_families_example"; +#else std::string kDBPath = "/tmp/rocksdb_column_families_example"; +#endif + +using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; +using ROCKSDB_NAMESPACE::ColumnFamilyHandle; +using ROCKSDB_NAMESPACE::ColumnFamilyOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DBOptions; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteOptions; int main() { // open DB @@ -28,14 +42,15 @@ assert(s.ok()); // close DB - delete cf; + s = db->DestroyColumnFamilyHandle(cf); + assert(s.ok()); delete db; // open DB with two column families std::vector column_families; // have to open default column family column_families.push_back(ColumnFamilyDescriptor( - kDefaultColumnFamilyName, ColumnFamilyOptions())); + ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions())); // open the new one, too 
column_families.push_back(ColumnFamilyDescriptor( "new_cf", ColumnFamilyOptions())); @@ -64,7 +79,8 @@ // close db for (auto handle : handles) { - delete handle; + s = db->DestroyColumnFamilyHandle(handle); + assert(s.ok()); } delete db; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compact_files_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compact_files_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compact_files_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compact_files_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,8 +12,22 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" -using namespace ROCKSDB_NAMESPACE; +using ROCKSDB_NAMESPACE::ColumnFamilyMetaData; +using ROCKSDB_NAMESPACE::CompactionOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::EventListener; +using ROCKSDB_NAMESPACE::FlushJobInfo; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteOptions; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_compact_files_example"; +#else std::string kDBPath = "/tmp/rocksdb_compact_files_example"; +#endif + struct CompactionTask; // This is an example interface of external-compaction algorithm. @@ -136,7 +150,7 @@ Options options; options.create_if_missing = true; // Disable RocksDB background compaction. - options.compaction_style = kCompactionStyleNone; + options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleNone; // Small slowdown and stop trigger for experimental purpose. 
options.level0_slowdown_writes_trigger = 3; options.level0_stop_writes_trigger = 5; @@ -144,7 +158,7 @@ options.listeners.emplace_back(new FullCompactor(options)); DB* db = nullptr; - DestroyDB(kDBPath, options); + ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options); Status s = DB::Open(options, kDBPath, &db); assert(s.ok()); assert(db); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/compaction_filter_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include -#include -#include -#include +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" class MyMerge : public ROCKSDB_NAMESPACE::MergeOperator { public: @@ -54,22 +54,30 @@ mutable int merge_count_ = 0; }; +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksmergetest"; +std::string kRemoveDirCommand = "rmdir /Q /S "; +#else +std::string kDBPath = "/tmp/rocksmergetest"; +std::string kRemoveDirCommand = "rm -rf "; +#endif + int main() { ROCKSDB_NAMESPACE::DB* raw_db; ROCKSDB_NAMESPACE::Status status; MyFilter filter; - int ret = system("rm -rf /tmp/rocksmergetest"); + std::string rm_cmd = kRemoveDirCommand + kDBPath; + int ret = system(rm_cmd.c_str()); if (ret != 0) { - fprintf(stderr, "Error deleting /tmp/rocksmergetest, code: %d\n", ret); - return ret; + fprintf(stderr, "Error deleting %s, code: %d\n", kDBPath.c_str(), ret); } ROCKSDB_NAMESPACE::Options options; options.create_if_missing = true; options.merge_operator.reset(new MyMerge); options.compaction_filter = &filter; - status = 
ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/rocksmergetest", &raw_db); + status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &raw_db); assert(status.ok()); std::unique_ptr db(raw_db); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/multi_processes_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/multi_processes_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/multi_processes_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/multi_processes_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -23,6 +23,8 @@ #include #include +// TODO: port this example to other systems. It should be straightforward for +// POSIX-compliant systems. #if defined(OS_LINUX) #include #include @@ -30,7 +32,6 @@ #include #include #include -#endif // !OS_LINUX #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -136,9 +137,6 @@ static bool ShouldCloseDB() { return true; } -// TODO: port this example to other systems. It should be straightforward for -// POSIX-compliant systems. 
-#if defined(OS_LINUX) void CreateDB() { long my_pid = static_cast(getpid()); Options options; @@ -301,7 +299,7 @@ std::string value; db->Get(ropts, key, &value); } - fprintf(stdout, "[process %ld] Point lookup thread finished\n"); + fprintf(stdout, "[process %ld] Point lookup thread finished\n", my_pid); }); uint64_t curr_key = 0; @@ -389,7 +387,7 @@ } #else // OS_LINUX int main() { - fpritnf(stderr, "Not implemented.\n"); + fprintf(stderr, "Not implemented.\n"); return 0; } #endif // !OS_LINUX diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/optimistic_transaction_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,9 +11,21 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" -using namespace ROCKSDB_NAMESPACE; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::OptimisticTransactionDB; +using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Snapshot; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::Transaction; +using ROCKSDB_NAMESPACE::WriteOptions; +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_transaction_example"; +#else std::string kDBPath = "/tmp/rocksdb_transaction_example"; +#endif int main() { // open DB diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/options_file_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/options_file_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/options_file_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/options_file_example.cc 2025-05-19 
16:14:27.000000000 +0000 @@ -18,9 +18,24 @@ #include "rocksdb/table.h" #include "rocksdb/utilities/options_util.h" -using namespace ROCKSDB_NAMESPACE; - +using ROCKSDB_NAMESPACE::BlockBasedTableOptions; +using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; +using ROCKSDB_NAMESPACE::ColumnFamilyHandle; +using ROCKSDB_NAMESPACE::ColumnFamilyOptions; +using ROCKSDB_NAMESPACE::CompactionFilter; +using ROCKSDB_NAMESPACE::ConfigOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DBOptions; +using ROCKSDB_NAMESPACE::NewLRUCache; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::Status; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_options_file_example"; +#else std::string kDBPath = "/tmp/rocksdb_options_file_example"; +#endif namespace { // A dummy compaction filter @@ -41,7 +56,8 @@ db_opt.create_if_missing = true; std::vector cf_descs; - cf_descs.push_back({kDefaultColumnFamilyName, ColumnFamilyOptions()}); + cf_descs.push_back( + {ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions()}); cf_descs.push_back({"new_cf", ColumnFamilyOptions()}); // initialize BlockBasedTableOptions @@ -59,7 +75,8 @@ // destroy and open DB DB* db; - Status s = DestroyDB(kDBPath, Options(db_opt, cf_descs[0].options)); + Status s = ROCKSDB_NAMESPACE::DestroyDB(kDBPath, + Options(db_opt, cf_descs[0].options)); assert(s.ok()); s = DB::Open(Options(db_opt, cf_descs[0].options), kDBPath, &db); assert(s.ok()); @@ -79,15 +96,17 @@ // Load the options file. 
DBOptions loaded_db_opt; std::vector loaded_cf_descs; - s = LoadLatestOptions(kDBPath, Env::Default(), &loaded_db_opt, + ConfigOptions config_options; + s = LoadLatestOptions(config_options, kDBPath, &loaded_db_opt, &loaded_cf_descs); assert(s.ok()); assert(loaded_db_opt.create_if_missing == db_opt.create_if_missing); // Initialize pointer options for each column family for (size_t i = 0; i < loaded_cf_descs.size(); ++i) { - auto* loaded_bbt_opt = reinterpret_cast( - loaded_cf_descs[0].options.table_factory->GetOptions()); + auto* loaded_bbt_opt = + loaded_cf_descs[0] + .options.table_factory->GetOptions(); // Expect the same as BlockBasedTableOptions will be loaded form file. assert(loaded_bbt_opt->block_size == bbt_opts.block_size); // However, block_cache needs to be manually initialized as documented diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/simple_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/simple_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/simple_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/simple_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,19 @@ #include "rocksdb/slice.h" #include "rocksdb/options.h" -using namespace ROCKSDB_NAMESPACE; - +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::PinnableSlice; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteOptions; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_simple_example"; +#else std::string kDBPath = "/tmp/rocksdb_simple_example"; +#endif int main() { DB* db; @@ -68,7 +78,7 @@ } PinnableSlice pinnable_val; - db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1", &pinnable_val); + s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1", &pinnable_val); assert(s.IsNotFound()); // Reset PinnableSlice after each use and 
before each reuse pinnable_val.Reset(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/examples/transaction_example.cc mariadb-10.11.13/storage/rocksdb/rocksdb/examples/transaction_example.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/examples/transaction_example.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/examples/transaction_example.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,9 +11,21 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" -using namespace ROCKSDB_NAMESPACE; - +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Snapshot; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::Transaction; +using ROCKSDB_NAMESPACE::TransactionDB; +using ROCKSDB_NAMESPACE::TransactionDBOptions; +using ROCKSDB_NAMESPACE::TransactionOptions; +using ROCKSDB_NAMESPACE::WriteOptions; + +#if defined(OS_WIN) +std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_transaction_example"; +#else std::string kDBPath = "/tmp/rocksdb_transaction_example"; +#endif int main() { // open DB @@ -179,7 +191,7 @@ // Cleanup delete txn_db; - DestroyDB(kDBPath, options); + ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options); return 0; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ #include "file/delete_scheduler.h" +#include #include #include @@ -14,17 +15,19 @@ #include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -DeleteScheduler::DeleteScheduler(Env* env, 
FileSystem* fs, +DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs, int64_t rate_bytes_per_sec, Logger* info_log, SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) - : env_(env), + : clock_(clock), fs_(fs), total_trash_size_(0), rate_bytes_per_sec_(rate_bytes_per_sec), @@ -32,13 +35,13 @@ bytes_max_delete_chunk_(bytes_max_delete_chunk), closing_(false), cv_(&mu_), + bg_thread_(nullptr), info_log_(info_log), sst_file_manager_(sst_file_manager), max_trash_db_ratio_(max_trash_db_ratio) { assert(sst_file_manager != nullptr); assert(max_trash_db_ratio >= 0); - bg_thread_.reset( - new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this)); + MaybeCreateBackgroundThread(); } DeleteScheduler::~DeleteScheduler() { @@ -50,47 +53,68 @@ if (bg_thread_) { bg_thread_->join(); } + for (const auto& it : bg_errors_) { + it.second.PermitUncheckedError(); + } } Status DeleteScheduler::DeleteFile(const std::string& file_path, const std::string& dir_to_sync, const bool force_bg) { - Status s; if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && total_trash_size_.load() > sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { // Rate limiting is disabled or trash size makes up more than // max_trash_db_ratio_ (default 25%) of the total DB size TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); - s = fs_->DeleteFile(file_path, IOOptions(), nullptr); + Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr); if (s.ok()) { - sst_file_manager_->OnDeleteFile(file_path); + s = sst_file_manager_->OnDeleteFile(file_path); + ROCKS_LOG_INFO(info_log_, + "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64 + ", total_trash_size %" PRIu64 " max_trash_db_ratio %lf", + file_path.c_str(), rate_bytes_per_sec_.load(), + total_trash_size_.load(), max_trash_db_ratio_.load()); + InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY); } return s; } // Move file to trash std::string 
trash_file; - s = MarkAsTrash(file_path, &trash_file); + Status s = MarkAsTrash(file_path, &trash_file); + ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(), + s.ToString().c_str()); if (!s.ok()) { ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s", file_path.c_str(), s.ToString().c_str()); s = fs_->DeleteFile(file_path, IOOptions(), nullptr); if (s.ok()) { - sst_file_manager_->OnDeleteFile(file_path); + s = sst_file_manager_->OnDeleteFile(file_path); + ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately", + trash_file.c_str()); + InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY); } return s; } // Update the total trash size uint64_t trash_file_size = 0; - fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); - total_trash_size_.fetch_add(trash_file_size); + IOStatus io_s = + fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); + if (io_s.ok()) { + total_trash_size_.fetch_add(trash_file_size); + } + //**TODO: What should we do if we failed to + // get the file size? // Add file to delete queue { InstrumentedMutexLock l(&mu_); + RecordTick(stats_.get(), FILES_MARKED_TRASH); queue_.emplace(trash_file, dir_to_sync); pending_files_++; if (pending_files_ == 1) { @@ -131,7 +155,7 @@ std::string trash_file = path + "/" + current_file; if (sfm) { // We have an SstFileManager that will schedule the file delete - sfm->OnAddFile(trash_file); + s = sfm->OnAddFile(trash_file); file_delete = sfm->ScheduleFileDeletion(trash_file, path); } else { // Delete the file immediately @@ -154,17 +178,17 @@ return Status::InvalidArgument("file_path is corrupted"); } - Status s; if (DeleteScheduler::IsTrashFile(file_path)) { // This is already a trash file *trash_file = file_path; - return s; + return Status::OK(); } *trash_file = file_path + kTrashExtension; // TODO(tec) : Implement Env::RenameFileIfNotExist and remove // file_move_mu mutex. 
int cnt = 0; + Status s; InstrumentedMutexLock l(&file_move_mu_); while (true) { s = fs_->FileExists(*trash_file, IOOptions(), nullptr); @@ -182,7 +206,7 @@ cnt++; } if (s.ok()) { - sst_file_manager_->OnMoveFile(file_path, *trash_file); + s = sst_file_manager_->OnMoveFile(file_path, *trash_file); } return s; } @@ -201,22 +225,24 @@ } // Delete all files in queue_ - uint64_t start_time = env_->NowMicros(); + uint64_t start_time = clock_->NowMicros(); uint64_t total_deleted_bytes = 0; int64_t current_delete_rate = rate_bytes_per_sec_.load(); while (!queue_.empty() && !closing_) { if (current_delete_rate != rate_bytes_per_sec_.load()) { // User changed the delete rate current_delete_rate = rate_bytes_per_sec_.load(); - start_time = env_->NowMicros(); + start_time = clock_->NowMicros(); total_deleted_bytes = 0; + ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64, + current_delete_rate); } // Get new file to delete const FileAndDir& fad = queue_.front(); std::string path_in_trash = fad.fname; - // We dont need to hold the lock while deleting the file + // We don't need to hold the lock while deleting the file mu_.Unlock(); uint64_t deleted_bytes = 0; bool is_complete = true; @@ -233,19 +259,27 @@ bg_errors_[path_in_trash] = s; } - // Apply penlty if necessary - uint64_t total_penlty; + // Apply penalty if necessary + uint64_t total_penalty; if (current_delete_rate > 0) { // rate limiting is enabled - total_penlty = + total_penalty = ((total_deleted_bytes * kMicrosInSecond) / current_delete_rate); - while (!closing_ && !cv_.TimedWait(start_time + total_penlty)) {} + ROCKS_LOG_INFO(info_log_, + "Rate limiting is enabled with penalty %" PRIu64 + " after deleting file %s", + total_penalty, path_in_trash.c_str()); + while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) { + } } else { // rate limiting is disabled - total_penlty = 0; + total_penalty = 0; + ROCKS_LOG_INFO(info_log_, + "Rate limiting is disabled after deleting file %s", + 
path_in_trash.c_str()); } TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait", - &total_penlty); + &total_penalty); if (is_complete) { pending_files_--; @@ -323,14 +357,18 @@ s = fs_->NewDirectory(dir_to_sync, IOOptions(), &dir_obj, nullptr); } if (s.ok()) { - s = dir_obj->Fsync(IOOptions(), nullptr); + s = dir_obj->FsyncWithDirOptions( + IOOptions(), nullptr, + DirFsyncOptions(DirFsyncOptions::FsyncReason::kFileDeleted)); TEST_SYNC_POINT_CALLBACK( "DeleteScheduler::DeleteTrashFile::AfterSyncDir", reinterpret_cast(const_cast(&dir_to_sync))); } } - *deleted_bytes = file_size; - sst_file_manager_->OnDeleteFile(path_in_trash); + if (s.ok()) { + *deleted_bytes = file_size; + s = sst_file_manager_->OnDeleteFile(path_in_trash); + } } } if (!s.ok()) { @@ -352,6 +390,17 @@ } } +void DeleteScheduler::MaybeCreateBackgroundThread() { + if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) { + bg_thread_.reset( + new port::Thread(&DeleteScheduler::BackgroundEmptyTrash, this)); + ROCKS_LOG_INFO(info_log_, + "Created background thread for deletion scheduler with " + "rate_bytes_per_sec: %" PRIi64, + rate_bytes_per_sec_.load()); + } +} + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,26 +15,28 @@ #include "monitoring/instrumented_mutex.h" #include "port/port.h" -#include "rocksdb/file_system.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { class Env; +class FileSystem; class Logger; class SstFileManagerImpl; +class SystemClock; // DeleteScheduler allows the DB to enforce a rate limit on file deletion, // Instead of deleteing files immediately, files are marked as trash -// and deleted 
in a background thread that apply sleep penlty between deletes +// and deleted in a background thread that apply sleep penalty between deletes // if they are happening in a rate faster than rate_bytes_per_sec, // // Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this // case DeleteScheduler will delete files immediately. class DeleteScheduler { public: - DeleteScheduler(Env* env, FileSystem* fs, int64_t rate_bytes_per_sec, - Logger* info_log, SstFileManagerImpl* sst_file_manager, + DeleteScheduler(SystemClock* clock, FileSystem* fs, + int64_t rate_bytes_per_sec, Logger* info_log, + SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~DeleteScheduler(); @@ -45,9 +47,10 @@ // Set delete rate limit in bytes per second void SetRateBytesPerSecond(int64_t bytes_per_sec) { rate_bytes_per_sec_.store(bytes_per_sec); + MaybeCreateBackgroundThread(); } - // Mark file as trash directory and schedule it's deletion. If force_bg is + // Mark file as trash directory and schedule its deletion. 
If force_bg is // set, it forces the file to always be deleted in the background thread, // except when rate limiting is disabled Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, @@ -77,11 +80,16 @@ static const std::string kTrashExtension; static bool IsTrashFile(const std::string& file_path); - // Check if there are any .trash filse in path, and schedule their deletion + // Check if there are any .trash files in path, and schedule their deletion // Or delete immediately if sst_file_manager is nullptr static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm, const std::string& path); + void SetStatisticsPtr(const std::shared_ptr& stats) { + InstrumentedMutexLock l(&mu_); + stats_ = stats; + } + private: Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash); @@ -91,14 +99,16 @@ void BackgroundEmptyTrash(); - Env* env_; + void MaybeCreateBackgroundThread(); + + SystemClock* clock_; FileSystem* fs_; // total size of trash files std::atomic total_trash_size_; // Maximum number of bytes that should be deleted per second std::atomic rate_bytes_per_sec_; - // Mutex to protect queue_, pending_files_, bg_errors_, closing_ + // Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_ InstrumentedMutex mu_; struct FileAndDir { @@ -134,6 +144,7 @@ // immediately std::atomic max_trash_db_ratio_; static const uint64_t kMicrosInSecond = 1000 * 1000LL; + std::shared_ptr stats_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/delete_scheduler_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,18 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root 
directory). +#include "file/delete_scheduler.h" + #include #include #include #include -#include "file/delete_scheduler.h" +#include "file/file_util.h" #include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" -#include "test_util/testutil.h" #include "util/string_util.h" #ifndef ROCKSDB_LITE @@ -32,6 +33,7 @@ ToString(i)); DestroyAndCreateDir(dummy_files_dirs_.back()); } + stats_ = ROCKSDB_NAMESPACE::CreateDBStatistics(); } ~DeleteSchedulerTest() override { @@ -39,12 +41,12 @@ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); for (const auto& dummy_files_dir : dummy_files_dirs_) { - test::DestroyDir(env_, dummy_files_dir); + DestroyDir(env_, dummy_files_dir); } } void DestroyAndCreateDir(const std::string& dir) { - ASSERT_OK(test::DestroyDir(env_, dir)); + ASSERT_OK(DestroyDir(env_, dir)); EXPECT_OK(env_->CreateDir(dir)); } @@ -55,7 +57,7 @@ int normal_cnt = 0; for (auto& f : files_in_dir) { - if (!DeleteScheduler::IsTrashFile(f) && f != "." 
&& f != "..") { + if (!DeleteScheduler::IsTrashFile(f)) { normal_cnt++; } } @@ -85,20 +87,20 @@ std::string data(size, 'A'); EXPECT_OK(f->Append(data)); EXPECT_OK(f->Close()); - sst_file_mgr_->OnAddFile(file_path, false); + sst_file_mgr_->OnAddFile(file_path); return file_path; } void NewDeleteScheduler() { - // Tests in this file are for DeleteScheduler component and dont create any + // Tests in this file are for DeleteScheduler component and don't create any // DBs, so we need to set max_trash_db_ratio to 100% (instead of default // 25%) - std::shared_ptr - fs(std::make_shared(env_)); sst_file_mgr_.reset( - new SstFileManagerImpl(env_, fs, nullptr, rate_bytes_per_sec_, + new SstFileManagerImpl(env_->GetSystemClock(), env_->GetFileSystem(), + nullptr, rate_bytes_per_sec_, /* max_trash_db_ratio= */ 1.1, 128 * 1024)); delete_scheduler_ = sst_file_mgr_->delete_scheduler(); + sst_file_mgr_->SetStatisticsPtr(stats_); } Env* env_; @@ -106,6 +108,7 @@ int64_t rate_bytes_per_sec_; DeleteScheduler* delete_scheduler_; std::unique_ptr sst_file_mgr_; + std::shared_ptr stats_; }; // Test the basic functionality of DeleteScheduler (Rate Limiting). 
@@ -182,6 +185,8 @@ ASSERT_EQ(num_files, dir_synced); ASSERT_EQ(CountTrashFiles(), 0); + ASSERT_EQ(num_files, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } @@ -219,6 +224,9 @@ ASSERT_EQ(0, CountTrashFiles(i)); } + ASSERT_EQ(kNumFiles, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -301,12 +309,16 @@ ASSERT_EQ(CountNormalFiles(), 0); ASSERT_EQ(CountTrashFiles(), 0); + ASSERT_EQ(num_files * thread_cnt, + stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } // Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure -// that when DeleteScheduler delete a file it delete it immediately and dont +// that when DeleteScheduler delete a file it delete it immediately and don't // move it to trash TEST_F(DeleteSchedulerTest, DisableRateLimiting) { int bg_delete_file = 0; @@ -318,8 +330,9 @@ rate_bytes_per_sec_ = 0; NewDeleteScheduler(); + constexpr int num_files = 10; - for (int i = 0; i < 10; i++) { + for (int i = 0; i < num_files; i++) { // Every file we delete will be deleted immediately std::string dummy_file = NewDummyFile("dummy.data"); ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file, "")); @@ -329,6 +342,9 @@ } ASSERT_EQ(bg_delete_file, 0); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(num_files, + stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -365,6 +381,8 @@ auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 0); + ASSERT_EQ(10, 
stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -406,7 +424,9 @@ delete_scheduler_->WaitForEmptyTrash(); auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 10); - + for (const auto& it : bg_errors) { + ASSERT_TRUE(it.second.IsPathNotFound()); + } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -439,9 +459,12 @@ auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 0); + ASSERT_EQ(10, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); } ASSERT_EQ(bg_delete_file, 50); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); } @@ -647,12 +670,14 @@ } for (std::string& file_name : generated_files) { - delete_scheduler_->DeleteFile(file_name, ""); + ASSERT_OK(delete_scheduler_->DeleteFile(file_name, "")); } // When we end up with 26 files in trash we will start // deleting new files immediately ASSERT_EQ(fg_delete_file, 74); + ASSERT_EQ(26, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(74, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -21,12 +21,14 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, +Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, + RandomAccessFileReader* reader, uint64_t 
offset, size_t n, bool for_compaction) { if (!enable_ || reader == nullptr) { return Status::OK(); } + TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start"); size_t alignment = reader->file()->GetRequiredBufferAlignment(); size_t offset_ = static_cast(offset); uint64_t rounddown_offset = Rounddown(offset_, alignment); @@ -86,18 +88,30 @@ } Slice result; - s = reader->Read(rounddown_offset + chunk_len, - static_cast(roundup_len - chunk_len), &result, - buffer_.BufferStart() + chunk_len, for_compaction); - if (s.ok()) { - buffer_offset_ = rounddown_offset; - buffer_.Size(static_cast(chunk_len) + result.size()); - } + size_t read_len = static_cast(roundup_len - chunk_len); + s = reader->Read(opts, rounddown_offset + chunk_len, read_len, &result, + buffer_.BufferStart() + chunk_len, nullptr, for_compaction); + if (!s.ok()) { + return s; + } + +#ifndef NDEBUG + if (result.size() < read_len) { + // Fake an IO error to force db_stress fault injection to ignore + // truncated read errors + IGNORE_STATUS_IF_ERROR(Status::IOError()); + } +#endif + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast(chunk_len) + result.size()); return s; } -bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, - Slice* result, bool for_compaction) { +bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, + Slice* result, Status* status, + bool for_compaction) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -106,21 +120,47 @@ } // If the buffer contains only a few of the requested bytes: - // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // If readahead is enabled: prefetch the remaining bytes + readahead bytes // and satisfy the request. // If readahead is not enabled: return false. 
+ TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache", + &readahead_size_); if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { if (readahead_size_ > 0) { - assert(file_reader_ != nullptr); + assert(reader != nullptr); assert(max_readahead_size_ >= readahead_size_); Status s; if (for_compaction) { - s = Prefetch(file_reader_, offset, std::max(n, readahead_size_), + s = Prefetch(opts, reader, offset, std::max(n, readahead_size_), for_compaction); } else { - s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + if (implicit_auto_readahead_) { + // Prefetch only if this read is sequential otherwise reset + // readahead_size_ to initial value. + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, n); + ResetValues(); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + num_file_reads_++; + if (num_file_reads_ <= kMinNumFileReadsToStartAutoReadahead) { + UpdateReadPattern(offset, n); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + } + s = Prefetch(opts, reader, offset, n + readahead_size_, for_compaction); } if (!s.ok()) { + if (status) { + *status = s; + } +#ifndef NDEBUG + IGNORE_STATUS_IF_ERROR(s); +#endif return false; } readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); @@ -128,7 +168,7 @@ return false; } } - + UpdateReadPattern(offset, n); uint64_t offset_in_buffer = offset - buffer_offset_; *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); return true; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_prefetch_buffer.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,23 +8,33 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #pragma once +#include #include #include #include -#include "file/random_access_file_reader.h" + +#include "file/readahead_file_info.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "util/aligned_buffer.h" namespace ROCKSDB_NAMESPACE { +#define DEAFULT_DECREMENT 8 * 1024 + +struct IOOptions; +class RandomAccessFileReader; + // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: + static const int kMinNumFileReadsToStartAutoReadahead = 2; + static const size_t kInitAutoReadaheadSize = 8 * 1024; + // Constructor. // // All arguments are optional. - // file_reader : the file reader to use. Can be a nullptr. // readahead_size : the initial readahead size. // max_readahead_size : the maximum readahead size. // If max_readahead_size > readahead_size, the readahead size will be @@ -36,54 +46,113 @@ // for the minimum offset if track_min_offset = true. // track_min_offset : Track the minimum offset ever read and collect stats on // it. Used for adaptable readahead of the file footer/metadata. + // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after + // doing sequential scans for two times. // - // Automatic readhead is enabled for a file if file_reader, readahead_size, + // Automatic readhead is enabled for a file if readahead_size // and max_readahead_size are passed in. - // If file_reader is a nullptr, setting readadhead_size and max_readahead_size - // does not make any sense. So it does nothing. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. 
- FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, - size_t readadhead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false) + FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false) : buffer_offset_(0), - file_reader_(file_reader), - readahead_size_(readadhead_size), + readahead_size_(readahead_size), max_readahead_size_(max_readahead_size), min_offset_read_(port::kMaxSizet), enable_(enable), - track_min_offset_(track_min_offset) {} + track_min_offset_(track_min_offset), + implicit_auto_readahead_(implicit_auto_readahead), + prev_offset_(0), + prev_len_(0), + num_file_reads_(kMinNumFileReadsToStartAutoReadahead + 1) {} // Load data into the buffer from a file. // reader : the file reader. // offset : the file offset to start reading from. // n : the number of bytes to read. // for_compaction : if prefetch is done for compaction read. - Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n, - bool for_compaction = false); + Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader, + uint64_t offset, size_t n, bool for_compaction = false); - // Tries returning the data for a file raed from this buffer, if that data is + // Tries returning the data for a file read from this buffer if that data is // in the buffer. // It handles tracking the minimum read offset if track_min_offset = true. - // It also does the exponential readahead when readadhead_size is set as part + // It also does the exponential readahead when readahead_size is set as part // of the constructor. // - // offset : the file offset. - // n : the number of bytes. - // result : output buffer to put the data into. - // for_compaction : if cache read is done for compaction read. - bool TryReadFromCache(uint64_t offset, size_t n, Slice* result, + // opts : the IO options to use. 
+ // reader : the file reader. + // offset : the file offset. + // n : the number of bytes. + // result : output buffer to put the data into. + // s : output status. + // for_compaction : true if cache read is done for compaction read. + bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, Status* s, bool for_compaction = false); // The minimum `offset` ever passed to TryReadFromCache(). This will nly be // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } + // Called in case of implicit auto prefetching. + void UpdateReadPattern(const uint64_t& offset, const size_t& len, + bool is_adaptive_readahead = false) { + if (is_adaptive_readahead) { + // Since this block was eligible for prefetch but it was found in + // cache, so check and decrease the readahead_size by 8KB (default) + // if eligible. + DecreaseReadAheadIfEligible(offset, len); + } + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const size_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + // Called in case of implicit auto prefetching. 
+ void ResetValues() { + num_file_reads_ = 1; + readahead_size_ = kInitAutoReadaheadSize; + } + + void GetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) { + readahead_info->readahead_size = readahead_size_; + readahead_info->num_file_reads = num_file_reads_; + } + + void DecreaseReadAheadIfEligible(uint64_t offset, size_t size, + size_t value = DEAFULT_DECREMENT) { + // Decrease the readahead_size if + // - its enabled internally by RocksDB (implicit_auto_readahead_) and, + // - readahead_size is greater than 0 and, + // - this block would have called prefetch API if not found in cache for + // which conditions are: + // - few/no bytes are in buffer and, + // - block is sequential with the previous read and, + // - num_file_reads_ + 1 (including this read) > + // kMinNumFileReadsToStartAutoReadahead + if (implicit_auto_readahead_ && readahead_size_ > 0) { + if ((offset + size > buffer_offset_ + buffer_.CurrentSize()) && + IsBlockSequential(offset) && + (num_file_reads_ + 1 > kMinNumFileReadsToStartAutoReadahead)) { + size_t initial_auto_readahead_size = kInitAutoReadaheadSize; + readahead_size_ = + std::max(initial_auto_readahead_size, + (readahead_size_ >= value ? readahead_size_ - value : 0)); + } + } + } + private: AlignedBuffer buffer_; uint64_t buffer_offset_; - RandomAccessFileReader* file_reader_; size_t readahead_size_; + // FilePrefetchBuffer object won't be created from Iterator flow if + // max_readahead_size_ = 0. size_t max_readahead_size_; // The minimum `offset` ever passed to TryReadFromCache(). size_t min_offset_read_; @@ -93,5 +162,12 @@ // If true, track minimum `offset` ever passed to TryReadFromCache(), which // can be fetched from min_offset_read(). bool track_min_offset_; + + // implicit_auto_readahead is enabled by rocksdb internally after 2 + // sequential IOs. 
+ bool implicit_auto_readahead_; + uint64_t prev_offset_; + size_t prev_len_; + int64_t num_file_reads_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,33 +17,35 @@ namespace ROCKSDB_NAMESPACE { // Utility function to copy a file up to a specified length -Status CopyFile(FileSystem* fs, const std::string& source, - const std::string& destination, uint64_t size, bool use_fsync) { +IOStatus CopyFile(FileSystem* fs, const std::string& source, + const std::string& destination, uint64_t size, bool use_fsync, + const std::shared_ptr& io_tracer) { const FileOptions soptions; - Status s; + IOStatus io_s; std::unique_ptr src_reader; std::unique_ptr dest_writer; { std::unique_ptr srcfile; - s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr); - if (!s.ok()) { - return s; + io_s = fs->NewSequentialFile(source, soptions, &srcfile, nullptr); + if (!io_s.ok()) { + return io_s; } std::unique_ptr destfile; - s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); - if (!s.ok()) { - return s; + io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); + if (!io_s.ok()) { + return io_s; } if (size == 0) { // default argument means copy everything - s = fs->GetFileSize(source, IOOptions(), &size, nullptr); - if (!s.ok()) { - return s; + io_s = fs->GetFileSize(source, IOOptions(), &size, nullptr); + if (!io_s.ok()) { + return io_s; } } - src_reader.reset(new SequentialFileReader(std::move(srcfile), source)); + src_reader.reset( + new SequentialFileReader(std::move(srcfile), source, io_tracer)); dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, soptions)); } @@ -52,16 +54,16 @@ Slice slice; while (size > 0) { 
size_t bytes_to_read = std::min(sizeof(buffer), static_cast(size)); - s = src_reader->Read(bytes_to_read, &slice, buffer); - if (!s.ok()) { - return s; + io_s = status_to_io_status(src_reader->Read(bytes_to_read, &slice, buffer)); + if (!io_s.ok()) { + return io_s; } if (slice.size() == 0) { - return Status::Corruption("file too small"); + return IOStatus::Corruption("file too small"); } - s = dest_writer->Append(slice); - if (!s.ok()) { - return s; + io_s = dest_writer->Append(slice); + if (!io_s.ok()) { + return io_s; } size -= slice.size(); } @@ -69,22 +71,22 @@ } // Utility function to create a file with the provided contents -Status CreateFile(FileSystem* fs, const std::string& destination, - const std::string& contents, bool use_fsync) { +IOStatus CreateFile(FileSystem* fs, const std::string& destination, + const std::string& contents, bool use_fsync) { const EnvOptions soptions; - Status s; + IOStatus io_s; std::unique_ptr dest_writer; std::unique_ptr destfile; - s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); - if (!s.ok()) { - return s; + io_s = fs->NewWritableFile(destination, soptions, &destfile, nullptr); + if (!io_s.ok()) { + return io_s; } dest_writer.reset( new WritableFileWriter(std::move(destfile), destination, soptions)); - s = dest_writer->Append(Slice(contents)); - if (!s.ok()) { - return s; + io_s = dest_writer->Append(Slice(contents)); + if (!io_s.ok()) { + return io_s; } return dest_writer->Sync(use_fsync); } @@ -110,15 +112,147 @@ #endif } -bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { - bool same = false; - assert(!db_options->db_paths.empty()); - Status s = db_options->env->AreFilesSame(db_options->wal_dir, - db_options->db_paths[0].path, &same); - if (s.IsNotSupported()) { - same = db_options->wal_dir == db_options->db_paths[0].path; +// requested_checksum_func_name brings the function name of the checksum +// generator in checksum_factory. 
Empty string is permitted, in which case the +// name of the generator created by the factory is unchecked. When +// `requested_checksum_func_name` is non-empty, however, the created generator's +// name must match it, otherwise an `InvalidArgument` error is returned. +IOStatus GenerateOneFileChecksum( + FileSystem* fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool allow_mmap_reads, + std::shared_ptr& io_tracer, RateLimiter* rate_limiter) { + if (checksum_factory == nullptr) { + return IOStatus::InvalidArgument("Checksum factory is invalid"); + } + assert(file_checksum != nullptr); + assert(file_checksum_func_name != nullptr); + + FileChecksumGenContext gen_context; + gen_context.requested_checksum_func_name = requested_checksum_func_name; + gen_context.file_name = file_path; + std::unique_ptr checksum_generator = + checksum_factory->CreateFileChecksumGenerator(gen_context); + if (checksum_generator == nullptr) { + std::string msg = + "Cannot get the file checksum generator based on the requested " + "checksum function name: " + + requested_checksum_func_name + + " from checksum factory: " + checksum_factory->Name(); + return IOStatus::InvalidArgument(msg); + } else { + // For backward compatibility and use in file ingestion clients where there + // is no stored checksum function name, `requested_checksum_func_name` can + // be empty. If we give the requested checksum function name, we expect it + // is the same name of the checksum generator. 
+ if (!requested_checksum_func_name.empty() && + checksum_generator->Name() != requested_checksum_func_name) { + std::string msg = "Expected file checksum generator named '" + + requested_checksum_func_name + + "', while the factory created one " + "named '" + + checksum_generator->Name() + "'"; + return IOStatus::InvalidArgument(msg); + } + } + + uint64_t size; + IOStatus io_s; + std::unique_ptr reader; + { + std::unique_ptr r_file; + io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr); + if (!io_s.ok()) { + return io_s; + } + io_s = fs->GetFileSize(file_path, IOOptions(), &size, nullptr); + if (!io_s.ok()) { + return io_s; + } + reader.reset(new RandomAccessFileReader(std::move(r_file), file_path, + nullptr /*Env*/, io_tracer, nullptr, + 0, nullptr, rate_limiter)); + } + + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + size_t default_max_read_ahead_size = 256 * 1024; + size_t readahead_size = (verify_checksums_readahead_size != 0) + ? 
verify_checksums_readahead_size + : default_max_read_ahead_size; + + FilePrefetchBuffer prefetch_buffer(readahead_size /* readahead_size */, + readahead_size /* max_readahead_size */, + !allow_mmap_reads /* enable */); + + Slice slice; + uint64_t offset = 0; + IOOptions opts; + while (size > 0) { + size_t bytes_to_read = + static_cast(std::min(uint64_t{readahead_size}, size)); + if (!prefetch_buffer.TryReadFromCache( + opts, reader.get(), offset, bytes_to_read, &slice, + nullptr /* status */, false /* for_compaction */)) { + return IOStatus::Corruption("file read failed"); + } + if (slice.size() == 0) { + return IOStatus::Corruption("file too small"); + } + checksum_generator->Update(slice.data(), slice.size()); + size -= slice.size(); + offset += slice.size(); + } + checksum_generator->Finalize(); + *file_checksum = checksum_generator->GetChecksum(); + *file_checksum_func_name = checksum_generator->Name(); + return IOStatus::OK(); +} + +Status DestroyDir(Env* env, const std::string& dir) { + Status s; + if (env->FileExists(dir).IsNotFound()) { + return s; + } + std::vector files_in_dir; + s = env->GetChildren(dir, &files_in_dir); + if (s.ok()) { + for (auto& file_in_dir : files_in_dir) { + std::string path = dir + "/" + file_in_dir; + bool is_dir = false; + s = env->IsDirectory(path, &is_dir); + if (s.ok()) { + if (is_dir) { + s = DestroyDir(env, path); + } else { + s = env->DeleteFile(path); + } + } else if (s.IsNotSupported()) { + s = Status::OK(); + } + if (!s.ok()) { + // IsDirectory, etc. 
might not report NotFound + if (s.IsNotFound() || env->FileExists(path).IsNotFound()) { + // Allow files to be deleted externally + s = Status::OK(); + } else { + break; + } + } + } + } + + if (s.ok()) { + s = env->DeleteDir(dir); + // DeleteDir might or might not report NotFound + if (!s.ok() && (s.IsNotFound() || env->FileExists(dir).IsNotFound())) { + // Allow to be deleted externally + s = Status::OK(); + } } - return same; + return s; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/file_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/file_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,24 +10,83 @@ #include "options/db_options.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/sst_file_writer.h" #include "rocksdb/status.h" +#include "rocksdb/system_clock.h" #include "rocksdb/types.h" +#include "trace_replay/io_tracer.h" namespace ROCKSDB_NAMESPACE { // use_fsync maps to options.use_fsync, which determines the way that // the file is synced after copying. 
-extern Status CopyFile(FileSystem* fs, const std::string& source, - const std::string& destination, uint64_t size, - bool use_fsync); +extern IOStatus CopyFile(FileSystem* fs, const std::string& source, + const std::string& destination, uint64_t size, + bool use_fsync, + const std::shared_ptr& io_tracer = nullptr); +inline IOStatus CopyFile(const std::shared_ptr& fs, + const std::string& source, + const std::string& destination, uint64_t size, + bool use_fsync, + const std::shared_ptr& io_tracer = nullptr) { + return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer); +} -extern Status CreateFile(FileSystem* fs, const std::string& destination, - const std::string& contents, bool use_fsync); +extern IOStatus CreateFile(FileSystem* fs, const std::string& destination, + const std::string& contents, bool use_fsync); + +inline IOStatus CreateFile(const std::shared_ptr& fs, + const std::string& destination, + const std::string& contents, bool use_fsync) { + return CreateFile(fs.get(), destination, contents, use_fsync); +} extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& path_to_sync, const bool force_bg, const bool force_fg); -extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options); +extern IOStatus GenerateOneFileChecksum( + FileSystem* fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool allow_mmap_reads, + std::shared_ptr& io_tracer, RateLimiter* rate_limiter = nullptr); + +inline IOStatus GenerateOneFileChecksum( + const std::shared_ptr& fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool 
allow_mmap_reads, + std::shared_ptr& io_tracer) { + return GenerateOneFileChecksum( + fs.get(), file_path, checksum_factory, requested_checksum_func_name, + file_checksum, file_checksum_func_name, verify_checksums_readahead_size, + allow_mmap_reads, io_tracer); +} + +inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, + SystemClock* clock, IOOptions& opts) { + if (ro.deadline.count()) { + std::chrono::microseconds now = + std::chrono::microseconds(clock->NowMicros()); + // Ensure there is atleast 1us available. We don't want to pass a value of + // 0 as that means no timeout + if (now >= ro.deadline) { + return IOStatus::TimedOut("Deadline exceeded"); + } + opts.timeout = ro.deadline - now; + } + + if (ro.io_timeout.count() && + (!opts.timeout.count() || ro.io_timeout < opts.timeout)) { + opts.timeout = ro.io_timeout; + } + return IOStatus::OK(); +} +// Test method to delete the input directory and all of its contents. +// This method is destructive and is meant for use only in tests!!! 
+Status DestroyDir(Env* env, const std::string& dir); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,6 @@ #include #include #include "file/writable_file_writer.h" -#include "logging/logging.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" @@ -21,9 +20,14 @@ namespace ROCKSDB_NAMESPACE { +const std::string kCurrentFileName = "CURRENT"; +const std::string kOptionsFileNamePrefix = "OPTIONS-"; +const std::string kTempFileNameSuffix = "dbtmp"; + static const std::string kRocksDbTFileExt = "sst"; static const std::string kLevelDbTFileExt = "ldb"; static const std::string kRocksDBBlobFileExt = "blob"; +static const std::string kArchivalDirName = "archive"; // Given a path, flatten the path name by replacing all chars not in // {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end. 
@@ -79,6 +83,11 @@ return MakeFileName(number, "log"); } +std::string BlobFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, kRocksDBBlobFileExt.c_str()); +} + std::string BlobFileName(const std::string& blobdirname, uint64_t number) { assert(number > 0); return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); @@ -92,11 +101,11 @@ } std::string ArchivalDirectory(const std::string& dir) { - return dir + "/" + ARCHIVAL_DIR; + return dir + "/" + kArchivalDirName; } std::string ArchivedLogFileName(const std::string& name, uint64_t number) { assert(number > 0); - return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log"); + return MakeFileName(name + "/" + kArchivalDirName, number, "log"); } std::string MakeTableFileName(const std::string& path, uint64_t number) { @@ -151,16 +160,20 @@ } } -std::string DescriptorFileName(const std::string& dbname, uint64_t number) { +std::string DescriptorFileName(uint64_t number) { assert(number > 0); char buf[100]; - snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", + snprintf(buf, sizeof(buf), "MANIFEST-%06llu", static_cast(number)); - return dbname + buf; + return buf; +} + +std::string DescriptorFileName(const std::string& dbname, uint64_t number) { + return dbname + "/" + DescriptorFileName(number); } std::string CurrentFileName(const std::string& dbname) { - return dbname + "/CURRENT"; + return dbname + "/" + kCurrentFileName; } std::string LockFileName(const std::string& dbname) { @@ -179,7 +192,8 @@ snprintf(buf, sizeof(buf), kInfoLogPrefix); prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1); } else { - size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf)); + size_t len = + GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf)); prefix = Slice(buf, len); } } @@ -208,11 +222,14 @@ return log_dir + "/" + info_log_prefix.buf + ".old." 
+ buf; } -std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { +std::string OptionsFileName(uint64_t file_num) { char buffer[256]; snprintf(buffer, sizeof(buffer), "%s%06" PRIu64, kOptionsFileNamePrefix.c_str(), file_num); - return dbname + "/" + buffer; + return buffer; +} +std::string OptionsFileName(const std::string& dbname, uint64_t file_num) { + return dbname + "/" + OptionsFileName(file_num); } std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) { @@ -326,11 +343,12 @@ // Avoid strtoull() to keep filename format independent of the // current locale bool archive_dir_found = false; - if (rest.starts_with(ARCHIVAL_DIR)) { - if (rest.size() <= ARCHIVAL_DIR.size()) { + if (rest.starts_with(kArchivalDirName)) { + if (rest.size() <= kArchivalDirName.size()) { return false; } - rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also + rest.remove_prefix(kArchivalDirName.size() + + 1); // Add 1 to remove / also if (log_type) { *log_type = kArchivedLogFile; } @@ -347,7 +365,7 @@ Slice suffix = rest; if (suffix == Slice("log")) { - *type = kLogFile; + *type = kWalFile; if (log_type && !archive_dir_found) { *log_type = kAliveLogFile; } @@ -368,27 +386,34 @@ return true; } -Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number, - Directory* directory_to_fsync) { +IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, + uint64_t descriptor_number, + FSDirectory* directory_to_fsync) { // Remove leading "dbname/" and add newline to manifest file name std::string manifest = DescriptorFileName(dbname, descriptor_number); Slice contents = manifest; assert(contents.starts_with(dbname + "/")); contents.remove_prefix(dbname.size() + 1); std::string tmp = TempFileName(dbname, descriptor_number); - Status s = WriteStringToFile(env, contents.ToString() + "\n", tmp, true); + IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true); + 
TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { - TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2); - s = env->RenameFile(tmp, CurrentFileName(dbname)); - TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2); + s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s); } if (s.ok()) { if (directory_to_fsync != nullptr) { - s = directory_to_fsync->Fsync(); + s = directory_to_fsync->FsyncWithDirOptions( + IOOptions(), nullptr, DirFsyncOptions(CurrentFileName(dbname))); } } else { - env->DeleteFile(tmp); + fs->DeleteFile(tmp, IOOptions(), nullptr) + .PermitUncheckedError(); // NOTE: PermitUncheckedError is acceptable + // here as we are already handling an error + // case, and this is just a best-attempt + // effort at some cleanup } return s; } @@ -404,30 +429,41 @@ assert(!id.empty()); // Reserve the filename dbname/000000.dbtmp for the temporary identity file std::string tmp = TempFileName(dbname, 0); + std::string identify_file_name = IdentityFileName(dbname); Status s = WriteStringToFile(env, id, tmp, true); if (s.ok()) { - s = env->RenameFile(tmp, IdentityFileName(dbname)); + s = env->RenameFile(tmp, identify_file_name); + } + std::unique_ptr dir_obj; + if (s.ok()) { + s = env->GetFileSystem()->NewDirectory(dbname, IOOptions(), &dir_obj, + nullptr); + } + if (s.ok()) { + s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr, + DirFsyncOptions(identify_file_name)); } if (!s.ok()) { - env->DeleteFile(tmp); + env->DeleteFile(tmp).PermitUncheckedError(); } return s; } -Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, - WritableFileWriter* file) { - TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2); - StopWatch sw(env, db_options->statistics.get(), 
MANIFEST_FILE_SYNC_MICROS); +IOStatus SyncManifest(const ImmutableDBOptions* db_options, + WritableFileWriter* file) { + TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2); + StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS); return file->Sync(db_options->use_fsync); } -Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, - const std::string& dbname, std::string* parent_dir, +Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, + std::string* parent_dir, std::vector* info_log_list) { assert(parent_dir != nullptr); assert(info_log_list != nullptr); uint64_t number = 0; - FileType type = kLogFile; + FileType type = kWalFile; if (!db_log_dir.empty()) { *parent_dir = db_log_dir; @@ -438,7 +474,7 @@ InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); std::vector file_names; - Status s = env->GetChildren(*parent_dir, &file_names); + Status s = fs->GetChildren(*parent_dir, IOOptions(), &file_names, nullptr); if (!s.ok()) { return s; @@ -453,4 +489,16 @@ return Status::OK(); } +std::string NormalizePath(const std::string& path) { + std::string dst; + for (auto c : path) { + if (!dst.empty() && (c == kFilePathSeparator || c == '/') && + (dst.back() == kFilePathSeparator || dst.back() == '/')) { + continue; + } + dst.push_back(c); + } + return dst; +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/filename.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/filename.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,7 @@ #include "options/db_options.h" #include "port/port.h" +#include "rocksdb/file_system.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -26,21 +27,14 @@ class Env; class Directory; +class SystemClock; class 
WritableFileWriter; -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile, // Either the current one, or an old one - kMetaDatabase, - kIdentityFile, - kOptionsFile, - kBlobFile -}; +#ifdef OS_WIN +constexpr char kFilePathSeparator = '\\'; +#else +constexpr char kFilePathSeparator = '/'; +#endif // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with @@ -49,13 +43,13 @@ extern std::string LogFileName(uint64_t number); +extern std::string BlobFileName(uint64_t number); + extern std::string BlobFileName(const std::string& bdirname, uint64_t number); extern std::string BlobFileName(const std::string& dbname, const std::string& blob_dir, uint64_t number); -static const std::string ARCHIVAL_DIR = "archive"; - extern std::string ArchivalDirectory(const std::string& dbname); // Return the name of the archived log file with the specified number @@ -93,6 +87,10 @@ extern std::string DescriptorFileName(const std::string& dbname, uint64_t number); +extern std::string DescriptorFileName(uint64_t number); + +extern const std::string kCurrentFileName; // = "CURRENT" + // Return the name of the current file. This file contains the name // of the current manifest file. The result will be prefixed with // "dbname". @@ -126,13 +124,14 @@ const std::string& db_path = "", const std::string& log_dir = ""); -static const std::string kOptionsFileNamePrefix = "OPTIONS-"; -static const std::string kTempFileNameSuffix = "dbtmp"; +extern const std::string kOptionsFileNamePrefix; // = "OPTIONS-" +extern const std::string kTempFileNameSuffix; // = "dbtmp" // Return a options file name given the "dbname" and file number. // Format: OPTIONS-[number].dbtmp extern std::string OptionsFileName(const std::string& dbname, uint64_t file_num); +extern std::string OptionsFileName(uint64_t file_num); // Return a temp options file name given the "dbname" and file number. 
// Format: OPTIONS-[number] @@ -162,24 +161,27 @@ // Make the CURRENT file point to the descriptor file with the // specified number. -extern Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number, - Directory* directory_to_fsync); +extern IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, + uint64_t descriptor_number, + FSDirectory* directory_to_fsync); // Make the IDENTITY file for the db extern Status SetIdentityFile(Env* env, const std::string& dbname, const std::string& db_id = {}); // Sync manifest file `file`. -extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, - WritableFileWriter* file); +extern IOStatus SyncManifest(const ImmutableDBOptions* db_options, + WritableFileWriter* file); // Return list of file names of info logs in `file_names`. // The list only contains file name. The parent directory name is stored // in `parent_dir`. // `db_log_dir` should be the one as in options.db_log_dir -extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, +extern Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, std::string* parent_dir, std::vector* file_names); + +extern std::string NormalizePath(const std::string& path); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,68 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "file/line_file_reader.h" + +#include + +#include "monitoring/iostats_context_imp.h" + +namespace ROCKSDB_NAMESPACE { + +IOStatus LineFileReader::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new LineFileReader(std::move(file), fname)); + } + return io_s; +} + +bool LineFileReader::ReadLine(std::string* out) { + assert(out); + if (!io_status_.ok()) { + // Status should be checked (or permit unchecked) any time we return false. + io_status_.MustCheck(); + return false; + } + out->clear(); + for (;;) { + // Look for line delimiter + const char* found = static_cast( + std::memchr(buf_begin_, '\n', buf_end_ - buf_begin_)); + if (found) { + size_t len = found - buf_begin_; + out->append(buf_begin_, len); + buf_begin_ += len + /*delim*/ 1; + ++line_number_; + return true; + } + if (at_eof_) { + io_status_.MustCheck(); + return false; + } + // else flush and reload buffer + out->append(buf_begin_, buf_end_ - buf_begin_); + Slice result; + io_status_ = sfr_.Read(buf_.size(), &result, buf_.data()); + IOSTATS_ADD(bytes_read, result.size()); + if (!io_status_.ok()) { + io_status_.MustCheck(); + return false; + } + if (result.size() != buf_.size()) { + // The obscure way of indicating EOF + at_eof_ = true; + } + buf_begin_ = result.data(); + buf_end_ = result.data() + result.size(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/line_file_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/line_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include + +#include "file/sequence_file_reader.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper on top of Env::SequentialFile for reading text lines from a file. +// Lines are delimited by '\n'. The last line may or may not include a +// trailing newline. Uses SequentialFileReader internally. +class LineFileReader { + private: + std::array buf_; + SequentialFileReader sfr_; + IOStatus io_status_; + const char* buf_begin_ = buf_.data(); + const char* buf_end_ = buf_.data(); + size_t line_number_ = 0; + bool at_eof_ = false; + + public: + // See SequentialFileReader constructors + template + explicit LineFileReader(Args&&... args) + : sfr_(std::forward(args)...) {} + + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); + + LineFileReader(const LineFileReader&) = delete; + LineFileReader& operator=(const LineFileReader&) = delete; + + // Reads another line from the file, returning true on success and saving + // the line to `out`, without delimiter, or returning false on failure. You + // must check GetStatus() to determine whether the failure was just + // end-of-file (OK status) or an I/O error (another status). + bool ReadLine(std::string* out); + + // Returns the number of the line most recently returned from ReadLine. + // Return value is unspecified if ReadLine has returned false due to + // I/O error. After ReadLine returns false due to end-of-file, return + // value is the last returned line number, or equivalently the total + // number of lines returned. + size_t GetLineNumber() const { return line_number_; } + + // Returns any error encountered during read. 
The error is considered + // permanent and no retry or recovery is attempted with the same + // LineFileReader. + const IOStatus& GetStatus() const { return io_status_; } +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/prefetch_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/prefetch_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/prefetch_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/prefetch_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,1004 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class MockFS; + +class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper { + public: + MockRandomAccessFile(std::unique_ptr& file, + bool support_prefetch, std::atomic_int& prefetch_count) + : FSRandomAccessFileOwnerWrapper(std::move(file)), + support_prefetch_(support_prefetch), + prefetch_count_(prefetch_count) {} + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { + if (support_prefetch_) { + prefetch_count_.fetch_add(1); + return target()->Prefetch(offset, n, options, dbg); + } else { + return IOStatus::NotSupported("Prefetch not supported"); + } + } + + private: + const bool support_prefetch_; + std::atomic_int& prefetch_count_; +}; + +class MockFS : public FileSystemWrapper { + public: + explicit MockFS(const std::shared_ptr& wrapped, + bool support_prefetch) + : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {} + + static const char* kClassName() { return "MockFS"; } + const char* Name() const override { return kClassName(); } + + IOStatus 
NewRandomAccessFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s; + s = target()->NewRandomAccessFile(fname, opts, &file, dbg); + result->reset( + new MockRandomAccessFile(file, support_prefetch_, prefetch_count_)); + return s; + } + + void ClearPrefetchCount() { prefetch_count_ = 0; } + + bool IsPrefetchCalled() { return prefetch_count_ > 0; } + + int GetPrefetchCount() { + return prefetch_count_.load(std::memory_order_relaxed); + } + + private: + const bool support_prefetch_; + std::atomic_int prefetch_count_{0}; +}; + +class PrefetchTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + PrefetchTest() : DBTestBase("prefetch_test", true) {} +}; + +INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + +std::string BuildKey(int num, std::string postfix = "") { + return "my_key_" + std::to_string(num) + postfix; +} + +TEST_P(PrefetchTest, Basic) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + const int kNumKeys = 1100; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + 
+ Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + // create first key range + WriteBatch batch; + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key")); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + // create second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key")); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + // delete second key range + batch.Clear(); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Delete(BuildKey(i, "key2"))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + // compact database + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + // commenting out the line below causes the example to work correctly + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + if (support_prefetch && !use_direct_io) { + // If underline file system supports prefetch, and directIO is not enabled + // make sure prefetch() is called and FilePrefetchBuffer is not used. + ASSERT_TRUE(fs->IsPrefetchCalled()); + fs->ClearPrefetchCount(); + ASSERT_EQ(0, buff_prefetch_count); + } else { + // If underline file system doesn't support prefetch, or directIO is + // enabled, make sure prefetch() is not called and FilePrefetchBuffer is + // used. + ASSERT_FALSE(fs->IsPrefetchCalled()); + ASSERT_GT(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + + // count the keys + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + } + + // Make sure prefetch is called only if file system support prefetch. 
+ if (support_prefetch && !use_direct_io) { + ASSERT_TRUE(fs->IsPrefetchCalled()); + fs->ClearPrefetchCount(); + ASSERT_EQ(0, buff_prefetch_count); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + ASSERT_GT(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + Close(); +} + +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + options.disable_auto_compactions = true; + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.max_auto_readahead_size = 0; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + // DB open will create table readers unless we reduce the table cache + // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table + // cache is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 10 so table cache capacity will become 0. 
This will + // prevent file open during DB open and force the file to be opened during + // Iteration. + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + Random rnd(309); + int key_count = 0; + const int num_keys_per_level = 100; + // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299]. + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + for (int i = 0; i < num_keys_per_level; ++i) { + ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + Close(); + std::vector buff_prefectch_level_count = {0, 0, 0}; + TryReopen(options); + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + switch (level) { + case 0: + // max_auto_readahead_size is set 0 so data and index blocks are not + // prefetched. + ASSERT_OK(db_->SetOptions( + {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}})); + break; + case 1: + // max_auto_readahead_size is set less than + // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains + // equal to max_auto_readahead_size. 
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=4096;}"}})); + break; + case 2: + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=65536;}"}})); + break; + default: + assert(false); + } + + for (int i = 0; i < num_keys_per_level; ++i) { + iter->Seek(Key(key_count++)); + iter->Next(); + } + + buff_prefectch_level_count[level] = buff_prefetch_count; + if (support_prefetch && !use_direct_io) { + if (level == 0) { + ASSERT_FALSE(fs->IsPrefetchCalled()); + } else { + ASSERT_TRUE(fs->IsPrefetchCalled()); + } + fs->ClearPrefetchCount(); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + if (level == 0) { + ASSERT_EQ(buff_prefetch_count, 0); + } else { + ASSERT_GT(buff_prefetch_count, 0); + } + buff_prefetch_count = 0; + } + } + } + + if (!support_prefetch) { + ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_P(PrefetchTest, PrefetchWhenReseek) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). + */ + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch Data + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); // Prefetch Data + iter->Seek(BuildKey(1019)); + // Missed 2 blocks but they are already in buffer so no reset. + iter->Seek(BuildKey(103)); // Already in buffer. 
+ iter->Seek(BuildKey(1033)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 3); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from non sequential data blocks within same partitioned + * index. buff_prefetch_count will be 0 in that case. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1048)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reesek keys from Single Data Block. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1)); + iter->Seek(BuildKey(10)); + iter->Seek(BuildKey(100)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from sequential data blocks to set implicit auto readahead + * and prefetch data but after that iterate over different (non sequential) + * data blocks which won't prefetch any data further. So buff_prefetch_count + * will be 1 for the first one. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1008)); + iter->Seek( + BuildKey(996)); // Reseek won't prefetch any data and + // readahead_size will be initiallized to 8*1024. 
+ iter->Seek(BuildKey(992)); + iter->Seek(BuildKey(989)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + + // Read sequentially to confirm readahead_size is reset to initial value (2 + // more data blocks) + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); // Prefetch Data + iter->Seek(BuildKey(1022)); + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 2); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + { + /* Reseek keys from sequential partitioned index block. Since partitioned + * index fetch are sequential, buff_prefetch_count will be 1. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1167)); + iter->Seek(BuildKey(1334)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1499)); + iter->Seek(BuildKey(1667)); + iter->Seek(BuildKey(1847)); + iter->Seek(BuildKey(1999)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek over different keys from different blocks. buff_prefetch_count is + * set 0. 
+ */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + int i = 0; + int j = 1000; + do { + iter->Seek(BuildKey(i)); + if (!iter->Valid()) { + break; + } + i = i + 100; + iter->Seek(BuildKey(j)); + j = j + 100; + } while (i < 1000 && j < kNumKeys && iter->Valid()); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* Iterates sequentially over all keys. It will prefetch the buffer.*/ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + } + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 13); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 13); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} + +TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + 
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). 
+ */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + // Warm up the cache + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + // After caching, blocks will be read from cache (Sequential blocks) + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch data (not in cache). + // Missed one sequential block but next is in already in buffer so readahead + // will not be reset. + iter->Seek(BuildKey(1011)); + // Prefetch data but blocks are in cache so no prefetch and reset. + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1022)); + // Prefetch data with readahead_size = 4 blocks. + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1037)); + + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} + +class PrefetchTest1 + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + PrefetchTest1() : DBTestBase("prefetch_test1", true) {} +}; + +INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest1, DBIterLevelReadAhead) { + const int kNumKeys = 1000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + bool is_adaptive_readahead = std::get<1>(GetParam()); + 
Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (std::get<0>(GetParam())) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (std::get<0>(GetParam()) && + (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int j = 0; j < 5; j++) { + for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + int buff_prefetch_count = 0; + int readahead_carry_over_count = 0; + int num_sst_files = NumTableFilesAtLevel(2); + size_t current_readahead_size = 0; + + // Test - Iterate over the keys sequentially. + { + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + // The callback checks, since reads are sequential, readahead_size doesn't + // start from 8KB when iterator moves to next file and its called + // num_sst_files-1 times (excluding for first file). 
+ SyncPoint::GetInstance()->SetCallBack( + "BlockPrefetcher::SetReadaheadState", [&](void* arg) { + readahead_carry_over_count++; + size_t readahead_size = *reinterpret_cast(arg); + if (readahead_carry_over_count) { + ASSERT_GT(readahead_size, 8 * 1024); + // ASSERT_GE(readahead_size, current_readahead_size); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { + current_readahead_size = *reinterpret_cast(arg); + ASSERT_GT(current_readahead_size, 0); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ro; + if (is_adaptive_readahead) { + ro.adaptive_readahead = true; + } + auto iter = std::unique_ptr(db_->NewIterator(ro)); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + num_keys++; + } + + ASSERT_GT(buff_prefetch_count, 0); + buff_prefetch_count = 0; + // For index and data blocks. + if (is_adaptive_readahead) { + ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1)); + } else { + ASSERT_EQ(readahead_carry_over_count, 0); + } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + Close(); +} +#endif //! 
ROCKSDB_LITE + +class PrefetchTest2 : public DBTestBase, + public ::testing::WithParamInterface { + public: + PrefetchTest2() : DBTestBase("prefetch_test2", true) {} +}; + +INSTANTIATE_TEST_CASE_P(PrefetchTest2, PrefetchTest2, ::testing::Bool()); + +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest2, NonSequentialReads) { + const int kNumKeys = 1000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (GetParam()) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int j = 0; j < 5; j++) { + for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + int buff_prefetch_count = 0; + int set_readahead = 0; + size_t readahead_size = 0; + + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack( + "BlockPrefetcher::SetReadaheadState", + [&](void* /*arg*/) { set_readahead++; }); + SyncPoint::GetInstance()->SetCallBack( + 
"FilePrefetchBuffer::TryReadFromCache", + [&](void* arg) { readahead_size = *reinterpret_cast(arg); }); + + SyncPoint::GetInstance()->EnableProcessing(); + + { + // Iterate until prefetch is done. + ReadOptions ro; + ro.adaptive_readahead = true; + auto iter = std::unique_ptr(db_->NewIterator(ro)); + iter->SeekToFirst(); + while (iter->Valid() && buff_prefetch_count == 0) { + iter->Next(); + } + ASSERT_EQ(readahead_size, 8 * 1024); + ASSERT_EQ(buff_prefetch_count, 1); + ASSERT_EQ(set_readahead, 0); + buff_prefetch_count = 0; + + // Move to last file and check readahead size fallbacks to 8KB. So next + // readahead size after prefetch should be 8 * 1024; + iter->Seek(BuildKey(4004)); + while (iter->Valid() && buff_prefetch_count == 0) { + iter->Next(); + } + ASSERT_EQ(readahead_size, 8 * 1024); + ASSERT_EQ(set_readahead, 0); + ASSERT_EQ(buff_prefetch_count, 1); + } + Close(); +} +#endif //! ROCKSDB_LITE + +TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) { + const int kNumKeys = 2000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + if (GetParam()) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test 
+ return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + int buff_prefetch_count = 0; + size_t current_readahead_size = 0; + size_t expected_current_readahead_size = 8 * 1024; + size_t decrease_readahead_size = 8 * 1024; + + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { + current_readahead_size = *reinterpret_cast(arg); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + ReadOptions ro; + ro.adaptive_readahead = true; + { + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data + * more initially (2 more data blocks). + */ + auto iter = std::unique_ptr(db_->NewIterator(ro)); + // Warm up the cache + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + buff_prefetch_count = 0; + } + { + // After caching, blocks will be read from cache (Sequential blocks) + auto iter = std::unique_ptr(db_->NewIterator(ro)); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch data (not in cache). + ASSERT_EQ(current_readahead_size, expected_current_readahead_size); + + // Missed one sequential block but 1011 is already in buffer so + // readahead will not be reset. 
+ iter->Seek(BuildKey(1011)); + ASSERT_EQ(current_readahead_size, expected_current_readahead_size); + + // Eligible to Prefetch data (not in buffer) but block is in cache so no + // prefetch will happen and will result in decrease in readahead_size. + // readahead_size will be 8 * 1024 + iter->Seek(BuildKey(1015)); + expected_current_readahead_size -= decrease_readahead_size; + + // 1016 is the same block as 1015. So no change in readahead_size. + iter->Seek(BuildKey(1016)); + + // Prefetch data (not in buffer) but found in cache. So decrease + // readahead_size. Since it will 0 after decrementing so readahead_size will + // be set to initial value. + iter->Seek(BuildKey(1019)); + expected_current_readahead_size = std::max( + decrease_readahead_size, + (expected_current_readahead_size >= decrease_readahead_size + ? (expected_current_readahead_size - decrease_readahead_size) + : 0)); + + // Prefetch next sequential data. + iter->Seek(BuildKey(1022)); + ASSERT_EQ(current_readahead_size, expected_current_readahead_size); + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,20 +12,130 @@ #include #include +#include "file/file_util.h" #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "table/format.h" #include "test_util/sync_point.h" #include "util/random.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status 
RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, - char* scratch, bool for_compaction) const { - Status s; +inline void IOStatsAddBytesByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, value); + break; + default: + break; + } +} + +inline void IOStatsAddCountByTemperature(Temperature file_temperature, + size_t value) { + if (file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, value); + break; + case Temperature::kWarm: + IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, value); + break; + case Temperature::kCold: + IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, value); + break; + default: + break; + } +} + +inline void StatisticAddBytesByTemperature(Statistics* stats, + Temperature file_temperature, + size_t value) { + if (stats == nullptr || file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + RecordTick(stats, HOT_FILE_READ_BYTES, value); + break; + case Temperature::kWarm: + RecordTick(stats, WARM_FILE_READ_BYTES, value); + break; + case Temperature::kCold: + RecordTick(stats, COLD_FILE_READ_BYTES, value); + break; + default: + break; + } +} + +inline void StatisticAddCountByTemperature(Statistics* stats, + Temperature file_temperature, + size_t value) { + if (stats == nullptr || file_temperature == Temperature::kUnknown) { + return; + } + switch (file_temperature) { + case Temperature::kHot: + RecordTick(stats, 
HOT_FILE_READ_COUNT, value); + break; + case Temperature::kWarm: + RecordTick(stats, WARM_FILE_READ_COUNT, value); + break; + case Temperature::kCold: + RecordTick(stats, COLD_FILE_READ_COUNT, value); + break; + default: + break; + } +} + +IOStatus RandomAccessFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new RandomAccessFileReader(std::move(file), fname)); + } + return io_s; +} + +IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, + size_t n, Slice* result, char* scratch, + AlignedBuf* aligned_buf, + bool for_compaction) const { + (void)aligned_buf; + + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr); + + // To be paranoid: modify scratch a little bit, so in case underlying + // FileSystem doesn't fill the buffer but return success and `scratch` returns + // contains a previous block, returned value will not pass checksum. + if (n > 0 && scratch != nullptr) { + // This byte might not change anything for direct I/O case, but it's OK. + scratch[0]++; + } + + IOStatus io_s; uint64_t elapsed = 0; { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -53,32 +163,47 @@ } Slice tmp; - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; uint64_t orig_offset = 0; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); orig_offset = aligned_offset + buf.CurrentSize(); } + { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, - IOOptions(), &tmp, buf.Destination(), nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + // Only user reads are expected to specify a timeout. And user reads + // are not subjected to rate_limiter and should go through only + // one iteration of this loop, so we don't need to check and adjust + // the opts.timeout before calling file_->Read + assert(!opts.timeout.count() || allowed == read_size); + io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts, + &tmp, buf.Destination(), nullptr); } if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, - s); + io_s); + if (!io_s.ok()) { + NotifyOnIOError(io_s, FileOperationType::kRead, file_name(), + tmp.size(), orig_offset); + } } buf.Size(buf.CurrentSize() + tmp.size()); - if (!s.ok() || tmp.size() < allowed) { + if (!io_s.ok() || tmp.size() < allowed) { break; } } size_t res_len = 0; - if (s.ok() && offset_advance < buf.CurrentSize()) { - res_len = buf.Read(scratch, offset_advance, - std::min(buf.CurrentSize() - offset_advance, n)); + if (io_s.ok() && offset_advance < buf.CurrentSize()) { + res_len = std::min(buf.CurrentSize() - offset_advance, n); + if (aligned_buf == nullptr) { + buf.Read(scratch, offset_advance, res_len); + } else { + scratch = buf.BufferStart() + offset_advance; + 
aligned_buf->reset(buf.Release()); + } } *result = Slice(scratch, res_len); #endif // !ROCKSDB_LITE @@ -103,24 +228,34 @@ Slice tmp_result; #ifndef ROCKSDB_LITE - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); } #endif + { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->Read(offset + pos, allowed, IOOptions(), &tmp_result, - scratch + pos, nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + // Only user reads are expected to specify a timeout. And user reads + // are not subjected to rate_limiter and should go through only + // one iteration of this loop, so we don't need to check and adjust + // the opts.timeout before calling file_->Read + assert(!opts.timeout.count() || allowed == n); + io_s = file_->Read(offset + pos, allowed, opts, &tmp_result, + scratch + pos, nullptr); } #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, - finish_ts, s); + finish_ts, io_s); + + if (!io_s.ok()) { + NotifyOnIOError(io_s, FileOperationType::kRead, file_name(), + tmp_result.size(), offset + pos); + } } #endif - if (res_scratch == nullptr) { // we can't simply use `scratch` because reads of mmap'd files return // data in a different buffer. @@ -130,53 +265,194 @@ assert(tmp_result.data() == res_scratch + pos); } pos += tmp_result.size(); - if (!s.ok() || tmp_result.size() < allowed) { + if (!io_s.ok() || tmp_result.size() < allowed) { break; } } - *result = Slice(res_scratch, s.ok() ? pos : 0); + *result = Slice(res_scratch, io_s.ok() ? 
pos : 0); } - IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + IOSTATS_ADD(bytes_read, result->size()); + IOStatsAddBytesByTemperature(file_temperature_, result->size()); + IOStatsAddCountByTemperature(file_temperature_, 1); + StatisticAddBytesByTemperature(stats_, file_temperature_, result->size()); + StatisticAddCountByTemperature(stats_, file_temperature_, 1); SetPerfLevel(prev_perf_level); } if (stats_ != nullptr && file_read_hist_ != nullptr) { file_read_hist_->Add(elapsed); } - return s; + return io_s; +} + +size_t End(const FSReadRequest& r) { + return static_cast(r.offset) + r.len; +} + +FSReadRequest Align(const FSReadRequest& r, size_t alignment) { + FSReadRequest req; + req.offset = static_cast( + TruncateToPageBoundary(alignment, static_cast(r.offset))); + req.len = Roundup(End(r), alignment) - req.offset; + req.scratch = nullptr; + return req; } -Status RandomAccessFileReader::MultiRead(FSReadRequest* read_reqs, - size_t num_reqs) const { - Status s; +bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) { + size_t dest_offset = static_cast(dest->offset); + size_t src_offset = static_cast(src.offset); + size_t dest_end = End(*dest); + size_t src_end = End(src); + if (std::max(dest_offset, src_offset) > std::min(dest_end, src_end)) { + return false; + } + dest->offset = static_cast(std::min(dest_offset, src_offset)); + dest->len = std::max(dest_end, src_end) - dest->offset; + return true; +} + +IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, + FSReadRequest* read_reqs, + size_t num_reqs, + AlignedBuf* aligned_buf) const { + (void)aligned_buf; // suppress warning of unused variable in LITE mode + assert(num_reqs > 0); + +#ifndef NDEBUG + for (size_t i = 0; i < num_reqs - 1; ++i) { + assert(read_reqs[i].offset <= read_reqs[i + 1].offset); + } +#endif // !NDEBUG + + // To be paranoid modify scratch a little bit, so in case underlying + // FileSystem doesn't fill the buffer but return succee and `scratch` returns + // contains 
a previous block, returned value will not pass checksum. + // This byte might not change anything for direct I/O case, but it's OK. + for (size_t i = 0; i < num_reqs; i++) { + FSReadRequest& r = read_reqs[i]; + if (r.len > 0 && r.scratch != nullptr) { + r.scratch[0]++; + } + } + + IOStatus io_s; uint64_t elapsed = 0; - assert(!use_direct_io()); { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); IOSTATS_TIMER_GUARD(read_nanos); + FSReadRequest* fs_reqs = read_reqs; + size_t num_fs_reqs = num_reqs; +#ifndef ROCKSDB_LITE + std::vector aligned_reqs; + if (use_direct_io()) { + // num_reqs is the max possible size, + // this can reduce std::vecector's internal resize operations. + aligned_reqs.reserve(num_reqs); + // Align and merge the read requests. + size_t alignment = file_->GetRequiredBufferAlignment(); + for (size_t i = 0; i < num_reqs; i++) { + const auto& r = Align(read_reqs[i], alignment); + if (i == 0) { + // head + aligned_reqs.push_back(r); + + } else if (!TryMerge(&aligned_reqs.back(), r)) { + // head + n + aligned_reqs.push_back(r); + + } else { + // unused + r.status.PermitUncheckedError(); + } + } + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs", + &aligned_reqs); + + // Allocate aligned buffer and let scratch buffers point to it. 
+ size_t total_len = 0; + for (const auto& r : aligned_reqs) { + total_len += r.len; + } + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(total_len); + char* scratch = buf.BufferStart(); + for (auto& r : aligned_reqs) { + r.scratch = scratch; + scratch += r.len; + } + + aligned_buf->reset(buf.Release()); + fs_reqs = aligned_reqs.data(); + num_fs_reqs = aligned_reqs.size(); + } +#endif // ROCKSDB_LITE + #ifndef ROCKSDB_LITE - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); } #endif // ROCKSDB_LITE + { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->MultiRead(read_reqs, num_reqs, IOOptions(), nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr); } + +#ifndef ROCKSDB_LITE + if (use_direct_io()) { + // Populate results in the unaligned read requests. + size_t aligned_i = 0; + for (size_t i = 0; i < num_reqs; i++) { + auto& r = read_reqs[i]; + if (static_cast(r.offset) > End(aligned_reqs[aligned_i])) { + aligned_i++; + } + const auto& fs_r = fs_reqs[aligned_i]; + r.status = fs_r.status; + if (r.status.ok()) { + uint64_t offset = r.offset - fs_r.offset; + if (fs_r.result.size() <= offset) { + // No byte in the read range is returned. 
+ r.result = Slice(); + } else { + size_t len = std::min( + r.len, static_cast(fs_r.result.size() - offset)); + r.result = Slice(fs_r.scratch + offset, len); + } + } else { + r.result = Slice(); + } + } + } +#endif // ROCKSDB_LITE + for (size_t i = 0; i < num_reqs; ++i) { #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(), start_ts, finish_ts, read_reqs[i].status); } + if (!read_reqs[i].status.ok()) { + NotifyOnIOError(read_reqs[i].status, FileOperationType::kRead, + file_name(), read_reqs[i].result.size(), + read_reqs[i].offset); + } + #endif // ROCKSDB_LITE - IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size()); + IOSTATS_ADD(bytes_read, read_reqs[i].result.size()); + IOStatsAddBytesByTemperature(file_temperature_, + read_reqs[i].result.size()); + IOStatsAddCountByTemperature(file_temperature_, 1); + StatisticAddBytesByTemperature(stats_, file_temperature_, + read_reqs[i].result.size()); + StatisticAddCountByTemperature(stats_, file_temperature_, 1); } SetPerfLevel(prev_perf_level); } @@ -184,6 +460,15 @@ file_read_hist_->Add(elapsed); } - return s; + return io_s; +} + +IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, + IOOptions& opts) { + if (clock_ != nullptr) { + return PrepareIOFromReadOptions(ro, clock_, opts); + } else { + return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts); + } } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,18 +11,34 @@ #include #include 
#include + +#include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/listener.h" +#include "rocksdb/options.h" #include "rocksdb/rate_limiter.h" #include "util/aligned_buffer.h" namespace ROCKSDB_NAMESPACE { class Statistics; class HistogramImpl; +class SystemClock; + +using AlignedBuf = std::unique_ptr; + +// Align the request r according to alignment and return the aligned result. +FSReadRequest Align(const FSReadRequest& r, size_t alignment); -// RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. It is +// Try to merge src to dest if they have overlap. +// +// Each request represents an inclusive interval [offset, offset + len]. +// If the intervals have overlap, update offset and len to represent the +// merged interval, and return true. +// Otherwise, do nothing and return false. +bool TryMerge(FSReadRequest* dest, const FSReadRequest& src); + +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is // responsible for: // - Handling Buffered and Direct reads appropriately. // - Rate limiting compaction reads. 
@@ -31,47 +47,69 @@ class RandomAccessFileReader { private: #ifndef ROCKSDB_LITE - void NotifyOnFileReadFinish(uint64_t offset, size_t length, - const FileOperationInfo::TimePoint& start_ts, - const FileOperationInfo::TimePoint& finish_ts, - const Status& status) const { - FileOperationInfo info(file_name_, start_ts, finish_ts); + void NotifyOnFileReadFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts, + finish_ts, status); info.offset = offset; info.length = length; - info.status = status; for (auto& listener : listeners_) { listener->OnFileReadFinish(info); } + info.status.PermitUncheckedError(); } + + void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation, + const std::string& file_path, size_t length, + uint64_t offset) const { + if (listeners_.empty()) { + return; + } + IOErrorInfo io_error_info(io_status, operation, file_path, length, offset); + + for (auto& listener : listeners_) { + listener->OnIOError(io_error_info); + } + io_status.PermitUncheckedError(); + } + #endif // ROCKSDB_LITE bool ShouldNotifyListeners() const { return !listeners_.empty(); } - std::unique_ptr file_; + FSRandomAccessFilePtr file_; std::string file_name_; - Env* env_; + SystemClock* clock_; Statistics* stats_; uint32_t hist_type_; HistogramImpl* file_read_hist_; RateLimiter* rate_limiter_; std::vector> listeners_; + Temperature file_temperature_; public: explicit RandomAccessFileReader( - std::unique_ptr&& raf, std::string _file_name, - Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, + std::unique_ptr&& raf, const std::string& _file_name, + SystemClock* clock = nullptr, + const std::shared_ptr& io_tracer = nullptr, + Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = 
nullptr, - const std::vector>& listeners = {}) - : file_(std::move(raf)), + const std::vector>& listeners = {}, + Temperature file_temperature = Temperature::kUnknown) + : file_(std::move(raf), io_tracer, _file_name), file_name_(std::move(_file_name)), - env_(env), + clock_(clock), stats_(stats), hist_type_(hist_type), file_read_hist_(file_read_hist), rate_limiter_(rate_limiter), - listeners_() { + listeners_(), + file_temperature_(file_temperature) { #ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), [this](const std::shared_ptr& e) { @@ -84,37 +122,45 @@ #endif } - RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT { - *this = std::move(o); - } - - RandomAccessFileReader& operator=(RandomAccessFileReader&& o) - ROCKSDB_NOEXCEPT { - file_ = std::move(o.file_); - env_ = std::move(o.env_); - stats_ = std::move(o.stats_); - hist_type_ = std::move(o.hist_type_); - file_read_hist_ = std::move(o.file_read_hist_); - rate_limiter_ = std::move(o.rate_limiter_); - return *this; - } - + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); RandomAccessFileReader(const RandomAccessFileReader&) = delete; RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, - bool for_compaction = false) const; + // In non-direct IO mode, + // 1. if using mmap, result is stored in a buffer other than scratch; + // 2. if not using mmap, result is stored in the buffer starting from scratch. + // + // In direct IO mode, an aligned buffer is allocated internally. + // 1. If aligned_buf is null, then results are copied to the buffer + // starting from scratch; + // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns + // the internally allocated buffer on return, and the result refers to a + // region in aligned_buf. 
+ IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, + char* scratch, AlignedBuf* aligned_buf, + bool for_compaction = false) const; + + // REQUIRES: + // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing. + // In non-direct IO mode, aligned_buf should be null; + // In direct IO mode, aligned_buf stores the aligned buffer allocated inside + // MultiRead, the result Slices in reqs refer to aligned_buf. + IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs, + size_t num_reqs, AlignedBuf* aligned_buf) const; - Status MultiRead(FSReadRequest* reqs, size_t num_reqs) const; - - Status Prefetch(uint64_t offset, size_t n) const { + IOStatus Prefetch(uint64_t offset, size_t n) const { return file_->Prefetch(offset, n, IOOptions(), nullptr); } FSRandomAccessFile* file() { return file_.get(); } - std::string file_name() const { return file_name_; } + const std::string& file_name() const { return file_name_; } bool use_direct_io() const { return file_->use_direct_io(); } + + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/random_access_file_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,483 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "file/random_access_file_reader.h" + +#include + +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/file_system.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class RandomAccessFileReaderTest : public testing::Test { + public: + void SetUp() override { + SetupSyncPointsToMockDirectIO(); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + test_dir_ = test::PerThreadDBPath("random_access_file_reader_test"); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + } + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + void Write(const std::string& fname, const std::string& content) { + std::unique_ptr f; + ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void Read(const std::string& fname, const FileOptions& opts, + std::unique_ptr* reader) { + std::string fpath = Path(fname); + std::unique_ptr f; + ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), fpath, + env_->GetSystemClock().get())); + } + + void AssertResult(const std::string& content, + const std::vector& reqs) { + for (const auto& r : reqs) { + ASSERT_OK(r.status); + ASSERT_EQ(r.len, r.result.size()); + ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString()); + } + } + + private: + Env* env_; + std::shared_ptr fs_; + std::string test_dir_; + + std::string Path(const std::string& fname) { + return test_dir_ + "/" + fname; + } +}; + +// Skip the following tests in lite mode since direct I/O is unsupported. 
+#ifndef ROCKSDB_LITE + +TEST_F(RandomAccessFileReaderTest, ReadDirectIO) { + std::string fname = "read-direct-io"; + Random rand(0); + std::string content = rand.RandomString(kDefaultPageSize); + Write(fname, content); + + FileOptions opts; + opts.use_direct_reads = true; + std::unique_ptr r; + Read(fname, opts, &r); + ASSERT_TRUE(r->use_direct_io()); + + const size_t page_size = r->file()->GetRequiredBufferAlignment(); + size_t offset = page_size / 2; + size_t len = page_size / 3; + Slice result; + AlignedBuf buf; + for (bool for_compaction : {true, false}) { + ASSERT_OK(r->Read(IOOptions(), offset, len, &result, nullptr, &buf, + for_compaction)); + ASSERT_EQ(result.ToString(), content.substr(offset, len)); + } +} + +TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { + std::vector aligned_reqs; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* reqs) { + // Copy reqs, since it's allocated on stack inside MultiRead, which will + // be deallocated after MultiRead returns. + aligned_reqs = *reinterpret_cast*>(reqs); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Creates a file with 3 pages. + std::string fname = "multi-read-direct-io"; + Random rand(0); + std::string content = rand.RandomString(3 * kDefaultPageSize); + Write(fname, content); + + FileOptions opts; + opts.use_direct_reads = true; + std::unique_ptr r; + Read(fname, opts, &r); + ASSERT_TRUE(r->use_direct_io()); + + const size_t page_size = r->file()->GetRequiredBufferAlignment(); + + { + // Reads 2 blocks in the 1st page. + // The results should be SharedSlices of the same underlying buffer. 
+ // + // Illustration (each x is a 1/4 page) + // First page: xxxx + // 1st block: x + // 2nd block: xx + FSReadRequest r0; + r0.offset = 0; + r0.len = page_size / 4; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = page_size / 2; + r1.len = page_size / 2; + r1.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the first page internally. + ASSERT_EQ(aligned_reqs.size(), 1); + const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 0); + ASSERT_EQ(aligned_r.len, page_size); + } + + { + // Reads 3 blocks: + // 1st block in the 1st page; + // 2nd block from the middle of the 1st page to the middle of the 2nd page; + // 3rd block in the 2nd page. + // The results should be SharedSlices of the same underlying buffer. + // + // Illustration (each x is a 1/4 page) + // 2 pages: xxxxxxxx + // 1st block: x + // 2nd block: xxxx + // 3rd block: x + FSReadRequest r0; + r0.offset = 0; + r0.len = page_size / 4; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = page_size / 2; + r1.len = page_size; + r1.scratch = nullptr; + + FSReadRequest r2; + r2.offset = 2 * page_size - page_size / 4; + r2.len = page_size / 4; + r2.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + reqs.push_back(std::move(r2)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the first two pages in one request internally. 
+ ASSERT_EQ(aligned_reqs.size(), 1); + const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 0); + ASSERT_EQ(aligned_r.len, 2 * page_size); + } + + { + // Reads 3 blocks: + // 1st block in the middle of the 1st page; + // 2nd block in the middle of the 2nd page; + // 3rd block in the middle of the 3rd page. + // The results should be SharedSlices of the same underlying buffer. + // + // Illustration (each x is a 1/4 page) + // 3 pages: xxxxxxxxxxxx + // 1st block: xx + // 2nd block: xx + // 3rd block: xx + FSReadRequest r0; + r0.offset = page_size / 4; + r0.len = page_size / 2; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = page_size + page_size / 4; + r1.len = page_size / 2; + r1.scratch = nullptr; + + FSReadRequest r2; + r2.offset = 2 * page_size + page_size / 4; + r2.len = page_size / 2; + r2.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + reqs.push_back(std::move(r2)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the first 3 pages in one request internally. + ASSERT_EQ(aligned_reqs.size(), 1); + const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 0); + ASSERT_EQ(aligned_r.len, 3 * page_size); + } + + { + // Reads 2 blocks: + // 1st block in the middle of the 1st page; + // 2nd block in the middle of the 3rd page. + // The results are two different buffers. 
+ // + // Illustration (each x is a 1/4 page) + // 3 pages: xxxxxxxxxxxx + // 1st block: xx + // 2nd block: xx + FSReadRequest r0; + r0.offset = page_size / 4; + r0.len = page_size / 2; + r0.scratch = nullptr; + + FSReadRequest r1; + r1.offset = 2 * page_size + page_size / 4; + r1.len = page_size / 2; + r1.scratch = nullptr; + + std::vector reqs; + reqs.push_back(std::move(r0)); + reqs.push_back(std::move(r1)); + AlignedBuf aligned_buf; + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); + + AssertResult(content, reqs); + + // Reads the 1st and 3rd pages in two requests internally. + ASSERT_EQ(aligned_reqs.size(), 2); + const FSReadRequest& aligned_r0 = aligned_reqs[0]; + const FSReadRequest& aligned_r1 = aligned_reqs[1]; + ASSERT_OK(aligned_r0.status); + ASSERT_EQ(aligned_r0.offset, 0); + ASSERT_EQ(aligned_r0.len, page_size); + ASSERT_OK(aligned_r1.status); + ASSERT_EQ(aligned_r1.offset, 2 * page_size); + ASSERT_EQ(aligned_r1.len, page_size); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +#endif // ROCKSDB_LITE + +TEST(FSReadRequest, Align) { + FSReadRequest r; + r.offset = 2000; + r.len = 2000; + r.scratch = nullptr; + ASSERT_OK(r.status); + + FSReadRequest aligned_r = Align(r, 1024); + ASSERT_OK(r.status); + ASSERT_OK(aligned_r.status); + ASSERT_EQ(aligned_r.offset, 1024); + ASSERT_EQ(aligned_r.len, 3072); +} + +TEST(FSReadRequest, TryMerge) { + // reverse means merging dest into src. 
+ for (bool reverse : {true, false}) { + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 15; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_FALSE(TryMerge(&dest, src)); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 10; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 20); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 5; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 15); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 5; + src.len = 5; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) { + std::swap(dest, src); + } + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 5; + src.len = 1; + src.scratch = nullptr; + 
ASSERT_OK(src.status); + + if (reverse) std::swap(dest, src); + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 0; + src.len = 10; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) std::swap(dest, src); + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + + { + // dest: [ ] + // src: [ ] + FSReadRequest dest; + dest.offset = 0; + dest.len = 10; + dest.scratch = nullptr; + ASSERT_OK(dest.status); + + FSReadRequest src; + src.offset = 0; + src.len = 5; + src.scratch = nullptr; + ASSERT_OK(src.status); + + if (reverse) std::swap(dest, src); + ASSERT_TRUE(TryMerge(&dest, src)); + ASSERT_EQ(dest.offset, 0); + ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); + } + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,48 +17,13 @@ IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, std::unique_ptr* result, const FileOptions& options) { + TEST_SYNC_POINT_CALLBACK("NewWritableFile::FileOptions.temperature", + const_cast(&options.temperature)); IOStatus s = fs->NewWritableFile(fname, options, result, nullptr); - TEST_KILL_RANDOM("NewWritableFile:0", 
rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2); return s; } -bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result) { - const int kBufferSize = 8192; - char buffer[kBufferSize + 1]; - Slice input_slice; - - std::string line; - bool has_complete_line = false; - while (!has_complete_line) { - if (std::getline(*iss, line)) { - has_complete_line = !iss->eof(); - } else { - has_complete_line = false; - } - if (!has_complete_line) { - // if we're not sure whether we have a complete line, - // further read from the file. - if (*has_data) { - *result = seq_file_reader->Read(kBufferSize, &input_slice, buffer); - } - if (input_slice.size() == 0) { - // meaning we have read all the data - *has_data = false; - break; - } else { - iss->str(line + input_slice.ToString()); - // reset the internal state of iss so that we can keep reading it. - iss->clear(); - *has_data = (input_slice.size() == kBufferSize); - continue; - } - } - } - *output = line; - return *has_data || has_complete_line; -} - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size) { return off % sector_size == 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/read_write_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/read_write_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,10 +24,6 @@ std::unique_ptr* result, const FileOptions& options); -// Read a single line from a file. 
-bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result); - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size); #endif // NDEBUG diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_file_info.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_file_info.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_file_info.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_file_info.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// struct ReadaheadFileInfo contains readahead information that is passed from +// one file to another file per level during iterations. This information helps +// iterators to carry forward the internal automatic prefetching readahead value +// to next file during sequential reads instead of starting from the scratch. + +struct ReadaheadFileInfo { + struct ReadaheadInfo { + size_t readahead_size = 0; + int64_t num_file_reads = 0; + }; + + // Used by Data block iterators to update readahead info. + ReadaheadInfo data_block_readahead_info; + + // Used by Index block iterators to update readahead info. 
+ ReadaheadInfo index_block_readahead_info; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,15 +11,17 @@ #include #include + #include "file/read_write_util.h" +#include "rocksdb/file_system.h" #include "util/aligned_buffer.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { namespace { -class ReadaheadRandomAccessFile : public RandomAccessFile { +class ReadaheadRandomAccessFile : public FSRandomAccessFile { public: - ReadaheadRandomAccessFile(std::unique_ptr&& file, + ReadaheadRandomAccessFile(std::unique_ptr&& file, size_t readahead_size) : file_(std::move(file)), alignment_(file_->GetRequiredBufferAlignment()), @@ -35,11 +37,12 @@ ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete; - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { // Read-ahead only make sense if we have some slack left after reading if (n + alignment_ >= readahead_size_) { - return file_->Read(offset, n, result, scratch); + return file_->Read(offset, n, options, result, scratch, dbg); } std::unique_lock lk(lock_); @@ -53,14 +56,14 @@ (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { // We read exactly what we needed, or we hit end of file - return. 
*result = Slice(scratch, cached_len); - return Status::OK(); + return IOStatus::OK(); } size_t advanced_offset = static_cast(offset + cached_len); // In the case of cache hit advanced_offset is already aligned, means that // chunk_offset equals to advanced_offset size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); - Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + IOStatus s = ReadIntoBuffer(chunk_offset, readahead_size_, options, dbg); if (s.ok()) { // The data we need is now in cache, so we can safely read it size_t remaining_len; @@ -71,11 +74,12 @@ return s; } - Status Prefetch(uint64_t offset, size_t n) override { + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { if (n < readahead_size_) { // Don't allow smaller prefetches than the configured `readahead_size_`. // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. - return Status::OK(); + return IOStatus::OK(); } std::unique_lock lk(lock_); @@ -83,10 +87,11 @@ size_t offset_ = static_cast(offset); size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); if (prefetch_offset == buffer_offset_) { - return Status::OK(); + return IOStatus::OK(); } return ReadIntoBuffer(prefetch_offset, - Roundup(offset_ + n, alignment_) - prefetch_offset); + Roundup(offset_ + n, alignment_) - prefetch_offset, + options, dbg); } size_t GetUniqueId(char* id, size_t max_size) const override { @@ -95,7 +100,7 @@ void Hint(AccessPattern pattern) override { file_->Hint(pattern); } - Status InvalidateCache(size_t offset, size_t length) override { + IOStatus InvalidateCache(size_t offset, size_t length) override { std::unique_lock lk(lock_); buffer_.Clear(); return file_->InvalidateCache(offset, length); @@ -125,14 +130,16 @@ // Reads into buffer_ the next n bytes from file_ starting at offset. // Can actually read less if EOF was reached. // Returns the status of the read operastion on the file. 
- Status ReadIntoBuffer(uint64_t offset, size_t n) const { + IOStatus ReadIntoBuffer(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) const { if (n > buffer_.Capacity()) { n = buffer_.Capacity(); } assert(IsFileSectorAligned(offset, alignment_)); assert(IsFileSectorAligned(n, alignment_)); Slice result; - Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + IOStatus s = + file_->Read(offset, n, options, &result, buffer_.BufferStart(), dbg); if (s.ok()) { buffer_offset_ = offset; buffer_.Size(result.size()); @@ -141,7 +148,7 @@ return s; } - const std::unique_ptr file_; + const std::unique_ptr file_; const size_t alignment_; const size_t readahead_size_; @@ -153,9 +160,9 @@ }; } // namespace -std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size) { - std::unique_ptr result( +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( new ReadaheadRandomAccessFile(std::move(file), readahead_size)); return result; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/readahead_raf.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/readahead_raf.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include -#include "rocksdb/env.h" +#include + +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { +class FSRandomAccessFile; // This file provides the following main abstractions: // SequentialFileReader : wrapper over Env::SequentialFile // RandomAccessFileReader : wrapper over Env::RandomAccessFile @@ -22,6 +24,6 @@ // NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to // always prefetch additional data with every read. 
This is mainly used in // Compaction Table Readers. -std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size); +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -22,8 +22,20 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { - Status s; +IOStatus SequentialFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new SequentialFileReader(std::move(file), fname)); + } + return io_s; +} + +IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + IOStatus io_s; if (use_direct_io()) { #ifndef ROCKSDB_LITE size_t offset = offset_.fetch_add(n); @@ -35,28 +47,64 @@ AlignedBuffer buf; buf.Alignment(alignment); buf.AllocateNewBuffer(size); + Slice tmp; - s = file_->PositionedRead(aligned_offset, size, IOOptions(), &tmp, - buf.BufferStart(), nullptr); - if (s.ok() && offset_advance < tmp.size()) { + uint64_t orig_offset = 0; + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + orig_offset = aligned_offset + buf.CurrentSize(); + start_ts = FileOperationInfo::StartNow(); + } + io_s = file_->PositionedRead(aligned_offset, size, IOOptions(), &tmp, + buf.BufferStart(), nullptr); + if (io_s.ok() && offset_advance < tmp.size()) { 
buf.Size(tmp.size()); r = buf.Read(scratch, offset_advance, std::min(tmp.size() - offset_advance, n)); } *result = Slice(scratch, r); + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + io_s); + } #endif // !ROCKSDB_LITE } else { - s = file_->Read(n, IOOptions(), result, scratch, nullptr); + // To be paranoid, modify scratch a little bit, so in case underlying + // FileSystem doesn't fill the buffer but return succee and `scratch` + // returns contains a previous block, returned value will not pass + // checksum. + // It's hard to find useful byte for direct I/O case, so we skip it. + if (n > 0 && scratch != nullptr) { + scratch[0]++; + } + +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + + io_s = file_->Read(n, IOOptions(), result, scratch, nullptr); + +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + size_t offset = offset_.fetch_add(result->size()); + NotifyOnFileReadFinish(offset, result->size(), start_ts, finish_ts, io_s); + } +#endif } IOSTATS_ADD(bytes_read, result->size()); - return s; + return io_s; } -Status SequentialFileReader::Skip(uint64_t n) { +IOStatus SequentialFileReader::Skip(uint64_t n) { #ifndef ROCKSDB_LITE if (use_direct_io()) { offset_ += static_cast(n); - return Status::OK(); + return IOStatus::OK(); } #endif // !ROCKSDB_LITE return file_->Skip(n); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sequence_file_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sequence_file_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,8 @@ #pragma once #include #include + +#include 
"env/file_system_tracer.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" @@ -21,36 +23,82 @@ // cache disabled) reads appropriately, and also updates the IO stats. class SequentialFileReader { private: - std::unique_ptr file_; +#ifndef ROCKSDB_LITE + void NotifyOnFileReadFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(FileOperationType::kRead, file_name_, start_ts, + finish_ts, status); + info.offset = offset; + info.length = length; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + info.status.PermitUncheckedError(); + } + + void AddFileIOListeners( + const std::vector>& listeners) { + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + std::string file_name_; + FSSequentialFilePtr file_; std::atomic offset_{0}; // read offset + std::vector> listeners_{}; public: - explicit SequentialFileReader(std::unique_ptr&& _file, - const std::string& _file_name) - : file_(std::move(_file)), file_name_(_file_name) {} - - explicit SequentialFileReader(std::unique_ptr&& _file, - const std::string& _file_name, - size_t _readahead_size) - : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)), - file_name_(_file_name) {} - - SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { - *this = std::move(o); + explicit SequentialFileReader( + std::unique_ptr&& _file, const std::string& _file_name, + const std::shared_ptr& io_tracer = nullptr, + const std::vector>& listeners = {}) + : file_name_(_file_name), + file_(std::move(_file), io_tracer, _file_name), + listeners_() { +#ifndef ROCKSDB_LITE + AddFileIOListeners(listeners); 
+#else + (void)listeners; +#endif } - SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { - file_ = std::move(o.file_); - return *this; + explicit SequentialFileReader( + std::unique_ptr&& _file, const std::string& _file_name, + size_t _readahead_size, + const std::shared_ptr& io_tracer = nullptr, + const std::vector>& listeners = {}) + : file_name_(_file_name), + file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size), + io_tracer, _file_name), + listeners_() { +#ifndef ROCKSDB_LITE + AddFileIOListeners(listeners); +#else + (void)listeners; +#endif } + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); SequentialFileReader(const SequentialFileReader&) = delete; SequentialFileReader& operator=(const SequentialFileReader&) = delete; - Status Read(size_t n, Slice* result, char* scratch); + IOStatus Read(size_t n, Slice* result, char* scratch); - Status Skip(uint64_t n); + IOStatus Skip(uint64_t n); FSSequentialFile* file() { return file_.get(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,7 @@ #include #include "db/db_impl/db_impl.h" -#include "env/composite_env_wrapper.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" @@ -19,21 +19,21 @@ namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE -SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, - int64_t rate_bytes_per_sec, - double max_trash_db_ratio, - uint64_t bytes_max_delete_chunk) - : env_(env), 
+SstFileManagerImpl::SstFileManagerImpl( + const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, + double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) + : clock_(clock), fs_(fs), logger_(logger), total_files_size_(0), - in_progress_files_size_(0), compaction_buffer_size_(0), cur_compactions_reserved_size_(0), max_allowed_space_(0), - delete_scheduler_(env, fs_.get(), rate_bytes_per_sec, logger.get(), this, - max_trash_db_ratio, bytes_max_delete_chunk), + delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec, + logger.get(), this, max_trash_db_ratio, + bytes_max_delete_chunk), cv_(&mu_), closing_(false), bg_thread_(nullptr), @@ -43,6 +43,7 @@ SstFileManagerImpl::~SstFileManagerImpl() { Close(); + bg_err_.PermitUncheckedError(); } void SstFileManagerImpl::Close() { @@ -59,23 +60,24 @@ } } -Status SstFileManagerImpl::OnAddFile(const std::string& file_path, - bool compaction) { +Status SstFileManagerImpl::OnAddFile(const std::string& file_path) { uint64_t file_size; Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr); if (s.ok()) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); + OnAddFileImpl(file_path, file_size); } - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return s; } Status SstFileManagerImpl::OnAddFile(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + OnAddFileImpl(file_path, file_size); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return Status::OK(); } @@ -84,7 +86,8 @@ MutexLock l(&mu_); OnDeleteFileImpl(file_path); } - TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile", + 
const_cast(&file_path)); return Status::OK(); } @@ -98,19 +101,6 @@ } } cur_compactions_reserved_size_ -= size_added_by_compaction; - - auto new_files = c->edit()->GetNewFiles(); - for (auto& new_file : new_files) { - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - new_file.second.fd.GetNumber(), - new_file.second.fd.GetPathId()); - if (in_progress_files_.find(fn) != in_progress_files_.end()) { - auto tracked_file = tracked_files_.find(fn); - assert(tracked_file != tracked_files_.end()); - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(fn); - } - } } Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, @@ -121,7 +111,7 @@ if (file_size != nullptr) { *file_size = tracked_files_[old_path]; } - OnAddFileImpl(new_path, tracked_files_[old_path], false); + OnAddFileImpl(new_path, tracked_files_[old_path]); OnDeleteFileImpl(old_path); } TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile"); @@ -158,7 +148,7 @@ bool SstFileManagerImpl::EnoughRoomForCompaction( ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error) { + const Status& bg_error) { MutexLock l(&mu_); uint64_t size_added_by_compaction = 0; // First check if we even have the space to do the compaction @@ -183,12 +173,13 @@ // seen a NoSpace() error. 
This is tin order to contain a single potentially // misbehaving DB instance and prevent it from slowing down compactions of // other DB instances - if (CheckFreeSpace() && bg_error == Status::NoSpace()) { + if (bg_error.IsNoSpace() && CheckFreeSpace()) { auto fn = TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(), inputs[0][0]->fd.GetPathId()); uint64_t free_space = 0; - fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr); + Status s = fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr); + s.PermitUncheckedError(); // TODO: Check the status // needed_headroom is based on current size reserved by compactions, // minus any files created by running compactions as they would count // against the reserved size. If user didn't specify any compaction @@ -197,7 +188,6 @@ if (compaction_buffer_size_ == 0) { needed_headroom += reserved_disk_buffer_; } - needed_headroom -= in_progress_files_size_; if (free_space < needed_headroom + size_added_by_compaction) { // We hit the condition of not enough disk space ROCKS_LOG_ERROR(logger_, @@ -328,7 +318,7 @@ // error is also a NoSpace() non-fatal error, leave the instance in // the list Status err = cur_instance_->GetBGError(); - if (s.ok() && err == Status::NoSpace() && + if (s.ok() && err.subcode() == IOStatus::SubCode::kNoSpace && err.severity() < Status::Severity::kFatalError) { s = err; } @@ -346,7 +336,7 @@ if (!error_handler_list_.empty()) { // If there are more instances to be recovered, reschedule after 5 // seconds - int64_t wait_until = env_->NowMicros() + 5000000; + int64_t wait_until = clock_->NowMicros() + 5000000; cv_.TimedWait(wait_until); } @@ -438,24 +428,15 @@ } void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { auto tracked_file = tracked_files_.find(file_path); if (tracked_file != tracked_files_.end()) { // File was added before, we will just update the size - assert(!compaction); total_files_size_ 
-= tracked_file->second; total_files_size_ += file_size; cur_compactions_reserved_size_ -= file_size; } else { total_files_size_ += file_size; - if (compaction) { - // Keep track of the size of files created by in-progress compactions. - // When calculating whether there's enough headroom for new compactions, - // this will be subtracted from cur_compactions_reserved_size_. - // Otherwise, compactions will be double counted. - in_progress_files_size_ += file_size; - in_progress_files_.insert(file_path); - } } tracked_files_[file_path] = file_size; } @@ -464,16 +445,10 @@ auto tracked_file = tracked_files_.find(file_path); if (tracked_file == tracked_files_.end()) { // File is not tracked - assert(in_progress_files_.find(file_path) == in_progress_files_.end()); return; } total_files_size_ -= tracked_file->second; - // Check if it belonged to an in-progress compaction - if (in_progress_files_.find(file_path) != in_progress_files_.end()) { - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(file_path); - } tracked_files_.erase(tracked_file); } @@ -483,14 +458,7 @@ bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { - std::shared_ptr fs; - - if (env == Env::Default()) { - fs = FileSystem::Default(); - } else { - fs.reset(new LegacyFileSystemWrapper(env)); - } - + const auto& fs = env->GetFileSystem(); return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec, delete_existing_trash, status, max_trash_db_ratio, bytes_max_delete_chunk); @@ -503,22 +471,19 @@ bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { + const auto& clock = env->GetSystemClock(); SstFileManagerImpl* res = - new SstFileManagerImpl(env, fs, info_log, rate_bytes_per_sec, + new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec, max_trash_db_ratio, bytes_max_delete_chunk); // trash_dir is deprecated and not needed anymore, but if user passed it // 
we will still remove files in it. - Status s; + Status s = Status::OK(); if (delete_existing_trash && trash_dir != "") { std::vector files_in_trash; s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr); if (s.ok()) { for (const std::string& trash_file : files_in_trash) { - if (trash_file == "." || trash_file == "..") { - continue; - } - std::string path_in_trash = trash_dir + "/" + trash_file; res->OnAddFile(path_in_trash); Status file_delete = @@ -532,6 +497,9 @@ if (status) { *status = s; + } else { + // No one passed us a Status, so they must not care about the error... + s.PermitUncheckedError(); } return res; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/sst_file_manager_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,47 +12,45 @@ #include "port/port.h" #include "db/compaction/compaction.h" -#include "db/error_handler.h" #include "file/delete_scheduler.h" -#include "rocksdb/file_system.h" #include "rocksdb/sst_file_manager.h" namespace ROCKSDB_NAMESPACE { - -class Env; +class ErrorHandler; +class FileSystem; +class SystemClock; class Logger; -// SstFileManager is used to track SST files in the DB and control there -// deletion rate. -// All SstFileManager public functions are thread-safe. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. 
class SstFileManagerImpl : public SstFileManager { public: - explicit SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, + explicit SstFileManagerImpl(const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~SstFileManagerImpl(); - // DB will call OnAddFile whenever a new sst file is added. - Status OnAddFile(const std::string& file_path, bool compaction = false); + // DB will call OnAddFile whenever a new sst/blob file is added. + Status OnAddFile(const std::string& file_path); // Overload where size of the file is provided by the caller rather than // queried from the filesystem. This is an optimization. - Status OnAddFile(const std::string& file_path, uint64_t file_size, - bool compaction); + Status OnAddFile(const std::string& file_path, uint64_t file_size); - // DB will call OnDeleteFile whenever an sst file is deleted. + // DB will call OnDeleteFile whenever a sst/blob file is deleted. Status OnDeleteFile(const std::string& file_path); - // DB will call OnMoveFile whenever an sst file is move to a new path. + // DB will call OnMoveFile whenever a sst/blob file is move to a new path. Status OnMoveFile(const std::string& old_path, const std::string& new_path, uint64_t* file_size = nullptr); // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature, maximum allowed // space will be infinite (Default value). @@ -62,8 +60,8 @@ void SetCompactionBufferSize(uint64_t compaction_buffer_size) override; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. 
+ // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. bool IsMaxAllowedSpaceReached() override; @@ -77,7 +75,7 @@ // the full compaction size). bool EnoughRoomForCompaction(ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error); + const Status& bg_error); // Bookkeeping so total_file_sizes_ goes back to normal after compaction // finishes @@ -135,10 +133,14 @@ // once in the object's lifetime, and before the destructor void Close(); + void SetStatisticsPtr(const std::shared_ptr& stats) override { + stats_ = stats; + delete_scheduler_.SetStatisticsPtr(stats); + } + private: // REQUIRES: mutex locked - void OnAddFileImpl(const std::string& file_path, uint64_t file_size, - bool compaction); + void OnAddFileImpl(const std::string& file_path, uint64_t file_size); // REQUIRES: mutex locked void OnDeleteFileImpl(const std::string& file_path); @@ -147,15 +149,13 @@ return bg_err_.severity() == Status::Severity::kSoftError; } - Env* env_; + std::shared_ptr clock_; std::shared_ptr fs_; std::shared_ptr logger_; // Mutex to protect tracked_files_, total_files_size_ port::Mutex mu_; // The summation of the sizes of all files in tracked_files_ map uint64_t total_files_size_; - // The summation of all output files of in-progress compactions - uint64_t in_progress_files_size_; // Compactions should only execute if they can leave at least // this amount of buffer space for logs and flushes uint64_t compaction_buffer_size_; @@ -164,9 +164,7 @@ // A map containing all tracked files and there sizes // file_path => file_size std::unordered_map tracked_files_; - // A set of files belonging to in-progress compactions - std::unordered_set in_progress_files_; - // The maximum allowed space (in bytes) for sst files. + // The maximum allowed space (in bytes) for sst and blob files. uint64_t max_allowed_space_; // DeleteScheduler used to throttle file deletition. 
DeleteScheduler delete_scheduler_; @@ -186,10 +184,11 @@ // compactions to run full throttle. If disk space is below this trigger, // compactions will be gated by free disk space > input size uint64_t free_space_trigger_; - // List of database error handler instances tracked by this sst file manager + // List of database error handler instances tracked by this SstFileManager. std::list error_handler_list_; // Pointer to ErrorHandler instance that is currently processing recovery ErrorHandler* cur_instance_; + std::shared_ptr stats_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,19 +16,37 @@ #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" +#include "util/crc32c.h" #include "util/random.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { -Status WritableFileWriter::Append(const Slice& data) { +IOStatus WritableFileWriter::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* writer, + IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewWritableFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + writer->reset(new WritableFileWriter(std::move(file), fname, file_opts)); + } + return io_s; +} + +IOStatus WritableFileWriter::Append(const Slice& data, + uint32_t crc32c_checksum) { const char* src = data.data(); size_t left = data.size(); - Status s; + IOStatus s; pending_sync_ = true; - TEST_KILL_RANDOM("WritableFileWriter::Append:0", - rocksdb_kill_odds * REDUCE_ODDS2); + 
TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2); + + // Calculate the checksum of appended data + UpdateFileChecksum(data); { IOSTATS_TIMER_GUARD(prepare_write_nanos); @@ -64,40 +82,88 @@ assert(buf_.CurrentSize() == 0); } - // We never write directly to disk with direct I/O on. - // or we simply use it for its original purpose to accumulate many small - // chunks - if (use_direct_io() || (buf_.Capacity() >= left)) { - while (left > 0) { - size_t appended = buf_.Append(src, left); - left -= appended; - src += appended; - - if (left > 0) { - s = Flush(); - if (!s.ok()) { - break; + if (perform_data_verification_ && buffered_data_with_checksum_ && + crc32c_checksum != 0) { + // Since we want to use the checksum of the input data, we cannot break it + // into several pieces. We will only write them in the buffer when buffer + // size is enough. Otherwise, we will directly write it down. + if (use_direct_io() || (buf_.Capacity() - buf_.CurrentSize()) >= left) { + if ((buf_.Capacity() - buf_.CurrentSize()) >= left) { + size_t appended = buf_.Append(src, left); + if (appended != left) { + s = IOStatus::Corruption("Write buffer append failure"); + } + buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine( + buffered_data_crc32c_checksum_, crc32c_checksum, appended); + } else { + while (left > 0) { + size_t appended = buf_.Append(src, left); + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, src, appended); + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(); + if (!s.ok()) { + break; + } + } } } + } else { + assert(buf_.CurrentSize() == 0); + buffered_data_crc32c_checksum_ = crc32c_checksum; + s = WriteBufferedWithChecksum(src, left); } } else { - // Writing directly to file bypassing the buffer - assert(buf_.CurrentSize() == 0); - s = WriteBuffered(src, left); + // In this case, either we do not need to do the data verification or + // caller does not provide the checksum of the data 
(crc32c_checksum = 0). + // + // We never write directly to disk with direct I/O on. + // or we simply use it for its original purpose to accumulate many small + // chunks + if (use_direct_io() || (buf_.Capacity() >= left)) { + while (left > 0) { + size_t appended = buf_.Append(src, left); + if (perform_data_verification_ && buffered_data_with_checksum_) { + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, src, appended); + } + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(); + if (!s.ok()) { + break; + } + } + } + } else { + // Writing directly to file bypassing the buffer + assert(buf_.CurrentSize() == 0); + if (perform_data_verification_ && buffered_data_with_checksum_) { + buffered_data_crc32c_checksum_ = crc32c::Value(src, left); + s = WriteBufferedWithChecksum(src, left); + } else { + s = WriteBuffered(src, left); + } + } } - TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Append:1"); if (s.ok()) { filesize_ += data.size(); - CalculateFileChecksum(data); } return s; } -Status WritableFileWriter::Pad(const size_t pad_bytes) { +IOStatus WritableFileWriter::Pad(const size_t pad_bytes) { assert(pad_bytes < kDefaultPageSize); size_t left = pad_bytes; size_t cap = buf_.Capacity() - buf_.CurrentSize(); + size_t pad_start = buf_.CurrentSize(); // Assume pad_bytes is small compared to buf_ capacity. 
So we always // use buf_ rather than write directly to file in certain cases like @@ -107,7 +173,7 @@ buf_.PadWith(append_bytes, 0); left -= append_bytes; if (left > 0) { - Status s = Flush(); + IOStatus s = Flush(); if (!s.ok()) { return s; } @@ -116,71 +182,158 @@ } pending_sync_ = true; filesize_ += pad_bytes; - return Status::OK(); + if (perform_data_verification_) { + buffered_data_crc32c_checksum_ = + crc32c::Extend(buffered_data_crc32c_checksum_, + buf_.BufferStart() + pad_start, pad_bytes); + } + return IOStatus::OK(); } -Status WritableFileWriter::Close() { +IOStatus WritableFileWriter::Close() { // Do not quit immediately on failure the file MUST be closed - Status s; + IOStatus s; // Possible to close it twice now as we MUST close // in __dtor, simply flushing is not enough // Windows when pre-allocating does not fill with zeros // also with unbuffered access we also set the end of data. - if (!writable_file_) { + if (writable_file_.get() == nullptr) { return s; } s = Flush(); // flush cache to OS - Status interim; + IOStatus interim; // In direct I/O mode we write whole pages so // we need to let the file know where data ends. 
if (use_direct_io()) { - interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileTruncateFinish(start_ts, finish_ts, s); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kTruncate, file_name(), + filesize_); + } + } +#endif + } if (interim.ok()) { - interim = writable_file_->Fsync(IOOptions(), nullptr); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + interim = writable_file_->Fsync(IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileSyncFinish(start_ts, finish_ts, s, + FileOperationType::kFsync); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kFsync, file_name()); + } + } +#endif + } } if (!interim.ok() && s.ok()) { s = interim; } } - TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); - interim = writable_file_->Close(IOOptions(), nullptr); + TEST_KILL_RANDOM("WritableFileWriter::Close:0"); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + interim = writable_file_->Close(IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = FileOperationInfo::FinishNow(); + NotifyOnFileCloseFinish(start_ts, finish_ts, s); + if (!interim.ok()) { + NotifyOnIOError(interim, FileOperationType::kClose, file_name()); + } + } +#endif + } if (!interim.ok() && s.ok()) { s = interim; } writable_file_.reset(); - 
TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Close:1"); + + if (s.ok() && checksum_generator_ != nullptr && !checksum_finalized_) { + checksum_generator_->Finalize(); + checksum_finalized_ = true; + } return s; } // write out the cached data to the OS cache or storage if direct I/O // enabled -Status WritableFileWriter::Flush() { - Status s; - TEST_KILL_RANDOM("WritableFileWriter::Flush:0", - rocksdb_kill_odds * REDUCE_ODDS2); +IOStatus WritableFileWriter::Flush() { + IOStatus s; + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2); if (buf_.CurrentSize() > 0) { if (use_direct_io()) { #ifndef ROCKSDB_LITE if (pending_sync_) { - s = WriteDirect(); + if (perform_data_verification_ && buffered_data_with_checksum_) { + s = WriteDirectWithChecksum(); + } else { + s = WriteDirect(); + } } #endif // !ROCKSDB_LITE } else { - s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + if (perform_data_verification_ && buffered_data_with_checksum_) { + s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize()); + } else { + s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + } } if (!s.ok()) { return s; } } - s = writable_file_->Flush(IOOptions(), nullptr); + { +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + s = writable_file_->Flush(IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileFlushFinish(start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kFlush, file_name()); + } + } +#endif + } if (!s.ok()) { return s; @@ -216,71 +369,118 @@ return s; } +std::string WritableFileWriter::GetFileChecksum() { + if (checksum_generator_ != nullptr) { + assert(checksum_finalized_); + return checksum_generator_->GetChecksum(); + } else { + return 
kUnknownFileChecksum; + } +} + const char* WritableFileWriter::GetFileChecksumFuncName() const { - if (checksum_func_ != nullptr) { - return checksum_func_->Name(); + if (checksum_generator_ != nullptr) { + return checksum_generator_->Name(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } -Status WritableFileWriter::Sync(bool use_fsync) { - Status s = Flush(); +IOStatus WritableFileWriter::Sync(bool use_fsync) { + IOStatus s = Flush(); if (!s.ok()) { return s; } - TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:0"); if (!use_direct_io() && pending_sync_) { s = SyncInternal(use_fsync); if (!s.ok()) { return s; } } - TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:1"); pending_sync_ = false; - return Status::OK(); + return IOStatus::OK(); } -Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) { +IOStatus WritableFileWriter::SyncWithoutFlush(bool use_fsync) { if (!writable_file_->IsSyncThreadSafe()) { - return Status::NotSupported( + return IOStatus::NotSupported( "Can't WritableFileWriter::SyncWithoutFlush() because " "WritableFile::IsSyncThreadSafe() is false"); } TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); - Status s = SyncInternal(use_fsync); + IOStatus s = SyncInternal(use_fsync); TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); return s; } -Status WritableFileWriter::SyncInternal(bool use_fsync) { - Status s; +IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { + IOStatus s; IOSTATS_TIMER_GUARD(fsync_nanos); TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + 
} +#endif if (use_fsync) { s = writable_file_->Fsync(IOOptions(), nullptr); } else { s = writable_file_->Sync(IOOptions(), nullptr); } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileSyncFinish( + start_ts, finish_ts, s, + use_fsync ? FileOperationType::kFsync : FileOperationType::kSync); + if (!s.ok()) { + NotifyOnIOError( + s, (use_fsync ? FileOperationType::kFsync : FileOperationType::kSync), + file_name()); + } + } +#endif SetPerfLevel(prev_perf_level); return s; } -Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { +IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { IOSTATS_TIMER_GUARD(range_sync_nanos); TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); - return writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } +#endif + IOStatus s = writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr); +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kRangeSync, file_name(), nbytes, + offset); + } + } +#endif + return s; } // This method writes to disk the specified data and makes use of the rate // limiter if available -Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { - Status s; +IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { + IOStatus s; assert(!use_direct_io()); const char* src = data; size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { size_t allowed; @@ -297,23 +497,48 @@ TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); #ifndef ROCKSDB_LITE - FileOperationInfo::TimePoint 
start_ts; + FileOperationInfo::StartTimePoint start_ts; uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr); if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; } #endif { auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); - s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, allowed), IOOptions(), v_info, + nullptr); + } else { + s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + } + if (!s.ok()) { + // If writable_file_->Append() failed, then the data may or may not + // exist in the underlying memory buffer, OS page cache, remote file + // system's buffer, etc. If WritableFileWriter keeps the data in + // buf_, then a future Close() or write retry may send the data to + // the underlying file again. If the data does exist in the + // underlying buffer and gets written to the file eventually despite + // returning error, the file may end up with two duplicate pieces of + // data. Therefore, clear the buf_ at the WritableFileWriter layer + // and let caller determine error handling. 
+ buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + } SetPerfLevel(prev_perf_level); } #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kAppend, file_name(), allowed, + old_size); + } } #endif if (!s.ok()) { @@ -322,25 +547,117 @@ } IOSTATS_ADD(bytes_written, allowed); - TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); left -= allowed; src += allowed; } buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; return s; } -void WritableFileWriter::CalculateFileChecksum(const Slice& data) { - if (checksum_func_ != nullptr) { - if (is_first_checksum_) { - file_checksum_ = checksum_func_->Value(data.data(), data.size()); - is_first_checksum_ = false; - } else { - file_checksum_ = - checksum_func_->Extend(file_checksum_, data.data(), data.size()); +IOStatus WritableFileWriter::WriteBufferedWithChecksum(const char* data, + size_t size) { + IOStatus s; + assert(!use_direct_io()); + assert(perform_data_verification_ && buffered_data_with_checksum_); + const char* src = data; + size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + + // Check how much is allowed. Here, we loop until the rate limiter allows to + // write the entire buffer. 
+ // TODO: need to be improved since it sort of defeats the purpose of the rate + // limiter + size_t data_size = left; + if (rate_limiter_ != nullptr) { + while (data_size > 0) { + size_t tmp_size; + tmp_size = rate_limiter_->RequestToken( + data_size, buf_.Alignment(), writable_file_->GetIOPriority(), stats_, + RateLimiter::OpType::kWrite); + data_size -= tmp_size; } } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + +#ifndef ROCKSDB_LITE + FileOperationInfo::StartTimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr); + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + old_size = next_write_offset_; + } +#endif + { + auto prev_perf_level = GetPerfLevel(); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + + EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, left), IOOptions(), v_info, + nullptr); + SetPerfLevel(prev_perf_level); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(old_size, left, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kAppend, file_name(), left, + old_size); + } + } +#endif + if (!s.ok()) { + // If writable_file_->Append() failed, then the data may or may not + // exist in the underlying memory buffer, OS page cache, remote file + // system's buffer, etc. If WritableFileWriter keeps the data in + // buf_, then a future Close() or write retry may send the data to + // the underlying file again. If the data does exist in the + // underlying buffer and gets written to the file eventually despite + // returning error, the file may end up with two duplicate pieces of + // data. Therefore, clear the buf_ at the WritableFileWriter layer + // and let caller determine error handling. 
+ buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + return s; + } + } + + IOSTATS_ADD(bytes_written, left); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); + + // Buffer write is successful, reset the buffer current size to 0 and reset + // the corresponding checksum value + buf_.Size(0); + buffered_data_crc32c_checksum_ = 0; + return s; +} + +void WritableFileWriter::UpdateFileChecksum(const Slice& data) { + if (checksum_generator_ != nullptr) { + checksum_generator_->Update(data.data(), data.size()); + } +} + +// Currently, crc32c checksum is used to calculate the checksum value of the +// content in the input buffer for handoff. In the future, the checksum might be +// calculated from the existing crc32c checksums of the in WAl and Manifest +// records, or even SST file blocks. +// TODO: effectively use the existing checksum of the data being writing to +// generate the crc32c checksum instead of a raw calculation. +void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, + size_t size, + char* buf) { + uint32_t v_crc32c = crc32c::Extend(0, data, size); + EncodeFixed32(buf, v_crc32c); } // This flushes the accumulated data in the buffer. We pad data with zeros if @@ -352,20 +669,20 @@ // only write on aligned // offsets. 
#ifndef ROCKSDB_LITE -Status WritableFileWriter::WriteDirect() { +IOStatus WritableFileWriter::WriteDirect() { assert(use_direct_io()); - Status s; + IOStatus s; const size_t alignment = buf_.Alignment(); assert((next_write_offset_ % alignment) == 0); // Calculate whole page final file advance if all writes succeed - size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize()); + const size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); // Calculate the leftover tail, we write it here padded with zeros BUT we - // will write - // it again in the future either on Close() OR when the current whole page - // fills out - size_t leftover_tail = buf_.CurrentSize() - file_advance; + // will write it again in the future either on Close() OR when the current + // whole page fills out. + const size_t leftover_tail = buf_.CurrentSize() - file_advance; // Round up and pad buf_.PadToAlignmentWith(0); @@ -373,6 +690,8 @@ const char* src = buf_.BufferStart(); uint64_t write_offset = next_write_offset_; size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { // Check how much is allowed @@ -388,16 +707,28 @@ { IOSTATS_TIMER_GUARD(write_nanos); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); - FileOperationInfo::TimePoint start_ts; + FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::now(); + start_ts = FileOperationInfo::StartNow(); } // direct writes must be positional - s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - IOOptions(), nullptr); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, size, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + IOOptions(), v_info, nullptr); + } else { + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, 
+ IOOptions(), nullptr); + } + if (ShouldNotifyListeners()) { - auto finish_ts = std::chrono::system_clock::now(); + auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(), + size, write_offset); + } } if (!s.ok()) { buf_.Size(file_advance + leftover_tail); @@ -420,6 +751,104 @@ // This is where we start writing next time which may or not be // the actual file size on disk. They match if the buffer size // is a multiple of whole pages otherwise filesize_ is leftover_tail + // behind + next_write_offset_ += file_advance; + } + return s; +} + +IOStatus WritableFileWriter::WriteDirectWithChecksum() { + assert(use_direct_io()); + assert(perform_data_verification_ && buffered_data_with_checksum_); + IOStatus s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + const size_t file_advance = + TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write it again in the future either on Close() OR when the current + // whole page fills out. + const size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up, pad, and combine the checksum. 
+ size_t last_cur_size = buf_.CurrentSize(); + buf_.PadToAlignmentWith(0); + size_t padded_size = buf_.CurrentSize() - last_cur_size; + const char* padded_start = buf_.BufferStart() + last_cur_size; + uint32_t padded_checksum = crc32c::Value(padded_start, padded_size); + buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine( + buffered_data_crc32c_checksum_, padded_checksum, padded_size); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; + + // Check how much is allowed. Here, we loop until the rate limiter allows to + // write the entire buffer. + // TODO: need to be improved since it sort of defeats the purpose of the rate + // limiter + size_t data_size = left; + if (rate_limiter_ != nullptr) { + while (data_size > 0) { + size_t size; + size = rate_limiter_->RequestToken(data_size, buf_.Alignment(), + writable_file_->GetIOPriority(), + stats_, RateLimiter::OpType::kWrite); + data_size -= size; + } + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::StartTimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = FileOperationInfo::StartNow(); + } + // direct writes must be positional + EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, left), write_offset, + IOOptions(), v_info, nullptr); + + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::steady_clock::now(); + NotifyOnFileWriteFinish(write_offset, left, start_ts, finish_ts, s); + if (!s.ok()) { + NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(), + left, write_offset); + } + } + if (!s.ok()) { + // In this case, we do not change buffered_data_crc32c_checksum_ because + // it still aligns with the data in the buffer. 
+ buf_.Size(file_advance + leftover_tail); + buffered_data_crc32c_checksum_ = + crc32c::Value(buf_.BufferStart(), buf_.CurrentSize()); + return s; + } + } + + IOSTATS_ADD(bytes_written, left); + assert((next_write_offset_ % alignment) == 0); + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close(). Also the buffer checksum will + // recalculated accordingly. + buf_.RefitTail(file_advance, leftover_tail); + // Adjust the checksum value to align with the data in the buffer + buffered_data_crc32c_checksum_ = + crc32c::Value(buf_.BufferStart(), buf_.CurrentSize()); + // This is where we start writing next time which may or not be + // the actual file size on disk. They match if the buffer size + // is a multiple of whole pages otherwise filesize_ is leftover_tail // behind next_write_offset_ += file_advance; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/file/writable_file_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/file/writable_file_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,11 +10,13 @@ #pragma once #include #include + #include "db/version_edit.h" +#include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" #include "rocksdb/listener.h" #include "rocksdb/rate_limiter.h" #include "test_util/sync_point.h" @@ -22,6 +24,7 @@ namespace ROCKSDB_NAMESPACE { class Statistics; +class SystemClock; // WritableFileWriter is a wrapper on top of Env::WritableFile. 
It provides // facilities to: @@ -33,27 +36,107 @@ class WritableFileWriter { private: #ifndef ROCKSDB_LITE - void NotifyOnFileWriteFinish(uint64_t offset, size_t length, - const FileOperationInfo::TimePoint& start_ts, - const FileOperationInfo::TimePoint& finish_ts, - const Status& status) { - FileOperationInfo info(file_name_, start_ts, finish_ts); + void NotifyOnFileWriteFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kWrite, file_name_, start_ts, + finish_ts, io_status); info.offset = offset; info.length = length; - info.status = status; for (auto& listener : listeners_) { listener->OnFileWriteFinish(info); } + info.status.PermitUncheckedError(); + } + void NotifyOnFileFlushFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kFlush, file_name_, start_ts, + finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileFlushFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileSyncFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status, + FileOperationType type = FileOperationType::kSync) { + FileOperationInfo info(type, file_name_, start_ts, finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileSyncFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileRangeSyncFinish( + uint64_t offset, size_t length, + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kRangeSync, file_name_, start_ts, + finish_ts, io_status); + info.offset = offset; + 
info.length = length; + + for (auto& listener : listeners_) { + listener->OnFileRangeSyncFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileTruncateFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kTruncate, file_name_, start_ts, + finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileTruncateFinish(info); + } + info.status.PermitUncheckedError(); + } + void NotifyOnFileCloseFinish( + const FileOperationInfo::StartTimePoint& start_ts, + const FileOperationInfo::FinishTimePoint& finish_ts, + const IOStatus& io_status) { + FileOperationInfo info(FileOperationType::kClose, file_name_, start_ts, + finish_ts, io_status); + + for (auto& listener : listeners_) { + listener->OnFileCloseFinish(info); + } + info.status.PermitUncheckedError(); + } + + void NotifyOnIOError(const IOStatus& io_status, FileOperationType operation, + const std::string& file_path, size_t length = 0, + uint64_t offset = 0) { + if (listeners_.empty()) { + return; + } + IOErrorInfo io_error_info(io_status, operation, file_path, length, offset); + for (auto& listener : listeners_) { + listener->OnIOError(io_error_info); + } + io_error_info.io_status.PermitUncheckedError(); } #endif // ROCKSDB_LITE bool ShouldNotifyListeners() const { return !listeners_.empty(); } - void CalculateFileChecksum(const Slice& data); + void UpdateFileChecksum(const Slice& data); + void Crc32cHandoffChecksumCalculation(const char* data, size_t size, + char* buf); - std::unique_ptr writable_file_; std::string file_name_; - Env* env_; + FSWritableFilePtr writable_file_; + SystemClock* clock_; AlignedBuffer buf_; size_t max_buffer_size_; // Actually written data size can be used for truncate @@ -71,20 +154,25 @@ RateLimiter* rate_limiter_; Statistics* stats_; std::vector> listeners_; - FileChecksumFunc* checksum_func_; - std::string 
file_checksum_ = kUnknownFileChecksum; - bool is_first_checksum_ = true; + std::unique_ptr checksum_generator_; + bool checksum_finalized_; + bool perform_data_verification_; + uint32_t buffered_data_crc32c_checksum_; + bool buffered_data_with_checksum_; public: WritableFileWriter( std::unique_ptr&& file, const std::string& _file_name, - const FileOptions& options, Env* env = nullptr, + const FileOptions& options, SystemClock* clock = nullptr, + const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, const std::vector>& listeners = {}, - FileChecksumFunc* checksum_func = nullptr) - : writable_file_(std::move(file)), - file_name_(_file_name), - env_(env), + FileChecksumGenFactory* file_checksum_gen_factory = nullptr, + bool perform_data_verification = false, + bool buffered_data_with_checksum = false) + : file_name_(_file_name), + writable_file_(std::move(file), io_tracer, _file_name), + clock_(clock), buf_(), max_buffer_size_(options.writable_file_max_buffer_size), filesize_(0), @@ -97,7 +185,11 @@ rate_limiter_(options.rate_limiter), stats_(stats), listeners_(), - checksum_func_(checksum_func) { + checksum_generator_(nullptr), + checksum_finalized_(false), + perform_data_verification_(perform_data_verification), + buffered_data_crc32c_checksum_(0), + buffered_data_with_checksum_(buffered_data_with_checksum) { TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", reinterpret_cast(max_buffer_size_)); buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); @@ -112,34 +204,50 @@ #else // !ROCKSDB_LITE (void)listeners; #endif + if (file_checksum_gen_factory != nullptr) { + FileChecksumGenContext checksum_gen_context; + checksum_gen_context.file_name = _file_name; + checksum_generator_ = + file_checksum_gen_factory->CreateFileChecksumGenerator( + checksum_gen_context); + } } + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* writer, + IODebugContext* dbg); 
WritableFileWriter(const WritableFileWriter&) = delete; WritableFileWriter& operator=(const WritableFileWriter&) = delete; - ~WritableFileWriter() { Close(); } + ~WritableFileWriter() { + auto s = Close(); + s.PermitUncheckedError(); + } std::string file_name() const { return file_name_; } - Status Append(const Slice& data); + // When this Append API is called, if the crc32c_checksum is not provided, we + // will calculate the checksum internally. + IOStatus Append(const Slice& data, uint32_t crc32c_checksum = 0); - Status Pad(const size_t pad_bytes); + IOStatus Pad(const size_t pad_bytes); - Status Flush(); + IOStatus Flush(); - Status Close(); + IOStatus Close(); - Status Sync(bool use_fsync); + IOStatus Sync(bool use_fsync); // Sync only the data that was already Flush()ed. Safe to call concurrently // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), // returns NotSupported status. - Status SyncWithoutFlush(bool use_fsync); + IOStatus SyncWithoutFlush(bool use_fsync); uint64_t GetFileSize() const { return filesize_; } - Status InvalidateCache(size_t offset, size_t length) { + IOStatus InvalidateCache(size_t offset, size_t length) { return writable_file_->InvalidateCache(offset, length); } @@ -149,11 +257,12 @@ bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; } - void TEST_SetFileChecksumFunc(FileChecksumFunc* checksum_func) { - checksum_func_ = checksum_func; + void TEST_SetFileChecksumGenerator( + FileChecksumGenerator* checksum_generator) { + checksum_generator_.reset(checksum_generator); } - const std::string& GetFileChecksum() const { return file_checksum_; } + std::string GetFileChecksum(); const char* GetFileChecksumFuncName() const; @@ -161,11 +270,13 @@ // Used when os buffering is OFF and we are writing // DMA such as in Direct I/O mode #ifndef ROCKSDB_LITE - Status WriteDirect(); + IOStatus WriteDirect(); + IOStatus WriteDirectWithChecksum(); #endif // !ROCKSDB_LITE // Normal write - Status WriteBuffered(const char* data, 
size_t size); - Status RangeSync(uint64_t offset, uint64_t nbytes); - Status SyncInternal(bool use_fsync); + IOStatus WriteBuffered(const char* data, size_t size); + IOStatus WriteBufferedWithChecksum(const char* data, size_t size); + IOStatus RangeSync(uint64_t offset, uint64_t nbytes); + IOStatus SyncInternal(bool use_fsync); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/Makefile 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,61 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +ROOT_DIR = $(abspath $(shell pwd)/../) + +include $(ROOT_DIR)/make_config.mk + +PROTOBUF_CFLAGS = `pkg-config --cflags protobuf` +PROTOBUF_LDFLAGS = `pkg-config --libs protobuf` + +PROTOBUF_MUTATOR_CFLAGS = `pkg-config --cflags libprotobuf-mutator` +PROTOBUF_MUTATOR_LDFLAGS = `pkg-config --libs libprotobuf-mutator` + +ROCKSDB_INCLUDE_DIR = $(ROOT_DIR)/include +ROCKSDB_LIB_DIR = $(ROOT_DIR) + +PROTO_IN = $(ROOT_DIR)/fuzz/proto +PROTO_OUT = $(ROOT_DIR)/fuzz/proto/gen + +ifneq ($(FUZZ_ENV), ossfuzz) +CC = clang++ +CCFLAGS += -Wall -fsanitize=address,fuzzer +CFLAGS += $(PLATFORM_CXXFLAGS) $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(PROTOBUF_LDFLAGS) $(PROTOBUF_MUTATOR_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +else +# OSS-Fuzz sets various environment flags that are used for compilation. 
+# These environment flags depend on which type of sanitizer build is being +# used, however, an ASan build would set the environment flags as follows: +# CFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link" +# CXXFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link \ + -stdlib=libc++" +# LIB_FUZZING_ENGINE="-fsanitize=fuzzer" +CC = $(CXX) +CCFLAGS = $(CXXFLAGS) +CFLAGS += $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(LIB_FUZZING_ENGINE) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +endif + +.PHONY: gen_proto + +gen_proto: + mkdir -p $(PROTO_OUT) + protoc \ + --proto_path=$(PROTO_IN) \ + --cpp_out=$(PROTO_OUT) \ + $(PROTO_IN)/*.proto + +db_fuzzer: db_fuzzer.cc + $(CC) $(CCFLAGS) -o db_fuzzer db_fuzzer.cc $(CFLAGS) $(LDFLAGS) + +db_map_fuzzer: gen_proto db_map_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o db_map_fuzzer db_map_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) + +sst_file_writer_fuzzer: gen_proto sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o sst_file_writer_fuzzer sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/README.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,160 @@ +# Fuzzing RocksDB + +## Overview + +This directory contains [fuzz tests](https://en.wikipedia.org/wiki/Fuzzing) for RocksDB. 
+RocksDB testing infrastructure currently includes unit tests and [stress tests](https://github.com/facebook/rocksdb/wiki/Stress-test), +we hope fuzz testing can catch more bugs. + +## Prerequisite + +We use [LLVM libFuzzer](http://llvm.org/docs/LibFuzzer.html) as the fuzzying engine, +so make sure you have [clang](https://clang.llvm.org/get_started.html) as your compiler. + +Some tests rely on [structure aware fuzzing](https://github.com/google/fuzzing/blob/master/docs/structure-aware-fuzzing.md). +We use [protobuf](https://developers.google.com/protocol-buffers) to define structured input to the fuzzer, +and use [libprotobuf-mutator](https://github.com/google/libprotobuf-mutator) as the custom libFuzzer mutator. +So make sure you have protobuf and libprotobuf-mutator installed, and make sure `pkg-config` can find them. + +## Example + +This example shows you how to do structure aware fuzzing to `rocksdb::SstFileWriter`. + +After walking through the steps to create the fuzzer, we'll introduce a bug into `rocksdb::SstFileWriter::Put`, +then show that the fuzzer can catch the bug. + +### Design the test + +We want the fuzzing engine to automatically generate a list of database operations, +then we apply these operations to `SstFileWriter` in sequence, +finally, after the SST file is generated, we use `SstFileReader` to check the file's checksum. + +### Define input + +We define the database operations in protobuf, each operation has a type of operation and a key value pair, +see [proto/db_operation.proto](proto/db_operation.proto) for details. + +### Define tests with the input + +In [sst_file_writer_fuzzer.cc](sst_file_writer_fuzzer.cc), +we define the tests to be run on the generated input: + +``` +DEFINE_PROTO_FUZZER(DBOperations& input) { + // apply the operations to SstFileWriter and use SstFileReader to verify checksum. + // ... 
+} +``` + +`SstFileWriter` requires the keys of the operations to be unique and be in ascending order, +but the fuzzing engine generates the input randomly, so we need to process the generated input before +passing it to `DEFINE_PROTO_FUZZER`, this is accomplished by registering a post processor: + +``` +protobuf_mutator::libfuzzer::PostProcessorRegistration +``` + +### Compile and link the fuzzer + +In the rocksdb root directory, compile rocksdb library by `make static_lib`. + +Go to the `fuzz` directory, +run `make sst_file_writer_fuzzer` to generate the fuzzer, +it will compile rocksdb static library, generate protobuf, then compile and link `sst_file_writer_fuzzer`. + +### Introduce a bug + +Manually introduce a bug to `SstFileWriter::Put`: + +``` +diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc +index ab1ee7c4e..c7da9ffa0 100644 +--- a/table/sst_file_writer.cc ++++ b/table/sst_file_writer.cc +@@ -277,6 +277,11 @@ Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { + } + + Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { ++ if (user_key.starts_with("!")) { ++ if (value.ends_with("!")) { ++ return Status::Corruption("bomb"); ++ } ++ } + return rep_->Add(user_key, value, ValueType::kTypeValue); + } +``` + +The bug is that for `Put`, if `user_key` starts with `!` and `value` ends with `!`, then corrupt. + +### Run fuzz testing to catch the bug + +Run the fuzzer by `time ./sst_file_writer_fuzzer`. 
+ +Here is the output on my machine: + +``` +Corruption: bomb +==59680== ERROR: libFuzzer: deadly signal + #0 0x109487315 in __sanitizer_print_stack_trace+0x35 (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x4d315) + #1 0x108d63f18 in fuzzer::PrintStackTrace() FuzzerUtil.cpp:205 + #2 0x108d47613 in fuzzer::Fuzzer::CrashCallback() FuzzerLoop.cpp:232 + #3 0x7fff6af535fc in _sigtramp+0x1c (libsystem_platform.dylib:x86_64+0x35fc) + #4 0x7ffee720f3ef () + #5 0x7fff6ae29807 in abort+0x77 (libsystem_c.dylib:x86_64+0x7f807) + #6 0x108cf1c4c in TestOneProtoInput(DBOperations&)+0x113c (sst_file_writer_fuzzer:x86_64+0x100302c4c) + #7 0x108cf09be in LLVMFuzzerTestOneInput+0x16e (sst_file_writer_fuzzer:x86_64+0x1003019be) + #8 0x108d48ce0 in fuzzer::Fuzzer::ExecuteCallback(unsigned char const*, unsigned long) FuzzerLoop.cpp:556 + #9 0x108d48425 in fuzzer::Fuzzer::RunOne(unsigned char const*, unsigned long, bool, fuzzer::InputInfo*, bool*) FuzzerLoop.cpp:470 + #10 0x108d4a626 in fuzzer::Fuzzer::MutateAndTestOne() FuzzerLoop.cpp:698 + #11 0x108d4b325 in fuzzer::Fuzzer::Loop(std::__1::vector >&) FuzzerLoop.cpp:830 + #12 0x108d37fcd in fuzzer::FuzzerDriver(int*, char***, int (*)(unsigned char const*, unsigned long)) FuzzerDriver.cpp:829 + #13 0x108d652b2 in main FuzzerMain.cpp:19 + #14 0x7fff6ad5acc8 in start+0x0 (libdyld.dylib:x86_64+0x1acc8) + +NOTE: libFuzzer has rudimentary signal handlers. + Combine libFuzzer with AddressSanitizer or similar for better crash reports. 
+SUMMARY: libFuzzer: deadly signal +MS: 7 Custom-CustomCrossOver-InsertByte-Custom-ChangeBit-Custom-CustomCrossOver-; base unit: 90863b4d83c3f994bba0a417d0c2ee3b68f9e795 +0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x76,0x61,0x6c,0x75,0x65,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2b,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2e,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x5c,0x32,0x35,0x33,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa, +operations {\x0a key: \"!\"\x0a value: \"!\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"+\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \".\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"\\253\"\x0a type: PUT\x0a}\x0a +artifact_prefix='./'; Test unit written to ./crash-a1460be302d09b548e61787178d9edaa40aea467 +Base64: b3BlcmF0aW9ucyB7CiAga2V5OiAiISIKICB2YWx1ZTogIiEiCiAgdHlwZTogUFVUCn0Kb3BlcmF0aW9ucyB7CiAga2V5OiAiKyIKICB0eXBlOiBQVVQKfQpvcGVyYXRpb25zIHsKICBrZXk6ICIuIgogIHR5cGU6IFBVVAp9Cm9wZXJhdGlvbnMgewogIGtleTogIlwyNTMiCiAgdHlwZTogUFVUCn0K +./sst_file_writer_fuzzer 5.97s user 4.40s system 64% cpu 16.195 total +``` + +Within 6 seconds, it catches the bug. + +The input that triggers the bug is persisted in `./crash-a1460be302d09b548e61787178d9edaa40aea467`: + +``` +$ cat ./crash-a1460be302d09b548e61787178d9edaa40aea467 +operations { + key: "!" + value: "!" + type: PUT +} +operations { + key: "+" + type: PUT +} +operations { + key: "." 
+ type: PUT +} +operations { + key: "\253" + type: PUT +} +``` + +### Reproduce the crash to debug + +The above crash can be reproduced by `./sst_file_writer_fuzzer ./crash-a1460be302d09b548e61787178d9edaa40aea467`, +so you can debug the crash. + +## Future Work + +According to [OSS-Fuzz](https://github.com/google/oss-fuzz), +`as of June 2020, OSS-Fuzz has found over 20,000 bugs in 300 open source projects.` + +RocksDB can join OSS-Fuzz together with other open source projects such as sqlite. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_fuzzer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,164 @@ +#include + +#include "rocksdb/db.h" + +enum OperationType { + kPut, + kGet, + kDelete, + kGetProperty, + kIterator, + kSnapshot, + kOpenClose, + kColumn, + kCompactRange, + kSeekForPrev, + OP_COUNT +}; + +constexpr char db_path[] = "/tmp/testdb"; + +// Fuzzes DB operations by doing interpretations on the data. Both the +// sequence of API calls to be called on the DB as well as the arguments +// to each of these APIs are interpreted by way of the data buffer. +// The operations that the fuzzer supports are given by the OperationType +// enum. The goal is to capture sanitizer bugs, so the code should be +// compiled with a given sanitizer (ASan, UBSan, MSan). 
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + ROCKSDB_NAMESPACE::DB* db; + ROCKSDB_NAMESPACE::Options options; + options.create_if_missing = true; + ROCKSDB_NAMESPACE::Status status = + ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); + if (!status.ok()) { + return 0; + } + FuzzedDataProvider fuzzed_data(data, size); + + // perform a sequence of calls on our db instance + int max_iter = static_cast(data[0]); + for (int i = 0; i < max_iter && i < size; i++) { + OperationType op = static_cast(data[i] % OP_COUNT); + + switch (op) { + case kPut: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string val = fuzzed_data.ConsumeRandomLengthString(); + db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, val); + break; + } + case kGet: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string value; + db->Get(ROCKSDB_NAMESPACE::ReadOptions(), key, &value); + break; + } + case kDelete: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + db->Delete(ROCKSDB_NAMESPACE::WriteOptions(), key); + break; + } + case kGetProperty: { + std::string prop; + std::string property_name = fuzzed_data.ConsumeRandomLengthString(); + db->GetProperty(property_name, &prop); + break; + } + case kIterator: { + ROCKSDB_NAMESPACE::Iterator* it = + db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + } + delete it; + break; + } + case kSnapshot: { + ROCKSDB_NAMESPACE::ReadOptions snapshot_options; + snapshot_options.snapshot = db->GetSnapshot(); + ROCKSDB_NAMESPACE::Iterator* it = db->NewIterator(snapshot_options); + db->ReleaseSnapshot(snapshot_options.snapshot); + delete it; + break; + } + case kOpenClose: { + db->Close(); + delete db; + status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); + if (!status.ok()) { + ROCKSDB_NAMESPACE::DestroyDB(db_path, options); + return 0; + } + + break; + } + case kColumn: { + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf; + 
ROCKSDB_NAMESPACE::Status s; + s = db->CreateColumnFamily(ROCKSDB_NAMESPACE::ColumnFamilyOptions(), + "new_cf", &cf); + s = db->DestroyColumnFamilyHandle(cf); + db->Close(); + delete db; + + // open DB with two column families + std::vector column_families; + // have to open default column family + column_families.push_back(ROCKSDB_NAMESPACE::ColumnFamilyDescriptor( + ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, + ROCKSDB_NAMESPACE::ColumnFamilyOptions())); + // open the new one, too + column_families.push_back(ROCKSDB_NAMESPACE::ColumnFamilyDescriptor( + "new_cf", ROCKSDB_NAMESPACE::ColumnFamilyOptions())); + std::vector handles; + s = ROCKSDB_NAMESPACE::DB::Open(ROCKSDB_NAMESPACE::DBOptions(), db_path, + column_families, &handles, &db); + + if (s.ok()) { + std::string key1 = fuzzed_data.ConsumeRandomLengthString(); + std::string val1 = fuzzed_data.ConsumeRandomLengthString(); + std::string key2 = fuzzed_data.ConsumeRandomLengthString(); + s = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), handles[1], key1, + val1); + std::string value; + s = db->Get(ROCKSDB_NAMESPACE::ReadOptions(), handles[1], key2, + &value); + s = db->DropColumnFamily(handles[1]); + for (auto handle : handles) { + s = db->DestroyColumnFamilyHandle(handle); + } + } else { + status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db); + if (!status.ok()) { + // At this point there is no saving to do. 
So we exit + ROCKSDB_NAMESPACE::DestroyDB(db_path, ROCKSDB_NAMESPACE::Options()); + return 0; + } + } + break; + } + case kCompactRange: { + std::string slice_start = fuzzed_data.ConsumeRandomLengthString(); + std::string slice_end = fuzzed_data.ConsumeRandomLengthString(); + + ROCKSDB_NAMESPACE::Slice begin(slice_start); + ROCKSDB_NAMESPACE::Slice end(slice_end); + ROCKSDB_NAMESPACE::CompactRangeOptions options; + ROCKSDB_NAMESPACE::Status s = db->CompactRange(options, &begin, &end); + break; + } + case kSeekForPrev: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + auto iter = db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + iter->SeekForPrev(key); + delete iter; + break; + } + } + } + + // Cleanup DB + db->Close(); + delete db; + ROCKSDB_NAMESPACE::DestroyDB(db_path, options); + return 0; +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/db_map_fuzzer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "util.h" + +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const ROCKSDB_NAMESPACE::Comparator* comparator = + ROCKSDB_NAMESPACE::BytewiseComparator(); + auto ops = input->mutable_operations(); + // Make sure begin <= end for DELETE_RANGE. 
+ for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + }}; + +// Execute randomly generated operations on both a DB and a std::map, +// then reopen the DB and make sure that iterating the DB produces the +// same key-value pairs as iterating through the std::map. +DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + const std::string kDbPath = "/tmp/db_map_fuzzer_test"; + auto fs = ROCKSDB_NAMESPACE::FileSystem::Default(); + if (fs->FileExists(kDbPath, ROCKSDB_NAMESPACE::IOOptions(), /*dbg=*/nullptr) + .ok()) { + std::cerr << "db path " << kDbPath << " already exists" << std::endl; + abort(); + } + + std::map kv; + ROCKSDB_NAMESPACE::DB* db = nullptr; + ROCKSDB_NAMESPACE::Options options; + options.create_if_missing = true; + CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db)); + + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK( + db->Put(ROCKSDB_NAMESPACE::WriteOptions(), op.key(), op.value())); + kv[op.key()] = op.value(); + break; + } + case OpType::MERGE: { + break; + } + case OpType::DELETE: { + CHECK_OK(db->Delete(ROCKSDB_NAMESPACE::WriteOptions(), op.key())); + kv.erase(op.key()); + break; + } + case OpType::DELETE_RANGE: { + // [op.key(), op.value()) corresponds to [begin, end). 
+ CHECK_OK(db->DeleteRange(ROCKSDB_NAMESPACE::WriteOptions(), + db->DefaultColumnFamily(), op.key(), + op.value())); + kv.erase(kv.lower_bound(op.key()), kv.lower_bound(op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()); + return; + } + } + } + CHECK_OK(db->Close()); + delete db; + db = nullptr; + + CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db)); + auto kv_it = kv.begin(); + ROCKSDB_NAMESPACE::Iterator* it = + db->NewIterator(ROCKSDB_NAMESPACE::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next(), kv_it++) { + CHECK_TRUE(kv_it != kv.end()); + CHECK_EQ(it->key().ToString(), kv_it->first); + CHECK_EQ(it->value().ToString(), kv_it->second); + } + CHECK_TRUE(kv_it == kv.end()); + delete it; + + CHECK_OK(db->Close()); + delete db; + CHECK_OK(ROCKSDB_NAMESPACE::DestroyDB(kDbPath, options)); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/proto/db_operation.proto 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,28 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Defines database operations. +// Each operation is a key-value pair and an operation type. + +syntax = "proto2"; + +enum OpType { + PUT = 0; + MERGE = 1; + DELETE = 2; + DELETE_RANGE = 3; +} + +message DBOperation { + required string key = 1; + // value is ignored for DELETE. + // [key, value] is the range for DELETE_RANGE. 
+ optional string value = 2; + required OpType type = 3; +} + +message DBOperations { + repeated DBOperation operations = 1; +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/sst_file_writer_fuzzer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,185 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/file_system.h" +#include "rocksdb/sst_file_writer.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "table/table_reader.h" +#include "util.h" + +// Keys in SST file writer operations must be unique and in ascending order. +// For each DBOperation generated by the fuzzer, this function is called on +// it to deduplicate and sort the keys in the DBOperations. +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const Comparator* comparator = BytewiseComparator(); + auto ops = input->mutable_operations(); + + // Make sure begin <= end for DELETE_RANGE. 
+ for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + + std::sort(ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) < 0; + }); + + auto last = std::unique( + ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) == 0; + }); + ops->erase(last, ops->end()); + }}; + +TableReader* NewTableReader(const std::string& sst_file_path, + const Options& options, + const EnvOptions& env_options, + const ImmutableCFOptions& cf_ioptions) { + // This code block is similar to SstFileReader::Open. + + uint64_t file_size = 0; + std::unique_ptr file_reader; + std::unique_ptr table_reader; + const auto& fs = options.env->GetFileSystem(); + FileOptions fopts(env_options); + Status s = options.env->GetFileSize(sst_file_path, fopts.io_options, + &file_size, nullptr); + if (s.ok()) { + s = RandomAccessFileReader::Create(fs, sst_file_path, fopts, &file_reader, + nullptr); + } + if (s.ok()) { + TableReaderOptions t_opt(cf_ioptions, /*prefix_extractor=*/nullptr, + env_options, cf_ioptions.internal_comparator); + t_opt.largest_seqno = kMaxSequenceNumber; + s = options.table_factory->NewTableReader(t_opt, std::move(file_reader), + file_size, &table_reader, + /*prefetch=*/false); + } + if (!s.ok()) { + std::cerr << "Failed to create TableReader for " << sst_file_path << ": " + << s.ToString() << std::endl; + abort(); + } + return table_reader.release(); +} + +ValueType ToValueType(OpType op_type) { + switch (op_type) { + case OpType::PUT: + return ValueType::kTypeValue; + case OpType::MERGE: + return ValueType::kTypeMerge; + case OpType::DELETE: + return ValueType::kTypeDeletion; + case OpType::DELETE_RANGE: + return 
ValueType::kTypeRangeDeletion; + default: + std::cerr << "Unknown operation type " << static_cast(op_type) + << std::endl; + abort(); + } +} + +// Fuzzes DB operations as input, let SstFileWriter generate a SST file +// according to the operations, then let TableReader read and check all the +// key-value pairs from the generated SST file. +DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + std::string sstfile; + { + auto fs = FileSystem::Default(); + std::string dir; + IOOptions opt; + CHECK_OK(fs->GetTestDirectory(opt, &dir, nullptr)); + sstfile = dir + "/SstFileWriterFuzzer.sst"; + } + + Options options; + EnvOptions env_options(options); + ImmutableCFOptions cf_ioptions(options); + + // Generate sst file. + SstFileWriter writer(env_options, options); + CHECK_OK(writer.Open(sstfile)); + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK(writer.Put(op.key(), op.value())); + break; + } + case OpType::MERGE: { + CHECK_OK(writer.Merge(op.key(), op.value())); + break; + } + case OpType::DELETE: { + CHECK_OK(writer.Delete(op.key())); + break; + } + case OpType::DELETE_RANGE: { + CHECK_OK(writer.DeleteRange(op.key(), op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()) + << std::endl; + abort(); + } + } + } + ExternalSstFileInfo info; + CHECK_OK(writer.Finish(&info)); + + // Iterate and verify key-value pairs. 
+ std::unique_ptr table_reader( + NewTableReader(sstfile, options, env_options, cf_ioptions)); + ReadOptions roptions; + CHECK_OK(table_reader->VerifyChecksum(roptions, + TableReaderCaller::kUncategorized)); + std::unique_ptr it( + table_reader->NewIterator(roptions, /*prefix_extractor=*/nullptr, + /*arena=*/nullptr, /*skip_filters=*/true, + TableReaderCaller::kUncategorized)); + it->SeekToFirst(); + for (const DBOperation& op : input.operations()) { + if (op.type() == OpType::DELETE_RANGE) { + // InternalIterator cannot iterate over DELETE_RANGE entries. + continue; + } + CHECK_TRUE(it->Valid()); + ParsedInternalKey ikey; + CHECK_OK(ParseInternalKey(it->key(), &ikey, /*log_err_key=*/true)); + CHECK_EQ(ikey.user_key.ToString(), op.key()); + CHECK_EQ(ikey.sequence, 0); + CHECK_EQ(ikey.type, ToValueType(op.type())); + if (op.type() != OpType::DELETE) { + CHECK_EQ(op.value(), it->value().ToString()); + } + it->Next(); + } + CHECK_TRUE(!it->Valid()); + + // Delete sst file. + remove(sstfile.c_str()); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/util.h mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/fuzz/util.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/fuzz/util.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,23 @@ +#pragma once + +#define CHECK_OK(expression) \ + do { \ + auto status = (expression); \ + if (!status.ok()) { \ + std::cerr << status.ToString() << std::endl; \ + abort(); \ + } \ + } while (0) + +#define CHECK_EQ(a, b) \ + if (a != b) { \ + std::cerr << "(" << #a << "=" << a << ") != (" << #b << "=" << b << ")" \ + << std::endl; \ + abort(); \ + } + +#define CHECK_TRUE(cond) \ + if (!(cond)) { \ + std::cerr << "\"" << #cond << "\" is false" << std::endl; \ + abort(); \ + } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/hdfs/env_hdfs.h mariadb-10.11.13/storage/rocksdb/rocksdb/hdfs/env_hdfs.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/hdfs/env_hdfs.h 
2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/hdfs/env_hdfs.h 2025-05-19 16:14:27.000000000 +0000 @@ -48,6 +48,10 @@ posixEnv = Env::Default(); fileSys_ = connectToPath(fsname_); } + static const char* kClassName() { return "HdfsEnv"; } + const char* Name() const override { return kClassName(); } + static const char* kNickName() { return "hdfs"; } + const char* NickName() const override { return kNickName(); } virtual ~HdfsEnv() { fprintf(stderr, "Destroying HdfsEnv::Default()\n"); @@ -101,6 +105,8 @@ Status NewLogger(const std::string& fname, std::shared_ptr* result) override; + Status IsDirectory(const std::string& path, bool* is_dir) override; + void Schedule(void (*function)(void* arg), void* arg, Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = 0) override { @@ -160,10 +166,7 @@ return posixEnv->TimeToString(number); } - static uint64_t gettid() { - assert(sizeof(pthread_t) <= sizeof(uint64_t)); - return (uint64_t)pthread_self(); - } + static uint64_t gettid() { return Env::Default()->GetThreadID(); } uint64_t GetThreadID() const override { return HdfsEnv::gettid(); } @@ -207,8 +210,7 @@ std::string portStr = (rem == 0 ? 
remaining : remaining.substr(0, rem)); - tPort port; - port = atoi(portStr.c_str()); + tPort port = static_cast(atoi(portStr.c_str())); if (port == 0) { throw HdfsFatalException("Bad host-port for hdfs " + uri); } @@ -236,8 +238,6 @@ namespace ROCKSDB_NAMESPACE { -static const Status notsup; - class HdfsEnv : public Env { public: @@ -246,6 +246,10 @@ fprintf(stderr, "Please see hdfs/README for details\n"); abort(); } + static const char* kClassName() { return "HdfsEnv"; } + const char* Name() const override { return kClassName(); } + static const char* kNickName() { return "hdfs"; } + const char* NickName() const override { return kNickName(); } virtual ~HdfsEnv() { } @@ -258,75 +262,81 @@ const std::string& /*fname*/, std::unique_ptr* /*result*/, const EnvOptions& /*options*/) override { - return notsup; + return Status::NotSupported(); } virtual Status NewWritableFile(const std::string& /*fname*/, std::unique_ptr* /*result*/, const EnvOptions& /*options*/) override { - return notsup; + return Status::NotSupported(); } virtual Status NewDirectory(const std::string& /*name*/, std::unique_ptr* /*result*/) override { - return notsup; + return Status::NotSupported(); } virtual Status FileExists(const std::string& /*fname*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetChildren(const std::string& /*path*/, std::vector* /*result*/) override { - return notsup; + return Status::NotSupported(); } virtual Status DeleteFile(const std::string& /*fname*/) override { - return notsup; + return Status::NotSupported(); } virtual Status CreateDir(const std::string& /*name*/) override { - return notsup; + return Status::NotSupported(); } virtual Status CreateDirIfMissing(const std::string& /*name*/) override { - return notsup; + return Status::NotSupported(); } virtual Status DeleteDir(const std::string& /*name*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetFileSize(const std::string& /*fname*/, uint64_t* 
/*size*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetFileModificationTime(const std::string& /*fname*/, uint64_t* /*time*/) override { - return notsup; + return Status::NotSupported(); } virtual Status RenameFile(const std::string& /*src*/, const std::string& /*target*/) override { - return notsup; + return Status::NotSupported(); } virtual Status LinkFile(const std::string& /*src*/, const std::string& /*target*/) override { - return notsup; + return Status::NotSupported(); } virtual Status LockFile(const std::string& /*fname*/, FileLock** /*lock*/) override { - return notsup; + return Status::NotSupported(); } - virtual Status UnlockFile(FileLock* /*lock*/) override { return notsup; } + virtual Status UnlockFile(FileLock* /*lock*/) override { + return Status::NotSupported(); + } virtual Status NewLogger(const std::string& /*fname*/, std::shared_ptr* /*result*/) override { - return notsup; + return Status::NotSupported(); + } + + Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) override { + return Status::NotSupported(); } virtual void Schedule(void (* /*function*/)(void* arg), void* /*arg*/, @@ -346,7 +356,7 @@ } virtual Status GetTestDirectory(std::string* /*path*/) override { - return notsup; + return Status::NotSupported(); } virtual uint64_t NowMicros() override { return 0; } @@ -354,16 +364,16 @@ virtual void SleepForMicroseconds(int /*micros*/) override {} virtual Status GetHostName(char* /*name*/, uint64_t /*len*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetCurrentTime(int64_t* /*unix_time*/) override { - return notsup; + return Status::NotSupported(); } virtual Status GetAbsolutePath(const std::string& /*db_path*/, std::string* /*outputpath*/) override { - return notsup; + return Status::NotSupported(); } virtual void SetBackgroundThreads(int /*number*/, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/advanced_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include +#include "rocksdb/compression_type.h" #include "rocksdb/memtablerep.h" #include "rocksdb/universal_compaction.h" @@ -17,7 +18,6 @@ class Slice; class SliceTransform; -enum CompressionType : unsigned char; class TablePropertiesCollectorFactory; class TableFactory; struct Options; @@ -70,6 +70,10 @@ // Default: false; bool allow_compaction = false; + // When not 0, if the data in the file is older than this threshold, RocksDB + // will soon move the file to warm temperature. + uint64_t age_for_warm = 0; + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) : max_table_files_size(_max_table_files_size), @@ -101,9 +105,14 @@ // // When compression dictionary is disabled, we compress and write each block // before buffering data for the next one. When compression dictionary is - // enabled, we buffer all SST file data in-memory so we can sample it, as data + // enabled, we buffer SST file data in-memory so we can sample it, as data // can only be compressed and written after the dictionary has been finalized. - // So users of this feature may see increased memory usage. + // + // The amount of data buffered can be limited by `max_dict_buffer_bytes`. This + // buffered memory is charged to the block cache when there is a block cache. + // If block cache insertion fails with `Status::Incomplete` (i.e., it is + // full), we finalize the dictionary with whatever data we have and then stop + // buffering. // // Default: 0. uint32_t max_dict_bytes; @@ -117,6 +126,21 @@ // Default: 0. 
uint32_t zstd_max_train_bytes; + // Number of threads for parallel compression. + // Parallel compression is enabled only if threads > 1. + // THE FEATURE IS STILL EXPERIMENTAL + // + // This option is valid only when BlockBasedTable is used. + // + // When parallel compression is enabled, SST size file sizes might be + // more inflated compared to the target size, because more data of unknown + // compressed size is in flight when compression is parallelized. To be + // reasonably accurate, this inflation is also estimated by using historical + // compression ratio and current bytes inflight. + // + // Default: 1. + uint32_t parallel_threads; + // When the compression options are set by the user, it will be set to "true". // For bottommost_compression_opts, to enable it, user must set enabled=true. // Otherwise, bottommost compression will use compression_opts as default @@ -128,21 +152,67 @@ // Default: false. bool enabled; + // Limit on data buffering when gathering samples to build a dictionary. Zero + // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), + // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. + // + // In compaction, the buffering is limited to the target file size (see + // `target_file_size_base` and `target_file_size_multiplier`) even if this + // setting permits more buffering. Since we cannot determine where the file + // should be cut until data blocks are compressed with dictionary, buffering + // more than the target file size could lead to selecting samples that belong + // to a later output SST. + // + // Limiting too strictly may harm dictionary effectiveness since it forces + // RocksDB to pick samples from the initial portion of the output SST, which + // may not be representative of the whole file. Configuring this limit below + // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can + // pass to the dictionary trainer. 
Configuring it below `max_dict_bytes` can + // restrict the size of the final dictionary. + // + // Default: 0 (unlimited) + uint64_t max_dict_buffer_bytes; + CompressionOptions() : window_bits(-14), level(kDefaultCompressionLevel), strategy(0), max_dict_bytes(0), zstd_max_train_bytes(0), - enabled(false) {} - CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes, - int _zstd_max_train_bytes, bool _enabled) + parallel_threads(1), + enabled(false), + max_dict_buffer_bytes(0) {} + CompressionOptions(int wbits, int _lev, int _strategy, + uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes, + uint32_t _parallel_threads, bool _enabled, + uint64_t _max_dict_buffer_bytes) : window_bits(wbits), level(_lev), strategy(_strategy), max_dict_bytes(_max_dict_bytes), zstd_max_train_bytes(_zstd_max_train_bytes), - enabled(_enabled) {} + parallel_threads(_parallel_threads), + enabled(_enabled), + max_dict_buffer_bytes(_max_dict_buffer_bytes) {} +}; + +// Temperature of a file. Used to pass to FileSystem for a different +// placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. +enum class Temperature : uint8_t { + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, +}; + +// The control option of how the cache tiers will be used. Currently rocksdb +// support block cahe (volatile tier), secondary cache (non-volatile tier). +// In the future, we may add more caching layers. +enum class CacheTier : uint8_t { + kVolatileTier = 0, + kNonVolatileBlockTier = 0x01, }; enum UpdateStatus { // Return status For inplace update callback @@ -183,17 +253,32 @@ // ignored. int max_write_buffer_number_to_maintain = 0; - // The total maximum size(bytes) of write buffers to maintain in memory - // including copies of buffers that have already been flushed. This parameter - // only affects trimming of flushed buffers and does not affect flushing. 
- // This controls the maximum amount of write history that will be available - // in memory for conflict checking when Transactions are used. The actual - // size of write history (flushed Memtables) might be higher than this limit - // if further trimming will reduce write history total size below this - // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB, - // and there are three flushed Memtables, with sizes of 32MB, 20MB, 20MB. - // Because trimming the next Memtable of size 20MB will reduce total memory - // usage to 52MB which is below the limit, RocksDB will stop trimming. + // The target number of write history bytes to hold in memory. Write history + // comprises the latest write buffers (memtables). To reach the target, write + // buffers that were most recently flushed to SST files may be retained in + // memory. + // + // This controls the target amount of write history that will be available + // in memory for conflict checking when Transactions are used. + // + // This target may be undershot when the CF first opens and has not recovered + // or received enough writes to reach the target. After reaching the target + // once, it is guaranteed to never undershoot again. That guarantee is + // implemented by retaining flushed write buffers in-memory until the oldest + // one can be trimmed without dropping below the target. + // + // Examples with `max_write_buffer_size_to_maintain` set to 32MB: + // + // - One mutable memtable of 64MB, one unflushed immutable memtable of 64MB, + // and zero flushed immutable memtables. Nothing trimmable exists. + // - One mutable memtable of 16MB, zero unflushed immutable memtables, and + // one flushed immutable memtable of 64MB. Trimming is disallowed because + // dropping the earliest (only) flushed immutable memtable would result in + // write history of 16MB < 32MB. + // - One mutable memtable of 24MB, one unflushed immutable memtable of 16MB, + // and one flushed immutable memtable of 16MB. 
The earliest (only) flushed + // immutable memtable is trimmed because without it we still have + // 16MB + 24MB = 40MB > 32MB of write history. // // When using an OptimisticTransactionDB: // If this value is too low, some transactions may fail at commit time due @@ -219,6 +304,7 @@ // achieve point-in-time consistency using snapshot or iterator (assuming // concurrent updates). Hence iterator and multi-get will return results // which are not consistent as of any point-in-time. + // Backward iteration on memtables will not work either. // If inplace_callback function is not set, // Put(key, new_value) will update inplace the existing_value iff // * key exists in current memtable @@ -241,45 +327,55 @@ // delta_value - Delta value to be merged with the existing_value. // Stored in transaction logs. // merged_value - Set when delta is applied on the previous value. - + // // Applicable only when inplace_update_support is true, // this callback function is called at the time of updating the memtable // as part of a Put operation, lets say Put(key, delta_value). It allows the // 'delta_value' specified as part of the Put operation to be merged with // an 'existing_value' of the key in the database. - + // // If the merged value is smaller in size that the 'existing_value', // then this function can update the 'existing_value' buffer inplace and // the corresponding 'existing_value'_size pointer, if it wishes to. // The callback should return UpdateStatus::UPDATED_INPLACE. // In this case. (In this case, the snapshot-semantics of the rocksdb // Iterator is not atomic anymore). - + // // If the merged value is larger in size than the 'existing_value' or the // application does not wish to modify the 'existing_value' buffer inplace, // then the merged value should be returned via *merge_value. It is set by // merging the 'existing_value' and the Put 'delta_value'. The callback should // return UpdateStatus::UPDATED in this case. 
This merged value will be added // to the memtable. - + // // If merging fails or the application does not wish to take any action, // then the callback should return UpdateStatus::UPDATE_FAILED. - + // // Please remember that the original call from the application is Put(key, // delta_value). So the transaction log (if enabled) will still contain (key, // delta_value). The 'merged_value' is not stored in the transaction log. // Hence the inplace_callback function should be consistent across db reopens. - + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + // // Default: nullptr UpdateStatus (*inplace_callback)(char* existing_value, uint32_t* existing_value_size, Slice delta_value, std::string* merged_value) = nullptr; - // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, - // create prefix bloom for memtable with the size of + // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic + // Bloom filter in memtable to optimize many queries that must go beyond + // the memtable. The size in bytes of the filter is // write_buffer_size * memtable_prefix_bloom_size_ratio. - // If it is larger than 0.25, it is sanitized to 0.25. + // * If prefix_extractor is set, the filter includes prefixes. + // * If memtable_whole_key_filtering, the filter includes whole keys. + // * If both, the filter includes both. + // * If neither, the feature is disabled. + // + // If this value is larger than 0.25, it is sanitized to 0.25. // // Default: 0 (disable) // @@ -338,7 +434,8 @@ // size of one block in arena memory allocation. // If <= 0, a proper value is automatically calculated (usually 1/8 of - // writer_buffer_size, rounded up to a multiple of 4KB). + // writer_buffer_size, rounded up to a multiple of 4KB, or 1MB which ever is + // smaller). 
// // There are two additional restriction of the specified size: // (1) size should be in the range of [4096, 2 << 30] and @@ -591,8 +688,8 @@ // the tables. // Default: empty vector -- no user-defined statistics collection will be // performed. - typedef std::vector> - TablePropertiesCollectorFactories; + using TablePropertiesCollectorFactories = + std::vector>; TablePropertiesCollectorFactories table_properties_collector_factories; // Maximum number of successive merge operations on a key in the memtable. @@ -624,18 +721,32 @@ // Default: false bool optimize_filters_for_hits = false; + // During flush or compaction, check whether keys inserted to output files + // are in order. + // + // Default: true + // + // Dynamically changeable through SetOptions() API + bool check_flush_compaction_key_order = true; + // After writing every SST file, reopen it and read all the keys. + // Checks the hash of all of the keys and values written versus the + // keys in the file and signals a corruption if they do not match // // Default: false // // Dynamically changeable through SetOptions() API bool paranoid_file_checks = false; - // In debug mode, RocksDB run consistency checks on the LSM every time the LSM - // change (Flush, Compaction, AddFile). These checks are disabled in release - // mode, use this option to enable them in release mode as well. - // Default: false - bool force_consistency_checks = false; + // In debug mode, RocksDB runs consistency checks on the LSM every time the + // LSM changes (Flush, Compaction, AddFile). When this option is true, these + // checks are also enabled in release mode. These checks were historically + // disabled in release mode, but are now enabled by default for proactive + // corruption detection. The CPU overhead is negligible for normal mixed + // operations but can slow down saturated writing. See + // Options::DisableExtraChecks(). 
+ // Default: true + bool force_consistency_checks = true; // Measure IO stats in compactions and flushes, if true. // @@ -644,10 +755,14 @@ // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; - // Files older than TTL will go through the compaction process. + // Files containing updates older than TTL will go through the compaction + // process. This usually happens in a cascading way so that those entries + // will be compacted to bottommost level/file. + // The feature is used to remove stale entries that have been deleted or + // updated from the file system. // Pre-req: This needs max_open_files to be set to -1. // In Level: Non-bottom-level files older than TTL will go through the - // compation process. + // compaction process. // In FIFO: Files older than TTL will be deleted. // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 // In FIFO, this option will have the same meaning as @@ -664,6 +779,9 @@ // Files older than this value will be picked up for compaction, and // re-written to the same level as they were before. + // One main use of the feature is to make sure a file goes through compaction + // filters periodically. Users can also use the feature to clear up SST + // files using old format. // // A file's age is computed by looking at file_creation_time or creation_time // table properties in order, if they have valid non-zero values; if not, the @@ -697,6 +815,100 @@ // data is left uncompressed (unless compression is also requested). uint64_t sample_for_compression = 0; + // EXPERIMENTAL + // The feature is still in development and is incomplete. + // If this option is set, when creating bottommost files, pass this + // temperature to FileSystem used. Should be no-op for default FileSystem + // and users need to plug in their own FileSystem to take advantage of it. 
+ Temperature bottommost_temperature = Temperature::kUnknown; + + // When set, large values (blobs) are written to separate blob files, and + // only pointers to them are stored in SST files. This can reduce write + // amplification for large-value use cases at the cost of introducing a level + // of indirection for reads. See also the options min_blob_size, + // blob_file_size, blob_compression_type, enable_blob_garbage_collection, + // blob_garbage_collection_age_cutoff, + // blob_garbage_collection_force_threshold, and blob_compaction_readahead_size + // below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_files = false; + + // The size of the smallest value to be stored separately in a blob file. + // Values which have an uncompressed size smaller than this threshold are + // stored alongside the keys in SST files in the usual fashion. A value of + // zero for this option means that all values are stored in blob files. Note + // that enable_blob_files has to be set in order for this option to have any + // effect. + // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + uint64_t min_blob_size = 0; + + // The size limit for blob files. When writing blob files, a new file is + // opened once this limit is reached. Note that enable_blob_files has to be + // set in order for this option to have any effect. + // + // Default: 256 MB + // + // Dynamically changeable through the SetOptions() API + uint64_t blob_file_size = 1ULL << 28; + + // The compression algorithm to use for large values stored in blob files. + // Note that enable_blob_files has to be set in order for this option to have + // any effect. + // + // Default: no compression + // + // Dynamically changeable through the SetOptions() API + CompressionType blob_compression_type = kNoCompression; + + // Enables garbage collection of blobs. Blob GC is performed as part of + // compaction. 
Valid blobs residing in blob files older than a cutoff get + // relocated to new files as they are encountered during compaction, which + // makes it possible to clean up blob files once they contain nothing but + // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff and + // blob_garbage_collection_force_threshold below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_garbage_collection = false; + + // The cutoff in terms of blob file age for garbage collection. Blobs in + // the oldest N blob files will be relocated when encountered during + // compaction, where N = garbage_collection_cutoff * number_of_blob_files. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 0.25 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_age_cutoff = 0.25; + + // If the ratio of garbage in the oldest blob files exceeds this threshold, + // targeted compactions are scheduled in order to force garbage collecting + // the blob files in question, assuming they are all eligible based on the + // value of blob_garbage_collection_age_cutoff above. This option is + // currently only supported with leveled compactions. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 1.0 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_force_threshold = 1.0; + + // Compaction readahead for blob files. 
+ // + // Default: 0 + // + // Dynamically changeable through the SetOptions() API + uint64_t blob_compaction_readahead_size = 0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/c.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/c.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/c.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/c.h 2025-05-19 16:14:27.000000000 +0000 @@ -71,8 +71,11 @@ typedef struct rocksdb_t rocksdb_t; typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_backupable_db_options_t rocksdb_backupable_db_options_t; typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; -typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t; +typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t rocksdb_compactionfiltercontext_t; @@ -136,7 +139,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, - unsigned char error_if_log_file_exist, char** errptr); + unsigned char error_if_wal_file_exists, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( const rocksdb_options_t* options, const char* name, @@ -145,6 +148,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* +rocksdb_backup_engine_open_opts(const 
rocksdb_backupable_db_options_t* options, + rocksdb_env_t* env, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); @@ -156,7 +163,7 @@ rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* -rocksdb_restore_options_create(); +rocksdb_restore_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( rocksdb_restore_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( @@ -171,6 +178,11 @@ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, const rocksdb_restore_options_t* restore_options, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, const uint32_t backup_id, + char** errptr); + extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be); @@ -198,6 +210,100 @@ extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( rocksdb_backup_engine_t* be); +/* BackupableDBOptions */ + +extern ROCKSDB_LIBRARY_API rocksdb_backupable_db_options_t* +rocksdb_backupable_db_options_create(const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options); 
+ +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_callback_trigger_interval_size( + 
rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t*); + +/* Checkpoint */ + extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); @@ -214,13 +320,20 @@ const rocksdb_options_t* const* column_family_options, rocksdb_column_family_handle_t** column_family_handles, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families_with_ttl( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char* const* column_family_names, + const rocksdb_options_t* const* column_family_options, + rocksdb_column_family_handle_t** column_family_handles, const int* ttls, + char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only_column_families( const rocksdb_options_t* options, const char* name, int num_column_families, const char* const* column_family_names, const rocksdb_options_t* const* column_family_options, rocksdb_column_family_handle_t** column_family_handles, - unsigned char error_if_log_file_exist, char** errptr); + unsigned char error_if_wal_file_exists, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( const rocksdb_options_t* options, const char* name, @@ -241,6 +354,11 @@ const rocksdb_options_t* 
column_family_options, const char* column_family_name, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_create_column_family_with_ttl( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, int ttl, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family( rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr); @@ -320,6 +438,21 @@ const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs); +// The value is only allocated (using malloc) and returned if it is found and +// value_found isn't NULL. In that case the user is responsible for freeing it. +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist( + rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + +// The value is only allocated (using malloc) and returned if it is found and +// value_found isn't NULL. In that case the user is responsible for freeing it. 
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist_cf( + rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t key_len, char** value, size_t* val_len, const char* timestamp, + size_t timestamp_len, unsigned char* value_found); + extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator( rocksdb_t* db, const rocksdb_readoptions_t* options); @@ -365,13 +498,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( rocksdb_t* db, rocksdb_column_family_handle_t* column_family, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, const char* start_key, @@ -406,6 +539,10 @@ rocksdb_t* db, const rocksdb_flushoptions_t* options, rocksdb_column_family_handle_t* column_family, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, + unsigned char sync, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr); @@ -451,7 +588,8 @@ /* Write batch */ -extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create( + void); extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( const char* rep, size_t size); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( @@ -495,9 +633,14 @@ extern 
ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete( + rocksdb_writebatch_t* b, const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf( rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_singledelete_cf( + rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev( rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); @@ -583,9 +726,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t*, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete( + rocksdb_writebatch_wi_t*, const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf( rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen); +extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete_cf( + rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_deletev( rocksdb_writebatch_wi_t* b, int num_keys, const char* const* keys_list, const size_t* keys_list_sizes); @@ -670,7 +818,7 @@ /* Block based table options */ extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* -rocksdb_block_based_options_create(); +rocksdb_block_based_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( rocksdb_block_based_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( @@ -745,7 +893,7 @@ /* Cuckoo table options */ extern ROCKSDB_LIBRARY_API 
rocksdb_cuckoo_table_options_t* -rocksdb_cuckoo_options_create(); +rocksdb_cuckoo_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( rocksdb_cuckoo_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( @@ -769,8 +917,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr); -extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism( rocksdb_options_t* opt, int total_threads); extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup( @@ -783,12 +933,16 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( rocksdb_options_t*, rocksdb_compactionfilter_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( rocksdb_options_t*, rocksdb_compactionfilterfactory_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_compaction_readahead_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_compaction_readahead_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator( rocksdb_options_t*, rocksdb_comparator_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator( @@ -796,16 +950,24 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator( 
rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level( - rocksdb_options_t* opt, int* level_values, size_t num_levels); + rocksdb_options_t* opt, const int* level_values, size_t num_levels); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_create_if_missing( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_missing_column_families(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_create_missing_column_families(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_error_if_exists( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths); @@ -815,41 +977,98 @@ rocksdb_logger_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_info_log_level( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_write_buffer_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_write_buffer_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_db_write_buffer_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int 
rocksdb_options_get_max_open_files( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_file_opening_threads( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_file_opening_threads( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size( rocksdb_options_t* opt, uint64_t n); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( rocksdb_options_t*, int, int, int, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int, + int, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( + rocksdb_options_t*, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( rocksdb_options_t*, rocksdb_slicetransform_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( rocksdb_options_t*, int); +extern 
ROCKSDB_LIBRARY_API int rocksdb_options_get_num_levels( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_file_num_compaction_trigger(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_level0_stop_writes_trigger( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_mem_compaction_level( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_target_file_size_base(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_target_file_size_multiplier( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_level_compaction_dynamic_level_bytes(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_max_bytes_for_level_multiplier(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t*, int* level_values, size_t num_levels); @@ -858,9 +1077,56 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open( rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( + rocksdb_options_t* opt); + +/* Blob Options Settings */ +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_min_blob_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_file_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( + rocksdb_options_t* opt); + +extern 
ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt, + uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_compaction_readahead_size(rocksdb_options_t* opt); /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( @@ -868,122 +1134,222 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_write_buffer_number( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, int64_t); +extern ROCKSDB_LIBRARY_API int64_t +rocksdb_options_get_max_write_buffer_size_to_maintain(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_pipelined_write(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_unordered_write( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( 
rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_max_subcompactions(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_jobs( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_compactions( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_base_background_compactions( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_base_background_compactions( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_max_background_flushes( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_log_file_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_keep_log_file_num(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit( rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_soft_rate_limit( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_hard_rate_limit( rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_hard_rate_limit( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_pending_compaction_bytes_limit( rocksdb_options_t* opt, size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_pending_compaction_bytes_limit( rocksdb_options_t* opt, size_t v); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_rate_limit_delay_max_milliseconds(rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_rate_limit_delay_max_milliseconds(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_manifest_file_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_table_cache_numshardbits( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_remove_scan_count_limit(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_arena_block_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_use_fsync( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir( rocksdb_options_t*, const char*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*, const char*); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_WAL_ttl_seconds( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_reads( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_allow_mmap_writes( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_direct_reads( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_io_for_flush_and_compaction(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_use_direct_io_for_flush_and_compaction(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char 
+rocksdb_options_get_skip_log_error_on_recovery(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec( rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_persist_period_sec( + rocksdb_options_t*, unsigned int); +extern ROCKSDB_LIBRARY_API unsigned int +rocksdb_options_get_stats_persist_period_sec(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_advise_random_on_open(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_access_hint_on_compaction_start(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_use_adaptive_mutex( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_bytes_per_sync(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_writable_file_max_buffer_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char 
+rocksdb_options_get_allow_concurrent_memtable_write(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_write_thread_adaptive_yield(rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_enable_write_thread_adaptive_yield(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_sequential_skip_in_iterations(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_disable_auto_compactions(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_optimize_filters_for_hits( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_optimize_filters_for_hits(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_delete_obsolete_files_period_micros(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load( rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep( rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_size_ratio( rocksdb_options_t*, double); +extern ROCKSDB_LIBRARY_API double +rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_max_compaction_bytes(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep( rocksdb_options_t*, size_t, int32_t, int32_t); extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_hash_link_list_rep( @@ -996,17 +1362,29 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_huge_page_size( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_max_successive_merges(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality( rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API uint32_t +rocksdb_options_get_bloom_locality(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_inplace_update_support(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_report_bg_io_stats( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_report_bg_io_stats( + rocksdb_options_t*); enum { rocksdb_tolerate_corrupted_tail_records_recovery = 0, @@ -1016,6 +1394,8 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_recovery_mode( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_recovery_mode( + rocksdb_options_t*); enum { rocksdb_no_compression = 0, @@ -1029,6 +1409,12 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compression( + rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression( + rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_bottommost_compression( + 
rocksdb_options_t*); enum { rocksdb_level_compaction = 0, @@ -1037,6 +1423,8 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style( rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_compaction_style( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_universal_compaction_options( rocksdb_options_t*, rocksdb_universal_compaction_options_t*); @@ -1046,11 +1434,21 @@ rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_atomic_flush( rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush( + rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( rocksdb_options_t* opt, rocksdb_cache_t* cache ); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( + rocksdb_options_t* opt); + /* RateLimiter */ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); @@ -1139,7 +1537,8 @@ }; extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); -extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( rocksdb_perfcontext_t* context); extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( @@ -1211,9 +1610,14 @@ rocksdb_filterpolicy_t*); extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* -rocksdb_filterpolicy_create_bloom(int bits_per_key); +rocksdb_filterpolicy_create_bloom(double bits_per_key); +extern 
ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_bloom_full(double bits_per_key); extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* -rocksdb_filterpolicy_create_bloom_full(int bits_per_key); +rocksdb_filterpolicy_create_ribbon(double bloom_equivalent_bits_per_key); +extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* +rocksdb_filterpolicy_create_ribbon_hybrid(double bloom_equivalent_bits_per_key, + int bloom_before_level); /* Merge Operator */ @@ -1237,13 +1641,18 @@ /* Read options */ -extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_verify_checksums(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_fill_cache( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound( @@ -1252,80 +1661,155 @@ rocksdb_readoptions_t*, const char* key, size_t keylen); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_readoptions_get_read_tier( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing( + rocksdb_readoptions_t*); // The functionality that this option controlled has been removed. 
extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( rocksdb_readoptions_t*, size_t); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_prefix_same_as_start(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_pin_data( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_max_skippable_internal_keys( rocksdb_readoptions_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_background_purge_on_iterator_cleanup( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout( + 
rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*); /* Write options */ -extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* -rocksdb_writeoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_sync( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL( rocksdb_writeoptions_t* opt, int disable); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL( + rocksdb_writeoptions_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_ignore_missing_column_families( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_ignore_missing_column_families( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_no_slowdown( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_low_pri( + rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_writeoptions_get_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t*); /* Compact range options */ extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* -rocksdb_compactoptions_create(); +rocksdb_compactoptions_create(void); extern ROCKSDB_LIBRARY_API void 
rocksdb_compactoptions_destroy( rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_exclusive_manual_compaction( rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_exclusive_manual_compaction( + rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_bottommost_level_compaction( rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_bottommost_level_compaction( + rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level( rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_compactoptions_get_change_level(rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level( rocksdb_compactoptions_t*, int); +extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level( + rocksdb_compactoptions_t*); /* Flush options */ -extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* -rocksdb_flushoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( rocksdb_flushoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( rocksdb_flushoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait( + rocksdb_flushoptions_t*); + +/* Memory allocator */ + +extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t* +rocksdb_jemalloc_nodump_allocator_create(char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy( + rocksdb_memory_allocator_t*); /* Cache */ +extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t* +rocksdb_lru_cache_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void 
rocksdb_lru_cache_options_set_capacity( + rocksdb_lru_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*); + extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( + rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( rocksdb_cache_t* cache, size_t capacity); extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_capacity(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache); @@ -1337,12 +1821,24 @@ /* Env */ -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(); -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads( + rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, 
int n); +extern ROCKSDB_LIBRARY_API int +rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); @@ -1352,7 +1848,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); -extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( rocksdb_envoptions_t* opt); @@ -1387,7 +1884,7 @@ rocksdb_sstfilewriter_t* writer); extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* -rocksdb_ingestexternalfileoptions_create(); +rocksdb_ingestexternalfileoptions_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_set_move_files( rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); @@ -1433,7 +1930,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t); extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* -rocksdb_slicetransform_create_noop(); +rocksdb_slicetransform_create_noop(void); extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( rocksdb_slicetransform_t*); @@ -1445,38 +1942,61 @@ }; extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t* -rocksdb_universal_compaction_options_create(); +rocksdb_universal_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_size_ratio( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int 
+rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* -rocksdb_fifo_compaction_options_create(); +rocksdb_fifo_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_set_max_table_files_size( rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts); extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count( const 
rocksdb_livefiles_t*); +extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name( + const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name( const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level( @@ -1522,7 +2042,7 @@ const rocksdb_transactiondb_options_t* txn_db_options, const char* name, char** errptr); -rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( const rocksdb_options_t* options, const rocksdb_transactiondb_options_t* txn_db_options, const char* name, int num_column_families, const char* const* column_family_names, @@ -1535,6 +2055,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_release_snapshot( rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot); +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value( + rocksdb_transactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int( + rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val); + extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* write_options, @@ -1574,7 +2100,7 @@ const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); -char* rocksdb_transaction_get_for_update_cf( +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); @@ -1692,13 +2218,22 @@ const rocksdb_optimistictransaction_options_t* otxn_options, rocksdb_transaction_t* old_txn); +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_write( + rocksdb_optimistictransactiondb_t* 
otxn_db, + const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( rocksdb_optimistictransactiondb_t* otxn_db); +extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* +rocksdb_optimistictransactiondb_checkpoint_object_create( + rocksdb_optimistictransactiondb_t* otxn_db, char** errptr); + /* Transaction Options */ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* -rocksdb_transactiondb_options_create(); +rocksdb_transactiondb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( rocksdb_transactiondb_options_t* opt); @@ -1718,7 +2253,7 @@ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* -rocksdb_transaction_options_create(); +rocksdb_transaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( rocksdb_transaction_options_t* opt); @@ -1744,7 +2279,7 @@ rocksdb_transaction_options_t* opt, size_t size); extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* -rocksdb_optimistictransaction_options_create(); +rocksdb_optimistictransaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( rocksdb_optimistictransaction_options_t* opt); @@ -1753,6 +2288,13 @@ rocksdb_optimistictransaction_options_set_set_snapshot( rocksdb_optimistictransaction_options_t* opt, unsigned char v); +extern ROCKSDB_LIBRARY_API char* rocksdb_optimistictransactiondb_property_value( + rocksdb_optimistictransactiondb_t* db, const char* propname); + +extern ROCKSDB_LIBRARY_API int rocksdb_optimistictransactiondb_property_int( + rocksdb_optimistictransactiondb_t* db, const char* propname, + uint64_t* out_val); + // referring to convention (3), this should be used by client // to free memory that was malloc()ed extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); @@ -1770,7 
+2312,7 @@ const rocksdb_pinnableslice_t* t, size_t* vlen); extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* - rocksdb_memory_consumers_create(); +rocksdb_memory_consumers_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( rocksdb_memory_consumers_t* consumers, rocksdb_t* db); extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( @@ -1796,6 +2338,16 @@ rocksdb_approximate_memory_usage_get_cache_total( rocksdb_memory_usage_t* memory_usage); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_dump_malloc_stats( + rocksdb_options_t*, unsigned char); + +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t*, + unsigned char); + +extern ROCKSDB_LIBRARY_API void rocksdb_cancel_all_background_work( + rocksdb_t* db, unsigned char wait); + #ifdef __cplusplus } /* end extern "C" */ #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,9 +22,11 @@ #pragma once -#include +#include +#include #include #include + #include "rocksdb/memory_allocator.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" @@ -33,6 +35,8 @@ namespace ROCKSDB_NAMESPACE { class Cache; +struct ConfigOptions; +class SecondaryCache; extern const bool kDefaultToAdaptiveMutex; @@ -58,10 +62,10 @@ // Percentage of cache reserved for high priority entries. // If greater than zero, the LRU list will be split into a high-pri - // list and a low-pri list. High-pri entries will be insert to the + // list and a low-pri list. High-pri entries will be inserted to the // tail of high-pri list, while low-pri entries will be first inserted to - // the low-pri list (the midpoint). 
This is refered to as - // midpoint insertion strategy to make entries never get hit in cache + // the low-pri list (the midpoint). This is referred to as + // midpoint insertion strategy to make entries that never get hit in cache // age out faster. // // See also @@ -86,6 +90,9 @@ CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy; + // A SecondaryCache instance to use a the non-volatile tier + std::shared_ptr secondary_cache; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, @@ -125,23 +132,104 @@ // more detail. // // Return nullptr if it is not supported. +// +// BROKEN: ClockCache is known to have bugs that could lead to crash or +// corruption, so should not be used until fixed. Use NewLRUCache instead. extern std::shared_ptr NewClockCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy); + class Cache { public: // Depending on implementation, cache entries with high priority could be less // likely to get evicted than low priority entries. enum class Priority { HIGH, LOW }; + // A set of callbacks to allow objects in the primary block cache to be + // be persisted in a secondary cache. The purpose of the secondary cache + // is to support other ways of caching the object, such as persistent or + // compressed data, that may require the object to be parsed and transformed + // in some way. Since the primary cache holds C++ objects and the secondary + // cache may only hold flat data that doesn't need relocation, these + // callbacks need to be provided by the user of the block + // cache to do the conversion. + // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers + // to callback functions for size, saving and deletion of the + // object. 
The callbacks are defined in C-style in order to make them + // stateless and not add to the cache metadata size. + // Saving multiple std::function objects will take up 32 bytes per + // function, even if its not bound to an object and does no capture. + // + // All the callbacks are C-style function pointers in order to simplify + // lifecycle management. Objects in the cache can outlive the parent DB, + // so anything required for these operations should be contained in the + // object itself. + // + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. 
+ struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + using CreateCallback = std::function; + Cache(std::shared_ptr allocator = nullptr) : memory_allocator_(std::move(allocator)) {} // No copying allowed Cache(const Cache&) = delete; Cache& operator=(const Cache&) = delete; + // Creates a new Cache based on the input value string and returns the result. + // Currently, this method can be used to create LRUCaches only + // @param config_options + // @param value The value might be: + // - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*102( + // - Name-value option pairs -- "capacity=1M; num_shard_bits=4; + // For the LRUCache, the values are defined in LRUCacheOptions. + // @param result The new Cache object + // @return OK if the cache was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + // Destroys all existing entries by calling the "deleter" // function that was passed via the Insert() function. // @@ -154,8 +242,8 @@ // The type of the Cache virtual const char* Name() const = 0; - // Insert a mapping from key->value into the cache and assign it - // the specified charge against the total cache capacity. 
+ // Insert a mapping from key->value into the volatile cache only + // and assign it // the specified charge against the total cache capacity. // If strict_capacity_limit is true and cache reaches its full capacity, // return Status::Incomplete. // @@ -168,10 +256,11 @@ // insert. In case of error value will be cleanup. // // When the inserted entry is no longer needed, the key and - // value will be passed to "deleter". + // value will be passed to "deleter" which must delete the value. + // (The Cache is responsible for copying and reclaiming space for + // the key.) virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, + DeleterFn deleter, Handle** handle = nullptr, Priority priority = Priority::LOW) = 0; // If the cache has no mapping for "key", returns nullptr. @@ -248,6 +337,12 @@ // returns the charge for the specific entry in the cache. virtual size_t GetCharge(Handle* handle) const = 0; + // Returns the deleter for the specified entry. This might seem useless + // as the Cache itself is responsible for calling the deleter, but + // the deleter can essentially verify that a cache entry is of an + // expected type from an expected code source. + virtual DeleterFn GetDeleter(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. @@ -257,11 +352,33 @@ // default implementation is noop } - // Apply callback to all entries in the cache - // If thread_safe is true, it will also lock the accesses. 
Otherwise, it will - // access the cache without the lock held - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // DEPRECATED version of above. (Default implementation uses above.) + virtual void ApplyToAllCacheEntries(void (*callback)(void* value, + size_t charge), + bool /*thread_safe*/) { + ApplyToAllEntries([callback](const Slice&, void* value, size_t charge, + DeleterFn) { callback(value, charge); }, + {}); + } // Remove all entries. // Prerequisite: no entry is referenced. @@ -271,6 +388,108 @@ MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + // EXPERIMENTAL + // The following APIs are experimental and might change in the future. + // The Insert and Lookup APIs below are intended to allow cached objects + // to be demoted/promoted between the primary block cache and a secondary + // cache. The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation more suitable + // for on disk storage. They rely on a per object CacheItemHelper to do + // the conversions. 
+ // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::Incomplete. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or promoted to the secondary cache. It, + // therefore, must outlive the cache. + // + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. + // + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". 
+ virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) { + if (!helper) { + return Status::InvalidArgument(); + } + return Insert(key, value, charge, helper->del_cb, handle, priority); + } + + // Lookup the key in the primary and secondary caches (if one is configured). + // The create_cb callback function object will be used to contruct the + // cached object. + // If none of the caches have the mapping for the key, returns nullptr. + // Else, returns a handle that corresponds to the mapping. + // + // This call may promote the object from the secondary cache (if one is + // configured, and has the given key) to the primary cache. + // + // The helper argument should be provided if the caller wants the lookup + // to include the secondary cache (if one is configured) and the object, + // if it exists, to be promoted to the primary cache. The helper may be + // saved and used later when the object is evicted. Therefore, it must + // outlive the cache. + // + // The handle returned may not be ready. The caller should call IsReady() + // to check if the item value is ready, and call Wait() or WaitAll() if + // its not ready. The caller should then call Value() to check if the + // item was successfully retrieved. If unsuccessful (perhaps due to an + // IO error), Value() will return nullptr. + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/, + const CreateCallback& /*create_cb*/, + Priority /*priority*/, bool /*wait*/, + Statistics* stats = nullptr) { + return Lookup(key, stats); + } + + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. 
+ virtual bool Release(Handle* handle, bool /*useful*/, bool force_erase) { + return Release(handle, force_erase); + } + + // Determines if the handle returned by Lookup() has a valid value yet. The + // call is not thread safe and should be called only by someone holding a + // reference to the handle. + virtual bool IsReady(Handle* /*handle*/) { return true; } + + // If the handle returned by Lookup() is not ready yet, wait till it + // becomes ready. + // Note: A ready handle doesn't necessarily mean it has a valid value. The + // user should call Value() and check for nullptr. + virtual void Wait(Handle* /*handle*/) {} + + // Wait for a vector of handles to become ready. As with Wait(), the user + // should check the Value() of each handle for nullptr. This call is not + // thread safe and should only be called by the caller holding a reference + // to each of the handles. + virtual void WaitAll(std::vector& /*handles*/) {} + private: std::shared_ptr memory_allocator_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cache_bench_tool.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,14 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +int cache_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/cleanable.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,7 +30,7 @@ // // Note that unlike all of the preceding methods, this method is // not abstract and therefore clients should not override it. - typedef void (*CleanupFunction)(void* arg1, void* arg2); + using CleanupFunction = void (*)(void* arg1, void* arg2); void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); void DelegateCleanupsTo(Cleanable* other); // DoCleanup and also resets the pointers for reuse diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_filter.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,26 +13,22 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class Slice; class SliceTransform; -// Context information of a compaction run -struct CompactionFilterContext { - // Does this compaction run include all data files - bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process - bool is_manual_compaction; -}; - -// CompactionFilter allows an application to modify/delete 
a key-value at -// the time of compaction. - -class CompactionFilter { +// CompactionFilter allows an application to modify/delete a key-value during +// table file creation. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilter : public Customizable { public: enum ValueType { kValue, @@ -45,35 +41,44 @@ kRemove, kChangeValue, kRemoveAndSkipUntil, + kChangeBlobIndex, // used internally by BlobDB. + kIOError, // used internally by BlobDB. + kUndetermined, }; enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; - // Context information of a compaction run + // Context information for a table file creation. struct Context { - // Does this compaction run include all data files + // Whether this table file is created as part of a compaction including all + // table files. bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process + // Whether this table file is created as part of a compaction requested by + // the client. bool is_manual_compaction; - // Which column family this compaction is for. + // The column family that will contain the created table file. uint32_t column_family_id; + // Reason this table file is being created. + TableFileCreationReason reason; }; virtual ~CompactionFilter() {} - - // The compaction process invokes this - // method for kv that is being compacted. A return value - // of false indicates that the kv should be preserved in the - // output of this compaction run and a return value of true - // indicates that this key-value should be removed from the - // output of the compaction. The application can inspect - // the existing value of the key and make decision based on it. 
- // - // Key-Values that are results of merge operation during compaction are not - // passed into this function. Currently, when you have a mix of Put()s and - // Merge()s on a same key, we only guarantee to process the merge operands - // through the compaction filters. Put()s might be processed, or might not. + static const char* Type() { return "CompactionFilter"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& name, + const CompactionFilter** result); + + // The table file creation process invokes this method before adding a kv to + // the table file. A return value of false indicates that the kv should be + // preserved in the new table file and a return value of true indicates + // that this key-value should be removed from the new table file. The + // application can inspect the existing value of the key and make decision + // based on it. + // + // Key-Values that are results of merge operation during table file creation + // are not passed into this function. Currently, when you have a mix of Put()s + // and Merge()s on a same key, we only guarantee to process the merge operands + // through the `CompactionFilter`s. Put()s might be processed, or might not. // // When the value is to be preserved, the application has the option // to modify the existing_value and pass it back through new_value. @@ -81,9 +86,10 @@ // // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a // DB* object) will not guarantee to preserve the state of the DB with - // CompactionFilter. Data seen from a snapshot might disppear after a - // compaction finishes. If you use snapshots, think twice about whether you - // want to use compaction filter and whether you are using it in a safe way. + // CompactionFilter. Data seen from a snapshot might disappear after a + // table file created with a `CompactionFilter` is installed. 
If you use + // snapshots, think twice about whether you want to use `CompactionFilter` and + // whether you are using it in a safe way. // // If multithreaded compaction is being used *and* a single CompactionFilter // instance was supplied via Options::compaction_filter, this method may be @@ -91,7 +97,7 @@ // that the call is thread-safe. // // If the CompactionFilter was created by a factory, then it will only ever - // be used by a single thread that is doing the compaction run, and this + // be used by a single thread that is doing the table file creation, and this // call does not need to be thread-safe. However, multiple filters may be // in existence and operating concurrently. virtual bool Filter(int /*level*/, const Slice& /*key*/, @@ -101,9 +107,9 @@ return false; } - // The compaction process invokes this method on every merge operand. If this - // method returns true, the merge operand will be ignored and not written out - // in the compaction output + // The table file creation process invokes this method on every merge operand. + // If this method returns true, the merge operand will be ignored and not + // written out in the new table file. // // Note: If you are using a TransactionDB, it is not recommended to implement // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB @@ -140,14 +146,16 @@ // snapshot - beware if you're using TransactionDB or // DB::GetSnapshot(). // - If value for a key was overwritten or merged into (multiple Put()s - // or Merge()s), and compaction filter skips this key with + // or Merge()s), and `CompactionFilter` skips this key with // kRemoveAndSkipUntil, it's possible that it will remove only // the new value, exposing the old value that was supposed to be // overwritten. // - Doesn't work with PlainTableFactory in prefix mode. - // - If you use kRemoveAndSkipUntil, consider also reducing - // compaction_readahead_size option. 
+ // - If you use kRemoveAndSkipUntil for table files created by + // compaction, consider also reducing compaction_readahead_size + // option. // + // Should never return kUndetermined. // Note: If you are using a TransactionDB, it is not recommended to filter // out or modify merge operands (ValueType::kMergeOperand). // If a merge operation is filtered out, TransactionDB may not realize there @@ -185,28 +193,62 @@ } // This function is deprecated. Snapshots will always be ignored for - // compaction filters, because we realized that not ignoring snapshots doesn't - // provide the gurantee we initially thought it would provide. Repeatable - // reads will not be guaranteed anyway. If you override the function and - // returns false, we will fail the compaction. + // `CompactionFilter`s, because we realized that not ignoring snapshots + // doesn't provide the guarantee we initially thought it would provide. + // Repeatable reads will not be guaranteed anyway. If you override the + // function and returns false, we will fail the table file creation. virtual bool IgnoreSnapshots() const { return true; } - // Returns a name that identifies this compaction filter. + // Returns a name that identifies this `CompactionFilter`. // The name will be printed to LOG file on start up for diagnosis. - virtual const char* Name() const = 0; + const char* Name() const override = 0; + + // Internal (BlobDB) use only. Do not override in application code. + virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; } + + // In the case of BlobDB, it may be possible to reach a decision with only + // the key without reading the actual value. Keys whose value_type is + // kBlobIndex will be checked by this method. + // Returning kUndetermined will cause FilterV2() to be called to make a + // decision as usual. 
+ virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/, + std::string* /*new_value*/, + std::string* /*skip_until*/) const { + return Decision::kUndetermined; + } }; -// Each compaction will create a new CompactionFilter allowing the -// application to know about different compactions -class CompactionFilterFactory { +// Each thread of work involving creating table files will create a new +// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This +// allows the application to know about the different ongoing threads of work +// and makes it unnecessary for `CompactionFilter` to provide thread-safety. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class CompactionFilterFactory : public Customizable { public: virtual ~CompactionFilterFactory() {} + static const char* Type() { return "CompactionFilterFactory"; } + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& name, + std::shared_ptr* result); + + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. + virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) = 0; - // Returns a name that identifies this compaction filter factory. - virtual const char* Name() const = 0; + // Returns a name that identifies this `CompactionFilter` factory. 
+ virtual const char* Name() const override = 0; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,23 +25,33 @@ // the number of compaction input records. uint64_t num_input_records; - // the number of compaction input files. + // the number of blobs read from blob files + uint64_t num_blobs_read; + // the number of compaction input files (table files) size_t num_input_files; - // the number of compaction input files at the output level. + // the number of compaction input files at the output level (table files) size_t num_input_files_at_output_level; // the number of compaction output records. uint64_t num_output_records; - // the number of compaction output files. + // the number of compaction output files (table files) size_t num_output_files; + // the number of compaction output files (blob files) + size_t num_output_files_blob; + // true if the compaction is a full compaction (all live SST files input) + bool is_full_compaction; // true if the compaction is a manual compaction bool is_manual_compaction; - // the size of the compaction input in bytes. + // the total size of table files in the compaction input uint64_t total_input_bytes; - // the size of the compaction output in bytes. + // the total size of blobs read from blob files + uint64_t total_blob_bytes_read; + // the total size of table files in the compaction output uint64_t total_output_bytes; + // the total size of blob files in the compaction output + uint64_t total_output_bytes_blob; // number of records being replaced by newer record associated with same key. 
// this could be a new value or a deletion entry for that key so this field diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/comparator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/comparator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/comparator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/comparator.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { @@ -20,7 +21,11 @@ // used as keys in an sstable or a database. A Comparator implementation // must be thread-safe since rocksdb may invoke its methods concurrently // from multiple threads. -class Comparator { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Comparator : public Customizable { public: Comparator() : timestamp_size_(0) {} @@ -35,13 +40,20 @@ return *this; } - virtual ~Comparator() {} + ~Comparator() override {} + static Status CreateFromString(const ConfigOptions& opts, + const std::string& id, + const Comparator** comp); static const char* Type() { return "Comparator"; } + // Three-way comparison. Returns value: // < 0 iff "a" < "b", // == 0 iff "a" == "b", // > 0 iff "a" > "b" + // Note that Compare(a, b) also compares timestamp if timestamp size is + // non-zero. For the same user key with different timestamps, larger (newer) + // timestamp comes first. virtual int Compare(const Slice& a, const Slice& b) const = 0; // Compares two slices for equality. The following invariant should always @@ -63,7 +75,7 @@ // // Names starting with "rocksdb." are reserved and should not be used // by any clients of this package. 
- virtual const char* Name() const = 0; + const char* Name() const override = 0; // Advanced functions: these are used to reduce the space requirements // for internal data structures like index blocks. @@ -97,15 +109,34 @@ inline size_t timestamp_size() const { return timestamp_size_; } - virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { - return Compare(a, b); + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); } + // For two events e1 and e2 whose timestamps are t1 and t2 respectively, + // Returns value: + // < 0 iff t1 < t2 + // == 0 iff t1 == t2 + // > 0 iff t1 > t2 + // Note that an all-zero byte array will be the smallest (oldest) timestamp + // of the same length, and a byte array with all bits 1 will be the largest. + // In the future, we can extend Comparator so that subclasses can specify + // both largest and smallest timestamps. virtual int CompareTimestamp(const Slice& /*ts1*/, const Slice& /*ts2*/) const { return 0; } + virtual int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, + const Slice& b, bool /*b_has_ts*/) const { + return Compare(a, b); + } + + virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return 0 == + CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + private: size_t timestamp_size_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/compression_type.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. + +enum CompressionType : unsigned char { + // NOTE: do not change the values of existing entries, as these are + // part of the persistent format on disk. + kNoCompression = 0x0, + kSnappyCompression = 0x1, + kZlibCompression = 0x2, + kBZip2Compression = 0x3, + kLZ4Compression = 0x4, + kLZ4HCCompression = 0x5, + kXpressCompression = 0x6, + kZSTD = 0x7, + + // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than + // 0.8.0 or consider a possibility of downgrading the service or copying + // the database files to another service running with an older version of + // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will + // eventually remove the option from the public API. + kZSTDNotFinalCompression = 0x40, + + // kDisableCompressionOption is used to disable some compression options. 
+ kDisableCompressionOption = 0xff, +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/concurrent_task_limiter.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,11 +9,16 @@ #pragma once -#include "rocksdb/env.h" -#include "rocksdb/statistics.h" +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { +// This is NOT an extensible interface but a public interface for result of +// NewConcurrentTaskLimiter. Any derived classes must be RocksDB internal. class ConcurrentTaskLimiter { public: virtual ~ConcurrentTaskLimiter() {} @@ -33,7 +38,7 @@ virtual int32_t GetOutstandingTask() const = 0; }; -// Create a ConcurrentTaskLimiter that can be shared with mulitple CFs +// Create a ConcurrentTaskLimiter that can be shared with multiple CFs // across RocksDB instances to control concurrent tasks. // // @param name: Name of the limiter. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/configurable.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/configurable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/configurable.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/configurable.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,397 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class ObjectRegistry; +class OptionTypeInfo; +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; + +// Configurable is a base class used by the rocksdb that describes a +// standard way of configuring objects. A Configurable object can: +// -> Populate itself given: +// - One or more "name/value" pair strings +// - A string representing the set of name=value properties +// - A map of name/value properties. +// -> Convert itself into its string representation +// -> Dump itself to a Logger +// -> Compare itself to another Configurable object to see if the two objects +// have equivalent options settings +// +// If a derived class calls RegisterOptions to register (by name) how its +// options objects are to be processed, this functionality can typically be +// handled by this class without additional overrides. Otherwise, the derived +// class will need to implement the methods for handling the corresponding +// functionality. +class Configurable { + protected: + friend class ConfigurableHelper; + struct RegisteredOptions { + // The name of the options being registered + std::string name; + // Pointer to the object being registered + void* opt_ptr; +#ifndef ROCKSDB_LITE + // The map of options being registered + const std::unordered_map* type_map; +#endif + }; + + public: + virtual ~Configurable() {} + + // Returns the raw pointer of the named options that is used by this + // object, or nullptr if this function is not supported. + // Since the return value is a raw pointer, the object owns the + // pointer and the caller should not delete the pointer. 
+ // + // Note that changing the underlying options while the object + // is currently used by any open DB is undefined behavior. + // Developers should use DB::SetOption() instead to dynamically change + // options while the DB is open. + template + const T* GetOptions() const { + return GetOptions(T::kName()); + } + template + T* GetOptions() { + return GetOptions(T::kName()); + } + template + const T* GetOptions(const std::string& name) const { + return reinterpret_cast(GetOptionsPtr(name)); + } + template + T* GetOptions(const std::string& name) { + return reinterpret_cast(const_cast(GetOptionsPtr(name))); + } + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. + // If this method fails, an attempt is made to revert the object to original + // state. Note that the revert may not be the original state but may be an + // equivalent. For example, if the object contains an option that is a + // shared_ptr, the shared_ptr may not be the original one but a copy (e.g. not + // the Cache object that was passed in, but a Cache object of the same size). + // + // The acceptable values of the name/value pairs are documented with the + // specific class/instance. + // + // @param config_options Controls how the arguments are processed. + // @param opt_map Name/value pairs of the options to update + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all values in the map were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names in the opt_map were not valid + // for this object. If unused is specified, it will contain the + // collection of NotFound names. 
+ // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + // @see ConfigOptions for a description of the controls. + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map); + Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Updates the named option to the input value, returning OK if successful. + // Note that ConfigureOption does not cause PrepareOptions to be invoked. + // @param config_options Controls how the name/value is processed. + // @param name The name of the option to update + // @param value The value to set for the named option + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @return NotSupported If the name is valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If the value cannot be successfully parsed. + Status ConfigureOption(const ConfigOptions& config_options, + const std::string& name, const std::string& value); +#endif // ROCKSDB_LITE + + // Configures the options for this class based on the input parameters. + // On successful completion, the object is updated with the settings from + // the opt_map. If this method fails, an attempt is made to revert the + // object to original state. Note that the revert may not be the original + // state but may be an equivalent. 
+ // @see ConfigureFromMap for more details + // @param config_options Controls how the arguments are processed. + // @param opt_str string containing the values to update. + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all specified values were successfully updated + // If invoke_prepare_options is true, OK also implies + // PrepareOptions ran successfully. + // @return NotFound If any of the names were not valid for this object. + // If unused is specified, it will contain the collection of NotFound + // names. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + Status ConfigureFromString(const ConfigOptions& config_options, + const std::string& opts); + + // Fills in result with the serialized options for this object. + // This is the inverse of ConfigureFromString. + // @param config_options Controls how serialization happens. + // @param result The string representation of this object. + // @return OK If the options for this object were successfully serialized. + // @return InvalidArgument If one or more of the options could not be + // serialized. + Status GetOptionString(const ConfigOptions& config_options, + std::string* result) const; +#ifndef ROCKSDB_LITE + // Returns the serialized options for this object. + // This method is similar to GetOptionString with no errors. + // @param config_options Controls how serialization happens. + // @param prefix A string to prepend to every option. 
+ // @return The serialized representation of the options for this object + std::string ToString(const ConfigOptions& config_options) const { + return ToString(config_options, ""); + } + std::string ToString(const ConfigOptions& config_options, + const std::string& prefix) const; + + // Returns the list of option names associated with this configurable + // @param config_options Controls how the names are returned + // @param result The set of option names for this object. Note that + // options that are deprecated or aliases are not returned. + // @return OK on success. + Status GetOptionNames(const ConfigOptions& config_options, + std::unordered_set* result) const; + + // Returns the value of the option associated with the input name + // This method is the functional inverse of ConfigureOption + // @param config_options Controls how the value is returned + // @param name The name of the option to return a value for. + // @param value The returned value associated with the named option. + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @param InvalidArgument If the name is valid for this object but + // its value cannot be serialized. + virtual Status GetOption(const ConfigOptions& config_options, + const std::string& name, std::string* value) const; +#endif // ROCKSDB_LITE + + // Checks to see if this Configurable is equivalent to other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. 
+ virtual bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* name) const; + + // Returns a pretty-printed, human-readable version of the options. + // This method is typically used to dump the options to a log file. + // Classes should override this method + virtual std::string GetPrintableOptions() const { return ""; } + + // Validates that the settings are valid/consistent and performs any object + // initialization required by this object. This method may be called as part + // of Configure (if invoke_prepare_options is set), or may be invoked + // separately. + // + // Once an object has been prepared, non-mutable options can no longer be + // updated. + // + // Classes must override this method to provide any implementation-specific + // initialization, such as opening log files or setting up cache parameters. + // Implementations should be idempotent (e.g. don't re-open the log file or + // reconfigure the cache), as there is the potential this method can be called + // more than once. + // + // By default, this method will also prepare all nested (Inner and + // OptionType::kConfigurable) objects. + // + // @param config_options Controls how the object is prepared. Also contains + // a Logger and Env that can be used to initialize this object. + // @return OK If the object was successfully initialized. + // @return InvalidArgument If this object could not be successfully + // initialized. + virtual Status PrepareOptions(const ConfigOptions& config_options); + + // Checks to see if the settings are valid for this object. + // This method checks to see if the input DBOptions and ColumnFamilyOptions + // are valid for the settings of this object. For example, an Env might not + // support certain mmap modes or a TableFactory might require certain + // settings. + // + // By default, this method will also validate all nested (Inner and + // OptionType::kConfigurable) objects. 
+ // + // @param db_opts The DBOptions to validate + // @param cf_opts The ColumnFamilyOptions to validate + // @return OK if the options are valid + // @return InvalidArgument If the arguments are not valid for the options + // of the current object. + virtual Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const; + + // Splits the input opt_value into the ID field and the remaining options. + // The input opt_value can be in the form of "name" or "name=value + // [;name=value]". The first form uses the "name" as an id with no options The + // latter form converts the input into a map of name=value pairs and sets "id" + // to the "id" value from the map. + // @param opt_value The value to split into id and options + // @param id The id field from the opt_value + // @param options The remaining name/value pairs from the opt_value + // @param default_id If specified and there is no id field in the map, this + // value is returned as the ID + // @return OK if the value was converted to a map successfully and an ID was + // found. + // @return InvalidArgument if the value could not be converted to a map or + // there was or there is no id property in the map. + static Status GetOptionsMap( + const std::string& opt_value, const std::string& default_id, + std::string* id, std::unordered_map* options); + + protected: + // Returns the raw pointer for the associated named option. + // The name is typically the name of an option registered via the + // Classes may override this method to provide further specialization (such as + // returning a sub-option) + // + // The default implementation looks at the registered options. If the + // input name matches that of a registered option, the pointer registered + // with that name is returned. 
+ // e.g,, RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns + // "my_ptr" + virtual const void* GetOptionsPtr(const std::string& name) const; + + // Method for allowing options to be configured outside of the normal + // registered options framework. Classes may override this method if they + // wish to support non-standard options implementations (such as configuring + // themselves from constant or simple ":"-separated strings. + // + // The default implementation does nothing and returns OK + virtual Status ParseStringOptions(const ConfigOptions& config_options, + const std::string& opts_str); + + // Internal method to configure an object from a map of name-value options. + // This method uses the input config_options to drive the configuration of + // the options in opt_map. Any option name that cannot be found from the + // input set will be returned in "unused". + // + // Classes may override this method to extend the functionality if required. + // @param config_options Controls how the options are configured and errors + // handled. + // @param opts_map The set of options to configure + // @param unused Any options from opt_map that were not configured. + // @returns a Status based on the rules outlined in ConfigureFromMap + virtual Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Method that configures a the specific opt_name from opt_value. + // By default, this method calls opt_info.ParseOption with the + // input parameters. + // Classes may override this method to extend the functionality, or + // change the returned Status. 
+ virtual Status ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, void* opt_ptr); + + // Internal method to see if the single option name/info matches for this and + // that Classes may override this value to change its behavior. + // @param config_options Controls how the options are being matched + // @param opt_info The OptionTypeInfo registered for this option name + // that controls what field is matched (offset) and how (type). + // @param name The name associated with this opt_info. + // @param this_ptr The base pointer to compare to. This is the object + // registered for + // for this OptionTypeInfo. + // @param that_ptr The other pointer to compare to. This is the object + // registered for + // for this OptionTypeInfo. + // @param bad_name If the match fails, the name of the option that failed to + // match. + virtual bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& name, + const void* const this_ptr, + const void* const that_ptr, + std::string* bad_name) const; +#endif +#ifndef ROCKSDB_LITE + // Internal method to serialize options (ToString) + // Classes may override this value to change its behavior. + virtual std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const; +#endif // ROCKSDB_LITE + + // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) + virtual std::string GetOptionName(const std::string& long_name) const; + + // Registers the input name with the options and associated map. + // When classes register their options in this manner, most of the + // functionality (excluding unknown options and validate/prepare) is + // implemented by the base class. + // + // This method should be called in the class constructor to register the + // option set for this object. 
For example, to register the options + // associated with the BlockBasedTableFactory, the constructor calls this + // method passing in: + // - the name of the options ("BlockBasedTableOptions"); + // - the options object (the BlockBasedTableOptions object for this object; + // - the options type map for the BlockBasedTableOptions. + // This registration allows the Configurable class to process the option + // values associated with the BlockBasedTableOptions without further code in + // the derived class. + // + // @param name The name of this set of options (@see GetOptionsPtr) + // @param opt_ptr Pointer to the options to associate with this name + // @param opt_map Options map that controls how this option is configured. + template + void RegisterOptions( + T* opt_ptr, + const std::unordered_map* opt_map) { + RegisterOptions(T::kName(), opt_ptr, opt_map); + } + void RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* opt_map); + + private: + // Contains the collection of options (name, opt_ptr, opt_map) associated with + // this object. 
This collection is typically set in the constructor of the + // Configurable option via + std::vector options_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/convenience.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/convenience.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/convenience.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/convenience.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,15 +9,108 @@ #include #include +#include "rocksdb/compression_type.h" #include "rocksdb/db.h" -#include "rocksdb/options.h" +#include "rocksdb/status.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { +class Env; +class Logger; +class ObjectRegistry; + +struct ColumnFamilyOptions; +struct DBOptions; +struct Options; + +// ConfigOptions containing the parameters/controls for +// comparing objects and converting to/from strings. +// These settings control how the methods +// treat errors (e.g. ignore_unknown_objects), the format +// of the serialization (e.g. delimiter), and how to compare +// options (sanity_level). +struct ConfigOptions { + // Constructs a new ConfigOptions with a new object registry. + // This method should only be used when a DBOptions is not available, + // else registry settings may be lost + ConfigOptions(); + + // Constructs a new ConfigOptions using the settings from + // the input DBOptions. Currently constructs a new object registry. + explicit ConfigOptions(const DBOptions&); + + // This enum defines the RocksDB options sanity level. + enum SanityLevel : unsigned char { + kSanityLevelNone = 0x01, // Performs no sanity check at all. + // Performs minimum check to ensure the RocksDB instance can be + // opened without corrupting / mis-interpreting the data. + kSanityLevelLooselyCompatible = 0x02, + // Perform exact match sanity check. 
+ kSanityLevelExactMatch = 0xFF, + }; + + enum Depth { + kDepthDefault, // Traverse nested options that are not flagged as "shallow" + kDepthShallow, // Do not traverse into any nested options + kDepthDetailed, // Traverse nested options, overriding the options shallow + // setting + }; + + // When true, any unused options will be ignored and OK will be returned + bool ignore_unknown_options = false; + + // When true, any unsupported options will be ignored and OK will be returned + bool ignore_unsupported_options = true; + + // If the strings are escaped (old-style?) + bool input_strings_escaped = true; + + // Whether or not to invoke PrepareOptions after configure is called. + bool invoke_prepare_options = true; + + // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not. + // When "mutable_options_only=false", all options are evaluated. + // When "mutable_options_only="true", any option not marked as Mutable is + // either ignored (in the case of string/equals methods) or results in an + // error (in the case of Configure). + bool mutable_options_only = false; + + // The separator between options when converting to a string + std::string delimiter = ";"; + + // Controls how to traverse options during print/match stages + Depth depth = Depth::kDepthDefault; + + // Controls how options are serialized + // Controls how pedantic the comparison must be for equivalency + SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch; + // `file_readahead_size` is used for readahead for the option file. 
+ size_t file_readahead_size = 512 * 1024; + + // The environment to use for this option + Env* env = Env::Default(); + +#ifndef ROCKSDB_LITE + // The object registry to use for this options + std::shared_ptr registry; +#endif + + bool IsShallow() const { return depth == Depth::kDepthShallow; } + bool IsDetailed() const { return depth == Depth::kDepthDetailed; } + + bool IsCheckDisabled() const { + return sanity_level == SanityLevel::kSanityLevelNone; + } + + bool IsCheckEnabled(SanityLevel level) const { + return (level > SanityLevel::kSanityLevelNone && level <= sanity_level); + } +}; #ifndef ROCKSDB_LITE + // The following set of functions provide a way to construct RocksDB Options -// from a string or a string-to-string map. Here're the general rule of +// from a string or a string-to-string map. Here is the general rule of // setting option values from strings by type. Some RocksDB types are also // supported in these APIs. Please refer to the comment of the function itself // to find more information about how to config those RocksDB types. @@ -73,7 +166,7 @@ // ColumnFamilyOptions "new_options". // // Below are the instructions of how to config some non-primitive-typed -// options in ColumnFOptions: +// options in ColumnFamilyOptions: // // * table_factory: // table_factory can be configured using our custom nested-option syntax. @@ -115,7 +208,7 @@ // * {"memtable", "skip_list:5"} is equivalent to setting // memtable to SkipListFactory(5). // - PrefixHash: -// Pass "prfix_hash:" to config memtable +// Pass "prefix_hash:" to config memtable // to use PrefixHash, or simply "prefix_hash" to use the default // PrefixHash. // [Example]: @@ -134,13 +227,6 @@ // [Example]: // * {"memtable", "vector:1024"} is equivalent to setting memtable // to VectorRepFactory(1024). -// - HashCuckooRepFactory: -// Pass "cuckoo:" to use HashCuckooRepFactory with the -// specified write buffer size, or simply "cuckoo" to use the default -// HashCuckooRepFactory. 
-// [Example]: -// * {"memtable", "cuckoo:1024"} is equivalent to setting memtable -// to NewHashCuckooRepFactory(1024). // // * compression_opts: // Use "compression_opts" to config compression_opts. The value format @@ -153,6 +239,12 @@ // cf_opt.compression_opts.strategy = 6; // cf_opt.compression_opts.max_dict_bytes = 7; // +// The GetColumnFamilyOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param base_options the default options of the output "new_options". // @param opts_map an option name to value map for specifying how "new_options" // should be set. @@ -165,6 +257,17 @@ // instead of resulting in an unknown-option error. // @return Status::OK() on success. Otherwise, a non-ok status indicating // error will be returned, and "new_options" will be set to "base_options". +// @return Status::NotFound means the one (or more) of the option name in +// the opts_map is not valid for this option +// @return Status::NotSupported means we do not know how to parse one of the +// value for this option +// @return Status::InvalidArgument means the one of the option values is not +// valid for this option. +Status GetColumnFamilyOptionsFromMap( + const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options); Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, @@ -184,6 +287,12 @@ // - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to // passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec. // +// The GetDBOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. 
The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param base_options the default options of the output "new_options". // @param opts_map an option name to value map for specifying how "new_options" // should be set. @@ -196,6 +305,16 @@ // instead of resulting in an unknown-option error. // @return Status::OK() on success. Otherwise, a non-ok status indicating // error will be returned, and "new_options" will be set to "base_options". +// @return Status::NotFound means the one (or more) of the option name in +// the opts_map is not valid for this option +// @return Status::NotSupported means we do not know how to parse one of the +// value for this option +// @return Status::InvalidArgument means the one of the option values is not +// valid for this option. +Status GetDBOptionsFromMap( + const ConfigOptions& cfg_options, const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options); Status GetDBOptionsFromMap( const DBOptions& base_options, const std::unordered_map& opts_map, @@ -227,6 +346,12 @@ // - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is // equivalent to setting block_cache using NewLRUCache(1024 * 1024). // +// The GetBlockBasedTableOptionsFromMap(ConfigOptions, ...) should be used; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding +// options in the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param table_options the default options of the output "new_table_options". // @param opts_map an option name to value map for specifying how // "new_table_options" should be set. @@ -241,6 +366,11 @@ // error will be returned, and "new_table_options" will be set to // "table_options". 
Status GetBlockBasedTableOptionsFromMap( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options); +Status GetBlockBasedTableOptionsFromMap( const BlockBasedTableOptions& table_options, const std::unordered_map& opts_map, BlockBasedTableOptions* new_table_options, @@ -250,6 +380,12 @@ // map "opts_map" of option name to option value to construct the new // PlainTableOptions "new_table_options". // +// The GetPlainTableOptionsFromMap(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +// +// @param config_options controls how the map is processed. // @param table_options the default options of the output "new_table_options". // @param opts_map an option name to value map for specifying how // "new_table_options" should be set. @@ -264,12 +400,16 @@ // error will be returned, and "new_table_options" will be set to // "table_options". Status GetPlainTableOptionsFromMap( + const ConfigOptions& config_options, const PlainTableOptions& table_options, + const std::unordered_map& opts_map, + PlainTableOptions* new_table_options); +Status GetPlainTableOptionsFromMap( const PlainTableOptions& table_options, const std::unordered_map& opts_map, PlainTableOptions* new_table_options, bool input_strings_escaped = false, bool ignore_unknown_options = false); -// Take a string representation of option names and values, apply them into the +// Take a string representation of option names and values, apply them into the // base_options, and return the new options as a result. 
The string has the // following format: // "write_buffer_size=1024;max_write_buffer_number=2" @@ -277,22 +417,43 @@ // BlockBasedTableOptions as part of the string for block-based table factory: // "write_buffer_size=1024;block_based_table_factory={block_size=4k};" // "max_write_buffer_num=2" +// +// +// The GetColumnFamilyOptionsFromString(ConfigOptions, ...) should be used; the +// alternative signature may be deprecated in a future release. The equivalent +// functionality can be achieved by setting the corresponding options in +// the ConfigOptions parameter. +Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options, const std::string& opts_str, ColumnFamilyOptions* new_options); +Status GetDBOptionsFromString(const ConfigOptions& config_options, + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); + Status GetDBOptionsFromString(const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options); +Status GetStringFromDBOptions(const ConfigOptions& config_options, + const DBOptions& db_options, + std::string* opts_str); + Status GetStringFromDBOptions(std::string* opts_str, const DBOptions& db_options, const std::string& delimiter = "; "); +Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options, + const ColumnFamilyOptions& cf_options, + std::string* opts_str); Status GetStringFromColumnFamilyOptions(std::string* opts_str, const ColumnFamilyOptions& cf_options, const std::string& delimiter = "; "); - Status GetStringFromCompressionType(std::string* compression_str, CompressionType compression_type); @@ -301,10 +462,18 @@ Status GetBlockBasedTableOptionsFromString( const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options); 
+Status GetBlockBasedTableOptionsFromString( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options); Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, const std::string& opts_str, PlainTableOptions* new_table_options); +Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, + const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options); Status GetMemTableRepFactoryFromString( const std::string& opts_str, @@ -312,6 +481,9 @@ Status GetOptionsFromString(const Options& base_options, const std::string& opts_str, Options* new_options); +Status GetOptionsFromString(const ConfigOptions& config_options, + const Options& base_options, + const std::string& opts_str, Options* new_options); Status StringToMap(const std::string& opts_str, std::unordered_map* opts_map); @@ -345,7 +517,6 @@ const EnvOptions& env_options, const ReadOptions& read_options, const std::string& file_path); - #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/customizable.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/customizable.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/customizable.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/customizable.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,233 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/configurable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +/** + * Customizable a base class used by the rocksdb that describes a + * standard way of configuring and creating objects. Customizable objects + * are configurable objects that can be created from an ObjectRegistry. + * + * Customizable classes are used when there are multiple potential + * implementations of a class for use by RocksDB (e.g. Table, Cache, + * MergeOperator, etc). The abstract base class is expected to define a method + * declaring its type and a factory method for creating one of these, such as: + * static const char *Type() { return "Table"; } + * static Status CreateFromString(const ConfigOptions& options, + * const std::string& id, + * std::shared_ptr* result); + * The "Type" string is expected to be unique (no two base classes are the same + * type). This factory is expected, based on the options and id, create and + * return the appropriate derived type of the customizable class (e.g. + * BlockBasedTableFactory, PlainTableFactory, etc). For extension developers, + * helper classes and methods are provided for writing this factory. + * + * Instances of a Customizable class need to define: + * - A "static const char *kClassName()" method. This method defines the name + * of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the + * CheckedCast method. + * - The Name() of the object. This name is used when creating and saving + * instances of this class. Typically this name will be the same as + * kClassName(). + * + * Additionally, Customizable classes should register any options used to + * configure themselves with the Configurable subsystem. + * + * When a Customizable is being created, the "name" property specifies + * the name of the instance being created. 
+ * For custom objects, their configuration and name can be specified by: + * [prop]={name=X;option 1 = value1[; option2=value2...]} + * + * [prop].name=X + * [prop].option1 = value1 + * + * [prop].name=X + * X.option1 =value1 + */ +class Customizable : public Configurable { + public: + ~Customizable() override {} + + // Returns the name of this class of Customizable + virtual const char* Name() const = 0; + + // Returns an identifier for this Customizable. + // This could be its name or something more complex (like its URL/pattern). + // Used for pretty printing. + virtual std::string GetId() const { + std::string id = Name(); + return id; + } + + // This is typically determined by if the input name matches the + // name of this object. + // This method is typically used in conjunction with CheckedCast to find the + // derived class instance from its base. For example, if you have an Env + // and want the "Default" env, you would IsInstanceOf("Default") to get + // the default implementation. This method should be used when you need a + // specific derivative or implementation of a class. + // + // Intermediary caches (such as SharedCache) may wish to override this method + // to check for the intermediary name (SharedCache). Classes with multiple + // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override + // this method. + // + // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a". + // Wrapped classes that have an Inner "has-a" should not be returned. + // + // @param name The name of the instance to find. + // Returns true if the class is an instance of the input name. 
+ virtual bool IsInstanceOf(const std::string& name) const { + if (name.empty()) { + return false; + } else if (name == Name()) { + return true; + } else { + const char* nickname = NickName(); + if (nickname != nullptr && name == nickname) { + return true; + } else { + return false; + } + } + } + + const void* GetOptionsPtr(const std::string& name) const override { + const void* ptr = Configurable::GetOptionsPtr(name); + if (ptr != nullptr) { + return ptr; + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->GetOptionsPtr(name); + } else { + return nullptr; + } + } + } + + // Returns the named instance of the Customizable as a T*, or nullptr if not + // found. This method uses IsInstanceOf/Inner to find the appropriate class + // instance and then casts it to the expected return type. + template + const T* CheckedCast() const { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + template + T* CheckedCast() { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + auto inner = const_cast(Inner()); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + // Checks to see if this Customizable is equivalent to other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. 
+ // @see Configurable::AreEquivalent for more details + bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const override; +#ifndef ROCKSDB_LITE + // Gets the value of the option associated with the input name + // @see Configurable::GetOption for more details + Status GetOption(const ConfigOptions& config_options, const std::string& name, + std::string* value) const override; +#endif // ROCKSDB_LITE + // Helper method for getting for parsing the opt_value into the corresponding + // options for use in potentially creating a new Customizable object (this + // method is primarily a support method for LoadSharedObject et al for new + // Customizable objects). The opt_value may be either name-value pairs + // separated by ";" (a=b; c=d), or a simple name (a). In order to create a new + // Customizable, the ID is determined by: + // - If the value is a simple name (e.g. "BlockBasedTable"), the id is this + // name; + // - Otherwise, if there is a "id=value", the id is set to "value" + // - Otherwise, if the input customizable is not null, custom->GetId is used + // - Otherwise, an error is returned. + // + // If the opt_value is name-value pairs, these pairs will be returned in + // options (without the id pair). If the ID being returned matches the ID of + // the input custom object, then the options from the input object will also + // be added to the returned options. + // + // This method returns non-OK if the ID could not be found, or if the + // opt_value could not be parsed into name-value pairs. + static Status GetOptionsMap( + const ConfigOptions& config_options, const Customizable* custom, + const std::string& opt_value, std::string* id, + std::unordered_map* options); + + // Helper method to configure a new object with the supplied options. + // If the object is not null and invoke_prepare_options=true, the object + // will be configured and prepared. 
+ // Returns success if the object is properly configured and (optionally) + // prepared Returns InvalidArgument if the object is nullptr and there are + // options in the map Returns the result of the ConfigureFromMap or + // PrepareOptions + static Status ConfigureNewObject( + const ConfigOptions& config_options, Customizable* object, + const std::unordered_map& options); + + // Returns the inner class when a Customizable implements a has-a (wrapped) + // relationship. Derived classes that implement a has-a must override this + // method in order to get CheckedCast to function properly. + virtual const Customizable* Inner() const { return nullptr; } + + protected: + // Generates a ID specific for this instance of the customizable. + // The unique ID is of the form :#pid, where: + // - name is the Name() of this object; + // - addr is the memory address of this object; + // - pid is the process ID of this process ID for this process. + // Note that if obj1 and obj2 have the same unique IDs, they must be the + // same. However, if an object is deleted and recreated, it may have the + // same unique ID as a predecessor + // + // This method is useful for objects (especially ManagedObjects) that + // wish to generate an ID that is specific for this instance and wish to + // override the GetId() method. + std::string GenerateIndividualId() const; + + // Some classes have both a class name (e.g. PutOperator) and a nickname + // (e.g. put). Classes can override this method to return a + // nickname. Nicknames can be used by InstanceOf and object creation. + virtual const char* NickName() const { return ""; } + // Given a name (e.g. 
rocksdb.my.type.opt), returns the short name (opt) + std::string GetOptionName(const std::string& long_name) const override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& options, + const std::string& prefix) const override; +#endif // ROCKSDB_LITE +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/data_structure.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,51 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This is a data structure specifically designed as a "Set" for a +// pretty small scale of Enum structure. For now, it can support up +// to 64 element, and it is expandable in the future. +template +class SmallEnumSet { + public: + SmallEnumSet() : state_(0) {} + + ~SmallEnumSet() {} + + // Return true if the input enum is included in the "Set" (i.e., changes the + // internal scalar state successfully), otherwise, it will return false. + bool Add(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t old_state = state_; + uint64_t tmp = 1; + state_ |= (tmp << value); + return old_state != state_; + } + + // Return true if the input enum is contained in the "Set". 
+ bool Contains(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t tmp = 1; + return state_ & (tmp << value); + } + + private: + uint64_t state_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/db.h 2025-05-19 16:14:27.000000000 +0000 @@ -39,25 +39,31 @@ namespace ROCKSDB_NAMESPACE { -struct Options; -struct DBOptions; struct ColumnFamilyOptions; -struct ReadOptions; -struct WriteOptions; -struct FlushOptions; struct CompactionOptions; struct CompactRangeOptions; -struct TableProperties; +struct DBOptions; struct ExternalSstFileInfo; -class WriteBatch; +struct FlushOptions; +struct Options; +struct ReadOptions; +struct TableProperties; +struct WriteOptions; +#ifdef ROCKSDB_LITE +class CompactionJobInfo; +#endif class Env; class EventListener; +class FileSystem; +#ifndef ROCKSDB_LITE +class Replayer; +#endif class StatsHistoryIterator; +#ifndef ROCKSDB_LITE +class TraceReader; class TraceWriter; -#ifdef ROCKSDB_LITE -class CompactionJobInfo; #endif -class FileSystem; +class WriteBatch; extern const std::string kDefaultColumnFamilyName; extern const std::string kPersistentStatsColumnFamilyName; @@ -111,10 +117,19 @@ RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} }; +// It is valid that files_checksums and files_checksum_func_names are both +// empty (no checksum information is provided for ingestion). Otherwise, +// their sizes should be the same as external_files. The file order should +// be the same in three vectors and guaranteed by the caller. +// Note that, we assume the temperatures of this batch of files to be +// ingested are the same. 
struct IngestExternalFileArg { ColumnFamilyHandle* column_family = nullptr; std::vector external_files; IngestExternalFileOptions options; + std::vector files_checksums; + std::vector files_checksum_func_names; + Temperature file_temperature = Temperature::kUnknown; }; struct GetMergeOperandsOptions { @@ -124,19 +139,25 @@ // A collections of table properties objects, where // key: is the table's file name. // value: the table properties object of the given table. -typedef std::unordered_map> - TablePropertiesCollection; +using TablePropertiesCollection = + std::unordered_map>; -// A DB is a persistent ordered map from keys to values. +// A DB is a persistent, versioned ordered map from keys to values. // A DB is safe for concurrent access from multiple threads without // any external synchronization. +// DB is an abstract base class with one primary implementation (DBImpl) +// and a number of wrapper implementations. class DB { public: - // Open the database with the specified "name". + // Open the database with the specified "name" for reads and writes. // Stores a pointer to a heap-allocated database in *dbptr and returns // OK on success. - // Stores nullptr in *dbptr and returns a non-OK status on error. - // Caller should delete *dbptr when it is no longer needed. + // Stores nullptr in *dbptr and returns a non-OK status on error, including + // if the DB is already open (read-write) by another DB object. (This + // guarantee depends on options.env->LockFile(), which might not provide + // this guarantee in a custom Env implementation.) + // + // Caller must delete *dbptr when it is no longer needed. static Status Open(const Options& options, const std::string& name, DB** dbptr); @@ -145,11 +166,17 @@ // If the db is opened in read only mode, then no compactions // will happen. 
// + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly(const Options& options, const std::string& name, DB** dbptr, - bool error_if_log_file_exist = false); + bool error_if_wal_file_exists = false); // Open the database for read only with column families. When opening DB with // read only, you can specify only a subset of column families in the @@ -157,13 +184,19 @@ // column family. The default column family name is 'default' and it's stored // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName // + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly( const DBOptions& db_options, const std::string& name, const std::vector& column_families, std::vector* handles, DB** dbptr, - bool error_if_log_file_exist = false); + bool error_if_wal_file_exists = false); // The following OpenAsSecondary functions create a secondary instance that // can dynamically tail the MANIFEST of a primary that must have already been @@ -197,11 +230,11 @@ // to open the primary instance. // The secondary_path argument points to a directory where the secondary // instance stores its info log. - // The column_families argument specifieds a list of column families to open. 
+ // The column_families argument specifies a list of column families to open. // If any of the column families does not exist, the function returns non-OK // status. // The handles is an out-arg corresponding to the opened database column - // familiy handles. + // family handles. // The dbptr is an out-arg corresponding to the opened secondary instance. // The pointer points to a heap-allocated database, and the caller should // delete it after use. Before deleting the dbptr, the user should also @@ -231,6 +264,16 @@ const std::vector& column_families, std::vector* handles, DB** dbptr); + // Open DB and run the compaction. + // It's a read-only operation, the result won't be installed to the DB, it + // will be output to the `output_directory`. The API should only be used with + // `options.CompactionService` to run compaction triggered by + // `CompactionService`. + static Status OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* output, + const CompactionServiceOptionsOverride& override_options); + virtual Status Resume() { return Status::NotSupported(); } // Close the DB by releasing resources, closing files etc. This should be @@ -242,9 +285,9 @@ // If the return status is Aborted(), closing fails because there is // unreleased snapshot in the system. In this case, users can release // the unreleased snapshots and try again and expect it to succeed. For - // other status, recalling Close() will be no-op. - // If the return status is NotSupported(), then the DB implementation does - // cleanup in the destructor + // other status, re-calling Close() will be no-op and return the original + // close status. 
If the return status is NotSupported(), then the DB + // implementation does cleanup in the destructor virtual Status Close() { return Status::NotSupported(); } // ListColumnFamilies will open the DB specified by argument name @@ -255,6 +298,7 @@ const std::string& name, std::vector* column_families); + // Abstract class ctor DB() {} // No copying allowed DB(const DB&) = delete; @@ -353,8 +397,15 @@ // Removes the database entries in the range ["begin_key", "end_key"), i.e., // including "begin_key" and excluding "end_key". Returns OK on success, and - // a non-OK status on error. It is not an error if no keys exist in the range - // ["begin_key", "end_key"). + // a non-OK status on error. It is not an error if the database does not + // contain any existing data in the range ["begin_key", "end_key"). + // + // If "end_key" comes before "start_key" according to the user's comparator, + // a `Status::InvalidArgument` is returned. + // + // WARNING: Do not use `Iterator::Refresh()` API on DBs where `DeleteRange()` + // has been used or will be used. This feature combination is neither + // supported nor programmatically prevented. // // This feature is now usable in production, with the following caveats: // 1) Accumulating many range tombstones in the memtable will degrade read @@ -388,6 +439,9 @@ // If the database contains an entry for "key" store the // corresponding value in *value and return OK. // + // If timestamp is enabled and a non-null timestamp pointer is passed in, + // timestamp is returned. + // // If there is no entry for "key" leave *value unchanged and return // a status for which Status::IsNotFound() returns true. // @@ -412,6 +466,32 @@ return Get(options, DefaultColumnFamily(), key, value); } + // Get() methods that return timestamp. Derived DB classes don't need to worry + // about this group of methods if they don't care about timestamp feature. 
+ virtual inline Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, std::string* timestamp) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(options, column_family, key, &pinnable_val, timestamp); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; + } + virtual Status Get(const ReadOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, PinnableSlice* /*value*/, + std::string* /*timestamp*/) { + return Status::NotSupported( + "Get() that returns timestamp is not implemented."); + } + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp) { + return Get(options, DefaultColumnFamily(), key, value, timestamp); + } + // Returns all the merge operands corresponding to the key. If the // number of merge operands in DB is greater than // merge_operands_options.expected_max_number_of_operands @@ -428,6 +508,11 @@ GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) = 0; + // Consistent Get of many keys across column families without the need + // for an explicit snapshot. NOTE: the implementation of this MultiGet API + // does not have the performance benefits of the void-returning MultiGet + // functions. + // // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and // (*values)[i] will be set to some arbitrary value (often ""). 
Otherwise, @@ -451,6 +536,25 @@ keys, values); } + virtual std::vector MultiGet( + const ReadOptions& /*options*/, + const std::vector& /*column_family*/, + const std::vector& keys, std::vector* /*values*/, + std::vector* /*timestamps*/) { + return std::vector( + keys.size(), Status::NotSupported( + "MultiGet() returning timestamps not implemented.")); + } + virtual std::vector MultiGet(const ReadOptions& options, + const std::vector& keys, + std::vector* values, + std::vector* timestamps) { + return MultiGet( + options, + std::vector(keys.size(), DefaultColumnFamily()), + keys, values, timestamps); + } + // Overloaded MultiGet API that improves performance by batching operations // in the read path for greater efficiency. Currently, only the block based // table format with full filters are supported. Other table formats such @@ -492,6 +596,30 @@ } } + virtual void MultiGet(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_family); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + // Overloaded MultiGet API that improves performance by batching operations // in the read path for greater efficiency. Currently, only the block based // table format with full filters are supported. 
Other table formats such @@ -531,6 +659,28 @@ values++; } } + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + std::vector tss; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals, &tss); + std::copy(status.begin(), status.end(), statuses); + std::copy(tss.begin(), tss.end(), timestamps); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } // If the key definitely does not exist in the database, then this method // returns false, else true. If the caller wants to obtain value when the key @@ -542,17 +692,33 @@ virtual bool KeyMayExist(const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, std::string* /*value*/, + std::string* /*timestamp*/, bool* value_found = nullptr) { if (value_found != nullptr) { *value_found = false; } return true; } + + virtual bool KeyMayExist(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value, bool* value_found = nullptr) { + return KeyMayExist(options, column_family, key, value, + /*timestamp=*/nullptr, value_found); + } + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, std::string* value, bool* value_found = nullptr) { return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found); } + virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, + std::string* value, std::string* timestamp, + bool* value_found = nullptr) { + return KeyMayExist(options, DefaultColumnFamily(), key, value, timestamp, + value_found); + } + // Return a heap-allocated iterator over the contents of the database. 
// The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). @@ -578,7 +744,7 @@ // snapshot is no longer needed. // // nullptr will be returned if the DB fails to take a snapshot or does - // not support snapshot. + // not support snapshot (eg: inplace_update_support enabled). virtual const Snapshot* GetSnapshot() = 0; // Release a previously acquired snapshot. The caller must not @@ -586,7 +752,9 @@ virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; #ifndef ROCKSDB_LITE - // Contains all valid property arguments for GetProperty(). + // Contains all valid property arguments for GetProperty() or + // GetMapProperty(). Each is a "string" property for retrieval with + // GetProperty() unless noted as a "map" property, for GetMapProperty(). // // NOTE: Property names cannot end in numbers since those are interpreted as // arguments, e.g., see kNumFilesAtLevelPrefix. @@ -611,34 +779,35 @@ // SST files. static const std::string kSSTables; - // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and - // "rocksdb.cf-file-histogram" together. See below for description - // of the two. + // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram" + // and "rocksdb.cf-file-histogram" as a "map" property. static const std::string kCFStats; // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with - // general columm family stats per-level over db's lifetime ("L"), + // general column family stats per-level over db's lifetime ("L"), // aggregated over db's lifetime ("Sum"), and aggregated over the // interval since the last retrieval ("Int"). - // It could also be used to return the stats in the format of the map. - // In this case there will a pair of string to array of double for - // each level as well as for "Sum". "Int" stats will not be affected - // when this form of stats are retrieved. 
static const std::string kCFStatsNoFileHistogram; // "rocksdb.cf-file-histogram" - print out how many file reads to every // level, as well as the histogram of latency of single requests. static const std::string kCFFileHistogram; - // "rocksdb.dbstats" - returns a multi-line string with general database - // stats, both cumulative (over the db's lifetime) and interval (since - // the last retrieval of kDBStats). + // "rocksdb.dbstats" - As a string property, returns a multi-line string + // with general database stats, both cumulative (over the db's + // lifetime) and interval (since the last retrieval of kDBStats). + // As a map property, returns cumulative stats only and does not + // update the baseline for the interval stats. static const std::string kDBStats; // "rocksdb.levelstats" - returns multi-line string containing the number // of files per level and total size of each level (MB). static const std::string kLevelStats; + // "rocksdb.block-cache-entry-stats" - returns a multi-line string or + // map with statistics on block cache usage. + static const std::string kBlockCacheEntryStats; + // "rocksdb.num-immutable-mem-table" - returns number of immutable // memtables that have not yet been flushed. static const std::string kNumImmutableMemTable; @@ -733,7 +902,8 @@ static const std::string kCurrentSuperVersionNumber; // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of - // live data in bytes. + // live data in bytes. For BlobDB, it also includes the exact value of + // live bytes in the blob files of the version. static const std::string kEstimateLiveDataSize; // "rocksdb.min-log-number-to-keep" - return the minimum log number of the @@ -754,6 +924,10 @@ // files belong to the latest LSM tree. 
static const std::string kLiveSstFilesSize; + // "rocksdb.live_sst_files_size_at_temperature" - returns total size (bytes) + // of SST files at all certain file temperature + static const std::string kLiveSstFilesSizeAtTemperature; + // "rocksdb.base-level" - returns number of level to which L0 data will be // compacted. static const std::string kBaseLevel; @@ -764,8 +938,10 @@ // based. static const std::string kEstimatePendingCompactionBytes; - // "rocksdb.aggregated-table-properties" - returns a string representation - // of the aggregated table properties of the target column family. + // "rocksdb.aggregated-table-properties" - returns a string or map + // representation of the aggregated table properties of the target + // column family. Only properties that make sense for aggregation + // are included. static const std::string kAggregatedTableProperties; // "rocksdb.aggregated-table-properties-at-level", same as the previous @@ -800,18 +976,39 @@ // "rocksdb.options-statistics" - returns multi-line string // of options.statistics static const std::string kOptionsStatistics; + + // "rocksdb.num-blob-files" - returns number of blob files in the current + // version. + static const std::string kNumBlobFiles; + + // "rocksdb.blob-stats" - return the total number and size of all blob + // files, and total amount of garbage (bytes) in the blob files in + // the current version. + static const std::string kBlobStats; + + // "rocksdb.total-blob-file-size" - returns the total size of all blob + // files over all versions. + static const std::string kTotalBlobFileSize; + + // "rocksdb.live-blob-file-size" - returns the total size of all blob + // files in the current version. + static const std::string kLiveBlobFileSize; }; #endif /* ROCKSDB_LITE */ - // DB implementations can export properties about their state via this method. 
- // If "property" is a valid property understood by this DB implementation (see - // Properties struct above for valid options), fills "*value" with its current - // value and returns true. Otherwise, returns false. + // DB implementations export properties about their state via this method. + // If "property" is a valid "string" property understood by this DB + // implementation (see Properties struct above for valid options), fills + // "*value" with its current value and returns true. Otherwise, returns + // false. virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { return GetProperty(DefaultColumnFamily(), property, value); } + + // Like GetProperty but for valid "map" properties. (Some properties can be + // accessed as either "string" properties or "map" properties.) virtual bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, std::map* value) = 0; @@ -856,6 +1053,11 @@ // "rocksdb.block-cache-capacity" // "rocksdb.block-cache-usage" // "rocksdb.block-cache-pinned-usage" + // + // Properties dedicated for BlobDB: + // "rocksdb.num-blob-files" + // "rocksdb.total-blob-file-size" + // "rocksdb.live-blob-file-size" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { @@ -883,32 +1085,36 @@ }; // For each i in [0,n-1], store in "sizes[i]", the approximate - // file system space used by keys in "[range[i].start .. range[i].limit)". + // file system space used by keys in "[range[i].start .. range[i].limit)" + // in a single column family. // // Note that the returned sizes measure file system space usage, so // if the user data compresses by a factor of ten, the returned // sizes will be one-tenth the size of the corresponding user data size. 
virtual Status GetApproximateSizes(const SizeApproximationOptions& options, ColumnFamilyHandle* column_family, - const Range* range, int n, + const Range* ranges, int n, uint64_t* sizes) = 0; // Simpler versions of the GetApproximateSizes() method above. - // The include_flags argumenbt must of type DB::SizeApproximationFlags + // The include_flags argument must of type DB::SizeApproximationFlags // and can not be NONE. - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { + virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { SizeApproximationOptions options; options.include_memtabtles = (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; options.include_files = (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; - GetApproximateSizes(options, column_family, range, n, sizes); + return GetApproximateSizes(options, column_family, ranges, n, sizes); } - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { - GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); + virtual Status GetApproximateSizes(const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { + return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, + include_flags); } // The method is similar to GetApproximateSizes, except it @@ -948,6 +1154,8 @@ // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only // be invoked by users who understand the underlying implementation. + // This call blocks until the operation completes successfully, fails, + // or is aborted (Status::Incomplete). See DisableManualCompaction. 
// // begin==nullptr is treated as a key before all keys in the database. // end==nullptr is treated as a key after all keys in the database. @@ -1002,9 +1210,9 @@ const std::unordered_map& new_options) = 0; // CompactFiles() inputs a list of files specified by file numbers and - // compacts them to the specified level. Note that the behavior is different - // from CompactRange() in that CompactFiles() performs the compaction job - // using the CURRENT thread. + // compacts them to the specified level. A small difference compared to + // CompactRange() is that CompactFiles() performs the compaction job + // using the CURRENT thread, so is not considered a "background" job. // // @see GetDataBaseMetaData // @see GetColumnFamilyMetaData @@ -1029,7 +1237,8 @@ // This function will wait until all currently running background processes // finish. After it returns, no background process will be run until - // ContinueBackgroundWork is called + // ContinueBackgroundWork is called, once for each preceding OK-returning + // call to PauseBackgroundWork. virtual Status PauseBackgroundWork() = 0; virtual Status ContinueBackgroundWork() = 0; @@ -1045,7 +1254,16 @@ virtual Status EnableAutoCompaction( const std::vector& column_family_handles) = 0; + // After this function call, CompactRange() or CompactFiles() will not + // run compactions and fail. Calling this function will tell outstanding + // manual compactions to abort and will wait for them to finish or abort + // before returning. virtual void DisableManualCompaction() = 0; + // Re-enable CompactRange() and ComapctFiles() that are disabled by + // DisableManualCompaction(). This function must be called as many times + // as DisableManualCompaction() has been called in order to re-enable + // manual compactions, and must not be called more times than + // DisableManualCompaction() has been called. virtual void EnableManualCompaction() = 0; // Number of levels used for this DB. 
@@ -1137,13 +1355,20 @@ // updated, false if user attempted to call if with seqnum <= current value. virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; -#ifndef ROCKSDB_LITE - // Prevent file deletions. Compactions will continue to occur, // but no obsolete files will be deleted. Calling this multiple // times have the same effect as calling it once. virtual Status DisableFileDeletions() = 0; + // Increase the full_history_ts of column family. The new ts_low value should + // be newer than current full_history_ts value. + virtual Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) = 0; + + // Get current full_history_ts value. + virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) = 0; + // Allow compactions to delete obsolete files. // If force == true, the call to EnableFileDeletions() will guarantee that // file deletions are enabled after the call, even if DisableFileDeletions() @@ -1155,6 +1380,7 @@ // threads call EnableFileDeletions() virtual Status EnableFileDeletions(bool force = true) = 0; +#ifndef ROCKSDB_LITE // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are @@ -1216,6 +1442,14 @@ // Windows API macro interference #undef DeleteFile + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. + // // Delete the file name from the db directory and update the internal state to // reflect that. 
Supports deletion of sst and log files only. 'name' must be // path relative to the db directory. eg. 000001.sst, /archive/000003.log @@ -1226,6 +1460,20 @@ virtual void GetLiveFilesMetaData( std::vector* /*metadata*/) {} + // Return a list of all table and blob files checksum info. + // Note: This function might be of limited use because it cannot be + // synchronized with GetLiveFiles. + virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0; + + // EXPERIMENTAL: This function is not yet feature-complete. + // Get information about all live files that make up a DB, for making + // live copies (Checkpoint, backups, etc.) or other storage-related purposes. + // Use DisableFileDeletions() before and EnableFileDeletions() after to + // preserve the files for live copy. + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) = 0; + // Obtains the meta data of the specified column family of the DB. virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) {} @@ -1235,6 +1483,12 @@ GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); } + // Obtains the meta data of all column families for the DB. + // The returned map contains one entry for each column family indexed by the + // name of the column family. + virtual void GetAllColumnFamilyMetaData( + std::vector* /*metadata*/) {} + // IngestExternalFile() will load a list of external SST files (1) into the DB // Two primary modes are supported: // - Duplicate keys in the new files will overwrite exiting keys (default) @@ -1286,13 +1540,14 @@ // this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in - // an existing DB. + // an existing DB using Checkpoint::ExportColumnFamily. // Option in import_options specifies whether the external files are copied or // moved (default is copy). 
When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a - // move, the call ensures that the specified files at external_file_path are - // deleted on successful return and files are not modified on any error - // return. + // move, the call makes a best effort to delete the specified files at + // external_file_path on successful return, logging any failure to delete + // rather than returning in Status. Files are not modified on any error + // return, and a best effort is made to remove any newly-created files. // On error return, column family handle returned will be nullptr. // ColumnFamily will be present on successful return and will not be present // on error return. ColumnFamily may be present on any crash during this call. @@ -1302,6 +1557,14 @@ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of + // table files are checked. + virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { + return Status::NotSupported("File verification not supported"); + } + + // Verify the block checksums of files in db. The block checksums of table + // files are checked. virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } @@ -1415,10 +1678,18 @@ // Returns Status::OK if identity could be set properly virtual Status GetDbIdentity(std::string& identity) const = 0; + // Return a unique identifier for each DB object that is opened + // This DB session ID should be unique among all open DB instances on all + // hosts, and should be unique among re-openings of the same or other DBs. + // (Two open DBs have the same identity from other function GetDbIdentity when + // one is physically copied from the other.) 
+ virtual Status GetDbSessionId(std::string& session_id) const = 0; + // Returns default column family handle virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; #ifndef ROCKSDB_LITE + virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) = 0; virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { @@ -1449,6 +1720,16 @@ return Status::NotSupported("EndTrace() is not implemented."); } + // IO Tracing operations. Use EndIOTrace() to stop tracing. + virtual Status StartIOTrace(const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartIOTrace() is not implemented."); + } + + virtual Status EndIOTrace() { + return Status::NotSupported("EndIOTrace() is not implemented."); + } + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. virtual Status StartBlockCacheTrace( const TraceOptions& /*options*/, @@ -1459,6 +1740,15 @@ virtual Status EndBlockCacheTrace() { return Status::NotSupported("EndBlockCacheTrace() is not implemented."); } + + // Create a default trace replayer. 
+ virtual Status NewDefaultReplayer( + const std::vector& /*handles*/, + std::unique_ptr&& /*reader*/, + std::unique_ptr* /*replayer*/) { + return Status::NotSupported("NewDefaultReplayer() is not implemented."); + } + #endif // ROCKSDB_LITE // Needed for StackableDB diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,12 +17,16 @@ #pragma once #include + #include #include #include #include #include #include + +#include "rocksdb/customizable.h" +#include "rocksdb/functor_wrapper.h" #include "rocksdb/status.h" #include "rocksdb/thread_status.h" @@ -30,11 +34,12 @@ // Windows API macro interference #undef DeleteFile #undef GetCurrentTime +#undef LoadLibrary #endif #if defined(__GNUC__) || defined(__clang__) #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ - __attribute__((__format__(__printf__, format_param, dots_param))) + __attribute__((__format__(__printf__, format_param, dots_param))) #else #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) #endif @@ -47,6 +52,7 @@ class RandomAccessFile; class SequentialFile; class Slice; +struct DataVerificationInfo; class WritableFile; class RandomRWFile; class MemoryMappedFileBuffer; @@ -57,9 +63,19 @@ class RateLimiter; class ThreadStatusUpdater; struct ThreadStatus; +class FileSystem; +class SystemClock; +struct ConfigOptions; const size_t kDefaultPageSize = 4 * 1024; +enum class CpuPriority { + kIdle = 0, + kLow = 1, + kNormal = 2, + kHigh = 3, +}; + // Options while opening a file to read/write struct EnvOptions { // Construct with default Options @@ -68,7 +84,8 @@ // Construct from Options explicit EnvOptions(const DBOptions& options); - // If true, then use mmap to read data + // If true, then use mmap to 
read data. + // Not recommended for 32-bit OS. bool use_mmap_reads = false; // If true, then use mmap to write data @@ -130,8 +147,12 @@ RateLimiter* rate_limiter = nullptr; }; -class Env { +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class Env : public Customizable { public: + static const char* kDefaultName() { return "DefaultEnv"; } struct FileAttributes { // File name std::string name; @@ -140,22 +161,63 @@ uint64_t size_bytes; }; - Env() : thread_status_updater_(nullptr) {} + Env(); + // Construct an Env with a separate FileSystem and/or SystemClock + // implementation + explicit Env(const std::shared_ptr& fs); + Env(const std::shared_ptr& fs, + const std::shared_ptr& clock); // No copying allowed Env(const Env&) = delete; void operator=(const Env&) = delete; - virtual ~Env(); + ~Env() override; static const char* Type() { return "Environment"; } + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. + const char* Name() const override { return ""; } + // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result); // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard); + // Loads the environment specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // + // @param config_options Controls how the environment is loaded. 
+ // @param value the name and associated properties for the environment. + // @param result On success, the environment that was loaded. + // @param guard If specified and the loaded environment is not static, + // this value will contain the loaded environment (guard.get() == + // result). + // @return OK If the environment was successfully loaded (and optionally + // prepared) + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result); + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard); + + // Loads the environment specified by the env and fs uri. + // If both are specified, an error is returned. + // Otherwise, the environment is created by loading (via CreateFromString) + // the appropriate env/fs from the corresponding values. + static Status CreateFromUri(const ConfigOptions& options, + const std::string& env_uri, + const std::string& fs_uri, Env** result, + std::shared_ptr* guard); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. @@ -163,6 +225,15 @@ // The result of Default() belongs to rocksdb and must never be deleted. static Env* Default(); + // See FileSystem::RegisterDbPaths. + virtual Status RegisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // See FileSystem::UnregisterDbPaths. + virtual Status UnregisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // Create a brand new sequentially-readable file with the specified name. // On success, stores a pointer to the new file in *result and returns OK. // On failure stores nullptr in *result and returns non-OK. 
If the file does @@ -205,17 +276,18 @@ std::unique_ptr* result, const EnvOptions& options) = 0; - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. + // Create an object that writes to a file with the specified name. + // `WritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. // // The returned file will only be accessed by one thread at a time. virtual Status ReopenWritableFile(const std::string& /*fname*/, std::unique_ptr* /*result*/, const EnvOptions& /*options*/) { - return Status::NotSupported(); + return Status::NotSupported("Env::ReopenWritableFile() not supported."); } // Reuse an existing file by renaming it and opening it as writable. @@ -263,7 +335,8 @@ virtual Status FileExists(const std::string& fname) = 0; // Store in *result the names of the children of the specified directory. - // The names are relative to "dir". + // The names are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -276,7 +349,8 @@ // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. - // The name attributes are relative to "dir". + // The name attributes are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. 
// Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -301,6 +375,8 @@ virtual Status CreateDirIfMissing(const std::string& dirname) = 0; // Delete the specified directory. + // Many implementations of this function will only delete a directory if it is + // empty. virtual Status DeleteDir(const std::string& dirname) = 0; // Store the size of fname in *file_size. @@ -369,7 +445,13 @@ static std::string PriorityToString(Priority priority); // Priority for requesting bytes in rate limiter scheduler - enum IOPriority { IO_LOW = 0, IO_HIGH = 1, IO_TOTAL = 2 }; + enum IOPriority { + IO_LOW = 0, + IO_MID = 1, + IO_HIGH = 2, + IO_USER = 3, + IO_TOTAL = 4 + }; // Arrange to run "(*function)(arg)" once in a background thread, in // the thread pool specified by pri. By default, jobs go to the 'LOW' @@ -393,6 +475,21 @@ // When "function(arg)" returns, the thread will be destroyed. virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + // Start a new thread, invoking "function(args...)" within the new thread. + // When "function(args...)" returns, the thread will be destroyed. + template + void StartThreadTyped(FunctionT function, Args&&... args) { + using FWType = FunctorWrapper; + StartThread( + [](void* arg) { + auto* functor = static_cast(arg); + functor->invoke(); + delete functor; + }, + new FWType(std::function(function), + std::forward(args)...)); + } + // Wait for all threads started by StartThread to terminate. virtual void WaitForJoin() {} @@ -408,7 +505,7 @@ virtual Status GetTestDirectory(std::string* path) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. 
virtual Status NewLogger(const std::string& fname, std::shared_ptr* result); @@ -431,9 +528,15 @@ // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; - // Get the current host name. + // Get the current host name as a null terminated string iff the string + // length is < len. The hostname should otherwise be truncated to len. virtual Status GetHostName(char* name, uint64_t len) = 0; + // Get the current hostname from the given env as a std::string in result. + // The result may be truncated if the hostname is too + // long + virtual Status GetHostNameString(std::string* result); + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). // Only overwrites *unix_time on success. virtual Status GetCurrentTime(int64_t* unix_time) = 0; @@ -449,7 +552,7 @@ virtual int GetBackgroundThreads(Priority pri = LOW) = 0; virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) { - return Status::NotSupported("Not supported."); + return Status::NotSupported("Env::SetAllowNonOwnerAccess() not supported."); } // Enlarge number of background worker threads of a specific thread pool @@ -461,12 +564,22 @@ virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {} // Lower CPU priority for threads from the specified pool. + virtual Status LowerThreadPoolCPUPriority(Priority /*pool*/, + CpuPriority /*pri*/) { + return Status::NotSupported( + "Env::LowerThreadPoolCPUPriority(Priority, CpuPriority) not supported"); + } + + // Lower CPU priority for threads from the specified pool. virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {} // Converts seconds-since-Jan-01-1970 to a printable string virtual std::string TimeToString(uint64_t time) = 0; - // Generates a unique id that can be used to identify a db + // Generates a human-readable unique ID that can be used to identify a DB. 
+ // In built-in implementations, this is an RFC-4122 UUID string, but might + // not be in all implementations. Overriding is not recommended. + // NOTE: this has not be validated for use in cryptography virtual std::string GenerateUniqueId(); // OptimizeForLogWrite will create a new EnvOptions object that is a copy of @@ -504,9 +617,16 @@ const EnvOptions& env_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // blob files. + virtual EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + // Returns the status of all threads that belong to the current Env. virtual Status GetThreadList(std::vector* /*thread_list*/) { - return Status::NotSupported("Not supported."); + return Status::NotSupported("Env::GetThreadList() not supported."); } // Returns the pointer to ThreadStatusUpdater. This function will be @@ -525,17 +645,39 @@ // Get the amount of free disk space virtual Status GetFreeSpace(const std::string& /*path*/, uint64_t* /*diskfree*/) { - return Status::NotSupported(); + return Status::NotSupported("Env::GetFreeSpace() not supported."); + } + + // Check whether the specified path is a directory + virtual Status IsDirectory(const std::string& /*path*/, bool* /*is_dir*/) { + return Status::NotSupported("Env::IsDirectory() not supported."); } virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {} + // Get the FileSystem implementation this Env was constructed with. It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetFileSystem() const; + + // Get the SystemClock implementation this Env was constructed with. 
It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetSystemClock() const; + // If you're adding methods here, remember to add them to EnvWrapper too. protected: // The pointer to an internal structure that will update the // status of each thread. ThreadStatusUpdater* thread_status_updater_; + + // Pointer to the underlying FileSystem implementation + std::shared_ptr file_system_; + + // Pointer to the underlying SystemClock implementation + std::shared_ptr system_clock_; + + private: + static const size_t kMaxHostNameLen = 256; }; // The factory function to construct a ThreadStatusUpdater. Any Env @@ -556,6 +698,10 @@ // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // REQUIRES: External synchronization virtual Status Read(size_t n, Slice* result, char* scratch) = 0; @@ -580,14 +726,16 @@ // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { - return Status::NotSupported("InvalidateCache not supported."); + return Status::NotSupported( + "SequentialFile::InvalidateCache not supported."); } // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/, Slice* /*result*/, char* /*scratch*/) { - return Status::NotSupported(); + return Status::NotSupported( + "SequentialFile::PositionedRead() not supported."); } // If you're adding methods here, remember to add them to @@ -599,7 +747,8 @@ // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. 
`result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -628,6 +777,10 @@ // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. virtual Status Read(uint64_t offset, size_t n, Slice* result, @@ -690,7 +843,8 @@ // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { - return Status::NotSupported("InvalidateCache not supported."); + return Status::NotSupported( + "RandomAccessFile::InvalidateCache not supported."); } // If you're adding methods here, remember to add them to @@ -722,10 +876,22 @@ virtual ~WritableFile(); // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual Status Append(const Slice& data) = 0; + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). 
+ virtual Status Append(const Slice& data, + const DataVerificationInfo& /* verification_info */) { + return Append(data); + } + // PositionedAppend data to the specified offset. The new EOF after append // must be larger than the previous EOF. This is to be used when writes are // not backed by OS buffers and hence has to always start from the start of @@ -748,7 +914,21 @@ // required is queried via GetRequiredBufferAlignment() virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) { - return Status::NotSupported(); + return Status::NotSupported( + "WritableFile::PositionedAppend() not supported."); + } + + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const DataVerificationInfo& /* verification_info */) { + return Status::NotSupported("PositionedAppend"); } // Truncate is necessary to trim the file to the correct size @@ -823,7 +1003,7 @@ // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { - return Status::NotSupported("InvalidateCache not supported."); + return Status::NotSupported("WritableFile::InvalidateCache not supported."); } // Sync a file range with disk. 
@@ -857,8 +1037,10 @@ if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; + // TODO: Don't ignore errors from allocate Allocate(block_size * last_preallocated_block_, - block_size * num_spanned_blocks); + block_size * num_spanned_blocks) + .PermitUncheckedError(); last_preallocated_block_ = new_last_preallocated_block; } } @@ -908,6 +1090,11 @@ // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const = 0; @@ -973,6 +1160,10 @@ }; // An interface for writing log messages. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. class Logger { public: size_t kDoNotSupportGetLogFileSize = (std::numeric_limits::max)(); @@ -996,11 +1187,17 @@ virtual void LogHeader(const char* format, va_list ap) { // Default implementation does a simple INFO level log write. // Please override as per the logger class requirement. - Logv(format, ap); + Logv(InfoLogLevel::INFO_LEVEL, format, ap); } // Write an entry to the log file with the specified format. - virtual void Logv(const char* format, va_list ap) = 0; + // + // Users who override the `Logv()` overload taking `InfoLogLevel` do not need + // to implement this, unless they explicitly invoke it in + // `Logv(InfoLogLevel, ...)`. + virtual void Logv(const char* /* format */, va_list /* ap */) { + assert(false); + } // Write an entry to the log file with the specified log level // and format. 
Any log with level under the internal log level @@ -1027,7 +1224,9 @@ InfoLogLevel log_level_; }; -// Identifies a locked file. +// Identifies a locked file. Except in custom Env/Filesystem implementations, +// the lifetime of a FileLock object should be managed only by LockFile() and +// UnlockFile(). class FileLock { public: FileLock() {} @@ -1147,232 +1346,297 @@ // functionality of another Env. class EnvWrapper : public Env { public: + // The Target struct allows an Env to be stored as a raw (Env*) or + // std::shared_ptr. By using this struct, the wrapping/calling + // class does not need to worry about the ownership/lifetime of the + // wrapped target env. If the guard is set, then the Env will point + // to the guard.get(). + struct Target { + Env* env; // The raw Env + std::shared_ptr guard; // The guarded Env + + // Creates a Target without assuming ownership of the target Env + explicit Target(Env* t) : env(t) {} + + // Creates a Target from the guarded env, assuming ownership + explicit Target(std::unique_ptr&& t) : guard(t.release()) { + env = guard.get(); + } + + // Creates a Target from the guarded env, assuming ownership + explicit Target(const std::shared_ptr& t) : guard(t) { + env = guard.get(); + } + + // Makes sure the raw Env is not nullptr + void Prepare() { + if (guard.get() != nullptr) { + env = guard.get(); + } else if (env == nullptr) { + env = Env::Default(); + } + } + }; + // Initialize an EnvWrapper that delegates all calls to *t - explicit EnvWrapper(Env* t) : target_(t) {} + explicit EnvWrapper(Env* t); + explicit EnvWrapper(std::unique_ptr&& t); + explicit EnvWrapper(const std::shared_ptr& t); ~EnvWrapper() override; // Return the target to which this Env forwards all calls - Env* target() const { return target_; } + Env* target() const { return target_.env; } + + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. 
+ const char* Name() const override { return target_.env->Name(); } // The following text is boilerplate that forwards all methods to target() + Status RegisterDbPaths(const std::vector& paths) override { + return target_.env->RegisterDbPaths(paths); + } + + Status UnregisterDbPaths(const std::vector& paths) override { + return target_.env->UnregisterDbPaths(paths); + } + Status NewSequentialFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { - return target_->NewSequentialFile(f, r, options); + return target_.env->NewSequentialFile(f, r, options); } Status NewRandomAccessFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { - return target_->NewRandomAccessFile(f, r, options); + return target_.env->NewRandomAccessFile(f, r, options); } Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override { - return target_->NewWritableFile(f, r, options); + return target_.env->NewWritableFile(f, r, options); } Status ReopenWritableFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& options) override { - return target_->ReopenWritableFile(fname, result, options); + return target_.env->ReopenWritableFile(fname, result, options); } Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, std::unique_ptr* r, const EnvOptions& options) override { - return target_->ReuseWritableFile(fname, old_fname, r, options); + return target_.env->ReuseWritableFile(fname, old_fname, r, options); } Status NewRandomRWFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& options) override { - return target_->NewRandomRWFile(fname, result, options); + return target_.env->NewRandomRWFile(fname, result, options); } Status NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override { - return target_->NewMemoryMappedFileBuffer(fname, result); + return target_.env->NewMemoryMappedFileBuffer(fname, result); 
} Status NewDirectory(const std::string& name, std::unique_ptr* result) override { - return target_->NewDirectory(name, result); + return target_.env->NewDirectory(name, result); } Status FileExists(const std::string& f) override { - return target_->FileExists(f); + return target_.env->FileExists(f); } Status GetChildren(const std::string& dir, std::vector* r) override { - return target_->GetChildren(dir, r); + return target_.env->GetChildren(dir, r); } Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override { - return target_->GetChildrenFileAttributes(dir, result); + return target_.env->GetChildrenFileAttributes(dir, result); } Status DeleteFile(const std::string& f) override { - return target_->DeleteFile(f); + return target_.env->DeleteFile(f); } Status Truncate(const std::string& fname, size_t size) override { - return target_->Truncate(fname, size); + return target_.env->Truncate(fname, size); } Status CreateDir(const std::string& d) override { - return target_->CreateDir(d); + return target_.env->CreateDir(d); } Status CreateDirIfMissing(const std::string& d) override { - return target_->CreateDirIfMissing(d); + return target_.env->CreateDirIfMissing(d); } Status DeleteDir(const std::string& d) override { - return target_->DeleteDir(d); + return target_.env->DeleteDir(d); } Status GetFileSize(const std::string& f, uint64_t* s) override { - return target_->GetFileSize(f, s); + return target_.env->GetFileSize(f, s); } Status GetFileModificationTime(const std::string& fname, uint64_t* file_mtime) override { - return target_->GetFileModificationTime(fname, file_mtime); + return target_.env->GetFileModificationTime(fname, file_mtime); } Status RenameFile(const std::string& s, const std::string& t) override { - return target_->RenameFile(s, t); + return target_.env->RenameFile(s, t); } Status LinkFile(const std::string& s, const std::string& t) override { - return target_->LinkFile(s, t); + return target_.env->LinkFile(s, t); } 
Status NumFileLinks(const std::string& fname, uint64_t* count) override { - return target_->NumFileLinks(fname, count); + return target_.env->NumFileLinks(fname, count); } Status AreFilesSame(const std::string& first, const std::string& second, bool* res) override { - return target_->AreFilesSame(first, second, res); + return target_.env->AreFilesSame(first, second, res); } Status LockFile(const std::string& f, FileLock** l) override { - return target_->LockFile(f, l); + return target_.env->LockFile(f, l); } - Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } + Status UnlockFile(FileLock* l) override { return target_.env->UnlockFile(l); } + + Status IsDirectory(const std::string& path, bool* is_dir) override { + return target_.env->IsDirectory(path, is_dir); + } Status LoadLibrary(const std::string& lib_name, const std::string& search_path, std::shared_ptr* result) override { - return target_->LoadLibrary(lib_name, search_path, result); + return target_.env->LoadLibrary(lib_name, search_path, result); } void Schedule(void (*f)(void* arg), void* a, Priority pri, void* tag = nullptr, void (*u)(void* arg) = nullptr) override { - return target_->Schedule(f, a, pri, tag, u); + return target_.env->Schedule(f, a, pri, tag, u); } int UnSchedule(void* tag, Priority pri) override { - return target_->UnSchedule(tag, pri); + return target_.env->UnSchedule(tag, pri); } void StartThread(void (*f)(void*), void* a) override { - return target_->StartThread(f, a); + return target_.env->StartThread(f, a); } - void WaitForJoin() override { return target_->WaitForJoin(); } + void WaitForJoin() override { return target_.env->WaitForJoin(); } unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { - return target_->GetThreadPoolQueueLen(pri); + return target_.env->GetThreadPoolQueueLen(pri); } Status GetTestDirectory(std::string* path) override { - return target_->GetTestDirectory(path); + return target_.env->GetTestDirectory(path); } Status 
NewLogger(const std::string& fname, std::shared_ptr* result) override { - return target_->NewLogger(fname, result); + return target_.env->NewLogger(fname, result); } - uint64_t NowMicros() override { return target_->NowMicros(); } - uint64_t NowNanos() override { return target_->NowNanos(); } - uint64_t NowCPUNanos() override { return target_->NowCPUNanos(); } + uint64_t NowMicros() override { return target_.env->NowMicros(); } + uint64_t NowNanos() override { return target_.env->NowNanos(); } + uint64_t NowCPUNanos() override { return target_.env->NowCPUNanos(); } void SleepForMicroseconds(int micros) override { - target_->SleepForMicroseconds(micros); + target_.env->SleepForMicroseconds(micros); } Status GetHostName(char* name, uint64_t len) override { - return target_->GetHostName(name, len); + return target_.env->GetHostName(name, len); } Status GetCurrentTime(int64_t* unix_time) override { - return target_->GetCurrentTime(unix_time); + return target_.env->GetCurrentTime(unix_time); } Status GetAbsolutePath(const std::string& db_path, std::string* output_path) override { - return target_->GetAbsolutePath(db_path, output_path); + return target_.env->GetAbsolutePath(db_path, output_path); } void SetBackgroundThreads(int num, Priority pri) override { - return target_->SetBackgroundThreads(num, pri); + return target_.env->SetBackgroundThreads(num, pri); } int GetBackgroundThreads(Priority pri) override { - return target_->GetBackgroundThreads(pri); + return target_.env->GetBackgroundThreads(pri); } Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { - return target_->SetAllowNonOwnerAccess(allow_non_owner_access); + return target_.env->SetAllowNonOwnerAccess(allow_non_owner_access); } void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - return target_->IncBackgroundThreadsIfNeeded(num, pri); + return target_.env->IncBackgroundThreadsIfNeeded(num, pri); + } + + void LowerThreadPoolIOPriority(Priority pool) override { + 
target_.env->LowerThreadPoolIOPriority(pool); } - void LowerThreadPoolIOPriority(Priority pool = LOW) override { - target_->LowerThreadPoolIOPriority(pool); + void LowerThreadPoolCPUPriority(Priority pool) override { + target_.env->LowerThreadPoolCPUPriority(pool); } - void LowerThreadPoolCPUPriority(Priority pool = LOW) override { - target_->LowerThreadPoolCPUPriority(pool); + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return target_.env->LowerThreadPoolCPUPriority(pool, pri); } std::string TimeToString(uint64_t time) override { - return target_->TimeToString(time); + return target_.env->TimeToString(time); } Status GetThreadList(std::vector* thread_list) override { - return target_->GetThreadList(thread_list); + return target_.env->GetThreadList(thread_list); } ThreadStatusUpdater* GetThreadStatusUpdater() const override { - return target_->GetThreadStatusUpdater(); + return target_.env->GetThreadStatusUpdater(); } - uint64_t GetThreadID() const override { return target_->GetThreadID(); } + uint64_t GetThreadID() const override { return target_.env->GetThreadID(); } std::string GenerateUniqueId() override { - return target_->GenerateUniqueId(); + return target_.env->GenerateUniqueId(); } EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { - return target_->OptimizeForLogRead(env_options); + return target_.env->OptimizeForLogRead(env_options); } EnvOptions OptimizeForManifestRead( const EnvOptions& env_options) const override { - return target_->OptimizeForManifestRead(env_options); + return target_.env->OptimizeForManifestRead(env_options); } EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const override { - return target_->OptimizeForLogWrite(env_options, db_options); + return target_.env->OptimizeForLogWrite(env_options, db_options); } EnvOptions OptimizeForManifestWrite( const EnvOptions& env_options) const override { - return 
target_->OptimizeForManifestWrite(env_options); + return target_.env->OptimizeForManifestWrite(env_options); } EnvOptions OptimizeForCompactionTableWrite( const EnvOptions& env_options, const ImmutableDBOptions& immutable_ops) const override { - return target_->OptimizeForCompactionTableWrite(env_options, immutable_ops); + return target_.env->OptimizeForCompactionTableWrite(env_options, + immutable_ops); } EnvOptions OptimizeForCompactionTableRead( const EnvOptions& env_options, const ImmutableDBOptions& db_options) const override { - return target_->OptimizeForCompactionTableRead(env_options, db_options); + return target_.env->OptimizeForCompactionTableRead(env_options, db_options); + } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_.env->OptimizeForBlobFileRead(env_options, db_options); } Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { - return target_->GetFreeSpace(path, diskfree); + return target_.env->GetFreeSpace(path, diskfree); } void SanitizeEnvOptions(EnvOptions* env_opts) const override { - target_->SanitizeEnvOptions(env_opts); + target_.env->SanitizeEnvOptions(env_opts); } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE private: - Env* target_; + Target target_; }; class SequentialFileWrapper : public SequentialFile { @@ -1435,9 +1699,18 @@ explicit WritableFileWrapper(WritableFile* t) : target_(t) {} Status Append(const Slice& data) override { return target_->Append(data); } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + return target_->Append(data, verification_info); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { return target_->PositionedAppend(data, offset); } + Status 
PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + return target_->PositionedAppend(data, offset, verification_info); + } Status Truncate(uint64_t size) override { return target_->Truncate(size); } Status Close() override { return target_->Close(); } Status Flush() override { return target_->Flush(); } @@ -1586,4 +1859,8 @@ Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result); +// Creates a new Env based on Env::Default() but modified to use the specified +// FileSystem. +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs); + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/env_encryption.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,15 +9,24 @@ #include -#include "env.h" +#include "rocksdb/customizable.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { class EncryptionProvider; +struct ConfigOptions; + // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. -Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider); +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr& provider); +std::shared_ptr NewEncryptedFS( + const std::shared_ptr& base_fs, + const std::shared_ptr& provider); // BlockAccessCipherStream is the base class for any cipher stream that // supports random access at block level (without requiring data from other @@ -53,10 +62,38 @@ }; // BlockCipher -class BlockCipher { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. 
This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class BlockCipher : public Customizable { public: virtual ~BlockCipher(){}; + // Creates a new BlockCipher from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "ROT13", a ROT13BlockCipher is created. + // + // @param config_options Options to control how this cipher is created + // and initialized. + // @param value The value might be: + // - ROT13 Create a ROT13 Cipher + // - ROT13:nn Create a ROT13 Cipher with block size of nn + // @param result The new cipher object + // @return OK if the cipher was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + static const char* Type() { return "BlockCipher"; } + // Short-cut method to create a ROT13 BlockCipher. + // This cipher is only suitable for test purposes and should not be used in + // production!!! + static std::shared_ptr NewROT13Cipher(size_t block_size); + // BlockSize returns the size of each block supported by this cipher stream. virtual size_t BlockSize() = 0; @@ -69,138 +106,360 @@ virtual Status Decrypt(char* data) = 0; }; -// Implements a BlockCipher using ROT13. -// -// Note: This is a sample implementation of BlockCipher, -// it is NOT considered safe and should NOT be used in production. -class ROT13BlockCipher : public BlockCipher { - private: - size_t blockSize_; - - public: - ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {} - virtual ~ROT13BlockCipher(){}; - - // BlockSize returns the size of each block supported by this cipher stream. 
- virtual size_t BlockSize() override { return blockSize_; } - - // Encrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Encrypt(char* data) override; - - // Decrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Decrypt(char* data) override; -}; - -// CTRCipherStream implements BlockAccessCipherStream using an -// Counter operations mode. -// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation -// -// Note: This is a possible implementation of BlockAccessCipherStream, -// it is considered suitable for use. -class CTRCipherStream final : public BlockAccessCipherStream { - private: - BlockCipher& cipher_; - std::string iv_; - uint64_t initialCounter_; - - public: - CTRCipherStream(BlockCipher& c, const char* iv, uint64_t initialCounter) - : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter){}; - virtual ~CTRCipherStream(){}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return cipher_.BlockSize(); } - - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) override; - - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; - - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; -}; - // The encryption provider is used to create a cipher stream for a specific // file. The returned cipher stream will be used for actual // encryption/decryption actions. -class EncryptionProvider { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. 
This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class EncryptionProvider : public Customizable { public: virtual ~EncryptionProvider(){}; + // Creates a new EncryptionProvider from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "CTR", a CTREncryptionProvider will be + // created. If the value is ends with "://test" (e.g CTR://test"), the + // provider will be initialized in "TEST" mode prior to being returned. + // + // @param config_options Options to control how this provider is created + // and initialized. + // @param value The value might be: + // - CTR Create a CTR provider + // - CTR://test Create a CTR provider and initialize it for tests. + // @param result The new provider object + // @return OK if the provider was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + static const char* Type() { return "EncryptionProvider"; } + + // Short-cut method to create a CTR-provider + static std::shared_ptr NewCTRProvider( + const std::shared_ptr& cipher); + // GetPrefixLength returns the length of the prefix that is added to every // file and used for storing encryption options. For optimal performance, the // prefix length should be a multiple of the page size. - virtual size_t GetPrefixLength() = 0; + virtual size_t GetPrefixLength() const = 0; // CreateNewPrefix initialized an allocated block of prefix memory // for a new file. 
virtual Status CreateNewPrefix(const std::string& fname, char* prefix, - size_t prefixLength) = 0; + size_t prefixLength) const = 0; + + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. + // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; // CreateCipherStream creates a block access cipher stream for a file given // given name and options. virtual Status CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, std::unique_ptr* result) = 0; + + // Returns a string representing an encryption marker prefix for this + // provider. If a marker is provided, this marker can be used to tell whether + // or not a file is encrypted by this provider. The maker will also be part + // of any encryption prefix for this provider. + virtual std::string GetMarker() const { return ""; } }; -// This encryption provider uses a CTR cipher stream, with a given block cipher -// and IV. -// -// Note: This is a possible implementation of EncryptionProvider, -// it is considered suitable for use, provided a safe BlockCipher is used. -class CTREncryptionProvider : public EncryptionProvider { - private: - BlockCipher& cipher_; +class EncryptedSequentialFile : public FSSequentialFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + uint64_t offset_; + size_t prefixLength_; + + public: + // Default ctor. Given underlying sequential file is supposed to be at + // offset == prefixLength. 
+ EncryptedSequentialFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + offset_(prefixLength), + prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. + // + // REQUIRES: External synchronization + IOStatus Skip(uint64_t n) override; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + IOStatus InvalidateCache(size_t offset, size_t length) override; + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; +}; +// A file abstraction for randomly reading the contents of a file. 
+class EncryptedRandomAccessFile : public FSRandomAccessFile { protected: - const static size_t defaultPrefixLength = 4096; + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; public: - CTREncryptionProvider(BlockCipher& c) : cipher_(c){}; - virtual ~CTREncryptionProvider() {} + EncryptedRandomAccessFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + // Readahead the file starting from offset by n bytes for caching. + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. 
+ // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + size_t GetUniqueId(char* id, size_t max_size) const override; + + void Hint(AccessPattern pattern) override; + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + IOStatus InvalidateCache(size_t offset, size_t length) override; +}; - // GetPrefixLength returns the length of the prefix that is added to every - // file and used for storing encryption options. For optimal performance, the - // prefix length should be a multiple of the page size. - virtual size_t GetPrefixLength() override; +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. +class EncryptedWritableFile : public FSWritableFile { + protected: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. - virtual Status CreateNewPrefix(const std::string& fname, char* prefix, - size_t prefixLength) override; + public: + // Default ctor. Prefix is assumed to be written already. 
+ EncryptedWritableFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + + // true if Sync() and Fsync() are safe to call concurrently with Append() + // and Flush(). + bool IsSyncThreadSafe() const override; + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + /* + * Get the size of valid data in the file. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + IOStatus InvalidateCache(size_t offset, size_t length) override; + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. 
+ // Default implementation does nothing. + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override; + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override; + + void SetPreallocationBlockSize(size_t size) override; + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override; + + // Pre-allocates space for a file. + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. - virtual Status CreateCipherStream( - const std::string& fname, const EnvOptions& options, Slice& prefix, - std::unique_ptr* result) override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; +}; +// A file abstraction for random reading and writing. +class EncryptedRandomRWFile : public FSRandomRWFile { protected: - // PopulateSecretPrefixPart initializes the data into a new prefix block - // that will be encrypted. This function will store the data in plain text. - // It will be encrypted later (before written to disk). - // Returns the amount of space (starting from the start of the prefix) - // that has been initialized. - virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, - size_t blockSize); - - // CreateCipherStreamFromPrefix creates a block access cipher stream for a - // file given given name and options. 
The given prefix is already decrypted. - virtual Status CreateCipherStreamFromPrefix( - const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, - std::unique_ptr* result); + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; + + public: + EncryptedRandomRWFile(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; +class EncryptedFileSystem : public FileSystemWrapper { + public: + explicit EncryptedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. 
+ // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; + static const char* kClassName() { return "EncryptedFileSystem"; } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return FileSystemWrapper::IsInstanceOf(name); + } + } +}; } // namespace ROCKSDB_NAMESPACE #endif // !defined(ROCKSDB_LITE) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_checksum.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,37 +14,90 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { -// FileChecksumFunc is the function class to generates the checksum value +// The unknown file checksum. +constexpr char kUnknownFileChecksum[] = ""; +// The unknown sst file checksum function name. +constexpr char kUnknownFileChecksumFuncName[] = "Unknown"; +// The standard DB file checksum function name. +// This is the name of the checksum function returned by +// GetFileChecksumGenCrc32cFactory(); +constexpr char kStandardDbFileChecksumFuncName[] = "FileChecksumCrc32c"; + +struct FileChecksumGenContext { + std::string file_name; + // The name of the requested checksum generator. 
+ // Checksum factories may use or ignore requested_checksum_func_name, + // and checksum factories written before this field was available are still + // compatible. + std::string requested_checksum_func_name; +}; + +// FileChecksumGenerator is the class to generates the checksum value // for each file when the file is written to the file system. -class FileChecksumFunc { +// Implementations may assume that +// * Finalize is called at most once during the life of the object +// * All calls to Update come before Finalize +// * All calls to GetChecksum come after Finalize +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileChecksumGenerator { public: - virtual ~FileChecksumFunc() {} - // Return the checksum of concat (A, data[0,n-1]) where init_checksum is the - // returned value of some string A. It is used to maintain the checksum of a - // stream of data - virtual std::string Extend(const std::string& init_checksum, const char* data, - size_t n) = 0; + virtual ~FileChecksumGenerator() {} - // Return the checksum value of data[0,n-1] - virtual std::string Value(const char* data, size_t n) = 0; - - // Return a processed value of the checksum for store in somewhere - virtual std::string ProcessChecksum(const std::string& checksum) = 0; + // Update the current result after process the data. For different checksum + // functions, the temporal results may be stored and used in Update to + // include the new data. + virtual void Update(const char* data, size_t n) = 0; + + // Generate the final results if no further new data will be updated. + virtual void Finalize() = 0; + + // Get the checksum. The result should not be the empty string and may + // include arbitrary bytes, including non-printable characters. 
+ virtual std::string GetChecksum() const = 0; // Returns a name that identifies the current file checksum function. virtual const char* Name() const = 0; }; +// Create the FileChecksumGenerator object for each SST file. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileChecksumGenFactory : public Customizable { + public: + ~FileChecksumGenFactory() override {} + static const char* Type() { return "FileChecksumGenFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + + // Create a new FileChecksumGenerator. + virtual std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) = 0; + + // Return the name of this FileChecksumGenFactory. + const char* Name() const override = 0; +}; + // FileChecksumList stores the checksum information of a list of files (e.g., -// SST files). The FileChecksumLIst can be used to store the checksum +// SST files). The FileChecksumList can be used to store the checksum // information of all SST file getting from the MANIFEST, which are // the checksum information of all valid SST file of a DB instance. It can // also be used to store the checksum information of a list of SST files to // be ingested. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. class FileChecksumList { public: virtual ~FileChecksumList() {} @@ -80,7 +133,14 @@ // Create a new file checksum list. 
extern FileChecksumList* NewFileChecksumList(); -// Create a Crc32c based file checksum function -extern FileChecksumFunc* CreateFileChecksumFuncCrc32c(); +// Return a shared_ptr of the builtin Crc32c based file checksum generator +// factory object, which can be shared to create the Crc32c based checksum +// generator object. +// Note: this implementation is compatible with many other crc32c checksum +// implementations and uses big-endian encoding of the result, unlike most +// other crc32c checksums in RocksDB, which alter the result with +// crc32c::Mask and use little-endian encoding. +extern std::shared_ptr +GetFileChecksumGenCrc32cFactory(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_system.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_system.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/file_system.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/file_system.h 2025-05-19 16:14:27.000000000 +0000 @@ -17,6 +17,7 @@ #pragma once #include + #include #include #include @@ -24,10 +25,14 @@ #include #include #include +#include #include + +#include "rocksdb/customizable.h" #include "rocksdb/env.h" #include "rocksdb/io_status.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "rocksdb/thread_status.h" namespace ROCKSDB_NAMESPACE { @@ -43,6 +48,7 @@ struct ImmutableDBOptions; struct MutableDBOptions; class RateLimiter; +struct ConfigOptions; using AccessPattern = RandomAccessFile::AccessPattern; using FileAttributes = Env::FileAttributes; @@ -77,14 +83,53 @@ // honored. More hints can be added here in the future to indicate things like // storage media (HDD/SSD) to be used, replication level etc. 
struct IOOptions { - // Timeout for the operation in milliseconds - std::chrono::milliseconds timeout; + // Timeout for the operation in microseconds + std::chrono::microseconds timeout; // Priority - high or low IOPriority prio; // Type of data being read/written IOType type; + + // EXPERIMENTAL + // An option map that's opaque to RocksDB. It can be used to implement a + // custom contract between a FileSystem user and the provider. This is only + // useful in cases where a RocksDB user directly uses the FileSystem or file + // object for their own purposes, and wants to pass extra options to APIs + // such as NewRandomAccessFile and NewWritableFile. + std::unordered_map property_bag; + + // Force directory fsync, some file systems like btrfs may skip directory + // fsync, set this to force the fsync + bool force_dir_fsync; + + IOOptions() : IOOptions(false) {} + + explicit IOOptions(bool force_dir_fsync_) + : timeout(std::chrono::microseconds::zero()), + prio(IOPriority::kIOLow), + type(IOType::kUnknown), + force_dir_fsync(force_dir_fsync_) {} +}; + +struct DirFsyncOptions { + enum FsyncReason : uint8_t { + kNewFileSynced, + kFileRenamed, + kDirRenamed, + kFileDeleted, + kDefault, + } reason; + + std::string renamed_new_name; // for kFileRenamed + // add other options for other FsyncReason + + DirFsyncOptions(); + + explicit DirFsyncOptions(std::string file_renamed_new_name); + + explicit DirFsyncOptions(FsyncReason fsync_reason); }; // File scope options that control how a file is opened/created and accessed @@ -95,13 +140,32 @@ // to be issued for the file open/creation IOOptions io_options; - FileOptions() : EnvOptions() {} + // EXPERIMENTAL + // The feature is in development and is subject to change. + // When creating a new file, set the temperature of the file so that + // underlying file systems can put it with appropriate storage media and/or + // coding. 
+ Temperature temperature = Temperature::kUnknown; + + // The checksum type that is used to calculate the checksum value for + // handoff during file writes. + ChecksumType handoff_checksum_type; + + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const EnvOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} + + FileOptions(const FileOptions& opts) + : EnvOptions(opts), + io_options(opts.io_options), + temperature(opts.temperature), + handoff_checksum_type(opts.handoff_checksum_type) {} + + FileOptions& operator=(const FileOptions&) = default; }; // A structure to pass back some debugging information from the FileSystem @@ -116,12 +180,36 @@ // To be set by the FileSystem implementation std::string msg; + // To be set by the underlying FileSystem implementation. + std::string request_id; + + // In order to log required information in IO tracing for different + // operations, Each bit in trace_data stores which corresponding info from + // IODebugContext will be added in the trace. Foreg, if trace_data = 1, it + // means bit at position 0 is set so TraceData::kRequestID (request_id) will + // be logged in the trace record. + // + enum TraceData : char { + // The value of each enum represents the bitwise position for + // that information in trace_data which will be used by IOTracer for + // tracing. Make sure to add them sequentially. + kRequestID = 0, + }; + uint64_t trace_data = 0; + IODebugContext() {} void AddCounter(std::string& name, uint64_t value) { counters.emplace(name, value); } + // Called by underlying file system to set request_id and log request_id in + // IOTracing. 
+ void SetRequestId(const std::string& _request_id) { + request_id = _request_id; + trace_data |= (1 << TraceData::kRequestID); + } + std::string ToString() { std::ostringstream ss; ss << file_path << ", "; @@ -147,7 +235,13 @@ // of the APIs is of type IOStatus, which can indicate an error code/sub-code, // as well as metadata about the error such as its scope and whether its // retryable. -class FileSystem { +// NewCompositeEnv can be used to create an Env with a custom FileSystem for +// DBOptions::env. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class FileSystem : public Customizable { public: FileSystem(); @@ -156,21 +250,61 @@ virtual ~FileSystem(); - virtual const char* Name() const = 0; - static const char* Type() { return "FileSystem"; } + static const char* kDefaultName() { return "DefaultFileSystem"; } // Loads the FileSystem specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status Load(const std::string& value, std::shared_ptr* result); - // Return a default fie_system suitable for the current operating - // system. Sophisticated users may wish to provide their own Env - // implementation instead of relying on this default file_system - // - // The result of Default() belongs to rocksdb and must never be deleted. + // Loads the FileSystem specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // @param config_options Controls how the FileSystem is loaded + // @param value The name and optional properties describing the file system + // to load. + // @param result On success, returns the loaded FileSystem + // @return OK if the FileSystem was successfully loaded. 
+ // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + + // Return a default FileSystem suitable for the current operating + // system. static std::shared_ptr Default(); + // Handles the event when a new DB or a new ColumnFamily starts using the + // specified data paths. + // + // The data paths might be shared by different DBs or ColumnFamilies, + // so RegisterDbPaths might be called with the same data paths. + // For example, when CreateColumnFamily is called multiple times with the same + // data path, RegisterDbPaths will also be called with the same data path. + // + // If the return status is ok, then the paths must be correspondingly + // called in UnregisterDbPaths; + // otherwise this method should have no side effect, and UnregisterDbPaths + // do not need to be called for the paths. + // + // Different implementations may take different actions. + // By default, it's a no-op and returns Status::OK. + virtual Status RegisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // Handles the event a DB or a ColumnFamily stops using the specified data + // paths. + // + // It should be called corresponding to each successful RegisterDbPaths. + // + // Different implementations may take different actions. + // By default, it's a no-op and returns Status::OK. + virtual Status UnregisterDbPaths(const std::vector& /*paths*/) { + return Status::OK(); + } + // Create a brand new sequentially-readable file with the specified name. // On success, stores a pointer to the new file in *result and returns OK. // On failure stores nullptr in *result and returns non-OK. If the file does @@ -216,17 +350,18 @@ std::unique_ptr* result, IODebugContext* dbg) = 0; - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. 
On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. + // Create an object that writes to a file with the specified name. + // `FSWritableFile::Append()`s will append after any existing content. If the + // file does not already exist, creates it. + // + // On success, stores a pointer to the file in *result and returns OK. On + // failure stores nullptr in *result and returns non-OK. // // The returned file will only be accessed by one thread at a time. virtual IOStatus ReopenWritableFile( const std::string& /*fname*/, const FileOptions& /*options*/, std::unique_ptr* /*result*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("ReopenWritableFile"); } // Reuse an existing file by renaming it and opening it as writable. @@ -234,7 +369,7 @@ const std::string& old_fname, const FileOptions& file_opts, std::unique_ptr* result, - IODebugContext* dbg) = 0; + IODebugContext* dbg); // Open `fname` for random read and write, if file doesn't exist the file // will be created. On success, stores a pointer to the new file in @@ -330,6 +465,10 @@ return IOStatus::OK(); } +// This seems to clash with a macro on Windows, so #undef it here +#ifdef DeleteFile +#undef DeleteFile +#endif // Delete the named file. virtual IOStatus DeleteFile(const std::string& fname, const IOOptions& options, @@ -424,7 +563,7 @@ IODebugContext* dbg) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, std::shared_ptr* result, @@ -436,6 +575,10 @@ std::string* output_path, IODebugContext* dbg) = 0; + // Sanitize the FileOptions. 
Typically called by a FileOptions/EnvOptions + // copy constructor + virtual void SanitizeFileOptions(FileOptions* /*opts*/) const {} + // OptimizeForLogRead will create a new FileOptions object that is a copy of // the FileOptions in the parameters, but is optimized for reading log files. virtual FileOptions OptimizeForLogRead(const FileOptions& file_options) const; @@ -473,6 +616,13 @@ const FileOptions& file_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading blob files. + virtual FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace @@ -483,9 +633,13 @@ const IOOptions& /*options*/, uint64_t* /*diskfree*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("GetFreeSpace"); } + virtual IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& options, bool* is_dir, + IODebugContext* /*dgb*/) = 0; + // If you're adding methods here, remember to add them to EnvWrapper too. private: @@ -506,6 +660,10 @@ // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. 
+ // // REQUIRES: External synchronization virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) = 0; @@ -540,7 +698,7 @@ const IOOptions& /*options*/, Slice* /*result*/, char* /*scratch*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedRead"); } // If you're adding methods here, remember to add them to @@ -552,7 +710,8 @@ // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. `result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -582,6 +741,10 @@ // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, @@ -589,19 +752,22 @@ IODebugContext* dbg) const = 0; // Readahead the file starting from offset by n bytes for caching. + // If it's not implemented (default: `NotSupported`), RocksDB will create + // internal prefetch buffer to improve read performance. virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::OK(); + return IOStatus::NotSupported("Prefetch"); } // Read a bunch of blocks as described by reqs. The blocks can // optionally be read in parallel. This is a synchronous call, i.e it // should return after all reads have completed. The reads will be - // non-overlapping. If the function return Status is not ok, status of - // individual requests will be ignored and return status will be assumed - // for all read requests. 
The function return status is only meant for any - // any errors that occur before even processing specific read requests + // non-overlapping but can be in any order. If the function return Status + // is not ok, status of individual requests will be ignored and return + // status will be assumed for all read requests. The function return status + // is only meant for errors that occur before processing individual read + // requests. virtual IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { assert(reqs != nullptr); @@ -656,6 +822,13 @@ // RandomAccessFileWrapper too. }; +// A data structure brings the data verification information, which is +// used together with data being written to a file. +struct DataVerificationInfo { + // checksum of the data being written. + Slice checksum; +}; + // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. @@ -678,11 +851,25 @@ virtual ~FSWritableFile() {} // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual IOStatus Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) = 0; + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). 
+ virtual IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) { + return Append(data, options, dbg); + } + // PositionedAppend data to the specified offset. The new EOF after append // must be larger than the previous EOF. This is to be used when writes are // not backed by OS buffers and hence has to always start from the start of @@ -707,7 +894,23 @@ uint64_t /* offset */, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedAppend"); + } + + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual IOStatus PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const IOOptions& /*options*/, + const DataVerificationInfo& /* verification_info */, + IODebugContext* /*dbg*/) { + return IOStatus::NotSupported("PositionedAppend"); } // Truncate is necessary to trim the file to the correct size @@ -825,7 +1028,8 @@ size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; Allocate(block_size * last_preallocated_block_, - block_size * num_spanned_blocks, options, dbg); + block_size * num_spanned_blocks, options, dbg) + .PermitUncheckedError(); last_preallocated_block_ = new_last_preallocated_block; } } @@ -878,6 +1082,11 @@ // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. 
+ // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -931,6 +1140,15 @@ // Fsync directory. Can be called concurrently from multiple threads. virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0; + // FsyncWithDirOptions after renaming a file. Depends on the filesystem, it + // may fsync directory or just the renaming file (e.g. btrfs). By default, it + // just calls directory fsync. + virtual IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& /*dir_fsync_options*/) { + return Fsync(options, dbg); + } + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { return 0; } @@ -972,11 +1190,15 @@ class FileSystemWrapper : public FileSystem { public: // Initialize an EnvWrapper that delegates all calls to *t - explicit FileSystemWrapper(FileSystem* t) : target_(t) {} + explicit FileSystemWrapper(const std::shared_ptr& t); ~FileSystemWrapper() override {} + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. 
+ const char* Name() const override { return target_->Name(); } + // Return the target to which this Env forwards all calls - FileSystem* target() const { return target_; } + FileSystem* target() const { return target_.get(); } // The following text is boilerplate that forwards all methods to target() IOStatus NewSequentialFile(const std::string& f, @@ -1120,6 +1342,10 @@ return target_->NewLogger(fname, options, result, dbg); } + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeFileOptions(opts); + } + FileOptions OptimizeForLogRead( const FileOptions& file_options) const override { return target_->OptimizeForLogRead(file_options); @@ -1147,19 +1373,37 @@ const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(file_options, db_options); } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, uint64_t* diskfree, IODebugContext* dbg) override { return target_->GetFreeSpace(path, options, diskfree, dbg); } + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override { + return target_->IsDirectory(path, options, is_dir, dbg); + } - private: - FileSystem* target_; + const Customizable* Inner() const override { return target_.get(); } + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + protected: + std::shared_ptr target_; }; class FSSequentialFileWrapper : public FSSequentialFile { public: - explicit FSSequentialFileWrapper(FSSequentialFile* target) - : target_(target) {} + // Creates a FileWrapper around the input File object and without + // 
taking ownership of the object + explicit FSSequentialFileWrapper(FSSequentialFile* t) : target_(t) {} + + FSSequentialFile* target() const { return target_; } IOStatus Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) override { @@ -1183,10 +1427,24 @@ FSSequentialFile* target_; }; +class FSSequentialFileOwnerWrapper : public FSSequentialFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSSequentialFileOwnerWrapper(std::unique_ptr&& t) + : FSSequentialFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSRandomAccessFileWrapper : public FSRandomAccessFile { public: - explicit FSRandomAccessFileWrapper(FSRandomAccessFile* target) - : target_(target) {} + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSRandomAccessFileWrapper(FSRandomAccessFile* t) : target_(t) {} + + FSRandomAccessFile* target() const { return target_; } IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -1214,22 +1472,51 @@ } private: + std::unique_ptr guard_; FSRandomAccessFile* target_; }; +class FSRandomAccessFileOwnerWrapper : public FSRandomAccessFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSRandomAccessFileOwnerWrapper( + std::unique_ptr&& t) + : FSRandomAccessFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSWritableFileWrapper : public FSWritableFile { public: + // Creates a FileWrapper around the input File object and without + // taking ownership of the object explicit FSWritableFileWrapper(FSWritableFile* t) : target_(t) {} + FSWritableFile* target() const { return target_; } + IOStatus Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) override { return 
target_->Append(data, options, dbg); } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + return target_->Append(data, options, verification_info, dbg); + } IOStatus PositionedAppend(const Slice& data, uint64_t offset, const IOOptions& options, IODebugContext* dbg) override { return target_->PositionedAppend(data, offset, options, dbg); } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + const DataVerificationInfo& verification_info, + IODebugContext* dbg) override { + return target_->PositionedAppend(data, offset, options, verification_info, + dbg); + } IOStatus Truncate(uint64_t size, const IOOptions& options, IODebugContext* dbg) override { return target_->Truncate(size, options, dbg); @@ -1302,9 +1589,24 @@ FSWritableFile* target_; }; +class FSWritableFileOwnerWrapper : public FSWritableFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSWritableFileOwnerWrapper(std::unique_ptr&& t) + : FSWritableFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSRandomRWFileWrapper : public FSRandomRWFile { public: - explicit FSRandomRWFileWrapper(FSRandomRWFile* target) : target_(target) {} + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSRandomRWFileWrapper(FSRandomRWFile* t) : target_(t) {} + + FSRandomRWFile* target() const { return target_; } bool use_direct_io() const override { return target_->use_direct_io(); } size_t GetRequiredBufferAlignment() const override { @@ -1336,23 +1638,56 @@ FSRandomRWFile* target_; }; +class FSRandomRWFileOwnerWrapper : public FSRandomRWFileWrapper { + public: + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSRandomRWFileOwnerWrapper(std::unique_ptr&& 
t) + : FSRandomRWFileWrapper(t.get()), guard_(std::move(t)) {} + + private: + std::unique_ptr guard_; +}; + class FSDirectoryWrapper : public FSDirectory { public: - explicit FSDirectoryWrapper(FSDirectory* target) : target_(target) {} + // Creates a FileWrapper around the input File object and takes + // ownership of the object + explicit FSDirectoryWrapper(std::unique_ptr&& t) + : guard_(std::move(t)) { + target_ = guard_.get(); + } + + // Creates a FileWrapper around the input File object and without + // taking ownership of the object + explicit FSDirectoryWrapper(FSDirectory* t) : target_(t) {} IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { return target_->Fsync(options, dbg); } + + IOStatus FsyncWithDirOptions( + const IOOptions& options, IODebugContext* dbg, + const DirFsyncOptions& dir_fsync_options) override { + return target_->FsyncWithDirOptions(options, dbg, dir_fsync_options); + } + size_t GetUniqueId(char* id, size_t max_size) const override { return target_->GetUniqueId(id, max_size); } private: + std::unique_ptr guard_; FSDirectory* target_; }; +// A utility routine: write "data" to the named file. 
+extern IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, + const std::string& fname, + bool should_sync = false); + // A utility routine: read contents of named file into *data -extern Status ReadFileToString(FileSystem* fs, const std::string& fname, - std::string* data); +extern IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, + std::string* data); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/filter_policy.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,50 +20,70 @@ #pragma once #include + +#include #include #include #include #include #include "rocksdb/advanced_options.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class Slice; struct BlockBasedTableOptions; +struct ConfigOptions; // A class that takes a bunch of keys, then generates filter class FilterBitsBuilder { public: virtual ~FilterBitsBuilder() {} - // Add Key to filter, you could use any way to store the key. - // Such as: storing hashes or original keys - // Keys are in sorted order and duplicated keys are possible. + // Add a key (or prefix) to the filter. Typically, a builder will keep + // a set of 64-bit key hashes and only build the filter in Finish + // when the final number of keys is known. Keys are added in sorted order + // and duplicated keys are possible, so typically, the builder will + // only add this key if its hash is different from the most recently + // added. virtual void AddKey(const Slice& key) = 0; + // Called by RocksDB before Finish to populate + // TableProperties::num_filter_entries, so should represent the + // number of unique keys (and/or prefixes) added, but does not have + // to be exact. 
+ virtual size_t EstimateEntriesAdded() { + // Default implementation for backward compatibility. + // 0 conspicuously stands for "unknown". + return 0; + } + // Generate the filter using the keys that are added // The return value of this function would be the filter bits, // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; - // Calculate num of keys that can be added and generate a filter - // <= the specified number of bytes. -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4702) // unreachable code -#endif - virtual int CalculateNumEntry(const uint32_t /*bytes*/) { -#ifndef ROCKSDB_LITE - throw std::runtime_error("CalculateNumEntry not Implemented"); -#else - abort(); -#endif - return 0; + // Approximate the number of keys that can be added and generate a filter + // <= the specified number of bytes. Callers (including RocksDB) should + // only use this result for optimizing performance and not as a guarantee. + // This default implementation is for compatibility with older custom + // FilterBitsBuilders only implementing deprecated CalculateNumEntry. + virtual size_t ApproximateNumEntries(size_t bytes) { + bytes = std::min(bytes, size_t{0xffffffff}); + return static_cast(CalculateNumEntry(static_cast(bytes))); + } + + // Old, DEPRECATED version of ApproximateNumEntries. This is not + // called by RocksDB except as the default implementation of + // ApproximateNumEntries for API compatibility. 
+ virtual int CalculateNumEntry(const uint32_t bytes) { + // DEBUG: ideally should not rely on this implementation + assert(false); + // RELEASE: something reasonably conservative: 2 bytes per entry + return static_cast(bytes / 2); } -#if defined(_MSC_VER) -#pragma warning(pop) -#endif }; // A class that checks if a key can be in filter @@ -93,18 +113,32 @@ // Options for the table being built const BlockBasedTableOptions& table_options; - // Name of the column family for the table (or empty string if unknown) - std::string column_family_name; - - // The compactions style in effect for the table + // BEGIN from (DB|ColumnFamily)Options in effect at table creation time CompactionStyle compaction_style = kCompactionStyleLevel; - // The table level at time of constructing the SST file, or -1 if unknown. - // (The table file could later be used at a different level.) - int level_at_creation = -1; + // Number of LSM levels, or -1 if unknown + int num_levels = -1; // An optional logger for reporting errors, warnings, etc. Logger* info_log = nullptr; + // END from (DB|ColumnFamily)Options + + // Name of the column family for the table (or empty string if unknown) + // TODO: consider changing to Slice + std::string column_family_name; + + // The table level at time of constructing the SST file, or -1 if unknown + // or N/A as in SstFileWriter. (The table file could later be used at a + // different level.) + int level_at_creation = -1; + + // True if known to be going into bottommost sorted run for applicable + // key range (which might not even be last level with data). False + // otherwise. 
+ bool is_bottommost = false; + + // Reason for creating the file with the filter + TableFileCreationReason reason = TableFileCreationReason::kMisc; }; // We add a new format of filter block called full filter block @@ -125,12 +159,27 @@ public: virtual ~FilterPolicy(); + // Creates a new FilterPolicy based on the input value string and returns the + // result The value might be an ID, and ID with properties, or an old-style + // policy string. + // The value describes the FilterPolicy being created. + // For BloomFilters, value may be a ":"-delimited value of the form: + // "bloomfilter:[bits_per_key]:[use_block_based_builder]", + // e.g. ""bloomfilter:4:true" + // The above string is equivalent to calling NewBloomFilterPolicy(4, true). + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + // Return the name of this policy. Note that if the filter encoding // changes in an incompatible way, the name returned by this method // must be changed. Otherwise, old incompatible filters may be // passed to methods of this type. virtual const char* Name() const = 0; + // DEPRECATED: This function is part of the deprecated block-based + // filter, which will be removed in a future release. + // // keys[0,n-1] contains a list of keys (potentially with duplicates) // that are ordered according to the user supplied comparator. // Append a filter that summarizes keys[0,n-1] to *dst. @@ -140,6 +189,9 @@ virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const = 0; + // DEPRECATED: This function is part of the deprecated block-based + // filter, which will be removed in a future release. + // // "filter" contains the data appended by a preceding call to // CreateFilter() on this class. This method must return true if // the key was in the list of keys passed to CreateFilter(). 
@@ -152,6 +204,7 @@ // NOTE: This function is only called by GetBuilderWithContext() below for // custom FilterPolicy implementations. Thus, it is not necessary to // override this function if overriding GetBuilderWithContext(). + // DEPRECATED: This function will be removed in a future release. virtual FilterBitsBuilder* GetFilterBitsBuilder() const { return nullptr; } // A newer variant of GetFilterBitsBuilder that allows a FilterPolicy @@ -197,4 +250,49 @@ // trailing spaces in keys. extern const FilterPolicy* NewBloomFilterPolicy( double bits_per_key, bool use_block_based_builder = false); + +// A new Bloom alternative that saves about 30% space compared to +// Bloom filters, with similar query times but roughly 3-4x CPU time +// and 3x temporary space usage during construction. For example, if +// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same +// 0.95% FP rate as Bloom filter but only using about 7 bits per key. +// +// The space savings of Ribbon filters makes sense for lower (higher +// numbered; larger; longer-lived) levels of LSM, whereas the speed of +// Bloom filters make sense for highest levels of LSM. Setting +// bloom_before_level allows for this design with Level and Universal +// compaction styles. For example, bloom_before_level=1 means that Bloom +// filters will be used in level 0, including flushes, and Ribbon +// filters elsewhere, including FIFO compaction and external SST files. +// For this option, memtable flushes are considered level -1 (so that +// flushes can be distinguished from intra-L0 compaction). +// bloom_before_level=0 (default) -> Generate Bloom filters only for +// flushes under Level and Universal compaction styles. +// bloom_before_level=-1 -> Always generate Ribbon filters (except in +// some extreme or exceptional cases). +// +// Ribbon filters are compatible with RocksDB >= 6.15.0. 
Earlier +// versions reading the data will behave as if no filter was used +// (degraded performance until compaction rebuilds filters). All +// built-in FilterPolicies (Bloom or Ribbon) are able to read other +// kinds of built-in filters. +// +// Note: the current Ribbon filter schema uses some extra resources +// when constructing very large filters. For example, for 100 million +// keys in a single filter (one SST file without partitioned filters), +// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom. +// However, the savings in filter space from just ~60 open SST files +// makes up for the additional temporary memory use. +// +// Also consider using optimize_filters_for_memory to save filter +// memory. +extern const FilterPolicy* NewRibbonFilterPolicy( + double bloom_equivalent_bits_per_key, int bloom_before_level = 0); + +// Old name and old default behavior (DEPRECATED) +inline const FilterPolicy* NewExperimentalRibbonFilterPolicy( + double bloom_equivalent_bits_per_key) { + return NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, -1); +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/flush_block_policy.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,16 +6,23 @@ #pragma once #include + +#include "rocksdb/customizable.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { class Slice; class BlockBuilder; +struct ConfigOptions; struct Options; // FlushBlockPolicy provides a configurable way to determine when to flush a -// block in the block based tables, +// block in the block based tables. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. 
This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. class FlushBlockPolicy { public: // Keep track of the key/value sequences and return the boolean value to @@ -25,10 +32,16 @@ virtual ~FlushBlockPolicy() {} }; -class FlushBlockPolicyFactory { +class FlushBlockPolicyFactory : public Customizable { public: - // Return the name of the flush block policy. - virtual const char* Name() const = 0; + static const char* Type() { return "FlushBlockPolicyFactory"; } + + // Creates a FlushBlockPolicyFactory based on the input value. + // By default, this method can create EveryKey or BySize PolicyFactory, + // which take now config_options. + static Status CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result); // Return a new block flush policy that flushes data blocks by data size. // FlushBlockPolicy may need to access the metadata of the data block @@ -45,9 +58,10 @@ class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { public: - FlushBlockBySizePolicyFactory() {} + FlushBlockBySizePolicyFactory(); - const char* Name() const override { return "FlushBlockBySizePolicyFactory"; } + static const char* kClassName() { return "FlushBlockBySizePolicyFactory"; } + const char* Name() const override { return kClassName(); } FlushBlockPolicy* NewFlushBlockPolicy( const BlockBasedTableOptions& table_options, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/functor_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace detail { +template +struct IndexSequence {}; + +template +struct IndexSequenceHelper + : public IndexSequenceHelper {}; + +template +struct IndexSequenceHelper<0U, Next...> { + using type = IndexSequence; +}; + +template +using make_index_sequence = typename IndexSequenceHelper::type; + +template +void call(Function f, Tuple t, IndexSequence) { + f(std::get(t)...); +} + +template +void call(Function f, Tuple t) { + static constexpr auto size = std::tuple_size::value; + call(f, t, make_index_sequence{}); +} +} // namespace detail + +template +class FunctorWrapper { + public: + explicit FunctorWrapper(std::function functor, Args &&...args) + : functor_(std::move(functor)), args_(std::forward(args)...) {} + + void invoke() { detail::call(functor_, args_); } + + private: + std::function functor_; + std::tuple args_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/io_status.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/io_status.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/io_status.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/io_status.h 2025-05-19 16:14:27.000000000 +0000 @@ -126,6 +126,11 @@ return IOStatus(kIOError, kPathNotFound, msg, msg2); } + static IOStatus IOFenced() { return IOStatus(kIOError, kIOFenced); } + static IOStatus IOFenced(const Slice& msg, const Slice& msg2 = Slice()) { + return IOStatus(kIOError, kIOFenced, msg, msg2); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
// std::string ToString() const; @@ -170,6 +175,9 @@ } inline IOStatus::IOStatus(const IOStatus& s) : Status(s.code_, s.subcode_) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED retryable_ = s.retryable_; data_loss_ = s.data_loss_; scope_ = s.scope_; @@ -179,6 +187,10 @@ // The following condition catches both aliasing (when this == &s), // and the common case where both s and *this are ok. if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED code_ = s.code_; subcode_ = s.subcode_; retryable_ = s.retryable_; @@ -204,16 +216,18 @@ #endif { if (this != &s) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + s.checked_ = true; + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED code_ = std::move(s.code_); s.code_ = kOk; subcode_ = std::move(s.subcode_); s.subcode_ = kNone; retryable_ = s.retryable_; - retryable_ = false; data_loss_ = s.data_loss_; - data_loss_ = false; scope_ = s.scope_; - scope_ = kIOErrorScopeFileSystem; + s.scope_ = kIOErrorScopeFileSystem; delete[] state_; state_ = nullptr; std::swap(state_, s.state_); @@ -222,11 +236,34 @@ } inline bool IOStatus::operator==(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED return (code_ == rhs.code_); } inline bool IOStatus::operator!=(const IOStatus& rhs) const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; + rhs.checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED return !(*this == rhs); } +inline IOStatus status_to_io_status(Status&& status) { + if (status.ok()) { + // Fast path + return IOStatus::OK(); + } else { + const char* state = status.getState(); + if (state) { + return IOStatus(status.code(), status.subcode(), + Slice(state, strlen(status.getState()) + 1), Slice()); + } else { + return IOStatus(status.code(), status.subcode()); + } + } +} + } // 
namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iostats_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,6 +14,32 @@ namespace ROCKSDB_NAMESPACE { +// EXPERIMENTAL: the IO statistics for tiered storage. It matches with each +// item in Temperature class. +struct FileIOByTemperature { + // the number of bytes read to Temperature::kHot file + uint64_t hot_file_bytes_read; + // the number of bytes read to Temperature::kWarm file + uint64_t warm_file_bytes_read; + // the number of bytes read to Temperature::kCold file + uint64_t cold_file_bytes_read; + // total number of reads to Temperature::kHot file + uint64_t hot_file_read_count; + // total number of reads to Temperature::kWarm file + uint64_t warm_file_read_count; + // total number of reads to Temperature::kCold file + uint64_t cold_file_read_count; + // reset all the statistics to 0. + void Reset() { + hot_file_bytes_read = 0; + warm_file_bytes_read = 0; + cold_file_bytes_read = 0; + hot_file_read_count = 0; + warm_file_read_count = 0; + cold_file_read_count = 0; + } +}; + struct IOStatsContext { // reset all io-stats counter to zero void Reset(); @@ -48,9 +74,19 @@ uint64_t cpu_write_nanos; // CPU time spent in read() and pread() uint64_t cpu_read_nanos; + + FileIOByTemperature file_io_stats_by_temperature; }; -// Get Thread-local IOStatsContext object pointer +// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global, +// non-thread-local IOStatsContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. 
+// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local IOStatsContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. IOStatsContext* get_iostats_context(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -53,11 +53,13 @@ // All Seek*() methods clear any error status() that the iterator had prior to // the call; after the seek, status() indicates only the error (if any) that // happened during the seek, not any past errors. + // Target does not contain timestamp. virtual void Seek(const Slice& target) = 0; // Position at the last key in the source that at or before target. // The iterator is Valid() after this call iff the source contains // an entry that comes at or before target. + // Target does not contain timestamp. virtual void SeekForPrev(const Slice& target) = 0; // Moves to the next entry in the source. After this call, Valid() is @@ -90,6 +92,10 @@ // If supported, renew the iterator to represent the latest state. The // iterator will be invalidated after the call. Not supported if // ReadOptions.snapshot is given when creating the iterator. + // + // WARNING: Do not use `Iterator::Refresh()` API on DBs where `DeleteRange()` + // has been used or will be used. This feature combination is neither + // supported nor programmatically prevented. virtual Status Refresh() { return Status::NotSupported("Refresh() is not supported"); } @@ -108,6 +114,11 @@ // Get the user-key portion of the internal key at which the iteration // stopped. 
virtual Status GetProperty(std::string prop_name, std::string* prop); + + virtual Slice timestamp() const { + assert(false); + return Slice(); + } }; // Return an empty iterator (yields nothing). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/listener.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/listener.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/listener.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/listener.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,30 +11,35 @@ #include #include #include + #include "rocksdb/compaction_job_stats.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/io_status.h" #include "rocksdb/status.h" #include "rocksdb/table_properties.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -typedef std::unordered_map> - TablePropertiesCollection; +using TablePropertiesCollection = + std::unordered_map>; class DB; class ColumnFamilyHandle; class Status; struct CompactionJobStats; -enum CompressionType : unsigned char; - -enum class TableFileCreationReason { - kFlush, - kCompaction, - kRecovery, - kMisc, -}; -struct TableFileCreationBriefInfo { - // the name of the database where the file was created +struct FileCreationBriefInfo { + FileCreationBriefInfo() = default; + FileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id) + : db_name(_db_name), + cf_name(_cf_name), + file_path(_file_path), + job_id(_job_id) {} + // the name of the database where the file was created. std::string db_name; // the name of the column family where the file was created. std::string cf_name; @@ -42,7 +47,10 @@ std::string file_path; // the id of the job (which could be flush or compaction) that // created the file. 
- int job_id; + int job_id = 0; +}; + +struct TableFileCreationBriefInfo : public FileCreationBriefInfo { // reason of creating the table. TableFileCreationReason reason; }; @@ -57,6 +65,48 @@ TableProperties table_properties; // The status indicating whether the creation was successful or not. Status status; + // The checksum of the table file being created + std::string file_checksum; + // The checksum function name of checksum generator used for this table file + std::string file_checksum_func_name; +}; + +struct BlobFileCreationBriefInfo : public FileCreationBriefInfo { + BlobFileCreationBriefInfo(const std::string& _db_name, + const std::string& _cf_name, + const std::string& _file_path, int _job_id, + BlobFileCreationReason _reason) + : FileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id), + reason(_reason) {} + // reason of creating the blob file. + BlobFileCreationReason reason; +}; + +struct BlobFileCreationInfo : public BlobFileCreationBriefInfo { + BlobFileCreationInfo(const std::string& _db_name, const std::string& _cf_name, + const std::string& _file_path, int _job_id, + BlobFileCreationReason _reason, + uint64_t _total_blob_count, uint64_t _total_blob_bytes, + Status _status, const std::string& _file_checksum, + const std::string& _file_checksum_func_name) + : BlobFileCreationBriefInfo(_db_name, _cf_name, _file_path, _job_id, + _reason), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes), + status(_status), + file_checksum(_file_checksum), + file_checksum_func_name(_file_checksum_func_name) {} + + // the number of blob in a file. + uint64_t total_blob_count; + // the total bytes in a file. + uint64_t total_blob_bytes; + // The status indicating whether the creation was successful or not. + Status status; + // The checksum of the blob file being created. + std::string file_checksum; + // The checksum function name of checksum generator used for this blob file. 
+ std::string file_checksum_func_name; }; enum class CompactionReason : int { @@ -93,6 +143,10 @@ kExternalSstIngestion, // Compaction due to SST file being too old kPeriodicCompaction, + // Compaction in order to move files to temperature + kChangeTemperature, + // Compaction scheduled to force garbage collection of blob files + kForcedBlobGC, // total number of compaction reasons, new reasons must be added above this. kNumOfReasons, }; @@ -110,13 +164,24 @@ kAutoCompaction = 0x09, kManualFlush = 0x0a, kErrorRecovery = 0xb, + // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable + // will not be called to avoid many small immutable memtables. + kErrorRecoveryRetryFlush = 0xc, + kWalFull = 0xd, }; +// TODO: In the future, BackgroundErrorReason will only be used to indicate +// why the BG Error is happening (e.g., flush, compaction). We may introduce +// other data structure to indicate other essential information such as +// the file type (e.g., Manifest, SST) and special context. enum class BackgroundErrorReason { kFlush, kCompaction, kWriteCallback, kMemTable, + kManifestWrite, + kFlushNoWAL, + kManifestWriteNoWAL, }; enum class WriteStallCondition { @@ -137,30 +202,113 @@ #ifndef ROCKSDB_LITE -struct TableFileDeletionInfo { +struct FileDeletionInfo { + FileDeletionInfo() = default; + + FileDeletionInfo(const std::string& _db_name, const std::string& _file_path, + int _job_id, Status _status) + : db_name(_db_name), + file_path(_file_path), + job_id(_job_id), + status(_status) {} // The name of the database where the file was deleted. std::string db_name; // The path to the deleted file. std::string file_path; // The id of the job which deleted the file. - int job_id; + int job_id = 0; // The status indicating whether the deletion was successful or not. 
Status status; }; +struct TableFileDeletionInfo : public FileDeletionInfo {}; + +struct BlobFileDeletionInfo : public FileDeletionInfo { + BlobFileDeletionInfo(const std::string& _db_name, + const std::string& _file_path, int _job_id, + Status _status) + : FileDeletionInfo(_db_name, _file_path, _job_id, _status) {} +}; + +enum class FileOperationType { + kRead, + kWrite, + kTruncate, + kClose, + kFlush, + kSync, + kFsync, + kRangeSync, + kAppend, + kPositionedAppend, + kOpen +}; + struct FileOperationInfo { - using TimePoint = std::chrono::time_point; + using Duration = std::chrono::nanoseconds; + using SteadyTimePoint = + std::chrono::time_point; + using SystemTimePoint = + std::chrono::time_point; + using StartTimePoint = std::pair; + using FinishTimePoint = SteadyTimePoint; + FileOperationType type; const std::string& path; uint64_t offset; size_t length; - const TimePoint& start_timestamp; - const TimePoint& finish_timestamp; + const Duration duration; + const SystemTimePoint& start_ts; Status status; - FileOperationInfo(const std::string& _path, const TimePoint& start, - const TimePoint& finish) - : path(_path), start_timestamp(start), finish_timestamp(finish) {} + FileOperationInfo(const FileOperationType _type, const std::string& _path, + const StartTimePoint& _start_ts, + const FinishTimePoint& _finish_ts, const Status& _status) + : type(_type), + path(_path), + duration(std::chrono::duration_cast( + _finish_ts - _start_ts.second)), + start_ts(_start_ts.first), + status(_status) {} + static StartTimePoint StartNow() { + return std::make_pair( + std::chrono::system_clock::now(), std::chrono::steady_clock::now()); + } + static FinishTimePoint FinishNow() { + return std::chrono::steady_clock::now(); + } +}; + +struct BlobFileInfo { + BlobFileInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number) + : blob_file_path(_blob_file_path), blob_file_number(_blob_file_number) {} + + std::string blob_file_path; + uint64_t blob_file_number; +}; + 
+struct BlobFileAdditionInfo : public BlobFileInfo { + BlobFileAdditionInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number, + const uint64_t _total_blob_count, + const uint64_t _total_blob_bytes) + : BlobFileInfo(_blob_file_path, _blob_file_number), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes) {} + uint64_t total_blob_count; + uint64_t total_blob_bytes; +}; + +struct BlobFileGarbageInfo : public BlobFileInfo { + BlobFileGarbageInfo(const std::string& _blob_file_path, + const uint64_t _blob_file_number, + const uint64_t _garbage_blob_count, + const uint64_t _garbage_blob_bytes) + : BlobFileInfo(_blob_file_path, _blob_file_number), + garbage_blob_count(_garbage_blob_count), + garbage_blob_bytes(_garbage_blob_bytes) {} + uint64_t garbage_blob_count; + uint64_t garbage_blob_bytes; }; struct FlushJobInfo { @@ -196,6 +344,12 @@ TableProperties table_properties; FlushReason flush_reason; + + // Compression algorithm used for blob output files + CompressionType blob_compression_type; + + // Information about blob files created during flush in Integrated BlobDB. + std::vector blob_file_addition_infos; }; struct CompactionFileInfo { @@ -210,6 +364,7 @@ }; struct CompactionJobInfo { + ~CompactionJobInfo() { status.PermitUncheckedError(); } // the id of the column family where the compaction happened. uint32_t cf_id; // the name of the column family where the compaction happened. @@ -253,9 +408,19 @@ // Compression algorithm used for output files CompressionType compression; - // If non-null, this variable stores detailed information - // about this compaction. + // Statistics and other additional details on the compaction CompactionJobStats stats; + + // Compression algorithm used for blob output files. + CompressionType blob_compression_type; + + // Information about blob files created during compaction in Integrated + // BlobDB. 
+ std::vector blob_file_addition_infos; + + // Information about blob files deleted during compaction in Integrated + // BlobDB. + std::vector blob_file_garbage_infos; }; struct MemTableInfo { @@ -288,18 +453,49 @@ TableProperties table_properties; }; +// Result of auto background error recovery +struct BackgroundErrorRecoveryInfo { + // The original error that triggered the recovery + Status old_bg_error; + + // The final bg_error after all recovery attempts. Status::OK() means + // the recovery was successful and the database is fully operational. + Status new_bg_error; +}; + +struct IOErrorInfo { + IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation, + const std::string& _file_path, size_t _length, uint64_t _offset) + : io_status(_io_status), + operation(_operation), + file_path(_file_path), + length(_length), + offset(_offset) {} + + IOStatus io_status; + FileOperationType operation; + std::string file_path; + size_t length; + uint64_t offset; +}; + // EventListener class contains a set of callback functions that will // be called when specific RocksDB event happens such as flush. It can // be used as a building block for developing custom features such as // stats-collector or external compaction algorithm. // -// Note that callback functions should not run for an extended period of -// time before the function returns, otherwise RocksDB may be blocked. -// For example, it is not suggested to do DB::CompactFiles() (as it may -// run for a long while) or issue many of DB::Put() (as Put may be blocked -// in certain cases) in the same thread in the EventListener callback. -// However, doing DB::CompactFiles() and DB::Put() in another thread is -// considered safe. +// IMPORTANT +// Because compaction is needed to resolve a "writes stopped" condition, +// calling or waiting for any blocking DB write function (no_slowdown=false) +// from a compaction-related listener callback can hang RocksDB. 
For DB +// writes from a callback we recommend a WriteBatch and no_slowdown=true, +// because the WriteBatch can accumulate writes for later in case DB::Write +// returns Status::Incomplete. Similarly, calling CompactRange or similar +// could hang by waiting for a background worker that is occupied until the +// callback returns. +// +// Otherwise, callback functions should not run for an extended period of +// time before the function returns, because this will slow RocksDB. // // [Threading] All EventListener callback will be called using the // actual thread that involves in that specific event. For example, it @@ -310,8 +506,21 @@ // the current thread holding any DB mutex. This is to prevent potential // deadlock and performance issue when using EventListener callback // in a complex way. -class EventListener { +// +// [Exceptions] Exceptions MUST NOT propagate out of overridden functions into +// RocksDB, because RocksDB is not exception-safe. This could cause undefined +// behavior including data loss, unreported corruption, deadlocks, and more. +class EventListener : public Customizable { public: + static const char* Type() { return "EventListener"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& id, + std::shared_ptr* result); + const char* Name() const override { + // Since EventListeners did not have a name previously, we will assume + // an empty name. Instances should override this method. + return ""; + } // A callback function to RocksDB which will be called whenever a // registered RocksDB flushes a file. The default implementation is // no-op. @@ -459,7 +668,27 @@ // operation finishes. virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {} - // If true, the OnFileReadFinish and OnFileWriteFinish will be called. If + // A callback function for RocksDB which will be called whenever a file flush + // operation finishes. 
+ virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file sync + // operation finishes. + virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file + // rangeSync operation finishes. + virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file + // truncate operation finishes. + virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file close + // operation finishes. + virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {} + + // If true, the OnFile*Finish functions will be called. If // false, then they won't be called. virtual bool ShouldBeNotifiedOnFileIO() { return false; } @@ -472,13 +701,56 @@ Status /* bg_error */, bool* /* auto_recovery */) {} + // DEPRECATED // A callback function for RocksDB which will be called once the database // is recovered from read-only mode after an error. When this is called, it // means normal writes to the database can be issued and the user can // initiate any further recovery actions needed - virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} + virtual void OnErrorRecoveryCompleted(Status old_bg_error) { + old_bg_error.PermitUncheckedError(); + } + + // A callback function for RocksDB which will be called once the recovery + // attempt from a background retryable error is completed. The recovery + // may have been successful or not. In either case, the callback is called + // with the old and new error. If info.new_bg_error is Status::OK(), that + // means the recovery succeeded. 
+ virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) { + } + + // A callback function for RocksDB which will be called before + // a blob file is being created. It will follow by OnBlobFileCreated after + // the creation finishes. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileCreationStarted( + const BlobFileCreationBriefInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever + // a blob file is created. + // It will be called whether the file is successfully created or not. User can + // check info.status to see if it succeeded or not. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileCreated(const BlobFileCreationInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever + // a blob file is deleted. + // + // Note that if applications would like to use the passed reference + // outside this function call, they should make copies from these + // returned value. + virtual void OnBlobFileDeleted(const BlobFileDeletionInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever an IO error + // happens. ShouldBeNotifiedOnFileIO should be set to true to get a callback. 
+ virtual void OnIOError(const IOErrorInfo& /*info*/) {} - virtual ~EventListener() {} + ~EventListener() override {} }; #else diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memory_allocator.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,22 +5,23 @@ #pragma once -#include "rocksdb/status.h" - #include +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" + namespace ROCKSDB_NAMESPACE { // MemoryAllocator is an interface that a client can implement to supply custom // memory allocation and deallocation methods. See rocksdb/cache.h for more // information. // All methods should be thread-safe. -class MemoryAllocator { +class MemoryAllocator : public Customizable { public: - virtual ~MemoryAllocator() = default; - - // Name of the cache allocator, printed in the log - virtual const char* Name() const = 0; + static const char* Type() { return "MemoryAllocator"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); // Allocate a block of at least size. Has to be thread-safe. virtual void* Allocate(size_t size) = 0; @@ -34,9 +35,12 @@ // default implementation just returns the allocation size return allocation_size; } + + std::string GetId() const override { return GenerateIndividualId(); } }; struct JemallocAllocatorOptions { + static const char* kName() { return "JemallocAllocatorOptions"; } // Jemalloc tcache cache allocations by size class. For each size class, // it caches between 20 (for large size classes) to 200 (for small size // classes). 
To reduce tcache memory usage in case the allocator is access @@ -45,31 +49,31 @@ bool limit_tcache_size = false; // Lower bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size/4. + // When used with block cache, it is recommended to set it to block_size/4. size_t tcache_size_lower_bound = 1024; // Upper bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size. + // When used with block cache, it is recommended to set it to block_size. size_t tcache_size_upper_bound = 16 * 1024; }; -// Generate memory allocators which allocates through Jemalloc and utilize -// MADV_DONTDUMP through madvice to exclude cache items from core dump. +// Generate memory allocator which allocates through Jemalloc and utilize +// MADV_DONTDUMP through madvise to exclude cache items from core dump. // Applications can use the allocator with block cache to exclude block cache // usage from core dump. // // Implementation details: -// The JemallocNodumpAllocator creates a delicated jemalloc arena, and all -// allocations of the JemallocNodumpAllocator is through the same arena. -// The memory allocator hooks memory allocation of the arena, and call -// madvice() with MADV_DONTDUMP flag to exclude the piece of memory from -// core dump. Side benefit of using single arena would be reduce of jemalloc -// metadata for some workload. +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are through the same arena. +// The memory allocator hooks memory allocation of the arena, and calls +// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. Side benefit of using single arena would be reduction of jemalloc +// metadata for some workloads. 
// // To mitigate mutex contention for using one single arena, jemalloc tcache // (thread-local cache) is enabled to cache unused allocations for future use. -// The tcache normally incur 0.5M extra memory usage per-thread. The usage -// can be reduce by limitting allocation sizes to cache. +// The tcache normally incurs 0.5M extra memory usage per-thread. The usage +// can be reduced by limiting allocation sizes to cache. extern Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/memtablerep.h 2025-05-19 16:14:27.000000000 +0000 @@ -35,11 +35,15 @@ #pragma once -#include #include #include + #include #include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -48,8 +52,9 @@ class LookupKey; class SliceTransform; class Logger; +struct DBOptions; -typedef void* KeyHandle; +using KeyHandle = void*; extern Slice GetLengthPrefixedSlice(const char* data); @@ -59,10 +64,10 @@ // concatenated with values. class KeyComparator { public: - typedef ROCKSDB_NAMESPACE::Slice DecodedType; + using DecodedType = ROCKSDB_NAMESPACE::Slice; virtual DecodedType decode_key(const char* key) const { - // The format of key is frozen and can be terated as a part of the API + // The format of key is frozen and can be treated as a part of the API // contract. Refer to MemTable::Add for details. 
return GetLengthPrefixedSlice(key); } @@ -120,7 +125,7 @@ return true; } - // Same as ::InsertWithHint, but allow concurrnet write + // Same as ::InsertWithHint, but allow concurrent write // // If hint points to nullptr, a new hint will be allocated on heap, otherwise // the hint will be updated to reflect the last insert location. The hint is @@ -194,6 +199,17 @@ return 0; } + // Returns a vector of unique random memtable entries of approximate + // size 'target_sample_size' (this size is not strictly enforced). + virtual void UniqueRandomSample(const uint64_t num_entries, + const uint64_t target_sample_size, + std::unordered_set* entries) { + (void)num_entries; + (void)target_sample_size; + (void)entries; + assert(false); + } + // Report an approximation of how much memory has been used other than memory // that was allocated through the allocator. Safe to call from any thread. virtual size_t ApproximateMemoryUsage() = 0; @@ -230,6 +246,8 @@ virtual void SeekForPrev(const Slice& internal_key, const char* memtable_key) = 0; + virtual void RandomSeek() {} + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. 
virtual void SeekToFirst() = 0; @@ -274,9 +292,14 @@ // This is the base class for all factories that are used by RocksDB to create // new MemTableRep objects -class MemTableRepFactory { +class MemTableRepFactory : public Customizable { public: - virtual ~MemTableRepFactory() {} + ~MemTableRepFactory() override {} + + static const char* Type() { return "MemTableRepFactory"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::unique_ptr* factory); virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, @@ -288,7 +311,7 @@ return CreateMemTableRep(key_cmp, allocator, slice_transform, logger); } - virtual const char* Name() const = 0; + const char* Name() const override = 0; // Return true if the current MemTableRep supports concurrent inserts // Default: false @@ -310,20 +333,27 @@ // seeks with consecutive keys. class SkipListFactory : public MemTableRepFactory { public: - explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} + explicit SkipListFactory(size_t lookahead = 0); + + // Methods for Configurable/Customizable class overrides + static const char* kClassName() { return "SkipListFactory"; } + static const char* kNickName() { return "skip_list"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + std::string GetId() const override; + // Methods for MemTableRepFactory class overrides using MemTableRepFactory::CreateMemTableRep; virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, Logger* logger) override; - virtual const char* Name() const override { return "SkipListFactory"; } bool IsInsertConcurrentlySupported() const override { return true; } bool CanHandleDuplicatedKey() const override { return true; } private: - const size_t lookahead_; + size_t lookahead_; }; #ifndef ROCKSDB_LITE @@ 
-336,17 +366,22 @@ // VectorRep. On initialization, the underlying array will be at least count // bytes reserved for usage. class VectorRepFactory : public MemTableRepFactory { - const size_t count_; + size_t count_; public: - explicit VectorRepFactory(size_t count = 0) : count_(count) {} + explicit VectorRepFactory(size_t count = 0); + // Methods for Configurable/Customizable class overrides + static const char* kClassName() { return "VectorRepFactory"; } + static const char* kNickName() { return "vector"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + // Methods for MemTableRepFactory class overrides using MemTableRepFactory::CreateMemTableRep; virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, Logger* logger) override; - - virtual const char* Name() const override { return "VectorRepFactory"; } }; // This class contains a fixed array of buckets, each diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/merge_operator.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -43,10 +44,16 @@ // // Refer to rocksdb-merge wiki for more details and example implementations. // -class MergeOperator { +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
+class MergeOperator : public Customizable { public: virtual ~MergeOperator() {} static const char* Type() { return "MergeOperator"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& id, + std::shared_ptr* result); // Gives the client a way to express the read -> modify -> write semantics // key: (IN) The key that's associated with this merge operation. @@ -109,7 +116,7 @@ Slice& existing_operand; }; - // This function applies a stack of merge operands in chrionological order + // This function applies a stack of merge operands in chronological order // on top of an existing value. There are two ways in which this method is // being used: // a) During Get() operation, it used to calculate the final value of a key @@ -125,7 +132,7 @@ // In the example above, Get(K) operation will call FullMerge with a base // value of 2 and operands [+1, +2]. Compaction process might decide to // collapse the beginning of the history up to the snapshot by performing - // full Merge with base value of 0 and operands [+1, +2, +7, +3]. + // full Merge with base value of 0 and operands [+1, +2, +7, +4]. virtual bool FullMergeV2(const MergeOperationInput& merge_in, MergeOperationOutput* merge_out) const; @@ -176,7 +183,7 @@ // PartialMergeMulti should combine them into a single merge operation that is // saved into *new_value, and then it should return true. *new_value should // be constructed such that a call to DB::Merge(key, *new_value) would yield - // the same result as subquential individual calls to DB::Merge(key, operand) + // the same result as sequential individual calls to DB::Merge(key, operand) // for each operand in operand_list from front() to back(). // // The string that new_value is pointing to will be empty. @@ -198,7 +205,7 @@ // TODO: the name is currently not stored persistently and thus // no checking is enforced. Client is responsible for providing // consistent MergeOperator between DB opens. 
- virtual const char* Name() const = 0; + virtual const char* Name() const override = 0; // Determines whether the PartialMerge can be called with just a single // merge operand. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/metadata.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/metadata.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/metadata.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/metadata.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,79 +5,86 @@ #pragma once -#include - +#include #include +#include #include #include +#include "rocksdb/options.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -struct ColumnFamilyMetaData; -struct LevelMetaData; -struct SstFileMetaData; -// The metadata that describes a column family. -struct ColumnFamilyMetaData { - ColumnFamilyMetaData() : size(0), file_count(0), name("") {} - ColumnFamilyMetaData(const std::string& _name, uint64_t _size, - const std::vector&& _levels) - : size(_size), name(_name), levels(_levels) {} +// Basic identifiers and metadata for a file in a DB. This only includes +// information considered relevant for taking backups, checkpoints, or other +// services relating to DB file storage. +// This is only appropriate for immutable files, such as SST files or all +// files in a backup. See also LiveFileStorageInfo. +struct FileStorageInfo { + // The name of the file within its directory (e.g. "123456.sst") + std::string relative_filename; + // The directory containing the file, without a trailing '/'. This could be + // a DB path, wal_dir, etc. + std::string directory; + + // The id of the file within a single DB. Set to 0 if the file does not have + // a number (e.g. CURRENT) + uint64_t file_number = 0; + // The type of the file as part of a DB. + FileType file_type = kTempFile; - // The size of this column family in bytes, which is equal to the sum of - // the file size of its "levels". 
- uint64_t size; - // The number of files in this column family. - size_t file_count; - // The name of the column family. - std::string name; - // The metadata of all levels in this column family. - std::vector levels; -}; + // File size in bytes. See also `trim_to_size`. + uint64_t size = 0; -// The metadata that describes a level. -struct LevelMetaData { - LevelMetaData(int _level, uint64_t _size, - const std::vector&& _files) - : level(_level), size(_size), files(_files) {} + // This feature is experimental and subject to change. + Temperature temperature = Temperature::kUnknown; - // The level which this meta data describes. - const int level; - // The size of this level in bytes, which is equal to the sum of - // the file size of its "files". - const uint64_t size; - // The metadata of all sst files in this level. - const std::vector files; + // The checksum of a SST file, the value is decided by the file content and + // the checksum algorithm used for this SST file. The checksum function is + // identified by the file_checksum_func_name. If the checksum function is + // not specified, file_checksum is "0" by default. + std::string file_checksum; + + // The name of the checksum function used to generate the file checksum + // value. If file checksum is not enabled (e.g., sst_file_checksum_func is + // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is + // "Unknown". + std::string file_checksum_func_name; }; -// The metadata that describes a SST file. -struct SstFileMetaData { - SstFileMetaData() - : size(0), - file_number(0), - smallest_seqno(0), - largest_seqno(0), - num_reads_sampled(0), - being_compacted(false), - num_entries(0), - num_deletions(0), - oldest_blob_file_number(0) {} +// Adds to FileStorageInfo the ability to capture the state of files that +// might change in a running DB. 
+struct LiveFileStorageInfo : public FileStorageInfo { + // If non-empty, this string represents the "saved" contents of the file + // for the current context. (This field is used for checkpointing CURRENT + // file.) In that case, size == replacement_contents.size() and file on disk + // should be ignored. If empty string, the file on disk should still have + // "saved" contents. (See trim_to_size.) + std::string replacement_contents; + + // If true, the file on disk is allowed to be larger than `size` but only + // the first `size` bytes should be used for the current context. If false, + // the file is corrupt if size on disk does not equal `size`. + bool trim_to_size = false; +}; + +// The metadata that describes an SST file. (Does not need to extend +// LiveFileStorageInfo because SST files are always immutable.) +struct SstFileMetaData : public FileStorageInfo { + SstFileMetaData() {} SstFileMetaData(const std::string& _file_name, uint64_t _file_number, - const std::string& _path, size_t _size, + const std::string& _directory, size_t _size, SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted, uint64_t _oldest_blob_file_number, + bool _being_compacted, Temperature _temperature, + uint64_t _oldest_blob_file_number, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, std::string& _file_checksum, std::string& _file_checksum_func_name) - : size(_size), - name(_file_name), - file_number(_file_number), - db_path(_path), - smallest_seqno(_smallest_seqno), + : smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), smallestkey(_smallestkey), largestkey(_largestkey), @@ -87,52 +94,61 @@ num_deletions(0), oldest_blob_file_number(_oldest_blob_file_number), oldest_ancester_time(_oldest_ancester_time), - file_creation_time(_file_creation_time), - file_checksum(_file_checksum), - file_checksum_func_name(_file_checksum_func_name) 
{} - - // File size in bytes. - size_t size; - // The name of the file. - std::string name; - // The id of the file. - uint64_t file_number; - // The full path where the file locates. - std::string db_path; + file_creation_time(_file_creation_time) { + if (!_file_name.empty()) { + if (_file_name[0] == '/') { + relative_filename = _file_name.substr(1); + name = _file_name; // Deprecated field + } else { + relative_filename = _file_name; + name = std::string("/") + _file_name; // Deprecated field + } + assert(relative_filename.size() + 1 == name.size()); + assert(relative_filename[0] != '/'); + assert(name[0] == '/'); + } + directory = _directory; + db_path = _directory; // Deprecated field + file_number = _file_number; + file_type = kTableFile; + size = _size; + temperature = _temperature; + file_checksum = _file_checksum; + file_checksum_func_name = _file_checksum_func_name; + } + + SequenceNumber smallest_seqno = 0; // Smallest sequence number in file. + SequenceNumber largest_seqno = 0; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + uint64_t num_reads_sampled = 0; // How many times the file is read. + bool being_compacted = + false; // true if the file is currently being compacted. - SequenceNumber smallest_seqno; // Smallest sequence number in file. - SequenceNumber largest_seqno; // Largest sequence number in file. - std::string smallestkey; // Smallest user defined key in the file. - std::string largestkey; // Largest user defined key in the file. - uint64_t num_reads_sampled; // How many times the file is read. - bool being_compacted; // true if the file is currently being compacted. + uint64_t num_entries = 0; + uint64_t num_deletions = 0; - uint64_t num_entries; - uint64_t num_deletions; - - uint64_t oldest_blob_file_number; // The id of the oldest blob file - // referenced by the file. 
+ uint64_t oldest_blob_file_number = 0; // The id of the oldest blob file + // referenced by the file. // An SST file may be generated by compactions whose input files may // in turn be generated by earlier compactions. The creation time of the - // oldest SST file that is the compaction ancester of this file. - // The timestamp is provided Env::GetCurrentTime(). - // 0 if the information is not available. - uint64_t oldest_ancester_time; - // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). + // oldest SST file that is the compaction ancestor of this file. + // The timestamp is provided SystemClock::GetCurrentTime(). // 0 if the information is not available. - uint64_t file_creation_time; - - // The checksum of a SST file, the value is decided by the file content and - // the checksum algorithm used for this SST file. The checksum function is - // identified by the file_checksum_func_name. If the checksum function is - // not specified, file_checksum is "0" by default. - std::string file_checksum; + // + // Note: for TTL blob files, it contains the start of the expiration range. + uint64_t oldest_ancester_time = 0; + // Timestamp when the SST file is created, provided by + // SystemClock::GetCurrentTime(). 0 if the information is not available. + uint64_t file_creation_time = 0; + + // DEPRECATED: The name of the file within its directory with a + // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct + // instead. + std::string name; - // The name of the checksum function used to generate the file checksum - // value. If file checksum is not enabled (e.g., sst_file_checksum_func is - // null), file_checksum_func_name is UnknownFileChecksumFuncName, which is - // "Unknown". - std::string file_checksum_func_name; + // DEPRECATED: replaced by `directory` in base struct + std::string db_path; }; // The full set of metadata associated with each SST file. 
@@ -142,6 +158,84 @@ LiveFileMetaData() : column_family_name(), level(0) {} }; +// The MetaData that describes a Blob file +struct BlobMetaData { + BlobMetaData() + : blob_file_number(0), + blob_file_size(0), + total_blob_count(0), + total_blob_bytes(0), + garbage_blob_count(0), + garbage_blob_bytes(0) {} + + BlobMetaData(uint64_t _file_number, const std::string& _file_name, + const std::string& _file_path, uint64_t _file_size, + uint64_t _total_blob_count, uint64_t _total_blob_bytes, + uint64_t _garbage_blob_count, uint64_t _garbage_blob_bytes, + const std::string& _file_checksum, + const std::string& _file_checksum_func_name) + : blob_file_number(_file_number), + blob_file_name(_file_name), + blob_file_path(_file_path), + blob_file_size(_file_size), + total_blob_count(_total_blob_count), + total_blob_bytes(_total_blob_bytes), + garbage_blob_count(_garbage_blob_count), + garbage_blob_bytes(_garbage_blob_bytes), + checksum_method(_file_checksum), + checksum_value(_file_checksum_func_name) {} + uint64_t blob_file_number; + std::string blob_file_name; + std::string blob_file_path; + uint64_t blob_file_size; + uint64_t total_blob_count; + uint64_t total_blob_bytes; + uint64_t garbage_blob_count; + uint64_t garbage_blob_bytes; + std::string checksum_method; + std::string checksum_value; +}; + +// The metadata that describes a level. +struct LevelMetaData { + LevelMetaData(int _level, uint64_t _size, + const std::vector&& _files) + : level(_level), size(_size), files(_files) {} + + // The level which this meta data describes. + const int level; + // The size of this level in bytes, which is equal to the sum of + // the file size of its "files". + const uint64_t size; + // The metadata of all sst files in this level. + const std::vector files; +}; + +// The metadata that describes a column family. 
+struct ColumnFamilyMetaData { + ColumnFamilyMetaData() : size(0), file_count(0), name("") {} + ColumnFamilyMetaData(const std::string& _name, uint64_t _size, + const std::vector&& _levels) + : size(_size), name(_name), levels(_levels) {} + + // The size of this column family in bytes, which is equal to the sum of + // the file size of its "levels". + uint64_t size; + // The number of files in this column family. + size_t file_count; + // The name of the column family. + std::string name; + // The metadata of all levels in this column family. + std::vector levels; + + // The total size of all blob files + uint64_t blob_file_size = 0; + // The number of blob files in this column family. + size_t blob_file_count = 0; + // The metadata of the blobs in this column family + std::vector blob_files; +}; + // Metadata returned as output from ExportColumnFamily() and used as input to // CreateColumnFamiliesWithImport(). struct ExportImportFilesMetaData { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/options.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/options.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,7 @@ #include #include + #include #include #include @@ -18,9 +19,14 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/comparator.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/data_structure.h" #include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/listener.h" +#include "rocksdb/sst_partitioner.h" +#include "rocksdb/types.h" #include "rocksdb/universal_compaction.h" #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" @@ -51,36 +57,11 @@ class WalFilter; class FileSystem; -// DB contents are stored in a set of blocks, each of which holds a -// sequence of 
key,value pairs. Each block may be compressed before -// being stored in a file. The following enum describes which -// compression method (if any) is used to compress a block. -enum CompressionType : unsigned char { - // NOTE: do not change the values of existing entries, as these are - // part of the persistent format on disk. - kNoCompression = 0x0, - kSnappyCompression = 0x1, - kZlibCompression = 0x2, - kBZip2Compression = 0x3, - kLZ4Compression = 0x4, - kLZ4HCCompression = 0x5, - kXpressCompression = 0x6, - kZSTD = 0x7, - - // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than - // 0.8.0 or consider a possibility of downgrading the service or copying - // the database files to another service running with an older version of - // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will - // eventually remove the option from the public API. - kZSTDNotFinalCompression = 0x40, - - // kDisableCompressionOption is used to disable some compression options. - kDisableCompressionOption = 0xff, -}; - struct Options; struct DbPath; +using FileTypeSet = SmallEnumSet; + struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // The function recovers options to a previous version. Only 4.6 or later // versions are supported. @@ -148,9 +129,10 @@ // Allows an application to modify/delete a key-value during background // compaction. // - // If the client requires a new compaction filter to be used for different - // compaction runs, it can specify compaction_filter_factory instead of this - // option. The client should specify only one of the two. + // If the client requires a new `CompactionFilter` to be used for different + // compaction runs and/or requires a `CompactionFilter` for table file + // creations outside of compaction, it can specify compaction_filter_factory + // instead of this option. The client should specify only one of the two. 
// compaction_filter takes precedence over compaction_filter_factory if // client specifies both. // @@ -161,12 +143,21 @@ // Default: nullptr const CompactionFilter* compaction_filter = nullptr; - // This is a factory that provides compaction filter objects which allow - // an application to modify/delete a key-value during background compaction. + // This is a factory that provides `CompactionFilter` objects which allow + // an application to modify/delete a key-value during table file creation. // - // A new filter will be created on each compaction run. If multithreaded - // compaction is being used, each created CompactionFilter will only be used - // from a single thread and so does not need to be thread-safe. + // Unlike the `compaction_filter` option, which is used when compaction + // creates a table file, this factory allows using a `CompactionFilter` when a + // table file is created for various reasons. The factory can decide what + // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by + // default the decision is to use a `CompactionFilter` for + // `TableFileCreationReason::kCompaction` only. + // + // Each thread of work involving creating table files will create a new + // `CompactionFilter` when it will be used according to the above + // `TableFileCreationReason`-based decision. This allows the application to + // know about the different ongoing threads of work and makes it unnecessary + // for `CompactionFilter` to provide thread-safety. // // Default: nullptr std::shared_ptr compaction_filter_factory = nullptr; @@ -220,14 +211,18 @@ CompressionType compression; // Compression algorithm that will be used for the bottommost level that - // contain files. + // contain files. The behavior for num_levels = 1 is not well defined. + // Right now, with num_levels = 1, all compaction outputs will use + // bottommost_compression and all flush outputs still use options.compression, + // but the behavior is subject to change. 
// // Default: kDisableCompressionOption (Disabled) CompressionType bottommost_compression = kDisableCompressionOption; // different options for compression algorithms used by bottommost_compression // if it is enabled. To enable it, please see the definition of - // CompressionOptions. + // CompressionOptions. Behavior for num_levels = 1 is the same as + // options.bottommost_compression. CompressionOptions bottommost_compression_opts; // different options for compression algorithms @@ -308,6 +303,15 @@ // Default: nullptr std::shared_ptr compaction_thread_limiter = nullptr; + // If non-nullptr, use the specified factory for a function to determine the + // partitioning of sst files. This helps compaction to split the files + // on interesting boundaries (key prefixes) to make propagation of sst + // files less write amplifying (covering the whole key space). + // THE FEATURE IS STILL EXPERIMENTAL + // + // Default: nullptr + std::shared_ptr sst_partitioner_factory = nullptr; + // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -318,8 +322,24 @@ enum class WALRecoveryMode : char { // Original levelDB recovery - // We tolerate incomplete record in trailing data on all logs - // Use case : This is legacy behavior + // + // We tolerate the last record in any log to be incomplete due to a crash + // while writing it. Zeroed bytes from preallocation are also tolerated in the + // trailing data of any log. + // + // Use case: Applications for which updates, once applied, must not be rolled + // back even after a crash-recovery. In this recovery mode, RocksDB guarantees + // this as long as `WritableFile::Append()` writes are durable. 
In case the + // user needs the guarantee in more situations (e.g., when + // `WritableFile::Append()` writes to page cache, but the user desires this + // guarantee in face of power-loss crash-recovery), RocksDB offers various + // mechanisms to additionally invoke `WritableFile::Sync()` in order to + // strengthen the guarantee. + // + // This differs from `kPointInTimeRecovery` in that, in case a corruption is + // detected during recovery, this mode will refuse to open the DB. Whereas, + // `kPointInTimeRecovery` will stop recovery just before the corruption since + // that is a valid point-in-time to which to recover. kTolerateCorruptedTailRecords = 0x00, // Recover from clean shutdown // We don't expect to find any corruption in the WAL @@ -347,6 +367,86 @@ DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} }; +extern const char* kHostnameForDbHostId; + +enum class CompactionServiceJobStatus : char { + kSuccess, + kFailure, + kUseLocal, +}; + +struct CompactionServiceJobInfo { + std::string db_name; + std::string db_id; + std::string db_session_id; + uint64_t job_id; // job_id is only unique within the current DB and session, + // restart DB will reset the job_id. `db_id` and + // `db_session_id` could help you build unique id across + // different DBs and sessions. + + Env::Priority priority; + + CompactionServiceJobInfo(std::string db_name_, std::string db_id_, + std::string db_session_id_, uint64_t job_id_, + Env::Priority priority_) + : db_name(std::move(db_name_)), + db_id(std::move(db_id_)), + db_session_id(std::move(db_session_id_)), + job_id(job_id_), + priority(priority_) {} +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
+class CompactionService : public Customizable { + public: + static const char* Type() { return "CompactionService"; } + + // Returns the name of this compaction service. + const char* Name() const override = 0; + + // Start the compaction with input information, which can be passed to + // `DB::OpenAndCompact()`. + // job_id is pre-assigned, it will be reset after DB re-open. + // Warning: deprecated, please use the new interface + // `StartV2(CompactionServiceJobInfo, ...)` instead. + virtual CompactionServiceJobStatus Start( + const std::string& /*compaction_service_input*/, uint64_t /*job_id*/) { + return CompactionServiceJobStatus::kUseLocal; + } + + // Start the remote compaction with `compaction_service_input`, which can be + // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the + // information the user might want to know, which includes `job_id`. + virtual CompactionServiceJobStatus StartV2( + const CompactionServiceJobInfo& info, + const std::string& compaction_service_input) { + // Default implementation to call legacy interface, please override and + // replace the legacy implementation + return Start(compaction_service_input, info.job_id); + } + + // Wait compaction to be finish. + // Warning: deprecated, please use the new interface + // `WaitForCompleteV2(CompactionServiceJobInfo, ...)` instead. + virtual CompactionServiceJobStatus WaitForComplete( + uint64_t /*job_id*/, std::string* /*compaction_service_result*/) { + return CompactionServiceJobStatus::kUseLocal; + } + + // Wait for remote compaction to finish. 
+ virtual CompactionServiceJobStatus WaitForCompleteV2( + const CompactionServiceJobInfo& info, + std::string* compaction_service_result) { + // Default implementation to call legacy interface, please override and + // replace the legacy implementation + return WaitForComplete(info.job_id, compaction_service_result); + } + + ~CompactionService() override = default; +}; + struct DBOptions { // The function recovers options to the option as in version 4.6. DBOptions* OldDefaults(int rocksdb_major_version = 4, @@ -389,6 +489,23 @@ // Default: true bool paranoid_checks = true; + // If true, during memtable flush, RocksDB will validate total entries + // read in flush, and compare with counter inserted into it. + // The option is here to turn the feature off in case this new validation + // feature has a bug. + // Default: true + bool flush_verify_memtable_count = true; + + // If true, the log numbers and sizes of the synced WALs are tracked + // in MANIFEST, then during DB recovery, if a synced WAL is missing + // from disk, or the WAL's size does not match the recorded size in + // MANIFEST, an error will be reported and the recovery will be aborted. + // + // Note that this option does not work with secondary instance. + // + // Default: false + bool track_and_verify_wals_in_manifest = false; + // Use the specified object to interact with the environment, // e.g. to read/write files, schedule background work, etc. In the near // future, support for doing storage operations such as read/write files @@ -396,12 +513,7 @@ // Default: Env::Default() Env* env = Env::Default(); - // Use the specified object to interact with the storage to - // read/write files. This is in addition to env. This option should be used - // if the desired storage subsystem provides a FileSystem implementation. - std::shared_ptr file_system = nullptr; - - // Use to control write rate of flush and compaction. Flush has higher + // Use to control write/read rate of flush and compaction. 
Flush has higher // priority than compaction. Rate limiting is disabled if nullptr. // If rate limiter is enabled, bytes_per_sync is set to 1MB by default. // Default: nullptr @@ -456,8 +568,18 @@ // (i.e. the ones that are causing all the space amplification). If set to 0 // (default), we will dynamically choose the WAL size limit to be // [sum of all write_buffer_size * max_write_buffer_number] * 4 - // This option takes effect only when there are more than one column family as - // otherwise the wal size is dictated by the write_buffer_size. + // + // For example, with 15 column families, each with + // write_buffer_size = 128 MB + // max_write_buffer_number = 6 + // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB + // + // The RocksDB wiki has some discussion about how the WAL interacts + // with memtables and flushing of column families. + // https://github.com/facebook/rocksdb/wiki/Column-Families + // + // This option takes effect only when there are more than one column + // family as otherwise the wal size is dictated by the write_buffer_size. // // Default: 0 // @@ -541,7 +663,7 @@ // Dynamically changeable through SetDBOptions() API. int base_background_compactions = -1; - // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the + // DEPRECATED: RocksDB automatically decides this based on the // value of max_background_jobs. For backwards compatibility we will set // `max_background_jobs = max_background_compactions + max_background_flushes` // in the case where user sets at least one of `max_background_compactions` or @@ -563,9 +685,11 @@ // concurrently perform a compaction job by breaking it into multiple, // smaller ones that are run simultaneously. // Default: 1 (i.e. no subcompactions) + // + // Dynamically changeable through SetDBOptions() API. 
uint32_t max_subcompactions = 1; - // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the + // DEPRECATED: RocksDB automatically decides this based on the // value of max_background_jobs. For backwards compatibility we will set // `max_background_jobs = max_background_compactions + max_background_flushes` // in the case where user sets at least one of `max_background_compactions` or @@ -648,7 +772,9 @@ // large amounts of data (such as xfs's allocsize option). size_t manifest_preallocation_size = 4 * 1024 * 1024; - // Allow the OS to mmap file for reading sst tables. Default: false + // Allow the OS to mmap file for reading sst tables. + // Not recommended for 32-bit OS. + // Default: false bool allow_mmap_reads = false; // Allow the OS to mmap file for writing. @@ -675,7 +801,15 @@ // Not supported in ROCKSDB_LITE mode! bool use_direct_io_for_flush_and_compaction = false; - // If false, fallocate() calls are bypassed + // If false, fallocate() calls are bypassed, which disables file + // preallocation. The file space preallocation is used to increase the file + // write/append performance. By default, RocksDB preallocates space for WAL, + // SST, Manifest files, the extra space is truncated when the file is written. + // Warning: if you're using btrfs, we would recommend setting + // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra + // allocated space cannot be freed, which could be significant if you have + // lots of files. More details about this limitation: + // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md bool allow_fallocate = true; // Disable child process inherit open files. Default: true @@ -717,6 +851,23 @@ // Default: true bool advise_random_on_open = true; + // [experimental] + // Used to activate or deactive the Mempurge feature (memtable garbage + // collection). (deactivated by default). 
At every flush, the total useful + // payload (total entries minus garbage entries) is estimated as a ratio + // [useful payload bytes]/[size of a memtable (in bytes)]. This ratio is then + // compared to this `threshold` value: + // - if ratio1.0 : aggressive mempurge. + // 0 < threshold < 1.0: mempurge triggered only for very low useful payload + // ratios. + // [experimental] + double experimental_mempurge_threshold = 0.0; + // Amount of data to build up in memtables across all column // families before writing to disk. // @@ -795,7 +946,7 @@ size_t random_access_max_buffer_size = 1024 * 1024; // This is the maximum buffer size that is used by WritableFileWriter. - // On Windows, we need to maintain an aligned buffer for writes. + // With direct IO, we need to maintain an aligned buffer for writes. // We allow the buffer to grow until it's size hits the limit in buffered // IO and fix the buffer size when using direct IO to ensure alignment of // write requests if the logical sector size is unusual @@ -822,7 +973,7 @@ // Allows OS to incrementally sync files to disk while they are being // written, asynchronously, in the background. This operation can be used // to smooth out write I/Os over time. Users shouldn't rely on it for - // persistency guarantee. + // persistence guarantee. // Issue one request for every bytes_per_sync written. 0 turns it off. // // You may consider using rate_limiter to regulate write rate to device. @@ -1060,16 +1211,9 @@ // Immutable. bool allow_ingest_behind = false; - // Needed to support differential snapshots. - // If set to true then DB will only process deletes with sequence number - // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts). - // Clients are responsible to periodically call this method to advance - // the cutoff time. If this method is never called and preserve_deletes - // is set to true NO deletes will ever be processed. 
- // At the moment this only keeps normal deletes, SingleDeletes will - // not be preserved. + // Deprecated, will be removed in a future release. + // Please try using user-defined timestamp instead. // DEFAULT: false - // Immutable (TODO: make it dynamically changeable) bool preserve_deletes = false; // If enabled it uses two queues for writes, one for the ones with @@ -1124,12 +1268,94 @@ // Default: 0 size_t log_readahead_size = 0; - // If user does NOT provide SST file checksum function, the SST file checksum - // will NOT be used. The single checksum instance are shared by options and - // file writers. Make sure the algorithm is thread safe. + // If user does NOT provide the checksum generator factory, the file checksum + // will NOT be used. A new file checksum generator object will be created + // when a SST file is created. Therefore, each created FileChecksumGenerator + // will only be used from a single thread and so does not need to be + // thread-safe. // // Default: nullptr - std::shared_ptr sst_file_checksum_func = nullptr; + std::shared_ptr file_checksum_gen_factory = nullptr; + + // By default, RocksDB recovery fails if any table file referenced in + // MANIFEST are missing after scanning the MANIFEST. + // Best-efforts recovery is another recovery mode that + // tries to restore the database to the most recent point in time without + // missing file. + // Currently not compatible with atomic flush. Furthermore, WAL files will + // not be used for recovery if best_efforts_recovery is true. + // Default: false + bool best_efforts_recovery = false; + + // It defines how many times db resume is called by a separate thread when + // background retryable IO Error happens. When background retryable IO + // Error happens, SetBGError is called to deal with the error. If the error + // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), + // then db resume is called in background to recover from the error. 
If this + // value is 0 or negative, db resume will not be called. + // + // Default: INT_MAX + int max_bgerror_resume_count = INT_MAX; + + // If max_bgerror_resume_count is >= 2, db resume is called multiple times. + // This option decides how long to wait to retry the next resume if the + // previous resume fails and satisfy redo resume conditions. + // + // Default: 1000000 (microseconds). + uint64_t bgerror_resume_retry_interval = 1000000; + + // It allows user to opt-in to get error messages containing corrupted + // keys/values. Corrupt keys, values will be logged in the + // messages/logs/status that will help users with the useful information + // regarding affected data. By default value is set false to prevent users + // data to be exposed in the logs/messages etc. + // + // Default: false + bool allow_data_in_errors = false; + + // A string identifying the machine hosting the DB. This + // will be written as a property in every SST file written by the DB (or + // by offline writers such as SstFileWriter and RepairDB). It can be useful + // for troubleshooting in memory corruption caused by a failing host when + // writing a file, by tracing back to the writing host. These corruptions + // may not be caught by the checksum since they happen before checksumming. + // If left as default, the table writer will substitute it with the actual + // hostname when writing the SST file. If set to an empty string, the + // property will not be written to the SST file. + // + // Default: hostname + std::string db_host_id = kHostnameForDbHostId; + + // Use this if your DB want to enable checksum handoff for specific file + // types writes. Make sure that the File_system you use support the + // crc32c checksum verification + // Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile. + // NOTE: currently RocksDB only generates crc32c based checksum for the + // handoff. 
If the storage layer has different checksum support, user + // should enble this set as empty. Otherwise,it may cause unexpected + // write failures. + FileTypeSet checksum_handoff_file_types; + + // EXPERIMENTAL + // CompactionService is a feature allows the user to run compactions on a + // different host or process, which offloads the background load from the + // primary host. + // It's an experimental feature, the interface will be changed without + // backward/forward compatibility support for now. Some known issues are still + // under development. + std::shared_ptr compaction_service = nullptr; + + // It indicates, which lowest cache tier we want to + // use for a certain DB. Currently we support volatile_tier and + // non_volatile_tier. They are layered. By setting it to kVolatileTier, only + // the block cache (current implemented volatile_tier) is used. So + // cache entries will not spill to secondary cache (current + // implemented non_volatile_tier), and block cache lookup misses will not + // lookup in the secondary cache. When kNonVolatileBlockTier is used, we use + // both block cache and secondary cache. + // + // Default: kNonVolatileBlockTier + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1141,7 +1367,11 @@ const ColumnFamilyOptions& column_family_options) : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} - // The function recovers options to the option as in version 4.6. + // Change to some default settings from an older version. + // NOT MAINTAINED: This function has not been and is not maintained. + // DEPRECATED: This function might be removed in a future release. + // In general, defaults are changed to suit broad interests. Opting + // out of a change on upgrade should be deliberate and considered. 
Options* OldDefaults(int rocksdb_major_version = 4, int rocksdb_minor_version = 6); @@ -1164,6 +1394,12 @@ // Use this if your DB is very small (like under 1GB) and you don't want to // spend lots of memory for memtables. Options* OptimizeForSmallDb(); + + // Disable some checks that should not be necessary in the absence of + // software logic errors or CPU+memory hardware errors. This can improve + // write speeds but is only recommended for temporary use. Does not + // change protection against corrupt storage (e.g. verify_checksums). + Options* DisableExtraChecks(); }; // @@ -1204,19 +1440,28 @@ // Default: nullptr const Slice* iterate_lower_bound; - // "iterate_upper_bound" defines the extent upto which the forward iterator + // "iterate_upper_bound" defines the extent up to which the forward iterator // can returns entries. Once the bound is reached, Valid() will be false. // "iterate_upper_bound" is exclusive ie the bound value is - // not a valid entry. If prefix_extractor is not null, the Seek target - // and iterate_upper_bound need to have the same prefix. - // This is because ordering is not guaranteed outside of prefix domain. + // not a valid entry. If prefix_extractor is not null: + // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used + // to infer whether prefix iterating (e.g. applying prefix bloom filter) + // can be used within RocksDB. This is done by comparing + // iterate_upper_bound with the seek key. + // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes + // effect if it shares the same prefix as the seek key. If + // iterate_upper_bound is outside the prefix of the seek key, then keys + // returned outside the prefix range will be undefined, just as if + // iterate_upper_bound = null. + // If iterate_upper_bound is not null, SeekToLast() will position the iterator + // at the first key smaller than iterate_upper_bound. 
// // Default: nullptr const Slice* iterate_upper_bound; // RocksDB does auto-readahead for iterators on noticing more than two reads // for a table file. The readahead starts at 8KB and doubles on every - // additional read upto 256KB. + // additional read up to 256KB. // This option can help if most of the range scans are large, and if it is // determined that a larger readahead than that enabled by auto-readahead is // needed. @@ -1242,11 +1487,12 @@ // Default: true bool verify_checksums; - // Should the "data block"/"index block"" read for this iteration be placed in + // Should the "data block"/"index block" read for this iteration be placed in // block cache? // Callers may wish to set this field to false for bulk scans. // This would help not to the change eviction order of existing items in the - // block cache. Default: true + // block cache. + // Default: true bool fill_cache; // Specify to create a tailing iterator -- a special iterator that has a @@ -1267,13 +1513,15 @@ // If true when calling Get(), we also skip prefix bloom when reading from // block based table. It provides a way to read existing data after // changing implementation of prefix extractor. + // Default: false bool total_order_seek; // When true, by default use total_order_seek = true, and RocksDB can // selectively enable prefix seek mode if won't generate a different result // from total_order_seek, based on seek key, and iterator upper bound. - // Not suppported in ROCKSDB_LITE mode, in the way that even with value true + // Not supported in ROCKSDB_LITE mode, in the way that even with value true // prefix mode is not used. + // Default: false bool auto_prefix_mode; // Enforce that the iterator only iterates over the same prefix as the seek. @@ -1298,9 +1546,11 @@ // Default: false bool background_purge_on_iterator_cleanup; - // If true, keys deleted using the DeleteRange() API will be visible to - // readers until they are naturally deleted during compaction. 
This improves - // read performance in DBs with many range deletions. + // If true, range tombstones handling will be skipped in key lookup paths. + // For DB instances that don't use DeleteRange() calls, this setting can + // be used to optimize the read performance. + // Note that, if this assumption (of no previous DeleteRange() calls) is + // broken, stale keys could be served in read paths. // Default: false bool ignore_range_deletions; @@ -1312,10 +1562,8 @@ // Default: empty (every table will be scanned) std::function table_filter; - // Needed to support differential snapshots. Has 2 effects: - // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum - // 2) if this param > 0 iterator will return INTERNAL keys instead of - // user keys; e.g. return tombstones as well. + // Deprecated, will be removed in a future release. + // Please try using user-defined timestamp instead. // Default: 0 (don't filter by seqnum, return user keys) SequenceNumber iter_start_seqnum; @@ -1323,9 +1571,52 @@ // specified timestamp. All timestamps of the same database must be of the // same length and format. The user is responsible for providing a customized // compare function via Comparator to order tuples. + // For iterator, iter_start_ts is the lower bound (older) and timestamp + // serves as the upper bound. Versions of the same record that fall in + // the timestamp range will be returned. If iter_start_ts is nullptr, + // only the most recent version visible to timestamp is returned. // The user-specified timestamp feature is still under active development, // and the API is subject to change. + // Default: nullptr const Slice* timestamp; + const Slice* iter_start_ts; + + // Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + // in microseconds. + // It should be set to microseconds since epoch, i.e, gettimeofday or + // equivalent plus allowed duration in microseconds. The best way is to use + // env->NowMicros() + some timeout. 
+ // This is best efforts. The call may exceed the deadline if there is IO + // involved and the file system doesn't support deadlines, or due to + // checking for deadline periodically rather than for every key if + // processing a batch + std::chrono::microseconds deadline; + + // A timeout in microseconds to be passed to the underlying FileSystem for + // reads. As opposed to deadline, this determines the timeout for each + // individual file read request. If a MultiGet/Get/Seek/Next etc call + // results in multiple reads, each read can last up to io_timeout us. + std::chrono::microseconds io_timeout; + + // It limits the maximum cumulative value size of the keys in batch while + // reading through MultiGet. Once the cumulative value size exceeds this + // soft limit then all the remaining keys are returned with status Aborted. + // + // Default: std::numeric_limits::max() + uint64_t value_size_soft_limit; + + // For iterators, RocksDB does auto-readahead on noticing more than two + // sequential reads for a table file if user doesn't provide readahead_size. + // The readahead starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size only when reads are sequential. However at each + // level, if iterator moves over next file, readahead_size starts again from + // 8KB. + // + // By enabling this option, RocksDB will do some enhancements for + // prefetching the data. + // + // Default: false + bool adaptive_readahead; ReadOptions(); ReadOptions(bool cksum, bool cache); @@ -1371,7 +1662,7 @@ bool no_slowdown; // If true, this write request is of lower priority if compaction is - // behind. In this case, no_slowdown = true, the request will be cancelled + // behind. In this case, no_slowdown = true, the request will be canceled // immediately with Status::Incomplete() returned. Otherwise, it will be // slowed down. The slowdown value is determined by RocksDB to guarantee // it introduces minimum impacts to high priority writes. 
@@ -1486,6 +1777,15 @@ bool allow_write_stall = false; // If > 0, it will replace the option in the DBOptions for this compaction. uint32_t max_subcompactions = 0; + // Set user-defined timestamp low bound, the data with older timestamp than + // low bound maybe GCed by compaction. Default: nullptr + Slice* full_history_ts_low = nullptr; + + // Allows cancellation of an in-progress manual compaction. + // + // Cancellation can be delayed waiting on automatic compactions when used + // together with `exclusive_manual_compaction == true`. + std::atomic* canceled = nullptr; }; // IngestExternalFileOptions is used by IngestExternalFile() @@ -1505,7 +1805,7 @@ bool allow_blocking_flush = true; // Set to true if you would like duplicate keys in the file being ingested // to be skipped rather than overwriting existing data under that key. - // Usecase: back-fill of some historical data in the database without + // Use case: back-fill of some historical data in the database without // over-writing existing newer version of data. // This option could only be used if the DB has been running // with allow_ingest_behind=true since the dawn of time. @@ -1535,6 +1835,26 @@ // Using a large readahead size (> 2MB) can typically improve the performance // of forward iteration on spinning disks. size_t verify_checksums_readahead_size = 0; + // Set to TRUE if user wants to verify the sst file checksum of ingested + // files. The DB checksum function will generate the checksum of each + // ingested file (if file_checksum_gen_factory is set) and compare the + // checksum function name and checksum with the ingested checksum information. + // + // If this option is set to True: 1) if DB does not enable checksum + // (file_checksum_gen_factory == nullptr), the ingested checksum information + // will be ignored; 2) If DB enable the checksum function, we calculate the + // sst file checksum after the file is moved or copied and compare the + // checksum and checksum name. 
If checksum or checksum function name does + // not match, ingestion will be failed. If the verification is successful, + // checksum and checksum function name will be stored in Manifest. + // If this option is set to FALSE, 1) if DB does not enable checksum, + // the ingested checksum information will be ignored; 2) if DB enable the + // checksum, we only verify the ingested checksum function name and we + // trust the ingested checksum. If the checksum function name matches, we + // store the checksum in Manifest. DB does not calculate the checksum during + // ingestion. However, if no checksum information is provided with the + // ingested files, DB will generate the checksum and store in the Manifest. + bool verify_file_checksum = true; }; enum TraceFilterType : uint64_t { @@ -1543,7 +1863,13 @@ // Do not trace the get operations kTraceFilterGet = 0x1 << 0, // Do not trace the write operations - kTraceFilterWrite = 0x1 << 1 + kTraceFilterWrite = 0x1 << 1, + // Do not trace the `Iterator::Seek()` operations + kTraceFilterIteratorSeek = 0x1 << 2, + // Do not trace the `Iterator::SeekForPrev()` operations + kTraceFilterIteratorSeekForPrev = 0x1 << 3, + // Do not trace the `MultiGet()` operations + kTraceFilterMultiGet = 0x1 << 4, }; // TraceOptions is used for StartTrace @@ -1556,6 +1882,13 @@ uint64_t sampling_frequency = 1; // Note: The filtering happens before sampling. uint64_t filter = kTraceFilterNone; + // When true, the order of write records in the trace will match the order of + // the corresponding write records in the WAL and applied to the DB. There may + // be a performance penalty associated with preserving this ordering. + // + // Default: false. This means write records in the trace may be in an order + // different from the WAL's order. 
+ bool preserve_write_order = false; }; // ImportColumnFamilyOptions is used by ImportColumnFamily() @@ -1584,4 +1917,35 @@ double files_size_error_margin = -1.0; }; +struct CompactionServiceOptionsOverride { + // Currently pointer configurations are not passed to compaction service + // compaction so the user needs to set it. It will be removed once pointer + // configuration passing is supported. + Env* env = Env::Default(); + std::shared_ptr file_checksum_gen_factory = nullptr; + + const Comparator* comparator = BytewiseComparator(); + std::shared_ptr merge_operator = nullptr; + const CompactionFilter* compaction_filter = nullptr; + std::shared_ptr compaction_filter_factory = nullptr; + std::shared_ptr prefix_extractor = nullptr; + std::shared_ptr table_factory; + std::shared_ptr sst_partitioner_factory = nullptr; + + // statistics is used to collect DB operation metrics, the metrics won't be + // returned to CompactionService primary host, to collect that, the user needs + // to set it here. + std::shared_ptr statistics = nullptr; +}; + +#ifndef ROCKSDB_LITE +struct LiveFilesStorageInfoOptions { + // Whether to populate FileStorageInfo::file_checksum* or leave blank + bool include_checksum_info = false; + // Flushes memtables if total size in bytes of live WAL files is >= this + // number. Default: always force a flush without checking sizes. 
+ uint64_t wal_size_for_flush = 0; +}; +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/perf_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,10 +30,10 @@ // total number of user key returned (only include keys that are found, does // not include keys that are deleted or merged without a final put - uint64_t user_key_return_count; + uint64_t user_key_return_count = 0; // total nanos spent on reading data from SST files - uint64_t get_from_table_nanos; + uint64_t get_from_table_nanos = 0; uint64_t block_cache_hit_count = 0; // total number of block cache hits uint64_t block_cache_miss_count = 0; // total number of block cache misses @@ -57,7 +57,7 @@ // enable per level perf context and allocate storage for PerfContextByLevel void EnablePerLevelPerfContext(); - // temporarily disable per level perf contxt by setting the flag to false + // temporarily disable per level perf context by setting the flag to false void DisablePerLevelPerfContext(); // free the space for PerfContextByLevel, also disable per level perf context @@ -74,6 +74,9 @@ uint64_t filter_block_read_count; // total number of filter block reads uint64_t compression_dict_block_read_count; // total number of compression // dictionary block reads + + uint64_t secondary_cache_hit_count; // total number of secondary cache hits + uint64_t block_checksum_time; // total nanos spent on block checksum uint64_t block_decompress_time; // total nanos spent on block decompression @@ -221,12 +224,24 @@ uint64_t iter_prev_cpu_nanos; uint64_t iter_seek_cpu_nanos; + // Time spent in encrypting data. Populated when EncryptedEnv is used. 
+ uint64_t encrypt_data_nanos; + // Time spent in decrypting data. Populated when EncryptedEnv is used. + uint64_t decrypt_data_nanos; + std::map* level_to_perf_context = nullptr; bool per_level_perf_context_enabled = false; }; -// Get Thread-local PerfContext object pointer -// if defined(NPERF_CONTEXT), then the pointer is not thread-local +// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global, +// non-thread-local PerfContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local PerfContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. PerfContext* get_perf_context(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/persistent_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,14 +24,14 @@ // cache interface is specifically designed for persistent read cache. class PersistentCache { public: - typedef std::vector> StatsType; + using StatsType = std::vector>; virtual ~PersistentCache() {} // Insert to page cache // // page_key Identifier to identify a page uniquely across restarts - // data Page data + // data Page data to copy (caller retains ownership) // size Size of the page virtual Status Insert(const Slice& key, const char* data, const size_t size) = 0; @@ -56,6 +56,12 @@ virtual StatsType Stats() = 0; virtual std::string GetPrintableOptions() const = 0; + + // Return a new numeric id. 
May be used by multiple clients who are + // sharding the same persistent cache to partition the key space. Typically + // the client will allocate a new id at startup and prepend the id to its + // cache keys. + virtual uint64_t NewId() = 0; }; // Factor method to create a new persistent cache diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rate_limiter.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,12 +9,17 @@ #pragma once +#include "rocksdb/customizable.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" +#include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { -class RateLimiter { +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class RateLimiter : public Customizable { public: enum class OpType { // Limitation: we currently only invoke Request() with OpType::kRead for @@ -28,11 +33,20 @@ kAllIo, }; + static const char* Type() { return "RateLimiter"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + // For API compatibility, default to rate-limiting writes only. - explicit RateLimiter(Mode mode = Mode::kWritesOnly) : mode_(mode) {} + explicit RateLimiter(Mode mode = Mode::kWritesOnly); virtual ~RateLimiter() {} + // Deprecated. Will be removed in a major release. Derived classes + // should implement this method. + virtual const char* Name() const override { return ""; } + // This API allows user to dynamically change rate limiter's bytes per second. 
// REQUIRED: bytes_per_second > 0 virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0; @@ -45,13 +59,15 @@ // Request for token for bytes. If this request can not be satisfied, the call // is blocked. Caller is responsible to make sure // bytes <= GetSingleBurstBytes() + // and bytes >= 0. virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) { assert(false); } // Request for token for bytes and potentially update statistics. If this // request can not be satisfied, the call is blocked. Caller is responsible to - // make sure bytes <= GetSingleBurstBytes(). + // make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. virtual void Request(const int64_t bytes, const Env::IOPriority pri, Statistics* /* stats */) { // For API compatibility, default implementation calls the older API in @@ -62,7 +78,8 @@ // Requests token to read or write bytes and potentially updates statistics. // // If this request can not be satisfied, the call is blocked. Caller is - // responsible to make sure bytes <= GetSingleBurstBytes(). + // responsible to make sure bytes <= GetSingleBurstBytes() + // and bytes >= 0. virtual void Request(const int64_t bytes, const Env::IOPriority pri, Statistics* stats, OpType op_type) { if (IsRateLimited(op_type)) { @@ -89,6 +106,20 @@ virtual int64_t GetTotalRequests( const Env::IOPriority pri = Env::IO_TOTAL) const = 0; + // Total # of requests that are pending for bytes in rate limiter + // For convenience, this function is supported by the RateLimiter returned + // by NewGenericRateLimiter but is not required by RocksDB. 
+ // + // REQUIRED: total_pending_request != nullptr + virtual Status GetTotalPendingRequests( + int64_t* total_pending_requests, + const Env::IOPriority pri = Env::IO_TOTAL) const { + assert(total_pending_requests != nullptr); + (void)total_pending_requests; + (void)pri; + return Status::NotSupported(); + } + virtual int64_t GetBytesPerSecond() const = 0; virtual bool IsRateLimited(OpType op_type) { @@ -105,7 +136,7 @@ Mode GetMode() { return mode_; } private: - const Mode mode_; + Mode mode_; }; // Create a RateLimiter object, which can be shared among RocksDB instances to diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/rocksdb_namespace.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,12 @@ #pragma once +// For testing purposes +#if ROCKSDB_NAMESPACE == 42 +#undef ROCKSDB_NAMESPACE +#endif + +// Normal logic #ifndef ROCKSDB_NAMESPACE #define ROCKSDB_NAMESPACE rocksdb #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/secondary_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,85 @@ +// Copyright (c) 2021, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include + +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/customizable.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A handle for lookup result. The handle may not be immediately ready or +// have a valid value. The caller must call isReady() to determine if its +// ready, and call Wait() in order to block until it becomes ready. +// The caller must call value() after it becomes ready to determine if the +// handle successfullly read the item. +class SecondaryCacheResultHandle { + public: + virtual ~SecondaryCacheResultHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; + +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SecondaryCache : public Customizable { + public: + virtual ~SecondaryCache() {} + + static const char* Type() { return "SecondaryCache"; } + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. 
The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const override = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/slice_transform.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,26 +14,41 @@ #pragma once +#include #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { class Slice; +struct ConfigOptions; -/* - * A SliceTransform is a generic pluggable way of transforming one string - * to another. Its primary use-case is in configuring rocksdb - * to store prefix blooms by setting prefix_extractor in - * ColumnFamilyOptions. - */ -class SliceTransform { +// A SliceTransform is a generic pluggable way of transforming one string +// to another. Its primary use-case is in configuring rocksdb +// to store prefix blooms by setting prefix_extractor in +// ColumnFamilyOptions. 
+// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SliceTransform : public Customizable { public: virtual ~SliceTransform(){}; // Return the name of this transformation. - virtual const char* Name() const = 0; + virtual const char* Name() const override = 0; + static const char* Type() { return "SliceTransform"; } + + // Creates and configures a new SliceTransform from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* result); + + // Returns a string representation of this SliceTransform, representing the ID + // and any additional properties + std::string AsString() const; // Extract a prefix from a specified key. This method is called when // a key is inserted into the db, and the returned slice is used to @@ -54,7 +69,7 @@ // prefix size of 4. // // Wiki documentation here: - // https://github.com/facebook/rocksdb/wiki/Prefix-Seek-API-Changes + // https://github.com/facebook/rocksdb/wiki/Prefix-Seek // virtual bool InDomain(const Slice& key) const = 0; @@ -62,7 +77,7 @@ virtual bool InRange(const Slice& /*dst*/) const { return false; } // Some SliceTransform will have a full length which can be used to - // determine if two keys are consecuitive. Can be disabled by always + // determine if two keys are consecutive. Can be disabled by always // returning 0 virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } @@ -94,10 +109,15 @@ } }; +// The prefix is the first `prefix_len` bytes of the key, and keys shorter +// then `prefix_len` are not InDomain. extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); +// The prefix is the first min(length(key),`cap_len`) bytes of the key, and +// all keys are InDomain. 
extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); +// Prefix is equal to key. All keys are InDomain. extern const SliceTransform* NewNoopTransform(); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_dump_tool.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,7 +11,7 @@ class SSTDumpTool { public: - int Run(int argc, char** argv, Options options = Options()); + int Run(int argc, char const* const* argv, Options options = Options()); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,7 @@ #include #include "rocksdb/file_system.h" +#include "rocksdb/statistics.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { @@ -18,17 +19,17 @@ class Env; class Logger; -// SstFileManager is used to track SST files in the DB and control their -// deletion rate. -// All SstFileManager public functions are thread-safe. -// SstFileManager is not extensible. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. +// SstFileManager is NOT an extensible interface but a public interface for +// result of NewSstFileManager. Any derived classes must be RocksDB internal. 
class SstFileManager { public: virtual ~SstFileManager() {} // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature; maximum allowed // space will be infinite (Default value). @@ -42,14 +43,14 @@ // other background functions may continue, such as logging and flushing. virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. virtual bool IsMaxAllowedSpaceReached() = 0; - // Returns true if the total size of SST files as well as estimated size - // of ongoing compactions exceeds the maximums allowed space usage. + // Returns true if the total size of SST and blob files as well as estimated + // size of ongoing compactions exceeds the maximums allowed space usage. virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0; // Return the total size of all tracked files. @@ -80,10 +81,13 @@ // Return the total size of trash files // thread-safe virtual uint64_t GetTotalTrashSize() = 0; + + // Set the statistics ptr to dump the stat information + virtual void SetStatisticsPtr(const std::shared_ptr& stats) = 0; }; // Create a new SstFileManager that can be shared among multiple RocksDB -// instances to track SST file and control there deletion rate. +// instances to track SST and blob files and control there deletion rate. // Even though SstFileManager don't track WAL files but it still control // there deletion rate. 
// diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -34,6 +34,8 @@ largest_key(""), smallest_range_del_key(""), largest_range_del_key(""), + file_checksum(""), + file_checksum_func_name(""), sequence_number(0), file_size(0), num_entries(0), @@ -50,6 +52,8 @@ largest_key(_largest_key), smallest_range_del_key(""), largest_range_del_key(""), + file_checksum(""), + file_checksum_func_name(""), sequence_number(_sequence_number), file_size(_file_size), num_entries(_num_entries), @@ -62,6 +66,8 @@ std::string smallest_range_del_key; // smallest range deletion user key in file std::string largest_range_del_key; // largest range deletion user key in file + std::string file_checksum; // sst file checksum; + std::string file_checksum_func_name; // The name of file checksum function SequenceNumber sequence_number; // sequence number of all keys in file uint64_t file_size; // file size in bytes uint64_t num_entries; // number of entries in file @@ -80,6 +86,9 @@ // hint that this file pages is not needed every time we write 1MB to the // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be // passed. + // The `skip_filters` option is DEPRECATED and could be removed in the + // future. Use `BlockBasedTableOptions::filter_policy` to control filter + // generation. SstFileWriter(const EnvOptions& env_options, const Options& options, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, @@ -103,21 +112,40 @@ // Add a Put key with value to currently opened file (deprecated) // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. 
ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); // Add a Put key with value to currently opened file // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. Status Put(const Slice& user_key, const Slice& value); + // Add a Put (key with timestamp, value) to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value); + // Add a Merge key with value to currently opened file // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. Status Merge(const Slice& user_key, const Slice& value); // Add a deletion key to currently opened file // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: comparator is *not* timestamp-aware. Status Delete(const Slice& user_key); + // Add a deletion key with timestamp to the currently opened file + // REQUIRES: key is after any previously added key according to the + // comparator. + // REQUIRES: the timestamp's size is equal to what is expected by + // the comparator. + Status Delete(const Slice& user_key, const Slice& timestamp); + // Add a range deletion tombstone to currently opened file + // REQUIRES: comparator is *not* timestamp-aware. Status DeleteRange(const Slice& begin_key, const Slice& end_key); // Finalize writing to sst file and close file. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/sst_partitioner.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +enum PartitionerResult : char { + // Partitioner does not require to create new file + kNotRequired = 0x0, + // Partitioner is requesting forcefully to create new file + kRequired = 0x1 + // Additional constants can be added +}; + +struct PartitionerRequest { + PartitionerRequest(const Slice& prev_user_key_, + const Slice& current_user_key_, + uint64_t current_output_file_size_) + : prev_user_key(&prev_user_key_), + current_user_key(¤t_user_key_), + current_output_file_size(current_output_file_size_) {} + const Slice* prev_user_key; + const Slice* current_user_key; + uint64_t current_output_file_size; +}; + +/* + * A SstPartitioner is a generic pluggable way of defining the partition + * of SST files. Compaction job will split the SST files on partition boundary + * to lower the write amplification during SST file promote to higher level. + */ +class SstPartitioner { + public: + virtual ~SstPartitioner() {} + + // Return the name of this partitioner. + virtual const char* Name() const = 0; + + // It is called for all keys in compaction. When partitioner want to create + // new SST file it needs to return true. 
It means compaction job will finish + // current SST file where last key is "prev_user_key" parameter and start new + // SST file where first key is "current_user_key". Returns decision if + // partition boundary was detected and compaction should create new file. + virtual PartitionerResult ShouldPartition( + const PartitionerRequest& request) = 0; + + // Called with smallest and largest keys in SST file when compaction try to do + // trivial move. Returns true is partitioner allows to do trivial move. + virtual bool CanDoTrivialMove(const Slice& smallest_user_key, + const Slice& largest_user_key) = 0; + + // Context information of a compaction run + struct Context { + // Does this compaction run include all data files + bool is_full_compaction; + // Is this compaction requested by the client (true), + // or is it occurring as an automatic compaction process + bool is_manual_compaction; + // Output level for this compaction + int output_level; + // Smallest key for compaction + Slice smallest_user_key; + // Largest key for compaction + Slice largest_user_key; + }; +}; + +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class SstPartitionerFactory : public Customizable { + public: + ~SstPartitionerFactory() override {} + static const char* Type() { return "SstPartitionerFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + + virtual std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& context) const = 0; + + // Returns a name that identifies this partitioner factory. + const char* Name() const override = 0; +}; + +/* + * Fixed key prefix partitioner. It splits the output SST files when prefix + * defined by size changes. 
+ */ +class SstPartitionerFixedPrefix : public SstPartitioner { + public: + explicit SstPartitionerFixedPrefix(size_t len) : len_(len) {} + + virtual ~SstPartitionerFixedPrefix() override {} + + const char* Name() const override { return "SstPartitionerFixedPrefix"; } + + PartitionerResult ShouldPartition(const PartitionerRequest& request) override; + + bool CanDoTrivialMove(const Slice& smallest_user_key, + const Slice& largest_user_key) override; + + private: + size_t len_; +}; + +/* + * Factory for fixed prefix partitioner. + */ +class SstPartitionerFixedPrefixFactory : public SstPartitionerFactory { + public: + explicit SstPartitionerFixedPrefixFactory(size_t len); + + ~SstPartitionerFixedPrefixFactory() override {} + + static const char* kClassName() { return "SstPartitionerFixedPrefixFactory"; } + const char* Name() const override { return kClassName(); } + + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override; + + private: + size_t len_; +}; + +extern std::shared_ptr +NewSstPartitionerFixedPrefixFactory(size_t prefix_len); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/statistics.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/statistics.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/statistics.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/statistics.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,17 +13,19 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { /** - * Keep adding ticker's here. - * 1. Any ticker should be added before TICKER_ENUM_MAX. + * Keep adding tickers here. + * 1. Any ticker should be added immediately before TICKER_ENUM_MAX, taking + * over its old value. * 2. Add a readable string in TickersNameMap below for the newly added ticker. * 3. 
Add a corresponding enum value to TickerType.java in the java API * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType - * and toCppTickers + * and toCppTickers */ enum Tickers : uint32_t { // total block cache misses @@ -117,7 +119,7 @@ COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. // Deletions obsoleted before bottom level due to file gap optimization. COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, - // If a compaction was cancelled in sfm to prevent ENOSPC + // If a compaction was canceled in sfm to prevent ENOSPC COMPACTION_CANCELLED, // Number of keys written to the database via the Put and Write call's @@ -183,7 +185,7 @@ // over large number of keys with same userkey. NUMBER_OF_RESEEKS_IN_ITERATION, - // Record the number of calls to GetUpadtesSince. Useful to keep track of + // Record the number of calls to GetUpdatesSince. Useful to keep track of // transaction log iterator refreshes GET_UPDATES_SINCE_CALLS, BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache @@ -205,6 +207,14 @@ COMPACT_WRITE_BYTES, // Bytes written during compaction FLUSH_WRITE_BYTES, // Bytes written during flush + // Compaction read and write statistics broken down by CompactionReason + COMPACT_READ_BYTES_MARKED, + COMPACT_READ_BYTES_PERIODIC, + COMPACT_READ_BYTES_TTL, + COMPACT_WRITE_BYTES_MARKED, + COMPACT_WRITE_BYTES_PERIODIC, + COMPACT_WRITE_BYTES_TTL, + // Number of table's properties loaded directly from file, without creating // table reader object. NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, @@ -239,35 +249,42 @@ NUMBER_ITER_SKIP, // BlobDB specific stats - // # of Put/PutTTL/PutUntil to BlobDB. + // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_PUT, - // # of Write to BlobDB. + // # of Write to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_WRITE, - // # of Get to BlobDB. + // # of Get to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_GET, - // # of MultiGet to BlobDB. 
+ // # of MultiGet to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_MULTIGET, - // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. + // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. Only + // applicable to legacy BlobDB. BLOB_DB_NUM_SEEK, - // # of Next to BlobDB iterator. + // # of Next to BlobDB iterator. Only applicable to legacy BlobDB. BLOB_DB_NUM_NEXT, - // # of Prev to BlobDB iterator. + // # of Prev to BlobDB iterator. Only applicable to legacy BlobDB. BLOB_DB_NUM_PREV, - // # of keys written to BlobDB. + // # of keys written to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_KEYS_WRITTEN, - // # of keys read from BlobDB. + // # of keys read from BlobDB. Only applicable to legacy BlobDB. BLOB_DB_NUM_KEYS_READ, - // # of bytes (key + value) written to BlobDB. + // # of bytes (key + value) written to BlobDB. Only applicable to legacy + // BlobDB. BLOB_DB_BYTES_WRITTEN, - // # of bytes (keys + value) read from BlobDB. + // # of bytes (keys + value) read from BlobDB. Only applicable to legacy + // BlobDB. BLOB_DB_BYTES_READ, - // # of keys written by BlobDB as non-TTL inlined value. + // # of keys written by BlobDB as non-TTL inlined value. Only applicable to + // legacy BlobDB. BLOB_DB_WRITE_INLINED, - // # of keys written by BlobDB as TTL inlined value. + // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy + // BlobDB. BLOB_DB_WRITE_INLINED_TTL, - // # of keys written by BlobDB as non-TTL blob value. + // # of keys written by BlobDB as non-TTL blob value. Only applicable to + // legacy BlobDB. BLOB_DB_WRITE_BLOB, - // # of keys written by BlobDB as TTL blob value. + // # of keys written by BlobDB as TTL blob value. Only applicable to legacy + // BlobDB. BLOB_DB_WRITE_BLOB_TTL, // # of bytes written to blob file. BLOB_DB_BLOB_FILE_BYTES_WRITTEN, @@ -276,22 +293,24 @@ // # of times a blob files being synced. 
BLOB_DB_BLOB_FILE_SYNCED, // # of blob index evicted from base DB by BlobDB compaction filter because - // of expiration. + // of expiration. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, // size of blob index evicted from base DB by BlobDB compaction filter - // because of expiration. + // because of expiration. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, // # of blob index evicted from base DB by BlobDB compaction filter because - // of corresponding file deleted. + // of corresponding file deleted. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EVICTED_COUNT, // size of blob index evicted from base DB by BlobDB compaction filter - // because of corresponding file deleted. + // because of corresponding file deleted. Only applicable to legacy BlobDB. BLOB_DB_BLOB_INDEX_EVICTED_SIZE, - // # of blob files that were obsoleted by garbage collection. + // # of blob files that were obsoleted by garbage collection. Only applicable + // to legacy BlobDB. BLOB_DB_GC_NUM_FILES, - // # of blob files generated by garbage collection. + // # of blob files generated by garbage collection. Only applicable to legacy + // BlobDB. BLOB_DB_GC_NUM_NEW_FILES, - // # of BlobDB garbage collection failures. + // # of BlobDB garbage collection failures. Only applicable to legacy BlobDB. BLOB_DB_GC_FAILURES, // # of keys dropped by BlobDB garbage collection because they had been // overwritten. DEPRECATED. @@ -309,11 +328,14 @@ BLOB_DB_GC_BYTES_EXPIRED, // # of bytes relocated to new blob file by garbage collection. BLOB_DB_GC_BYTES_RELOCATED, - // # of blob files evicted because of BlobDB is full. + // # of blob files evicted because of BlobDB is full. Only applicable to + // legacy BlobDB. BLOB_DB_FIFO_NUM_FILES_EVICTED, - // # of keys in the blob files evicted because of BlobDB is full. + // # of keys in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. 
BLOB_DB_FIFO_NUM_KEYS_EVICTED, - // # of bytes in the blob files evicted because of BlobDB is full. + // # of bytes in the blob files evicted because of BlobDB is full. Only + // applicable to legacy BlobDB. BLOB_DB_FIFO_BYTES_EVICTED, // These counters indicate a performance issue in WritePrepared transactions. @@ -342,6 +364,67 @@ BLOCK_CACHE_COMPRESSION_DICT_ADD, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, + + // # of blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD + BLOCK_CACHE_ADD_REDUNDANT, + // # of index blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_INDEX_ADD_REDUNDANT <= BLOCK_CACHE_INDEX_ADD + BLOCK_CACHE_INDEX_ADD_REDUNDANT, + // # of filter blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_FILTER_ADD_REDUNDANT <= BLOCK_CACHE_FILTER_ADD + BLOCK_CACHE_FILTER_ADD_REDUNDANT, + // # of data blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_DATA_ADD_REDUNDANT <= BLOCK_CACHE_DATA_ADD + BLOCK_CACHE_DATA_ADD_REDUNDANT, + // # of dict blocks redundantly inserted into block cache. + // REQUIRES: BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT + // <= BLOCK_CACHE_COMPRESSION_DICT_ADD + BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + + // # of files marked as trash by sst file manager and will be deleted + // later by background thread. + FILES_MARKED_TRASH, + // # of files deleted immediately by sst file manger through delete scheduler. 
+ FILES_DELETED_IMMEDIATELY, + + // The counters for error handler, not that, bg_io_error is the subset of + // bg_error and bg_retryable_io_error is the subset of bg_io_error + ERROR_HANDLER_BG_ERROR_COUNT, + ERROR_HANDLER_BG_IO_ERROR_COUNT, + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + ERROR_HANDLER_AUTORESUME_COUNT, + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + + // Statistics for memtable garbage collection: + // Raw bytes of data (payload) present on memtable at flush time. + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + // Outdated bytes of data present on memtable at flush time. + MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + + // Secondary cache statistics + SECONDARY_CACHE_HITS, + + // Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. + VERIFY_CHECKSUM_READ_BYTES, + + // Bytes read/written while creating backups + BACKUP_READ_BYTES, + BACKUP_WRITE_BYTES, + + // Remote compaction read/write statistics + REMOTE_COMPACT_READ_BYTES, + REMOTE_COMPACT_WRITE_BYTES, + + // Tiered storage related statistics + HOT_FILE_READ_BYTES, + WARM_FILE_READ_BYTES, + COLD_FILE_READ_BYTES, + HOT_FILE_READ_COUNT, + WARM_FILE_READ_COUNT, + COLD_FILE_READ_COUNT, + TICKER_ENUM_MAX }; @@ -400,21 +483,23 @@ READ_NUM_MERGE_OPERANDS, // BlobDB specific stats - // Size of keys written to BlobDB. + // Size of keys written to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_KEY_SIZE, - // Size of values written to BlobDB. + // Size of values written to BlobDB. Only applicable to legacy BlobDB. BLOB_DB_VALUE_SIZE, - // BlobDB Put/PutWithTTL/PutUntil/Write latency. + // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy + // BlobDB. BLOB_DB_WRITE_MICROS, - // BlobDB Get lagency. + // BlobDB Get latency. Only applicable to legacy BlobDB. BLOB_DB_GET_MICROS, - // BlobDB MultiGet latency. + // BlobDB MultiGet latency. Only applicable to legacy BlobDB. 
BLOB_DB_MULTIGET_MICROS, - // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. + // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. Only applicable to + // legacy BlobDB. BLOB_DB_SEEK_MICROS, - // BlobDB Next latency. + // BlobDB Next latency. Only applicable to legacy BlobDB. BLOB_DB_NEXT_MICROS, - // BlobDB Prev latency. + // BlobDB Prev latency. Only applicable to legacy BlobDB. BLOB_DB_PREV_MICROS, // Blob file write latency. BLOB_DB_BLOB_FILE_WRITE_MICROS, @@ -432,6 +517,17 @@ FLUSH_TIME, SST_BATCH_SIZE, + // MultiGet stats logged per level + // Num of index and filter blocks read from file system per level. + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + // Num of data blocks read from file system per level. + NUM_DATA_BLOCKS_READ_PER_LEVEL, + // Num of sst files read from file system per level. + NUM_SST_READ_PER_LEVEL, + + // Error handler statistics + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + HISTOGRAM_ENUM_MAX, }; @@ -456,6 +552,10 @@ // Usage: // options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); enum StatsLevel : uint8_t { + // Disable all metrics + kDisableAll, + // Disable tickers + kExceptTickers = kDisableAll, // Disable timer stats, and skip histogram stats kExceptHistogramOrTimers, // Skip timer stats @@ -481,10 +581,21 @@ // options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); // HistogramData hist; // options.statistics->histogramData(FLUSH_TIME, &hist); -class Statistics { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
+class Statistics : public Customizable { public: - virtual ~Statistics() {} + ~Statistics() override {} static const char* Type() { return "Statistics"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::shared_ptr* result); + // Default name of empty, for backwards compatibility. Derived classes should + // override this method. + // This default implementation will likely be removed in a future release + const char* Name() const override { return ""; } virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; @@ -516,7 +627,10 @@ // Resets all ticker and histogram stats virtual Status Reset() { return Status::NotSupported("Not implemented"); } - // String representation of the statistic object. +#ifndef ROCKSDB_LITE + using Customizable::ToString; +#endif // ROCKSDB_LITE + // String representation of the statistic object. Must be thread-safe. virtual std::string ToString() const { // Do nothing by default return std::string("ToString(): not implemented"); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/stats_history.h 2025-05-19 16:14:27.000000000 +0000 @@ -53,6 +53,7 @@ // REQUIRES: Valid() virtual uint64_t GetStatsTime() const = 0; + // DEPRECATED (was never used) virtual int GetFormatVersion() const { return -1; } // Return the current stats history as an std::map which specifies the diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/status.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/status.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/status.h 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/status.h 2025-05-19 16:14:27.000000000 +0000 @@ -16,7 +16,17 @@ #pragma once +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED +#include +#include +#endif + #include + +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED +#include "port/stack_trace.h" +#endif + #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -25,7 +35,16 @@ public: // Create a success status. Status() : code_(kOk), subcode_(kNone), sev_(kNoError), state_(nullptr) {} - ~Status() { delete[] state_; } + ~Status() { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + if (!checked_) { + fprintf(stderr, "Failed to check Status %p\n", this); + port::PrintStack(); + abort(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + delete[] state_; + } // Copy the specified status. Status(const Status& s); @@ -43,6 +62,17 @@ bool operator==(const Status& rhs) const; bool operator!=(const Status& rhs) const; + // In case of intentionally swallowing an error, user must explicitly call + // this function. That way we are easily able to search the code to find where + // error swallowing occurs. 
+ inline void PermitUncheckedError() const { MarkChecked(); } + + inline void MustCheck() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } + enum Code : unsigned char { kOk = 0, kNotFound = 1, @@ -63,7 +93,10 @@ kMaxCode }; - Code code() const { return code_; } + Code code() const { + MarkChecked(); + return code_; + } enum SubCode : unsigned char { kNone = 0, @@ -78,10 +111,16 @@ kPathNotFound = 9, KMergeOperandsInsufficientCapacity = 10, kManualCompactionPaused = 11, + kOverwritten = 12, + kTxnNotPrepared = 13, + kIOFenced = 14, kMaxSubCode }; - SubCode subcode() const { return subcode_; } + SubCode subcode() const { + MarkChecked(); + return subcode_; + } enum Severity : unsigned char { kNoError = 0, @@ -93,21 +132,43 @@ }; Status(const Status& s, Severity sev); - Severity severity() const { return sev_; } + + Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) + : Status(_code, _subcode, msg, "", _sev) {} + + Severity severity() const { + MarkChecked(); + return sev_; + } // Returns a C style string indicating the message of the Status - const char* getState() const { return state_; } + const char* getState() const { + MarkChecked(); + return state_; + } // Return a success status. static Status OK() { return Status(); } + // Successful, though an existing something was overwritten + // Note: using variants of OK status for program logic is discouraged, + // but it can be useful for communicating statistical information without + // changing public APIs. + static Status OkOverwritten() { return Status(kOk, kOverwritten); } + // Return error status of an appropriate type. 
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kNotFound, msg, msg2); } + // Fast path for not found without malloc; static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); } + static Status NotFound(SubCode sc, const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kNotFound, sc, msg, msg2); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kCorruption, msg, msg2); } @@ -217,60 +278,126 @@ return Status(kIOError, kPathNotFound, msg, msg2); } + static Status TxnNotPrepared() { + return Status(kInvalidArgument, kTxnNotPrepared); + } + static Status TxnNotPrepared(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2); + } + // Returns true iff the status indicates success. - bool ok() const { return code() == kOk; } + bool ok() const { + MarkChecked(); + return code() == kOk; + } + + // Returns true iff the status indicates success *with* something + // overwritten + bool IsOkOverwritten() const { + MarkChecked(); + return code() == kOk && subcode() == kOverwritten; + } // Returns true iff the status indicates a NotFound error. - bool IsNotFound() const { return code() == kNotFound; } + bool IsNotFound() const { + MarkChecked(); + return code() == kNotFound; + } // Returns true iff the status indicates a Corruption error. - bool IsCorruption() const { return code() == kCorruption; } + bool IsCorruption() const { + MarkChecked(); + return code() == kCorruption; + } // Returns true iff the status indicates a NotSupported error. - bool IsNotSupported() const { return code() == kNotSupported; } + bool IsNotSupported() const { + MarkChecked(); + return code() == kNotSupported; + } // Returns true iff the status indicates an InvalidArgument error. 
- bool IsInvalidArgument() const { return code() == kInvalidArgument; } + bool IsInvalidArgument() const { + MarkChecked(); + return code() == kInvalidArgument; + } // Returns true iff the status indicates an IOError. - bool IsIOError() const { return code() == kIOError; } + bool IsIOError() const { + MarkChecked(); + return code() == kIOError; + } // Returns true iff the status indicates an MergeInProgress. - bool IsMergeInProgress() const { return code() == kMergeInProgress; } + bool IsMergeInProgress() const { + MarkChecked(); + return code() == kMergeInProgress; + } // Returns true iff the status indicates Incomplete - bool IsIncomplete() const { return code() == kIncomplete; } + bool IsIncomplete() const { + MarkChecked(); + return code() == kIncomplete; + } // Returns true iff the status indicates Shutdown In progress - bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } + bool IsShutdownInProgress() const { + MarkChecked(); + return code() == kShutdownInProgress; + } - bool IsTimedOut() const { return code() == kTimedOut; } + bool IsTimedOut() const { + MarkChecked(); + return code() == kTimedOut; + } - bool IsAborted() const { return code() == kAborted; } + bool IsAborted() const { + MarkChecked(); + return code() == kAborted; + } bool IsLockLimit() const { + MarkChecked(); return code() == kAborted && subcode() == kLockLimit; } // Returns true iff the status indicates that a resource is Busy and // temporarily could not be acquired. - bool IsBusy() const { return code() == kBusy; } + bool IsBusy() const { + MarkChecked(); + return code() == kBusy; + } - bool IsDeadlock() const { return code() == kBusy && subcode() == kDeadlock; } + bool IsDeadlock() const { + MarkChecked(); + return code() == kBusy && subcode() == kDeadlock; + } // Returns true iff the status indicated that the operation has Expired. 
- bool IsExpired() const { return code() == kExpired; } + bool IsExpired() const { + MarkChecked(); + return code() == kExpired; + } // Returns true iff the status indicates a TryAgain error. // This usually means that the operation failed, but may succeed if // re-attempted. - bool IsTryAgain() const { return code() == kTryAgain; } + bool IsTryAgain() const { + MarkChecked(); + return code() == kTryAgain; + } // Returns true iff the status indicates the proposed compaction is too large - bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + bool IsCompactionTooLarge() const { + MarkChecked(); + return code() == kCompactionTooLarge; + } // Returns true iff the status indicates Column Family Dropped - bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; } + bool IsColumnFamilyDropped() const { + MarkChecked(); + return code() == kColumnFamilyDropped; + } // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" @@ -278,6 +405,7 @@ // with a specific subcode, enabling users to take the appropriate action // if needed bool IsNoSpace() const { + MarkChecked(); return (code() == kIOError) && (subcode() == kNoSpace); } @@ -285,6 +413,7 @@ // cases where we limit the memory used in certain operations (eg. the size // of a write batch) in order to avoid out of memory exceptions. bool IsMemoryLimit() const { + MarkChecked(); return (code() == kAborted) && (subcode() == kMemoryLimit); } @@ -293,52 +422,76 @@ // directory" error condition. A PathNotFound error is an I/O error with // a specific subcode, enabling users to take appropriate action if necessary bool IsPathNotFound() const { - return (code() == kIOError) && (subcode() == kPathNotFound); + MarkChecked(); + return (code() == kIOError || code() == kNotFound) && + (subcode() == kPathNotFound); } // Returns true iff the status indicates manual compaction paused. 
This // is caused by a call to PauseManualCompaction bool IsManualCompactionPaused() const { + MarkChecked(); return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); } + // Returns true iff the status indicates a TxnNotPrepared error. + bool IsTxnNotPrepared() const { + MarkChecked(); + return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared); + } + + // Returns true iff the status indicates a IOFenced error. + bool IsIOFenced() const { + MarkChecked(); + return (code() == kIOError) && (subcode() == kIOFenced); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; protected: - // A nullptr state_ (which is always the case for OK) means the message - // is empty. - // of the following form: - // state_[0..3] == length of message - // state_[4..] == message Code code_; SubCode subcode_; Severity sev_; + // A nullptr state_ (which is at least the case for OK) means the extra + // message is empty. const char* state_; +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + mutable bool checked_ = false; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED explicit Status(Code _code, SubCode _subcode = kNone) : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} - Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2, + Severity sev = kNoError); Status(Code _code, const Slice& msg, const Slice& msg2) : Status(_code, kNone, msg, msg2) {} static const char* CopyState(const char* s); + + inline void MarkChecked() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } }; inline Status::Status(const Status& s) : code_(s.code_), subcode_(s.subcode_), sev_(s.sev_) { + s.MarkChecked(); state_ = (s.state_ == nullptr) ? 
nullptr : CopyState(s.state_); } inline Status::Status(const Status& s, Severity sev) : code_(s.code_), subcode_(s.subcode_), sev_(sev) { + s.MarkChecked(); state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline Status& Status::operator=(const Status& s) { - // The following condition catches both aliasing (when this == &s), - // and the common case where both s and *this are ok. if (this != &s) { + s.MarkChecked(); + MustCheck(); code_ = s.code_; subcode_ = s.subcode_; sev_ = s.sev_; @@ -353,6 +506,7 @@ noexcept #endif : Status() { + s.MarkChecked(); *this = std::move(s); } @@ -362,6 +516,8 @@ #endif { if (this != &s) { + s.MarkChecked(); + MustCheck(); code_ = std::move(s.code_); s.code_ = kOk; subcode_ = std::move(s.subcode_); @@ -376,10 +532,14 @@ } inline bool Status::operator==(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); return (code_ == rhs.code_); } inline bool Status::operator!=(const Status& rhs) const { + MarkChecked(); + rhs.MarkChecked(); return !(*this == rhs); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/system_clock.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,116 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#include + +#include + +#include "rocksdb/customizable.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +#ifdef _WIN32 +// Windows API macro interference +#undef GetCurrentTime +#endif + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; + +// A SystemClock is an interface used by the rocksdb implementation to access +// operating system time-related functionality. +class SystemClock : public Customizable { + public: + virtual ~SystemClock() {} + + static const char* Type() { return "SystemClock"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + // The name of this system clock + virtual const char* Name() const override = 0; + + // The name/nickname for the Default SystemClock. This name can be used + // to determine if the clock is the default one. + static const char* kDefaultName() { return "DefaultClock"; } + + // Return a default SystemClock suitable for the current operating + // system. + static const std::shared_ptr& Default(); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // Returns the number of micro-seconds of CPU time used by the current thread. + // 0 indicates not supported. + virtual uint64_t CPUMicros() { return 0; } + + // Returns the number of nano-seconds of CPU time used by the current thread. + // Default implementation simply relies on CPUMicros. 
+ // 0 indicates not supported. + virtual uint64_t CPUNanos() { return CPUMicros() * 1000; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; +}; + +// Wrapper class for a SystemClock. Redirects all methods (except Name) +// of the SystemClock interface to the target/wrapped class. +class SystemClockWrapper : public SystemClock { + public: + explicit SystemClockWrapper(const std::shared_ptr& t); + + uint64_t NowMicros() override { return target_->NowMicros(); } + + uint64_t NowNanos() override { return target_->NowNanos(); } + + uint64_t CPUMicros() override { return target_->CPUMicros(); } + + uint64_t CPUNanos() override { return target_->CPUNanos(); } + + virtual void SleepForMicroseconds(int micros) override { + return target_->SleepForMicroseconds(micros); + } + + Status GetCurrentTime(int64_t* unix_time) override { + return target_->GetCurrentTime(unix_time); + } + + std::string TimeToString(uint64_t time) override { + return target_->TimeToString(time); + } + + Status PrepareOptions(const ConfigOptions& options) override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const override; +#endif // ROCKSDB_LITE + const Customizable* Inner() const override { return target_.get(); } + + protected: + std::shared_ptr target_; +}; + +} // end namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table.h 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,35 +22,91 @@ #include #include -#include "rocksdb/cache.h" +#include "rocksdb/customizable.h" #include "rocksdb/env.h" -#include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { // -- Block-based Table +class Cache; +class FilterPolicy; class FlushBlockPolicyFactory; class PersistentCache; class RandomAccessFile; struct TableReaderOptions; struct TableBuilderOptions; class TableBuilder; +class TableFactory; class TableReader; class WritableFileWriter; +struct ConfigOptions; struct EnvOptions; -struct Options; +// Types of checksums to use for checking integrity of logical blocks within +// files. All checksums currently use 32 bits of checking power (1 in 4B +// chance of failing to detect random corruption). enum ChecksumType : char { kNoChecksum = 0x0, kCRC32c = 0x1, kxxHash = 0x2, kxxHash64 = 0x3, + kXXH3 = 0x4, // Supported since RocksDB 6.27 +}; + +// `PinningTier` is used to specify which tier of block-based tables should +// be affected by a block cache pinning setting (see +// `MetadataCacheOptions` below). +enum class PinningTier { + // For compatibility, this value specifies to fallback to the behavior + // indicated by the deprecated options, + // `pin_l0_filter_and_index_blocks_in_cache` and + // `pin_top_level_index_and_filter`. + kFallback, + + // This tier contains no block-based tables. + kNone, + + // This tier contains block-based tables that may have originated from a + // memtable flush. In particular, it includes tables from L0 that are smaller + // than 1.5 times the current `write_buffer_size`. Note these criteria imply + // it can include intra-L0 compaction outputs and ingested files, as long as + // they are not abnormally large compared to flushed files in L0. + kFlushedAndSimilar, + + // This tier contains all block-based tables. 
+ kAll, +}; + +// `MetadataCacheOptions` contains members indicating the desired caching +// behavior for the different categories of metadata blocks. +struct MetadataCacheOptions { + // The tier of block-based tables whose top-level index into metadata + // partitions will be pinned. Currently indexes and filters may be + // partitioned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise any top-level index into metadata partitions would be + // held in table reader memory, outside the block cache. + PinningTier top_level_index_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose metadata partitions will be pinned. + // Currently indexes and filters may be partitioned. + PinningTier partition_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose unpartitioned metadata blocks will be + // pinned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise the unpartitioned meta-blocks would be held in table + // reader memory, outside the block cache. + PinningTier unpartitioned_pinning = PinningTier::kFallback; }; // For advanced user only struct BlockBasedTableOptions { + static const char* kName() { return "BlockTableOptions"; }; // @flush_block_policy_factory creates the instances of flush block policy. // which provides a configurable way to determine when to flush a block in // the block based tables. If not set, table builder will use the default @@ -65,9 +121,10 @@ // caching as they should now apply to range tombstone and compression // dictionary meta-blocks, in addition to index and filter meta-blocks. // - // Indicating if we'd put index/filter blocks to the block cache. - // If not specified, each "table reader" object will pre-load index/filter - // block during table initialization. + // Whether to put index/filter blocks in the block cache. 
When false, + // each "table reader" object will pre-load index/filter blocks during + // table initialization. Index and filter partition blocks always use + // block cache regardless of this option. bool cache_index_and_filter_blocks = false; // If cache_index_and_filter_blocks is enabled, cache index and filter @@ -76,12 +133,44 @@ // than data blocks. bool cache_index_and_filter_blocks_with_high_priority = true; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating each of the following variables that + // has the default value, `PinningTier::kFallback`: + // + // - `MetadataCacheOptions::partition_pinning` + // - `MetadataCacheOptions::unpartitioned_pinning` + // + // The updated value is chosen as follows: + // + // - `pin_l0_filter_and_index_blocks_in_cache == false` -> + // `PinningTier::kNone` + // - `pin_l0_filter_and_index_blocks_in_cache == true` -> + // `PinningTier::kFlushedAndSimilar` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is // held in the "table reader" object so the blocks are pinned and only // evicted from cache when the table reader is freed. bool pin_l0_filter_and_index_blocks_in_cache = false; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating + // `MetadataCacheOptions::top_level_index_pinning` when it has the + // default value, `PinningTier::kFallback`. + // + // The updated value is chosen as follows: + // + // - `pin_top_level_index_and_filter == false` -> + // `PinningTier::kNone` + // - `pin_top_level_index_and_filter == true` -> + // `PinningTier::kAll` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. 
+ // // If cache_index_and_filter_blocks is true and the below is true, then // the top-level index of partitioned filter and index blocks are stored in // the cache, but a reference is held in the "table reader" object so the @@ -89,6 +178,12 @@ // freed. This is not limited to l0 in LSM tree. bool pin_top_level_index_and_filter = true; + // The desired block cache pinning behavior for the different categories of + // metadata blocks. While pinning can reduce block cache contention, users + // must take care not to pin excessive amounts of data, which risks + // overflowing block cache. + MetadataCacheOptions metadata_cache_options; + // The index type that will be used for this table. enum IndexType : char { // A space efficient index block that is optimized for @@ -100,6 +195,8 @@ kHashSearch = 0x01, // A two-level index implementation. Both levels are binary search indexes. + // Second level index blocks ("partitions") use block cache even when + // cache_index_and_filter_blocks=false. kTwoLevelIndexSearch = 0x02, // Like kBinarySearch, but index also contains first key of each block. @@ -113,11 +210,6 @@ // e.g. when prefix changes. // Makes the index significantly bigger (2x or more), especially when keys // are long. - // - // IO errors are not handled correctly in this mode right now: if an error - // happens when lazily reading a block in value(), value() returns empty - // slice, and you need to call Valid()/status() afterwards. - // TODO(kolmike): Fix it. kBinarySearchWithFirstKey = 0x03, }; @@ -167,7 +259,7 @@ // block size specified here corresponds to uncompressed data. The // actual size of the unit read from disk may be smaller if // compression is enabled. This parameter can be changed dynamically. - size_t block_size = 4 * 1024; + uint64_t block_size = 4 * 1024; // This is used to close a block before it reaches the configured // 'block_size'. 
If the percentage of free space in the current block is less @@ -196,13 +288,68 @@ // separately uint64_t metadata_block_size = 4096; + // If true, a dynamically updating charge to block cache, loosely based + // on the actual memory usage of table building, will occur to account + // the memory, if block cache available. + // + // Charged memory usage includes: + // 1. (new) Bloom Filter and Ribbon Filter construction + // 2. More to come... + // + // Note: + // 1. (new) Bloom Filter and Ribbon Filter construction + // + // If additional temporary memory of Ribbon Filter uses up too much memory + // relative to the avaible space left in the block cache + // at some point (i.e, causing a cache full when strict_capacity_limit = + // true), construction will fall back to (new) Bloom Filter. + // + // Default: false + bool reserve_table_builder_memory = false; + // Note: currently this option requires kTwoLevelIndexSearch to be set as // well. // TODO(myabandeh): remove the note above once the limitation is lifted // Use partitioned full filters for each SST file. This option is - // incompatible with block-based filters. + // incompatible with block-based filters. Filter partition blocks use + // block cache even when cache_index_and_filter_blocks=false. bool partition_filters = false; + // Option to generate Bloom/Ribbon filters that minimize memory + // internal fragmentation. + // + // When false, malloc_usable_size is not available, or format_version < 5, + // filters are generated without regard to internal fragmentation when + // loaded into memory (historical behavior). When true (and + // malloc_usable_size is available and format_version >= 5), then + // filters are generated to "round up" and "round down" their sizes to + // minimize internal fragmentation when loaded into memory, assuming the + // reading DB has the same memory allocation characteristics as the + // generating DB. This option does not break forward or backward + // compatibility. 
+ // + // While individual filters will vary in bits/key and false positive rate + // when setting is true, the implementation attempts to maintain a weighted + // average FP rate for filters consistent with this option set to false. + // + // With Jemalloc for example, this setting is expected to save about 10% of + // the memory footprint and block cache charge of filters, while increasing + // disk usage of filters by about 1-2% due to encoding efficiency losses + // with variance in bits/key. + // + // NOTE: Because some memory counted by block cache might be unmapped pages + // within internal fragmentation, this option can increase observed RSS + // memory usage. With cache_index_and_filter_blocks=true, this option makes + // the block cache better at using space it is allowed. (These issues + // should not arise with partitioned filters.) + // + // NOTE: Do not set to true if you do not trust malloc_usable_size. With + // this option, RocksDB might access an allocated memory object beyond its + // original size if malloc_usable_size says it is safe to do so. While this + // can be considered bad practice, it should not produce undefined behavior + // unless malloc_usable_size is buggy or broken. + bool optimize_filters_for_memory = false; + // Use delta encoding to compress keys in blocks. // ReadOptions::pin_data requires this option to be disabled. // @@ -246,10 +393,9 @@ // Default: 0 (disabled) uint32_t read_amp_bytes_per_bit = 0; - // We currently have five versions: - // 0 -- This version is currently written out by all RocksDB's versions by - // default. Can be read by really old RocksDB's. Doesn't support changing - // checksum (default is CRC32). + // We currently have these versions: + // 0 -- This version can be read by really old RocksDB's. Doesn't support + // changing checksum type (default is CRC32). // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default // checksum, like xxHash. 
It is written by RocksDB when // BlockBasedTableOptions::checksum is something other than kCRC32c. (version @@ -272,7 +418,7 @@ // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned // filters use a generally faster and more accurate Bloom filter // implementation, with a different schema. - uint32_t format_version = 2; + uint32_t format_version = 5; // Store index blocks on disk in compressed format. Changing this option to // false will avoid the overhead of decompression if index blocks are evicted @@ -316,6 +462,55 @@ IndexShorteningMode index_shortening = IndexShorteningMode::kShortenSeparators; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead + // starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size and max_auto_readahead_size can be configured. + // + // Special Value: 0 - If max_auto_readahead_size is set 0 then no implicit + // auto prefetching will be done. If max_auto_readahead_size provided is less + // than 8KB (which is initial readahead size used by rocksdb in case of + // auto-readahead), readahead size will remain same as + // max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch + // the blocks. + // + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{max_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 256 KB (256 * 1024). 
+ size_t max_auto_readahead_size = 256 * 1024; + + // If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and + // filter blocks) which are already in memory into block cache at the time of + // flush. On a flush, the block that is in memory (in memtables) get flushed + // to the device. If using Direct IO, additional IO is incurred to read this + // data back into memory again, which is avoided by enabling this option. This + // further helps if the workload exhibits high temporal locality, where most + // of the reads go to recently written data. This also helps in case of + // Distributed FileSystem. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{prepopulate_block_cache=kFlushOnly;}"}})); + enum class PrepopulateBlockCache : char { + // Disable prepopulate block cache. + kDisable, + // Prepopulate blocks during flush only. + kFlushOnly, + }; + + PrepopulateBlockCache prepopulate_block_cache = + PrepopulateBlockCache::kDisable; }; // Table Properties that are specific to block-based table properties. @@ -361,6 +556,7 @@ const uint32_t kPlainTableVariableLength = 0; struct PlainTableOptions { + static const char* kName() { return "PlainTableOptions"; }; // @user_key_len: plain table has optimization for fix-sized keys, which can // be specified via user_key_len. Alternatively, you can pass // `kPlainTableVariableLength` if your keys have variable @@ -408,7 +604,7 @@ // @store_index_in_file: compute plain table index and bloom filter during // file building and store it in file. When reading - // file, index will be mmaped instead of recomputation. + // file, index will be mapped instead of recomputation. bool store_index_in_file = false; }; @@ -454,6 +650,8 @@ }; struct CuckooTableOptions { + static const char* kName() { return "CuckooTableOptions"; }; + // Determines the utilization of hash tables. Smaller values // result in larger hash tables with fewer collisions. 
double hash_table_ratio = 0.9; @@ -491,18 +689,21 @@ class RandomAccessFileReader; // A base class for table factories. -class TableFactory { +class TableFactory : public Customizable { public: - virtual ~TableFactory() {} + virtual ~TableFactory() override {} - // The type of the table. - // - // The client of this package should switch to a new name whenever - // the table format implementation changes. - // - // Names starting with "rocksdb." are reserved and should not be used - // by any clients of this package. - virtual const char* Name() const = 0; + static const char* kBlockCacheOpts() { return "BlockCache"; }; + static const char* kBlockBasedTableName() { return "BlockBasedTable"; }; + static const char* kPlainTableName() { return "PlainTable"; } + static const char* kCuckooTableName() { return "CuckooTable"; }; + + // Creates and configures a new TableFactory from the input options and id. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& id, + std::shared_ptr* factory); + + static const char* Type() { return "TableFactory"; } // Returns a Table object table that can fetch data from file specified // in parameter file. 
It's the caller's responsibility to make sure @@ -525,7 +726,19 @@ const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, - bool prefetch_index_and_filter_in_cache = true) const = 0; + bool prefetch_index_and_filter_in_cache = true) const { + ReadOptions ro; + return NewTableReader(ro, table_reader_options, std::move(file), file_size, + table_reader, prefetch_index_and_filter_in_cache); + } + + // Overload of the above function that allows the caller to pass in a + // ReadOptions + virtual Status NewTableReader( + const ReadOptions& ro, const TableReaderOptions& table_reader_options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache) const = 0; // Return a table builder to write to a file for this table type. // @@ -547,40 +760,7 @@ // to use in this table. virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const = 0; - - // Sanitizes the specified DB Options and ColumnFamilyOptions. - // - // If the function cannot find a way to sanitize the input DB Options, - // a non-ok Status will be returned. - virtual Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const = 0; - - // Return a string that contains printable format of table configurations. - // RocksDB prints configurations at DB Open(). - virtual std::string GetPrintableTableOptions() const = 0; - - virtual Status GetOptionString(std::string* /*opt_string*/, - const std::string& /*delimiter*/) const { - return Status::NotSupported( - "The table factory doesn't implement GetOptionString()."); - } - - // Returns the raw pointer of the table options that is used by this - // TableFactory, or nullptr if this function is not supported. 
- // Since the return value is a raw pointer, the TableFactory owns the - // pointer and the caller should not delete the pointer. - // - // In certain case, it is desirable to alter the underlying options when the - // TableFactory is not used by any open DB by casting the returned pointer - // to the right class. For instance, if BlockBasedTableFactory is used, - // then the pointer can be casted to BlockBasedTableOptions. - // - // Note that changing the underlying TableFactory options while the - // TableFactory is currently used by any open DB is undefined behavior. - // Developers should use DB::SetOption() instead to dynamically change - // options while the DB is open. - virtual void* GetOptions() { return nullptr; } + WritableFileWriter* file) const = 0; // Return is delete range supported virtual bool IsDeleteRangeSupported() const { return false; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/table_properties.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,12 @@ #pragma once #include + #include +#include #include + +#include "rocksdb/customizable.h" #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -26,10 +30,14 @@ // ++pos) { // ... // } -typedef std::map UserCollectedProperties; +using UserCollectedProperties = std::map; // table properties' human-readable names in the property block. 
struct TablePropertiesNames { + static const std::string kDbId; + static const std::string kDbSessionId; + static const std::string kDbHostId; + static const std::string kOriginalFileNumber; static const std::string kDataSize; static const std::string kIndexSize; static const std::string kIndexPartitions; @@ -41,6 +49,7 @@ static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kNumFilterEntries; static const std::string kDeletedKeys; static const std::string kMergeOperands; static const std::string kNumRangeDeletions; @@ -58,18 +67,23 @@ static const std::string kCreationTime; static const std::string kOldestKeyTime; static const std::string kFileCreationTime; + static const std::string kSlowCompressionEstimatedDataSize; + static const std::string kFastCompressionEstimatedDataSize; }; -extern const std::string kPropertiesBlock; -extern const std::string kCompressionDictBlock; -extern const std::string kRangeDelBlock; - // `TablePropertiesCollector` provides the mechanism for users to collect // their own properties that they are interested in. This class is essentially // a collection of callback functions that will be invoked during table // building. It is constructed with TablePropertiesCollectorFactory. The methods // don't need to be thread-safe, as we will create exactly one -// TablePropertiesCollector object per table and then call it sequentially +// TablePropertiesCollector object per table and then call it sequentially. +// +// Statuses from these callbacks are currently logged when not OK, but +// otherwise ignored by RocksDB. +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. 
class TablePropertiesCollector { public: virtual ~TablePropertiesCollector() {} @@ -96,9 +110,9 @@ } // Called after each new block is cut - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) { // Nothing to do here. Callback registers can override. return; } @@ -122,26 +136,47 @@ // Constructs TablePropertiesCollector. Internals create a new // TablePropertiesCollector for each new table -class TablePropertiesCollectorFactory { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class TablePropertiesCollectorFactory : public Customizable { public: struct Context { uint32_t column_family_id; + // The level at creating the SST file (i.e, table), of which the + // properties are being collected. + int level_at_creation = kUnknownLevelAtCreation; static const uint32_t kUnknownColumnFamily; + static const int kUnknownLevelAtCreation = -1; }; - virtual ~TablePropertiesCollectorFactory() {} + ~TablePropertiesCollectorFactory() override {} + static const char* Type() { return "TablePropertiesCollectorFactory"; } + static Status CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result); + // has to be thread-safe virtual TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) = 0; // The name of the properties collector can be used for debugging purpose. 
- virtual const char* Name() const = 0; + const char* Name() const override = 0; + + // Can be overridden by sub-classes to return the Name, followed by + // configuration info that will // be logged to the info log when the + // DB is opened + virtual std::string ToString() const { return Name(); } }; // TableProperties contains a bunch of read-only properties of its associated // table. struct TableProperties { public: + // the file number at creation time, or 0 for unknown. When known, + // combining with db_session_id must uniquely identify an SST file. + uint64_t orig_file_number = 0; // the total size of all data blocks. uint64_t data_size = 0; // the size of index block. @@ -165,6 +200,8 @@ uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of unique entries (keys or prefixes) added to filters + uint64_t num_filter_entries = 0; // the number of deletions in the table uint64_t num_deletions = 0; // the number of merge operands in the table @@ -187,6 +224,35 @@ uint64_t oldest_key_time = 0; // Actual SST file creation time. 0 means unknown. uint64_t file_creation_time = 0; + // Estimated size of data blocks if compressed using a relatively slower + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t slow_compression_estimated_data_size = 0; + // Estimated size of data blocks if compressed using a relatively faster + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t fast_compression_estimated_data_size = 0; + // Offset of the value of the property "external sst file global seqno" in the + // file if the property exists. + // 0 means not exists. + uint64_t external_sst_file_global_seqno_offset = 0; + + // DB identity + // db_id is an identifier generated the first time the DB is created + // If DB identity is unset or unassigned, `db_id` will be an empty string. 
+ std::string db_id; + + // DB session identity + // db_session_id is an identifier that gets reset every time the DB is opened + // If DB session identity is unset or unassigned, `db_session_id` will be an + // empty string. + std::string db_session_id; + + // Location of the machine hosting the DB instance + // db_host_id identifies the location of the host in some form + // (hostname by default, but can also be any string of the user's choosing). + // It can potentially change whenever the DB is opened + std::string db_host_id; // Name of the column family with which this SST file is associated. // If column family is unknown, `column_family_name` will be an empty string. @@ -222,9 +288,6 @@ UserCollectedProperties user_collected_properties; UserCollectedProperties readable_properties; - // The offset of the value of each property in the file. - std::map properties_offsets; - // convert this object to a human readable form // @prop_delim: delimiter for each property. std::string ToString(const std::string& prop_delim = "; ", @@ -233,6 +296,11 @@ // Aggregate the numerical member variables of the specified // TableProperties. void Add(const TableProperties& tp); + + // Subset of properties that make sense when added together + // between tables. Keys match field names in this class instead + // of using full property names. 
+ std::map GetAggregatablePropertiesAsMap() const; }; // Extra properties diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/thread_status.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,13 +13,15 @@ #pragma once -#include #include +#include #include #include #include #include +#include "rocksdb/rocksdb_namespace.h" + #if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) && \ defined(ROCKSDB_SUPPORT_THREAD_LOCAL) #define ROCKSDB_USING_THREAD_STATUS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_reader_writer.h 2025-05-19 16:14:27.000000000 +0000 @@ -19,8 +19,7 @@ // a time. class TraceWriter { public: - TraceWriter() {} - virtual ~TraceWriter() {} + virtual ~TraceWriter() = default; virtual Status Write(const Slice& data) = 0; virtual Status Close() = 0; @@ -28,21 +27,26 @@ }; // TraceReader allows reading RocksDB traces from any system, one operation at -// a time. A RocksDB Replayer could depend on this to replay opertions. +// a time. A RocksDB Replayer could depend on this to replay operations. class TraceReader { public: - TraceReader() {} - virtual ~TraceReader() {} + virtual ~TraceReader() = default; virtual Status Read(std::string* data) = 0; virtual Status Close() = 0; + + // Seek back to the trace header. Replayer can call this method to restart + // replaying. Note this method may fail if the reader is already closed. 
+ virtual Status Reset() = 0; }; -// Factory methods to read/write traces from/to a file. +// Factory methods to write/read traces to/from a file. +// The implementations may not be thread-safe. Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, const std::string& trace_filename, std::unique_ptr* trace_writer); Status NewFileTraceReader(Env* env, const EnvOptions& env_options, const std::string& trace_filename, std::unique_ptr* trace_reader); + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,247 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class ColumnFamilyHandle; +class DB; + +// Supported trace record types. +enum TraceType : char { + kTraceNone = 0, + kTraceBegin = 1, + kTraceEnd = 2, + // Query level tracing related trace types. + kTraceWrite = 3, + kTraceGet = 4, + kTraceIteratorSeek = 5, + kTraceIteratorSeekForPrev = 6, + // Block cache tracing related trace types. + kBlockTraceIndexBlock = 7, + kBlockTraceFilterBlock = 8, + kBlockTraceDataBlock = 9, + kBlockTraceUncompressionDictBlock = 10, + kBlockTraceRangeDeletionBlock = 11, + // IO tracing related trace type. + kIOTracer = 12, + // Query level tracing related trace type. 
+ kTraceMultiGet = 13, + // All trace types should be added before kTraceMax + kTraceMax, +}; + +class GetQueryTraceRecord; +class IteratorSeekQueryTraceRecord; +class MultiGetQueryTraceRecord; +class TraceRecordResult; +class WriteQueryTraceRecord; + +// Base class for all types of trace records. +class TraceRecord { + public: + explicit TraceRecord(uint64_t timestamp); + + virtual ~TraceRecord() = default; + + // Type of the trace record. + virtual TraceType GetTraceType() const = 0; + + // Timestamp (in microseconds) of this trace. + virtual uint64_t GetTimestamp() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const WriteQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const GetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const IteratorSeekQueryTraceRecord& record, + std::unique_ptr* result) = 0; + + virtual Status Handle(const MultiGetQueryTraceRecord& record, + std::unique_ptr* result) = 0; + }; + + // Accept the handler and report the corresponding result in `result`. + virtual Status Accept(Handler* handler, + std::unique_ptr* result) = 0; + + // Create a handler for the exeution of TraceRecord. + static Handler* NewExecutionHandler( + DB* db, const std::vector& handles); + + private: + uint64_t timestamp_; +}; + +// Base class for all query types of trace records. +class QueryTraceRecord : public TraceRecord { + public: + explicit QueryTraceRecord(uint64_t timestamp); +}; + +// Trace record for DB::Write() operation. +class WriteQueryTraceRecord : public QueryTraceRecord { + public: + WriteQueryTraceRecord(PinnableSlice&& write_batch_rep, uint64_t timestamp); + + WriteQueryTraceRecord(const std::string& write_batch_rep, uint64_t timestamp); + + virtual ~WriteQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceWrite; } + + // rep string for the WriteBatch. 
+ virtual Slice GetWriteBatchRep() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + PinnableSlice rep_; +}; + +// Trace record for DB::Get() operation +class GetQueryTraceRecord : public QueryTraceRecord { + public: + GetQueryTraceRecord(uint32_t column_family_id, PinnableSlice&& key, + uint64_t timestamp); + + GetQueryTraceRecord(uint32_t column_family_id, const std::string& key, + uint64_t timestamp); + + virtual ~GetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceGet; } + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to get. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Base class for all Iterator related operations. +class IteratorQueryTraceRecord : public QueryTraceRecord { + public: + explicit IteratorQueryTraceRecord(uint64_t timestamp); + + IteratorQueryTraceRecord(PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorQueryTraceRecord(const std::string& lower_bound, + const std::string& upper_bound, uint64_t timestamp); + + virtual ~IteratorQueryTraceRecord() override; + + // Get the iterator's lower/upper bound. They may be used in ReadOptions to + // create an Iterator instance. + virtual Slice GetLowerBound() const; + virtual Slice GetUpperBound() const; + + private: + PinnableSlice lower_; + PinnableSlice upper_; +}; + +// Trace record for Iterator::Seek() and Iterator::SeekForPrev() operation. +class IteratorSeekQueryTraceRecord : public IteratorQueryTraceRecord { + public: + // Currently we only support Seek() and SeekForPrev(). 
+ enum SeekType { + kSeek = kTraceIteratorSeek, + kSeekForPrev = kTraceIteratorSeekForPrev + }; + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + PinnableSlice&& key, PinnableSlice&& lower_bound, + PinnableSlice&& upper_bound, uint64_t timestamp); + + IteratorSeekQueryTraceRecord(SeekType seekType, uint32_t column_family_id, + const std::string& key, + const std::string& lower_bound, + const std::string& upper_bound, + uint64_t timestamp); + + virtual ~IteratorSeekQueryTraceRecord() override; + + // Trace type matches the seek type. + TraceType GetTraceType() const override; + + // Type of seek, Seek or SeekForPrev. + virtual SeekType GetSeekType() const; + + // Column family ID. + virtual uint32_t GetColumnFamilyID() const; + + // Key to seek to. + virtual Slice GetKey() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + SeekType type_; + uint32_t cf_id_; + PinnableSlice key_; +}; + +// Trace record for DB::MultiGet() operation. +class MultiGetQueryTraceRecord : public QueryTraceRecord { + public: + MultiGetQueryTraceRecord(std::vector column_family_ids, + std::vector&& keys, + uint64_t timestamp); + + MultiGetQueryTraceRecord(std::vector column_family_ids, + const std::vector& keys, + uint64_t timestamp); + + virtual ~MultiGetQueryTraceRecord() override; + + TraceType GetTraceType() const override { return kTraceMultiGet; } + + // Column familiy IDs. + virtual std::vector GetColumnFamilyIDs() const; + + // Keys to get. 
+ virtual std::vector GetKeys() const; + + Status Accept(Handler* handler, + std::unique_ptr* result) override; + + private: + std::vector cf_ids_; + std::vector keys_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/trace_record_result.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/trace_record.h" + +namespace ROCKSDB_NAMESPACE { + +class IteratorTraceExecutionResult; +class MultiValuesTraceExecutionResult; +class SingleValueTraceExecutionResult; +class StatusOnlyTraceExecutionResult; + +// Base class for the results of all types of trace records. +// Theses classes can be used to report the execution result of +// TraceRecord::Handler::Handle() or TraceRecord::Accept(). +class TraceRecordResult { + public: + explicit TraceRecordResult(TraceType trace_type); + + virtual ~TraceRecordResult() = default; + + // Trace type of the corresponding TraceRecord. 
+ virtual TraceType GetTraceType() const; + + class Handler { + public: + virtual ~Handler() = default; + + virtual Status Handle(const StatusOnlyTraceExecutionResult& result) = 0; + + virtual Status Handle(const SingleValueTraceExecutionResult& result) = 0; + + virtual Status Handle(const MultiValuesTraceExecutionResult& result) = 0; + + virtual Status Handle(const IteratorTraceExecutionResult& result) = 0; + }; + + // Accept the handler. + virtual Status Accept(Handler* handler) = 0; + + private: + TraceType trace_type_; +}; + +// Base class for the results from the trace record execution handler (created +// by TraceRecord::NewExecutionHandler()). +// +// The actual execution status or returned values may be hidden from +// TraceRecord::Handler::Handle and TraceRecord::Accept. For example, a +// GetQueryTraceRecord's execution calls DB::Get() internally. DB::Get() may +// return Status::NotFound() but TraceRecord::Handler::Handle() or +// TraceRecord::Accept() will still return Status::OK(). The actual status from +// DB::Get() and the returned value string may be saved in a +// SingleValueTraceExecutionResult. +class TraceExecutionResult : public TraceRecordResult { + public: + TraceExecutionResult(uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + // Execution start/end timestamps and request latency in microseconds. + virtual uint64_t GetStartTimestamp() const; + virtual uint64_t GetEndTimestamp() const; + inline uint64_t GetLatency() const { + return GetEndTimestamp() - GetStartTimestamp(); + } + + private: + uint64_t ts_start_; + uint64_t ts_end_; +}; + +// Result for operations that only return a single Status. 
+// Example operation: DB::Write() +class StatusOnlyTraceExecutionResult : public TraceExecutionResult { + public: + StatusOnlyTraceExecutionResult(Status status, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~StatusOnlyTraceExecutionResult() override = default; + + // Return value of DB::Write(), etc. + virtual const Status& GetStatus() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; +}; + +// Result for operations that return a Status and a value. +// Example operation: DB::Get() +class SingleValueTraceExecutionResult : public TraceExecutionResult { + public: + SingleValueTraceExecutionResult(Status status, const std::string& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + SingleValueTraceExecutionResult(Status status, std::string&& value, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~SingleValueTraceExecutionResult() override; + + // Return status of DB::Get(). + virtual const Status& GetStatus() const; + + // Value for the searched key. + virtual const std::string& GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + Status status_; + std::string value_; +}; + +// Result for operations that return multiple Status(es) and values as vectors. +// Example operation: DB::MultiGet() +class MultiValuesTraceExecutionResult : public TraceExecutionResult { + public: + MultiValuesTraceExecutionResult(std::vector multi_status, + std::vector values, + uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + virtual ~MultiValuesTraceExecutionResult() override; + + // Returned Status(es) of DB::MultiGet(). + virtual const std::vector& GetMultiStatus() const; + + // Returned values for the searched keys. 
+ virtual const std::vector& GetValues() const; + + virtual Status Accept(Handler* handler) override; + + private: + std::vector multi_status_; + std::vector values_; +}; + +// Result for Iterator operations. +// Example operations: Iterator::Seek(), Iterator::SeekForPrev() +class IteratorTraceExecutionResult : public TraceExecutionResult { + public: + IteratorTraceExecutionResult(bool valid, Status status, PinnableSlice&& key, + PinnableSlice&& value, uint64_t start_timestamp, + uint64_t end_timestamp, TraceType trace_type); + + IteratorTraceExecutionResult(bool valid, Status status, + const std::string& key, const std::string& value, + uint64_t start_timestamp, uint64_t end_timestamp, + TraceType trace_type); + + virtual ~IteratorTraceExecutionResult() override; + + // Return if the Iterator is valid. + virtual bool GetValid() const; + + // Return the status of the Iterator. + virtual const Status& GetStatus() const; + + // Key of the current iterating entry, empty if GetValid() is false. + virtual Slice GetKey() const; + + // Value of the current iterating entry, empty if GetValid() is false. + virtual Slice GetValue() const; + + virtual Status Accept(Handler* handler) override; + + private: + bool valid_; + Status status_; + PinnableSlice key_; + PinnableSlice value_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/transaction_log.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { class LogFile; -typedef std::vector> VectorLogPtr; +using VectorLogPtr = std::vector>; enum WalFileType { /* Indicates that WAL file is in archive directory. 
WAL files are moved from diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/types.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/types.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/types.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/types.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,12 +12,44 @@ // Define all public custom types here. +using ColumnFamilyId = uint32_t; + // Represents a sequence number in a WAL file. -typedef uint64_t SequenceNumber; +using SequenceNumber = uint64_t; const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + +enum class BlobFileCreationReason { + kFlush, + kCompaction, + kRecovery, +}; + +// The types of files RocksDB uses in a DB directory. (Available for +// advanced options.) +enum FileType { + kWalFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + // User-oriented representation of internal key types. +// Ordering of this enum entries should not change. enum EntryType { kEntryPut, kEntryDelete, @@ -25,30 +57,8 @@ kEntryMerge, kEntryRangeDeletion, kEntryBlobIndex, + kEntryDeleteWithTimestamp, kEntryOther, }; -// tuple. -struct FullKey { - Slice user_key; - SequenceNumber sequence; - EntryType type; - - FullKey() : sequence(0) {} // Intentionally left uninitialized (for speed) - FullKey(const Slice& u, const SequenceNumber& seq, EntryType t) - : user_key(u), sequence(seq), type(t) {} - std::string DebugString(bool hex = false) const; - - void clear() { - user_key.clear(); - sequence = 0; - type = EntryType::kEntryPut; - } -}; - -// Parse slice representing internal key to FullKey -// Parsed FullKey is valid for as long as the memory pointed to by -// internal_key is alive. 
-bool ParseFullKey(const Slice& internal_key, FullKey* result); - } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/unique_id.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/table_properties.h" + +namespace ROCKSDB_NAMESPACE { + +// EXPERIMENTAL: This API is subject to change +// +// Computes a stable, universally unique 192-bit (24 binary char) identifier +// for an SST file from TableProperties. This is supported for table (SST) +// files created with RocksDB 6.24 and later. NotSupported will be returned +// for other cases. The first 16 bytes (128 bits) is of sufficient quality +// for almost all applications, and shorter prefixes are usable as a +// hash of the full unique id. +// +// Note: .c_str() is not compatible with binary char strings, so using +// .c_str() on the result will often result in information loss and very +// poor uniqueness probability. +// +// More detail: the first 128 bits are *guaranteed* unique for SST files +// generated in the same process (even different DBs, RocksDB >= 6.26), +// and first 128 bits are guaranteed not "all zeros" (RocksDB >= 6.26) +// so that the "all zeros" value can be used reliably for a null ID. 
+// Assuming one generates many SST files in the lifetime of each process, +// the probability of collision between processes is "better than +// random": if processes generate n SST files on average, we expect to +// generate roughly 2^64 * sqrt(n) files before first collision in the +// first 128 bits. See https://github.com/pdillinger/unique_id +// Using the full 192 bits, we expect to generate roughly 2^96 * sqrt(n) +// files before first collision. +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id); + +// EXPERIMENTAL: This API is subject to change +// +// Converts a binary string (unique id) to hexadecimal, with each 64 bits +// separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B +// Also works on unique id prefix. +std::string UniqueIdToHumanString(const std::string &id); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/universal_compaction.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,12 @@ #pragma once -#include #include +#include #include +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // @@ -36,12 +38,12 @@ // The size amplification is defined as the amount (in percentage) of // additional storage needed to store a single byte of data in the database. // For example, a size amplification of 2% means that a database that - // contains 100 bytes of user-data may occupy upto 102 bytes of + // contains 100 bytes of user-data may occupy up to 102 bytes of // physical storage. By this definition, a fully compacted database has // a size amplification of 0%. 
Rocksdb uses the following heuristic // to calculate size amplification: it assumes that all files excluding // the earliest file contribute to the size amplification. - // Default: 200, which means that a 100 byte database could require upto + // Default: 200, which means that a 100 byte database could require up to // 300 bytes of storage. unsigned int max_size_amplification_percent; @@ -72,6 +74,13 @@ // Default: false bool allow_trivial_move; + // EXPERIMENTAL + // If true, try to limit compaction size under max_compaction_bytes. + // This might cause higher write amplification, but can prevent some + // problem caused by large compactions. + // Default: false + bool incremental; + // Default set of parameters CompactionOptionsUniversal() : size_ratio(1), @@ -80,7 +89,8 @@ max_size_amplification_percent(200), compression_size_percent(-1), stop_style(kCompactionStopStyleTotalSize), - allow_trivial_move(false) {} + allow_trivial_move(false), + incremental(false) {} }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,616 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/io_status.h" +#include "rocksdb/metadata.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The default DB file checksum function name. +constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; +// The default BackupEngine file checksum function name. +constexpr char kBackupFileChecksumFuncName[] = "crc32c"; + +struct BackupEngineOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // share_table_files supports table and blob files. + // + // If share_table_files == true, the backup directory will share table and + // blob files among backups, to save space among backups of the same DB and to + // enable incremental backups by only copying new files. + // If share_table_files == false, each backup will be on its own and will not + // share any data with other backups. + // + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup and + // restore even on a machine crash/reboot. Backup and restore processes are + // slower with sync enabled. If sync == false, we can only guarantee that + // other previously synced backups and restores are not modified while + // creating a new one. 
+ // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "backup_rate_limiter" + // Default: 0 + uint64_t backup_rate_limit; + + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr backup_rate_limiter{nullptr}; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // This limit only applies to writes. To also limit reads, + // a rate limiter able to also limit reads (e.g, its mode = kAllIo) + // have to be passed in through the option "restore_rate_limiter" + // Default: 0 + uint64_t restore_rate_limit; + + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr restore_rate_limiter{nullptr}; + + // share_files_with_checksum supports table and blob files. + // + // Only used if share_table_files is set to true. Setting to false is + // DEPRECATED and potentially dangerous because in that case BackupEngine + // can lose data if backing up databases with distinct or divergent + // history, for example if restoring from a backup other than the latest, + // writing to the DB, and creating another backup. 
Setting to true (default) + // prevents these issues by ensuring that different table files (SSTs) and + // blob files with the same number are treated as distinct. See + // share_files_with_checksum_naming and ShareFilesNaming. + // + // Default: true + bool share_files_with_checksum; + + // Up to this many background threads will copy files for CreateNewBackup() + // and RestoreDBFromBackup() + // Default: 1 + int max_background_operations; + + // During backup user can get callback every time next + // callback_trigger_interval_size bytes being copied. + // Default: 4194304 + uint64_t callback_trigger_interval_size; + + // For BackupEngineReadOnly, Open() will open at most this many of the + // latest non-corrupted backups. + // + // Note: this setting is ignored (behaves like INT_MAX) for any kind of + // writable BackupEngine because it would inhibit accounting for shared + // files for proper backup deletion, including purging any incompletely + // created backups on creation of a new backup. + // + // Default: INT_MAX + int max_valid_backups_to_open; + + // ShareFilesNaming describes possible naming schemes for backup + // table and blob file names when they are stored in the + // shared_checksum directory (i.e., both share_table_files and + // share_files_with_checksum are true). + enum ShareFilesNaming : uint32_t { + // Backup blob filenames are __.blob and + // backup SST filenames are __.sst + // where is an unsigned decimal integer. This is the + // original/legacy naming scheme for share_files_with_checksum, + // with two problems: + // * At massive scale, collisions on this triple with different file + // contents is plausible. + // * Determining the name to use requires computing the checksum, + // so generally requires reading the whole file even if the file + // is already backed up. + // + // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** + kLegacyCrc32cAndFileSize = 1U, + + // Backup SST filenames are _s.sst. 
This + // pair of values should be very strongly unique for a given SST file + // and easily determined before computing a checksum. The 's' indicates + // the value is a DB session id, not a checksum. + // + // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. + // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize + // will be used instead, matching the names assigned by RocksDB versions + // not supporting the newer naming scheme. + // * See also flags below. + kUseDbSessionId = 2U, + + kMaskNoNamingFlags = 0xffffU, + + // If not already part of the naming scheme, insert + // _ + // before .sst and .blob in the name. In case of user code actually parsing + // the last _ before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. + // + // We do not consider SST and blob file sizes to have sufficient entropy to + // contribute significantly to naming uniqueness. + kFlagIncludeFileSize = 1U << 31, + + kMaskNamingFlags = ~kMaskNoNamingFlags, + }; + + // Naming option for share_files_with_checksum table and blob files. See + // ShareFilesNaming for details. + // + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table and + // blob files naming schemes. + // + // However, modifying this option and saving more backups to the same + // directory can lead to the same file getting saved again to that + // directory, under the new shared name in addition to the old shared + // name. 
+ // + // Default: kUseDbSessionId | kFlagIncludeFileSize + // + // Note: This option comes into effect only if both share_files_with_checksum + // and share_table_files are true. + ShareFilesNaming share_files_with_checksum_naming; + + void Dump(Logger* logger) const; + + explicit BackupEngineOptions( + const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, + bool _sync = true, bool _destroy_old_data = false, + bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, + uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, + int _max_valid_backups_to_open = INT_MAX, + ShareFilesNaming _share_files_with_checksum_naming = + static_cast(kUseDbSessionId | kFlagIncludeFileSize)) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit), + share_files_with_checksum(true), + max_background_operations(_max_background_operations), + callback_trigger_interval_size(_callback_trigger_interval_size), + max_valid_backups_to_open(_max_valid_backups_to_open), + share_files_with_checksum_naming(_share_files_with_checksum_naming) { + assert(share_table_files || !share_files_with_checksum); + assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0); + } +}; + +inline BackupEngineOptions::ShareFilesNaming operator&( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert(r == BackupEngineOptions::kMaskNoNamingFlags || + (r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l & r); +} + +inline BackupEngineOptions::ShareFilesNaming operator|( + 
BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l | r); +} + +struct CreateBackupOptions { + // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. + bool flush_before_backup = false; + + // Callback for reporting progress, based on callback_trigger_interval_size. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + std::function progress_callback = []() {}; + + // If false, background_thread_cpu_priority is ignored. + // Otherwise, the cpu priority can be decreased, + // if you try to increase the priority, the priority will not change. + // The initial priority of the threads is CpuPriority::kNormal, + // so you can decrease to priorities lower than kNormal. + bool decrease_background_thread_cpu_priority = false; + CpuPriority background_thread_cpu_priority = CpuPriority::kNormal; +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupEngineOptions::backup_log_files = false for + // persisting in-memory databases. 
+ // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +using BackupID = uint32_t; + +using BackupFileInfo = FileStorageInfo; + +struct BackupInfo { + BackupID backup_id = 0U; + // Creation time, according to GetCurrentTime + int64_t timestamp = 0; + + // Total size in bytes (based on file payloads, not including filesystem + // overheads or backup meta file) + uint64_t size = 0U; + + // Number of backed up files, some of which might be shared with other + // backups. Does not include backup meta file. + uint32_t number_files = 0U; + + // Backup API user metadata + std::string app_metadata; + + // Backup file details, if requested with include_file_details=true + std::vector file_details; + + // DB "name" (a directory in the backup_env) for opening this backup as a + // read-only DB. This should also be used as the DBOptions::wal_dir, such + // as by default setting wal_dir="". See also env_for_open. + // This field is only set if include_file_details=true + std::string name_for_open; + + // An Env(+FileSystem) for opening this backup as a read-only DB, with + // DB::OpenForReadOnly or similar. This field is only set if + // include_file_details=true. (The FileSystem in this Env takes care + // of making shared backup files openable from the `name_for_open` DB + // directory.) See also name_for_open. + // + // This Env might or might not be shared with other backups. To work + // around DBOptions::env being a raw pointer, this is a shared_ptr so + // that keeping either this BackupInfo, the BackupEngine, or a copy of + // this shared_ptr alive is sufficient to keep the Env alive for use by + // a read-only DB. 
+ std::shared_ptr env_for_open; + + BackupInfo() {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files, const std::string& _app_metadata) + : backup_id(_backup_id), + timestamp(_timestamp), + size(_size), + number_files(_number_files), + app_metadata(_app_metadata) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; +}; + +// Read-only functions of a BackupEngine. (Restore writes to another directory +// not the backup directory.) See BackupEngine comments for details on +// safe concurrent operations. +class BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnlyBase() {} + + // Returns info about the latest good backup in backup_info, or NotFound + // no good backup exists. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetLatestBackupInfo( + BackupInfo* backup_info, bool include_file_details = false) const = 0; + + // Returns info about a specific backup in backup_info, or NotFound + // or Corruption status if the requested backup id does not exist or is + // known corrupt. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. 
+ virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const = 0; + + // Returns info about non-corrupt backups in backup_infos. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual void GetBackupInfo(std::vector* backup_infos, + bool include_file_details = false) const = 0; + + // Returns info about corrupt backups in corrupt_backups. + // WARNING: Any write to the BackupEngine could trigger automatic + // GarbageCollect(), which could delete files that would be needed to + // manually recover a corrupt backup or to preserve an unrecognized (e.g. + // incompatible future version) backup. + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) const = 0; + + // Restore to specified db_dir and wal_dir from backup_id. + virtual IOStatus RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual IOStatus RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); + } + + // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id + virtual IOStatus RestoreDBFromLatestBackup( + const RestoreOptions& options, const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual IOStatus RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromLatestBackup(options, db_dir, wal_dir); + } + + // If verify_with_checksum is true, this function + // inspects the current checksums and file sizes of backup files to see if + // they match our expectation. 
+ // + // If verify_with_checksum is false, this function + // checks that each file exists and that the size of the file matches our + // expectation. It does not check file checksum. + // + // If this BackupEngine created the backup, it compares the files' current + // sizes (and current checksum) against the number of bytes written to + // them (and the checksum calculated) during creation. + // Otherwise, it compares the files' current sizes (and checksums) against + // their sizes (and checksums) when the BackupEngine was opened. + // + // Returns Status::OK() if all checks are good + virtual IOStatus VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const = 0; +}; + +// Append-only functions of a BackupEngine. See BackupEngine comment for +// details on distinction between Append and Write operations and safe +// concurrent operations. +class BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngineAppendOnlyBase() {} + + // same as CreateNewBackup, but stores extra application metadata. + virtual IOStatus CreateNewBackupWithMetadata( + const CreateBackupOptions& options, DB* db, + const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0; + + // keep here for backward compatibility. + virtual IOStatus CreateNewBackupWithMetadata( + DB* db, const std::string& app_metadata, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackupWithMetadata(options, db, app_metadata); + } + + // Captures the state of the database by creating a new (latest) backup. + // On success (OK status), the BackupID of the new backup is saved to + // *new_backup_id when not nullptr. + // NOTE: db_paths and cf_paths are not supported for creating backups, + // and NotSupported will be returned when the DB (without WALs) uses more + // than one directory. 
+ virtual IOStatus CreateNewBackup(const CreateBackupOptions& options, DB* db, + BackupID* new_backup_id = nullptr) { + return CreateNewBackupWithMetadata(options, db, "", new_backup_id); + } + + // keep here for backward compatibility. + virtual IOStatus CreateNewBackup( + DB* db, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackup(options, db); + } + + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediately, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. + virtual void StopBackup() = 0; + + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). This does not delete corrupt backups but can delete files that + // would be needed to manually recover a corrupt backup or to preserve an + // unrecognized (e.g. incompatible future version) backup. + // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. + // NOTE2: For concurrency and interference purposes (see BackupEngine + // comment), GarbageCollect (GC) is like other Append operations, even + // though it seems different. Although GC can delete physical data, it does + // not delete any logical data read by Read operations. 
GC can interfere + // with Append or Write operations in another BackupEngine on the same + // backup_dir, because temporary files will be treated as obsolete and + // deleted. + virtual IOStatus GarbageCollect() = 0; +}; + +// A backup engine for organizing and managing backups. +// This class is not user-extensible. +// +// This class declaration adds "Write" operations in addition to the +// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase. +// +// # Concurrency between threads on the same BackupEngine* object +// +// As of version 6.20, BackupEngine* operations are generally thread-safe, +// using a read-write lock, though single-thread operation is still +// recommended to avoid TOCTOU bugs. Specifically, particular kinds of +// concurrent operations behave like this: +// +// op1\op2| Read | Append | Write +// -------|-------|--------|-------- +// Read | conc | block | block +// Append | block | block | block +// Write | block | block | block +// +// conc = operations safely proceed concurrently +// block = one of the operations safely blocks until the other completes. +// There is generally no guarantee as to which completes first. +// +// StopBackup is the only operation that affects an ongoing operation. 
+// +// # Interleaving operations between BackupEngine* objects open on the +// same backup_dir +// +// It is recommended only to have one BackupEngine* object open for a given +// backup_dir, but it is possible to mix / interleave some operations +// (regardless of whether they are concurrent) with these caveats: +// +// op1\op2| Open | Read | Append | Write +// -------|--------|--------|--------|-------- +// Open | conc | conc | atomic | unspec +// Read | conc | conc | old | unspec +// Append | atomic | old | unspec | unspec +// Write | unspec | unspec | unspec | unspec +// +// Special case: Open with destroy_old_data=true is really a Write +// +// conc = operations safely proceed, concurrently when applicable +// atomic = operations are effectively atomic; if a concurrent Append +// operation has not completed at some key point during Open, the +// opened BackupEngine* will never see the result of the Append op. +// old = Read operations do not include any state changes from other +// BackupEngine* objects; they return the state at their Open time. +// unspec = Behavior is unspecified, including possibly trashing the +// backup_dir, but is "memory safe" (no C++ undefined behavior) +// +class BackupEngine : public BackupEngineReadOnlyBase, + public BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngine() {} + + // BackupEngineOptions have to be the same as the ones used in previous + // BackupEngines for the same backup directory. + static IOStatus Open(const BackupEngineOptions& options, Env* db_env, + BackupEngine** backup_engine_ptr); + + // keep for backward compatibility. + static IOStatus Open(Env* db_env, const BackupEngineOptions& options, + BackupEngine** backup_engine_ptr) { + return BackupEngine::Open(options, db_env, backup_engine_ptr); + } + + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. + virtual IOStatus PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + + // Deletes a specific backup. 
If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. + virtual IOStatus DeleteBackup(BackupID backup_id) = 0; +}; + +// A variant of BackupEngine that only allows "Read" operations. See +// BackupEngine comment for details. This class is not user-extensible. +class BackupEngineReadOnly : public BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnly() {} + + static IOStatus Open(const BackupEngineOptions& options, Env* db_env, + BackupEngineReadOnly** backup_engine_ptr); + // keep for backward compatibility. + static IOStatus Open(Env* db_env, const BackupEngineOptions& options, + BackupEngineReadOnly** backup_engine_ptr) { + return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr); + } +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/backupable_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -1,341 +1,26 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +// This is a DEPRECATED header for API backward compatibility. Please +// use backup_engine.h. #pragma once #ifndef ROCKSDB_LITE +// A legacy unnecessary include #include -#include -#include -#include -#include -#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/backup_engine.h" -#include "rocksdb/env.h" -#include "rocksdb/status.h" +// A legacy unnecessary include +#include "rocksdb/utilities/stackable_db.h" namespace ROCKSDB_NAMESPACE { -struct BackupableDBOptions { - // Where to keep the backup files. Has to be different than dbname_ - // Best to set this to dbname_ + "/backups" - // Required - std::string backup_dir; - - // Backup Env object. It will be used for backup file I/O. If it's - // nullptr, backups will be written out using DBs Env. If it's - // non-nullptr, backup's I/O will be performed using this object. - // If you want to have backups on HDFS, use HDFS Env here! - // Default: nullptr - Env* backup_env; - - // If share_table_files == true, backup will assume that table files with - // same name have the same contents. This enables incremental backups and - // avoids unnecessary data copies. - // If share_table_files == false, each backup will be on its own and will - // not share any data with other backups. - // default: true - bool share_table_files; - - // Backup info and error messages will be written to info_log - // if non-nullptr. - // Default: nullptr - Logger* info_log; - - // If sync == true, we can guarantee you'll get consistent backup even - // on a machine crash/reboot. Backup process is slower with sync enabled. - // If sync == false, we don't guarantee anything on machine reboot. However, - // chances are some of the backups are consistent. - // Default: true - bool sync; - - // If true, it will delete whatever backups there are already - // Default: false - bool destroy_old_data; - - // If false, we won't backup log files. 
This option can be useful for backing - // up in-memory databases where log file are persisted, but table files are in - // memory. - // Default: true - bool backup_log_files; - - // Max bytes that can be transferred in a second during backup. - // If 0, go as fast as you can - // Default: 0 - uint64_t backup_rate_limit; - - // Backup rate limiter. Used to control transfer speed for backup. If this is - // not null, backup_rate_limit is ignored. - // Default: nullptr - std::shared_ptr backup_rate_limiter{nullptr}; - - // Max bytes that can be transferred in a second during restore. - // If 0, go as fast as you can - // Default: 0 - uint64_t restore_rate_limit; - - // Restore rate limiter. Used to control transfer speed during restore. If - // this is not null, restore_rate_limit is ignored. - // Default: nullptr - std::shared_ptr restore_rate_limiter{nullptr}; - - // Only used if share_table_files is set to true. If true, will consider that - // backups can come from different databases, hence a sst is not uniquely - // identifed by its name, but by the triple (file name, crc32, file length) - // Default: false - // Note: this is an experimental option, and you'll need to set it manually - // *turn it on only if you know what you're doing* - bool share_files_with_checksum; - - // Up to this many background threads will copy files for CreateNewBackup() - // and RestoreDBFromBackup() - // Default: 1 - int max_background_operations; - - // During backup user can get callback every time next - // callback_trigger_interval_size bytes being copied. - // Default: 4194304 - uint64_t callback_trigger_interval_size; - - // For BackupEngineReadOnly, Open() will open at most this many of the - // latest non-corrupted backups. 
- // - // Note: this setting is ignored (behaves like INT_MAX) for any kind of - // writable BackupEngine because it would inhibit accounting for shared - // files for proper backup deletion, including purging any incompletely - // created backups on creation of a new backup. - // - // Default: INT_MAX - int max_valid_backups_to_open; - - void Dump(Logger* logger) const; - - explicit BackupableDBOptions( - const std::string& _backup_dir, Env* _backup_env = nullptr, - bool _share_table_files = true, Logger* _info_log = nullptr, - bool _sync = true, bool _destroy_old_data = false, - bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, - uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, - uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, - int _max_valid_backups_to_open = INT_MAX) - : backup_dir(_backup_dir), - backup_env(_backup_env), - share_table_files(_share_table_files), - info_log(_info_log), - sync(_sync), - destroy_old_data(_destroy_old_data), - backup_log_files(_backup_log_files), - backup_rate_limit(_backup_rate_limit), - restore_rate_limit(_restore_rate_limit), - share_files_with_checksum(false), - max_background_operations(_max_background_operations), - callback_trigger_interval_size(_callback_trigger_interval_size), - max_valid_backups_to_open(_max_valid_backups_to_open) { - assert(share_table_files || !share_files_with_checksum); - } -}; - -struct RestoreOptions { - // If true, restore won't overwrite the existing log files in wal_dir. It will - // also move all log files from archive directory to wal_dir. Use this option - // in combination with BackupableDBOptions::backup_log_files = false for - // persisting in-memory databases. 
- // Default: false - bool keep_log_files; - - explicit RestoreOptions(bool _keep_log_files = false) - : keep_log_files(_keep_log_files) {} -}; - -typedef uint32_t BackupID; - -struct BackupInfo { - BackupID backup_id; - int64_t timestamp; - uint64_t size; - - uint32_t number_files; - std::string app_metadata; - - BackupInfo() {} - - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, - uint32_t _number_files, const std::string& _app_metadata) - : backup_id(_backup_id), - timestamp(_timestamp), - size(_size), - number_files(_number_files), - app_metadata(_app_metadata) {} -}; - -class BackupStatistics { - public: - BackupStatistics() { - number_success_backup = 0; - number_fail_backup = 0; - } - - BackupStatistics(uint32_t _number_success_backup, - uint32_t _number_fail_backup) - : number_success_backup(_number_success_backup), - number_fail_backup(_number_fail_backup) {} - - ~BackupStatistics() {} - - void IncrementNumberSuccessBackup(); - void IncrementNumberFailBackup(); - - uint32_t GetNumberSuccessBackup() const; - uint32_t GetNumberFailBackup() const; - - std::string ToString() const; - - private: - uint32_t number_success_backup; - uint32_t number_fail_backup; -}; - -// A backup engine for accessing information about backups and restoring from -// them. -class BackupEngineReadOnly { - public: - virtual ~BackupEngineReadOnly() {} - - static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngineReadOnly** backup_engine_ptr); - - // Returns info about backups in backup_info - // You can GetBackupInfo safely, even with other BackupEngine performing - // backups on the same directory - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // Restoring DB from backup is NOT safe when there is another BackupEngine - // running that might call DeleteBackup() or PurgeOldBackups(). 
It is caller's - // responsibility to synchronize the operation, i.e. don't delete the backup - // when you're restoring from it - // See also the corresponding doc in BackupEngine - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // See the corresponding doc in BackupEngine - virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // checks that each file exists and that the size of the file matches our - // expectations. it does not check file checksum. - // - // If this BackupEngine created the backup, it compares the files' current - // sizes against the number of bytes written to them during creation. - // Otherwise, it compares the files' current sizes against their sizes when - // the BackupEngine was opened. - // - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id) = 0; -}; - -// A backup engine for creating new backups. -class BackupEngine { - public: - virtual ~BackupEngine() {} - - // BackupableDBOptions have to be the same as the ones used in previous - // BackupEngines for the same backup directory. - static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngine** backup_engine_ptr); - - // same as CreateNewBackup, but stores extra application metadata - // Flush will always trigger if 2PC is enabled. - // If write-ahead logs are disabled, set flush_before_backup=true to - // avoid losing unflushed key/value pairs from the memtable. - virtual Status CreateNewBackupWithMetadata( - DB* db, const std::string& app_metadata, bool flush_before_backup = false, - std::function progress_callback = []() {}) = 0; - - // Captures the state of the database in the latest backup - // NOT a thread safe call - // Flush will always trigger if 2PC is enabled. 
- // If write-ahead logs are disabled, set flush_before_backup=true to - // avoid losing unflushed key/value pairs from the memtable. - virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false, - std::function progress_callback = - []() {}) { - return CreateNewBackupWithMetadata(db, "", flush_before_backup, - progress_callback); - } - - // Deletes old backups, keeping latest num_backups_to_keep alive. - // See also DeleteBackup. - virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; - - // Deletes a specific backup. If this operation (or PurgeOldBackups) - // is not completed due to crash, power failure, etc. the state - // will be cleaned up the next time you call DeleteBackup, - // PurgeOldBackups, or GarbageCollect. - virtual Status DeleteBackup(BackupID backup_id) = 0; - - // Call this from another thread if you want to stop the backup - // that is currently happening. It will return immediatelly, will - // not wait for the backup to stop. - // The backup will stop ASAP and the call to CreateNewBackup will - // return Status::Incomplete(). It will not clean up after itself, but - // the state will remain consistent. The state will be cleaned up the - // next time you call CreateNewBackup or GarbageCollect. - virtual void StopBackup() = 0; - - // Returns info about backups in backup_info - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // restore from backup with backup_id - // IMPORTANT -- if options_.share_table_files == true, - // options_.share_files_with_checksum == false, you restore DB from some - // backup that is not the latest, and you start creating new backups from the - // new DB, they will probably fail. - // - // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. 
- // If you add new data to the DB and try creating a new backup now, the - // database will diverge from backups 4 and 5 and the new backup will fail. - // If you want to create new backup, you will first have to delete backups 4 - // and 5. - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // restore from the latest backup - virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& restore_options = RestoreOptions()) = 0; - - // checks that each file exists and that the size of the file matches our - // expectations. it does not check file checksum. - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id) = 0; - - // Will delete any files left over from incomplete creation or deletion of - // a backup. This is not normally needed as those operations also clean up - // after prior incomplete calls to the same kind of operation (create or - // delete). - // NOTE: This is not designed to delete arbitrary files added to the backup - // directory outside of BackupEngine, and clean-up is always subject to - // permissions on and availability of the underlying filesystem. - virtual Status GarbageCollect() = 0; -}; +using BackupableDBOptions = BackupEngineOptions; } // namespace ROCKSDB_NAMESPACE + #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/cache_dump_load.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,142 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/io_status.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { + +// The classes and functions in this header file is used for dumping out the +// blocks in a block cache, storing or transfering the blocks to another +// destination host, and load these blocks to the secondary cache at destination +// host. +// NOTE that: The classes, functions, and data structures are EXPERIMENTAL! They +// my be changed in the future when the development continues. + +// The major and minor version number of the data format to be stored/trandfered +// via CacheDumpWriter and read out via CacheDumpReader +static const int kCacheDumpMajorVersion = 0; +static const int kCacheDumpMinorVersion = 1; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is an abstract class to write or transfer the data that is created by +// CacheDumper. We pack one block with its block type, dump time, block key in +// the block cache, block len, block crc32c checksum and block itself as a unit +// and it is stored via WritePacket. Before we call WritePacket, we must call +// WriteMetadata once, which stores the sequence number, block unit checksum, +// and block unit size. +// We provide file based CacheDumpWriter to store the metadata and its package +// sequentially in a file as the defualt implementation. Users can implement +// their own CacheDumpWriter to store/transfer the data. For example, user can +// create a subclass which transfer the metadata and package on the fly. 
+class CacheDumpWriter { + public: + virtual ~CacheDumpWriter() = default; + + // Called ONCE before the calls to WritePacket + virtual IOStatus WriteMetadata(const Slice& metadata) = 0; + virtual IOStatus WritePacket(const Slice& data) = 0; + virtual IOStatus Close() = 0; +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is an abstract class to read or receive the data that is stored +// or transfered by CacheDumpWriter. Note that, ReadMetadata must be called +// once before we call a ReadPacket. +class CacheDumpReader { + public: + virtual ~CacheDumpReader() = default; + // Called ONCE before the calls to ReadPacket + virtual IOStatus ReadMetadata(std::string* metadata) = 0; + // Sets data to empty string on EOF + virtual IOStatus ReadPacket(std::string* data) = 0; + // (Close not needed) +}; + +// CacheDumpOptions is the option for CacheDumper and CacheDumpedLoader. Any +// dump or load process related control variables can be added here. +struct CacheDumpOptions { + SystemClock* clock; +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This the class to dump out the block in the block cache, store/transfer them +// via CacheDumpWriter. In order to dump out the blocks belonging to a certain +// DB or a list of DB (block cache can be shared by many DB), user needs to call +// SetDumpFilter to specify a list of DB to filter out the blocks that do not +// belong to those DB. +// A typical use case is: when we migrate a DB instance from host A to host B. +// We need to reopen the DB at host B after all the files are copied to host B. +// At this moment, the block cache at host B does not have any block from this +// migrated DB. Therefore, the read performance can be low due to cache warm up. +// By using CacheDumper before we shut down the DB at host A and using +// CacheDumpedLoader at host B before we reopen the DB, we can warmup the cache +// ahead. 
This function can be used in other use cases also. +class CacheDumper { + public: + virtual ~CacheDumper() = default; + // Only dump the blocks in the block cache that belong to the DBs in this list + virtual Status SetDumpFilter(std::vector db_list) { + (void)db_list; + return Status::NotSupported("SetDumpFilter is not supported"); + } + // The main function to dump out all the blocks that satisfy the filter + // condition from block cache to a certain CacheDumpWriter in one shot. This + // process may take some time. + virtual IOStatus DumpCacheEntriesToWriter() { + return IOStatus::NotSupported("DumpCacheEntriesToWriter is not supported"); + } +}; + +// NOTE that: this class is EXPERIMENTAL! May be changed in the future! +// This is the class to load the dumped blocks to the destination cache. For now +// we only load the blocks to the SecondaryCache. In the future, we may plan to +// support loading to the block cache. +class CacheDumpedLoader { + public: + virtual ~CacheDumpedLoader() = default; + virtual IOStatus RestoreCacheEntriesToSecondaryCache() { + return IOStatus::NotSupported( + "RestoreCacheEntriesToSecondaryCache is not supported"); + } +}; + +// Get the writer which stores all the metadata and data sequentially to a file +IOStatus NewToFileCacheDumpWriter(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* writer); + +// Get the reader which read out the metadata and data sequentially from a file +IOStatus NewFromFileCacheDumpReader(const std::shared_ptr& fs, + const FileOptions& file_opts, + const std::string& file_name, + std::unique_ptr* reader); + +// Get the default cache dumper +Status NewDefaultCacheDumper(const CacheDumpOptions& dump_options, + const std::shared_ptr& cache, + std::unique_ptr&& writer, + std::unique_ptr* cache_dumper); + +// Get the default cache dump loader +Status NewDefaultCacheDumpedLoader( + const CacheDumpOptions& dump_options, + const BlockBasedTableOptions& 
toptions, + const std::shared_ptr& secondary_cache, + std::unique_ptr&& reader, + std::unique_ptr* cache_dump_loader); + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/checkpoint.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,21 +24,28 @@ // Creates a Checkpoint object to be used for creating openable snapshots static Status Create(DB* db, Checkpoint** checkpoint_ptr); - // Builds an openable snapshot of RocksDB on the same disk, which - // accepts an output directory on the same disk, and under the directory - // (1) hard-linked SST files pointing to existing live SST files - // SST files will be copied if output directory is on a different filesystem - // (2) a copied manifest files and other files - // The directory should not already exist and will be created by this API. - // The directory will be an absolute path + // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an + // absolute path. The specified directory should not exist, since it will be + // created by the API. + // When a checkpoint is created, + // (1) SST and blob files are hard linked if the output directory is on the + // same filesystem as the database, and copied otherwise. + // (2) other required files (like MANIFEST) are always copied. // log_size_for_flush: if the total log file size is equal or larger than // this value, then a flush is triggered for all the column families. The // default value is 0, which means flush is always triggered. If you move // away from the default, the checkpoint may not contain up-to-date data // if WAL writing is not always enabled. // Flush will always trigger if it is 2PC. 
+ // sequence_number_ptr: if it is not nullptr, the value it points to will be + // set to a sequence number guaranteed to be part of the DB, not necessarily + // the latest. The default value of this parameter is nullptr. + // NOTE: db_paths and cf_paths are not supported for creating checkpoints + // and NotSupported will be returned when the DB (without WALs) uses more + // than one directory. virtual Status CreateCheckpoint(const std::string& checkpoint_dir, - uint64_t log_size_for_flush = 0); + uint64_t log_size_for_flush = 0, + uint64_t* sequence_number_ptr = nullptr); // Exports all live SST files of a specified Column Family onto export_dir, // returning SST files information in metadata. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/customizable_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,368 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include +#include + +#include "options/configurable_helper.h" +#include "rocksdb/convenience.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" + +namespace ROCKSDB_NAMESPACE { +// The FactoryFunc functions are used to create a new customizable object +// without going through the ObjectRegistry. This methodology is especially +// useful in LITE mode, where there is no ObjectRegistry. The methods take +// in an ID of the object to create and a pointer to store the created object. 
+// If the factory successfully recognized the input ID, the method should return +// success; otherwise false should be returned. On success, the object +// parameter contains the new object. +template +using SharedFactoryFunc = + std::function*)>; + +template +using UniqueFactoryFunc = + std::function*)>; + +template +using StaticFactoryFunc = std::function; + +// Creates a new shared customizable instance object based on the +// input parameters using the object registry. +// +// The id parameter specifies the instance class of the object to create. +// The opt_map parameter specifies the configuration of the new instance. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. 
+template +static Status NewSharedObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::shared_ptr* result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewSharedObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + result->reset(); + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new managed customizable instance object based on the +// input parameters using the object registry. Unlike "shared" objects, +// managed objects are limited to a single instance per ID. +// +// The id parameter specifies the instance class of the object to create. +// If an object with this id exists in the registry, the existing object +// will be returned. If the object does not exist, a new one will be created. +// +// The opt_map parameter specifies the configuration of the new instance. +// If the object already exists, the existing object is returned "as is" and +// this parameter is ignored. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the object. 
This string +// will be used by the object registry to locate the appropriate object to +// create or return. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The managed instance. +template +static Status NewManagedObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::shared_ptr* result) { + Status status; + if (!id.empty()) { +#ifndef ROCKSDB_LITE + status = config_options.registry->GetOrCreateManagedObject( + id, result, [config_options, opt_map](T* object) { + return object->ConfigureFromMap(config_options, opt_map); + }); +#else + (void)result; + (void)opt_map; + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + return Status::OK(); + } + } else { + status = Status::NotSupported("Cannot reset object "); + } + return status; +} + +// Creates a new shared Customizable object based on the input parameters. +// This method parses the input value to determine the type of instance to +// create. If there is an existing instance (in result) and it is the same ID +// as the object being created, the existing configuration is stored and used as +// the default for the new object. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. Id name-value +// pairs are specified, there should be an "id=value" pairing or an error may +// result. +// +// The config_options parameter controls the process and how errors are +// returned. 
If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadSharedObject(const ConfigOptions& config_options, + const std::string& value, + const SharedFactoryFunc& func, + std::shared_ptr* result) { + std::string id; + std::unordered_map opt_map; + + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewSharedObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } +} + +// Creates a new shared Customizable object based on the input parameters. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. Id name-value +// pairs are specified, there should be an "id=value" pairing or an error may +// result. +// +// The "id" field from the value (either the whole field or "id=XX") is used +// to determine the type/id of the object to return. 
For a given id, there +// the same instance of the object will be returned from this method (as opposed +// to LoadSharedObject which would create different objects for the same id. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions) +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadManagedObject(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, nullptr, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (value.empty()) { // No Id and no options. Clear the object + *result = nullptr; + return Status::OK(); + } else { + return NewManagedObject(config_options, id, opt_map, result); + } +} + +// Creates a new unique pointer customizable instance object based on the +// input parameters using the object registry. +// @see NewSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. 
+// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. +template +static Status NewUniqueObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, + std::unique_ptr* result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewUniqueObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + result->reset(); + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new unique customizable instance object based on the input +// parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. 
+template +static Status LoadUniqueObject(const ConfigOptions& config_options, + const std::string& value, + const UniqueFactoryFunc& func, + std::unique_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewUniqueObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, result->get(), + opt_map); + } +} + +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters using the object registry. +// @see NewSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param id The identifier of the new object being created. This string +// will be used by the object registry to locate the appropriate object to +// create. +// @param opt_map Optional name-value pairs of properties to set for the newly +// created object +// @param result The newly created and configured instance. 
+template +static Status NewStaticObject( + const ConfigOptions& config_options, const std::string& id, + const std::unordered_map& opt_map, T** result) { + if (!id.empty()) { + Status status; +#ifndef ROCKSDB_LITE + status = config_options.registry->NewStaticObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + status = Status::OK(); + } else if (status.ok()) { + status = + Customizable::ConfigureNewObject(config_options, *result, opt_map); + } + return status; + } else if (opt_map.empty()) { + // There was no ID and no map (everything empty), so reset/clear the result + *result = nullptr; + return Status::OK(); + } else { + return Status::NotSupported("Cannot reset object "); + } +} + +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to create and initailize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. 
+template +static Status LoadStaticObject(const ConfigOptions& config_options, + const std::string& value, + const StaticFactoryFunc& func, T** result) { + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, *result, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (func == nullptr || + !func(id, result)) { // No factory, or it failed + return NewStaticObject(config_options, id, opt_map, result); + } else { + return Customizable::ConfigureNewObject(config_options, *result, opt_map); + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/db_ttl.h 2025-05-19 16:14:27.000000000 +0000 @@ -57,7 +57,7 @@ static Status Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, - DBWithTTL** dbptr, std::vector ttls, + DBWithTTL** dbptr, const std::vector& ttls, bool read_only = false); virtual void SetTtl(int32_t ttl) = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/env_librados.h 2025-05-19 16:14:27.000000000 +0000 @@ -76,7 +76,8 @@ // Store in *result the names of the children of the specified directory. // The names are relative to "dir". // Original contents of *results are dropped. 
- Status GetChildren(const std::string& dir, std::vector* result); + Status GetChildren(const std::string& dir, + std::vector* result) override; // Delete the named file. Status DeleteFile(const std::string& fname) override; @@ -116,18 +117,16 @@ // to go away. // // May create the named file if it does not already exist. - Status LockFile(const std::string& fname, FileLock** lock); + Status LockFile(const std::string& fname, FileLock** lock) override; // Release the lock acquired by a previous successful call to LockFile. // REQUIRES: lock was returned by a successful LockFile() call // REQUIRES: lock has not already been unlocked. - Status UnlockFile(FileLock* lock); + Status UnlockFile(FileLock* lock) override; // Get full directory name for this db. - Status GetAbsolutePath(const std::string& db_path, std::string* output_path); - - // Generate unique id - std::string GenerateUniqueId(); + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override; // Get default EnvLibrados static EnvLibrados* Default(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #include #include + #include #include #include @@ -16,6 +17,7 @@ #include #include +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/ldb_tool.h" @@ -30,6 +32,7 @@ public: // Command-line arguments static const std::string ARG_ENV_URI; + static const std::string ARG_FS_URI; static const std::string ARG_DB; static const std::string ARG_PATH; static const std::string ARG_SECONDARY_PATH; @@ -57,6 +60,7 @@ static const std::string ARG_FILE_SIZE; static const std::string 
ARG_CREATE_IF_MISSING; static const std::string ARG_NO_VALUE; + static const std::string ARG_DISABLE_CONSISTENCY_CHECKS; struct ParsedParams { std::string cmd; @@ -75,13 +79,17 @@ SelectCommand); static LDBCommand* InitFromCmdLineArgs( - int argc, char** argv, const Options& options, + int argc, char const* const* argv, const Options& options, const LDBOptions& ldb_options, const std::vector* column_families); bool ValidateCmdLineOptions(); - virtual Options PrepareOptionsForOpenDB(); + virtual void PrepareOptions(); + + virtual void OverrideBaseOptions(); + + virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts); virtual void SetDBOptions(Options options) { options_ = options; } @@ -130,6 +138,7 @@ protected: LDBCommandExecuteResult exec_state_; std::string env_uri_; + std::string fs_uri_; std::string db_path_; // If empty, open DB as primary. If non-empty, open the DB as secondary // with this secondary path. When running against a database opened by @@ -161,7 +170,8 @@ // If true, try to construct options from DB's option files. bool try_load_options_; - bool ignore_unknown_options_; + // The value passed to options.force_consistency_checks. + bool force_consistency_checks_; bool create_if_missing_; @@ -237,6 +247,7 @@ Options options_; std::vector column_families_; + ConfigOptions config_options_; LDBOptions ldb_options_; private: @@ -264,11 +275,13 @@ class LDBCommandRunner { public: - static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name); + static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name, + bool to_stderr = true); // Returns the status code to return. 0 is no error. 
static int RunCommand( - int argc, char** argv, Options options, const LDBOptions& ldb_options, + int argc, char const* const* argv, Options options, + const LDBOptions& ldb_options, const std::vector* column_families); }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,10 @@ // #pragma once +#include + +#include "rocksdb/rocksdb_namespace.h" + #ifdef FAILED #undef FAILED #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/leveldb_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,6 +11,7 @@ #include +#include "rocksdb/compression_type.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { @@ -23,8 +24,6 @@ struct Options; class Snapshot; -enum CompressionType : unsigned char; - // Options to control the behavior of a database (passed to // DB::Open). 
A LevelDBOptions object can be initialized as though // it were a LevelDB Options object, and then it can be converted into diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/object_registry.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,83 +8,302 @@ #ifndef ROCKSDB_LITE #include +#include #include -#include +#include #include #include #include + #include "rocksdb/status.h" +#include "rocksdb/utilities/regex.h" namespace ROCKSDB_NAMESPACE { +class Customizable; class Logger; +class ObjectLibrary; + // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template using FactoryFunc = std::function*, std::string*)>; +// The signature of the function for loading factories +// into an object library. This method is expected to register +// factory functions in the supplied ObjectLibrary. +// The ObjectLibrary is the library in which the factories will be loaded. +// The std::string is the argument passed to the loader function. +// The RegistrarFunc should return the number of objects loaded into this +// library +using RegistrarFunc = std::function; + +template +using ConfigureFunc = std::function; + class ObjectLibrary { - public: + private: // Base class for an Entry in the Registry. class Entry { public: virtual ~Entry() {} - Entry(const std::string& name) : name_(std::move(name)) {} + virtual bool Matches(const std::string& target) const = 0; + virtual const char* Name() const = 0; + }; - // Checks to see if the target matches this entry - virtual bool matches(const std::string& target) const { - return name_ == target; + // A class that implements an Entry based on Regex. 
+ // + // WARNING: some regexes are problematic for std::regex; see + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582 for example + // + // This class is deprecated and will be removed in a future release + class RegexEntry : public Entry { + public: + explicit RegexEntry(const std::string& name) : name_(name) { + Regex::Parse(name, ®ex_).PermitUncheckedError(); + } + + bool Matches(const std::string& target) const override { + return regex_.Matches(target); } - const std::string& Name() const { return name_; } + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + Regex regex_; // The pattern for this entry + }; + public: + // Class for matching target strings to a pattern. + // Entries consist of a name that starts the pattern and attributes + // The following attributes can be added to the entry: + // -Suffix: Comparable to name(suffix) + // -Separator: Comparable to name(separator).+ or name(separator).* + // -Number: Comparable to name(separator).[0-9]+ + // -AltName: Comparable to (name|alt) + // -Optional: Comparable to name(separator)? + // Multiple separators can be combined and cause multiple matches. + // For example, Pattern("A").AnotherName("B").AddSeparator("@").AddNumber("#") + // is roughly equivalent to "(A|B)@.+#.+" + // + // Note that though this class does provide some regex-style matching, + // it is not a full regex parser and has some key differences: + // - Separators are matched left-most. For example, an entry + // Name("Hello").AddSeparator(" ").AddSuffix("!") would match + // "Hello world!", but not "Hello world!!" 
+ // - No backtracking is necessary, enabling reliably efficient matching + class PatternEntry : public Entry { private: - const std::string name_; // The name of the Entry - }; // End class Entry + enum Quantifier { + kMatchZeroOrMore, // [suffix].* + kMatchAtLeastOne, // [suffix].+ + kMatchExact, // [suffix] + kMatchNumeric, // [suffix][0-9]+ + }; + + public: + // Short-cut for creating an entry that matches to a + // Customizable::IndividualId + static PatternEntry AsIndividualId(const std::string& name) { + PatternEntry entry(name, true); + entry.AddSeparator("@"); + entry.AddSeparator("#"); + return entry; + } + + // Creates a new PatternEntry for "name". If optional is true, + // Matches will also return true if name==target + explicit PatternEntry(const std::string& name, bool optional = true) + : name_(name), optional_(optional), slength_(0) { + nlength_ = name_.size(); + } + + // Adds a suffix (exact match of separator with no trailing characters) to + // the separator + PatternEntry& AddSuffix(const std::string& suffix) { + separators_.emplace_back(suffix, kMatchExact); + slength_ += suffix.size(); + return *this; + } + // Adds a separator (exact match of separator with trailing characters) to + // the entry + // If at_least_one is true, the separator must be followed by at least + // one character (e.g. separator.+). + // If at_least_one is false, the separator may be followed by zero or + // more characters (e.g. separator.*). 
+ PatternEntry& AddSeparator(const std::string& separator, + bool at_least_one = true) { + slength_ += separator.size(); + if (at_least_one) { + separators_.emplace_back(separator, kMatchAtLeastOne); + ++slength_; + } else { + separators_.emplace_back(separator, kMatchZeroOrMore); + } + return *this; + } + + // Adds a separator (exact match of separator with trailing numbers) to the + // entry + PatternEntry& AddNumber(const std::string& separator) { + separators_.emplace_back(separator, kMatchNumeric); + slength_ += separator.size() + 1; + return *this; + } + + // Sets another name that this entry will match, similar to (name|alt) + PatternEntry& AnotherName(const std::string& alt) { + names_.emplace_back(alt); + return *this; + } + + // Sets whether the separators are required -- similar to name(separator)? + // If optional is true, then name(separator)? would match + // If optional is false, then the separators must also match + PatternEntry& SetOptional(bool optional) { + optional_ = optional; + return *this; + } + + // Checks to see if the target matches this entry + bool Matches(const std::string& target) const override; + const char* Name() const override { return name_.c_str(); } + + private: + size_t MatchSeparatorAt(size_t start, Quantifier mode, + const std::string& target, size_t tlen, + const std::string& pattern) const; + + bool MatchesTarget(const std::string& name, size_t nlen, + const std::string& target, size_t ylen) const; + std::string name_; // The base name for this entry + size_t nlength_; // The length of name_ + std::vector names_; // Alternative names for this entry + bool optional_; // Whether matching of separators is required + size_t slength_; // The minimum required length to match the separators + std::vector> + separators_; // What to match + }; // End class Entry + + private: // An Entry containing a FactoryFunc for creating new Objects template class FactoryEntry : public Entry { public: - FactoryEntry(const std::string& name, 
FactoryFunc f) - : Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {} - ~FactoryEntry() override {} - bool matches(const std::string& target) const override { - return std::regex_match(target, pattern_); + FactoryEntry(Entry* e, FactoryFunc f) + : entry_(e), factory_(std::move(f)) {} + bool Matches(const std::string& target) const override { + return entry_->Matches(target); } + const char* Name() const override { return entry_->Name(); } + // Creates a new T object. T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, std::string* msg) const { return factory_(target, guard, msg); } + const FactoryFunc& GetFactory() const { return factory_; } private: - std::regex pattern_; // The pattern for this entry + std::unique_ptr entry_; // What to match for this entry FactoryFunc factory_; }; // End class FactoryEntry public: - // Finds the entry matching the input name and type - const Entry* FindEntry(const std::string& type, - const std::string& name) const; + explicit ObjectLibrary(const std::string& id) { id_ = id; } + + const std::string& GetID() const { return id_; } + + // Finds the factory function for the input target. + // @see PatternEntry for the matching rules to target + // @return If matched, the FactoryFunc for this target, else nullptr + template + FactoryFunc FindFactory(const std::string& target) const { + std::unique_lock lock(mu_); + auto factories = factories_.find(T::Type()); + if (factories != factories_.end()) { + for (const auto& e : factories->second) { + if (e->Matches(target)) { + const auto* fe = + static_cast*>(e.get()); + return fe->GetFactory(); + } + } + } + return nullptr; + } + + // Returns the total number of factories registered for this library. + // This method returns the sum of all factories registered for all types. + // @param num_types returns how many unique types are registered. 
+ size_t GetFactoryCount(size_t* num_types) const; + void Dump(Logger* logger) const; - // Registers the factory with the library for the pattern. + // Registers the factory with the library for the regular expression pattern. // If the pattern matches, the factory may be used to create a new object. + // + // WARNING: some regexes are problematic for std::regex; see + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582 for example + // + // Deprecated. Will be removed in a major release. Code should use AddFactory + // instead. template const FactoryFunc& Register(const std::string& pattern, const FactoryFunc& factory) { - std::unique_ptr entry(new FactoryEntry(pattern, factory)); - AddEntry(T::Type(), entry); + std::unique_ptr entry( + new FactoryEntry(new RegexEntry(pattern), factory)); + AddFactoryEntry(T::Type(), std::move(entry)); return factory; } + + // Registers the factory with the library for the name. + // If name==target, the factory may be used to create a new object. + template + const FactoryFunc& AddFactory(const std::string& name, + const FactoryFunc& func) { + std::unique_ptr entry( + new FactoryEntry(new PatternEntry(name), func)); + AddFactoryEntry(T::Type(), std::move(entry)); + return func; + } + + // Registers the factory with the library for the entry. + // If the entry matches the target, the factory may be used to create a new + // object. + // @see PatternEntry for the matching rules. + template + const FactoryFunc& AddFactory(const PatternEntry& entry, + const FactoryFunc& func) { + std::unique_ptr factory( + new FactoryEntry(new PatternEntry(entry), func)); + AddFactoryEntry(T::Type(), std::move(factory)); + return func; + } + + // Invokes the registrar function with the supplied arg for this library. 
+ int Register(const RegistrarFunc& registrar, const std::string& arg) { + return registrar(*this, arg); + } + // Returns the default ObjectLibrary static std::shared_ptr& Default(); private: - // Adds the input entry to the list for the given type - void AddEntry(const std::string& type, std::unique_ptr& entry); + void AddFactoryEntry(const char* type, std::unique_ptr&& entry) { + std::unique_lock lock(mu_); + auto& factories = factories_[type]; + factories.emplace_back(std::move(entry)); + } + // Protects the entry map + mutable std::mutex mu_; // ** FactoryFunctions for this loader, organized by type - std::unordered_map>> entries_; + std::unordered_map>> + factories_; + + // The name for this library + std::string id_; }; // The ObjectRegistry is used to register objects that can be created by a @@ -93,30 +312,46 @@ class ObjectRegistry { public: static std::shared_ptr NewInstance(); - - ObjectRegistry(); + static std::shared_ptr NewInstance( + const std::shared_ptr& parent); + static std::shared_ptr Default(); + explicit ObjectRegistry(const std::shared_ptr& parent) + : parent_(parent) {} + + std::shared_ptr AddLibrary(const std::string& id) { + auto library = std::make_shared(id); + AddLibrary(library); + return library; + } void AddLibrary(const std::shared_ptr& library) { - libraries_.emplace_back(library); + std::unique_lock lock(library_mutex_); + libraries_.push_back(library); + } + + void AddLibrary(const std::string& id, const RegistrarFunc& registrar, + const std::string& arg) { + auto library = AddLibrary(id); + library->Register(registrar, arg); } - // Creates a new T using the factory function that was registered with a - // pattern that matches the provided "target" string according to - // std::regex_match. + // Creates a new T using the factory function that was registered for this + // target. Searches through the libraries to find the first library where + // there is an entry that matches target (see PatternEntry for the matching + // rules). 
// // If no registered functions match, returns nullptr. If multiple functions // match, the factory function used is unspecified. // - // Populates res_guard with result pointer if caller is granted ownership. + // Populates guard with result pointer if caller is granted ownership. + // Deprecated. Use NewShared/Static/UniqueObject instead. template T* NewObject(const std::string& target, std::unique_ptr* guard, std::string* errmsg) { guard->reset(); - const auto* basic = FindEntry(T::Type(), target); - if (basic != nullptr) { - const auto* factory = - static_cast*>(basic); - return factory->NewFactoryObject(target, guard, errmsg); + auto factory = FindFactory(target); + if (factory != nullptr) { + return factory(target, guard, errmsg); } else { *errmsg = std::string("Could not load ") + T::Type(); return nullptr; @@ -125,7 +360,7 @@ // Creates a new unique T using the input factory functions. // Returns OK if a new unique T was successfully created - // Returns NotFound if the type/target could not be created + // Returns NotSupported if the type/target could not be created // Returns InvalidArgument if the factory return an unguarded object // (meaning it cannot be managed by a unique ptr) template @@ -134,7 +369,7 @@ std::string errmsg; T* ptr = NewObject(target, result, &errmsg); if (ptr == nullptr) { - return Status::NotFound(errmsg, target); + return Status::NotSupported(errmsg, target); } else if (*result) { return Status::OK(); } else { @@ -146,7 +381,7 @@ // Creates a new shared T using the input factory functions. 
// Returns OK if a new shared T was successfully created - // Returns NotFound if the type/target could not be created + // Returns NotSupported if the type/target could not be created // Returns InvalidArgument if the factory return an unguarded object // (meaning it cannot be managed by a shared ptr) template @@ -156,7 +391,7 @@ std::unique_ptr guard; T* ptr = NewObject(target, &guard, &errmsg); if (ptr == nullptr) { - return Status::NotFound(errmsg, target); + return Status::NotSupported(errmsg, target); } else if (guard) { result->reset(guard.release()); return Status::OK(); @@ -169,7 +404,7 @@ // Creates a new static T using the input factory functions. // Returns OK if a new static T was successfully created - // Returns NotFound if the type/target could not be created + // Returns NotSupported if the type/target could not be created // Returns InvalidArgument if the factory return a guarded object // (meaning it is managed by a unique ptr) template @@ -178,7 +413,7 @@ std::unique_ptr guard; T* ptr = NewObject(target, &guard, &errmsg); if (ptr == nullptr) { - return Status::NotFound(errmsg, target); + return Status::NotSupported(errmsg, target); } else if (guard.get()) { return Status::InvalidArgument(std::string("Cannot make a static ") + T::Type() + " from a guarded one ", @@ -189,17 +424,167 @@ } } + // Sets the object for the given id/type to be the input object + // If the registry does not contain this id/type, the object is added and OK + // is returned. If the registry contains a different object, an error is + // returned. If the registry contains the input object, OK is returned. 
+ template + Status SetManagedObject(const std::shared_ptr& object) { + assert(object != nullptr); + return SetManagedObject(object->GetId(), object); + } + + template + Status SetManagedObject(const std::string& id, + const std::shared_ptr& object) { + const auto c = std::static_pointer_cast(object); + return SetManagedObject(T::Type(), id, c); + } + + // Returns the object for the given id, if one exists. + // If the object is not found in the registry, a nullptr is returned + template + std::shared_ptr GetManagedObject(const std::string& id) const { + auto c = GetManagedObject(T::Type(), id); + return std::static_pointer_cast(c); + } + + // Returns the set of managed objects found in the registry matching + // the input type and ID. + // If the input id is not empty, then only objects of that class + // (IsInstanceOf(id)) will be returned (for example, only return LRUCache + // objects) If the input id is empty, then all objects of that type (all Cache + // objects) + template + Status ListManagedObjects(const std::string& id, + std::vector>* results) const { + std::vector> customizables; + results->clear(); + Status s = ListManagedObjects(T::Type(), id, &customizables); + if (s.ok()) { + for (const auto& c : customizables) { + results->push_back(std::static_pointer_cast(c)); + } + } + return s; + } + + template + Status ListManagedObjects(std::vector>* results) const { + return ListManagedObjects("", results); + } + + // Creates a new ManagedObject in the registry for the id if one does not + // currently exist. If an object with that ID already exists, the current + // object is returned. + // + // The ID is the identifier of the object to be returned/created and returned + // in result + // If a new object is created (using the object factories), the cfunc + // parameter will be invoked to configure the new object. 
+ template + Status GetOrCreateManagedObject(const std::string& id, + std::shared_ptr* result, + const ConfigureFunc& cfunc = nullptr) { + if (parent_ != nullptr) { + auto object = parent_->GetManagedObject(T::Type(), id); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + { + std::unique_lock lock(objects_mutex_); + auto key = ToManagedObjectKey(T::Type(), id); + auto iter = managed_objects_.find(key); + if (iter != managed_objects_.end()) { + auto object = iter->second.lock(); + if (object != nullptr) { + *result = std::static_pointer_cast(object); + return Status::OK(); + } + } + std::shared_ptr object; + Status s = NewSharedObject(id, &object); + if (s.ok() && cfunc != nullptr) { + s = cfunc(object.get()); + } + if (s.ok()) { + auto c = std::static_pointer_cast(object); + if (id != c->Name()) { + // If the ID is not the base name of the class, add the new + // object under the input ID + managed_objects_[key] = c; + } + if (id != c->GetId() && c->GetId() != c->Name()) { + // If the input and current ID do not match, and the + // current ID is not the base bame, add the new object under + // its new ID + key = ToManagedObjectKey(T::Type(), c->GetId()); + managed_objects_[key] = c; + } + *result = object; + } + return s; + } + } + // Dump the contents of the registry to the logger void Dump(Logger* logger) const; private: - const ObjectLibrary::Entry* FindEntry(const std::string& type, - const std::string& name) const; + explicit ObjectRegistry(const std::shared_ptr& library) { + libraries_.push_back(library); + } + static std::string ToManagedObjectKey(const std::string& type, + const std::string& id) { + return type + "://" + id; + } + + // Returns the Customizable managed object associated with the key (Type/ID). + // If not found, nullptr is returned. 
+ std::shared_ptr GetManagedObject(const std::string& type, + const std::string& id) const; + Status ListManagedObjects( + const std::string& type, const std::string& pattern, + std::vector>* results) const; + // Sets the managed object associated with the key (Type/ID) to c. + // If the named managed object does not exist, the object is added and OK is + // returned If the object exists and is the same as c, OK is returned + // Otherwise, an error status is returned. + Status SetManagedObject(const std::string& type, const std::string& id, + const std::shared_ptr& c); + + // Searches (from back to front) the libraries looking for the + // factory that matches this name. + // Returns the factory if it is found, and nullptr otherwise + template + const FactoryFunc FindFactory(const std::string& name) const { + { + std::unique_lock lock(library_mutex_); + for (auto iter = libraries_.crbegin(); iter != libraries_.crend(); + ++iter) { + const auto factory = iter->get()->FindFactory(name); + if (factory != nullptr) { + return factory; + } + } + } + if (parent_ == nullptr) { + return nullptr; + } else { + return parent_->FindFactory(name); + } + } // The set of libraries to search for factories for this registry. // The libraries are searched in reverse order (back to front) when // searching for entries. 
std::vector> libraries_; + std::map> managed_objects_; + std::shared_ptr parent_; + mutable std::mutex objects_mutex_; // Mutex for managed objects + mutable std::mutex library_mutex_; // Mutex for managed libraries }; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -51,6 +51,8 @@ uint32_t occ_lock_buckets = (1 << 20); }; +// Range deletions (including those in `WriteBatch`es passed to `Write()`) are +// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status` class OptimisticTransactionDB : public StackableDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_type.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,946 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class OptionTypeInfo; + +// The underlying "class/type" of the option. +// This enum is used to determine how the option should +// be converted to/from strings and compared. +enum class OptionType { + kBoolean, + kInt, + kInt32T, + kInt64T, + kUInt, + kUInt8T, + kUInt32T, + kUInt64T, + kSizeT, + kString, + kDouble, + kCompactionStyle, + kCompactionPri, + kCompressionType, + kCompactionStopStyle, + kFilterPolicy, + kChecksumType, + kEncodingType, + kEnv, + kEnum, + kStruct, + kVector, + kConfigurable, + kCustomizable, + kEncodedString, + kUnknown, +}; + +enum class OptionVerificationType { + kNormal, + kByName, // The option is pointer typed so we can only verify + // based on it's name. + kByNameAllowNull, // Same as kByName, but it also allows the case + // where one of them is a nullptr. + kByNameAllowFromNull, // Same as kByName, but it also allows the case + // where the old option is nullptr. + kDeprecated, // The option is no longer used in rocksdb. The RocksDB + // OptionsParser will still accept this option if it + // happen to exists in some Options file. However, + // the parser will not include it in serialization + // and verification processes. + kAlias, // This option represents is a name/shortcut for + // another option and should not be written or verified + // independently +}; + +// A set of modifier flags used to alter how an option is evaluated or +// processed. These flags can be combined together (e.g. kMutable | kShared). +// The kCompare flags can be used to control if/when options are compared. 
+// If kCompareNever is set, two related options would never be compared (always +// equal) If kCompareExact is set, the options will only be compared if the +// sanity mode +// is exact +// kMutable means the option can be changed after it is prepared +// kShared means the option is contained in a std::shared_ptr +// kUnique means the option is contained in a std::uniqued_ptr +// kRawPointer means the option is a raw pointer value. +// kAllowNull means that an option is allowed to be null for verification +// purposes. +// kDontSerialize means this option should not be serialized and included in +// the string representation. +// kDontPrepare means do not call PrepareOptions for this pointer value. +enum class OptionTypeFlags : uint32_t { + kNone = 0x00, // No flags + kCompareDefault = 0x0, + kCompareNever = ConfigOptions::kSanityLevelNone, + kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible, + kCompareExact = ConfigOptions::kSanityLevelExactMatch, + + kMutable = 0x0100, // Option is mutable + kRawPointer = 0x0200, // The option is stored as a raw pointer + kShared = 0x0400, // The option is stored as a shared_ptr + kUnique = 0x0800, // The option is stored as a unique_ptr + kAllowNull = 0x1000, // The option can be null + kDontSerialize = 0x2000, // Don't serialize the option + kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kStringNameOnly = 0x8000, // The option serializes to a name only +}; + +inline OptionTypeFlags operator|(const OptionTypeFlags &a, + const OptionTypeFlags &b) { + return static_cast(static_cast(a) | + static_cast(b)); +} + +inline OptionTypeFlags operator&(const OptionTypeFlags &a, + const OptionTypeFlags &b) { + return static_cast(static_cast(a) & + static_cast(b)); +} + +// Converts an string into its enumerated value. 
+// @param type_map Mapping between strings and enum values +// @param type The string representation of the enum +// @param value Returns the enum value represented by the string +// @return true if the string was found in the enum map, false otherwise. +template +bool ParseEnum(const std::unordered_map& type_map, + const std::string& type, T* value) { + auto iter = type_map.find(type); + if (iter != type_map.end()) { + *value = iter->second; + return true; + } + return false; +} + +// Converts an enum into its string representation. +// @param type_map Mapping between strings and enum values +// @param type The enum +// @param value Returned as the string representation of the enum +// @return true if the enum was found in the enum map, false otherwise. +template +bool SerializeEnum(const std::unordered_map& type_map, + const T& type, std::string* value) { + for (const auto& pair : type_map) { + if (pair.second == type) { + *value = pair.first; + return true; + } + } + return false; +} + +template +Status ParseVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::vector* result); + +template +Status SerializeVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::vector& vec, + std::string* value); +template +bool VectorsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::vector& vec1, const std::vector& vec2, + std::string* mismatch); + +// Function for converting a option string value into its underlying +// representation in "addr" +// On success, Status::OK is returned and addr is set to the parsed form +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the value is parsed +// @param name The name of the options being parsed +// @param value The string representation of the 
option +// @param addr Pointer to the object +using ParseFunc = std::function; + +// Function for converting an option "addr" into its string representation. +// On success, Status::OK is returned and value is the serialized form. +// On failure, a non-OK status is returned +// @param opts The ConfigOptions controlling how the values are serialized +// @param name The name of the options being serialized +// @param addr Pointer to the value being serialized +// @param value The result of the serialization. +using SerializeFunc = std::function; + +// Function for comparing two option values +// If they are not equal, updates "mismatch" with the name of the bad option +// @param opts The ConfigOptions controlling how the values are compared +// @param name The name of the options being compared +// @param addr1 The first address to compare +// @param addr2 The address to compare to +// @param mismatch If the values are not equal, the name of the option that +// first differs +using EqualsFunc = std::function; + +// A struct for storing constant option information such as option name, +// option type, and offset. 
+class OptionTypeInfo { + public: + // A simple "normal", non-mutable Type "type" at offset + OptionTypeInfo(int offset, OptionType type) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(OptionVerificationType::kNormal), + flags_(OptionTypeFlags::kNone) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags) + : offset_(offset), + parse_func_(nullptr), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(nullptr), + equals_func_(nullptr), + type_(type), + verification_(verification), + flags_(flags) {} + + OptionTypeInfo(int offset, OptionType type, + OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) + : offset_(offset), + parse_func_(parse_func), + serialize_func_(serialize_func), + equals_func_(equals_func), + type_(type), + verification_(verification), + flags_(flags) {} + + // Creates an OptionTypeInfo for an enum type. Enums use an additional + // map to convert the enums to/from their string representation. + // To create an OptionTypeInfo that is an Enum, one should: + // - Create a static map of string values to the corresponding enum value + // - Call this method passing the static map in as a parameter. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. 
+ // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + template + static OptionTypeInfo Enum( + int offset, const std::unordered_map* const map, + OptionTypeFlags flags = OptionTypeFlags::kNone) { + return OptionTypeInfo( + offset, OptionType::kEnum, OptionVerificationType::kNormal, flags, + // Uses the map argument to convert the input string into + // its corresponding enum value. If value is found in the map, + // addr is updated to the corresponding map entry. + // @return OK if the value is found in the map + // @return InvalidArgument if the value is not found in the map + [map](const ConfigOptions&, const std::string& name, + const std::string& value, void* addr) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (ParseEnum(*map, value, static_cast(addr))) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }, + // Uses the map argument to convert the input enum into + // its corresponding string value. If enum value is found in the map, + // value is updated to the corresponding string value in the map. + // @return OK if the enum is found in the map + // @return InvalidArgument if the enum is not found in the map + [map](const ConfigOptions&, const std::string& name, const void* addr, + std::string* value) { + if (map == nullptr) { + return Status::NotSupported("No enum mapping ", name); + } else if (SerializeEnum(*map, (*static_cast(addr)), + value)) { + return Status::OK(); + } else { + return Status::InvalidArgument("No mapping for enum ", name); + } + }, + // Casts addr1 and addr2 to the enum type and returns true if + // they are equal, false otherwise. 
+ [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + return (*static_cast(addr1) == + *static_cast(addr2)); + }); + } // End OptionTypeInfo::Enum + + // Creates an OptionTypeInfo for a Struct type. Structs have a + // map of string-OptionTypeInfo associated with them that describes how + // to process the object for parsing, serializing, and matching. + // Structs also have a struct_name, which is the name of the object + // as registered in the parent map. + // When processing a struct, the option name can be specified as: + // - Meaning to process the entire struct. + // - Meaning to process the single field + // - Process the single fields + // The CompactionOptionsFIFO, CompactionOptionsUniversal, and LRUCacheOptions + // are all examples of Struct options. + // + // To create an OptionTypeInfo that is a Struct, one should: + // - Create a static map of string-OptionTypeInfo corresponding to the + // properties of the object that can be set via the options. + // - Call this method passing the name and map in as parameters. + // Note that it is not necessary to add a new OptionType or make any + // other changes -- the returned object handles parsing, serialization, and + // comparisons. 
+ // + // @param offset The offset in the option object for this enum + // @param map The string to enum mapping for this enum + static OptionTypeInfo Struct( + const std::string& struct_name, + const std::unordered_map* struct_map, + int offset, OptionVerificationType verification, OptionTypeFlags flags) { + return OptionTypeInfo( + offset, OptionType::kStruct, verification, flags, + // Parses the struct and updates the fields at addr + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + return ParseStruct(opts, struct_name, struct_map, name, value, addr); + }, + // Serializes the struct options into value + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr, + std::string* value) { + return SerializeStruct(opts, struct_name, struct_map, name, addr, + value); + }, + // Compares the struct fields of addr1 and addr2 for equality + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + return StructsAreEqual(opts, struct_name, struct_map, name, addr1, + addr2, mismatch); + }); + } + static OptionTypeInfo Struct( + const std::string& struct_name, + const std::unordered_map* struct_map, + int offset, OptionVerificationType verification, OptionTypeFlags flags, + const ParseFunc& parse_func) { + return OptionTypeInfo( + offset, OptionType::kStruct, verification, flags, parse_func, + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr, + std::string* value) { + return SerializeStruct(opts, struct_name, struct_map, name, addr, + value); + }, + [struct_name, struct_map](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + return StructsAreEqual(opts, struct_name, struct_map, name, addr1, + addr2, mismatch); + }); + } + + template + static OptionTypeInfo 
Vector(int _offset, + OptionVerificationType _verification, + OptionTypeFlags _flags, + const OptionTypeInfo& elem_info, + char separator = ':') { + return OptionTypeInfo( + _offset, OptionType::kVector, _verification, _flags, + [elem_info, separator](const ConfigOptions& opts, + const std::string& name, + const std::string& value, void* addr) { + auto result = static_cast*>(addr); + return ParseVector(opts, elem_info, separator, name, value, + result); + }, + [elem_info, separator](const ConfigOptions& opts, + const std::string& name, const void* addr, + std::string* value) { + const auto& vec = *(static_cast*>(addr)); + return SerializeVector(opts, elem_info, separator, name, vec, + value); + }, + [elem_info](const ConfigOptions& opts, const std::string& name, + const void* addr1, const void* addr2, + std::string* mismatch) { + const auto& vec1 = *(static_cast*>(addr1)); + const auto& vec2 = *(static_cast*>(addr2)); + return VectorsAreEqual(opts, elem_info, name, vec1, vec2, + mismatch); + }); + } + + // Create a new std::shared_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::shared_ptr object. 
+ // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomSharedPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto* shared = static_cast*>(addr); + if (name == kIdPropName() && value.empty()) { + shared->reset(); + return Status::OK(); + } else { + return T::CreateFromString(opts, value, shared); + } + }, + serialize_func, equals_func); + } + + // Create a new std::unique_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::unique_ptr object. 
+ // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomUniquePtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kUnique, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto* unique = static_cast*>(addr); + if (name == kIdPropName() && value.empty()) { + unique->reset(); + return Status::OK(); + } else { + return T::CreateFromString(opts, value, unique); + } + }, + serialize_func, equals_func); + } + + // Create a new Customizable* OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // T object. 
+ // + // @param _offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomRawPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto** pointer = static_cast(addr); + if (name == kIdPropName() && value.empty()) { + *pointer = nullptr; + return Status::OK(); + } else { + return T::CreateFromString(opts, value, pointer); + } + }, + serialize_func, equals_func); + } + + bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; } + + bool IsEditable(const ConfigOptions& opts) const { + if (opts.mutable_options_only) { + return IsMutable(); + } else { + return true; + } + } + bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); } + + bool IsDeprecated() const { + return IsEnabled(OptionVerificationType::kDeprecated); + } + + // Returns true if the option is marked as an Alias. + // Aliases are valid options that are parsed but are not converted to strings + // or compared. + bool IsAlias() const { return IsEnabled(OptionVerificationType::kAlias); } + + bool IsEnabled(OptionVerificationType ovf) const { + return verification_ == ovf; + } + + // Returns the sanity level for comparing the option. 
+ // If the options should not be compared, returns None + // If the option has a compare flag, returns it. + // Otherwise, returns "exact" + ConfigOptions::SanityLevel GetSanityLevel() const { + if (IsDeprecated() || IsAlias()) { + return ConfigOptions::SanityLevel::kSanityLevelNone; + } else { + auto match = (flags_ & OptionTypeFlags::kCompareExact); + if (match == OptionTypeFlags::kCompareDefault) { + return ConfigOptions::SanityLevel::kSanityLevelExactMatch; + } else { + return (ConfigOptions::SanityLevel)match; + } + } + } + + // Returns true if the option should be serialized. + // Options should be serialized if the are not deprecated, aliases, + // or marked as "Don't Serialize". + bool ShouldSerialize() const { + if (IsDeprecated() || IsAlias()) { + return false; + } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { + return false; + } else { + return true; + } + } + + // Returns true if the option is allowed to be null. + // Options can be null if the verification type is allow from null + // or if the flags specify allow null. 
+ bool CanBeNull() const { + return (IsEnabled(OptionTypeFlags::kAllowNull) || + IsEnabled(OptionVerificationType::kByNameAllowNull) || + IsEnabled(OptionVerificationType::kByNameAllowFromNull)); + } + + bool IsSharedPtr() const { return IsEnabled(OptionTypeFlags::kShared); } + + bool IsUniquePtr() const { return IsEnabled(OptionTypeFlags::kUnique); } + + bool IsRawPtr() const { return IsEnabled(OptionTypeFlags::kRawPointer); } + + bool IsByName() const { + return (verification_ == OptionVerificationType::kByName || + verification_ == OptionVerificationType::kByNameAllowNull || + verification_ == OptionVerificationType::kByNameAllowFromNull); + } + + bool IsStruct() const { return (type_ == OptionType::kStruct); } + + bool IsConfigurable() const { + return (type_ == OptionType::kConfigurable || + type_ == OptionType::kCustomizable); + } + + bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); } + + // Returns the underlying pointer for the type at base_addr + // The value returned is the underlying "raw" pointer, offset from base. + template + const T* AsRawPointer(const void* const base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + const void* opt_addr = static_cast(base_addr) + offset_; + if (IsUniquePtr()) { + const std::unique_ptr* ptr = + static_cast*>(opt_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + const std::shared_ptr* ptr = + static_cast*>(opt_addr); + return ptr->get(); + } else if (IsRawPtr()) { + const T* const* ptr = static_cast(opt_addr); + return *ptr; + } else { + return static_cast(opt_addr); + } + } + + // Returns the underlying pointer for the type at base_addr + // The value returned is the underlying "raw" pointer, offset from base. 
+ template + T* AsRawPointer(void* base_addr) const { + if (base_addr == nullptr) { + return nullptr; + } + void* opt_addr = static_cast(base_addr) + offset_; + if (IsUniquePtr()) { + std::unique_ptr* ptr = static_cast*>(opt_addr); + return ptr->get(); + } else if (IsSharedPtr()) { + std::shared_ptr* ptr = static_cast*>(opt_addr); + return ptr->get(); + } else if (IsRawPtr()) { + T** ptr = static_cast(opt_addr); + return *ptr; + } else { + return static_cast(opt_addr); + } + } + + // Parses the option in "opt_value" according to the rules of this class + // and updates the value at "opt_ptr". + // On success, Status::OK() is returned. On failure: + // NotFound means the opt_name is not valid for this option + // NotSupported means we do not know how to parse the value for this option + // InvalidArgument means the opt_value is not valid for this option. + Status Parse(const ConfigOptions& config_options, const std::string& opt_name, + const std::string& opt_value, void* const opt_ptr) const; + + // Serializes the option in "opt_addr" according to the rules of this class + // into the value at "opt_value". + Status Serialize(const ConfigOptions& config_options, + const std::string& opt_name, const void* const opt_ptr, + std::string* opt_value) const; + + // Compares the "addr1" and "addr2" values according to the rules of this + // class and returns true if they match. On a failed match, mismatch is the + // name of the option that failed to match. + bool AreEqual(const ConfigOptions& config_options, + const std::string& opt_name, const void* const addr1, + const void* const addr2, std::string* mismatch) const; + + // Used to override the match rules for "ByName" options. 
+ bool AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr) const; + bool AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, const void* const this_ptr, + const std::string& that_value) const; + + // Parses the input opts_map according to the type_map for the opt_addr + // For each name-value pair in opts_map, find the corresponding name in + // type_map If the name is found: + // - set the corresponding value in opt_addr, returning the status on + // failure; + // If the name is not found: + // - If unused is specified, add the name-value to unused and continue + // - If ingore_unknown_options is false, return NotFound + // Returns OK if all options were either: + // - Successfully set + // - options were not found and ignore_unknown_options=true + // - options were not found and unused was specified + // Note that this method is much less sophisticated than the comparable + // Configurable::Configure methods. For example, on error, there is no + // attempt to return opt_addr to the initial state. Additionally, there + // is no effort to initialize (Configurable::PrepareOptions) the object + // on success. This method should typically only be used for simpler, + // standalone structures and not those that contain shared and embedded + // objects. + static Status ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + static Status ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + + // Parses the input value according to the map for the struct at opt_addr + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. 
This may + // be the whole struct or a sub-element of it, based on struct_name and + // opt_name. + static Status ParseStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const std::string& value, void* opt_addr); + + // Serializes the values from opt_addr using the rules in type_map. + // Returns the serialized form in result. + // Returns OK on success or non-OK if some option could not be serialized. + static Status SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* value); + + // Serializes the input addr according to the map for the struct to value. + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. This may + // be the whole struct or a sub-element of it + static Status SerializeStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const void* opt_addr, std::string* value); + + // Compares the values in this_addr and that_addr using the rules in type_map. + // If the values are equal, returns true + // If the values are not equal, returns false and sets mismatch to the name + // of the first value that did not match. + static bool TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& map, + const void* this_addr, const void* that_addr, std::string* mismatch); + + // Compares the input offsets according to the map for the struct and returns + // true if they are equivalent, false otherwise. + // struct_name is the name of the struct option as registered + // opt_name is the name of the option being evaluated. 
This may + // be the whole struct or a sub-element of it + static bool StructsAreEqual( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* map, + const std::string& opt_name, const void* this_offset, + const void* that_offset, std::string* mismatch); + + // Finds the entry for the opt_name in the opt_map, returning + // nullptr if not found. + // If found, elem_name will be the name of option to find. + // This may be opt_name, or a substring of opt_name. + // For "simple" options, opt_name will be equal to elem_name. Given the + // opt_name "opt", elem_name will equal "opt". + // For "embedded" options (like structs), elem_name may be opt_name + // or a field within the opt_name. For example, given the struct "struct", + // and opt_name of "struct.field", elem_name will be "field" + static const OptionTypeInfo* Find( + const std::string& opt_name, + const std::unordered_map& opt_map, + std::string* elem_name); + + // Returns the next token marked by the delimiter from "opts" after start in + // token and updates end to point to where that token stops. Delimiters inside + // of braces are ignored. Returns OK if a token is found and an error if the + // input opts string is mis-formatted. + // Given "a=AA;b=BB;" start=2 and delimiter=";", token is "AA" and end points + // to "b" Given "{a=A;b=B}", the token would be "a=A;b=B" + // + // @param opts The string in which to find the next token + // @param delimiter The delimiter between tokens + // @param start The position in opts to start looking for the token + // @param ed Returns the end position in opts of the token + // @param token Returns the token + // @returns OK if a token was found + // @return InvalidArgument if the braces mismatch + // (e.g. "{a={b=c;}" ) -- missing closing brace + // @return InvalidArgument if an expected delimiter is not found + // e.g. 
"{a=b}c=d;" -- missing delimiter before "c" + static Status NextToken(const std::string& opts, char delimiter, size_t start, + size_t* end, std::string* token); + + constexpr static const char* kIdPropName() { return "id"; } + constexpr static const char* kIdPropSuffix() { return ".id"; } + + private: + int offset_; + + // The optional function to convert a string to its representation + ParseFunc parse_func_; + + // The optional function to convert a value to its string representation + SerializeFunc serialize_func_; + + // The optional function to match two option values + EqualsFunc equals_func_; + + OptionType type_; + OptionVerificationType verification_; + OptionTypeFlags flags_; +}; + +// Parses the input value into elements of the result vector. This method +// will break the input value into the individual tokens (based on the +// separator), where each of those tokens will be parsed based on the rules of +// elem_info. The result vector will be populated with elements based on the +// input tokens. For example, if the value=1:2:3:4:5 and elem_info parses +// integers, the result vector will contain the integers 1,2,3,4,5 +// @param config_options Controls how the option value is parsed. +// @param elem_info Controls how individual tokens in value are parsed +// @param separator Character separating tokens in values (':' in the above +// example) +// @param name The name associated with this vector option +// @param value The input string to parse into tokens +// @param result Returns the results of parsing value into its elements. 
+// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status ParseVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::vector* result) { + result->clear(); + Status status; + + // Turn off ignore_unknown_objects so we can tell if the returned + // object is valid or not. + ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; + for (size_t start = 0, end = 0; + status.ok() && start < value.size() && end != std::string::npos; + start = end + 1) { + std::string token; + status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); + if (status.ok()) { + T elem; + status = elem_info.Parse(copy, name, token, &elem); + if (status.ok()) { + result->emplace_back(elem); + } else if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); + } + } + } + return status; +} + +// Serializes the input vector into its output value. Elements are +// separated by the separator character. This element will convert all of the +// elements in vec into their serialized form, using elem_info to perform the +// serialization. +// For example, if the vec contains the integers 1,2,3,4,5 and elem_info +// serializes the output would be 1:2:3:4:5 for separator ":". +// @param config_options Controls how the option value is serialized. 
+// @param elem_info Controls how individual tokens in value are serialized +// @param separator Character separating tokens in value (':' in the above +// example) +// @param name The name associated with this vector option +// @param vec The input vector to serialize +// @param value The output string of serialized options +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status SerializeVector(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::vector& vec, + std::string* value) { + std::string result; + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& elem : vec) { + std::string elem_str; + Status s = elem_info.Serialize(embedded, name, &elem, &elem_str); + if (!s.ok()) { + return s; + } else if (!elem_str.empty()) { + if (printed++ > 0) { + result += separator; + } + // If the element contains embedded separators, put it inside of brackets + if (elem_str.find(separator) != std::string::npos) { + result += "{" + elem_str + "}"; + } else { + result += elem_str; + } + } + } + if (result.find("=") != std::string::npos) { + *value = "{" + result + "}"; + } else if (printed > 1 && result.at(0) == '{') { + *value = "{" + result + "}"; + } else { + *value = result; + } + return Status::OK(); +} + +// Compares the input vectors vec1 and vec2 for equality +// If the vectors are the same size, elements of the vectors are compared one by +// one using elem_info to perform the comparison. +// +// @param config_options Controls how the vectors are compared. +// @param elem_info Controls how individual elements in the vectors are compared +// @param name The name associated with this vector option +// @param vec1,vec2 The vectors to compare. 
+// @param mismatch If the vectors are not equivalent, mismatch will point to +// the first +// element of the comparison that did not match. +// @return true If vec1 and vec2 are "equal", false otherwise +template +bool VectorsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::vector& vec1, const std::vector& vec2, + std::string* mismatch) { + if (vec1.size() != vec2.size()) { + *mismatch = name; + return false; + } else { + for (size_t i = 0; i < vec1.size(); ++i) { + if (!elem_info.AreEqual( + config_options, name, reinterpret_cast(&vec1[i]), + reinterpret_cast(&vec2[i]), mismatch)) { + return false; + } + } + return true; + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/options_util.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,12 +11,14 @@ #include #include +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; // Constructs the DBOptions and ColumnFamilyDescriptors by loading the // latest RocksDB options file stored in the specified rocksdb database. // @@ -45,20 +47,26 @@ // pointer options of BlockBasedTableOptions (flush_block_policy_factory, // block_cache, and block_cache_compressed), which will be initialized with // default values. Developers can further specify these three options by -// casting the return value of TableFactoroy::GetOptions() to +// casting the return value of TableFactory::GetOptions() to // BlockBasedTableOptions and making necessary changes. 
// // ignore_unknown_options can be set to true if you want to ignore options -// that are from a newer version of the db, esentially for forward +// that are from a newer version of the db, essentially for forward // compatibility. // +// config_options contains a set of options that controls the processing +// of the options. The LoadLatestOptions(ConfigOptions...) should be preferred; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding options +// in the ConfigOptions parameter. +// // examples/options_file_example.cc demonstrates how to use this function // to open a RocksDB instance. // // @return the function returns an OK status when it went successfully. If // the specified "dbpath" does not contain any option file, then a // Status::NotFound will be returned. A return value other than -// Status::OK or Status::NotFound indicates there're some error related +// Status::OK or Status::NotFound indicates there is some error related // to the options file itself. // // @see LoadOptionsFromFile @@ -67,16 +75,30 @@ std::vector* cf_descs, bool ignore_unknown_options = false, std::shared_ptr* cache = {}); +Status LoadLatestOptions(const ConfigOptions& config_options, + const std::string& dbpath, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); // Similar to LoadLatestOptions, this function constructs the DBOptions // and ColumnFamilyDescriptors based on the specified RocksDB Options file. // +// The LoadOptionsFile(ConfigOptions...) should be preferred; +// the alternative signature may be deprecated in a future release. The +// equivalent functionality can be achieved by setting the corresponding +// options in the ConfigOptions parameter. 
+// // @see LoadLatestOptions Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, DBOptions* db_options, std::vector* cf_descs, bool ignore_unknown_options = false, std::shared_ptr* cache = {}); +Status LoadOptionsFromFile(const ConfigOptions& config_options, + const std::string& options_file_name, + DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); // Returns the latest options file name under the specified db path. Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, @@ -97,6 +119,10 @@ const std::string& dbpath, Env* env, const DBOptions& db_options, const std::vector& cf_descs, bool ignore_unknown_options = false); +Status CheckOptionsCompatibility( + const ConfigOptions& config_options, const std::string& dbpath, + const DBOptions& db_options, + const std::vector& cf_descs); } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/regex.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper for parsed regular expressions. The regex syntax and matching is +// compatible with std::regex. +// +// !!!!!! 
WARNING !!!!!!: The implementation currently uses std::regex, which +// has terrible performance in some cases, including possible crash due to +// stack overflow. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61582 +// for example. Avoid use in production as much as possible. +// +// Internal note: see also TestRegex +class Regex { + public: + // Note: Cannot be constructed with a pattern, so that syntax errors can + // be handled without using exceptions. + + // Parse returns OK and saves to `out` when the pattern is valid regex + // syntax (modified ECMAScript), or else returns InvalidArgument. + // See https://en.cppreference.com/w/cpp/regex/ecmascript + static Status Parse(const char *pattern, Regex *out); + static Status Parse(const std::string &pattern, Regex *out); + + // Checks that the whole of str is matched by this regex. If called on a + // default-constructed Regex, will trigger assertion failure in DEBUG build + // or return false in release build. + bool Matches(const std::string &str) const; + + private: + class Impl; + std::shared_ptr impl_; // shared_ptr for simple implementation +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/replayer.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,87 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class TraceRecord; +class TraceRecordResult; + +struct ReplayOptions { + // Number of threads used for replaying. If 0 or 1, replay using + // single thread. + uint32_t num_threads; + + // Enables fast forwarding a replay by increasing/reducing the delay between + // the ingested traces. + // If > 0.0 and < 1.0, slow down the replay by this amount. + // If 1.0, replay the operations at the same rate as in the trace stream. + // If > 1, speed up the replay by this amount. + double fast_forward; + + ReplayOptions() : num_threads(1), fast_forward(1.0) {} + + ReplayOptions(uint32_t num_of_threads, double fast_forward_ratio) + : num_threads(num_of_threads), fast_forward(fast_forward_ratio) {} +}; + +// Replayer helps to replay the captured RocksDB query level operations. +// The Replayer can either be created from DB::NewReplayer method, or be +// instantiated via db_bench today, on using "replay" benchmark. +class Replayer { + public: + virtual ~Replayer() = default; + + // Make some preparation before replaying the trace. This will also reset the + // replayer in order to restart replaying. + virtual Status Prepare() = 0; + + // Return the timestamp when the trace recording was started. + virtual uint64_t GetHeaderTimestamp() const = 0; + + // Atomically read one trace into a TraceRecord (excluding the header and + // footer traces). + // Return Status::OK() on success; + // Status::Incomplete() if Prepare() was not called or no more available + // trace; + // Status::NotSupported() if the read trace type is not supported. + virtual Status Next(std::unique_ptr* record) = 0; + + // Execute one TraceRecord. + // Return Status::OK() if the execution was successful. 
Get/MultiGet traces + // will still return Status::OK() even if they got Status::NotFound() + // from DB::Get() or DB::MultiGet(); + // Status::Incomplete() if Prepare() was not called or no more available + // trace; + // Status::NotSupported() if the operation is not supported; + // Otherwise, return the corresponding error status. + // + // The actual operation execution status and result(s) will be saved in + // result. For example, a GetQueryTraceRecord will have its DB::Get() status + // and the returned value saved in a SingleValueTraceExecutionResult. + virtual Status Execute(const std::unique_ptr& record, + std::unique_ptr* result) = 0; + + // Replay all the traces from the provided trace stream, taking the delay + // between the traces into consideration. + // + // result_callback reports the status of executing a trace record, and the + // actual operation execution result (See the description for Execute()). + virtual Status Replay( + const ReplayOptions& options, + const std::function&&)>& + result_callback) = 0; +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/sim_cache.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,7 +25,7 @@ // can help users tune their current block cache size, and determine how // efficient they are using the memory. 
// -// Since GetSimCapacity() returns the capacity for simulutation, it differs from +// Since GetSimCapacity() returns the capacity for simulation, it differs from // actual memory usage, which can be estimated as: // sim_capacity * entry_size / (entry_size + block_size), // where 76 <= entry_size <= 104, @@ -60,7 +60,7 @@ // sets the maximum configured capacity of the simcache. When the new // capacity is less than the old capacity and the existing usage is // greater than new capacity, the implementation will purge old entries - // to fit new capapicty. + // to fit new capacity. virtual void SetSimCapacity(size_t capacity) = 0; // returns the lookup times of simcache diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -141,6 +141,11 @@ import_options, metadata, handle); } + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_opts) override { + return db_->VerifyFileChecksums(read_opts); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } virtual Status VerifyChecksum(const ReadOptions& options) override { @@ -347,6 +352,17 @@ db_->GetLiveFilesMetaData(metadata); } + virtual Status GetLiveFilesChecksumInfo( + FileChecksumList* checksum_list) override { + return db_->GetLiveFilesChecksumInfo(checksum_list); + } + + virtual Status GetLiveFilesStorageInfo( + const LiveFilesStorageInfoOptions& opts, + std::vector* files) override { + return db_->GetLiveFilesStorageInfo(opts, files); + } + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* cf_meta) override { db_->GetColumnFamilyMetaData(column_family, 
cf_meta); @@ -362,6 +378,31 @@ using DB::EndBlockCacheTrace; Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartIOTrace(options, std::move(trace_writer)); + } + + using DB::EndIOTrace; + Status EndIOTrace() override { return db_->EndIOTrace(); } + + using DB::StartTrace; + Status StartTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartTrace(options, std::move(trace_writer)); + } + + using DB::EndTrace; + Status EndTrace() override { return db_->EndTrace(); } + + using DB::NewDefaultReplayer; + Status NewDefaultReplayer(const std::vector& handles, + std::unique_ptr&& reader, + std::unique_ptr* replayer) override { + return db_->NewDefaultReplayer(handles, std::move(reader), replayer); + } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, @@ -378,6 +419,16 @@ return db_->SetPreserveDeletesSequenceNumber(seqnum); } + Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string ts_low) override { + return db_->IncreaseFullHistoryTsLow(column_family, ts_low); + } + + Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, + std::string* ts_low) override { + return db_->GetFullHistoryTsLow(column_family, ts_low); + } + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { return db_->GetSortedWalFiles(files); } @@ -392,6 +443,13 @@ return db_->GetCreationTimeOfOldestFile(creation_time); } + // WARNING: This API is planned for removal in RocksDB 7.0 since it does not + // operate at the proper level of abstraction for a key-value store, and its + // contract/restrictions are poorly documented. For example, it returns non-OK + // `Status` for non-bottommost files and files undergoing compaction. 
Since we + // do not plan to maintain it, the contract will likely remain underspecified + // until its removal. Any user is encouraged to read the implementation + // carefully and migrate away from it when possible. virtual Status DeleteFile(std::string name) override { return db_->DeleteFile(name); } @@ -400,6 +458,10 @@ return db_->GetDbIdentity(identity); } + virtual Status GetDbSessionId(std::string& session_id) const override { + return db_->GetDbSessionId(session_id); + } + using DB::SetOptions; virtual Status SetOptions(ColumnFamilyHandle* column_family_handle, const std::unordered_map& diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/table_properties_collectors.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,13 +14,27 @@ // A factory of a table property collector that marks a SST // file as need-compaction when it observe at least "D" deletion -// entries in any "N" consecutive entires. +// entries in any "N" consecutive entries or the ratio of tombstone +// entries in the whole file >= the specified deletion ratio. class CompactOnDeletionCollectorFactory : public TablePropertiesCollectorFactory { public: - virtual ~CompactOnDeletionCollectorFactory() {} + // A factory of a table property collector that marks a SST + // file as need-compaction when it observe at least "D" deletion + // entries in any "N" consecutive entries, or the ratio of tombstone + // entries >= deletion_ratio. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction + // based on deletion ratio. 
+ CompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger, + double deletion_ratio); - virtual TablePropertiesCollector* CreateTablePropertiesCollector( + ~CompactOnDeletionCollectorFactory() {} + + TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) override; // Change the value of sliding_window_size "N" @@ -28,47 +42,49 @@ void SetWindowSize(size_t sliding_window_size) { sliding_window_size_.store(sliding_window_size); } + size_t GetWindowSize() const { return sliding_window_size_.load(); } // Change the value of deletion_trigger "D" void SetDeletionTrigger(size_t deletion_trigger) { deletion_trigger_.store(deletion_trigger); } - virtual const char* Name() const override { - return "CompactOnDeletionCollector"; + size_t GetDeletionTrigger() const { return deletion_trigger_.load(); } + // Change deletion ratio. + // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction + // based on deletion ratio. + void SetDeletionRatio(double deletion_ratio) { + deletion_ratio_.store(deletion_ratio); } - private: - friend std::shared_ptr - NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, - size_t deletion_trigger); - // A factory of a table property collector that marks a SST - // file as need-compaction when it observe at least "D" deletion - // entries in any "N" consecutive entires. 
- // - // @param sliding_window_size "N" - // @param deletion_trigger "D" - CompactOnDeletionCollectorFactory(size_t sliding_window_size, - size_t deletion_trigger) - : sliding_window_size_(sliding_window_size), - deletion_trigger_(deletion_trigger) {} + double GetDeletionRatio() const { return deletion_ratio_.load(); } + static const char* kClassName() { return "CompactOnDeletionCollector"; } + const char* Name() const override { return kClassName(); } + std::string ToString() const override; + + private: std::atomic sliding_window_size_; std::atomic deletion_trigger_; + std::atomic deletion_ratio_; }; // Creates a factory of a table property collector that marks a SST // file as need-compaction when it observe at least "D" deletion -// entries in any "N" consecutive entires. +// entries in any "N" consecutive entries, or the ratio of tombstone +// entries >= deletion_ratio. // // @param sliding_window_size "N". Note that this number will be // round up to the smallest multiple of 128 that is no less // than the specified size. // @param deletion_trigger "D". Note that even when "N" is changed, // the specified number for "D" will not be changed. +// @param deletion_ratio, if <= 0 or > 1, disable triggering compaction +// based on deletion ratio. Disabled by default. 
extern std::shared_ptr NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, - size_t deletion_trigger); + size_t deletion_trigger, + double deletion_ratio = 0); } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h 2025-05-19 16:14:27.000000000 +0000 @@ -24,6 +24,83 @@ using TransactionID = uint64_t; +/* + class Endpoint allows to define prefix ranges. + + Prefix ranges are introduced below. + + == Basic Ranges == + Let's start from basic ranges. Key Comparator defines ordering of rowkeys. + Then, one can specify finite closed ranges by just providing rowkeys of their + endpoints: + + lower_endpoint <= X <= upper_endpoint + + However our goal is to provide a richer set of endpoints. Read on. + + == Lexicographic ordering == + A lexicographic (or dictionary) ordering satisfies these criteria: If there + are two keys in form + key_a = {prefix_a, suffix_a} + key_b = {prefix_b, suffix_b} + and + prefix_a < prefix_b + then + key_a < key_b. + + == Prefix ranges == + With lexicographic ordering, one may want to define ranges in form + + "prefix is $PREFIX" + + which translates to a range in form + + {$PREFIX, -infinity} < X < {$PREFIX, +infinity} + + where -infinity will compare less than any possible suffix, and +infinity + will compare as greater than any possible suffix. + + class Endpoint allows to define these kind of rangtes. + + == Notes == + BytewiseComparator and ReverseBytewiseComparator produce lexicographic + ordering. + + The row comparison function is able to compare key prefixes. 
If the data + domain includes keys A and B, then the comparison function is able to compare + equal-length prefixes: + + min_len= min(byte_length(A), byte_length(B)); + cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid + + == Other options == + As far as MyRocks is concerned, the alternative to prefix ranges would be to + support both open (non-inclusive) and closed (inclusive) range endpoints. +*/ + +class Endpoint { + public: + Slice slice; + + /* + true : the key has a "+infinity" suffix. A suffix that would compare as + greater than any other suffix + false : otherwise + */ + bool inf_suffix; + + explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false) + : slice(slice_arg), inf_suffix(inf_suffix_arg) {} + + explicit Endpoint(const char* s, bool inf_suffix_arg = false) + : slice(s), inf_suffix(inf_suffix_arg) {} + + Endpoint(const char* s, size_t size, bool inf_suffix_arg = false) + : slice(s, size), inf_suffix(inf_suffix_arg) {} + + Endpoint() : inf_suffix(false) {} +}; + // Provides notification to the caller of SetSnapshotOnNextOperation when // the actual snapshot gets created class TransactionNotifier { @@ -139,7 +216,9 @@ // // If this transaction was created by a TransactionDB(), Status::Expired() // may be returned if this transaction has lived for longer than - // TransactionOptions.expiration. + // TransactionOptions.expiration. Status::TxnNotPrepared() may be returned if + // TransactionOptions.skip_prepare is false and Prepare is not called on this + // transaction before Commit. virtual Status Commit() = 0; // Discard all batched writes in this transaction. @@ -275,6 +354,12 @@ } } + // Get a range lock on [start_endpoint; end_endpoint]. 
+ virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&, + const Endpoint&) { + return Status::NotSupported(); + } + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, std::string* value, bool exclusive = true, const bool do_validate = true) = 0; @@ -491,7 +576,8 @@ AWAITING_PREPARE = 1, PREPARED = 2, AWAITING_COMMIT = 3, - COMMITED = 4, + COMMITTED = 4, + COMMITED = COMMITTED, // old misspelled name AWAITING_ROLLBACK = 5, ROLLEDBACK = 6, LOCKS_STOLEN = 7, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h 2025-05-19 16:14:27.000000000 +0000 @@ -31,6 +31,122 @@ const uint32_t kInitialMaxDeadlocks = 5; +class LockManager; +struct RangeLockInfo; + +// A lock manager handle +// The workflow is as follows: +// * Use a factory method (like NewRangeLockManager()) to create a lock +// manager and get its handle. +// * A Handle for a particular kind of lock manager will have extra +// methods and parameters to control the lock manager +// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It +// will be used to perform locking. +class LockManagerHandle { + public: + // PessimisticTransactionDB will call this to get the Lock Manager it's going + // to use. 
+ virtual LockManager* getLockManager() = 0; + + virtual ~LockManagerHandle() {} +}; + +// Same as class Endpoint, but use std::string to manage the buffer allocation +struct EndpointWithString { + std::string slice; + bool inf_suffix; +}; + +struct RangeDeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + + EndpointWithString m_start; + EndpointWithString m_end; +}; + +struct RangeDeadlockPath { + std::vector path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit RangeDeadlockPath(std::vector path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +// A handle to control RangeLockManager (Range-based lock manager) from outside +// RocksDB +class RangeLockManagerHandle : public LockManagerHandle { + public: + // Set total amount of lock memory to use. + // + // @return 0 Ok + // @return EDOM Failed to set because currently using more memory than + // specified + virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; + virtual size_t GetMaxLockMemory() = 0; + + using RangeLockStatus = + std::unordered_multimap; + + // Lock Escalation barrier check function. + // It is called for a couple of endpoints A and B, such that A < B. + // If escalation_barrier_check_func(A, B)==true, then there's a lock + // escalation barrier between A and B, and lock escalation is not allowed + // to bridge the gap between A and B. + // + // The function may be called from any thread that acquires or releases + // locks. It should not throw exceptions. There is currently no way to return + // an error. 
+ using EscalationBarrierFunc = + std::function; + + // Set the user-provided barrier check function + virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0; + + virtual RangeLockStatus GetRangeLockStatusData() = 0; + + class Counters { + public: + // Number of times lock escalation was triggered (for all column families) + uint64_t escalation_count; + + // Number of times lock acquisition had to wait for a conflicting lock + // to be released. This counts both successful waits (where the desired + // lock was acquired) and waits that timed out or got other error. + uint64_t lock_wait_count; + + // How much memory is currently used for locks (total for all column + // families) + uint64_t current_lock_memory; + }; + + // Get the current counter values + virtual Counters GetStatus() = 0; + + // Functions for range-based Deadlock reporting. + virtual std::vector GetRangeDeadlockInfoBuffer() = 0; + virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; + + virtual ~RangeLockManagerHandle() {} +}; + +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr mutex_factory); + struct TransactionDBOptions { // Specifies the maximum number of keys that can be locked at the same time // per column family. @@ -92,9 +208,13 @@ // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // nullptr means use default lock manager. + // Other value means the user provides a custom lock manager. + std::shared_ptr lock_mgr_handle; + // If true, the TransactionDB implementation might skip concurrency control // unless it is overridden by TransactionOptions or - // TransactionDBWriteOptimizations. 
This can be used in conjuction with + // TransactionDBWriteOptimizations. This can be used in conjunction with // DBOptions::unordered_write when the TransactionDB is used solely for write // ordering rather than concurrency control. bool skip_concurrency_control = false; @@ -172,6 +292,10 @@ // Default: false bool skip_concurrency_control = false; + // In pessimistic transaction, if this is true, then you can skip Prepare + // before Commit, otherwise, you must Prepare before Commit. + bool skip_prepare = true; + // See TransactionDBOptions::default_write_batch_flush_threshold for // description. If a negative value is specified, then the default value from // TransactionDBOptions is used. @@ -198,6 +322,13 @@ bool exclusive; }; +struct RangeLockInfo { + EndpointWithString start; + EndpointWithString end; + std::vector ids; + bool exclusive; +}; + struct DeadlockInfo { TransactionID m_txn_id; uint32_t m_cf_id; @@ -233,6 +364,17 @@ // falls back to the un-optimized version of ::Write return Write(opts, updates); } + // Transactional `DeleteRange()` is not yet supported. + // However, users who know their deleted range does not conflict with + // anything can still use it via the `Write()` API. In all cases, the + // `Write()` overload specifying `TransactionDBWriteOptimizations` must be + // used and `skip_concurrency_control` must be set. When using either + // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must + // additionally be set. + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } // Open a TransactionDB similar to DB::Open(). // Internally call PrepareWrap() and WrapDB() // If the return status is not ok, then dbptr is set to nullptr. 
@@ -292,6 +434,7 @@ // The mapping is column family id -> KeyLockInfo virtual std::unordered_multimap GetLockStatusData() = 0; + virtual std::vector GetDeadlockInfoBuffer() = 0; virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h 2025-05-19 16:14:27.000000000 +0000 @@ -61,7 +61,7 @@ // // Returns OK if notified. // Returns TimedOut if timeout is reached. - // Returns other status if TransactionDB should otherwis stop waiting and + // Returns other status if TransactionDB should otherwise stop waiting and // fail the operation. // May return OK spuriously even if not notified. virtual Status WaitFor(std::shared_ptr mutex, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/utilities/write_batch_with_index.h 2025-05-19 16:14:27.000000000 +0000 @@ -40,12 +40,13 @@ kDeleteRangeRecord, kLogDataRecord, kXIDRecord, + kUnknownRecord, }; // an entry for Put, Merge, Delete, or SingleDelete entry for write batches. // Used in WBWIIterator. struct WriteEntry { - WriteType type; + WriteType type = kUnknownRecord; Slice key; Slice value; }; @@ -168,7 +169,7 @@ // returned iterator will also delete the base_iterator. // // Updating write batch with the current key of the iterator is not safe. 
- // We strongly recommand users not to do it. It will invalidate the current + // We strongly recommend users not to do it. It will invalidate the current // key() and value() of the iterator. This invalidation happens even before // the write batch update finishes. The state may recover after Next() is // called. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/version.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/version.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/version.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/version.h 2025-05-19 16:14:27.000000000 +0000 @@ -4,9 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 8 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_MINOR 29 +#define ROCKSDB_PATCH 5 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these @@ -14,3 +19,23 @@ #define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR #define __ROCKSDB_MINOR__ ROCKSDB_MINOR #define __ROCKSDB_PATCH__ ROCKSDB_PATCH + +namespace ROCKSDB_NAMESPACE { +// Returns a set of properties indicating how/when/where this version of RocksDB +// was created. +const std::unordered_map& GetRocksBuildProperties(); + +// Returns the current version of RocksDB as a string (e.g. "6.16.0"). +// If with_patch is true, the patch is included (6.16.x). +// Otherwise, only major and minor version is included (6.16) +std::string GetRocksVersionAsString(bool with_patch = true); + +// Gets the set of build properties (@see GetRocksBuildProperties) into a +// string. Properties are returned one-per-line, with the first line being: +// " from RocksDB . +// If verbose is true, the full set of properties is +// printed. 
If verbose is false, only the version information (@see +// GetRocksVersionString) is printed. +std::string GetRocksBuildInfoAsString(const std::string& program, + bool verbose = false); +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/wal_filter.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,17 +8,26 @@ #include #include +#include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { class WriteBatch; +struct ConfigOptions; // WALFilter allows an application to inspect write-ahead-log (WAL) // records or modify their processing on recovery. // Please see the details below. -class WalFilter { +// +// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// because RocksDB is not exception-safe. This could cause undefined behavior +// including data loss, unreported corruption, deadlocks, and more. +class WalFilter : public Customizable { public: + static const char* Type() { return "WalFilter"; } + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, WalFilter** result); enum class WalProcessingOption { // Continue processing as usual kContinueProcessing = 0, @@ -96,7 +105,7 @@ // Returns a name that identifies this WAL filter. // The name will be printed to LOG file on start up for diagnosis. 
- virtual const char* Name() const = 0; + virtual const char* Name() const override = 0; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_batch.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,10 +25,13 @@ #pragma once #include + #include +#include #include #include #include + #include "rocksdb/status.h" #include "rocksdb/write_batch_base.h" @@ -61,11 +64,19 @@ class WriteBatch : public WriteBatchBase { public: explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0); - explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz); + // `protection_bytes_per_key` is the number of bytes used to store + // protection information for each key entry. Currently supported values are + // zero (disabled) and eight. + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, + size_t protection_bytes_per_key); ~WriteBatch() override; using WriteBatchBase::Put; // Store the mapping "key->value" in the database. + // The following Put(..., const Slice& key, ...) API can also be used when + // user-defined timestamp is enabled as long as `key` points to a contiguous + // buffer with timestamp appended after user key. The caller is responsible + // for setting up the memory buffer pointed to by `key`. Status Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) override; Status Put(const Slice& key, const Slice& value) override { @@ -75,6 +86,10 @@ // Variant of Put() that gathers output like writev(2). The key and value // that will be written to the database are concatenations of arrays of // slices. + // The following Put(..., const SliceParts& key, ...) 
API can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, const SliceParts& value) override; Status Put(const SliceParts& key, const SliceParts& value) override { @@ -83,10 +98,18 @@ using WriteBatchBase::Delete; // If the database contains a mapping for "key", erase it. Else do nothing. + // The following Delete(..., const Slice& key) can be used when user-defined + // timestamp is enabled as long as `key` points to a contiguous buffer with + // timestamp appended after user key. The caller is responsible for setting + // up the memory buffer pointed to by `key`. Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override; Status Delete(const Slice& key) override { return Delete(nullptr, key); } // variant that takes SliceParts + // These two variants of Delete(..., const SliceParts& key) can be used when + // user-defined timestamp is enabled as long as the timestamp is the last + // Slice in `key`, a SliceParts (array of Slices). The caller is responsible + // for setting up the `key` SliceParts object. Status Delete(ColumnFamilyHandle* column_family, const SliceParts& key) override; Status Delete(const SliceParts& key) override { return Delete(nullptr, key); } @@ -265,6 +288,12 @@ return Status::InvalidArgument("MarkCommit() handler not defined."); } + virtual Status MarkCommitWithTimestamp(const Slice& /*xid*/, + const Slice& /*commit_ts*/) { + return Status::InvalidArgument( + "MarkCommitWithTimestamp() handler not defined."); + } + // Continue is called by WriteBatch::Iterate. If it returns false, // iteration is halted. Otherwise, it continues iterating. The default // implementation always returns true. 
@@ -307,17 +336,62 @@ // Returns true if MarkEndPrepare will be called during Iterate bool HasEndPrepare() const; - // Returns trie if MarkCommit will be called during Iterate + // Returns true if MarkCommit will be called during Iterate bool HasCommit() const; - // Returns trie if MarkRollback will be called during Iterate + // Returns true if MarkRollback will be called during Iterate bool HasRollback() const; - // Assign timestamp to write batch - Status AssignTimestamp(const Slice& ts); - - // Assign timestamps to write batch - Status AssignTimestamps(const std::vector& ts_list); + // Experimental. + // Assign timestamp to write batch. + // This requires that all keys, if enable timestamp, (possibly from multiple + // column families) in the write batch have timestamps of the same format. + // + // checker: callable object to check the timestamp sizes of column families. + // + // in: cf, the column family id. + // in/out: ts_sz. Input as the expected timestamp size of the column + // family, output as the actual timestamp size of the column family. + // ret: OK if assignment succeeds. + // Status checker(uint32_t cf, size_t& ts_sz); + // + // User can call checker(uint32_t cf, size_t& ts_sz) which does the + // following: + // 1. find out the timestamp size of the column family whose id equals `cf`. + // 2. if cf's timestamp size is 0, then set ts_sz to 0 and return OK. + // 3. otherwise, compare ts_sz with cf's timestamp size and return + // Status::InvalidArgument() if different. + Status AssignTimestamp( + const Slice& ts, + std::function checker = + [](uint32_t /*cf*/, size_t& /*ts_sz*/) { return Status::OK(); }); + + // Experimental. + // Assign timestamps to write batch. + // This API allows the write batch to include keys from multiple column + // families whose timestamps' formats can differ. For example, some column + // families can enable timestamp, while others disable the feature. 
+ // If key does not have timestamp, then put an empty Slice in ts_list as + // a placeholder. + // + // checker: callable object specified by caller to check the timestamp sizes + // of column families. + // + // in: cf, the column family id. + // in/out: ts_sz. Input as the expected timestamp size of the column + // family, output as the actual timestamp size of the column family. + // ret: OK if assignment succeeds. + // Status checker(uint32_t cf, size_t& ts_sz); + // + // User can call checker(uint32_t cf, size_t& ts_sz) which does the + // following: + // 1. find out the timestamp size of the column family whose id equals `cf`. + // 2. compare ts_sz with cf's timestamp size and return + // Status::InvalidArgument() if different. + Status AssignTimestamps( + const std::vector& ts_list, + std::function checker = + [](uint32_t /*cf*/, size_t& /*ts_sz*/) { return Status::OK(); }); using WriteBatchBase::GetWriteBatch; WriteBatch* GetWriteBatch() override { return this; } @@ -338,6 +412,9 @@ void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; } + struct ProtectionInfo; + size_t GetProtectionBytesPerKey() const; + private: friend class WriteBatchInternal; friend class LocalSavePoint; @@ -367,11 +444,10 @@ // more details. 
bool is_latest_persistent_state_ = false; + std::unique_ptr prot_info_; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ - const size_t timestamp_size_; - - // Intentionally copyable }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/include/rocksdb/write_buffer_manager.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,46 +13,93 @@ #pragma once #include +#include #include +#include +#include + #include "rocksdb/cache.h" namespace ROCKSDB_NAMESPACE { +class CacheReservationManager; + +// Interface to block and signal DB instances, intended for RocksDB +// internal use only. Each DB instance contains ptr to StallInterface. +class StallInterface { + public: + virtual ~StallInterface() {} + + virtual void Block() = 0; -class WriteBufferManager { + virtual void Signal() = 0; +}; + +class WriteBufferManager final { public: - // _buffer_size = 0 indicates no limit. Memory won't be capped. + // Parameters: + // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. // memory_usage() won't be valid and ShouldFlush() will always return true. - // if `cache` is provided, we'll put dummy entries in the cache and cost - // the memory allocated to the cache. It can be used even if _buffer_size = 0. + // + // cache_: if `cache` is provided, we'll put dummy entries in the cache and + // cost the memory allocated to the cache. It can be used even if _buffer_size + // = 0. + // + // allow_stall: if set true, it will enable stalling of writes when + // memory_usage() exceeds buffer_size. It will wait for flush to complete and + // memory usage to drop down. 
explicit WriteBufferManager(size_t _buffer_size, - std::shared_ptr cache = {}); + std::shared_ptr cache = {}, + bool allow_stall = false); // No copying allowed WriteBufferManager(const WriteBufferManager&) = delete; WriteBufferManager& operator=(const WriteBufferManager&) = delete; ~WriteBufferManager(); - bool enabled() const { return buffer_size_ != 0; } + // Returns true if buffer_limit is passed to limit the total memory usage and + // is greater than 0. + bool enabled() const { return buffer_size() > 0; } - bool cost_to_cache() const { return cache_rep_ != nullptr; } + // Returns true if pointer to cache is passed. + bool cost_to_cache() const { return cache_res_mgr_ != nullptr; } + // Returns the total memory used by memtables. // Only valid if enabled() size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); } + + // Returns the total memory used by active memtables. size_t mutable_memtable_memory_usage() const { return memory_active_.load(std::memory_order_relaxed); } - size_t buffer_size() const { return buffer_size_; } + + size_t dummy_entries_in_cache_usage() const; + + // Returns the buffer_size. + size_t buffer_size() const { + return buffer_size_.load(std::memory_order_relaxed); + } + + void SetBufferSize(size_t new_size) { + buffer_size_.store(new_size, std::memory_order_relaxed); + mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); + // Check if stall is active and can be ended. + MaybeEndWriteStall(); + } + + // Below functions should be called by RocksDB internally. 
// Should only be called from write thread bool ShouldFlush() const { if (enabled()) { - if (mutable_memtable_memory_usage() > mutable_limit_) { + if (mutable_memtable_memory_usage() > + mutable_limit_.load(std::memory_order_relaxed)) { return true; } - if (memory_usage() >= buffer_size_ && - mutable_memtable_memory_usage() >= buffer_size_ / 2) { + size_t local_size = buffer_size(); + if (memory_usage() >= local_size && + mutable_memtable_memory_usage() >= local_size / 2) { // If the memory exceeds the buffer size, we trigger more aggressive // flush. But if already more than half memory is being flushed, // triggering more flush may not help. We will hold it instead. @@ -62,39 +109,66 @@ return false; } - void ReserveMem(size_t mem) { - if (cache_rep_ != nullptr) { - ReserveMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_add(mem, std::memory_order_relaxed); - } - if (enabled()) { - memory_active_.fetch_add(mem, std::memory_order_relaxed); + // Returns true if total memory usage exceeded buffer_size. + // We stall the writes untill memory_usage drops below buffer_size. When the + // function returns true, all writer threads (including one checking this + // condition) across all DBs will be stalled. Stall is allowed only if user + // pass allow_stall = true during WriteBufferManager instance creation. + // + // Should only be called by RocksDB internally . + bool ShouldStall() const { + if (!allow_stall_ || !enabled()) { + return false; } + + return IsStallActive() || IsStallThresholdExceeded(); } - // We are in the process of freeing `mem` bytes, so it is not considered - // when checking the soft limit. - void ScheduleFreeMem(size_t mem) { - if (enabled()) { - memory_active_.fetch_sub(mem, std::memory_order_relaxed); - } + + // Returns true if stall is active. 
+ bool IsStallActive() const { + return stall_active_.load(std::memory_order_relaxed); } - void FreeMem(size_t mem) { - if (cache_rep_ != nullptr) { - FreeMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_sub(mem, std::memory_order_relaxed); - } + + // Returns true if stalling condition is met. + bool IsStallThresholdExceeded() const { + return memory_usage() >= buffer_size_; } + void ReserveMem(size_t mem); + + // We are in the process of freeing `mem` bytes, so it is not considered + // when checking the soft limit. + void ScheduleFreeMem(size_t mem); + + void FreeMem(size_t mem); + + // Add the DB instance to the queue and block the DB. + // Should only be called by RocksDB internally. + void BeginWriteStall(StallInterface* wbm_stall); + + // If stall conditions have resolved, remove DB instances from queue and + // signal them to continue. + void MaybeEndWriteStall(); + + void RemoveDBFromQueue(StallInterface* wbm_stall); + private: - const size_t buffer_size_; - const size_t mutable_limit_; + std::atomic buffer_size_; + std::atomic mutable_limit_; std::atomic memory_used_; // Memory that hasn't been scheduled to free. std::atomic memory_active_; - struct CacheRep; - std::unique_ptr cache_rep_; + std::unique_ptr cache_res_mgr_; + // Protects cache_res_mgr_ + std::mutex cache_res_mgr_mu_; + + std::list queue_; + // Protects the queue_ and stall_active_. + std::mutex mu_; + bool allow_stall_; + // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall() + // while holding mu_, but it can be read without a lock. 
+ std::atomic stall_active_; void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/java/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/CMakeLists.txt 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -4,6 +4,8 @@ message("Please consider switching to CMake 3.11.4 or newer") endif() +set(CMAKE_JAVA_COMPILE_FLAGS -source 7) + set(JNI_NATIVE_SOURCES rocksjni/backupablejni.cc rocksjni/backupenginejni.cc @@ -11,6 +13,7 @@ rocksjni/cassandra_value_operator.cc rocksjni/checkpoint.cc rocksjni/clock_cache.cc + rocksjni/cache.cc rocksjni/columnfamilyhandle.cc rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc @@ -24,8 +27,12 @@ rocksjni/comparator.cc rocksjni/comparatorjnicallback.cc rocksjni/compression_options.cc + rocksjni/concurrent_task_limiter.cc + rocksjni/config_options.cc rocksjni/env.cc rocksjni/env_options.cc + rocksjni/event_listener.cc + rocksjni/event_listener_jnicallback.cc rocksjni/filter.cc rocksjni/ingest_external_file_options.cc rocksjni/iterator.cc @@ -53,11 +60,13 @@ rocksjni/sst_file_writerjni.cc rocksjni/sst_file_readerjni.cc rocksjni/sst_file_reader_iterator.cc + rocksjni/sst_partitioner.cc rocksjni/statistics.cc rocksjni/statisticsjni.cc rocksjni/table.cc rocksjni/table_filter.cc rocksjni/table_filter_jnicallback.cc + rocksjni/testable_event_listener.cc rocksjni/thread_status.cc rocksjni/trace_writer.cc rocksjni/trace_writer_jnicallback.cc @@ -82,6 +91,7 @@ src/main/java/org/rocksdb/AbstractCompactionFilter.java src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java + src/main/java/org/rocksdb/AbstractEventListener.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java 
src/main/java/org/rocksdb/AbstractNativeReference.java @@ -95,12 +105,14 @@ src/main/java/org/rocksdb/AccessHint.java src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/BackgroundErrorReason.java src/main/java/org/rocksdb/BackupableDBOptions.java src/main/java/org/rocksdb/BackupEngine.java src/main/java/org/rocksdb/BackupInfo.java src/main/java/org/rocksdb/BlockBasedTableConfig.java src/main/java/org/rocksdb/BloomFilter.java src/main/java/org/rocksdb/BuiltinComparator.java + src/main/java/org/rocksdb/ByteBufferGetStatus.java src/main/java/org/rocksdb/Cache.java src/main/java/org/rocksdb/CassandraCompactionFilter.java src/main/java/org/rocksdb/CassandraValueMergeOperator.java @@ -126,6 +138,7 @@ src/main/java/org/rocksdb/ComparatorType.java src/main/java/org/rocksdb/CompressionOptions.java src/main/java/org/rocksdb/CompressionType.java + src/main/java/org/rocksdb/ConfigOptions.java src/main/java/org/rocksdb/DataBlockIndexType.java src/main/java/org/rocksdb/DBOptionsInterface.java src/main/java/org/rocksdb/DBOptions.java @@ -134,8 +147,13 @@ src/main/java/org/rocksdb/EncodingType.java src/main/java/org/rocksdb/Env.java src/main/java/org/rocksdb/EnvOptions.java + src/main/java/org/rocksdb/EventListener.java src/main/java/org/rocksdb/Experimental.java + src/main/java/org/rocksdb/ExternalFileIngestionInfo.java src/main/java/org/rocksdb/Filter.java + src/main/java/org/rocksdb/FileOperationInfo.java + src/main/java/org/rocksdb/FlushJobInfo.java + src/main/java/org/rocksdb/FlushReason.java src/main/java/org/rocksdb/FlushOptions.java src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java src/main/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -143,10 +161,14 @@ src/main/java/org/rocksdb/HistogramData.java src/main/java/org/rocksdb/HistogramType.java src/main/java/org/rocksdb/Holder.java + src/main/java/org/rocksdb/IndexShorteningMode.java 
src/main/java/org/rocksdb/IndexType.java src/main/java/org/rocksdb/InfoLogLevel.java src/main/java/org/rocksdb/IngestExternalFileOptions.java src/main/java/org/rocksdb/LevelMetaData.java + src/main/java/org/rocksdb/ConcurrentTaskLimiter.java + src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java + src/main/java/org/rocksdb/KeyMayExist.java src/main/java/org/rocksdb/LiveFileMetaData.java src/main/java/org/rocksdb/LogFile.java src/main/java/org/rocksdb/Logger.java @@ -154,6 +176,7 @@ src/main/java/org/rocksdb/MemoryUsageType.java src/main/java/org/rocksdb/MemoryUtil.java src/main/java/org/rocksdb/MemTableConfig.java + src/main/java/org/rocksdb/MemTableInfo.java src/main/java/org/rocksdb/MergeOperator.java src/main/java/org/rocksdb/MutableColumnFamilyOptions.java src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -168,6 +191,7 @@ src/main/java/org/rocksdb/OptimisticTransactionDB.java src/main/java/org/rocksdb/OptimisticTransactionOptions.java src/main/java/org/rocksdb/Options.java + src/main/java/org/rocksdb/OptionString.java src/main/java/org/rocksdb/OptionsUtil.java src/main/java/org/rocksdb/PersistentCache.java src/main/java/org/rocksdb/PlainTableConfig.java @@ -189,15 +213,18 @@ src/main/java/org/rocksdb/RocksMemEnv.java src/main/java/org/rocksdb/RocksMutableObject.java src/main/java/org/rocksdb/RocksObject.java + src/main/java/org/rocksdb/SanityLevel.java src/main/java/org/rocksdb/SizeApproximationFlag.java src/main/java/org/rocksdb/SkipListMemTableConfig.java src/main/java/org/rocksdb/Slice.java src/main/java/org/rocksdb/Snapshot.java src/main/java/org/rocksdb/SstFileManager.java src/main/java/org/rocksdb/SstFileMetaData.java - src/main/java/org/rocksdb/SstFileWriter.java src/main/java/org/rocksdb/SstFileReader.java src/main/java/org/rocksdb/SstFileReaderIterator.java + src/main/java/org/rocksdb/SstFileWriter.java + src/main/java/org/rocksdb/SstPartitionerFactory.java + src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java 
src/main/java/org/rocksdb/StateType.java src/main/java/org/rocksdb/StatisticsCollectorCallback.java src/main/java/org/rocksdb/StatisticsCollector.java @@ -206,6 +233,10 @@ src/main/java/org/rocksdb/StatsLevel.java src/main/java/org/rocksdb/Status.java src/main/java/org/rocksdb/StringAppendOperator.java + src/main/java/org/rocksdb/TableFileCreationBriefInfo.java + src/main/java/org/rocksdb/TableFileCreationInfo.java + src/main/java/org/rocksdb/TableFileCreationReason.java + src/main/java/org/rocksdb/TableFileDeletionInfo.java src/main/java/org/rocksdb/TableFilter.java src/main/java/org/rocksdb/TableProperties.java src/main/java/org/rocksdb/TableFormatConfig.java @@ -235,6 +266,8 @@ src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java + src/main/java/org/rocksdb/WriteStallCondition.java + src/main/java/org/rocksdb/WriteStallInfo.java src/main/java/org/rocksdb/util/ByteUtil.java src/main/java/org/rocksdb/util/BytewiseComparator.java src/main/java/org/rocksdb/util/Environment.java @@ -255,6 +288,7 @@ src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java + src/test/java/org/rocksdb/test/TestableEventListener.java ) include(FindJava) @@ -316,19 +350,18 @@ if (DEFINED CUSTOM_DEPS_URL) set(DEPS_URL ${CUSTOM_DEPS_URL}/) else () - # This is a URL for artifacts from a "fake" release on pdillinger's fork, - # so as not to put binaries in git (ew). We should move to hosting these - # under the facebook account on github, or something else more reliable - # than maven.org, which has been failing frequently from Travis. - set(DEPS_URL "https://github.com/pdillinger/rocksdb/releases/download/v6.6.x-java-deps") + # Using a Facebook AWS account for S3 storage. (maven.org has a history + # of failing in Travis builds.) 
+ set(DEPS_URL "https://rocksdb-deps.s3-us-west-2.amazonaws.com/jars") endif() if(NOT EXISTS ${JAVA_JUNIT_JAR}) message("Downloading ${JAVA_JUNIT_JAR}") file(DOWNLOAD ${DEPS_URL}/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_JUNIT_JAR}) endif() @@ -336,8 +369,9 @@ message("Downloading ${JAVA_HAMCR_JAR}") file(DOWNLOAD ${DEPS_URL}/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_HAMCR_JAR}) endif() @@ -345,8 +379,9 @@ message("Downloading ${JAVA_MOCKITO_JAR}") file(DOWNLOAD ${DEPS_URL}/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_MOCKITO_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_MOCKITO_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_MOCKITO_JAR}) endif() @@ -354,8 +389,9 @@ message("Downloading ${JAVA_CGLIB_JAR}") file(DOWNLOAD ${DEPS_URL}/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_CGLIB_JAR}) endif() @@ -363,8 +399,9 @@ message("Downloading ${JAVA_ASSERTJ_JAR}") 
file(DOWNLOAD ${DEPS_URL}/assertj-core-1.7.1.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) + list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) - message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}") + message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}: ${error_message}") endif() file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) endif() @@ -376,6 +413,7 @@ org.rocksdb.AbstractCompactionFilter org.rocksdb.AbstractCompactionFilterFactory org.rocksdb.AbstractComparator + org.rocksdb.AbstractEventListener org.rocksdb.AbstractImmutableNativeReference org.rocksdb.AbstractNativeReference org.rocksdb.AbstractRocksIterator @@ -392,6 +430,7 @@ org.rocksdb.CassandraValueMergeOperator org.rocksdb.Checkpoint org.rocksdb.ClockCache + org.rocksdb.Cache org.rocksdb.ColumnFamilyHandle org.rocksdb.ColumnFamilyOptions org.rocksdb.CompactionJobInfo @@ -402,6 +441,8 @@ org.rocksdb.CompactRangeOptions org.rocksdb.ComparatorOptions org.rocksdb.CompressionOptions + org.rocksdb.ConcurrentTaskLimiterImpl + org.rocksdb.ConfigOptions org.rocksdb.DBOptions org.rocksdb.DirectSlice org.rocksdb.Env @@ -443,6 +484,8 @@ org.rocksdb.SstFileWriter org.rocksdb.SstFileReader org.rocksdb.SstFileReaderIterator + org.rocksdb.SstPartitionerFactory + org.rocksdb.SstPartitionerFixedPrefixFactory org.rocksdb.Statistics org.rocksdb.StringAppendOperator org.rocksdb.TableFormatConfig @@ -468,6 +511,7 @@ org.rocksdb.WriteBatchTest org.rocksdb.WriteBatchTestInternalHelper org.rocksdb.WriteBufferManager + org.rocksdb.test.TestableEventListener ) create_javah( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/Makefile mariadb-10.11.13/storage/rocksdb/rocksdb/java/Makefile --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/Makefile 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/Makefile 2025-05-19 16:14:27.000000000 +0000 @@ -2,6 +2,7 @@ org.rocksdb.AbstractCompactionFilter\ 
org.rocksdb.AbstractCompactionFilterFactory\ org.rocksdb.AbstractComparator\ + org.rocksdb.AbstractEventListener\ org.rocksdb.AbstractSlice\ org.rocksdb.AbstractTableFilter\ org.rocksdb.AbstractTraceWriter\ @@ -13,6 +14,7 @@ org.rocksdb.BloomFilter\ org.rocksdb.Checkpoint\ org.rocksdb.ClockCache\ + org.rocksdb.Cache\ org.rocksdb.CassandraCompactionFilter\ org.rocksdb.CassandraValueMergeOperator\ org.rocksdb.ColumnFamilyHandle\ @@ -25,6 +27,7 @@ org.rocksdb.CompactRangeOptions\ org.rocksdb.ComparatorOptions\ org.rocksdb.CompressionOptions\ + org.rocksdb.ConfigOptions\ org.rocksdb.DBOptions\ org.rocksdb.DirectSlice\ org.rocksdb.Env\ @@ -35,6 +38,9 @@ org.rocksdb.HashLinkedListMemTableConfig\ org.rocksdb.HashSkipListMemTableConfig\ org.rocksdb.HdfsEnv\ + org.rocksdb.ConcurrentTaskLimiter\ + org.rocksdb.ConcurrentTaskLimiterImpl\ + org.rocksdb.KeyMayExist\ org.rocksdb.Logger\ org.rocksdb.LRUCache\ org.rocksdb.MemoryUsageType\ @@ -62,6 +68,8 @@ org.rocksdb.SstFileWriter\ org.rocksdb.SstFileReader\ org.rocksdb.SstFileReaderIterator\ + org.rocksdb.SstPartitionerFactory\ + org.rocksdb.SstPartitionerFixedPrefixFactory\ org.rocksdb.Statistics\ org.rocksdb.ThreadStatus\ org.rocksdb.TimedEnv\ @@ -82,7 +90,9 @@ org.rocksdb.WriteBufferManager\ org.rocksdb.WBWIRocksIterator -NATIVE_JAVA_TEST_CLASSES = org.rocksdb.RocksDBExceptionTest\ +NATIVE_JAVA_TEST_CLASSES = \ + org.rocksdb.RocksDBExceptionTest\ + org.rocksdb.test.TestableEventListener\ org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper\ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchTestInternalHelper @@ -93,16 +103,15 @@ NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar -ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -endif +SHA256_CMD ?= sha256sum JAVA_TESTS = \ org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.BackupEngineTest\ + 
org.rocksdb.BlobOptionsTest\ org.rocksdb.BlockBasedTableConfigTest\ org.rocksdb.BuiltinComparatorTest\ + org.rocksdb.BytewiseComparatorRegressionTest\ org.rocksdb.util.BytewiseComparatorTest\ org.rocksdb.util.BytewiseComparatorIntTest\ org.rocksdb.CheckPointTest\ @@ -124,6 +133,7 @@ org.rocksdb.DirectSliceTest\ org.rocksdb.util.EnvironmentTest\ org.rocksdb.EnvOptionsTest\ + org.rocksdb.EventListenerTest\ org.rocksdb.HdfsEnvTest\ org.rocksdb.IngestExternalFileOptionsTest\ org.rocksdb.util.IntComparatorTest\ @@ -132,14 +142,18 @@ org.rocksdb.FlushTest\ org.rocksdb.InfoLogLevelTest\ org.rocksdb.KeyMayExistTest\ + org.rocksdb.ConcurrentTaskLimiterTest\ org.rocksdb.LoggerTest\ org.rocksdb.LRUCacheTest\ org.rocksdb.MemoryUtilTest\ org.rocksdb.MemTableTest\ org.rocksdb.MergeTest\ + org.rocksdb.MultiGetManyKeysTest\ + org.rocksdb.MultiGetTest\ org.rocksdb.MixedOptionsTest\ org.rocksdb.MutableColumnFamilyOptionsTest\ org.rocksdb.MutableDBOptionsTest\ + org.rocksdb.MutableOptionsGetSetTest \ org.rocksdb.NativeComparatorWrapperTest\ org.rocksdb.NativeLibraryLoaderTest\ org.rocksdb.OptimisticTransactionTest\ @@ -158,11 +172,13 @@ org.rocksdb.RocksIteratorTest\ org.rocksdb.RocksMemEnvTest\ org.rocksdb.util.SizeUnitTest\ + org.rocksdb.SecondaryDBTest\ org.rocksdb.SliceTest\ org.rocksdb.SnapshotTest\ org.rocksdb.SstFileManagerTest\ org.rocksdb.SstFileWriterTest\ org.rocksdb.SstFileReaderTest\ + org.rocksdb.SstPartitionerTest\ org.rocksdb.TableFilterTest\ org.rocksdb.TimedEnvTest\ org.rocksdb.TransactionTest\ @@ -197,31 +213,77 @@ SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes JAVA_TEST_LIBDIR = test-libs -JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar -JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar -JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)/mockito-all-1.10.19.jar -JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)/cglib-2.2.2.jar -JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)/assertj-core-1.7.1.jar -JAVA_TESTCLASSPATH = 
$(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR) +JAVA_JUNIT_VER = 4.13.1 +JAVA_JUNIT_SHA256 = c30719db974d6452793fe191b3638a5777005485bae145924044530ffa5f6122 +JAVA_JUNIT_JAR = junit-$(JAVA_JUNIT_VER).jar +JAVA_JUNIT_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_JUNIT_JAR) +JAVA_HAMCREST_VER = 2.2 +JAVA_HAMCREST_SHA256 = 5e62846a89f05cd78cd9c1a553f340d002458380c320455dd1f8fc5497a8a1c1 +JAVA_HAMCREST_JAR = hamcrest-$(JAVA_HAMCREST_VER).jar +JAVA_HAMCREST_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_HAMCREST_JAR) +JAVA_MOCKITO_VER = 1.10.19 +JAVA_MOCKITO_SHA256 = d1a7a7ef14b3db5c0fc3e0a63a81b374b510afe85add9f7984b97911f4c70605 +JAVA_MOCKITO_JAR = mockito-all-$(JAVA_MOCKITO_VER).jar +JAVA_MOCKITO_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_MOCKITO_JAR) +JAVA_CGLIB_VER = 3.3.0 +JAVA_CGLIB_SHA256 = 9fe0c26d7464140ccdfe019ac687be1fb906122b508ab54beb810db0f09a9212 +JAVA_CGLIB_JAR = cglib-$(JAVA_CGLIB_VER).jar +JAVA_CGLIB_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_CGLIB_JAR) +JAVA_ASSERTJ_VER = 2.9.0 +JAVA_ASSERTJ_SHA256 = 5e88ea3ecbe3c48aa1346fec76c84979fa9c8d22499f11479011691230e8babf +JAVA_ASSERTJ_JAR = assertj-core-$(JAVA_ASSERTJ_VER).jar +JAVA_ASSERTJ_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_ASSERTJ_JAR) +JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR_PATH):$(JAVA_HAMCREST_JAR_PATH):$(JAVA_MOCKITO_JAR_PATH):$(JAVA_CGLIB_JAR_PATH):$(JAVA_ASSERTJ_JAR_PATH) MVN_LOCAL = ~/.m2/repository +# Set the path of the java commands +ifeq ($(JAVA_CMD),) +ifneq ($(JAVA_HOME),) +JAVA_CMD := $(JAVA_HOME)/bin/java +else +JAVA_CMD := java +endif +endif + +ifeq ($(JAVAC_CMD),) +ifneq ($(JAVA_HOME),) +JAVAC_CMD := $(JAVA_HOME)/bin/javac +else +JAVAC_CMD := javac +endif +endif + +ifeq ($(JAVAH_CMD),) +ifneq ($(JAVA_HOME),) +JAVAH_CMD := $(JAVA_HOME)/bin/javah +else +JAVAH_CMD := javah +endif +endif + +ifeq ($(JAVADOC_CMD),) +ifneq ($(JAVA_HOME),) +JAVADOC_CMD := $(JAVA_HOME)/bin/javadoc +else +JAVADOC_CMD := javadoc +endif +endif + # Set the default JAVA_ARGS to "" for 
DEBUG_LEVEL=0 -JAVA_ARGS? = +JAVA_ARGS ?= -JAVAC_ARGS? = +JAVAC_ARGS ?= # When debugging add -Xcheck:jni to the java args ifneq ($(DEBUG_LEVEL),0) - JAVA_ARGS = -ea -Xcheck:jni - JAVAC_ARGS = -Xlint:deprecation -Xlint:unchecked + JAVA_ARGS += -ea -Xcheck:jni + JAVAC_ARGS += -Xlint:deprecation -Xlint:unchecked endif -# This is a URL for artifacts from a "fake" release on pdillinger's fork, -# so as not to put binaries in git (ew). We should move to hosting these -# under the facebook account on github, or something else more reliable -# than maven.org, which has been failing frequently from Travis. -DEPS_URL?=https://github.com/pdillinger/rocksdb/releases/download/v6.6.x-java-deps +# Using a Facebook AWS account for S3 storage. (maven.org has a history +# of failing in Travis builds.) +DEPS_URL?=https://rocksdb-deps.s3-us-west-2.amazonaws.com/jars clean: clean-not-downloaded clean-downloaded @@ -237,75 +299,132 @@ javadocs: java $(AM_V_GEN)mkdir -p $(JAVADOC) - $(AM_V_at)javadoc -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org + $(AM_V_at)$(JAVADOC_CMD) -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org javalib: java java_test javadocs java: $(AM_V_GEN)mkdir -p $(MAIN_CLASSES) -ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) - $(AM_V_at)javac $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ +ifeq ($(shell $(JAVAC_CMD) -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java else - $(AM_V_at)javac $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java endif $(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md $(AM_V_at)@rm -f ./HISTORY-CPP.md -ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) - $(AM_V_at)javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni 
$(NATIVE_JAVA_CLASSES) +ifeq ($(shell $(JAVAH_CMD) -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) + $(AM_V_at)$(JAVAH_CMD) -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) endif sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found - java $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found column_family_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni - java $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni transaction_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java + $(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni - java -ea -Xcheck:jni -Djava.library.path=target -cp 
$(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni + $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni optimistic_transaction_sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) - $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java + $(AM_V_at)$(JAVAC_CMD) -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java $(AM_V_at)@rm -rf /tmp/rocksdbjni - java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni + $(JAVA_CMD) -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni -resolve_test_deps: - test -d "$(JAVA_TEST_LIBDIR)" || mkdir -p "$(JAVA_TEST_LIBDIR)" - test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_JUNIT_JAR) --location $(DEPS_URL)/junit-4.12.jar - test -s "$(JAVA_HAMCR_JAR)" || cp $(MVN_LOCAL)/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_HAMCR_JAR) --location $(DEPS_URL)/hamcrest-core-1.3.jar - test -s "$(JAVA_MOCKITO_JAR)" || cp $(MVN_LOCAL)/org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_MOCKITO_JAR)" --location $(DEPS_URL)/mockito-all-1.10.19.jar - test -s "$(JAVA_CGLIB_JAR)" || cp $(MVN_LOCAL)/cglib/cglib/2.2.2/cglib-2.2.2.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_CGLIB_JAR)" --location $(DEPS_URL)/cglib-2.2.2.jar - test -s "$(JAVA_ASSERTJ_JAR)" || cp $(MVN_LOCAL)/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure 
--output "$(JAVA_ASSERTJ_JAR)" --location $(DEPS_URL)/assertj-core-1.7.1.jar +$(JAVA_TEST_LIBDIR): + mkdir -p "$(JAVA_TEST_LIBDIR)" + +$(JAVA_JUNIT_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR))) + cp -v $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_JUNIT_JAR_PATH) --location $(DEPS_URL)/$(JAVA_JUNIT_JAR) + JAVA_JUNIT_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_JUNIT_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_JUNIT_SHA256)" != "$$JAVA_JUNIT_SHA256_ACTUAL" ]; then \ + echo $(JAVA_JUNIT_JAR_PATH) checksum mismatch, expected=\"$(JAVA_JUNIT_SHA256)\" actual=\"$$JAVA_JUNIT_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_HAMCREST_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR))) + cp -v $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_HAMCREST_JAR_PATH) --location $(DEPS_URL)/$(JAVA_HAMCREST_JAR) + JAVA_HAMCREST_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_HAMCREST_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_HAMCREST_SHA256)" != "$$JAVA_HAMCREST_SHA256_ACTUAL" ]; then \ + echo $(JAVA_HAMCREST_JAR_PATH) checksum mismatch, expected=\"$(JAVA_HAMCREST_SHA256)\" actual=\"$$JAVA_HAMCREST_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_MOCKITO_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR))) + cp -v $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_MOCKITO_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_MOCKITO_JAR) + JAVA_MOCKITO_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_MOCKITO_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_MOCKITO_SHA256)" != "$$JAVA_MOCKITO_SHA256_ACTUAL" ]; then \ + echo 
$(JAVA_MOCKITO_JAR_PATH) checksum mismatch, expected=\"$(JAVA_MOCKITO_SHA256)\" actual=\"$$JAVA_MOCKITO_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_CGLIB_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR))) + cp -v $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_CGLIB_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_CGLIB_JAR) + JAVA_CGLIB_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_CGLIB_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_CGLIB_SHA256)" != "$$JAVA_CGLIB_SHA256_ACTUAL" ]; then \ + echo $(JAVA_CGLIB_JAR_PATH) checksum mismatch, expected=\"$(JAVA_CGLIB_SHA256)\" actual=\"$$JAVA_CGLIB_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_ASSERTJ_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR))) + cp -v $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_ASSERTJ_JAR) + JAVA_ASSERTJ_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_ASSERTJ_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_ASSERTJ_SHA256)" != "$$JAVA_ASSERTJ_SHA256_ACTUAL" ]; then \ + echo $(JAVA_ASSERTJ_JAR_PATH) checksum mismatch, expected=\"$(JAVA_ASSERTJ_SHA256)\" actual=\"$$JAVA_ASSERTJ_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +resolve_test_deps: $(JAVA_JUNIT_JAR_PATH) $(JAVA_HAMCREST_JAR_PATH) $(JAVA_MOCKITO_JAR_PATH) $(JAVA_CGLIB_JAR_PATH) $(JAVA_ASSERTJ_JAR_PATH) java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) -ifeq ($(shell java -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ +ifeq ($(shell $(JAVAC_CMD) -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp 
$(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ $(TEST_SRC)/org/rocksdb/*.java - $(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) + $(AM_V_at)$(JAVAH_CMD) -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) else - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ $(TEST_SRC)/org/rocksdb/*.java @@ -314,8 +433,8 @@ test: java java_test run_test run_test: - java $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java $(AM_V_GEN)mkdir -p $(BENCHMARK_MAIN_CLASSES) - $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java + $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-alpine.sh 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ cd /rocksdb-local-build make clean-not-downloaded -PORTABLE=1 make rocksdbjavastatic - 
-cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target +PORTABLE=1 make -j2 rocksdbjavastatic +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/crossbuild/docker-build-linux-centos.sh 2025-05-19 16:14:27.000000000 +0000 @@ -27,8 +27,8 @@ fi else make clean-not-downloaded - PORTABLE=1 make -j2 rocksdbjavastatic + PORTABLE=1 make -j2 rocksdbjavastatic fi -cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar java/target/rocksdbjni-*-linux*.jar.sha1 /rocksdb-java-target diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/README.md 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,12 @@ **Note**: This uses a specific build of RocksDB that is set in the `` element of the `dependencies` section of the `pom.xml` file. If you are testing local changes you should build and install a SNAPSHOT version of rocksdbjni, and update the `pom.xml` of rocksdbjni-jmh file to test with this. 
+For instance, this is how to install the OSX jar you just built for 6.26.0 + +```bash +$ mvn install:install-file -Dfile=./java/target/rocksdbjni-6.26.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=6.26.0-SNAPSHOT -Dpackaging=jar +``` + ```bash $ mvn package ``` diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/pom.xml mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/pom.xml --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/pom.xml 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/pom.xml 2025-05-19 16:14:27.000000000 +0000 @@ -50,7 +50,7 @@ org.rocksdb rocksdbjni - 6.6.0-SNAPSHOT + 6.27.0-SNAPSHOT diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,23 +6,26 @@ */ package org.rocksdb.jmh; -import org.openjdk.jmh.annotations.*; -import org.rocksdb.*; -import org.rocksdb.util.FileUtils; +import static org.rocksdb.util.KVUtils.ba; +import static org.rocksdb.util.KVUtils.keys; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.OptionsBuilder; +import org.rocksdb.*; +import org.rocksdb.util.FileUtils; -import static org.rocksdb.util.KVUtils.ba; -import static org.rocksdb.util.KVUtils.keys; 
- -@State(Scope.Benchmark) +@State(Scope.Thread) public class MultiGetBenchmarks { - @Param({ "no_column_family", "1_column_family", @@ -31,8 +34,7 @@ }) String columnFamilyTestType; - @Param("100000") - int keyCount; + @Param({"10000", "25000", "100000"}) int keyCount; @Param({ "10", @@ -42,6 +44,9 @@ }) int multiGetSize; + @Param({"16", "64", "250", "1000", "4000", "16000"}) int valueSize; + @Param({"16"}) int keySize; // big enough + Path dbDir; DBOptions options; int cfs = 0; // number of column families @@ -85,7 +90,8 @@ // store initial data for retrieving via get for (int i = 0; i < cfs; i++) { for (int j = 0; j < keyCount; j++) { - db.put(cfHandles[i], ba("key" + j), ba("value" + j)); + final byte[] paddedValue = Arrays.copyOf(ba("value" + j), valueSize); + db.put(cfHandles[i], ba("key" + j), paddedValue); } } @@ -149,10 +155,78 @@ } } + ByteBuffer keysBuffer; + ByteBuffer valuesBuffer; + + List valueBuffersList; + List keyBuffersList; + + @Setup + public void allocateSliceBuffers() { + keysBuffer = ByteBuffer.allocateDirect(keyCount * valueSize); + valuesBuffer = ByteBuffer.allocateDirect(keyCount * valueSize); + valueBuffersList = new ArrayList<>(); + keyBuffersList = new ArrayList<>(); + for (int i = 0; i < keyCount; i++) { + valueBuffersList.add(valuesBuffer.slice()); + valuesBuffer.position(i * valueSize); + keyBuffersList.add(keysBuffer.slice()); + keysBuffer.position(i * keySize); + } + } + + @TearDown + public void freeSliceBuffers() { + valueBuffersList.clear(); + } + @Benchmark public List multiGet10() throws RocksDBException { final int fromKeyIdx = next(multiGetSize, keyCount); - final List keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize); - return db.multiGetAsList(keys); + if (fromKeyIdx >= 0) { + final List keys = keys(fromKeyIdx, fromKeyIdx + multiGetSize); + final List valueResults = db.multiGetAsList(keys); + for (final byte[] result : valueResults) { + if (result.length != valueSize) + throw new RuntimeException("Test valueSize 
assumption wrong"); + } + } + return new ArrayList<>(); + } + + @Benchmark + public List multiGetDirect10() throws RocksDBException { + final int fromKeyIdx = next(multiGetSize, keyCount); + if (fromKeyIdx >= 0) { + final List keys = keys(keyBuffersList, fromKeyIdx, fromKeyIdx + multiGetSize); + final List results = db.multiGetByteBuffers( + keys, valueBuffersList.subList(fromKeyIdx, fromKeyIdx + multiGetSize)); + for (final RocksDB.MultiGetInstance result : results) { + if (result.status.getCode() != Status.Code.Ok) + throw new RuntimeException("Test status assumption wrong"); + if (result.valueSize != valueSize) + throw new RuntimeException("Test valueSize assumption wrong"); + } + return results; + } + return new ArrayList<>(); + } + + public static void main(final String[] args) throws RunnerException { + final org.openjdk.jmh.runner.options.Options opt = + new OptionsBuilder() + .include(MultiGetBenchmarks.class.getSimpleName()) + .forks(1) + .jvmArgs("-ea") + .warmupIterations(1) + .measurementIterations(2) + .forks(2) + .param("columnFamilyTestType=", "1_column_family") + .param("multiGetSize=", "10", "1000") + .param("keyCount=", "1000") + .output("jmh_output") + .build(); + + new Runner(opt).run(); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/jmh/src/main/java/org/rocksdb/util/KVUtils.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,11 +6,12 @@ */ package org.rocksdb.util; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; -import static java.nio.charset.StandardCharsets.UTF_8; - public final class KVUtils { /** @@ -55,4 +56,17 @@ } 
return keys; } + + public static List keys( + final List keyBuffers, final int from, final int to) { + final List keys = new ArrayList<>(to - from); + for (int i = from; i < to; i++) { + final ByteBuffer key = keyBuffers.get(i); + key.clear(); + key.put(ba("key" + i)); + key.flip(); + keys.add(key); + } + return keys; + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/pom.xml.template mariadb-10.11.13/storage/rocksdb/rocksdb/java/pom.xml.template --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/pom.xml.template 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/pom.xml.template 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,178 @@ + + + 4.0.0 + + org.rocksdb + rocksdbjni + ${ROCKSDB_JAVA_VERSION} + + RocksDB JNI + RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files + for Mac OSX, and a .dll for Windows x64. + + https://rocksdb.org + 2012 + + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0.html + repo + + + GNU General Public License, version 2 + http://www.gnu.org/licenses/gpl-2.0.html + repo + + + + + scm:git:https://github.com/facebook/rocksdb.git + scm:git:https://github.com/facebook/rocksdb.git + scm:git:https://github.com/facebook/rocksdb.git + + + + Facebook + https://www.facebook.com + + + + + Facebook + help@facebook.com + America/New_York + + architect + + + + + + + rocksdb - Google Groups + rocksdb-subscribe@googlegroups.com + rocksdb-unsubscribe@googlegroups.com + rocksdb@googlegroups.com + https://groups.google.com/forum/#!forum/rocksdb + + + + + 1.7 + 1.7 + UTF-8 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.2 + + ${project.build.source} + ${project.build.target} + ${project.build.sourceEncoding} + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.18.1 + + ${argLine} -ea -Xcheck:jni -Djava.library.path=${project.build.directory} + false + false + + ${project.build.directory}/* + + + + + org.jacoco + 
jacoco-maven-plugin + 0.7.2.201409121644 + + + + prepare-agent + + + + report + prepare-package + + report + + + + + + org.codehaus.gmaven + groovy-maven-plugin + 2.0 + + + process-classes + + execute + + + + Xenu + + + String fileContents = new File(project.basedir.absolutePath + '/../include/rocksdb/version.h').getText('UTF-8') + matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) + String major_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) + String minor_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/) + String patch_version = matcher.getAt(0).getAt(1) + String version = String.format('%s.%s.%s', major_version, minor_version, patch_version) + // Set version to be used in pom.properties + project.version = version + // Set version to be set as jar name + project.build.finalName = project.artifactId + "-" + version + + + + + + + + + + + junit + junit + 4.13.1 + test + + + org.hamcrest + hamcrest + 2.2 + test + + + cglib + cglib + 3.3.0 + test + + + org.assertj + assertj-core + 2.9.0 + test + + + org.mockito + mockito-all + 1.10.19 + test + + + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/cache.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/cache.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/cache.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/cache.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::Cache. 
+ +#include "rocksdb/cache.h" + +#include + +#include "include/org_rocksdb_Cache.h" + +/* + * Class: org_rocksdb_Cache + * Method: getUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetUsage()); +} + +/* + * Class: org_rocksdb_Cache + * Method: getPinnedUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getPinnedUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetPinnedUsage()); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/compression_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -134,6 +134,27 @@ /* * Class: org_rocksdb_CompressionOptions + * Method: setMaxDictBufferBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_CompressionOptions_setMaxDictBufferBytes( + JNIEnv*, jobject, jlong jhandle, jlong jmax_dict_buffer_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_dict_buffer_bytes = static_cast(jmax_dict_buffer_bytes); +} + +/* + * Class: org_rocksdb_CompressionOptions + * Method: maxDictBufferBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_dict_buffer_bytes); +} +/* + * Class: org_rocksdb_CompressionOptions * Method: setEnabled * Signature: (JZ)V */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/concurrent_task_limiter.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,90 @@ +#include "rocksdb/concurrent_task_limiter.h" + +#include + +#include +#include + +#include "include/org_rocksdb_ConcurrentTaskLimiterImpl.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: newConcurrentTaskLimiterImpl0 + * Signature: (Ljava/lang/String;I)J + */ +jlong Java_org_rocksdb_ConcurrentTaskLimiterImpl_newConcurrentTaskLimiterImpl0( + JNIEnv* env, jclass, jstring jname, jint limit) { + jboolean has_exception = JNI_FALSE; + std::string name = + ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jname, &has_exception); + if (JNI_TRUE == has_exception) { + return 0; + } + + auto* ptr = new std::shared_ptr( + ROCKSDB_NAMESPACE::NewConcurrentTaskLimiter(name, limit)); + + return reinterpret_cast(ptr); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: name + * Signature: (J)Ljava/lang/String; + */ +jstring Java_org_rocksdb_ConcurrentTaskLimiterImpl_name(JNIEnv* env, jclass, + jlong handle) { + const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &limiter->GetName()); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: setMaxOutstandingTask + * Signature: (JI)V + */ +void Java_org_rocksdb_ConcurrentTaskLimiterImpl_setMaxOutstandingTask( + JNIEnv*, jclass, jlong handle, jint max_outstanding_task) { + const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + limiter->SetMaxOutstandingTask(static_cast(max_outstanding_task)); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: resetMaxOutstandingTask + * Signature: (J)V + */ +void Java_org_rocksdb_ConcurrentTaskLimiterImpl_resetMaxOutstandingTask( + JNIEnv*, jclass, jlong handle) { 
+ const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + limiter->ResetMaxOutstandingTask(); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: outstandingTask + * Signature: (J)I + */ +jint Java_org_rocksdb_ConcurrentTaskLimiterImpl_outstandingTask(JNIEnv*, jclass, + jlong handle) { + const auto& limiter = *reinterpret_cast< + std::shared_ptr*>(handle); + return static_cast(limiter->GetOutstandingTask()); +} + +/* + * Class: org_rocksdb_ConcurrentTaskLimiterImpl + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ConcurrentTaskLimiterImpl_disposeInternal(JNIEnv*, + jobject, + jlong jhandle) { + auto* ptr = reinterpret_cast< + std::shared_ptr*>(jhandle); + delete ptr; // delete std::shared_ptr +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/config_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling C++ ROCKSDB_NAMESPACE::ConfigOptions methods +// from Java side. 
+ +#include + +#include "include/org_rocksdb_ConfigOptions.h" +#include "rocksdb/convenience.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_ConfigOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ConfigOptions_disposeInternal(JNIEnv *, jobject, + jlong jhandle) { + auto *co = reinterpret_cast(jhandle); + assert(co != nullptr); + delete co; +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: newConfigOptions + * Signature: ()J + */ +jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) { + auto *cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions(); + return reinterpret_cast(cfg_opt); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setDelimiter + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass, + jlong handle, jstring s) { + auto *cfg_opt = reinterpret_cast(handle); + const char *delim = env->GetStringUTFChars(s, nullptr); + if (delim == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + cfg_opt->delimiter = delim; + env->ReleaseStringUTFChars(s, delim); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setIgnoreUnknownOptions + * Signature: (JZ)V + */ +void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass, + jlong handle, + jboolean b) { + auto *cfg_opt = reinterpret_cast(handle); + cfg_opt->ignore_unknown_options = static_cast(b); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setInputStringsEscaped + * Signature: (JZ)V + */ +void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass, + jlong handle, + jboolean b) { + auto *cfg_opt = reinterpret_cast(handle); + cfg_opt->input_strings_escaped = static_cast(b); +} + +/* + * Class: org_rocksdb_ConfigOptions + * Method: setSanityLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv *, jclass, + jlong handle, jbyte level) { + auto *cfg_opt = reinterpret_cast(handle); + 
cfg_opt->sanity_level = ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::EventListener. + +#include + +#include + +#include "include/org_rocksdb_AbstractEventListener.h" +#include "rocksjni/event_listener_jnicallback.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: createNewEventListener + * Signature: (J)J + */ +jlong Java_org_rocksdb_AbstractEventListener_createNewEventListener( + JNIEnv* env, jobject jobj, jlong jenabled_event_callback_values) { + auto enabled_event_callbacks = + ROCKSDB_NAMESPACE::EnabledEventCallbackJni::toCppEnabledEventCallbacks( + jenabled_event_callback_values); + auto* sptr_event_listener = + new std::shared_ptr( + new ROCKSDB_NAMESPACE::EventListenerJniCallback( + env, jobj, enabled_event_callbacks)); + return reinterpret_cast(sptr_event_listener); +} + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_AbstractEventListener_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { + delete reinterpret_cast*>( + jhandle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,502 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::EventListener. + +#include "rocksjni/event_listener_jnicallback.h" + +#include "rocksjni/portal.h" + +namespace ROCKSDB_NAMESPACE { +EventListenerJniCallback::EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks) + : JniCallback(env, jevent_listener), + m_enabled_event_callbacks(enabled_event_callbacks) { + InitCallbackMethodId( + m_on_flush_completed_proxy_mid, EnabledEventCallback::ON_FLUSH_COMPLETED, + env, AbstractEventListenerJni::getOnFlushCompletedProxyMethodId); + + InitCallbackMethodId(m_on_flush_begin_proxy_mid, + EnabledEventCallback::ON_FLUSH_BEGIN, env, + AbstractEventListenerJni::getOnFlushBeginProxyMethodId); + + InitCallbackMethodId(m_on_table_file_deleted_mid, + EnabledEventCallback::ON_TABLE_FILE_DELETED, env, + AbstractEventListenerJni::getOnTableFileDeletedMethodId); + + InitCallbackMethodId( + m_on_compaction_begin_proxy_mid, + EnabledEventCallback::ON_COMPACTION_BEGIN, env, + AbstractEventListenerJni::getOnCompactionBeginProxyMethodId); + + InitCallbackMethodId( + m_on_compaction_completed_proxy_mid, + EnabledEventCallback::ON_COMPACTION_COMPLETED, env, + AbstractEventListenerJni::getOnCompactionCompletedProxyMethodId); + + InitCallbackMethodId(m_on_table_file_created_mid, + 
EnabledEventCallback::ON_TABLE_FILE_CREATED, env, + AbstractEventListenerJni::getOnTableFileCreatedMethodId); + + InitCallbackMethodId( + m_on_table_file_creation_started_mid, + EnabledEventCallback::ON_TABLE_FILE_CREATION_STARTED, env, + AbstractEventListenerJni::getOnTableFileCreationStartedMethodId); + + InitCallbackMethodId(m_on_mem_table_sealed_mid, + EnabledEventCallback::ON_MEMTABLE_SEALED, env, + AbstractEventListenerJni::getOnMemTableSealedMethodId); + + InitCallbackMethodId( + m_on_column_family_handle_deletion_started_mid, + EnabledEventCallback::ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, env, + AbstractEventListenerJni::getOnColumnFamilyHandleDeletionStartedMethodId); + + InitCallbackMethodId( + m_on_external_file_ingested_proxy_mid, + EnabledEventCallback::ON_EXTERNAL_FILE_INGESTED, env, + AbstractEventListenerJni::getOnExternalFileIngestedProxyMethodId); + + InitCallbackMethodId( + m_on_background_error_proxy_mid, + EnabledEventCallback::ON_BACKGROUND_ERROR, env, + AbstractEventListenerJni::getOnBackgroundErrorProxyMethodId); + + InitCallbackMethodId( + m_on_stall_conditions_changed_mid, + EnabledEventCallback::ON_STALL_CONDITIONS_CHANGED, env, + AbstractEventListenerJni::getOnStallConditionsChangedMethodId); + + InitCallbackMethodId(m_on_file_read_finish_mid, + EnabledEventCallback::ON_FILE_READ_FINISH, env, + AbstractEventListenerJni::getOnFileReadFinishMethodId); + + InitCallbackMethodId(m_on_file_write_finish_mid, + EnabledEventCallback::ON_FILE_WRITE_FINISH, env, + AbstractEventListenerJni::getOnFileWriteFinishMethodId); + + InitCallbackMethodId(m_on_file_flush_finish_mid, + EnabledEventCallback::ON_FILE_FLUSH_FINISH, env, + AbstractEventListenerJni::getOnFileFlushFinishMethodId); + + InitCallbackMethodId(m_on_file_sync_finish_mid, + EnabledEventCallback::ON_FILE_SYNC_FINISH, env, + AbstractEventListenerJni::getOnFileSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_range_sync_finish_mid, + EnabledEventCallback::ON_FILE_RANGE_SYNC_FINISH, 
env, + AbstractEventListenerJni::getOnFileRangeSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_truncate_finish_mid, + EnabledEventCallback::ON_FILE_TRUNCATE_FINISH, env, + AbstractEventListenerJni::getOnFileTruncateFinishMethodId); + + InitCallbackMethodId(m_on_file_close_finish_mid, + EnabledEventCallback::ON_FILE_CLOSE_FINISH, env, + AbstractEventListenerJni::getOnFileCloseFinishMethodId); + + InitCallbackMethodId( + m_should_be_notified_on_file_io, + EnabledEventCallback::SHOULD_BE_NOTIFIED_ON_FILE_IO, env, + AbstractEventListenerJni::getShouldBeNotifiedOnFileIOMethodId); + + InitCallbackMethodId( + m_on_error_recovery_begin_proxy_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_BEGIN, env, + AbstractEventListenerJni::getOnErrorRecoveryBeginProxyMethodId); + + InitCallbackMethodId( + m_on_error_recovery_completed_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_COMPLETED, env, + AbstractEventListenerJni::getOnErrorRecoveryCompletedMethodId); +} + +EventListenerJniCallback::~EventListenerJniCallback() {} + +void EventListenerJniCallback::OnFlushCompleted( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_flush_completed_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnFlushBegin( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + 
env->CallVoidMethod(m_jcallback_obj, m_on_flush_begin_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnTableFileDeleted( + const TableFileDeletionInfo& info) { + if (m_on_table_file_deleted_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jdeletion_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileDeletionInfoJni::fromCppTableFileDeletionInfo); + + if (jdeletion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_deleted_mid, + jdeletion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jdeletion_info}); +} + +void EventListenerJniCallback::OnCompactionBegin(DB* db, + const CompactionJobInfo& ci) { + if (m_on_compaction_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_begin_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnCompactionCompleted( + DB* db, const CompactionJobInfo& ci) { + if (m_on_compaction_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_completed_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnTableFileCreated( + const 
TableFileCreationInfo& info) { + if (m_on_table_file_created_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jfile_creation_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationInfoJni::fromCppTableFileCreationInfo); + + if (jfile_creation_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_created_mid, + jfile_creation_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jfile_creation_info}); +} + +void EventListenerJniCallback::OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info) { + if (m_on_table_file_creation_started_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcreation_brief_info = + SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationBriefInfoJni::fromCppTableFileCreationBriefInfo); + + if (jcreation_brief_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_creation_started_mid, + jcreation_brief_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcreation_brief_info}); +} + +void EventListenerJniCallback::OnMemTableSealed(const MemTableInfo& info) { + if (m_on_mem_table_sealed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jmem_table_info = SetupCallbackInvocation( + env, attached_thread, info, MemTableInfoJni::fromCppMemTableInfo); + + if (jmem_table_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_mem_table_sealed_mid, + jmem_table_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jmem_table_info}); +} + +void EventListenerJniCallback::OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* handle) { + if (m_on_column_family_handle_deletion_started_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcf_handle = SetupCallbackInvocation( + env, attached_thread, *handle, + ColumnFamilyHandleJni::fromCppColumnFamilyHandle); + + if 
(jcf_handle != nullptr) { + env->CallVoidMethod(m_jcallback_obj, + m_on_column_family_handle_deletion_started_mid, + jcf_handle); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcf_handle}); +} + +void EventListenerJniCallback::OnExternalFileIngested( + DB* db, const ExternalFileIngestionInfo& info) { + if (m_on_external_file_ingested_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jingestion_info = SetupCallbackInvocation( + env, attached_thread, info, + ExternalFileIngestionInfoJni::fromCppExternalFileIngestionInfo); + + if (jingestion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_external_file_ingested_proxy_mid, + reinterpret_cast(db), jingestion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jingestion_info}); +} + +void EventListenerJniCallback::OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error) { + if (m_on_background_error_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jstatus = SetupCallbackInvocation( + env, attached_thread, *bg_error, StatusJni::construct); + + if (jstatus != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_background_error_proxy_mid, + static_cast(reason), jstatus); + } + + CleanupCallbackInvocation(env, attached_thread, {&jstatus}); +} + +void EventListenerJniCallback::OnStallConditionsChanged( + const WriteStallInfo& info) { + if (m_on_stall_conditions_changed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jwrite_stall_info = SetupCallbackInvocation( + env, attached_thread, info, WriteStallInfoJni::fromCppWriteStallInfo); + + if (jwrite_stall_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_stall_conditions_changed_mid, + jwrite_stall_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jwrite_stall_info}); +} + +void EventListenerJniCallback::OnFileReadFinish(const FileOperationInfo& info) { + 
OnFileOperation(m_on_file_read_finish_mid, info); +} + +void EventListenerJniCallback::OnFileWriteFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_write_finish_mid, info); +} + +void EventListenerJniCallback::OnFileFlushFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_flush_finish_mid, info); +} + +void EventListenerJniCallback::OnFileSyncFinish(const FileOperationInfo& info) { + OnFileOperation(m_on_file_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileRangeSyncFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_range_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileTruncateFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_truncate_finish_mid, info); +} + +void EventListenerJniCallback::OnFileCloseFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_close_finish_mid, info); +} + +bool EventListenerJniCallback::ShouldBeNotifiedOnFileIO() { + if (m_should_be_notified_on_file_io == nullptr) { + return false; + } + + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + assert(env != nullptr); + + jboolean jshould_be_notified = + env->CallBooleanMethod(m_jcallback_obj, m_should_be_notified_on_file_io); + + CleanupCallbackInvocation(env, attached_thread, {}); + + return static_cast(jshould_be_notified); +} + +void EventListenerJniCallback::OnErrorRecoveryBegin( + BackgroundErrorReason reason, Status bg_error, bool* auto_recovery) { + if (m_on_error_recovery_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jbg_error = SetupCallbackInvocation( + env, attached_thread, bg_error, StatusJni::construct); + + if (jbg_error != nullptr) { + jboolean jauto_recovery = env->CallBooleanMethod( + m_jcallback_obj, m_on_error_recovery_begin_proxy_mid, + static_cast(reason), jbg_error); + *auto_recovery = jauto_recovery == JNI_TRUE; + } + + CleanupCallbackInvocation(env, 
attached_thread, {&jbg_error}); +} + +void EventListenerJniCallback::OnErrorRecoveryCompleted(Status old_bg_error) { + if (m_on_error_recovery_completed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jold_bg_error = SetupCallbackInvocation( + env, attached_thread, old_bg_error, StatusJni::construct); + + if (jold_bg_error != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_error_recovery_completed_mid, + jold_bg_error); + } + + CleanupCallbackInvocation(env, attached_thread, {&jold_bg_error}); +} + +void EventListenerJniCallback::InitCallbackMethodId( + jmethodID& mid, EnabledEventCallback eec, JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)) { + if (m_enabled_event_callbacks.count(eec) == 1) { + mid = get_id(env); + } else { + mid = nullptr; + } +} + +template +jobject EventListenerJniCallback::SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject (*convert)(JNIEnv* env, const T* cpp_obj)) { + attached_thread = JNI_FALSE; + env = getJniEnv(&attached_thread); + assert(env != nullptr); + + return convert(env, &cpp_obj); +} + +void EventListenerJniCallback::CleanupCallbackInvocation( + JNIEnv* env, jboolean attached_thread, + std::initializer_list refs) { + for (auto* ref : refs) { + if (*ref == nullptr) continue; + env->DeleteLocalRef(*ref); + } + + if (env->ExceptionCheck()) { + // exception thrown from CallVoidMethod + env->ExceptionDescribe(); // print out exception to stderr + } + + releaseJniEnv(attached_thread); +} + +void EventListenerJniCallback::OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info) { + if (mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jop_info = SetupCallbackInvocation( + env, attached_thread, info, + FileOperationInfoJni::fromCppFileOperationInfo); + + if (jop_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, mid, jop_info); + } + + CleanupCallbackInvocation(env, attached_thread, 
{&jop_info}); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/event_listener_jnicallback.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::EventListener. + +#ifndef JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ +#define JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ + +#include + +#include +#include + +#include "rocksdb/listener.h" +#include "rocksjni/jnicallback.h" + +namespace ROCKSDB_NAMESPACE { + +enum EnabledEventCallback { + ON_FLUSH_COMPLETED = 0x0, + ON_FLUSH_BEGIN = 0x1, + ON_TABLE_FILE_DELETED = 0x2, + ON_COMPACTION_BEGIN = 0x3, + ON_COMPACTION_COMPLETED = 0x4, + ON_TABLE_FILE_CREATED = 0x5, + ON_TABLE_FILE_CREATION_STARTED = 0x6, + ON_MEMTABLE_SEALED = 0x7, + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED = 0x8, + ON_EXTERNAL_FILE_INGESTED = 0x9, + ON_BACKGROUND_ERROR = 0xA, + ON_STALL_CONDITIONS_CHANGED = 0xB, + ON_FILE_READ_FINISH = 0xC, + ON_FILE_WRITE_FINISH = 0xD, + ON_FILE_FLUSH_FINISH = 0xE, + ON_FILE_SYNC_FINISH = 0xF, + ON_FILE_RANGE_SYNC_FINISH = 0x10, + ON_FILE_TRUNCATE_FINISH = 0x11, + ON_FILE_CLOSE_FINISH = 0x12, + SHOULD_BE_NOTIFIED_ON_FILE_IO = 0x13, + ON_ERROR_RECOVERY_BEGIN = 0x14, + ON_ERROR_RECOVERY_COMPLETED = 0x15, + + NUM_ENABLED_EVENT_CALLBACK = 0x16, +}; + +class EventListenerJniCallback : public JniCallback, public EventListener { + 
public: + EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks); + virtual ~EventListenerJniCallback(); + virtual void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnFlushBegin(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnTableFileDeleted(const TableFileDeletionInfo& info); + virtual void OnCompactionBegin(DB* db, const CompactionJobInfo& ci); + virtual void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci); + virtual void OnTableFileCreated(const TableFileCreationInfo& info); + virtual void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info); + virtual void OnMemTableSealed(const MemTableInfo& info); + virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle); + virtual void OnExternalFileIngested(DB* db, + const ExternalFileIngestionInfo& info); + virtual void OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error); + virtual void OnStallConditionsChanged(const WriteStallInfo& info); + virtual void OnFileReadFinish(const FileOperationInfo& info); + virtual void OnFileWriteFinish(const FileOperationInfo& info); + virtual void OnFileFlushFinish(const FileOperationInfo& info); + virtual void OnFileSyncFinish(const FileOperationInfo& info); + virtual void OnFileRangeSyncFinish(const FileOperationInfo& info); + virtual void OnFileTruncateFinish(const FileOperationInfo& info); + virtual void OnFileCloseFinish(const FileOperationInfo& info); + virtual bool ShouldBeNotifiedOnFileIO(); + virtual void OnErrorRecoveryBegin(BackgroundErrorReason reason, + Status bg_error, bool* auto_recovery); + virtual void OnErrorRecoveryCompleted(Status old_bg_error); + + private: + inline void InitCallbackMethodId(jmethodID& mid, EnabledEventCallback eec, + JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)); + template + inline jobject SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject 
(*convert)(JNIEnv* env, const T* cpp_obj)); + inline void CleanupCallbackInvocation(JNIEnv* env, jboolean attached_thread, + std::initializer_list refs); + inline void OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info); + + const std::set m_enabled_event_callbacks; + jmethodID m_on_flush_completed_proxy_mid; + jmethodID m_on_flush_begin_proxy_mid; + jmethodID m_on_table_file_deleted_mid; + jmethodID m_on_compaction_begin_proxy_mid; + jmethodID m_on_compaction_completed_proxy_mid; + jmethodID m_on_table_file_created_mid; + jmethodID m_on_table_file_creation_started_mid; + jmethodID m_on_mem_table_sealed_mid; + jmethodID m_on_column_family_handle_deletion_started_mid; + jmethodID m_on_external_file_ingested_proxy_mid; + jmethodID m_on_background_error_proxy_mid; + jmethodID m_on_stall_conditions_changed_mid; + jmethodID m_on_file_read_finish_mid; + jmethodID m_on_file_write_finish_mid; + jmethodID m_on_file_flush_finish_mid; + jmethodID m_on_file_sync_finish_mid; + jmethodID m_on_file_range_sync_finish_mid; + jmethodID m_on_file_truncate_finish_mid; + jmethodID m_on_file_close_finish_mid; + jmethodID m_should_be_notified_on_file_io; + jmethodID m_on_error_recovery_begin_proxy_mid; + jmethodID m_on_error_recovery_completed_mid; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -83,6 +83,23 @@ /* * Class: org_rocksdb_RocksIterator + * Method: refresh0 + * Signature: (J)V + */ +void Java_org_rocksdb_RocksIterator_refresh0(JNIEnv* env, jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + ROCKSDB_NAMESPACE::Status s = 
it->Refresh(); + + if (s.ok()) { + return; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksIterator * Method: seek0 * Signature: (J[BI)V */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.cc 2025-05-19 16:14:27.000000000 +0000 @@ -43,11 +43,10 @@ JNIEnv* env = getJniEnv(&attached_thread); assert(env != nullptr); - if(m_jcallback_obj != nullptr) { + if (m_jcallback_obj != nullptr) { env->DeleteGlobalRef(m_jcallback_obj); } releaseJniEnv(attached_thread); } -// @lint-ignore TXT4 T25377293 Grandfathered in -} // namespace ROCKSDB_NAMESPACE \ No newline at end of file +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/jnicallback.h 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,8 @@ JniCallback(JNIEnv* env, jobject jcallback_obj); virtual ~JniCallback(); + const jobject& GetJavaObject() const { return m_jcallback_obj; } + protected: JavaVM* m_jvm; jobject m_jcallback_obj; @@ -27,5 +29,4 @@ }; } // namespace ROCKSDB_NAMESPACE -// @lint-ignore TXT4 T25377293 Grandfathered in #endif // JAVA_ROCKSJNI_JNICALLBACK_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/memory_util.cc 
2025-05-19 16:14:27.000000000 +0000 @@ -22,20 +22,14 @@ * Signature: ([J[J)Ljava/util/Map; */ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( - JNIEnv *env, jclass /*jclazz*/, jlongArray jdb_handles, jlongArray jcache_handles) { - std::vector dbs; - jsize db_handle_count = env->GetArrayLength(jdb_handles); - if(db_handle_count > 0) { - jlong *ptr_jdb_handles = env->GetLongArrayElements(jdb_handles, nullptr); - if (ptr_jdb_handles == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - for (jsize i = 0; i < db_handle_count; i++) { - dbs.push_back( - reinterpret_cast(ptr_jdb_handles[i])); - } - env->ReleaseLongArrayElements(jdb_handles, ptr_jdb_handles, JNI_ABORT); + JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) { + jboolean has_exception = JNI_FALSE; + std::vector dbs = + ROCKSDB_NAMESPACE::JniUtil::fromJPointers( + env, jdb_handles, &has_exception); + if (has_exception == JNI_TRUE) { + // exception thrown: OutOfMemoryError + return nullptr; } std::unordered_set cache_set; @@ -103,5 +97,4 @@ } return jusage_by_type; - } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/merge_operator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,7 +30,7 @@ * Method: newSharedStringAppendOperator * Signature: (C)J */ -jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator( +jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator__C( JNIEnv* /*env*/, jclass /*jclazz*/, jchar jdelim) { auto* sptr_string_append_op = new std::shared_ptr( @@ -39,6 +39,20 @@ return reinterpret_cast(sptr_string_append_op); } +jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator__Ljava_lang_String_2( + JNIEnv* 
env, jclass /*jclass*/, jstring jdelim) { + jboolean has_exception = JNI_FALSE; + auto delim = + ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdelim, &has_exception); + if (has_exception == JNI_TRUE) { + return 0; + } + auto* sptr_string_append_op = + new std::shared_ptr( + ROCKSDB_NAMESPACE::MergeOperators::CreateStringAppendOperator(delim)); + return reinterpret_cast(sptr_string_append_op); +} + /* * Class: org_rocksdb_StringAppendOperator * Method: disposeInternal diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,9 +6,12 @@ // This file implements the "bridge" between Java and C++ for // ROCKSDB_NAMESPACE::Options. +#include "rocksdb/options.h" + #include #include #include + #include #include @@ -19,22 +22,20 @@ #include "include/org_rocksdb_Options.h" #include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_WriteOptions.h" - -#include "rocksjni/comparatorjnicallback.h" -#include "rocksjni/portal.h" -#include "rocksjni/statisticsjni.h" -#include "rocksjni/table_filter_jnicallback.h" - #include "rocksdb/comparator.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" -#include "rocksdb/options.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/sst_partitioner.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" +#include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/portal.h" +#include "rocksjni/statisticsjni.h" +#include "rocksjni/table_filter_jnicallback.h" #include "utilities/merge_operators.h" /* @@ -552,7 +553,8 @@ void Java_org_rocksdb_Options_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray 
jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -580,7 +582,8 @@ ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 0 : JNI_ABORT); } /* @@ -916,6 +919,135 @@ return env->NewStringUTF(tf->Name()); } +static std::vector +rocksdb_convert_cf_paths_from_java_helper(JNIEnv* env, jobjectArray path_array, + jlongArray size_array, + jboolean* has_exception) { + jboolean copy_str_has_exception; + std::vector paths = ROCKSDB_NAMESPACE::JniUtil::copyStrings( + env, path_array, ©_str_has_exception); + if (JNI_TRUE == copy_str_has_exception) { + // Exception thrown + *has_exception = JNI_TRUE; + return {}; + } + + if (static_cast(env->GetArrayLength(size_array)) != paths.size()) { + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, + ROCKSDB_NAMESPACE::Status::InvalidArgument( + ROCKSDB_NAMESPACE::Slice("There should be a corresponding target " + "size for every path and vice versa."))); + *has_exception = JNI_TRUE; + return {}; + } + + jlong* size_array_ptr = env->GetLongArrayElements(size_array, nullptr); + if (nullptr == size_array_ptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return {}; + } + std::vector cf_paths; + for (size_t i = 0; i < paths.size(); ++i) { + jlong target_size = size_array_ptr[i]; + if (target_size < 0) { + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, + ROCKSDB_NAMESPACE::Status::InvalidArgument(ROCKSDB_NAMESPACE::Slice( + "Path target size has to be positive."))); + *has_exception = JNI_TRUE; + env->ReleaseLongArrayElements(size_array, size_array_ptr, JNI_ABORT); + return 
{}; + } + cf_paths.push_back(ROCKSDB_NAMESPACE::DbPath( + paths[i], static_cast(target_size))); + } + + env->ReleaseLongArrayElements(size_array, size_array_ptr, JNI_ABORT); + + return cf_paths; +} + +/* + * Class: org_rocksdb_Options + * Method: setCfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_Options_setCfPaths(JNIEnv* env, jclass, jlong jhandle, + jobjectArray path_array, + jlongArray size_array) { + auto* options = reinterpret_cast(jhandle); + jboolean has_exception = JNI_FALSE; + std::vector cf_paths = + rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, + &has_exception); + if (JNI_FALSE == has_exception) { + options->cf_paths = std::move(cf_paths); + } +} + +/* + * Class: org_rocksdb_Options + * Method: cfPathsLen + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_cfPathsLen(JNIEnv*, jclass, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->cf_paths.size()); +} + +template +static void rocksdb_convert_cf_paths_to_java_helper(JNIEnv* env, jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); + if (ptr_jtarget_size == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + auto* opt = reinterpret_cast(jhandle); + const jsize len = env->GetArrayLength(jpaths); + for (jsize i = 0; i < len; i++) { + ROCKSDB_NAMESPACE::DbPath cf_path = opt->cf_paths[i]; + + jstring jpath = env->NewStringUTF(cf_path.path.c_str()); + if (jpath == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + return; + } + env->SetObjectArrayElement(jpaths, i, jpath); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jpath); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + return; + } + + ptr_jtarget_size[i] = 
static_cast(cf_path.target_size); + + env->DeleteLocalRef(jpath); + } + + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy ? 0 : JNI_ABORT); +} + +/* + * Class: org_rocksdb_Options + * Method: cfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_Options_cfPaths(JNIEnv* env, jclass, jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { + rocksdb_convert_cf_paths_to_java_helper( + env, jhandle, jpaths, jtarget_sizes); +} + /* * Class: org_rocksdb_Options * Method: setMaxManifestFileSize @@ -1092,6 +1224,29 @@ /* * Class: org_rocksdb_Options + * Method: setMaxWriteBatchGroupSizeBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxWriteBatchGroupSizeBytes( + JNIEnv*, jclass, jlong jhandle, jlong jmax_write_batch_group_size_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_write_batch_group_size_bytes = + static_cast(jmax_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_Options + * Method: maxWriteBatchGroupSizeBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_Options * Method: manifestPreallocationSize * Signature: (J)J */ @@ -1131,6 +1286,34 @@ } /* + * Method: setSstPartitionerFactory + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setSstPartitionerFactory(JNIEnv*, jobject, + jlong jhandle, + jlong factory_handle) { + auto* options = reinterpret_cast(jhandle); + auto factory = reinterpret_cast< + std::shared_ptr*>( + factory_handle); + options->sst_partitioner_factory = *factory; +} + +/* + * Class: org_rocksdb_Options + * Method: setCompactionThreadLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setCompactionThreadLimiter( + JNIEnv*, jclass, jlong jhandle, jlong jlimiter_handle) { + auto* options = reinterpret_cast(jhandle); + auto* 
limiter = reinterpret_cast< + std::shared_ptr*>( + jlimiter_handle); + options->compaction_thread_limiter = *limiter; +} + +/* * Class: org_rocksdb_Options * Method: allowMmapReads * Signature: (J)Z @@ -1587,6 +1770,76 @@ return static_cast(opt->strict_bytes_per_sync); } +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. +static void rocksdb_set_event_listeners_helper( + JNIEnv* env, jlongArray jlistener_array, + std::vector>& + listener_sptr_vec) { + jlong* ptr_jlistener_array = + env->GetLongArrayElements(jlistener_array, nullptr); + if (ptr_jlistener_array == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + const jsize array_size = env->GetArrayLength(jlistener_array); + listener_sptr_vec.clear(); + for (jsize i = 0; i < array_size; ++i) { + const auto& listener_sptr = + *reinterpret_cast*>( + ptr_jlistener_array[i]); + listener_sptr_vec.push_back(listener_sptr); + } +} + +/* + * Class: org_rocksdb_Options + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_Options_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. 
+static jobjectArray rocksdb_get_event_listeners_helper( + JNIEnv* env, + const std::vector>& + listener_sptr_vec) { + jsize sz = static_cast(listener_sptr_vec.size()); + jclass jlistener_clazz = + ROCKSDB_NAMESPACE::AbstractEventListenerJni::getJClass(env); + jobjectArray jlisteners = env->NewObjectArray(sz, jlistener_clazz, nullptr); + if (jlisteners == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + for (jsize i = 0; i < sz; ++i) { + const auto* jni_cb = + static_cast( + listener_sptr_vec[i].get()); + env->SetObjectArrayElement(jlisteners, i, jni_cb->GetJavaObject()); + } + return jlisteners; +} + +/* + * Class: org_rocksdb_Options + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_Options_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + /* * Class: org_rocksdb_Options * Method: setEnableThreadTracking @@ -1793,7 +2046,7 @@ * Signature: (JZ)V */ void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jboolean jskip_checking_sst_file_sizes_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_checking_sst_file_sizes_on_db_open = @@ -1806,7 +2059,7 @@ * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); } @@ -1957,6 +2210,162 @@ /* * Class: org_rocksdb_Options + * Method: setAvoidUnnecessaryBlockingIO + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAvoidUnnecessaryBlockingIO( + JNIEnv*, jclass, jlong jhandle, jboolean avoid_blocking_io) { + auto* opt = reinterpret_cast(jhandle); + opt->avoid_unnecessary_blocking_io = 
static_cast(avoid_blocking_io); +} + +/* + * Class: org_rocksdb_Options + * Method: avoidUnnecessaryBlockingIO + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_avoidUnnecessaryBlockingIO(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->avoid_unnecessary_blocking_io); +} + +/* + * Class: org_rocksdb_Options + * Method: setPersistStatsToDisk + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setPersistStatsToDisk( + JNIEnv*, jclass, jlong jhandle, jboolean persist_stats_to_disk) { + auto* opt = reinterpret_cast(jhandle); + opt->persist_stats_to_disk = static_cast(persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_Options + * Method: persistStatsToDisk + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_persistStatsToDisk(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_Options + * Method: setWriteDbidToManifest + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setWriteDbidToManifest( + JNIEnv*, jclass, jlong jhandle, jboolean jwrite_dbid_to_manifest) { + auto* opt = reinterpret_cast(jhandle); + opt->write_dbid_to_manifest = static_cast(jwrite_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_Options + * Method: writeDbidToManifest + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_writeDbidToManifest(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->write_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_Options + * Method: setLogReadaheadSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setLogReadaheadSize(JNIEnv*, jclass, + jlong jhandle, + jlong jlog_readahead_size) { + auto* opt = reinterpret_cast(jhandle); + opt->log_readahead_size = static_cast(jlog_readahead_size); +} + +/* + * Class: org_rocksdb_Options + * Method: logReasaheadSize + * Signature: (J)J + */ +jlong 
Java_org_rocksdb_Options_logReadaheadSize(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->log_readahead_size); +} + +/* + * Class: org_rocksdb_Options + * Method: setBestEffortsRecovery + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setBestEffortsRecovery( + JNIEnv*, jclass, jlong jhandle, jboolean jbest_efforts_recovery) { + auto* opt = reinterpret_cast(jhandle); + opt->best_efforts_recovery = static_cast(jbest_efforts_recovery); +} + +/* + * Class: org_rocksdb_Options + * Method: bestEffortsRecovery + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_bestEffortsRecovery(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->best_efforts_recovery); +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBgErrorResumeCount + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxBgErrorResumeCount( + JNIEnv*, jclass, jlong jhandle, jint jmax_bgerror_resume_count) { + auto* opt = reinterpret_cast(jhandle); + opt->max_bgerror_resume_count = static_cast(jmax_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_Options + * Method: maxBgerrorResumeCount + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxBgerrorResumeCount(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_Options + * Method: setBgerrorResumeRetryInterval + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setBgerrorResumeRetryInterval( + JNIEnv*, jclass, jlong jhandle, jlong jbgerror_resume_retry_interval) { + auto* opt = reinterpret_cast(jhandle); + opt->bgerror_resume_retry_interval = + static_cast(jbgerror_resume_retry_interval); +} + +/* + * Class: org_rocksdb_Options + * Method: bgerrorResumeRetryInterval + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_bgerrorResumeRetryInterval(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = 
reinterpret_cast(jhandle); + return static_cast(opt->bgerror_resume_retry_interval); +} + +/* + * Class: org_rocksdb_Options * Method: setAvoidFlushDuringShutdown * Signature: (JZ)V */ @@ -2833,16 +3242,45 @@ /* * Class: org_rocksdb_Options + * Method: oldDefaults + * Signature: (JII)V + */ +void Java_org_rocksdb_Options_oldDefaults(JNIEnv*, jclass, jlong jhandle, + jint major_version, + jint minor_version) { + reinterpret_cast(jhandle)->OldDefaults( + major_version, minor_version); +} + +/* + * Class: org_rocksdb_Options * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_Options_optimizeForSmallDb( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_Options_optimizeForSmallDb__J(JNIEnv*, jobject, + jlong jhandle) { reinterpret_cast(jhandle)->OptimizeForSmallDb(); } /* * Class: org_rocksdb_Options + * Method: optimizeForSmallDb + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_optimizeForSmallDb__JJ(JNIEnv*, jclass, + jlong jhandle, + jlong cache_handle) { + auto* cache_sptr_ptr = + reinterpret_cast*>( + cache_handle); + auto* options_ptr = reinterpret_cast(jhandle); + auto* cf_options_ptr = + static_cast(options_ptr); + cf_options_ptr->OptimizeForSmallDb(cache_sptr_ptr); +} + +/* + * Class: org_rocksdb_Options * Method: optimizeForPointLookup * Signature: (JJ)V */ @@ -3188,6 +3626,29 @@ /* * Class: org_rocksdb_Options + * Method: setPeriodicCompactionSeconds + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setPeriodicCompactionSeconds( + JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) { + auto* opts = reinterpret_cast(jhandle); + opts->periodic_compaction_seconds = + static_cast(jperiodicCompactionSeconds); +} + +/* + * Class: org_rocksdb_Options + * Method: periodicCompactionSeconds + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_periodicCompactionSeconds(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->periodic_compaction_seconds); +} 
+ +/* + * Class: org_rocksdb_Options * Method: setCompactionOptionsUniversal * Signature: (JJ)V */ @@ -3236,6 +3697,170 @@ return static_cast(opts->force_consistency_checks); } +/// BLOB options + +/* + * Class: org_rocksdb_Options + * Method: setEnableBlobFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setEnableBlobFiles(JNIEnv*, jobject, + jlong jhandle, + jboolean jenable_blob_files) { + auto* opts = reinterpret_cast(jhandle); + opts->enable_blob_files = static_cast(jenable_blob_files); +} + +/* + * Class: org_rocksdb_Options + * Method: enableBlobFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_enableBlobFiles(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_files); +} + +/* + * Class: org_rocksdb_Options + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMinBlobSize(JNIEnv*, jobject, jlong jhandle, + jlong jmin_blob_size) { + auto* opts = reinterpret_cast(jhandle); + opts->min_blob_size = static_cast(jmin_blob_size); +} + +/* + * Class: org_rocksdb_Options + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_minBlobSize(JNIEnv*, jobject, jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->min_blob_size); +} + +/* + * Class: org_rocksdb_Options + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setBlobFileSize(JNIEnv*, jobject, jlong jhandle, + jlong jblob_file_size) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_file_size = static_cast(jblob_file_size); +} + +/* + * Class: org_rocksdb_Options + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_blobFileSize(JNIEnv*, jobject, jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->blob_file_size); +} + +/* + * Class: org_rocksdb_Options + * Method: setBlobCompressionType + * Signature: (JB)V + */ +void 
Java_org_rocksdb_Options_setBlobCompressionType( + JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_compression_type = + ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType( + jblob_compression_type_value); +} + +/* + * Class: org_rocksdb_Options + * Method: blobCompressionType + * Signature: (J)B + */ +jbyte Java_org_rocksdb_Options_blobCompressionType(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( + opts->blob_compression_type); +} + +/* + * Class: org_rocksdb_Options + * Method: setEnableBlobGarbageCollection + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setEnableBlobGarbageCollection( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) { + auto* opts = reinterpret_cast(jhandle); + opts->enable_blob_garbage_collection = + static_cast(jenable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_Options + * Method: enableBlobGarbageCollection + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_enableBlobGarbageCollection(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_Options + * Method: setBlobGarbageCollectionAgeCutoff + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setBlobGarbageCollectionAgeCutoff( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_age_cutoff) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_garbage_collection_age_cutoff = + static_cast(jblob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_Options + * Method: blobGarbageCollectionAgeCutoff + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_blobGarbageCollectionAgeCutoff(JNIEnv*, + jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return 
static_cast(opts->blob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_Options + * Method: setBlobGarbageCollectionForceThreshold + * Signature: (JD)V + */ +void Java_org_rocksdb_Options_setBlobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_force_threshold) { + auto* opts = reinterpret_cast(jhandle); + opts->blob_garbage_collection_force_threshold = + static_cast(jblob_garbage_collection_force_threshold); +} + +/* + * Class: org_rocksdb_Options + * Method: blobGarbageCollectionForceThreshold + * Signature: (J)D + */ +jdouble Java_org_rocksdb_Options_blobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->blob_garbage_collection_force_threshold); +} + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::ColumnFamilyOptions @@ -3277,9 +3902,43 @@ /* * Class: org_rocksdb_ColumnFamilyOptions * Method: getColumnFamilyOptionsFromProps + * Signature: (JLjava/lang/String;)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__JLjava_lang_String_2( + JNIEnv* env, jclass, jlong cfg_handle, jstring jopt_string) { + const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); + if (opt_string == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + auto* config_options = + reinterpret_cast(cfg_handle); + auto* cf_options = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(); + ROCKSDB_NAMESPACE::Status status = + ROCKSDB_NAMESPACE::GetColumnFamilyOptionsFromString( + *config_options, ROCKSDB_NAMESPACE::ColumnFamilyOptions(), opt_string, + cf_options); + + env->ReleaseStringUTFChars(jopt_string, opt_string); + + // Check if ColumnFamilyOptions creation was possible. 
+ jlong ret_value = 0; + if (status.ok()) { + ret_value = reinterpret_cast(cf_options); + } else { + // if operation failed the ColumnFamilyOptions need to be deleted + // again to prevent a memory leak. + delete cf_options; + } + return ret_value; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: getColumnFamilyOptionsFromProps * Signature: (Ljava/util/String;)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps( +jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__Ljava_lang_String_2( JNIEnv* env, jclass, jstring jopt_string) { const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); if (opt_string == nullptr) { @@ -3320,17 +3979,45 @@ /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: oldDefaults + * Signature: (JII)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_oldDefaults(JNIEnv*, jclass, + jlong jhandle, + jint major_version, + jint minor_version) { + reinterpret_cast(jhandle) + ->OldDefaults(major_version, minor_version); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__J(JNIEnv*, + jobject, + jlong jhandle) { reinterpret_cast(jhandle) ->OptimizeForSmallDb(); } /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeForSmallDb + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb__JJ( + JNIEnv*, jclass, jlong jhandle, jlong cache_handle) { + auto* cache_sptr_ptr = + reinterpret_cast*>( + cache_handle); + reinterpret_cast(jhandle) + ->OptimizeForSmallDb(cache_sptr_ptr); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: optimizeForPointLookup * Signature: (JJ)V */ @@ -3588,6 +4275,35 @@ } /* + * Method: setSstPartitionerFactory + * Signature: (JJ)V + */ +void 
Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory( + JNIEnv*, jobject, jlong jhandle, jlong factory_handle) { + auto* options = + reinterpret_cast(jhandle); + auto factory = reinterpret_cast< + std::shared_ptr*>( + factory_handle); + options->sst_partitioner_factory = *factory; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCompactionThreadLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCompactionThreadLimiter( + JNIEnv*, jclass, jlong jhandle, jlong jlimiter_handle) { + auto* options = + reinterpret_cast(jhandle); + auto* limiter = reinterpret_cast< + std::shared_ptr*>( + jlimiter_handle); + options->compaction_thread_limiter = *limiter; +} + +/* * Method: tableFactoryName * Signature: (J)Ljava/lang/String */ @@ -3606,6 +4322,52 @@ /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCfPaths(JNIEnv* env, jclass, + jlong jhandle, + jobjectArray path_array, + jlongArray size_array) { + auto* options = + reinterpret_cast(jhandle); + jboolean has_exception = JNI_FALSE; + std::vector cf_paths = + rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, + &has_exception); + if (JNI_FALSE == has_exception) { + options->cf_paths = std::move(cf_paths); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: cfPathsLen + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_cfPathsLen(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = + reinterpret_cast(jhandle); + return static_cast(opt->cf_paths.size()); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: cfPaths + * Signature: (J[Ljava/lang/String;[J)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_cfPaths(JNIEnv* env, jclass, + jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { + rocksdb_convert_cf_paths_to_java_helper< + ROCKSDB_NAMESPACE::ColumnFamilyOptions>(env, jhandle, jpaths, + 
jtarget_sizes); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: minWriteBufferNumberToMerge * Signature: (J)I */ @@ -4458,8 +5220,8 @@ JNIEnv* env, jobject, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); - jint* additionals = - env->GetIntArrayElements(jmax_bytes_for_level_multiplier_additional, 0); + jint* additionals = env->GetIntArrayElements( + jmax_bytes_for_level_multiplier_additional, nullptr); if (additionals == nullptr) { // exception thrown: OutOfMemoryError return; @@ -4576,6 +5338,32 @@ /* * Class: org_rocksdb_ColumnFamilyOptions + * Method: setPeriodicCompactionSeconds + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setPeriodicCompactionSeconds( + JNIEnv*, jobject, jlong jhandle, jlong jperiodicCompactionSeconds) { + auto* cf_opts = + reinterpret_cast(jhandle); + cf_opts->periodic_compaction_seconds = + static_cast(jperiodicCompactionSeconds); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: periodicCompactionSeconds + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL +Java_org_rocksdb_ColumnFamilyOptions_periodicCompactionSeconds(JNIEnv*, jobject, + jlong jhandle) { + auto* cf_opts = + reinterpret_cast(jhandle); + return static_cast(cf_opts->periodic_compaction_seconds); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions * Method: setCompactionOptionsUniversal * Signature: (JJ)V */ @@ -4626,7 +5414,187 @@ JNIEnv*, jobject, jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); - return static_cast(cf_opts->force_consistency_checks); + return static_cast(cf_opts->force_consistency_checks); +} + +/// BLOB options + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setEnableBlobFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobFiles( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_files) { + auto* opts = + reinterpret_cast(jhandle); + 
opts->enable_blob_files = static_cast(jenable_blob_files); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: enableBlobFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobFiles(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_files); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMinBlobSize(JNIEnv*, jobject, + jlong jhandle, + jlong jmin_blob_size) { + auto* opts = + reinterpret_cast(jhandle); + opts->min_blob_size = static_cast(jmin_blob_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_minBlobSize(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->min_blob_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinBlobSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBlobFileSize( + JNIEnv*, jobject, jlong jhandle, jlong jblob_file_size) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_file_size = static_cast(jblob_file_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minBlobSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_blobFileSize(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->blob_file_size); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBlobCompressionType + * Signature: (JB)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBlobCompressionType( + JNIEnv*, jobject, jlong jhandle, jbyte jblob_compression_type_value) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_compression_type = + ROCKSDB_NAMESPACE::CompressionTypeJni::toCppCompressionType( + jblob_compression_type_value); +} + +/* + 
* Class: org_rocksdb_ColumnFamilyOptions + * Method: blobCompressionType + * Signature: (J)B + */ +jbyte Java_org_rocksdb_ColumnFamilyOptions_blobCompressionType(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( + opts->blob_compression_type); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setEnableBlobGarbageCollection + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setEnableBlobGarbageCollection( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_blob_garbage_collection) { + auto* opts = + reinterpret_cast(jhandle); + opts->enable_blob_garbage_collection = + static_cast(jenable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: enableBlobGarbageCollection + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_enableBlobGarbageCollection( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->enable_blob_garbage_collection); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBlobGarbageCollectionAgeCutoff + * Signature: (JD)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionAgeCutoff( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_age_cutoff) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_garbage_collection_age_cutoff = + static_cast(jblob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: blobGarbageCollectionAgeCutoff + * Signature: (J)D + */ +jdouble Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionAgeCutoff( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->blob_garbage_collection_age_cutoff); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBlobGarbageCollectionForceThreshold + * Signature: (JD)V + */ +void 
Java_org_rocksdb_ColumnFamilyOptions_setBlobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle, + jdouble jblob_garbage_collection_force_threshold) { + auto* opts = + reinterpret_cast(jhandle); + opts->blob_garbage_collection_force_threshold = + static_cast(jblob_garbage_collection_force_threshold); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: blobGarbageCollectionAgeCutoff + * Signature: (J)D + */ +jdouble +Java_org_rocksdb_ColumnFamilyOptions_blobGarbageCollectionForceThreshold( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + return static_cast(opts->blob_garbage_collection_force_threshold); } ///////////////////////////////////////////////////////////////////// @@ -4670,9 +5638,42 @@ /* * Class: org_rocksdb_DBOptions * Method: getDBOptionsFromProps + * Signature: (JLjava/lang/String;)J + */ +jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__JLjava_lang_String_2( + JNIEnv* env, jclass, jlong config_handle, jstring jopt_string) { + const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); + if (opt_string == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + + auto* config_options = + reinterpret_cast(config_handle); + auto* db_options = new ROCKSDB_NAMESPACE::DBOptions(); + ROCKSDB_NAMESPACE::Status status = ROCKSDB_NAMESPACE::GetDBOptionsFromString( + *config_options, ROCKSDB_NAMESPACE::DBOptions(), opt_string, db_options); + + env->ReleaseStringUTFChars(jopt_string, opt_string); + + // Check if DBOptions creation was possible. + jlong ret_value = 0; + if (status.ok()) { + ret_value = reinterpret_cast(db_options); + } else { + // if operation failed the DBOptions need to be deleted + // again to prevent a memory leak. 
+ delete db_options; + } + return ret_value; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: getDBOptionsFromProps * Signature: (Ljava/util/String;)J */ -jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps( +jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__Ljava_lang_String_2( JNIEnv* env, jclass, jstring jopt_string) { const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); if (opt_string == nullptr) { @@ -5078,7 +6079,8 @@ void Java_org_rocksdb_DBOptions_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -5106,7 +6108,8 @@ ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); } /* @@ -5498,6 +6501,29 @@ /* * Class: org_rocksdb_DBOptions + * Method: setMaxWriteBatchGroupSizeBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxWriteBatchGroupSizeBytes( + JNIEnv*, jclass, jlong jhandle, jlong jmax_write_batch_group_size_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_write_batch_group_size_bytes = + static_cast(jmax_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxWriteBatchGroupSizeBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_write_batch_group_size_bytes); +} + +/* + * Class: org_rocksdb_DBOptions * Method: setManifestPreallocationSize * Signature: (JJ)V */ @@ -5994,6 +7020,29 @@ /* * Class: org_rocksdb_DBOptions + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_DBOptions_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_DBOptions_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + +/* + * Class: org_rocksdb_DBOptions * Method: setDelayedWriteRate * Signature: (JJ)V */ @@ -6198,7 +7247,7 @@ * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle, + JNIEnv*, jclass, jlong jhandle, jboolean jskip_checking_sst_file_sizes_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_checking_sst_file_sizes_on_db_open = @@ -6211,7 +7260,7 @@ * Signature: (J)Z */ jboolean 
Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen( - JNIEnv*, jobject, jlong jhandle) { + JNIEnv*, jclass, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_checking_sst_file_sizes_on_db_open); } @@ -6491,6 +7540,162 @@ return static_cast(opt->avoid_flush_during_shutdown); } +/* + * Class: org_rocksdb_DBOptions + * Method: setAvoidUnnecessaryBlockingIO + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAvoidUnnecessaryBlockingIO( + JNIEnv*, jclass, jlong jhandle, jboolean avoid_blocking_io) { + auto* opt = reinterpret_cast(jhandle); + opt->avoid_unnecessary_blocking_io = static_cast(avoid_blocking_io); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: avoidUnnecessaryBlockingIO + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_avoidUnnecessaryBlockingIO(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->avoid_unnecessary_blocking_io); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setPersistStatsToDisk + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setPersistStatsToDisk( + JNIEnv*, jclass, jlong jhandle, jboolean persist_stats_to_disk) { + auto* opt = reinterpret_cast(jhandle); + opt->persist_stats_to_disk = static_cast(persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: persistStatsToDisk + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_persistStatsToDisk(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->persist_stats_to_disk); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWriteDbidToManifest + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setWriteDbidToManifest( + JNIEnv*, jclass, jlong jhandle, jboolean jwrite_dbid_to_manifest) { + auto* opt = reinterpret_cast(jhandle); + opt->write_dbid_to_manifest = static_cast(jwrite_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: 
writeDbidToManifest + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_writeDbidToManifest(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->write_dbid_to_manifest); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setLogReadaheadSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setLogReadaheadSize(JNIEnv*, jclass, + jlong jhandle, + jlong jlog_readahead_size) { + auto* opt = reinterpret_cast(jhandle); + opt->log_readahead_size = static_cast(jlog_readahead_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: logReasaheadSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_logReadaheadSize(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->log_readahead_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setBestEffortsRecovery + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setBestEffortsRecovery( + JNIEnv*, jclass, jlong jhandle, jboolean jbest_efforts_recovery) { + auto* opt = reinterpret_cast(jhandle); + opt->best_efforts_recovery = static_cast(jbest_efforts_recovery); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: bestEffortsRecovery + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_bestEffortsRecovery(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->best_efforts_recovery); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxBgErrorResumeCount + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxBgErrorResumeCount( + JNIEnv*, jclass, jlong jhandle, jint jmax_bgerror_resume_count) { + auto* opt = reinterpret_cast(jhandle); + opt->max_bgerror_resume_count = static_cast(jmax_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxBgerrorResumeCount + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxBgerrorResumeCount(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = 
reinterpret_cast(jhandle); + return static_cast(opt->max_bgerror_resume_count); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setBgerrorResumeRetryInterval + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setBgerrorResumeRetryInterval( + JNIEnv*, jclass, jlong jhandle, jlong jbgerror_resume_retry_interval) { + auto* opt = reinterpret_cast(jhandle); + opt->bgerror_resume_retry_interval = + static_cast(jbgerror_resume_retry_interval); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: bgerrorResumeRetryInterval + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_bgerrorResumeRetryInterval(JNIEnv*, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->bgerror_resume_retry_interval); +} + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::WriteOptions @@ -7062,6 +8267,141 @@ return static_cast(opt->iter_start_seqnum); } +/* + * Class: org_rocksdb_ReadOptions + * Method: autoPrefixMode + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_autoPrefixMode(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->auto_prefix_mode); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setAutoPrefixMode + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setAutoPrefixMode( + JNIEnv*, jobject, jlong jhandle, jboolean jauto_prefix_mode) { + auto* opt = reinterpret_cast(jhandle); + opt->auto_prefix_mode = static_cast(jauto_prefix_mode); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: timestamp + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_timestamp(JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + auto& timestamp_slice_handle = opt->timestamp; + return reinterpret_cast(timestamp_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setTimestamp + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setTimestamp(JNIEnv*, 
jobject, jlong jhandle, + jlong jtimestamp_slice_handle) { + auto* opt = reinterpret_cast(jhandle); + opt->timestamp = + reinterpret_cast(jtimestamp_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: iterStartTs + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_iterStartTs(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + auto& iter_start_ts_handle = opt->iter_start_ts; + return reinterpret_cast(iter_start_ts_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setIterStartTs + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setIterStartTs(JNIEnv*, jobject, + jlong jhandle, + jlong jiter_start_ts_handle) { + auto* opt = reinterpret_cast(jhandle); + opt->iter_start_ts = + reinterpret_cast(jiter_start_ts_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: deadline + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_deadline(JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->deadline.count()); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setDeadline + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setDeadline(JNIEnv*, jobject, jlong jhandle, + jlong jdeadline) { + auto* opt = reinterpret_cast(jhandle); + opt->deadline = std::chrono::microseconds(static_cast(jdeadline)); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: ioTimeout + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_ioTimeout(JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->io_timeout.count()); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setIoTimeout + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setIoTimeout(JNIEnv*, jobject, jlong jhandle, + jlong jio_timeout) { + auto* opt = reinterpret_cast(jhandle); + opt->io_timeout = + std::chrono::microseconds(static_cast(jio_timeout)); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: 
valueSizeSofLimit + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_valueSizeSoftLimit(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->value_size_soft_limit); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setValueSizeSofLimit + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setValueSizeSoftLimit( + JNIEnv*, jobject, jlong jhandle, jlong jvalue_size_soft_limit) { + auto* opt = reinterpret_cast(jhandle); + opt->value_size_soft_limit = static_cast(jvalue_size_soft_limit); +} + ///////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::ComparatorOptions diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/options_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -55,7 +55,7 @@ * Method: loadLatestOptions * Signature: (Ljava/lang/String;JLjava/util/List;Z)V */ -void Java_org_rocksdb_OptionsUtil_loadLatestOptions( +void Java_org_rocksdb_OptionsUtil_loadLatestOptions__Ljava_lang_String_2JJLjava_util_List_2Z( JNIEnv* env, jclass /*jcls*/, jstring jdbpath, jlong jenv_handle, jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) { jboolean has_exception = JNI_FALSE; @@ -80,10 +80,40 @@ /* * Class: org_rocksdb_OptionsUtil + * Method: loadLatestOptions_1 + * Signature: (JLjava/lang/String;JLjava/util/List;)V + */ +void Java_org_rocksdb_OptionsUtil_loadLatestOptions__JLjava_lang_String_2JLjava_util_List_2( + JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jdbpath, + jlong jdb_opts_handle, jobject jcfds) { + jboolean has_exception = JNI_FALSE; + auto db_path = + ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdbpath, &has_exception); + if (has_exception == JNI_TRUE) { + // 
exception occurred + return; + } + std::vector cf_descs; + auto* config_options = + reinterpret_cast(cfg_handle); + auto* db_options = + reinterpret_cast(jdb_opts_handle); + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadLatestOptions( + *config_options, db_path, db_options, &cf_descs); + if (!s.ok()) { + // error, raise an exception + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + } else { + build_column_family_descriptor_list(env, jcfds, cf_descs); + } +} + +/* + * Class: org_rocksdb_OptionsUtil * Method: loadOptionsFromFile * Signature: (Ljava/lang/String;JJLjava/util/List;Z)V */ -void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile( +void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__Ljava_lang_String_2JJLjava_util_List_2Z( JNIEnv* env, jclass /*jcls*/, jstring jopts_file_name, jlong jenv_handle, jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) { jboolean has_exception = JNI_FALSE; @@ -101,6 +131,36 @@ if (!s.ok()) { // error, raise an exception ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + } else { + build_column_family_descriptor_list(env, jcfds, cf_descs); + } +} + +/* + * Class: org_rocksdb_OptionsUtil + * Method: loadOptionsFromFile + * Signature: (JLjava/lang/String;JLjava/util/List;)V + */ +void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__JLjava_lang_String_2JLjava_util_List_2( + JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jopts_file_name, + jlong jdb_opts_handle, jobject jcfds) { + jboolean has_exception = JNI_FALSE; + auto opts_file_name = ROCKSDB_NAMESPACE::JniUtil::copyStdString( + env, jopts_file_name, &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return; + } + std::vector cf_descs; + auto* config_options = + reinterpret_cast(cfg_handle); + auto* db_options = + reinterpret_cast(jdb_opts_handle); + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadOptionsFromFile( + *config_options, opts_file_name, db_options, &cf_descs); + if (!s.ok()) { 
+ // error, raise an exception + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } else { build_column_family_descriptor_list(env, jcfds, cf_descs); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/portal.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/portal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/portal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/portal.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,18 +10,21 @@ #ifndef JAVA_ROCKSJNI_PORTAL_H_ #define JAVA_ROCKSJNI_PORTAL_H_ +#include + #include #include #include #include #include -#include #include #include +#include #include #include #include +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/rate_limiter.h" @@ -33,6 +36,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/compaction_filter_factory_jnicallback.h" #include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/event_listener_jnicallback.h" #include "rocksjni/loggerjnicallback.h" #include "rocksjni/table_filter_jnicallback.h" #include "rocksjni/trace_writer_jnicallback.h" @@ -222,7 +226,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -260,7 +264,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -325,7 +329,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -346,7 +350,7 @@ 
* @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSubCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -367,7 +371,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getStateMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -437,6 +441,10 @@ return jstatus; } + static jobject construct(JNIEnv* env, const Status* status) { + return construct(env, *status); + } + // Returns the equivalent org.rocksdb.Status.Code for the provided // C++ ROCKSDB_NAMESPACE::Status::Code enum static jbyte toJavaStatusCode(const ROCKSDB_NAMESPACE::Status::Code& code) { @@ -933,7 +941,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getStatusMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -1024,7 +1032,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getIteratorMethod(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1045,7 +1053,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getHasNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1065,7 +1073,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1086,7 +1094,7 @@ * @param env A pointer to the Java environment * * 
@return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayListConstructorMethodId(JNIEnv* env) { jclass jarray_list_clazz = getArrayListClass(env); @@ -1106,7 +1114,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1243,10 +1251,11 @@ * Get the Java Method: ByteBuffer#allocate * * @param env A pointer to the Java environment - * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or nullptr + * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or + * nullptr * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getAllocateMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1269,7 +1278,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1291,9 +1300,9 @@ return constructWith(env, direct, nullptr, capacity, jbytebuffer_clazz); } - static jobject constructWith( - JNIEnv* env, const bool direct, const char* buf, const size_t capacity, - jclass jbytebuffer_clazz = nullptr) { + static jobject constructWith(JNIEnv* env, const bool direct, const char* buf, + const size_t capacity, + jclass jbytebuffer_clazz = nullptr) { if (direct) { bool allocated = false; if (buf == nullptr) { @@ -1478,7 +1487,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2365,7 +2374,7 @@ 
* @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMapPutMethodId(JNIEnv* env) { jclass jlist_clazz = getJClass(env); @@ -2897,7 +2906,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2917,7 +2926,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2937,7 +2946,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2957,7 +2966,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2977,7 +2986,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2997,7 +3006,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3017,7 +3026,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved 
+ * be retrieved */ static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3037,7 +3046,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSingleDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3057,7 +3066,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3077,7 +3086,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3097,7 +3106,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogDataMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3117,7 +3126,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3137,7 +3146,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3157,7 +3166,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) { jclass jclazz = 
getJClass(env); @@ -3177,7 +3186,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkNoopMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3197,7 +3206,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkRollbackMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3217,7 +3226,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkCommitMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3232,12 +3241,33 @@ } /** + * Get the Java Method: WriteBatch.Handler#markCommitWithTimestamp + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getMarkCommitWithTimestampMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "markCommitWithTimestamp", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** * Get the Java Method: WriteBatch.Handler#shouldContinue * * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getContinueMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3273,7 +3303,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3364,7 
+3394,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3460,6 +3490,19 @@ : public RocksDBNativeClass { public: + static jobject fromCppColumnFamilyHandle( + JNIEnv* env, const ROCKSDB_NAMESPACE::ColumnFamilyHandle* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, reinterpret_cast(info)); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } + /** * Get the Java Class org.rocksdb.ColumnFamilyHandle * @@ -3540,7 +3583,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3561,7 +3604,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3628,7 +3671,7 @@ * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCompareInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3645,7 +3688,7 @@ * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3662,7 +3705,7 @@ * @param 
jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3698,7 +3741,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3995,7 +4038,7 @@ * @param env A pointer to the Java environment * * @return The Java Field ID or nullptr if the class or field id could not - * be retieved + * be retrieved */ static jfieldID getWriteEntryField(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4316,7 +4359,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4854,7 +4897,7 @@ case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND: return 0x5E; case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED: - // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX. + // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). 
return -0x01; case ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED: return 0x60; @@ -4944,8 +4987,73 @@ return -0x0C; case ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN: return -0x0D; + case ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH: + return -0x0E; + case ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY: + return -0X0F; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED: + return -0x10; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC: + return -0x11; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL: + return -0x12; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED: + return -0x13; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC: + return -0x14; + case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL: + return -0x15; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT: + return -0x16; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT: + return -0x17; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT: + return -0x18; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT: + return -0x19; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT: + return -0x1A; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT: + return -0x1B; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH: + return -0x1C; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH: + return -0x1D; + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS: + return -0x1E; + case ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES: + return -0x1F; + case ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES: + return -0x20; + case ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES: + return -0x21; + case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES: + return -0x22; + case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES: + return -0x23; + case 
ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES: + return -0x24; + case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES: + return -0x25; + case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES: + return -0x26; + case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT: + return -0x27; + case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT: + return -0x28; + case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT: + return -0x29; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: - // 0x5F for backwards compatibility on current minor version. + // 0x5F was the max value in the initial copy of tickers to Java. + // Since these values are exposed directly to Java clients, we keep + // the value the same forever. + // + // TODO: This particular case seems confusing and unnecessary to pin the + // value since it's meant to be the number of tickers, not an actual + // ticker value. But we aren't yet in a position to fix it since the + // number of tickers doesn't fit in the Java representation (jbyte). return 0x5F; default: // undefined/default @@ -5148,7 +5256,7 @@ case 0x5E: return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_FOUND; case -0x01: - // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX. + // -0x01 so we can skip over the already taken 0x5F (TICKER_ENUM_MAX). 
return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_CREATED; case 0x60: return ROCKSDB_NAMESPACE::Tickers::NO_ITERATOR_DELETED; @@ -5239,8 +5347,74 @@ return ROCKSDB_NAMESPACE::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; case -0x0D: return ROCKSDB_NAMESPACE::Tickers::TXN_GET_TRY_AGAIN; + case -0x0E: + return ROCKSDB_NAMESPACE::Tickers::FILES_MARKED_TRASH; + case -0x0F: + return ROCKSDB_NAMESPACE::Tickers::FILES_DELETED_IMMEDIATELY; + case -0x10: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_MARKED; + case -0x11: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_PERIODIC; + case -0x12: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES_TTL; + case -0x13: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_MARKED; + case -0x14: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC; + case -0x15: + return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL; + case -0x16: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT; + case -0x17: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT; + case -0x18: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT; + case -0x19: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT; + case -0x1A: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT; + case -0x1B: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT; + case -0x1C: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH; + case -0x1D: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + case -0x1E: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_HITS; + case -0x1F: + return ROCKSDB_NAMESPACE::Tickers::VERIFY_CHECKSUM_READ_BYTES; + case -0x20: + return ROCKSDB_NAMESPACE::Tickers::BACKUP_READ_BYTES; + case -0x21: + return ROCKSDB_NAMESPACE::Tickers::BACKUP_WRITE_BYTES; + case -0x22: + return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES; + case -0x23: + return 
ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES; + case -0x24: + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES; + case -0x25: + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES; + case -0x26: + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES; + case -0x27: + return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT; + case -0x28: + return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT; + case -0x29: + return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT; case 0x5F: - // 0x5F for backwards compatibility on current minor version. + // 0x5F was the max value in the initial copy of tickers to Java. + // Since these values are exposed directly to Java clients, we keep + // the value the same forever. + // + // TODO: This particular case seems confusing and unnecessary to pin the + // value since it's meant to be the number of tickers, not an actual + // ticker value. But we aren't yet in a position to fix it since the + // number of tickers doesn't fit in the Java representation (jbyte). return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX; default: @@ -5351,6 +5525,15 @@ return 0x2D; case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS: return 0x2E; + case ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL: + return 0x2F; + case ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL: + return 0x30; + case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: + return 0x31; + case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT: + return 0x31; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. 
return 0x1F; @@ -5458,6 +5641,16 @@ return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS; case 0x2E: return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS; + case 0x2F: + return ROCKSDB_NAMESPACE::Histograms:: + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL; + case 0x30: + return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL; + case 0x31: + return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; + case 0x32: + return ROCKSDB_NAMESPACE::Histograms:: + ERROR_HANDLER_AUTORESUME_RETRY_COUNT; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -5650,7 +5843,8 @@ return nullptr; } - jlong *body = env->GetLongArrayElements(jtransaction_ids, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(jtransaction_ids, &is_copy); if(body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jkey); @@ -5660,7 +5854,8 @@ for(size_t i = 0; i < len; ++i) { body[i] = static_cast(transaction_ids[i]); } - env->ReleaseLongArrayElements(jtransaction_ids, body, 0); + env->ReleaseLongArrayElements(jtransaction_ids, body, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); jobject jwaiting_transactions = env->CallObjectMethod(jtransaction, mid, static_cast(column_family_id), jkey, jtransaction_ids); @@ -5931,7 +6126,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFilterMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -5971,7 +6166,11 @@ return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "", "(JJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/util/Map;Ljava/util/Map;Ljava/util/Map;)V"); + jmethodID mid = env->GetMethodID( + jclazz, "", + "(JJJJJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/" + "lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;Ljava/util/Map;Ljava/util/Map;)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -6080,25 +6279,8 @@ return nullptr; } - // Map - jobject jproperties_offsets = ROCKSDB_NAMESPACE::HashMapJni::fromCppMap( - env, &table_properties.properties_offsets); - if (env->ExceptionCheck()) { - // exception occurred creating java map - env->DeleteLocalRef(jcolumn_family_name); - env->DeleteLocalRef(jfilter_policy_name); - env->DeleteLocalRef(jcomparator_name); - env->DeleteLocalRef(jmerge_operator_name); - env->DeleteLocalRef(jprefix_extractor_name); - env->DeleteLocalRef(jproperty_collectors_names); - env->DeleteLocalRef(jcompression_name); - env->DeleteLocalRef(juser_collected_properties); - env->DeleteLocalRef(jreadable_properties); - return nullptr; - } - - jobject jtable_properties = env->NewObject(jclazz, mid, - static_cast(table_properties.data_size), + jobject jtable_properties = env->NewObject( + jclazz, mid, static_cast(table_properties.data_size), static_cast(table_properties.index_size), static_cast(table_properties.index_partitions), 
static_cast(table_properties.top_level_index_size), @@ -6117,17 +6299,16 @@ static_cast(table_properties.column_family_id), static_cast(table_properties.creation_time), static_cast(table_properties.oldest_key_time), - jcolumn_family_name, - jfilter_policy_name, - jcomparator_name, - jmerge_operator_name, - jprefix_extractor_name, - jproperty_collectors_names, - jcompression_name, - juser_collected_properties, - jreadable_properties, - jproperties_offsets - ); + static_cast( + table_properties.slow_compression_estimated_data_size), + static_cast( + table_properties.fast_compression_estimated_data_size), + static_cast( + table_properties.external_sst_file_global_seqno_offset), + jcolumn_family_name, jfilter_policy_name, jcomparator_name, + jmerge_operator_name, jprefix_extractor_name, + jproperty_collectors_names, jcompression_name, + juser_collected_properties, jreadable_properties); if (env->ExceptionCheck()) { return nullptr; @@ -6201,7 +6382,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6221,7 +6402,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6365,6 +6546,51 @@ } }; +// The portal class for org.rocksdb.IndexShorteningMode +class IndexShorteningModeJni { + public: + // Returns the equivalent org.rocksdb.IndexShorteningMode for the provided + // C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum + static jbyte toJavaIndexShorteningMode( + const ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode& + index_shortening_mode) { + switch (index_shortening_mode) { + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + 
kNoShortening: + return 0x0; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators: + return 0x1; + case ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::IndexShorteningMode enum for + // the provided Java org.rocksdb.IndexShorteningMode + static ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode + toCppIndexShorteningMode(jbyte jindex_shortening_mode) { + switch (jindex_shortening_mode) { + case 0x0: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kNoShortening; + case 0x1: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators; + case 0x2: + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor; + default: + // undefined/default + return ROCKSDB_NAMESPACE::BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparators; + } + } +}; + // The portal class for org.rocksdb.Priority class PriorityJni { public: @@ -6670,7 +6896,8 @@ env->DeleteLocalRef(jcf_name); return nullptr; } - jlong *body = env->GetLongArrayElements(joperation_properties, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(joperation_properties, &is_copy); if (body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jdb_name); @@ -6681,7 +6908,8 @@ for (size_t i = 0; i < len; ++i) { body[i] = static_cast(thread_status->op_properties[i]); } - env->ReleaseLongArrayElements(joperation_properties, body, 0); + env->ReleaseLongArrayElements(joperation_properties, body, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); jobject jcfd = env->NewObject(jclazz, mid, static_cast(thread_status->thread_id), @@ -6829,6 +7057,10 @@ return ROCKSDB_NAMESPACE::CompactionReason::kFlush; case 0x0D: return ROCKSDB_NAMESPACE::CompactionReason::kExternalSstIngestion; + case 0x0E: + return ROCKSDB_NAMESPACE::CompactionReason::kPeriodicCompaction; + case 0x0F: + return ROCKSDB_NAMESPACE::CompactionReason::kChangeTemperature; default: // undefined/default return ROCKSDB_NAMESPACE::CompactionReason::kUnknown; @@ -7302,7 +7534,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getWriteProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7323,7 +7555,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCloseWriterProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7344,7 +7576,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getGetFileSizeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7385,7 +7617,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7407,7 +7639,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7428,7 +7660,7 @@ * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or 
method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7530,5 +7762,796 @@ } } }; +// The portal class for org.rocksdb.SanityLevel +class SanityLevelJni { + public: + // Returns the equivalent org.rocksdb.SanityLevel for the provided + // C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum + static jbyte toJavaSanityLevel( + const ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel &sanity_level) { + switch (sanity_level) { + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel::kSanityLevelNone: + return 0x0; + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel:: + kSanityLevelLooselyCompatible: + return 0x1; + case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel:: + kSanityLevelExactMatch: + return -0x01; + default: + return -0x01; // undefined + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum for + // the provided Java org.rocksdb.SanityLevel + static ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel toCppSanityLevel( + jbyte sanity_level) { + switch (sanity_level) { + case 0x0: + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelNone; + case 0x1: + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelLooselyCompatible; + default: + // undefined/default + return ROCKSDB_NAMESPACE::ConfigOptions::kSanityLevelExactMatch; + } + } +}; + +// The portal class for org.rocksdb.AbstractListener.EnabledEventCallback +class EnabledEventCallbackJni { + public: + // Returns the set of equivalent C++ + // ROCKSDB_NAMESPACE::EnabledEventCallbackJni::EnabledEventCallback enums for + // the provided Java jenabled_event_callback_values + static std::set toCppEnabledEventCallbacks( + jlong jenabled_event_callback_values) { + std::set enabled_event_callbacks; + for (size_t i = 0; i < EnabledEventCallback::NUM_ENABLED_EVENT_CALLBACK; + ++i) { + if (((1ULL << i) & jenabled_event_callback_values) > 0) { + enabled_event_callbacks.emplace(static_cast(i)); + } + } + 
return enabled_event_callbacks; + } +}; + +// The portal class for org.rocksdb.AbstractEventListener +class AbstractEventListenerJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::EventListenerJniCallback*, + AbstractEventListenerJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractEventListener + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractEventListener"); + } + + /** + * Get the Java Method: AbstractEventListener#onFlushCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushCompletedProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFlushBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileDeleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileDeletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileDeleted", 
"(Lorg/rocksdb/TableFileDeletionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionBeginProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionCompletedProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreated + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreatedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileCreated", "(Lorg/rocksdb/TableFileCreationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreationStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreationStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onTableFileCreationStarted", + "(Lorg/rocksdb/TableFileCreationBriefInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java 
Method: AbstractEventListener#onMemTableSealed + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnMemTableSealedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onMemTableSealed", + "(Lorg/rocksdb/MemTableInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: + * AbstractEventListener#onColumnFamilyHandleDeletionStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnColumnFamilyHandleDeletionStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onColumnFamilyHandleDeletionStarted", + "(Lorg/rocksdb/ColumnFamilyHandle;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onExternalFileIngestedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnExternalFileIngestedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onExternalFileIngestedProxy", + "(JLorg/rocksdb/ExternalFileIngestionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onBackgroundError + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnBackgroundErrorProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onBackgroundErrorProxy", + "(BLorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onStallConditionsChanged + * + * @param env A pointer to the Java environment + * + 
* @return The Java Method ID + */ + static jmethodID getOnStallConditionsChangedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onStallConditionsChanged", + "(Lorg/rocksdb/WriteStallInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileReadFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileReadFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileReadFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileWriteFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileWriteFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileWriteFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileFlushFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileFlushFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileFlushFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = 
env->GetMethodID( + jclazz, "onFileSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileRangeSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileRangeSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileRangeSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileTruncateFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileTruncateFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileTruncateFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileCloseFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileCloseFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileCloseFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#shouldBeNotifiedOnFileIO + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getShouldBeNotifiedOnFileIOMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "shouldBeNotifiedOnFileIO", "()Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: 
AbstractEventListener#onErrorRecoveryBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryBeginProxy", + "(BLorg/rocksdb/Status;)Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onErrorRecoveryCompleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryCompletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryCompleted", + "(Lorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } +}; + +class FlushJobInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.FlushJobInfo object. 
+ * + * @param env A pointer to the Java environment + * @param flush_job_info A Cpp flush job info object + * + * @return A reference to a Java org.rocksdb.FlushJobInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppFlushJobInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FlushJobInfo* flush_job_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &flush_job_info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &flush_job_info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfile_path); + return nullptr; + } + jobject jtable_properties = TablePropertiesJni::fromCppTableProperties( + env, flush_job_info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jfile_path); + return nullptr; + } + return env->NewObject( + jclazz, ctor, static_cast(flush_job_info->cf_id), jcf_name, + jfile_path, static_cast(flush_job_info->thread_id), + static_cast(flush_job_info->job_id), + static_cast(flush_job_info->triggered_writes_slowdown), + static_cast(flush_job_info->triggered_writes_stop), + static_cast(flush_job_info->smallest_seqno), + static_cast(flush_job_info->largest_seqno), jtable_properties, + static_cast(flush_job_info->flush_reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FlushJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(JLjava/lang/String;Ljava/lang/String;JIZZJJLorg/" + "rocksdb/TableProperties;B)V"); + } +}; + +class TableFileDeletionInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.TableFileDeletionInfo object. 
+ * + * @param env A pointer to the Java environment + * @param file_del_info A Cpp table file deletion info object + * + * @return A reference to a Java org.rocksdb.TableFileDeletionInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppTableFileDeletionInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::TableFileDeletionInfo* file_del_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &file_del_info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, file_del_info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, + JniUtil::toJavaString(env, &file_del_info->file_path), + static_cast(file_del_info->job_id), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileDeletionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;ILorg/rocksdb/Status;)V"); + } +}; + +class CompactionJobInfoJni : public JavaClass { + public: + static jobject fromCppCompactionJobInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::CompactionJobInfo* compaction_job_info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, + reinterpret_cast(compaction_job_info)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/CompactionJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } +}; + 
+class TableFileCreationInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jtable_properties); + return nullptr; + } + return env->NewObject(jclazz, ctor, static_cast(info->file_size), + jtable_properties, jstatus, jdb_name, jcf_name, + jfile_path, static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(JLorg/rocksdb/TableProperties;Lorg/rocksdb/Status;Ljava/lang/" + "String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class TableFileCreationBriefInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationBriefInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationBriefInfo* info) { + 
jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, jcf_name, jfile_path, + static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationBriefInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class MemTableInfoJni : public JavaClass { + public: + static jobject fromCppMemTableInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::MemTableInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->first_seqno), + static_cast(info->earliest_seqno), + static_cast(info->num_entries), + static_cast(info->num_deletes)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/MemTableInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;JJJJ)V"); + } +}; + +class ExternalFileIngestionInfoJni : public JavaClass { + public: 
+ static jobject fromCppExternalFileIngestionInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::ExternalFileIngestionInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jexternal_file_path = + JniUtil::toJavaString(env, &info->external_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jstring jinternal_file_path = + JniUtil::toJavaString(env, &info->internal_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + env->DeleteLocalRef(jinternal_file_path); + return nullptr; + } + return env->NewObject( + jclazz, ctor, jcf_name, jexternal_file_path, jinternal_file_path, + static_cast(info->global_seqno), jtable_properties); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ExternalFileIngestionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;JLorg/rocksdb/TableProperties;)V"); + } +}; + +class WriteStallInfoJni : public JavaClass { + public: + static jobject fromCppWriteStallInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::WriteStallInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; 
+ } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->condition.cur), + static_cast(info->condition.prev)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WriteStallInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;BB)V"); + } +}; + +class FileOperationInfoJni : public JavaClass { + public: + static jobject fromCppFileOperationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FileOperationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jpath = JniUtil::toJavaString(env, &info->path); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jpath); + return nullptr; + } + return env->NewObject( + jclazz, ctor, jpath, static_cast(info->offset), + static_cast(info->length), + static_cast(info->start_ts.time_since_epoch().count()), + static_cast(info->duration.count()), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FileOperationInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;JJJJLorg/rocksdb/Status;)V"); + } +}; } // namespace ROCKSDB_NAMESPACE #endif // JAVA_ROCKSJNI_PORTAL_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocks_callback_object.cc 2025-05-19 16:14:27.000000000 +0000 @@ -27,5 +27,4 @@ // I think this is okay, as 
Comparator and JniCallback both have virtual // destructors... delete reinterpret_cast(handle); - // @lint-ignore TXT4 T25377293 Grandfathered in -} \ No newline at end of file +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/rocksjni.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,7 @@ #include #include #include + #include #include #include @@ -22,6 +23,7 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/types.h" +#include "rocksdb/version.h" #include "rocksjni/portal.h" #ifdef min @@ -70,15 +72,19 @@ /* * Class: org_rocksdb_RocksDB * Method: openROnly - * Signature: (JLjava/lang/String;)J + * Signature: (JLjava/lang/String;Z)J */ -jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( - JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { +jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Z( + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, + jboolean jerror_if_wal_file_exists) { + const bool error_if_wal_file_exists = jerror_if_wal_file_exists == JNI_TRUE; return rocksdb_open_helper( env, jopt_handle, jdb_path, - [](const ROCKSDB_NAMESPACE::Options& options, const std::string& db_path, - ROCKSDB_NAMESPACE::DB** db) { - return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db); + [error_if_wal_file_exists](const ROCKSDB_NAMESPACE::Options& options, + const std::string& db_path, + ROCKSDB_NAMESPACE::DB** db) { + return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db, + error_if_wal_file_exists); }); } @@ -170,21 +176,25 @@ /* * Class: org_rocksdb_RocksDB * Method: openROnly - * Signature: (JLjava/lang/String;[[B[J)[J + * Signature: (JLjava/lang/String;[[B[JZ)[J */ -jlongArray 
Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3J( +jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3JZ( JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, - jobjectArray jcolumn_names, jlongArray jcolumn_options) { + jobjectArray jcolumn_names, jlongArray jcolumn_options, + jboolean jerror_if_wal_file_exists) { + const bool error_if_wal_file_exists = jerror_if_wal_file_exists == JNI_TRUE; return rocksdb_open_helper( env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options, - [](const ROCKSDB_NAMESPACE::DBOptions& options, - const std::string& db_path, - const std::vector& - column_families, - std::vector* handles, - ROCKSDB_NAMESPACE::DB** db) { + [error_if_wal_file_exists]( + const ROCKSDB_NAMESPACE::DBOptions& options, + const std::string& db_path, + const std::vector& + column_families, + std::vector* handles, + ROCKSDB_NAMESPACE::DB** db) { return ROCKSDB_NAMESPACE::DB::OpenForReadOnly( - options, db_path, column_families, handles, db); + options, db_path, column_families, handles, db, + error_if_wal_file_exists); }); } @@ -208,6 +218,72 @@ /* * Class: org_rocksdb_RocksDB + * Method: openAsSecondary + * Signature: (JLjava/lang/String;Ljava/lang/String;)J + */ +jlong Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_2( + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, + jstring jsecondary_db_path) { + const char* secondary_db_path = + env->GetStringUTFChars(jsecondary_db_path, nullptr); + if (secondary_db_path == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + + jlong db_handle = rocksdb_open_helper( + env, jopt_handle, jdb_path, + [secondary_db_path](const ROCKSDB_NAMESPACE::Options& options, + const std::string& db_path, + ROCKSDB_NAMESPACE::DB** db) { + return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(options, db_path, + secondary_db_path, db); + }); + + // we have now finished with secondary_db_path + env->ReleaseStringUTFChars(jsecondary_db_path, 
secondary_db_path); + + return db_handle; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: openAsSecondary + * Signature: (JLjava/lang/String;Ljava/lang/String;[[B[J)[J + */ +jlongArray +Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_2_3_3B_3J( + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, + jstring jsecondary_db_path, jobjectArray jcolumn_names, + jlongArray jcolumn_options) { + const char* secondary_db_path = + env->GetStringUTFChars(jsecondary_db_path, nullptr); + if (secondary_db_path == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + jlongArray jhandles = rocksdb_open_helper( + env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options, + [secondary_db_path]( + const ROCKSDB_NAMESPACE::DBOptions& options, + const std::string& db_path, + const std::vector& + column_families, + std::vector* handles, + ROCKSDB_NAMESPACE::DB** db) { + return ROCKSDB_NAMESPACE::DB::OpenAsSecondary( + options, db_path, secondary_db_path, column_families, handles, db); + }); + + // we have now finished with secondary_db_path + env->ReleaseStringUTFChars(jsecondary_db_path, secondary_db_path); + + return jhandles; +} + +/* + * Class: org_rocksdb_RocksDB * Method: disposeInternal * Signature: (J)V */ @@ -345,8 +421,8 @@ std::vector cf_descriptors; cf_descriptors.reserve(jlen); - jboolean jcf_options_handles_is_copy = JNI_FALSE; - jlong *jcf_options_handles_elems = env->GetLongArrayElements(jcf_options_handles, &jcf_options_handles_is_copy); + jlong* jcf_options_handles_elems = + env->GetLongArrayElements(jcf_options_handles, nullptr); if(jcf_options_handles_elems == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -1600,34 +1676,37 @@ } } -inline void multi_get_helper_release_keys( - JNIEnv* env, std::vector>& keys_to_free) { +inline void multi_get_helper_release_keys(std::vector& keys_to_free) { auto end = keys_to_free.end(); for (auto it = keys_to_free.begin(); it != end; ++it) { - 
delete[] it->first; - env->DeleteLocalRef(it->second); + delete[] * it; } keys_to_free.clear(); } /** - * cf multi get + * @brief fill a native array of cf handles from java handles * - * @return byte[][] of values or nullptr if an exception occurs - */ -jobjectArray multi_get_helper(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, - const ROCKSDB_NAMESPACE::ReadOptions& rOpt, - jobjectArray jkeys, jintArray jkey_offs, - jintArray jkey_lens, - jlongArray jcolumn_family_handles) { - std::vector cf_handles; + * @param env + * @param cf_handles to fill from the java variants + * @param jcolumn_family_handles + * @return true if the copy succeeds + * @return false if a JNI exception is generated + */ +inline bool cf_handles_from_jcf_handles( + JNIEnv* env, + std::vector& cf_handles, + jlongArray jcolumn_family_handles) { if (jcolumn_family_handles != nullptr) { const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); if (jcfh == nullptr) { // exception thrown: OutOfMemoryError - return nullptr; + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, + "Insufficient Memory for CF handle array."); + return false; } for (jsize i = 0; i < len_cols; i++) { @@ -1637,36 +1716,53 @@ } env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); } + return true; +} - const jsize len_keys = env->GetArrayLength(jkeys); - if (env->EnsureLocalCapacity(len_keys) != 0) { - // exception thrown: OutOfMemoryError - return nullptr; - } - +/** + * @brief copy keys from JNI into vector of slices for Rocks API + * + * @param keys to instantiate + * @param jkeys + * @param jkey_offs + * @param jkey_lens + * @return true if the copy succeeds + * @return false if a JNI exception is raised + */ +inline bool keys_from_jkeys(JNIEnv* env, + std::vector& keys, + std::vector& keys_to_free, + jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens) { 
jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr); if (jkey_off == nullptr) { // exception thrown: OutOfMemoryError - return nullptr; + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array."); + return false; } jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr); if (jkey_len == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - return nullptr; + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array."); + return false; } - std::vector keys; - std::vector> keys_to_free; + const jsize len_keys = env->GetArrayLength(jkeys); for (jsize i = 0; i < len_keys; i++) { jobject jkey = env->GetObjectArrayElement(jkeys, i); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - multi_get_helper_release_keys(env, keys_to_free); - return nullptr; + multi_get_helper_release_keys(keys_to_free); + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, + "Insufficient Memory for key object array."); + return false; } jbyteArray jkey_ba = reinterpret_cast(jkey); @@ -1680,20 +1776,86 @@ env->DeleteLocalRef(jkey); env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); - multi_get_helper_release_keys(env, keys_to_free); - return nullptr; + multi_get_helper_release_keys(keys_to_free); + jclass exception_cls = + (env)->FindClass("java/lang/ArrayIndexOutOfBoundsException"); + (env)->ThrowNew(exception_cls, "Invalid byte array region index."); + return false; } ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), len_key); keys.push_back(key_slice); - 
keys_to_free.push_back(std::pair(key, jkey)); + env->DeleteLocalRef(jkey); + keys_to_free.push_back(key); } // cleanup jkey_off and jken_len env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); + return true; +} + +inline bool keys_from_bytebuffers(JNIEnv* env, + std::vector& keys, + jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens) { + jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr); + if (jkey_off == nullptr) { + // exception thrown: OutOfMemoryError + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key offset array."); + return false; + } + + jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr); + if (jkey_len == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for key length array."); + return false; + } + + const jsize len_keys = env->GetArrayLength(jkeys); + for (jsize i = 0; i < len_keys; i++) { + jobject jkey = env->GetObjectArrayElement(jkeys, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return false; + } + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + ROCKSDB_NAMESPACE::Slice key_slice(key + jkey_off[i], jkey_len[i]); + keys.push_back(key_slice); + + env->DeleteLocalRef(jkey); + } + return true; +} + +/** + * cf multi get + * + * @return byte[][] of values or nullptr if an + * exception occurs + */ +jobjectArray multi_get_helper(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, + const ROCKSDB_NAMESPACE::ReadOptions& rOpt, + jobjectArray jkeys, jintArray jkey_offs, + jintArray jkey_lens, + jlongArray jcolumn_family_handles) { + std::vector cf_handles; + if (!cf_handles_from_jcf_handles(env, cf_handles, 
jcolumn_family_handles)) { + return nullptr; + } + + std::vector keys; + std::vector keys_to_free; + if (!keys_from_jkeys(env, keys, keys_to_free, jkeys, jkey_offs, jkey_lens)) { + return nullptr; + } + std::vector values; std::vector s; if (cf_handles.size() == 0) { @@ -1703,22 +1865,18 @@ } // free up allocated byte arrays - multi_get_helper_release_keys(env, keys_to_free); + multi_get_helper_release_keys(keys_to_free); // prepare the results jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray( env, static_cast(s.size())); if (jresults == nullptr) { // exception occurred + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for results."); return nullptr; } - // TODO(AR) it is not clear to me why EnsureLocalCapacity is needed for the - // loop as we cleanup references with env->DeleteLocalRef(jentry_value); - if (env->EnsureLocalCapacity(static_cast(s.size())) != 0) { - // exception thrown: OutOfMemoryError - return nullptr; - } // add to the jresults for (std::vector::size_type i = 0; i != s.size(); i++) { @@ -1735,14 +1893,16 @@ jentry_value, 0, static_cast(jvalue_len), const_cast(reinterpret_cast(value->c_str()))); if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException + // exception thrown: + // ArrayIndexOutOfBoundsException env->DeleteLocalRef(jentry_value); return nullptr; } env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException + // exception thrown: + // ArrayIndexOutOfBoundsException env->DeleteLocalRef(jentry_value); return nullptr; } @@ -1754,14 +1914,129 @@ return jresults; } +/** + * cf multi get + * + * fill supplied native buffers, or raise JNI + * exception on a problem + */ + +/** + * @brief multi_get_helper_direct for fast-path multiget (io_uring) on Linux + * + * @param env + * @param db + * @param rOpt read options + * @param 
jcolumn_family_handles 0, 1, or n column family handles + * @param jkeys + * @param jkey_offsets + * @param jkey_lengths + * @param jvalues byte buffers to receive values + * @param jvalue_sizes returned actual sizes of data values for keys + * @param jstatuses returned java RocksDB status values for per key + */ +void multi_get_helper_direct(JNIEnv* env, jobject, ROCKSDB_NAMESPACE::DB* db, + const ROCKSDB_NAMESPACE::ReadOptions& rOpt, + jlongArray jcolumn_family_handles, + jobjectArray jkeys, jintArray jkey_offsets, + jintArray jkey_lengths, jobjectArray jvalues, + jintArray jvalue_sizes, jobjectArray jstatuses) { + const jsize num_keys = env->GetArrayLength(jkeys); + + std::vector keys; + if (!keys_from_bytebuffers(env, keys, jkeys, jkey_offsets, jkey_lengths)) { + return; + } + + std::vector values(num_keys); + + std::vector cf_handles; + if (!cf_handles_from_jcf_handles(env, cf_handles, jcolumn_family_handles)) { + return; + } + + std::vector s(num_keys); + if (cf_handles.size() == 0) { + // we can use the more efficient call here + auto cf_handle = db->DefaultColumnFamily(); + db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(), + s.data()); + } else if (cf_handles.size() == 1) { + // we can use the more efficient call here + auto cf_handle = cf_handles[0]; + db->MultiGet(rOpt, cf_handle, num_keys, keys.data(), values.data(), + s.data()); + } else { + // multiple CFs version + db->MultiGet(rOpt, num_keys, cf_handles.data(), keys.data(), values.data(), + s.data()); + } + + // prepare the results + jobjectArray jresults = ROCKSDB_NAMESPACE::ByteJni::new2dByteArray( + env, static_cast(s.size())); + if (jresults == nullptr) { + // exception occurred + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, "Insufficient Memory for results."); + return; + } + + std::vector value_size; + for (int i = 0; i < num_keys; i++) { + auto jstatus = ROCKSDB_NAMESPACE::StatusJni::construct(env, s[i]); + if 
(jstatus == nullptr) { + // exception in context + return; + } + env->SetObjectArrayElement(jstatuses, i, jstatus); + + if (s[i].ok()) { + jobject jvalue_bytebuf = env->GetObjectArrayElement(jvalues, i); + if (env->ExceptionCheck()) { + // ArrayIndexOutOfBoundsException is thrown + return; + } + jlong jvalue_capacity = env->GetDirectBufferCapacity(jvalue_bytebuf); + if (jvalue_capacity == -1) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value(s) argument (argument is not a valid direct " + "ByteBuffer)"); + return; + } + void* jvalue_address = env->GetDirectBufferAddress(jvalue_bytebuf); + if (jvalue_address == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value(s) argument (argument is not a valid direct " + "ByteBuffer)"); + return; + } + + // record num returned, push back that number, which may be bigger then + // the ByteBuffer supplied. then copy as much as fits in the ByteBuffer. + value_size.push_back(static_cast(values[i].size())); + auto copy_bytes = + std::min(static_cast(values[i].size()), jvalue_capacity); + memcpy(jvalue_address, values[i].data(), copy_bytes); + } else { + // bad status for this + value_size.push_back(0); + } + } + + env->SetIntArrayRegion(jvalue_sizes, 0, num_keys, value_size.data()); +} + /* * Class: org_rocksdb_RocksDB * Method: multiGet * Signature: (J[[B[I[I)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens) { + JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, + jintArray jkey_offs, jintArray jkey_lens) { return multi_get_helper( env, jdb, reinterpret_cast(jdb_handle), ROCKSDB_NAMESPACE::ReadOptions(), jkeys, jkey_offs, jkey_lens, nullptr); @@ -1773,8 +2048,8 @@ * Signature: (J[[B[I[I[J)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I_3J( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jobjectArray jkeys, 
jintArray jkey_offs, jintArray jkey_lens, + JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, + jintArray jkey_offs, jintArray jkey_lens, jlongArray jcolumn_family_handles) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), @@ -1811,38 +2086,60 @@ jkey_offs, jkey_lens, jcolumn_family_handles); } +/* + * Class: org_rocksdb_RocksDB + * Method: multiGet + * Signature: + * (JJ[J[Ljava/nio/ByteBuffer;[I[I[Ljava/nio/ByteBuffer;[I[Lorg/rocksdb/Status;)V + */ +void Java_org_rocksdb_RocksDB_multiGet__JJ_3J_3Ljava_nio_ByteBuffer_2_3I_3I_3Ljava_nio_ByteBuffer_2_3I_3Lorg_rocksdb_Status_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jlongArray jcolumn_family_handles, jobjectArray jkeys, + jintArray jkey_offsets, jintArray jkey_lengths, jobjectArray jvalues, + jintArray jvalues_sizes, jobjectArray jstatus_objects) { + return multi_get_helper_direct( + env, jdb, reinterpret_cast(jdb_handle), + *reinterpret_cast(jropt_handle), + jcolumn_family_handles, jkeys, jkey_offsets, jkey_lengths, jvalues, + jvalues_sizes, jstatus_objects); +} +// private native void +// multiGet(final long dbHandle, final long rOptHandle, +// final long[] columnFamilyHandles, final ByteBuffer[] keysArray, +// final ByteBuffer[] valuesArray); + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::DB::KeyMayExist bool key_may_exist_helper(JNIEnv* env, jlong jdb_handle, jlong jcf_handle, - jlong jread_opts_handle, - jbyteArray jkey, jint jkey_offset, jint jkey_len, - bool* has_exception, std::string* value, bool* value_found) { + jlong jread_opts_handle, jbyteArray jkey, + jint jkey_offset, jint jkey_len, bool* has_exception, + std::string* value, bool* value_found) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { cf_handle = db->DefaultColumnFamily(); - } else { - cf_handle = - reinterpret_cast(jcf_handle); - } - ROCKSDB_NAMESPACE::ReadOptions 
read_opts = - jread_opts_handle == 0 - ? ROCKSDB_NAMESPACE::ReadOptions() - : *(reinterpret_cast( - jread_opts_handle)); - - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_offset, jkey_len, key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] key; - *has_exception = true; - return false; + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + ROCKSDB_NAMESPACE::ReadOptions read_opts = + jread_opts_handle == 0 + ? ROCKSDB_NAMESPACE::ReadOptions() + : *(reinterpret_cast( + jread_opts_handle)); + + jbyte* key = new jbyte[jkey_len]; + env->GetByteArrayRegion(jkey, jkey_offset, jkey_len, key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] key; + *has_exception = true; + return false; } ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - const bool exists = db->KeyMayExist( - read_opts, cf_handle, key_slice, value, value_found); + const bool exists = + db->KeyMayExist(read_opts, cf_handle, key_slice, value, value_found); // cleanup delete[] key; @@ -1850,6 +2147,49 @@ return exists; } +bool key_may_exist_direct_helper(JNIEnv* env, jlong jdb_handle, + jlong jcf_handle, jlong jread_opts_handle, + jobject jkey, jint jkey_offset, jint jkey_len, + bool* has_exception, std::string* value, + bool* value_found) { + auto* db = reinterpret_cast(jdb_handle); + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + ROCKSDB_NAMESPACE::ReadOptions read_opts = + jread_opts_handle == 0 + ? 
ROCKSDB_NAMESPACE::ReadOptions() + : *(reinterpret_cast( + jread_opts_handle)); + + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + if (key == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid key argument (argument is not a valid direct ByteBuffer)"); + *has_exception = true; + return false; + } + if (env->GetDirectBufferCapacity(jkey) < (jkey_offset + jkey_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid key argument. Capacity is less than requested region (offset " + "+ length)."); + *has_exception = true; + return false; + } + + ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); + + const bool exists = + db->KeyMayExist(read_opts, cf_handle, key_slice, value, value_found); + + return exists; +} /* * Class: org_rocksdb_RocksDB @@ -1880,22 +2220,114 @@ /* * Class: org_rocksdb_RocksDB + * Method: keyMayExistDirect + * Signature: (JJJLjava/nio/ByteBuffer;II)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExistDirect( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len) { + bool has_exception = false; + std::string value; + bool value_found = false; + + const bool exists = key_may_exist_direct_helper( + env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset, + jkey_len, &has_exception, &value, &value_found); + if (has_exception) { + // java exception already raised + return false; + } + + return static_cast(exists); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExistDirectFoundValue + * Signature: + * (JJJLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)[J + */ +jintArray Java_org_rocksdb_RocksDB_keyMayExistDirectFoundValue( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len, + jobject jval, jint jval_offset, jint jval_len) { + char* val_buffer = reinterpret_cast(env->GetDirectBufferAddress(jval)); + if 
(val_buffer == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value argument (argument is not a valid direct ByteBuffer)"); + return nullptr; + } + + if (env->GetDirectBufferCapacity(jval) < (jval_offset + jval_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid value argument. Capacity is less than requested region " + "(offset + length)."); + return nullptr; + } + + bool has_exception = false; + std::string cvalue; + bool value_found = false; + + const bool exists = key_may_exist_direct_helper( + env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset, + jkey_len, &has_exception, &cvalue, &value_found); + + if (has_exception) { + // java exception already raised + return nullptr; + } + + const jint cvalue_len = static_cast(cvalue.size()); + const jint length = std::min(jval_len, cvalue_len); + memcpy(val_buffer + jval_offset, cvalue.c_str(), length); + + // keep consistent with java KeyMayExistEnum.values() + const int kNotExist = 0; + const int kExistsWithoutValue = 1; + const int kExistsWithValue = 2; + + // TODO fix return value/type + // exists/value_found/neither + // cvalue_len + jintArray jresult = env->NewIntArray(2); + const jint jexists = + exists ? (value_found ? 
kExistsWithValue : kExistsWithoutValue) + : kNotExist; + + env->SetIntArrayRegion(jresult, 0, 1, &jexists); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresult); + return nullptr; + } + env->SetIntArrayRegion(jresult, 1, 1, &cvalue_len); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresult); + return nullptr; + } + + return jresult; +} + +/* + * Class: org_rocksdb_RocksDB * Method: keyMayExistFoundValue * Signature: (JJJ[BII)[[B */ jobjectArray Java_org_rocksdb_RocksDB_keyMayExistFoundValue( JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, - jlong jread_opts_handle, - jbyteArray jkey, jint jkey_offset, jint jkey_len) { - + jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, jint jkey_len) { bool has_exception = false; std::string value; bool value_found = false; const bool exists = key_may_exist_helper( - env, jdb_handle, jcf_handle, jread_opts_handle, - jkey, jkey_offset, jkey_len, - &has_exception, &value, &value_found); + env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset, + jkey_len, &has_exception, &value, &value_found); if (has_exception) { // java exception already raised @@ -1930,12 +2362,12 @@ env->DeleteLocalRef(jresult_flags); return nullptr; } - + env->SetObjectArrayElement(jresults, 0, jresult_flags); if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jresult_flags); - return nullptr; + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresult_flags); + return nullptr; } env->DeleteLocalRef(jresult_flags); @@ -2267,9 +2699,7 @@ const jsize jlen = env->GetArrayLength(jrange_slice_handles); const size_t range_count = jlen / 2; - jboolean jranges_is_copy = JNI_FALSE; - jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, - &jranges_is_copy); + jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, 
nullptr); if (jranges == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -2277,10 +2707,11 @@ auto ranges = std::unique_ptr( new ROCKSDB_NAMESPACE::Range[range_count]); + size_t range_offset = 0; for (jsize i = 0; i < jlen; ++i) { auto* start = reinterpret_cast(jranges[i]); auto* limit = reinterpret_cast(jranges[++i]); - ranges.get()[i] = ROCKSDB_NAMESPACE::Range(*start, *limit); + ranges.get()[range_offset++] = ROCKSDB_NAMESPACE::Range(*start, *limit); } auto* db = reinterpret_cast(jdb_handle); @@ -2353,14 +2784,13 @@ static_cast(count), static_cast(sizes)}; - const jsize jcount = static_cast(count); - jlongArray jsizes = env->NewLongArray(jcount); + jlongArray jsizes = env->NewLongArray(2); if (jsizes == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } - env->SetLongArrayRegion(jsizes, 0, jcount, results); + env->SetLongArrayRegion(jsizes, 0, 2, results); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jsizes); @@ -2497,6 +2927,9 @@ auto* db = reinterpret_cast(jdb_handle); auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle == nullptr) { + cf_handle = db->DefaultColumnFamily(); + } auto s = db->SetOptions(cf_handle, options_map); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); @@ -2563,6 +2996,55 @@ /* * Class: org_rocksdb_RocksDB + * Method: getOptions + * Signature: (JJ)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getOptions(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + + auto options = db->GetOptions(cf_handle); + std::string options_as_string; + ROCKSDB_NAMESPACE::Status s = + GetStringFromColumnFamilyOptions(&options_as_string, options); + if (!s.ok()) { + 
ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } + return env->NewStringUTF(options_as_string.c_str()); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getDBOptions + * Signature: (J)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getDBOptions(JNIEnv* env, jobject, + jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + + auto options = db->GetDBOptions(); + std::string options_as_string; + ROCKSDB_NAMESPACE::Status s = + GetStringFromDBOptions(&options_as_string, options); + if (!s.ok()) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } + return env->NewStringUTF(options_as_string.c_str()); +} + +/* + * Class: org_rocksdb_RocksDB * Method: compactFiles * Signature: (JJJ[Ljava/lang/String;IIJ)[Ljava/lang/String; */ @@ -2612,6 +3094,17 @@ /* * Class: org_rocksdb_RocksDB + * Method: cancelAllBackgroundWork + * Signature: (JZ)V + */ +void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork( + JNIEnv*, jobject, jlong jdb_handle, jboolean jwait) { + auto* db = reinterpret_cast(jdb_handle); + ROCKSDB_NAMESPACE::CancelAllBackgroundWork(db, jwait); +} + +/* + * Class: org_rocksdb_RocksDB * Method: pauseBackgroundWork * Signature: (J)V */ @@ -2809,7 +3302,7 @@ * Method: setPreserveDeletesSequenceNumber * Signature: (JJ)Z */ -jboolean JNICALL Java_org_rocksdb_RocksDB_setPreserveDeletesSequenceNumber( +jboolean Java_org_rocksdb_RocksDB_setPreserveDeletesSequenceNumber( JNIEnv*, jobject, jlong jdb_handle, jlong jseq_number) { auto* db = reinterpret_cast(jdb_handle); if (db->SetPreserveDeletesSequenceNumber( @@ -3168,9 +3661,8 @@ reinterpret_cast(jcf_handle); } const jsize jlen = env->GetArrayLength(jrange_slice_handles); - jboolean jrange_slice_handles_is_copy = JNI_FALSE; - jlong *jrange_slice_handle = env->GetLongArrayElements( - jrange_slice_handles, &jrange_slice_handles_is_copy); + jlong* jrange_slice_handle = + env->GetLongArrayElements(jrange_slice_handles, nullptr); if 
(jrange_slice_handle == nullptr) { // exception occurred return nullptr; @@ -3298,8 +3790,7 @@ * Method: endTrace * Signature: (J)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_endTrace( - JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_endTrace(JNIEnv* env, jobject, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->EndTrace(); if (!s.ok()) { @@ -3309,6 +3800,20 @@ /* * Class: org_rocksdb_RocksDB + * Method: tryCatchUpWithPrimary + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_tryCatchUpWithPrimary(JNIEnv* env, jobject, + jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto s = db->TryCatchUpWithPrimary(); + if (!s.ok()) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB * Method: destroyDB * Signature: (Ljava/lang/String;J)V */ @@ -3367,9 +3872,11 @@ * Method: deleteFilesInRanges * Signature: (JJLjava/util/List;Z)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_deleteFilesInRanges( - JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jlong jcf_handle, - jobjectArray ranges, jboolean include_end) { +void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jobject /*jdb*/, + jlong jdb_handle, + jlong jcf_handle, + jobjectArray ranges, + jboolean include_end) { jsize length = env->GetArrayLength(ranges); std::vector rangesVector; @@ -3404,3 +3911,15 @@ ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } } + +/* + * Class: org_rocksdb_RocksDB + * Method: version + * Signature: ()I + */ +jint Java_org_rocksdb_RocksDB_version(JNIEnv*, jclass) { + uint32_t encodedVersion = (ROCKSDB_MAJOR & 0xff) << 16; + encodedVersion |= (ROCKSDB_MINOR & 0xff) << 8; + encodedVersion |= (ROCKSDB_PATCH & 0xff); + return static_cast(encodedVersion); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/slice.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/slice.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/slice.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/slice.cc 2025-05-19 16:14:27.000000000 +0000 @@ -229,6 +229,17 @@ } /* + * Class: org_rocksdb_DirectSlice + * Method: setLength0 + * Signature: (JI)V + */ +void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong handle, jint length) { + auto* slice = reinterpret_cast(handle); + slice->size_ = length; +} + +/* * Class: org_rocksdb_Slice * Method: disposeInternalBuf * Signature: (JJ)V diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_file_reader_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -251,3 +251,20 @@ ROCKSDB_NAMESPACE::JniUtil::k_op_direct(seekPrev, env, jtarget, jtarget_off, jtarget_len); } + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: refresh0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_refresh0(JNIEnv* env, jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + ROCKSDB_NAMESPACE::Status s = it->Refresh(); + + if (s.ok()) { + return; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/sst_partitioner.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling C++ ROCKSDB_NAMESPACE::SstFileManager methods +// from Java side. + +#include "rocksdb/sst_partitioner.h" + +#include + +#include + +#include "include/org_rocksdb_SstPartitionerFixedPrefixFactory.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_SstPartitionerFixedPrefixFactory + * Method: newSstPartitionerFixedPrefixFactory0 + * Signature: (J)J + */ +jlong Java_org_rocksdb_SstPartitionerFixedPrefixFactory_newSstPartitionerFixedPrefixFactory0( + JNIEnv*, jclass, jlong prefix_len) { + auto* ptr = new std::shared_ptr( + ROCKSDB_NAMESPACE::NewSstPartitionerFixedPrefixFactory(prefix_len)); + return reinterpret_cast(ptr); +} + +/* + * Class: org_rocksdb_SstPartitionerFixedPrefixFactory + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_SstPartitionerFixedPrefixFactory_disposeInternal( + JNIEnv*, jobject, jlong jhandle) { + auto* ptr = reinterpret_cast< + std::shared_ptr*>(jhandle); + delete ptr; // delete std::shared_ptr +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.cc 2025-05-19 16:14:27.000000000 +0000 @@ -28,5 +28,4 @@ return true; } -// @lint-ignore TXT4 T25377293 Grandfathered in -}; // namespace ROCKSDB_NAMESPACE \ No newline at end of file +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h 
--- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/statisticsjni.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,5 +30,4 @@ } // namespace ROCKSDB_NAMESPACE -// @lint-ignore TXT4 T25377293 Grandfathered in -#endif // JAVA_ROCKSJNI_STATISTICSJNI_H_ \ No newline at end of file +#endif // JAVA_ROCKSJNI_STATISTICSJNI_H_ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/table.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/table.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/table.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/table.cc 2025-05-19 16:14:27.000000000 +0000 @@ -42,25 +42,25 @@ /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZZZZBBDBZJJJJIIIJZZJZZIIZZJIJI)J + * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( - JNIEnv*, jobject, jboolean jcache_index_and_filter_blocks, + JNIEnv *, jobject, jboolean jcache_index_and_filter_blocks, jboolean jcache_index_and_filter_blocks_with_high_priority, jboolean jpin_l0_filter_and_index_blocks_in_cache, jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value, jbyte jdata_block_index_type_value, jdouble jdata_block_hash_table_util_ratio, jbyte jchecksum_type_value, jboolean jno_block_cache, jlong jblock_cache_handle, - jlong jpersistent_cache_handle, - jlong jblock_cache_compressed_handle, jlong jblock_size, - jint jblock_size_deviation, jint jblock_restart_interval, + jlong jpersistent_cache_handle, jlong jblock_cache_compressed_handle, + jlong jblock_size, jint jblock_size_deviation, jint jblock_restart_interval, jint jindex_block_restart_interval, jlong jmetadata_block_size, - jboolean jpartition_filters, jboolean juse_delta_encoding, - jlong jfilter_policy_handle, jboolean 
jwhole_key_filtering, - jboolean jverify_compression, jint jread_amp_bytes_per_bit, - jint jformat_version, jboolean jenable_index_compression, - jboolean jblock_align, jlong jblock_cache_size, + jboolean jpartition_filters, jboolean joptimize_filters_for_memory, + jboolean juse_delta_encoding, jlong jfilter_policy_handle, + jboolean jwhole_key_filtering, jboolean jverify_compression, + jint jread_amp_bytes_per_bit, jint jformat_version, + jboolean jenable_index_compression, jboolean jblock_align, + jbyte jindex_shortening, jlong jblock_cache_size, jint jblock_cache_num_shard_bits, jlong jblock_cache_compressed_size, jint jblock_cache_compressed_num_shard_bits) { ROCKSDB_NAMESPACE::BlockBasedTableOptions options; @@ -131,6 +131,8 @@ options.index_block_restart_interval = static_cast(jindex_block_restart_interval); options.metadata_block_size = static_cast(jmetadata_block_size); options.partition_filters = static_cast(jpartition_filters); + options.optimize_filters_for_memory = + static_cast(joptimize_filters_for_memory); options.use_delta_encoding = static_cast(juse_delta_encoding); if (jfilter_policy_handle > 0) { std::shared_ptr *pFilterPolicy = @@ -144,6 +146,9 @@ options.format_version = static_cast(jformat_version); options.enable_index_compression = static_cast(jenable_index_compression); options.block_align = static_cast(jblock_align); + options.index_shortening = + ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode( + jindex_shortening); return reinterpret_cast( ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/testable_event_listener.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 
+1,216 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#include +#include +#include + +#include "include/org_rocksdb_test_TestableEventListener.h" +#include "rocksdb/listener.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" + +using ROCKSDB_NAMESPACE::BackgroundErrorReason; +using ROCKSDB_NAMESPACE::CompactionJobInfo; +using ROCKSDB_NAMESPACE::CompactionJobStats; +using ROCKSDB_NAMESPACE::CompactionReason; +using ROCKSDB_NAMESPACE::CompressionType; +using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo; +using ROCKSDB_NAMESPACE::FileOperationInfo; +using ROCKSDB_NAMESPACE::FileOperationType; +using ROCKSDB_NAMESPACE::FlushJobInfo; +using ROCKSDB_NAMESPACE::FlushReason; +using ROCKSDB_NAMESPACE::MemTableInfo; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TableFileCreationBriefInfo; +using ROCKSDB_NAMESPACE::TableFileCreationInfo; +using ROCKSDB_NAMESPACE::TableFileCreationReason; +using ROCKSDB_NAMESPACE::TableFileDeletionInfo; +using ROCKSDB_NAMESPACE::TableProperties; +using ROCKSDB_NAMESPACE::WriteStallCondition; +using ROCKSDB_NAMESPACE::WriteStallInfo; + +static TableProperties newTablePropertiesForTest() { + TableProperties table_properties; + table_properties.data_size = UINT64_MAX; + table_properties.index_size = UINT64_MAX; + table_properties.index_partitions = UINT64_MAX; + table_properties.top_level_index_size = UINT64_MAX; + table_properties.index_key_is_user_key = UINT64_MAX; + table_properties.index_value_is_delta_encoded = UINT64_MAX; + table_properties.filter_size = UINT64_MAX; + table_properties.raw_key_size = UINT64_MAX; + table_properties.raw_value_size = UINT64_MAX; + table_properties.num_data_blocks = UINT64_MAX; + table_properties.num_entries = UINT64_MAX; + table_properties.num_deletions = 
UINT64_MAX; + table_properties.num_merge_operands = UINT64_MAX; + table_properties.num_range_deletions = UINT64_MAX; + table_properties.format_version = UINT64_MAX; + table_properties.fixed_key_len = UINT64_MAX; + table_properties.column_family_id = UINT64_MAX; + table_properties.creation_time = UINT64_MAX; + table_properties.oldest_key_time = UINT64_MAX; + table_properties.file_creation_time = UINT64_MAX; + table_properties.slow_compression_estimated_data_size = UINT64_MAX; + table_properties.fast_compression_estimated_data_size = UINT64_MAX; + table_properties.external_sst_file_global_seqno_offset = UINT64_MAX; + table_properties.db_id = "dbId"; + table_properties.db_session_id = "sessionId"; + table_properties.column_family_name = "columnFamilyName"; + table_properties.filter_policy_name = "filterPolicyName"; + table_properties.comparator_name = "comparatorName"; + table_properties.merge_operator_name = "mergeOperatorName"; + table_properties.prefix_extractor_name = "prefixExtractorName"; + table_properties.property_collectors_names = "propertyCollectorsNames"; + table_properties.compression_name = "compressionName"; + table_properties.compression_options = "compressionOptions"; + table_properties.user_collected_properties = {{"key", "value"}}; + table_properties.readable_properties = {{"key", "value"}}; + return table_properties; +} + +/* + * Class: org_rocksdb_test_TestableEventListener + * Method: invokeAllCallbacks + * Signature: (J)V + */ +void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks( + JNIEnv *, jclass, jlong jhandle) { + const auto &el = + *reinterpret_cast *>( + jhandle); + + TableProperties table_properties = newTablePropertiesForTest(); + + FlushJobInfo flush_job_info; + flush_job_info.cf_id = INT_MAX; + flush_job_info.cf_name = "testColumnFamily"; + flush_job_info.file_path = "/file/path"; + flush_job_info.file_number = UINT64_MAX; + flush_job_info.oldest_blob_file_number = UINT64_MAX; + flush_job_info.thread_id = UINT64_MAX; + 
flush_job_info.job_id = INT_MAX; + flush_job_info.triggered_writes_slowdown = true; + flush_job_info.triggered_writes_stop = true; + flush_job_info.smallest_seqno = UINT64_MAX; + flush_job_info.largest_seqno = UINT64_MAX; + flush_job_info.table_properties = table_properties; + flush_job_info.flush_reason = FlushReason::kManualFlush; + + el->OnFlushCompleted(nullptr, flush_job_info); + el->OnFlushBegin(nullptr, flush_job_info); + + Status status = Status::Incomplete(Status::SubCode::kNoSpace); + + TableFileDeletionInfo file_deletion_info; + file_deletion_info.db_name = "dbName"; + file_deletion_info.file_path = "/file/path"; + file_deletion_info.job_id = INT_MAX; + file_deletion_info.status = status; + + el->OnTableFileDeleted(file_deletion_info); + + CompactionJobInfo compaction_job_info; + compaction_job_info.cf_id = UINT32_MAX; + compaction_job_info.cf_name = "compactionColumnFamily"; + compaction_job_info.status = status; + compaction_job_info.thread_id = UINT64_MAX; + compaction_job_info.job_id = INT_MAX; + compaction_job_info.base_input_level = INT_MAX; + compaction_job_info.output_level = INT_MAX; + compaction_job_info.input_files = {"inputFile.sst"}; + compaction_job_info.input_file_infos = {}; + compaction_job_info.output_files = {"outputFile.sst"}; + compaction_job_info.output_file_infos = {}; + compaction_job_info.table_properties = { + {"tableProperties", std::shared_ptr( + &table_properties, [](TableProperties *) {})}}; + compaction_job_info.compaction_reason = CompactionReason::kFlush; + compaction_job_info.compression = CompressionType::kSnappyCompression; + + compaction_job_info.stats = CompactionJobStats(); + + el->OnCompactionBegin(nullptr, compaction_job_info); + el->OnCompactionCompleted(nullptr, compaction_job_info); + + TableFileCreationInfo file_creation_info; + file_creation_info.file_size = UINT64_MAX; + file_creation_info.table_properties = table_properties; + file_creation_info.status = status; + file_creation_info.file_checksum = 
"fileChecksum"; + file_creation_info.file_checksum_func_name = "fileChecksumFuncName"; + file_creation_info.db_name = "dbName"; + file_creation_info.cf_name = "columnFamilyName"; + file_creation_info.file_path = "/file/path"; + file_creation_info.job_id = INT_MAX; + file_creation_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreated(file_creation_info); + + TableFileCreationBriefInfo file_creation_brief_info; + file_creation_brief_info.db_name = "dbName"; + file_creation_brief_info.cf_name = "columnFamilyName"; + file_creation_brief_info.file_path = "/file/path"; + file_creation_brief_info.job_id = INT_MAX; + file_creation_brief_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreationStarted(file_creation_brief_info); + + MemTableInfo mem_table_info; + mem_table_info.cf_name = "columnFamilyName"; + mem_table_info.first_seqno = UINT64_MAX; + mem_table_info.earliest_seqno = UINT64_MAX; + mem_table_info.num_entries = UINT64_MAX; + mem_table_info.num_deletes = UINT64_MAX; + + el->OnMemTableSealed(mem_table_info); + el->OnColumnFamilyHandleDeletionStarted(nullptr); + + ExternalFileIngestionInfo file_ingestion_info; + file_ingestion_info.cf_name = "columnFamilyName"; + file_ingestion_info.external_file_path = "/external/file/path"; + file_ingestion_info.internal_file_path = "/internal/file/path"; + file_ingestion_info.global_seqno = UINT64_MAX; + file_ingestion_info.table_properties = table_properties; + el->OnExternalFileIngested(nullptr, file_ingestion_info); + + el->OnBackgroundError(BackgroundErrorReason::kFlush, &status); + + WriteStallInfo write_stall_info; + write_stall_info.cf_name = "columnFamilyName"; + write_stall_info.condition.cur = WriteStallCondition::kDelayed; + write_stall_info.condition.prev = WriteStallCondition::kStopped; + el->OnStallConditionsChanged(write_stall_info); + + FileOperationInfo op_info = FileOperationInfo( + FileOperationType::kRead, "/file/path", + std::make_pair(std::chrono::time_point( + 
std::chrono::nanoseconds(1600699420000000000ll)), + std::chrono::time_point( + std::chrono::nanoseconds(1600699420000000000ll))), + std::chrono::time_point( + std::chrono::nanoseconds(1600699425000000000ll)), + status); + op_info.offset = UINT64_MAX; + op_info.length = SIZE_MAX; + op_info.status = status; + + el->OnFileReadFinish(op_info); + el->OnFileWriteFinish(op_info); + el->OnFileFlushFinish(op_info); + el->OnFileSyncFinish(op_info); + el->OnFileRangeSyncFinish(op_info); + el->OnFileTruncateFinish(op_info); + el->OnFileCloseFinish(op_info); + el->ShouldBeNotifiedOnFileIO(); + + bool auto_recovery; + el->OnErrorRecoveryBegin(BackgroundErrorReason::kFlush, status, + &auto_recovery); + el->OnErrorRecoveryCompleted(status); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/transaction.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,8 +14,6 @@ #include "rocksdb/utilities/transaction.h" #include "rocksjni/portal.h" -using namespace std::placeholders; - #if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable : 4503) // identifier' : decorated name length @@ -220,8 +218,8 @@ const ROCKSDB_NAMESPACE::ReadOptions&, ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, std::string*)>( - &ROCKSDB_NAMESPACE::Transaction::Get, txn, _1, column_family_handle, - _2, _3); + &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1, + column_family_handle, std::placeholders::_2, std::placeholders::_3); return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); } @@ -238,7 +236,8 @@ std::bind( - &ROCKSDB_NAMESPACE::Transaction::Get, txn, _1, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::Get, txn, std::placeholders::_1, + std::placeholders::_2, 
std::placeholders::_3); return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); } @@ -402,8 +401,8 @@ const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, _1, column_family_handles, - _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1, + column_family_handles, std::placeholders::_2, std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, jkey_parts); } @@ -421,7 +420,8 @@ ROCKSDB_NAMESPACE::Transaction::*)( const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, _1, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGet, txn, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, jkey_parts); } @@ -444,8 +444,9 @@ const ROCKSDB_NAMESPACE::ReadOptions&, ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, std::string*, bool, bool)>( - &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, _1, - column_family_handle, _2, _3, jexclusive, jdo_validate); + &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, + std::placeholders::_1, column_family_handle, std::placeholders::_2, + std::placeholders::_3, jexclusive, jdo_validate); return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, jkey_part_len); } @@ -464,7 +465,8 @@ std::bind( - &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, _1, _2, _3, + &ROCKSDB_NAMESPACE::Transaction::GetForUpdate, txn, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, jexclusive, jdo_validate); return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey, jkey_part_len); @@ -492,8 +494,9 @@ const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, _1, 
- column_family_handles, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, + std::placeholders::_1, column_family_handles, std::placeholders::_2, + std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get_for_update, jread_options_handle, jkey_parts); } @@ -511,7 +514,8 @@ ROCKSDB_NAMESPACE::Status> (ROCKSDB_NAMESPACE::Transaction::*)( const ROCKSDB_NAMESPACE::ReadOptions&, const std::vector&, std::vector*)>( - &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, _1, _2, _3); + &ROCKSDB_NAMESPACE::Transaction::MultiGetForUpdate, txn, + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3); return txn_multi_get_helper(env, fn_multi_get_for_update, jread_options_handle, jkey_parts); } @@ -605,7 +609,8 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&, bool)>(&ROCKSDB_NAMESPACE::Transaction::Put, txn, - column_family_handle, _1, _2, jassume_tracked); + column_family_handle, std::placeholders::_1, + std::placeholders::_2, jassume_tracked); txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); } @@ -623,7 +628,8 @@ FnWriteKV fn_put = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Put, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::Put, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len); } @@ -689,6 +695,7 @@ // out of memory env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } @@ -698,6 +705,7 @@ env->ReleaseByteArrayElements(jba_value_part, jvalue_part, JNI_ABORT); env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } @@ -748,8 +756,8 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&, 
const ROCKSDB_NAMESPACE::SliceParts&, bool)>( - &ROCKSDB_NAMESPACE::Transaction::Put, txn, column_family_handle, _1, - _2, jassume_tracked); + &ROCKSDB_NAMESPACE::Transaction::Put, txn, column_family_handle, + std::placeholders::_1, std::placeholders::_2, jassume_tracked); txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -766,7 +774,8 @@ FnWriteKVParts fn_put_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Put, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::Put, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -789,7 +798,8 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&, bool)>(&ROCKSDB_NAMESPACE::Transaction::Merge, txn, - column_family_handle, _1, _2, jassume_tracked); + column_family_handle, std::placeholders::_1, + std::placeholders::_2, jassume_tracked); txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); } @@ -805,7 +815,8 @@ FnWriteKV fn_merge = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Merge, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::Merge, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len); } @@ -854,7 +865,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, bool)>( &ROCKSDB_NAMESPACE::Transaction::Delete, txn, column_family_handle, - _1, jassume_tracked); + std::placeholders::_1, jassume_tracked); txn_write_k_helper(env, fn_delete, jkey, jkey_part_len); } @@ -869,7 +880,7 @@ auto* txn = reinterpret_cast(jhandle); FnWriteK fn_delete = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Delete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::Delete, txn, std::placeholders::_1); txn_write_k_helper(env, fn_delete, jkey, jkey_part_len); } @@ -949,7 +960,7 @@ 
ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&, bool)>( &ROCKSDB_NAMESPACE::Transaction::Delete, txn, column_family_handle, - _1, jassume_tracked); + std::placeholders::_1, jassume_tracked); txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len); } @@ -965,7 +976,7 @@ auto* txn = reinterpret_cast(jhandle); FnWriteKParts fn_delete_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::Delete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::Delete, txn, std::placeholders::_1); txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len); } @@ -986,7 +997,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, bool)>( &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, - column_family_handle, _1, jassume_tracked); + column_family_handle, std::placeholders::_1, jassume_tracked); txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len); } @@ -1003,7 +1014,8 @@ auto* txn = reinterpret_cast(jhandle); FnWriteK fn_single_delete = std::bind( - &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, + std::placeholders::_1); txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len); } @@ -1025,7 +1037,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&, bool)>( &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, - column_family_handle, _1, jassume_tracked); + column_family_handle, std::placeholders::_1, jassume_tracked); txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts, jkey_parts_len); } @@ -1043,7 +1055,8 @@ auto* txn = reinterpret_cast(jhandle); FnWriteKParts fn_single_delete_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::SingleDelete, txn, + std::placeholders::_1); txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts, jkey_parts_len); } @@ -1066,7 +1079,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const 
ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>( &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, - column_family_handle, _1, _2); + column_family_handle, std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, jval_len); } @@ -1083,7 +1096,8 @@ FnWriteKV fn_put_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, + std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval, jval_len); } @@ -1106,7 +1120,7 @@ const ROCKSDB_NAMESPACE::SliceParts&, const ROCKSDB_NAMESPACE::SliceParts&)>( &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, column_family_handle, - _1, _2); + std::placeholders::_1, std::placeholders::_2); txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -1123,7 +1137,8 @@ FnWriteKVParts fn_put_parts_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::PutUntracked, txn, std::placeholders::_1, + std::placeholders::_2); txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts, jkey_parts_len, jvalue_parts, jvalue_parts_len); } @@ -1146,7 +1161,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&, const ROCKSDB_NAMESPACE::Slice&)>( &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, - column_family_handle, _1, _2); + column_family_handle, std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, jval_len); } @@ -1163,7 +1178,8 @@ FnWriteKV fn_merge_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, _1, _2); + &ROCKSDB_NAMESPACE::Transaction::MergeUntracked, txn, + std::placeholders::_1, std::placeholders::_2); txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval, 
jval_len); } @@ -1184,7 +1200,7 @@ ROCKSDB_NAMESPACE::Transaction::*)(ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::Slice&)>( &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, - column_family_handle, _1); + column_family_handle, std::placeholders::_1); txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len); } @@ -1201,7 +1217,8 @@ auto* txn = reinterpret_cast(jhandle); FnWriteK fn_delete_untracked = std::bind( - &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, + std::placeholders::_1); txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len); } @@ -1222,7 +1239,7 @@ ROCKSDB_NAMESPACE::ColumnFamilyHandle*, const ROCKSDB_NAMESPACE::SliceParts&)>( &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, - column_family_handle, _1); + column_family_handle, std::placeholders::_1); txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts, jkey_parts_len); } @@ -1239,7 +1256,8 @@ FnWriteKParts fn_delete_untracked_parts = std::bind( - &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, _1); + &ROCKSDB_NAMESPACE::Transaction::DeleteUntracked, txn, + std::placeholders::_1); txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts, jkey_parts_len); } @@ -1605,7 +1623,7 @@ case ROCKSDB_NAMESPACE::Transaction::TransactionState::AWAITING_COMMIT: return 0x3; - case ROCKSDB_NAMESPACE::Transaction::TransactionState::COMMITED: + case ROCKSDB_NAMESPACE::Transaction::TransactionState::COMMITTED: return 0x4; case ROCKSDB_NAMESPACE::Transaction::TransactionState::AWAITING_ROLLBACK: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/ttl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -197,7 +197,7 @@ *cfOptions, 
std::string(reinterpret_cast(cfname), len), &handle, jttl); - env->ReleaseByteArrayElements(jcolumn_name, cfname, 0); + env->ReleaseByteArrayElements(jcolumn_name, cfname, JNI_ABORT); if (s.ok()) { return reinterpret_cast(handle); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch.cc 2025-05-19 16:14:27.000000000 +0000 @@ -363,10 +363,10 @@ /* * Class: org_rocksdb_WriteBatch - * Method: removeDirect + * Method: deleteDirect * Signature: (JLjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatch_removeDirect(JNIEnv* env, jobject /*jobj*/, +void Java_org_rocksdb_WriteBatch_deleteDirect(JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey, jint jkey_offset, jint jkey_len, jlong jcf_handle) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -47,7 +47,7 @@ ROCKSDB_NAMESPACE::WriteBufferManager wb(options.db_write_buffer_size); options.memtable_factory = factory; ROCKSDB_NAMESPACE::MemTable* mem = new ROCKSDB_NAMESPACE::MemTable( - cmp, ROCKSDB_NAMESPACE::ImmutableCFOptions(options), + cmp, ROCKSDB_NAMESPACE::ImmutableOptions(options), ROCKSDB_NAMESPACE::MutableCFOptions(options), &wb, ROCKSDB_NAMESPACE::kMaxSequenceNumber, 0 /* column_family_id */); mem->Ref(); @@ -63,10 +63,10 @@ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ikey.clear(); - bool parsed = 
ROCKSDB_NAMESPACE::ParseInternalKey(iter->key(), &ikey); - if (!parsed) { - assert(parsed); - } + ROCKSDB_NAMESPACE::Status pik_status = ROCKSDB_NAMESPACE::ParseInternalKey( + iter->key(), &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); switch (ikey.type) { case ROCKSDB_NAMESPACE::kTypeValue: state.append("Put("); @@ -119,7 +119,7 @@ break; } state.append("@"); - state.append(ROCKSDB_NAMESPACE::NumberToString(ikey.sequence)); + state.append(ROCKSDB_NAMESPACE::ToString(ikey.sequence)); } if (!s.ok()) { state.append(s.ToString()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_batch_with_index.cc 2025-05-19 16:14:27.000000000 +0000 @@ -301,10 +301,10 @@ /* * Class: org_rocksdb_WriteBatchWithIndex - * Method: removeDirect + * Method: deleteDirect * Signature: (JLjava/nio/ByteBuffer;IIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_removeDirect( +void Java_org_rocksdb_WriteBatchWithIndex_deleteDirect( JNIEnv* env, jobject /*jobj*/, jlong jwb_handle, jobject jkey, jint jkey_offset, jint jkey_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); @@ -533,20 +533,24 @@ /* * Class: org_rocksdb_WriteBatchWithIndex * Method: iteratorWithBase - * Signature: (JJJ)J + * Signature: (JJJJ)J */ -jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase(JNIEnv* /*env*/, - jobject /*jobj*/, - jlong jwbwi_handle, - jlong jcf_handle, - jlong jbi_handle) { +jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( + JNIEnv*, jobject, jlong jwbwi_handle, jlong jcf_handle, + jlong jbase_iterator_handle, jlong jread_opts_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* cf_handle = 
reinterpret_cast(jcf_handle); auto* base_iterator = - reinterpret_cast(jbi_handle); - auto* iterator = wbwi->NewIteratorWithBase(cf_handle, base_iterator); + reinterpret_cast(jbase_iterator_handle); + ROCKSDB_NAMESPACE::ReadOptions* read_opts = + jread_opts_handle == 0 + ? nullptr + : reinterpret_cast( + jread_opts_handle); + auto* iterator = + wbwi->NewIteratorWithBase(cf_handle, base_iterator, read_opts); return reinterpret_cast(iterator); } @@ -860,3 +864,13 @@ return jresults; } + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: refresh0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_refresh0(JNIEnv* env) { + ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::Status::NotSupported("Refresh() is not supported"); + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/write_buffer_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,14 +16,15 @@ * Signature: (JJ)J */ jlong Java_org_rocksdb_WriteBufferManager_newWriteBufferManager( - JNIEnv* /*env*/, jclass /*jclazz*/, jlong jbuffer_size, jlong jcache_handle) { + JNIEnv* /*env*/, jclass /*jclazz*/, jlong jbuffer_size, jlong jcache_handle, + jboolean allow_stall) { auto* cache_ptr = reinterpret_cast*>( jcache_handle); auto* write_buffer_manager = new std::shared_ptr( - std::make_shared(jbuffer_size, - *cache_ptr)); + std::make_shared( + jbuffer_size, *cache_ptr, allow_stall)); return reinterpret_cast(write_buffer_manager); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc 2025-05-19 16:14:27.000000000 +0000 @@ -108,7 +108,7 @@ // exception thrown return; } - + m_jMarkRollbackMethodId = WriteBatchHandlerJni::getMarkRollbackMethodId(env); if(m_jMarkRollbackMethodId == nullptr) { // exception thrown @@ -121,6 +121,13 @@ return; } + m_jMarkCommitWithTimestampMethodId = + WriteBatchHandlerJni::getMarkCommitWithTimestampMethodId(env); + if (m_jMarkCommitWithTimestampMethodId == nullptr) { + // exception thrown + return; + } + m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); if(m_jContinueMethodId == nullptr) { // exception thrown @@ -424,6 +431,23 @@ return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? + } else { + return ROCKSDB_NAMESPACE::Status(*status); + } +} + +ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkCommitWithTimestamp( + const Slice& xid, const Slice& ts) { + auto markCommitWithTimestamp = [this](jbyteArray j_xid, jbyteArray j_ts) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkCommitWithTimestampMethodId, + j_xid, j_ts); + }; + auto status = + WriteBatchHandlerJniCallback::kv_op(xid, ts, markCommitWithTimestamp); + if (status == nullptr) { + return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is + // an Exception but we don't know + // the ROCKSDB_NAMESPACE::Status? 
} else { return ROCKSDB_NAMESPACE::Status(*status); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h 2025-05-19 16:14:27.000000000 +0000 @@ -48,6 +48,7 @@ Status MarkNoop(bool empty_batch); Status MarkRollback(const Slice& xid); Status MarkCommit(const Slice& xid); + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& commit_ts); bool Continue(); private: @@ -69,6 +70,7 @@ jmethodID m_jMarkNoopMethodId; jmethodID m_jMarkRollbackMethodId; jmethodID m_jMarkCommitMethodId; + jmethodID m_jMarkCommitWithTimestampMethodId; jmethodID m_jContinueMethodId; /** * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni.pom mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni.pom --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/rocksjni.pom 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/rocksjni.pom 1970-01-01 00:00:00.000000000 +0000 @@ -1,150 +0,0 @@ - - - 4.0.0 - RocksDB JNI - http://rocksdb.org/ - org.rocksdb - rocksdbjni - - - - RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files - for Mac OSX, and a .dll for Windows x64. 
- - - - Apache License 2.0 - http://www.apache.org/licenses/LICENSE-2.0.html - repo - - - GNU General Public License, version 2 - http://www.gnu.org/licenses/gpl-2.0.html - repo - - - - scm:git:git://github.com/dropwizard/metrics.git - scm:git:git@github.com:dropwizard/metrics.git - http://github.com/dropwizard/metrics/ - HEAD - - - - Facebook - help@facebook.com - America/New_York - - architect - - - - - - 1.7 - 1.7 - UTF-8 - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.2 - - ${project.build.source} - ${project.build.target} - ${project.build.sourceEncoding} - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.18.1 - - ${argLine} -ea -Xcheck:jni -Djava.library.path=${project.build.directory} - false - false - - ${project.build.directory}/* - - - - - org.jacoco - jacoco-maven-plugin - 0.7.2.201409121644 - - - - prepare-agent - - - - report - prepare-package - - report - - - - - - org.codehaus.gmaven - groovy-maven-plugin - 2.0 - - - process-classes - - execute - - - - Xenu - - - String fileContents = new File(project.basedir.absolutePath + '/../include/rocksdb/version.h').getText('UTF-8') - matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) - String major_version = matcher.getAt(0).getAt(1) - matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) - String minor_version = matcher.getAt(0).getAt(1) - matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/) - String patch_version = matcher.getAt(0).getAt(1) - String version = String.format('%s.%s.%s', major_version, minor_version, patch_version) - // Set version to be used in pom.properties - project.version = version - // Set version to be set as jar name - project.build.finalName = project.artifactId + "-" + version - - - - - - - - - - - junit - junit - 4.12 - test - - - org.assertj - assertj-core - 1.7.1 - test - - - org.mockito - mockito-all - 1.10.19 - test - - - diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java 2025-05-19 16:14:27.000000000 +0000 @@ -111,7 +111,7 @@ // Read a key using the snapshot. readOptions.setSnapshot(snapshot); final byte[] value = txn.getForUpdate(readOptions, key1, true); - assert(value == value1); + assert (value == null); try { // Attempt to commit transaction diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java 2025-05-19 16:14:27.000000000 +0000 @@ -53,8 +53,8 @@ try { // put and get from non-default column family - db.put(columnFamilyHandles.get(0), new WriteOptions(), - "key".getBytes(), "value".getBytes()); + db.put( + columnFamilyHandles.get(1), new WriteOptions(), "key".getBytes(), "value".getBytes()); // atomic write try (final WriteBatch wb = new WriteBatch()) { @@ -62,7 +62,7 @@ "value2".getBytes()); wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes()); - wb.remove(columnFamilyHandles.get(0), "key".getBytes()); + wb.delete(columnFamilyHandles.get(1), "key".getBytes()); db.write(new WriteOptions(), wb); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java 
--- mariadb-10.11.11/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/samples/src/main/java/RocksDBSample.java 2025-05-19 16:14:27.000000000 +0000 @@ -45,7 +45,7 @@ .setStatistics(stats) .setWriteBufferSize(8 * SizeUnit.KB) .setMaxWriteBufferNumber(3) - .setMaxBackgroundCompactions(10) + .setMaxBackgroundJobs(10) .setCompressionType(CompressionType.SNAPPY_COMPRESSION) .setCompactionStyle(CompactionStyle.UNIVERSAL); } catch (final IllegalArgumentException e) { @@ -55,7 +55,7 @@ assert (options.createIfMissing() == true); assert (options.writeBufferSize() == 8 * SizeUnit.KB); assert (options.maxWriteBufferNumber() == 3); - assert (options.maxBackgroundCompactions() == 10); + assert (options.maxBackgroundJobs() == 10); assert (options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert (options.compactionStyle() == CompactionStyle.UNIVERSAL); @@ -87,24 +87,17 @@ options.setRateLimiter(rateLimiter); final BlockBasedTableConfig table_options = new BlockBasedTableConfig(); - table_options.setBlockCacheSize(64 * SizeUnit.KB) - .setFilter(bloomFilter) - .setCacheNumShardBits(6) + Cache cache = new LRUCache(64 * 1024, 6); + table_options.setBlockCache(cache) + .setFilterPolicy(bloomFilter) .setBlockSizeDeviation(5) .setBlockRestartInterval(10) .setCacheIndexAndFilterBlocks(true) - .setHashIndexAllowCollision(false) - .setBlockCacheCompressedSize(64 * SizeUnit.KB) - .setBlockCacheCompressedNumShardBits(10); + .setBlockCacheCompressed(new LRUCache(64 * 1000, 10)); - assert (table_options.blockCacheSize() == 64 * SizeUnit.KB); - assert (table_options.cacheNumShardBits() == 6); assert (table_options.blockSizeDeviation() == 5); assert (table_options.blockRestartInterval() == 10); assert (table_options.cacheIndexAndFilterBlocks() == true); - assert (table_options.hashIndexAllowCollision() == false); - assert (table_options.blockCacheCompressedSize() == 64 * 
SizeUnit.KB); - assert (table_options.blockCacheCompressedNumShardBits() == 10); options.setTableFormatConfig(table_options); assert (options.tableFactoryName().equals("BlockBasedTable")); @@ -203,14 +196,14 @@ len = db.get(readOptions, testKey, enoughArray); assert (len == testValue.length); - db.remove(testKey); + db.delete(testKey); len = db.get(testKey, enoughArray); assert (len == RocksDB.NOT_FOUND); // repeat the test with WriteOptions try (final WriteOptions writeOpts = new WriteOptions()) { writeOpts.setSync(true); - writeOpts.setDisableWAL(true); + writeOpts.setDisableWAL(false); db.put(writeOpts, testKey, testValue); len = db.get(testKey, enoughArray); assert (len == testValue.length); @@ -284,15 +277,15 @@ } } - Map values = db.multiGet(keys); + List values = db.multiGetAsList(keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } - values = db.multiGet(new ReadOptions(), keys); + values = db.multiGetAsList(new ReadOptions(), keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } } catch (final RocksDBException e) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractEventListener.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,334 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.rocksdb.AbstractEventListener.EnabledEventCallback.*; + +/** + * Base class for Event Listeners. + */ +public abstract class AbstractEventListener extends RocksCallbackObject implements EventListener { + public enum EnabledEventCallback { + ON_FLUSH_COMPLETED((byte) 0x0), + ON_FLUSH_BEGIN((byte) 0x1), + ON_TABLE_FILE_DELETED((byte) 0x2), + ON_COMPACTION_BEGIN((byte) 0x3), + ON_COMPACTION_COMPLETED((byte) 0x4), + ON_TABLE_FILE_CREATED((byte) 0x5), + ON_TABLE_FILE_CREATION_STARTED((byte) 0x6), + ON_MEMTABLE_SEALED((byte) 0x7), + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED((byte) 0x8), + ON_EXTERNAL_FILE_INGESTED((byte) 0x9), + ON_BACKGROUND_ERROR((byte) 0xA), + ON_STALL_CONDITIONS_CHANGED((byte) 0xB), + ON_FILE_READ_FINISH((byte) 0xC), + ON_FILE_WRITE_FINISH((byte) 0xD), + ON_FILE_FLUSH_FINISH((byte) 0xE), + ON_FILE_SYNC_FINISH((byte) 0xF), + ON_FILE_RANGE_SYNC_FINISH((byte) 0x10), + ON_FILE_TRUNCATE_FINISH((byte) 0x11), + ON_FILE_CLOSE_FINISH((byte) 0x12), + SHOULD_BE_NOTIFIED_ON_FILE_IO((byte) 0x13), + ON_ERROR_RECOVERY_BEGIN((byte) 0x14), + ON_ERROR_RECOVERY_COMPLETED((byte) 0x15); + + private final byte value; + + EnabledEventCallback(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value + */ + byte getValue() { + return value; + } + + /** + * Get the EnabledEventCallbacks from the internal representation value. + * + * @return the enabled event callback. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static EnabledEventCallback fromValue(final byte value) { + for (final EnabledEventCallback enabledEventCallback : EnabledEventCallback.values()) { + if (enabledEventCallback.value == value) { + return enabledEventCallback; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for EnabledEventCallback: " + value); + } + } + + /** + * Creates an Event Listener that will + * received all callbacks from C++. + * + * If you don't need all callbacks, it is much more efficient to + * just register for the ones you need by calling + * {@link #AbstractEventListener(EnabledEventCallback...)} instead. + */ + protected AbstractEventListener() { + this(ON_FLUSH_COMPLETED, ON_FLUSH_BEGIN, ON_TABLE_FILE_DELETED, ON_COMPACTION_BEGIN, + ON_COMPACTION_COMPLETED, ON_TABLE_FILE_CREATED, ON_TABLE_FILE_CREATION_STARTED, + ON_MEMTABLE_SEALED, ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, ON_EXTERNAL_FILE_INGESTED, + ON_BACKGROUND_ERROR, ON_STALL_CONDITIONS_CHANGED, ON_FILE_READ_FINISH, ON_FILE_WRITE_FINISH, + ON_FILE_FLUSH_FINISH, ON_FILE_SYNC_FINISH, ON_FILE_RANGE_SYNC_FINISH, + ON_FILE_TRUNCATE_FINISH, ON_FILE_CLOSE_FINISH, SHOULD_BE_NOTIFIED_ON_FILE_IO, + ON_ERROR_RECOVERY_BEGIN, ON_ERROR_RECOVERY_COMPLETED); + } + + /** + * Creates an Event Listener that will + * receive only certain callbacks from C++. + * + * @param enabledEventCallbacks callbacks to enable in Java. + */ + protected AbstractEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(packToLong(enabledEventCallbacks)); + } + + /** + * Pack EnabledEventCallbacks to a long. + * + * @param enabledEventCallbacks the flags + * + * @return a long + */ + private static long packToLong(final EnabledEventCallback... 
enabledEventCallbacks) { + long l = 0; + for (int i = 0; i < enabledEventCallbacks.length; i++) { + l |= 1 << enabledEventCallbacks[i].getValue(); + } + return l; + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushCompletedProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushCompleted(db, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushBegin(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushBeginProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushBegin(db, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + // no-op + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionBegin(RocksDB, CompactionJobInfo)}. + * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionBeginProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! 
+ onCompactionBegin(db, compactionJobInfo); + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionCompletedProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onCompactionCompleted(db, compactionJobInfo); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + // no-op + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + // no-op + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + // no-op + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + // no-op + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onExternalFileIngested(RocksDB, ExternalFileIngestionInfo)}. + * + * @param dbHandle native handle of the database + * @param externalFileIngestionInfo the flush job info + */ + private void onExternalFileIngestedProxy( + final long dbHandle, final ExternalFileIngestionInfo externalFileIngestionInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! 
+ onExternalFileIngested(db, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onBackgroundError(BackgroundErrorReason, Status)}. + * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private void onBackgroundErrorProxy(final byte reasonByte, final Status backgroundError) { + onBackgroundError(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + // no-op + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + return true; + } + + /** + * Called from JNI, proxy for + * {@link #onErrorRecoveryBegin(BackgroundErrorReason, Status)}. 
+ * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private boolean onErrorRecoveryBeginProxy(final byte reasonByte, final Status backgroundError) { + return onErrorRecoveryBegin(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + // no-op + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewEventListener(nativeParameterHandles[0]); + } + + /** + * Deletes underlying C++ native callback object pointer + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native long createNewEventListener(final long enabledEventCallbackValues); + private native void disposeInternal(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,7 @@ protected static final String KEY_VALUE_PAIR_SEPARATOR = ";"; protected static final char KEY_VALUE_SEPARATOR = '='; - static final String INT_ARRAY_INT_SEPARATOR = ","; + static final String INT_ARRAY_INT_SEPARATOR = ":"; protected final String[] keys; private final String[] values; @@ -59,6 +59,7 @@ K extends MutableOptionKey> { private final Map> options = new LinkedHashMap<>(); + private final List unknown = new ArrayList<>(); protected abstract U self(); @@ -213,44 +214,147 @@ return ((MutableOptionValue.MutableOptionEnumValue) value).asObject(); } - public U fromString( - final String keyStr, final String 
valueStr) + /** + * Parse a string into a long value, accepting values expressed as a double (such as 9.00) which + * are meant to be a long, not a double + * + * @param value the string containing a value which represents a long + * @return the long value of the parsed string + */ + private long parseAsLong(final String value) { + try { + return Long.parseLong(value); + } catch (NumberFormatException nfe) { + final double doubleValue = Double.parseDouble(value); + if (doubleValue != Math.round(doubleValue)) + throw new IllegalArgumentException("Unable to parse or round " + value + " to int"); + return Math.round(doubleValue); + } + } + + /** + * Parse a string into an int value, accepting values expressed as a double (such as 9.00) which + * are meant to be an int, not a double + * + * @param value the string containing a value which represents an int + * @return the int value of the parsed string + */ + private int parseAsInt(final String value) { + try { + return Integer.parseInt(value); + } catch (NumberFormatException nfe) { + final double doubleValue = Double.parseDouble(value); + if (doubleValue != Math.round(doubleValue)) + throw new IllegalArgumentException("Unable to parse or round " + value + " to long"); + return (int) Math.round(doubleValue); + } + } + + /** + * Constructs a builder for mutable column family options from a hierarchical parsed options + * string representation. The {@link OptionString.Parser} class output has been used to create a + * (name,value)-list; each value may be either a simple string or a (name, value)-list in turn. 
+ * + * @param options a list of parsed option string objects + * @param ignoreUnknown what to do if the key is not one of the keys we expect + * + * @return a builder with the values from the parsed input set + * + * @throws IllegalArgumentException if an option value is of the wrong type, or a key is empty + */ + protected U fromParsed(final List options, final boolean ignoreUnknown) { + Objects.requireNonNull(options); + + for (final OptionString.Entry option : options) { + try { + if (option.key.isEmpty()) { + throw new IllegalArgumentException("options string is invalid: " + option); + } + fromOptionString(option, ignoreUnknown); + } catch (NumberFormatException nfe) { + throw new IllegalArgumentException( + "" + option.key + "=" + option.value + " - not a valid value for its type", nfe); + } + } + + return self(); + } + + /** + * Set a value in the builder from the supplied option string + * + * @param option the option key/value to add to this builder + * @param ignoreUnknown if this is not set, throw an exception when a key is not in the known + * set + * @return the same object, after adding options + * @throws IllegalArgumentException if the key is unkown, or a value has the wrong type/form + */ + private U fromOptionString(final OptionString.Entry option, final boolean ignoreUnknown) throws IllegalArgumentException { - Objects.requireNonNull(keyStr); - Objects.requireNonNull(valueStr); + Objects.requireNonNull(option.key); + Objects.requireNonNull(option.value); + + final K key = allKeys().get(option.key); + if (key == null && ignoreUnknown) { + unknown.add(option); + return self(); + } else if (key == null) { + throw new IllegalArgumentException("Key: " + key + " is not a known option key"); + } - final K key = allKeys().get(keyStr); - switch(key.getValueType()) { + if (!option.value.isList()) { + throw new IllegalArgumentException( + "Option: " + key + " is not a simple value or list, don't know how to parse it"); + } + + // Check that simple values 
are the single item in the array + if (key.getValueType() != MutableOptionKey.ValueType.INT_ARRAY) { + { + if (option.value.list.size() != 1) { + throw new IllegalArgumentException( + "Simple value does not have exactly 1 item: " + option.value.list); + } + } + } + + final List valueStrs = option.value.list; + final String valueStr = valueStrs.get(0); + + switch (key.getValueType()) { case DOUBLE: return setDouble(key, Double.parseDouble(valueStr)); case LONG: - return setLong(key, Long.parseLong(valueStr)); + return setLong(key, parseAsLong(valueStr)); case INT: - return setInt(key, Integer.parseInt(valueStr)); + return setInt(key, parseAsInt(valueStr)); case BOOLEAN: return setBoolean(key, Boolean.parseBoolean(valueStr)); case INT_ARRAY: - final String[] strInts = valueStr - .trim().split(INT_ARRAY_INT_SEPARATOR); - if(strInts == null || strInts.length == 0) { - throw new IllegalArgumentException( - "int array value is not correctly formatted"); - } - - final int value[] = new int[strInts.length]; - int i = 0; - for(final String strInt : strInts) { - value[i++] = Integer.parseInt(strInt); + final int[] value = new int[valueStrs.size()]; + for (int i = 0; i < valueStrs.size(); i++) { + value[i] = Integer.parseInt(valueStrs.get(i)); } return setIntArray(key, value); + + case ENUM: + final CompressionType compressionType = CompressionType.getFromInternal(valueStr); + return setEnum(key, compressionType); + + default: + throw new IllegalStateException(key + " has unknown value type: " + key.getValueType()); } + } - throw new IllegalStateException( - key + " has unknown value type: " + key.getValueType()); + /** + * + * @return the list of keys encountered which were not known to the type being generated + */ + public List getUnknown() { + return new ArrayList<>(unknown); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractNativeReference.java 2025-05-19 16:14:27.000000000 +0000 @@ -67,7 +67,7 @@ @Override @Deprecated protected void finalize() throws Throwable { - if(isOwningHandle()) { + if (isOwningHandle()) { //TODO(AR) log a warning message... developer should have called close() } dispose(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -93,6 +93,12 @@ } @Override + public void refresh() throws RocksDBException { + assert (isOwningHandle()); + refresh0(nativeHandle_); + } + + @Override public void status() throws RocksDBException { assert (isOwningHandle()); status0(nativeHandle_); @@ -118,6 +124,7 @@ abstract void seekToLast0(long handle); abstract void next0(long handle); abstract void prev0(long handle); + abstract void refresh0(long handle) throws RocksDBException; abstract void seek0(long handle, byte[] target, int targetLen); abstract void seekForPrev0(long handle, byte[] target, int targetLen); abstract void seekDirect0(long handle, ByteBuffer target, int targetOffset, int targetLen); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java 2025-05-19 16:14:27.000000000 +0000 @@ -56,7 +56,21 @@ delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } - public void put(ByteBuffer key, ByteBuffer value) throws RocksDBException { + @Override + @Deprecated + public void remove(final ByteBuffer key) throws RocksDBException { + this.delete(key); + } + + @Override + @Deprecated + public void remove(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) + throws RocksDBException { + this.delete(columnFamilyHandle, key); + } + + @Override + public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), 0); @@ -65,8 +79,8 @@ } @Override - public void put(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key, ByteBuffer value) - throws RocksDBException { + public void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); putDirect(nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); @@ -85,6 +99,19 @@ delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } + @Override + public void delete(final ByteBuffer key) throws RocksDBException { + deleteDirect(nativeHandle_, key, key.position(), key.remaining(), 0); + key.position(key.limit()); + } + + @Override + public void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) + throws RocksDBException { + deleteDirect( + nativeHandle_, key, key.position(), key.remaining(), columnFamilyHandle.nativeHandle_); + 
key.position(key.limit()); + } @Override public void singleDelete(byte[] key) throws RocksDBException { @@ -110,19 +137,6 @@ columnFamilyHandle.nativeHandle_); } - public void remove(ByteBuffer key) throws RocksDBException { - removeDirect(nativeHandle_, key, key.position(), key.remaining(), 0); - key.position(key.limit()); - } - - @Override - public void remove(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key) - throws RocksDBException { - removeDirect( - nativeHandle_, key, key.position(), key.remaining(), columnFamilyHandle.nativeHandle_); - key.position(key.limit()); - } - @Override public void putLogData(byte[] blob) throws RocksDBException { putLogData(nativeHandle_, blob, blob.length); @@ -184,13 +198,13 @@ abstract void delete(final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; - abstract void singleDelete(final long handle, final byte[] key, - final int keyLen) throws RocksDBException; + abstract void singleDelete(final long handle, final byte[] key, final int keyLen) + throws RocksDBException; - abstract void singleDelete(final long handle, final byte[] key, - final int keyLen, final long cfHandle) throws RocksDBException; + abstract void singleDelete(final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException; - abstract void removeDirect(final long handle, final ByteBuffer key, final int keyOffset, + abstract void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -301,7 +301,7 @@ * @return the reference to the current options. */ @Experimental("Turning this feature on or off for an existing DB can cause" + - "unexpected LSM tree structure so it's not recommended") + " unexpected LSM tree structure so it's not recommended") T setLevelCompactionDynamicLevelBytes( boolean enableLevelCompactionDynamicLevelBytes); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -461,4 +461,258 @@ * @return the time-to-live. */ long ttl(); + + /** + * Files older than this value will be picked up for compaction, and + * re-written to the same level as they were before. + * One main use of the feature is to make sure a file goes through compaction + * filters periodically. Users can also use the feature to clear up SST + * files using old format. + * + * A file's age is computed by looking at file_creation_time or creation_time + * table properties in order, if they have valid non-zero values; if not, the + * age is based on the file's last modified time (given by the underlying + * Env). + * + * Supported in Level and FIFO compaction. 
+ * In FIFO compaction, this option has the same meaning as TTL and whichever + * stricter will be used. + * Pre-req: max_open_file == -1. + * unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 + * + * Values: + * 0: Turn off Periodic compactions. + * UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature + * as needed. For now, RocksDB will change this value to 30 days + * (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction + * process at least once every 30 days if not compacted sooner. + * In FIFO compaction, since the option has the same meaning as ttl, + * when this value is left default, and ttl is left to 0, 30 days will be + * used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. + * + * Default: 0xfffffffffffffffe (allow RocksDB to auto-tune) + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param periodicCompactionSeconds the periodic compaction in seconds. + * + * @return the reference to the current options. + */ + T setPeriodicCompactionSeconds(final long periodicCompactionSeconds); + + /** + * Get the periodicCompactionSeconds. + * + * See {@link #setPeriodicCompactionSeconds(long)}. + * + * @return the periodic compaction in seconds. + */ + long periodicCompactionSeconds(); + + // + // BEGIN options for blobs (integrated BlobDB) + // + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. 
+ * + * @param enableBlobFiles true iff blob files should be enabled + * + * @return the reference to the current options. + */ + T setEnableBlobFiles(final boolean enableBlobFiles); + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return true iff blob files are enabled + */ + boolean enableBlobFiles(); + + /** + * Set the size of the smallest value to be stored separately in a blob file. Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. + * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param minBlobSize the size of the smallest value to be stored separately in a blob file + * @return the reference to the current options. + */ + T setMinBlobSize(final long minBlobSize); + + /** + * Get the size of the smallest value to be stored separately in a blob file. Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. 
+ * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the current minimum size of value which is stored separately in a blob + */ + long minBlobSize(); + + /** + * Set the size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. Note that enable_blob_files has to be set in + * order for this option to have any effect. + * + * Default: 256 MB + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param blobFileSize the size limit for blob files + * + * @return the reference to the current options. + */ + T setBlobFileSize(final long blobFileSize); + + /** + * The size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. + * + * @return the current size limit for blob files + */ + long blobFileSize(); + + /** + * Set the compression algorithm to use for large values stored in blob files. Note + * that enable_blob_files has to be set in order for this option to have any + * effect. + * + * Default: no compression + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param compressionType the compression algorithm to use. + * + * @return the reference to the current options. + */ + T setBlobCompressionType(CompressionType compressionType); + + /** + * Get the compression algorithm in use for large values stored in blob files. + * Note that enable_blob_files has to be set in order for this option to have any + * effect. + * + * @return the current compression algorithm + */ + CompressionType blobCompressionType(); + + /** + * Enable/disable garbage collection of blobs. Blob GC is performed as part of + * compaction. 
Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @param enableBlobGarbageCollection the new enabled/disabled state of blob garbage collection + * + * @return the reference to the current options. + */ + T setEnableBlobGarbageCollection(final boolean enableBlobGarbageCollection); + + /** + * Query whether garbage collection of blobs is enabled.Blob GC is performed as part of + * compaction. Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @return true iff blob garbage collection is currently enabled. + */ + boolean enableBlobGarbageCollection(); + + /** + * Set cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. + * + * Default: 0.25 + * + * @param blobGarbageCollectionAgeCutoff the new age cutoff + * + * @return the reference to the current options. + */ + T setBlobGarbageCollectionAgeCutoff(double blobGarbageCollectionAgeCutoff); + /** + * Get cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. 
+ * + * Default: 0.25 + * + * @return the current age cutoff for garbage collection + */ + double blobGarbageCollectionAgeCutoff(); + + /** + * If the ratio of garbage in the oldest blob files exceeds this threshold, + * targeted compactions are scheduled in order to force garbage collecting + * the blob files in question, assuming they are all eligible based on the + * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is + * currently only supported with leveled compactions. + * + * Note that {@link #enableBlobGarbageCollection} has to be set in order for this + * option to have any effect. + * + * Default: 1.0 + * + * Dynamically changeable through the SetOptions() API + * + * @param blobGarbageCollectionForceThreshold new value for the threshold + * @return the reference to the current options + */ + T setBlobGarbageCollectionForceThreshold(double blobGarbageCollectionForceThreshold); + + /** + * Get the current value for the {@link #blobGarbageCollectionForceThreshold} + * @return the current threshold at which garbage collection of blobs is forced + */ + double blobGarbageCollectionForceThreshold(); + + // + // END options for blobs (integrated BlobDB) + // } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BackgroundErrorReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +public enum BackgroundErrorReason { + FLUSH((byte) 0x0), + COMPACTION((byte) 0x1), + WRITE_CALLBACK((byte) 0x2), + MEMTABLE((byte) 0x3); + + private final byte value; + + BackgroundErrorReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the BackgroundErrorReason from the internal representation value. + * + * @return the background error reason. + * + * @throws IllegalArgumentException if the value is unknown. + */ + static BackgroundErrorReason fromValue(final byte value) { + for (final BackgroundErrorReason backgroundErrorReason : BackgroundErrorReason.values()) { + if (backgroundErrorReason.value == value) { + return backgroundErrorReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for BackgroundErrorReason: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,7 @@ public BlockBasedTableConfig() { //TODO(AR) flushBlockPolicyFactory cacheIndexAndFilterBlocks = false; - cacheIndexAndFilterBlocksWithHighPriority = false; + cacheIndexAndFilterBlocksWithHighPriority = true; pinL0FilterAndIndexBlocksInCache = false; pinTopLevelIndexAndFilter = true; indexType = IndexType.kBinarySearch; @@ -32,14 +32,16 @@ indexBlockRestartInterval = 1; metadataBlockSize = 4096; partitionFilters = false; + optimizeFiltersForMemory = false; useDeltaEncoding = true; filterPolicy = null; wholeKeyFiltering = true; - 
verifyCompression = true; + verifyCompression = false; readAmpBytesPerBit = 0; - formatVersion = 2; + formatVersion = 5; enableIndexCompression = true; blockAlign = false; + indexShortening = IndexShorteningMode.kShortenSeparators; // NOTE: ONLY used if blockCache == null blockCacheSize = 8 * 1024 * 1024; @@ -77,7 +79,7 @@ /** * Indicates if index and filter blocks will be treated as high-priority in the block cache. - * See note below about applicability. If not specified, defaults to false. + * See note below about applicability. If not specified, defaults to true. * * @return if index and filter blocks will be treated as high-priority. */ @@ -453,6 +455,65 @@ return this; } + /*** + * Option to generate Bloom filters that minimize memory + * internal fragmentation. + * + * See {@link #setOptimizeFiltersForMemory(boolean)}. + * + * @return true if bloom filters are used to minimize memory internal + * fragmentation + */ + @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation") + public boolean optimizeFiltersForMemory() { + return optimizeFiltersForMemory; + } + + /** + * Option to generate Bloom filters that minimize memory + * internal fragmentation. + * + * When false, malloc_usable_size is not available, or format_version < 5, + * filters are generated without regard to internal fragmentation when + * loaded into memory (historical behavior). When true (and + * malloc_usable_size is available and {@link #formatVersion()} >= 5), + * then Bloom filters are generated to "round up" and "round down" their + * sizes to minimize internal fragmentation when loaded into memory, assuming + * the reading DB has the same memory allocation characteristics as the + * generating DB. This option does not break forward or backward + * compatibility. 
+ * + * While individual filters will vary in bits/key and false positive rate + * when setting is true, the implementation attempts to maintain a weighted + * average FP rate for filters consistent with this option set to false. + * + * With Jemalloc for example, this setting is expected to save about 10% of + * the memory footprint and block cache charge of filters, while increasing + * disk usage of filters by about 1-2% due to encoding efficiency losses + * with variance in bits/key. + * + * NOTE: Because some memory counted by block cache might be unmapped pages + * within internal fragmentation, this option can increase observed RSS + * memory usage. With {@link #cacheIndexAndFilterBlocks()} == true, + * this option makes the block cache better at using space it is allowed. + * + * NOTE: Do not set to true if you do not trust malloc_usable_size. With + * this option, RocksDB might access an allocated memory object beyond its + * original size if malloc_usable_size says it is safe to do so. While this + * can be considered bad practice, it should not produce undefined behavior + * unless malloc_usable_size is buggy or broken. + * + * @param optimizeFiltersForMemory true to enable Bloom filters that minimize + * memory internal fragmentation, or false to disable. + * + * @return the reference to the current config. + */ + @Experimental("Option to generate Bloom filters that minimize memory internal fragmentation") + public BlockBasedTableConfig setOptimizeFiltersForMemory(final boolean optimizeFiltersForMemory) { + this.optimizeFiltersForMemory = optimizeFiltersForMemory; + return this; + } + /** * Determine if delta encoding is being used to compress block keys. * @@ -648,10 +709,13 @@ *
  • 4 - Can be read by RocksDB's versions since 5.16. Changes the way we * encode the values in index blocks. If you don't plan to run RocksDB before * version 5.16 and you are using index_block_restart_interval > 1, you should - * probably use this as it would reduce the index size.
  • + * probably use this as it would reduce the index size. + * This option only affects newly written tables. When reading existing + * tables, the information about version is read from the footer. + *
  • 5 - Can be read by RocksDB's versions since 6.6.0. + * Full and partitioned filters use a generally faster and more accurate + * Bloom filter implementation, with a different schema.
  • * - *

    This option only affects newly written tables. When reading existing - * tables, the information about version is read from the footer.

    * * @param formatVersion integer representing the version to be used. * @@ -659,7 +723,7 @@ */ public BlockBasedTableConfig setFormatVersion( final int formatVersion) { - assert(formatVersion >= 0 && formatVersion <= 4); + assert (formatVersion >= 0); this.formatVersion = formatVersion; return this; } @@ -717,6 +781,28 @@ return this; } + /** + * Get the index shortening mode. + * + * @return the index shortening mode. + */ + public IndexShorteningMode indexShortening() { + return indexShortening; + } + + /** + * Set the index shortening mode. + * + * See {@link IndexShorteningMode}. + * + * @param indexShortening the index shortening mode. + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setIndexShortening(final IndexShorteningMode indexShortening) { + this.indexShortening = indexShortening; + return this; + } /** * Get the size of the cache in bytes that will be used by RocksDB. @@ -900,54 +986,35 @@ } return newTableFactoryHandle(cacheIndexAndFilterBlocks, - cacheIndexAndFilterBlocksWithHighPriority, - pinL0FilterAndIndexBlocksInCache, pinTopLevelIndexAndFilter, - indexType.getValue(), dataBlockIndexType.getValue(), - dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, - blockCacheHandle, persistentCacheHandle, blockCacheCompressedHandle, - blockSize, blockSizeDeviation, blockRestartInterval, - indexBlockRestartInterval, metadataBlockSize, partitionFilters, - useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, - verifyCompression, readAmpBytesPerBit, formatVersion, - enableIndexCompression, blockAlign, - blockCacheSize, blockCacheNumShardBits, + cacheIndexAndFilterBlocksWithHighPriority, pinL0FilterAndIndexBlocksInCache, + pinTopLevelIndexAndFilter, indexType.getValue(), dataBlockIndexType.getValue(), + dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, blockCacheHandle, + persistentCacheHandle, blockCacheCompressedHandle, blockSize, blockSizeDeviation, + blockRestartInterval, 
indexBlockRestartInterval, metadataBlockSize, partitionFilters, + optimizeFiltersForMemory, useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, + verifyCompression, readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign, + indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits, blockCacheCompressedSize, blockCacheCompressedNumShardBits); } - private native long newTableFactoryHandle( - final boolean cacheIndexAndFilterBlocks, + private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks, final boolean cacheIndexAndFilterBlocksWithHighPriority, - final boolean pinL0FilterAndIndexBlocksInCache, - final boolean pinTopLevelIndexAndFilter, - final byte indexTypeValue, - final byte dataBlockIndexTypeValue, - final double dataBlockHashTableUtilRatio, - final byte checksumTypeValue, - final boolean noBlockCache, - final long blockCacheHandle, - final long persistentCacheHandle, - final long blockCacheCompressedHandle, - final long blockSize, - final int blockSizeDeviation, - final int blockRestartInterval, - final int indexBlockRestartInterval, - final long metadataBlockSize, - final boolean partitionFilters, - final boolean useDeltaEncoding, - final long filterPolicyHandle, - final boolean wholeKeyFiltering, - final boolean verifyCompression, - final int readAmpBytesPerBit, - final int formatVersion, - final boolean enableIndexCompression, - final boolean blockAlign, + final boolean pinL0FilterAndIndexBlocksInCache, final boolean pinTopLevelIndexAndFilter, + final byte indexTypeValue, final byte dataBlockIndexTypeValue, + final double dataBlockHashTableUtilRatio, final byte checksumTypeValue, + final boolean noBlockCache, final long blockCacheHandle, final long persistentCacheHandle, + final long blockCacheCompressedHandle, final long blockSize, final int blockSizeDeviation, + final int blockRestartInterval, final int indexBlockRestartInterval, + final long metadataBlockSize, final boolean partitionFilters, + final 
boolean optimizeFiltersForMemory, final boolean useDeltaEncoding, + final long filterPolicyHandle, final boolean wholeKeyFiltering, + final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion, + final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening, - @Deprecated final long blockCacheSize, - @Deprecated final int blockCacheNumShardBits, + @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits, @Deprecated final long blockCacheCompressedSize, - @Deprecated final int blockCacheCompressedNumShardBits - ); + @Deprecated final int blockCacheCompressedNumShardBits); //TODO(AR) flushBlockPolicyFactory private boolean cacheIndexAndFilterBlocks; @@ -968,6 +1035,7 @@ private int indexBlockRestartInterval; private long metadataBlockSize; private boolean partitionFilters; + private boolean optimizeFiltersForMemory; private boolean useDeltaEncoding; private Filter filterPolicy; private boolean wholeKeyFiltering; @@ -976,6 +1044,7 @@ private int formatVersion; private boolean enableIndexCompression; private boolean blockAlign; + private IndexShorteningMode indexShortening; // NOTE: ONLY used if blockCache == null @Deprecated private long blockCacheSize; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +package org.rocksdb; + +import java.nio.ByteBuffer; +import java.util.List; + +/** + * A ByteBuffer containing fetched data, together with a result for the fetch + * and the total size of the object fetched. 
+ * + * Used for the individual results of + * {@link RocksDB#multiGetByteBuffers(List, List)} + * {@link RocksDB#multiGetByteBuffers(List, List, List)} + * {@link RocksDB#multiGetByteBuffers(ReadOptions, List, List)} + * {@link RocksDB#multiGetByteBuffers(ReadOptions, List, List, List)} + */ +public class ByteBufferGetStatus { + public final Status status; + public final int requiredSize; + public final ByteBuffer value; + + /** + * Constructor used for success status, when the value is contained in the buffer + * + * @param status the status of the request to fetch into the buffer + * @param requiredSize the size of the data, which may be bigger than the buffer + * @param value the buffer containing as much of the value as fits + */ + ByteBufferGetStatus(final Status status, final int requiredSize, final ByteBuffer value) { + this.status = status; + this.requiredSize = requiredSize; + this.value = value; + } + + /** + * Constructor used for a failure status, when no value is filled in + * + * @param status the status of the request to fetch into the buffer + */ + ByteBufferGetStatus(final Status status) { + this.status = status; + this.requiredSize = 0; + this.value = null; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Cache.java 2025-05-19 16:14:27.000000000 +0000 @@ -10,4 +10,31 @@ protected Cache(final long nativeHandle) { super(nativeHandle); } + + /** + * Returns the memory size for the entries + * residing in cache. + * + * @return cache usage size. 
+ * + */ + public long getUsage() { + assert (isOwningHandle()); + return getUsage(this.nativeHandle_); + } + + /** + * Returns the memory size for the entries + * being pinned in cache. + * + * @return cache pinned usage size. + * + */ + public long getPinnedUsage() { + assert (isOwningHandle()); + return getPinnedUsage(this.nativeHandle_); + } + + private native static long getUsage(final long handle); + private native static long getPinnedUsage(final long handle); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java 2025-05-19 16:14:27.000000000 +0000 @@ -20,7 +20,11 @@ /** * XX Hash */ - kxxHash((byte) 2); + kxxHash((byte) 2), + /** + * XX Hash 64 + */ + kxxHash64((byte) 3); /** * Returns the byte value of the enumerations value diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,12 @@ * ColumnFamily Pointers. */ public class ColumnFamilyHandle extends RocksObject { + /** + * Constructs column family Java object, which operates on underlying native object. 
+ * + * @param rocksDB db instance associated with this column family + * @param nativeHandle native handle to underlying native ColumnFamily object + */ ColumnFamilyHandle(final RocksDB rocksDB, final long nativeHandle) { super(nativeHandle); @@ -25,6 +31,28 @@ } /** + * Constructor called only from JNI. + * + * NOTE: we are producing an additional Java Object here to represent the underlying native C++ + * ColumnFamilyHandle object. The underlying object is not owned by ourselves. The Java API user + * likely already had a ColumnFamilyHandle Java object which owns the underlying C++ object, as + * they will have been presented it when they opened the database or added a Column Family. + * + * + * TODO(AR) - Potentially a better design would be to cache the active Java Column Family Objects + * in RocksDB, and return the same Java Object instead of instantiating a new one here. This could + * also help us to improve the Java API semantics for Java users. See for example + * https://github.com/facebook/rocksdb/issues/2687. + * + * @param nativeHandle native handle to the column family. + */ + ColumnFamilyHandle(final long nativeHandle) { + super(nativeHandle); + rocksDB_ = null; + disOwnNativeHandle(); + } + + /** * Gets the name of the Column Family. * * @return The name of the Column Family. @@ -32,6 +60,7 @@ * @throws RocksDBException if an error occurs whilst retrieving the name. */ public byte[] getName() throws RocksDBException { + assert(isOwningHandle() || isDefaultColumnFamily()); return getName(nativeHandle_); } @@ -41,6 +70,7 @@ * @return the ID of the Column Family. */ public int getID() { + assert(isOwningHandle() || isDefaultColumnFamily()); return getID(nativeHandle_); } @@ -59,7 +89,7 @@ * descriptor. 
*/ public ColumnFamilyDescriptor getDescriptor() throws RocksDBException { - assert(isOwningHandle()); + assert(isOwningHandle() || isDefaultColumnFamily()); return getDescriptor(nativeHandle_); } @@ -85,12 +115,18 @@ @Override public int hashCode() { try { - return Objects.hash(getName(), getID(), rocksDB_.nativeHandle_); + int result = Objects.hash(getID(), rocksDB_.nativeHandle_); + result = 31 * result + Arrays.hashCode(getName()); + return result; } catch (RocksDBException e) { throw new RuntimeException("Cannot calculate hash code of column family handle", e); } } + protected boolean isDefaultColumnFamily() { + return nativeHandle_ == rocksDB_.getDefaultColumnFamily().nativeHandle_; + } + /** *

    Deletes underlying C++ iterator pointer.

    * diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,9 +5,8 @@ package org.rocksdb; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; +import java.nio.file.Paths; +import java.util.*; /** * ColumnFamilyOptions to control the behavior of a database. It will be used @@ -52,6 +51,8 @@ this.compactionOptionsFIFO_ = other.compactionOptionsFIFO_; this.bottommostCompressionOptions_ = other.bottommostCompressionOptions_; this.compressionOptions_ = other.compressionOptions_; + this.compactionThreadLimiter_ = other.compactionThreadLimiter_; + this.sstPartitionerFactory_ = other.sstPartitionerFactory_; } /** @@ -96,20 +97,40 @@ */ public static ColumnFamilyOptions getColumnFamilyOptionsFromProps( final Properties properties) { - if (properties == null || properties.size() == 0) { - throw new IllegalArgumentException( - "Properties value must contain at least one value."); - } ColumnFamilyOptions columnFamilyOptions = null; - StringBuilder stringBuilder = new StringBuilder(); - for (final String name : properties.stringPropertyNames()){ - stringBuilder.append(name); - stringBuilder.append("="); - stringBuilder.append(properties.getProperty(name)); - stringBuilder.append(";"); + final long handle = + getColumnFamilyOptionsFromProps(Options.getOptionStringFromProps(properties)); + if (handle != 0) { + columnFamilyOptions = new ColumnFamilyOptions(handle); } - long handle = getColumnFamilyOptionsFromProps( - stringBuilder.toString()); + return columnFamilyOptions; + } + + /** + *

    Method to get a options instance by using pre-configured + * property values. If one or many values are undefined in + * the context of RocksDB the method will return a null + * value.

    + * + *

    Note: Property keys can be derived from + * getter methods within the options class. Example: the method + * {@code writeBufferSize()} has a property key: + * {@code write_buffer_size}.

    + * + * @param cfgOpts ConfigOptions controlling how the properties are parsed. + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.ColumnFamilyOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link Properties} instance is passed to the method call. + */ + public static ColumnFamilyOptions getColumnFamilyOptionsFromProps( + final ConfigOptions cfgOpts, final Properties properties) { + ColumnFamilyOptions columnFamilyOptions = null; + final long handle = getColumnFamilyOptionsFromProps( + cfgOpts.nativeHandle_, Options.getOptionStringFromProps(properties)); if (handle != 0){ columnFamilyOptions = new ColumnFamilyOptions(handle); } @@ -117,12 +138,24 @@ } @Override + public ColumnFamilyOptions oldDefaults(final int majorVersion, final int minorVersion) { + oldDefaults(nativeHandle_, majorVersion, minorVersion); + return this; + } + + @Override public ColumnFamilyOptions optimizeForSmallDb() { optimizeForSmallDb(nativeHandle_); return this; } @Override + public ColumnFamilyOptions optimizeForSmallDb(final Cache cache) { + optimizeForSmallDb(nativeHandle_, cache.getNativeHandle()); + return this; + } + + @Override public ColumnFamilyOptions optimizeForPointLookup( final long blockCacheSizeMb) { optimizeForPointLookup(nativeHandle_, @@ -307,7 +340,7 @@ final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } @@ -576,6 +609,45 @@ } @Override + public ColumnFamilyOptions setCfPaths(final Collection cfPaths) { + assert (isOwningHandle()); + + final int len = cfPaths.size(); + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + int i = 0; + for (final DbPath dbPath : 
cfPaths) { + paths[i] = dbPath.path.toString(); + targetSizes[i] = dbPath.targetSize; + i++; + } + setCfPaths(nativeHandle_, paths, targetSizes); + return this; + } + + @Override + public List cfPaths() { + final int len = (int) cfPathsLen(nativeHandle_); + + if (len == 0) { + return Collections.emptyList(); + } + + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + cfPaths(nativeHandle_, paths, targetSizes); + + final List cfPaths = new ArrayList<>(); + for (int i = 0; i < len; i++) { + cfPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i])); + } + + return cfPaths; + } + + @Override public ColumnFamilyOptions setInplaceUpdateSupport( final boolean inplaceUpdateSupport) { setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); @@ -787,6 +859,17 @@ } @Override + public ColumnFamilyOptions setPeriodicCompactionSeconds(final long periodicCompactionSeconds) { + setPeriodicCompactionSeconds(nativeHandle_, periodicCompactionSeconds); + return this; + } + + @Override + public long periodicCompactionSeconds() { + return periodicCompactionSeconds(nativeHandle_); + } + + @Override public ColumnFamilyOptions setCompactionOptionsUniversal( final CompactionOptionsUniversal compactionOptionsUniversal) { setCompactionOptionsUniversal(nativeHandle_, @@ -824,8 +907,304 @@ return forceConsistencyChecks(nativeHandle_); } + @Override + public ColumnFamilyOptions setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) { + setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_); + this.sstPartitionerFactory_ = sstPartitionerFactory; + return this; + } + + @Override + public ColumnFamilyOptions setCompactionThreadLimiter( + final ConcurrentTaskLimiter compactionThreadLimiter) { + setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_); + this.compactionThreadLimiter_ = compactionThreadLimiter; + return this; + } + + @Override + public ConcurrentTaskLimiter 
compactionThreadLimiter() { + assert (isOwningHandle()); + return this.compactionThreadLimiter_; + } + + @Override + public SstPartitionerFactory sstPartitionerFactory() { + return sstPartitionerFactory_; + } + + // + // BEGIN options for blobs (integrated BlobDB) + // + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param enableBlobFiles true iff blob files should be enabled + * + * @return the reference to the current options. + */ + @Override + public ColumnFamilyOptions setEnableBlobFiles(final boolean enableBlobFiles) { + setEnableBlobFiles(nativeHandle_, enableBlobFiles); + return this; + } + + /** + * When set, large values (blobs) are written to separate blob files, and only + * pointers to them are stored in SST files. This can reduce write amplification + * for large-value use cases at the cost of introducing a level of indirection + * for reads. See also the options min_blob_size, blob_file_size, + * blob_compression_type, enable_blob_garbage_collection, and + * blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return true iff blob files are currently enabled + */ + public boolean enableBlobFiles() { + return enableBlobFiles(nativeHandle_); + } + + /** + * Set the size of the smallest value to be stored separately in a blob file. 
Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. + * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param minBlobSize the size of the smallest value to be stored separately in a blob file + * @return these options, updated with the supplied minimum blob size value + */ + @Override + public ColumnFamilyOptions setMinBlobSize(final long minBlobSize) { + setMinBlobSize(nativeHandle_, minBlobSize); + return this; + } + + /** + * Get the size of the smallest value to be stored separately in a blob file. Values + * which have an uncompressed size smaller than this threshold are stored + * alongside the keys in SST files in the usual fashion. A value of zero for + * this option means that all values are stored in blob files. Note that + * enable_blob_files has to be set in order for this option to have any effect. + * + * Default: 0 + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the current minimum blob size + */ + @Override + public long minBlobSize() { + return minBlobSize(nativeHandle_); + } + + /** + * Set the size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. Note that enable_blob_files has to be set in + * order for this option to have any effect. + * + * Default: 256 MB + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param blobFileSize the new size limit for blob files + * + * @return the reference to the current options. 
+ */ + @Override + public ColumnFamilyOptions setBlobFileSize(final long blobFileSize) { + setBlobFileSize(nativeHandle_, blobFileSize); + return this; + } + + /** + * Get the size limit for blob files. When writing blob files, a new file is opened + * once this limit is reached. Note that enable_blob_files has to be set in + * order for this option to have any effect. + * + * Default: 256 MB + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the size limit for blob files + */ + @Override + public long blobFileSize() { + return blobFileSize(nativeHandle_); + } + + /** + * Set the compression algorithm to use for large values stored in blob files. Note + * that enable_blob_files has to be set in order for this option to have any + * effect. + * + * Default: no compression + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param compressionType the compression algorithm to use + * + * @return the reference to the current options. + */ + @Override + public ColumnFamilyOptions setBlobCompressionType(final CompressionType compressionType) { + setBlobCompressionType(nativeHandle_, compressionType.getValue()); + return this; + } + + /** + * Get the compression algorithm to use for large values stored in blob files. Note + * that enable_blob_files has to be set in order for this option to have any + * effect. + * + * Default: no compression + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @return the compression algorithm currently in use for blobs + */ + @Override + public CompressionType blobCompressionType() { + return CompressionType.values()[blobCompressionType(nativeHandle_)]; + } + + /** + * Enable/disable garbage collection of blobs. Blob GC is performed as part of + * compaction. 
Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @param enableBlobGarbageCollection true iff blob garbage collection is to be enabled + * + * @return the reference to the current options. + */ + @Override + public ColumnFamilyOptions setEnableBlobGarbageCollection( + final boolean enableBlobGarbageCollection) { + setEnableBlobGarbageCollection(nativeHandle_, enableBlobGarbageCollection); + return this; + } + + /** + * Get enabled/disables state for garbage collection of blobs. Blob GC is performed as part of + * compaction. Valid blobs residing in blob files older than a cutoff get + * relocated to new files as they are encountered during compaction, which makes + * it possible to clean up blob files once they contain nothing but + * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + * + * Default: false + * + * @return true iff blob garbage collection is currently enabled + */ + @Override + public boolean enableBlobGarbageCollection() { + return enableBlobGarbageCollection(nativeHandle_); + } + + /** + * Set the cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. + * + * Default: 0.25 + * + * @param blobGarbageCollectionAgeCutoff the new blob garbage collection age cutoff + * + * @return the reference to the current options. 
+ */ + @Override + public ColumnFamilyOptions setBlobGarbageCollectionAgeCutoff( + final double blobGarbageCollectionAgeCutoff) { + setBlobGarbageCollectionAgeCutoff(nativeHandle_, blobGarbageCollectionAgeCutoff); + return this; + } + + /** + * Get the cutoff in terms of blob file age for garbage collection. Blobs in the + * oldest N blob files will be relocated when encountered during compaction, + * where N = garbage_collection_cutoff * number_of_blob_files. Note that + * enable_blob_garbage_collection has to be set in order for this option to have + * any effect. + * + * Default: 0.25 + * + * @return the current blob garbage collection age cutoff + */ + @Override + public double blobGarbageCollectionAgeCutoff() { + return blobGarbageCollectionAgeCutoff(nativeHandle_); + } + + /** + * If the ratio of garbage in the oldest blob files exceeds this threshold, + * targeted compactions are scheduled in order to force garbage collecting + * the blob files in question, assuming they are all eligible based on the + * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is + * currently only supported with leveled compactions. + * + * Note that {@link #enableBlobGarbageCollection} has to be set in order for this + * option to have any effect. 
+ * + * Default: 1.0 + * + * Dynamically changeable through the SetOptions() API + * + * @param blobGarbageCollectionForceThreshold new value for the threshold + * @return the reference to the current options + */ + @Override + public ColumnFamilyOptions setBlobGarbageCollectionForceThreshold( + final double blobGarbageCollectionForceThreshold) { + setBlobGarbageCollectionForceThreshold(nativeHandle_, blobGarbageCollectionForceThreshold); + return this; + } + + /** + * Get the current value for the {@link #blobGarbageCollectionForceThreshold} + * @return the current threshold at which garbage collection of blobs is forced + */ + @Override + public double blobGarbageCollectionForceThreshold() { + return blobGarbageCollectionForceThreshold(nativeHandle_); + } + + // + // END options for blobs (integrated BlobDB) + // + private static native long getColumnFamilyOptionsFromProps( - String optString); + final long cfgHandle, String optString); + private static native long getColumnFamilyOptionsFromProps(final String optString); private static native long newColumnFamilyOptions(); private static native long copyColumnFamilyOptions(final long handle); @@ -833,7 +1212,10 @@ final long optionsHandle); @Override protected final native void disposeInternal(final long handle); + private static native void oldDefaults( + final long handle, final int majorVersion, final int minorVersion); private native void optimizeForSmallDb(final long handle); + private static native void optimizeForSmallDb(final long handle, final long cacheHandle); private native void optimizeForPointLookup(long handle, long blockCacheSizeMb); private native void optimizeLevelStyleCompaction(long handle, @@ -922,6 +1304,11 @@ private native String memTableFactoryName(long handle); private native void setTableFactory(long handle, long factoryHandle); private native String tableFactoryName(long handle); + private static native void setCfPaths( + final long handle, final String[] paths, final long[] 
targetSizes); + private static native long cfPathsLen(final long handle); + private static native void cfPaths( + final long handle, final String[] paths, final long[] targetSizes); private native void setInplaceUpdateSupport( long handle, boolean inplaceUpdateSupport); private native boolean inplaceUpdateSupport(long handle); @@ -977,6 +1364,9 @@ private native boolean reportBgIoStats(final long handle); private native void setTtl(final long handle, final long ttl); private native long ttl(final long handle); + private native void setPeriodicCompactionSeconds( + final long handle, final long periodicCompactionSeconds); + private native long periodicCompactionSeconds(final long handle); private native void setCompactionOptionsUniversal(final long handle, final long compactionOptionsUniversalHandle); private native void setCompactionOptionsFIFO(final long handle, @@ -984,6 +1374,27 @@ private native void setForceConsistencyChecks(final long handle, final boolean forceConsistencyChecks); private native boolean forceConsistencyChecks(final long handle); + private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); + private static native void setCompactionThreadLimiter( + final long nativeHandle_, final long compactionThreadLimiterHandle); + + private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); + private native boolean enableBlobFiles(final long nativeHandle_); + private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); + private native long minBlobSize(final long nativeHandle_); + private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); + private native long blobFileSize(final long nativeHandle_); + private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType); + private native byte blobCompressionType(final long nativeHandle_); + private native void setEnableBlobGarbageCollection( + final long 
nativeHandle_, final boolean enableBlobGarbageCollection); + private native boolean enableBlobGarbageCollection(final long nativeHandle_); + private native void setBlobGarbageCollectionAgeCutoff( + final long nativeHandle_, final double blobGarbageCollectionAgeCutoff); + private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); + private native void setBlobGarbageCollectionForceThreshold( + final long nativeHandle_, final double blobGarbageCollectionForceThreshold); + private native double blobGarbageCollectionForceThreshold(final long nativeHandle_); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! @@ -997,5 +1408,6 @@ private CompactionOptionsFIFO compactionOptionsFIFO_; private CompressionOptions bottommostCompressionOptions_; private CompressionOptions compressionOptions_; - + private SstPartitionerFactory sstPartitionerFactory_; + private ConcurrentTaskLimiter compactionThreadLimiter_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,9 +5,22 @@ package org.rocksdb; +import java.util.Collection; +import java.util.List; + public interface ColumnFamilyOptionsInterface> extends AdvancedColumnFamilyOptionsInterface { /** + * The function recovers options to a previous version. Only 4.6 or later + * versions are supported. 
+ * + * @param majorVersion The major version to recover default values of options + * @param minorVersion The minor version to recover default values of options + * @return the instance of the current object. + */ + T oldDefaults(int majorVersion, int minorVersion); + + /** * Use this if your DB is very small (like under 1GB) and you don't want to * spend lots of memory for memtables. * @@ -16,6 +29,16 @@ T optimizeForSmallDb(); /** + * Some functions that make it easier to optimize RocksDB + * Use this if your DB is very small (like under 1GB) and you don't want to + * spend lots of memory for memtables. + * + * @param cache An optional cache object is passed in to be used as the block cache + * @return the instance of the current object. + */ + T optimizeForSmallDb(Cache cache); + + /** * Use this if you don't need to keep the data sorted, i.e. you'll never use * an iterator, only Put() and Get() API calls * @@ -372,6 +395,30 @@ String tableFactoryName(); /** + * A list of paths where SST files for this column family + * can be put into, with its target size. Similar to db_paths, + * newer data is placed into paths specified earlier in the + * vector while older data gradually moves to paths specified + * later in the vector. + * Note that, if a path is supplied to multiple column + * families, it would have files and total size from all + * the column families combined. User should provision for the + * total size(from all the column families) in such cases. + * + * If left empty, db_paths will be used. + * Default: empty + * + * @param paths collection of paths for SST files. + * @return the reference of the current options. + */ + T setCfPaths(final Collection paths); + + /** + * @return collection of paths for SST files. + */ + List cfPaths(); + + /** * Compression algorithm that will be used for the bottommost level that * contain files. If level-compaction is used, this option will only affect * levels after base level. 
@@ -438,6 +485,46 @@ CompressionOptions compressionOptions(); /** + * If non-nullptr, use the specified factory for a function to determine the + * partitioning of sst files. This helps compaction to split the files + * on interesting boundaries (key prefixes) to make propagation of sst + * files less write amplifying (covering the whole key space). + * + * Default: nullptr + * + * @param factory The factory reference + * @return the reference of the current options. + */ + @Experimental("Caution: this option is experimental") + T setSstPartitionerFactory(SstPartitionerFactory factory); + + /** + * Get SST partitioner factory + * + * @return SST partitioner factory + */ + @Experimental("Caution: this option is experimental") + SstPartitionerFactory sstPartitionerFactory(); + + /** + * Compaction concurrent thread limiter for the column family. + * If non-nullptr, use given concurrent thread limiter to control + * the max outstanding compaction tasks. Limiter can be shared with + * multiple column families across db instances. + * + * @param concurrentTaskLimiter The compaction thread limiter. + * @return the reference of the current options. + */ + T setCompactionThreadLimiter(ConcurrentTaskLimiter concurrentTaskLimiter); + + /** + * Get compaction thread limiter + * + * @return Compaction thread limiter + */ + ConcurrentTaskLimiter compactionThreadLimiter(); + + /** * Default memtable memory budget used with the following methods: * *
      diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -15,13 +15,14 @@ private final static byte VALUE_kIfHaveCompactionFilter = 1; private final static byte VALUE_kForce = 2; - // For level based compaction, we can configure if we want to skip/force bottommost level compaction. - // The order of this neum MUST follow the C++ layer. See BottommostLevelCompaction in db/options.h + // For level based compaction, we can configure if we want to skip/force bottommost level + // compaction. The order of this enum MUST follow the C++ layer. See BottommostLevelCompaction in + // db/options.h public enum BottommostLevelCompaction { /** * Skip bottommost level compaction */ - kSkip((byte)VALUE_kSkip), + kSkip(VALUE_kSkip), /** * Only compact bottommost level if there is a compaction filter. This is the default option */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -20,6 +20,8 @@ */ private CompactionJobInfo(final long nativeHandle) { super(nativeHandle); + // We do not own the native object! 
+ disOwnNativeHandle(); } /** diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -78,7 +78,17 @@ /** * Compaction caused by external sst file ingestion */ - kExternalSstIngestion((byte)0x0D); + kExternalSstIngestion((byte) 0x0D), + + /** + * Compaction due to SST file being too old + */ + kPeriodicCompaction((byte) 0x0E), + + /** + * Compaction in order to move files to temperature + */ + kChangeTemperature((byte) 0x0F); private final byte value; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java 2025-05-19 16:14:27.000000000 +0000 @@ -14,16 +14,15 @@ * compression method (if any) is used to compress a block.

      */ public enum CompressionType { - - NO_COMPRESSION((byte) 0x0, null), - SNAPPY_COMPRESSION((byte) 0x1, "snappy"), - ZLIB_COMPRESSION((byte) 0x2, "z"), - BZLIB2_COMPRESSION((byte) 0x3, "bzip2"), - LZ4_COMPRESSION((byte) 0x4, "lz4"), - LZ4HC_COMPRESSION((byte) 0x5, "lz4hc"), - XPRESS_COMPRESSION((byte) 0x6, "xpress"), - ZSTD_COMPRESSION((byte)0x7, "zstd"), - DISABLE_COMPRESSION_OPTION((byte)0x7F, null); + NO_COMPRESSION((byte) 0x0, null, "kNoCompression"), + SNAPPY_COMPRESSION((byte) 0x1, "snappy", "kSnappyCompression"), + ZLIB_COMPRESSION((byte) 0x2, "z", "kZlibCompression"), + BZLIB2_COMPRESSION((byte) 0x3, "bzip2", "kBZip2Compression"), + LZ4_COMPRESSION((byte) 0x4, "lz4", "kLZ4Compression"), + LZ4HC_COMPRESSION((byte) 0x5, "lz4hc", "kLZ4HCCompression"), + XPRESS_COMPRESSION((byte) 0x6, "xpress", "kXpressCompression"), + ZSTD_COMPRESSION((byte) 0x7, "zstd", "kZSTD"), + DISABLE_COMPRESSION_OPTION((byte) 0x7F, null, "kDisableCompressionOption"); /** *

      Get the CompressionType enumeration value by @@ -71,6 +70,27 @@ } /** + *

      Get a CompressionType value based on the string key in the C++ options output. + * This gets used in support of getting options into Java from an options string, + * which is generated at the C++ level. + *

      + * + * @param internalName the internal (C++) name by which the option is known. + * + * @return CompressionType instance (optional) + */ + static CompressionType getFromInternal(final String internalName) { + for (final CompressionType compressionType : CompressionType.values()) { + if (compressionType.internalName_.equals(internalName)) { + return compressionType; + } + } + + throw new IllegalArgumentException( + "Illegal internalName '" + internalName + " ' provided for CompressionType."); + } + + /** *

      Returns the byte value of the enumerations value.

      * * @return byte representation @@ -89,11 +109,13 @@ return libraryName_; } - CompressionType(final byte value, final String libraryName) { + CompressionType(final byte value, final String libraryName, final String internalName) { value_ = value; libraryName_ = libraryName; + internalName_ = internalName; } private final byte value_; private final String libraryName_; + private final String internalName_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiter.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +package org.rocksdb; + +public abstract class ConcurrentTaskLimiter extends RocksObject { + protected ConcurrentTaskLimiter(final long nativeHandle) { + super(nativeHandle); + } + + /** + * Returns a name that identifies this concurrent task limiter. + * + * @return Concurrent task limiter name. + */ + public abstract String name(); + + /** + * Set max concurrent tasks.
      + * limit = 0 means no new task allowed.
      + * limit < 0 means no limitation. + * + * @param maxOutstandinsTask max concurrent tasks. + * @return the reference to the current instance of ConcurrentTaskLimiter. + */ + public abstract ConcurrentTaskLimiter setMaxOutstandingTask(final int maxOutstandinsTask); + + /** + * Reset to unlimited max concurrent task. + * + * @return the reference to the current instance of ConcurrentTaskLimiter. + */ + public abstract ConcurrentTaskLimiter resetMaxOutstandingTask(); + + /** + * Returns current outstanding task count. + * + * @return current outstanding task count. + */ + public abstract int outstandingTask(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConcurrentTaskLimiterImpl.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,42 @@ +package org.rocksdb; + +public class ConcurrentTaskLimiterImpl extends ConcurrentTaskLimiter { + public ConcurrentTaskLimiterImpl(final String name, final int maxOutstandingTask) { + super(newConcurrentTaskLimiterImpl0(name, maxOutstandingTask)); + } + + @Override + public String name() { + assert (isOwningHandle()); + return name(nativeHandle_); + } + + @Override + public ConcurrentTaskLimiter setMaxOutstandingTask(final int maxOutstandingTask) { + assert (isOwningHandle()); + setMaxOutstandingTask(nativeHandle_, maxOutstandingTask); + return this; + } + + @Override + public ConcurrentTaskLimiter resetMaxOutstandingTask() { + assert (isOwningHandle()); + resetMaxOutstandingTask(nativeHandle_); + return this; + } + + @Override + public int outstandingTask() { + assert (isOwningHandle()); + return outstandingTask(nativeHandle_); + } + + private 
static native long newConcurrentTaskLimiterImpl0( + final String name, final int maxOutstandingTask); + private static native String name(final long handle); + private static native void setMaxOutstandingTask(final long handle, final int limit); + private static native void resetMaxOutstandingTask(final long handle); + private static native int outstandingTask(final long handle); + + @Override protected final native void disposeInternal(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ConfigOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,47 @@ +package org.rocksdb; + +public class ConfigOptions extends RocksObject { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct with default Options + */ + public ConfigOptions() { + super(newConfigOptions()); + } + + public ConfigOptions setDelimiter(final String delimiter) { + setDelimiter(nativeHandle_, delimiter); + return this; + } + public ConfigOptions setIgnoreUnknownOptions(final boolean ignore) { + setIgnoreUnknownOptions(nativeHandle_, ignore); + return this; + } + + public ConfigOptions setEnv(final Env env) { + setEnv(nativeHandle_, env.nativeHandle_); + return this; + } + + public ConfigOptions setInputStringsEscaped(final boolean escaped) { + setInputStringsEscaped(nativeHandle_, escaped); + return this; + } + + public ConfigOptions setSanityLevel(final SanityLevel level) { + setSanityLevel(nativeHandle_, level.getValue()); + return this; + } + + @Override protected final native void disposeInternal(final long handle); + + private native static long newConfigOptions(); + private native static void setEnv(final long handle, 
final long envHandle); + private native static void setDelimiter(final long handle, final String delimiter); + private native static void setIgnoreUnknownOptions(final long handle, final boolean ignore); + private native static void setInputStringsEscaped(final long handle, final boolean escaped); + private native static void setSanityLevel(final long handle, final byte level); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -71,6 +71,7 @@ * {@code allowMmapReads()} has a property key: * {@code allow_mmap_reads}.

      * + * @param cfgOpts The ConfigOptions to control how the string is processed. * @param properties {@link java.util.Properties} instance. * * @return {@link org.rocksdb.DBOptions instance} @@ -80,22 +81,40 @@ * {@link java.util.Properties} instance is passed to the method call. */ public static DBOptions getDBOptionsFromProps( - final Properties properties) { - if (properties == null || properties.size() == 0) { - throw new IllegalArgumentException( - "Properties value must contain at least one value."); - } + final ConfigOptions cfgOpts, final Properties properties) { DBOptions dbOptions = null; - StringBuilder stringBuilder = new StringBuilder(); - for (final String name : properties.stringPropertyNames()){ - stringBuilder.append(name); - stringBuilder.append("="); - stringBuilder.append(properties.getProperty(name)); - stringBuilder.append(";"); + final String optionsString = Options.getOptionStringFromProps(properties); + final long handle = getDBOptionsFromProps(cfgOpts.nativeHandle_, optionsString); + if (handle != 0) { + dbOptions = new DBOptions(handle); } - long handle = getDBOptionsFromProps( - stringBuilder.toString()); - if (handle != 0){ + return dbOptions; + } + + /** + *

      Method to get a options instance by using pre-configured + * property values. If one or many values are undefined in + * the context of RocksDB the method will return a null + * value.

      + * + *

      Note: Property keys can be derived from + * getter methods within the options class. Example: the method + * {@code allowMmapReads()} has a property key: + * {@code allow_mmap_reads}.

      + * + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.DBOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link java.util.Properties} instance is passed to the method call. + */ + public static DBOptions getDBOptionsFromProps(final Properties properties) { + DBOptions dbOptions = null; + final String optionsString = Options.getOptionStringFromProps(properties); + final long handle = getDBOptionsFromProps(optionsString); + if (handle != 0) { dbOptions = new DBOptions(handle); } return dbOptions; @@ -554,6 +573,18 @@ } @Override + public DBOptions setMaxWriteBatchGroupSizeBytes(final long maxWriteBatchGroupSizeBytes) { + setMaxWriteBatchGroupSizeBytes(nativeHandle_, maxWriteBatchGroupSizeBytes); + return this; + } + + @Override + public long maxWriteBatchGroupSizeBytes() { + assert (isOwningHandle()); + return maxWriteBatchGroupSizeBytes(nativeHandle_); + } + + @Override public DBOptions setManifestPreallocationSize( final long size) { assert(isOwningHandle()); @@ -853,32 +884,18 @@ return strictBytesPerSync(nativeHandle_); } - //TODO(AR) NOW -// @Override -// public DBOptions setListeners(final List listeners) { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = new long[listeners.size()]; -// for (int i = 0; i < eventListenerHandlers.length; i++) { -// eventListenerHandlers[i] = listeners.get(i).nativeHandle_; -// } -// setEventListeners(nativeHandle_, eventListenerHandlers); -// return this; -// } -// -// @Override -// public Collection listeners() { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = listeners(nativeHandle_); -// if (eventListenerHandlers == null || eventListenerHandlers.length == 0) { -// return Collections.emptyList(); -// } -// -// final List eventListeners = new ArrayList<>(); -// for (final long eventListenerHandle : eventListenerHandlers) { -// eventListeners.add(new 
EventListener(eventListenerHandle)); //TODO(AR) check ownership is set to false! -// } -// return eventListeners; -// } + @Override + public DBOptions setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } @Override public DBOptions setEnableThreadTracking(final boolean enableThreadTracking) { @@ -992,6 +1009,19 @@ } @Override + public DBOptions setSkipCheckingSstFileSizesOnDbOpen( + final boolean skipCheckingSstFileSizesOnDbOpen) { + setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen); + return this; + } + + @Override + public boolean skipCheckingSstFileSizesOnDbOpen() { + assert (isOwningHandle()); + return skipCheckingSstFileSizesOnDbOpen(nativeHandle_); + } + + @Override public DBOptions setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) { assert(isOwningHandle()); setWalRecoveryMode(nativeHandle_, walRecoveryMode.getValue()); @@ -1160,6 +1190,90 @@ return atomicFlush(nativeHandle_); } + @Override + public DBOptions setAvoidUnnecessaryBlockingIO(final boolean avoidUnnecessaryBlockingIO) { + setAvoidUnnecessaryBlockingIO(nativeHandle_, avoidUnnecessaryBlockingIO); + return this; + } + + @Override + public boolean avoidUnnecessaryBlockingIO() { + assert (isOwningHandle()); + return avoidUnnecessaryBlockingIO(nativeHandle_); + } + + @Override + public DBOptions setPersistStatsToDisk(final boolean persistStatsToDisk) { + setPersistStatsToDisk(nativeHandle_, persistStatsToDisk); + return this; + } + + @Override + public boolean persistStatsToDisk() { + assert (isOwningHandle()); + return persistStatsToDisk(nativeHandle_); + } + + @Override + public DBOptions setWriteDbidToManifest(final boolean writeDbidToManifest) { + setWriteDbidToManifest(nativeHandle_, writeDbidToManifest); + 
return this; + } + + @Override + public boolean writeDbidToManifest() { + assert (isOwningHandle()); + return writeDbidToManifest(nativeHandle_); + } + + @Override + public DBOptions setLogReadaheadSize(final long logReadaheadSize) { + setLogReadaheadSize(nativeHandle_, logReadaheadSize); + return this; + } + + @Override + public long logReadaheadSize() { + assert (isOwningHandle()); + return logReadaheadSize(nativeHandle_); + } + + @Override + public DBOptions setBestEffortsRecovery(final boolean bestEffortsRecovery) { + setBestEffortsRecovery(nativeHandle_, bestEffortsRecovery); + return this; + } + + @Override + public boolean bestEffortsRecovery() { + assert (isOwningHandle()); + return bestEffortsRecovery(nativeHandle_); + } + + @Override + public DBOptions setMaxBgErrorResumeCount(final int maxBgerrorResumeCount) { + setMaxBgErrorResumeCount(nativeHandle_, maxBgerrorResumeCount); + return this; + } + + @Override + public int maxBgerrorResumeCount() { + assert (isOwningHandle()); + return maxBgerrorResumeCount(nativeHandle_); + } + + @Override + public DBOptions setBgerrorResumeRetryInterval(final long bgerrorResumeRetryInterval) { + setBgerrorResumeRetryInterval(nativeHandle_, bgerrorResumeRetryInterval); + return this; + } + + @Override + public long bgerrorResumeRetryInterval() { + assert (isOwningHandle()); + return bgerrorResumeRetryInterval(nativeHandle_); + } + static final int DEFAULT_NUM_SHARD_BITS = -1; @@ -1175,8 +1289,8 @@ super(nativeHandle); } - private static native long getDBOptionsFromProps( - String optString); + private static native long getDBOptionsFromProps(long cfgHandle, String optString); + private static native long getDBOptionsFromProps(String optString); private static native long newDBOptions(); private static native long copyDBOptions(final long handle); @@ -1262,6 +1376,9 @@ private native long walTtlSeconds(long handle); private native void setWalSizeLimitMB(long handle, long sizeLimitMB); private native long walSizeLimitMB(long 
handle); + private static native void setMaxWriteBatchGroupSizeBytes( + final long handle, final long maxWriteBatchGroupSizeBytes); + private static native long maxWriteBatchGroupSizeBytes(final long handle); private native void setManifestPreallocationSize( long handle, long size) throws IllegalArgumentException; private native long manifestPreallocationSize(long handle); @@ -1328,6 +1445,9 @@ final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); @@ -1354,6 +1474,9 @@ private native void setSkipStatsUpdateOnDbOpen(final long handle, final boolean skipStatsUpdateOnDbOpen); private native boolean skipStatsUpdateOnDbOpen(final long handle); + private static native void setSkipCheckingSstFileSizesOnDbOpen( + final long handle, final boolean skipChecking); + private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); private native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); private native byte walRecoveryMode(final long handle); @@ -1391,6 +1514,26 @@ private native void setAtomicFlush(final long handle, final boolean atomicFlush); private native boolean atomicFlush(final long handle); + private static native void setAvoidUnnecessaryBlockingIO( + final long handle, final boolean avoidBlockingIO); + private static native boolean avoidUnnecessaryBlockingIO(final long handle); + private static native void setPersistStatsToDisk( + final long handle, final boolean persistStatsToDisk); + private static native boolean persistStatsToDisk(final long handle); + private static native void setWriteDbidToManifest( + final long handle, final 
boolean writeDbidToManifest); + private static native boolean writeDbidToManifest(final long handle); + private static native void setLogReadaheadSize(final long handle, final long logReadaheadSize); + private static native long logReadaheadSize(final long handle); + private static native void setBestEffortsRecovery( + final long handle, final boolean bestEffortsRecovery); + private static native boolean bestEffortsRecovery(final long handle); + private static native void setMaxBgErrorResumeCount( + final long handle, final int maxBgerrorRecumeCount); + private static native int maxBgerrorResumeCount(final long handle); + private static native void setBgerrorResumeRetryInterval( + final long handle, final long bgerrorResumeRetryInterval); + private static native long bgerrorResumeRetryInterval(final long handle); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -625,7 +625,7 @@ * then WAL_size_limit_MB, they will be deleted starting with the * earliest until size_limit is met. All empty files will be deleted. *
    1. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that + * WAL files will be checked every WAL_ttl_seconds / 2 and those that * are older than WAL_ttl_seconds will be deleted.
    2. *
    3. If both are not 0, WAL files will be checked every 10 min and both * checks will be performed with ttl being first.
    4. @@ -648,7 +648,7 @@ * then WAL_size_limit_MB, they will be deleted starting with the * earliest until size_limit is met. All empty files will be deleted. *
    5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that + * WAL files will be checked every WAL_ttl_seconds / 2 and those that * are older than WAL_ttl_seconds will be deleted.
    6. *
    7. If both are not 0, WAL files will be checked every 10 min and both * checks will be performed with ttl being first.
    8. @@ -704,6 +704,29 @@ long walSizeLimitMB(); /** + * The maximum limit of number of bytes that are written in a single batch + * of WAL or memtable write. It is followed when the leader write size + * is larger than 1/8 of this limit. + * + * Default: 1 MB + * + * @param maxWriteBatchGroupSizeBytes the maximum limit of number of bytes, see description. + * @return the instance of the current object. + */ + T setMaxWriteBatchGroupSizeBytes(final long maxWriteBatchGroupSizeBytes); + + /** + * The maximum limit of number of bytes that are written in a single batch + * of WAL or memtable write. It is followed when the leader write size + * is larger than 1/8 of this limit. + * + * Default: 1 MB + * + * @return the maximum limit of number of bytes, see description. + */ + long maxWriteBatchGroupSizeBytes(); + + /** * Number of bytes to preallocate (via fallocate) the manifest * files. Default is 4mb, which is reasonable to reduce random IO * as well as prevent overallocation for mounts that preallocate @@ -1032,24 +1055,31 @@ */ boolean useAdaptiveMutex(); - //TODO(AR) NOW -// /** -// * Sets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @param listeners the listeners who should be notified on various events. -// * -// * @return the instance of the current object. -// */ -// T setListeners(final List listeners); -// -// /** -// * Gets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @return a collection of Event listeners. -// */ -// Collection listeners(); + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. 
+ * + * @param listeners the listeners who should be notified on various events. + * + * @return the instance of the current object. + */ + T setListeners(final List listeners); + + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. + * + * @return the instance of the current object. + */ + List listeners(); /** * If true, then the status of the threads involved in this DB will @@ -1279,6 +1309,36 @@ boolean skipStatsUpdateOnDbOpen(); /** + * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files. + * This may significantly speed up startup if there are many sst files, + * especially when using non-default Env with expensive GetFileSize(). + * We'll still check that all required sst files exist. + * If {@code paranoid_checks} is false, this option is ignored, and sst files are + * not checked at all. + * + * Default: false + * + * @param skipCheckingSstFileSizesOnDbOpen if true, then SST file sizes will not be checked + * when calling {@link RocksDB#open(String)}. + * @return the reference to the current options. + */ + T setSkipCheckingSstFileSizesOnDbOpen(final boolean skipCheckingSstFileSizesOnDbOpen); + + /** + * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files. + * This may significantly speed up startup if there are many sst files, + * especially when using non-default Env with expensive GetFileSize(). + * We'll still check that all required sst files exist. + * If {@code paranoid_checks} is false, this option is ignored, and sst files are + * not checked at all. + * + * Default: false + * + * @return true, if file sizes will not be checked when calling {@link RocksDB#open(String)}. 
+ */ + boolean skipCheckingSstFileSizesOnDbOpen(); + + /** * Recovery mode to control the consistency while replaying WAL * * Default: {@link WALRecoveryMode#PointInTimeRecovery} @@ -1561,4 +1621,199 @@ * @return true if atomic flush is enabled. */ boolean atomicFlush(); + + /** + * If true, working thread may avoid doing unnecessary and long-latency + * operation (such as deleting obsolete files directly or deleting memtable) + * and will instead schedule a background job to do it. + * Use it if you're latency-sensitive. + * If set to true, takes precedence over + * {@link ReadOptions#setBackgroundPurgeOnIteratorCleanup(boolean)}. + * + * @param avoidUnnecessaryBlockingIO If true, working thread may avoid doing unnecessary + * operation. + * @return the reference to the current options. + */ + T setAvoidUnnecessaryBlockingIO(final boolean avoidUnnecessaryBlockingIO); + + /** + * If true, working thread may avoid doing unnecessary and long-latency + * operation (such as deleting obsolete files directly or deleting memtable) + * and will instead schedule a background job to do it. + * Use it if you're latency-sensitive. + * If set to true, takes precedence over + * {@link ReadOptions#setBackgroundPurgeOnIteratorCleanup(boolean)}. + * + * @return true, if working thread may avoid doing unnecessary operation. + */ + boolean avoidUnnecessaryBlockingIO(); + + /** + * If true, automatically persist stats to a hidden column family (column + * family name: ___rocksdb_stats_history___) every + * stats_persist_period_sec seconds; otherwise, write to an in-memory + * struct. User can query through `GetStatsHistory` API. + * If user attempts to create a column family with the same name on a DB + * which have previously set persist_stats_to_disk to true, the column family + * creation will fail, but the hidden column family will survive, as well as + * the previously persisted statistics. + * When peristing stats to disk, the stat name will be limited at 100 bytes. 
+ * Default: false + * + * @param persistStatsToDisk true if stats should be persisted to hidden column family. + * @return the instance of the current object. + */ + T setPersistStatsToDisk(final boolean persistStatsToDisk); + + /** + * If true, automatically persist stats to a hidden column family (column + * family name: ___rocksdb_stats_history___) every + * stats_persist_period_sec seconds; otherwise, write to an in-memory + * struct. User can query through `GetStatsHistory` API. + * If user attempts to create a column family with the same name on a DB + * which have previously set persist_stats_to_disk to true, the column family + * creation will fail, but the hidden column family will survive, as well as + * the previously persisted statistics. + * When peristing stats to disk, the stat name will be limited at 100 bytes. + * Default: false + * + * @return true if stats should be persisted to hidden column family. + */ + boolean persistStatsToDisk(); + + /** + * Historically DB ID has always been stored in Identity File in DB folder. + * If this flag is true, the DB ID is written to Manifest file in addition + * to the Identity file. By doing this 2 problems are solved + * 1. We don't checksum the Identity file where as Manifest file is. + * 2. Since the source of truth for DB is Manifest file DB ID will sit with + * the source of truth. Previously the Identity file could be copied + * independent of Manifest and that can result in wrong DB ID. + * We recommend setting this flag to true. + * Default: false + * + * @param writeDbidToManifest if true, then DB ID will be written to Manifest file. + * @return the instance of the current object. + */ + T setWriteDbidToManifest(final boolean writeDbidToManifest); + + /** + * Historically DB ID has always been stored in Identity File in DB folder. + * If this flag is true, the DB ID is written to Manifest file in addition + * to the Identity file. By doing this 2 problems are solved + * 1. 
We don't checksum the Identity file where as Manifest file is. + * 2. Since the source of truth for DB is Manifest file DB ID will sit with + * the source of truth. Previously the Identity file could be copied + * independent of Manifest and that can result in wrong DB ID. + * We recommend setting this flag to true. + * Default: false + * + * @return true, if DB ID will be written to Manifest file. + */ + boolean writeDbidToManifest(); + + /** + * The number of bytes to prefetch when reading the log. This is mostly useful + * for reading a remotely located log, as it can save the number of + * round-trips. If 0, then the prefetching is disabled. + * + * Default: 0 + * + * @param logReadaheadSize the number of bytes to prefetch when reading the log. + * @return the instance of the current object. + */ + T setLogReadaheadSize(final long logReadaheadSize); + + /** + * The number of bytes to prefetch when reading the log. This is mostly useful + * for reading a remotely located log, as it can save the number of + * round-trips. If 0, then the prefetching is disabled. + * + * Default: 0 + * + * @return the number of bytes to prefetch when reading the log. + */ + long logReadaheadSize(); + + /** + * By default, RocksDB recovery fails if any table file referenced in + * MANIFEST are missing after scanning the MANIFEST. + * Best-efforts recovery is another recovery mode that + * tries to restore the database to the most recent point in time without + * missing file. + * Currently not compatible with atomic flush. Furthermore, WAL files will + * not be used for recovery if best_efforts_recovery is true. + * Default: false + * + * @param bestEffortsRecovery if true, RocksDB will use best-efforts mode when recovering. + * @return the instance of the current object. + */ + T setBestEffortsRecovery(final boolean bestEffortsRecovery); + + /** + * By default, RocksDB recovery fails if any table file referenced in + * MANIFEST are missing after scanning the MANIFEST. 
+ * Best-efforts recovery is another recovery mode that + * tries to restore the database to the most recent point in time without + * missing file. + * Currently not compatible with atomic flush. Furthermore, WAL files will + * not be used for recovery if best_efforts_recovery is true. + * Default: false + * + * @return true, if RocksDB uses best-efforts mode when recovering. + */ + boolean bestEffortsRecovery(); + + /** + * It defines how many times db resume is called by a separate thread when + * background retryable IO Error happens. When background retryable IO + * Error happens, SetBGError is called to deal with the error. If the error + * can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), + * then db resume is called in background to recover from the error. If this + * value is 0 or negative, db resume will not be called. + * + * Default: INT_MAX + * + * @param maxBgerrorResumeCount maximum number of times db resume should be called when IO Error + * happens. + * @return the instance of the current object. + */ + T setMaxBgErrorResumeCount(final int maxBgerrorResumeCount); + + /** + * It defines how many times db resume is called by a separate thread when + * background retryable IO Error happens. When background retryable IO + * Error happens, SetBGError is called to deal with the error. If the error + * can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), + * then db resume is called in background to recover from the error. If this + * value is 0 or negative, db resume will not be called. + * + * Default: INT_MAX + * + * @return maximum number of times db resume should be called when IO Error happens. + */ + int maxBgerrorResumeCount(); + + /** + * If max_bgerror_resume_count is ≥ 2, db resume is called multiple times. + * This option decides how long to wait to retry the next resume if the + * previous resume fails and satisfy redo resume conditions. + * + * Default: 1000000 (microseconds). 
+ * + * @param bgerrorResumeRetryInterval how many microseconds to wait between DB resume attempts. + * @return the instance of the current object. + */ + T setBgerrorResumeRetryInterval(final long bgerrorResumeRetryInterval); + + /** + * If max_bgerror_resume_count is ≥ 2, db resume is called multiple times. + * This option decides how long to wait to retry the next resume if the + * previous resume fails and satisfy redo resume conditions. + * + * Default: 1000000 (microseconds). + * + * @return the instance of the current object. + */ + long bgerrorResumeRetryInterval(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java 2025-05-19 16:14:27.000000000 +0000 @@ -110,6 +110,10 @@ this.internalBufferOffset += n; } + public void setLength(final int n) { + setLength0(getNativeHandle(), n); + } + @Override protected void disposeInternal() { final long nativeHandle = getNativeHandle(); @@ -127,6 +131,7 @@ private native void clear0(long handle, boolean internalBuffer, long internalBufferOffset); private native void removePrefix0(long handle, int length); + private native void setLength0(long handle, int length); private native void disposeInternalBuf(final long handle, long internalBufferOffset); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Env.java 2025-05-19 16:14:27.000000000 +0000 @@ -43,8 
+43,8 @@ } /** - *

      Sets the number of background worker threads of the flush pool - * for this environment.

      + *

      Sets the number of background worker threads of the low priority + * pool for this environment.

      *

      Default number: 1

      * * @param number the number of threads diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/EventListener.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,335 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.List; + +/** + * EventListener class contains a set of callback functions that will + * be called when specific RocksDB event happens such as flush. It can + * be used as a building block for developing custom features such as + * stats-collector or external compaction algorithm. + * + * Note that callback functions should not run for an extended period of + * time before the function returns, otherwise RocksDB may be blocked. + * For example, it is not suggested to do + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} (as it may run for a long while) or issue many of + * {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} + * (as Put may be blocked in certain cases) in the same thread in the + * EventListener callback. + * + * However, doing + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} and {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} in + * another thread is considered safe. 
+ * + * [Threading] All EventListener callback will be called using the + * actual thread that involves in that specific event. For example, it + * is the RocksDB background flush thread that does the actual flush to + * call {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * [Locking] All EventListener callbacks are designed to be called without + * the current thread holding any DB mutex. This is to prevent potential + * deadlock and performance issue when using EventListener callback + * in a complex way. + */ +public interface EventListener { + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to flush memtables. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * callback function to RocksDB which will be called whenever a + * registered RocksDB flushes a file. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is deleted. Different from + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)} and + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}, + * this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. 
Applications that build logic basic based + * on file creations and deletions is suggested to implement + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)} and + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from the + * returned value. + * + * @param tableFileDeletionInfo the table file deletion info, + * contains data copied from respective native structure. + */ + void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo); + + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to compact. The default implementation is + * no-op. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. + */ + void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a registered RocksDB compacts a file. The default implementation + * is a no-op. + * + * Note that this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. 
+ */ + void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is created. Different from OnCompactionCompleted and + * OnFlushCompleted, this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. Applications that build logic basic based + * on file creations and deletions is suggested to implement + * OnFlushCompleted and OnCompactionCompleted. + * + * Historically it will only be called if the file is successfully created. + * Now it will also be called on failure case. User can check info.status + * to see if it succeeded or not. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationInfo the table file creation info, + * contains data copied from respective native structure. + */ + void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo); + + /** + * A callback function for RocksDB which will be called before + * a SST file is being created. It will follow by OnTableFileCreated after + * the creation finishes. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationBriefInfo the table file creation brief info, + * contains data copied from respective native structure. + */ + void onTableFileCreationStarted(final TableFileCreationBriefInfo tableFileCreationBriefInfo); + + /** + * A callback function for RocksDB which will be called before + * a memtable is made immutable. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. 
+ * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param memTableInfo the mem table info, contains data + * copied from respective native structure. + */ + void onMemTableSealed(final MemTableInfo memTableInfo); + + /** + * A callback function for RocksDB which will be called before + * a column family handle is deleted. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param columnFamilyHandle is a pointer to the column family handle to be + * deleted which will become a dangling pointer after the deletion. + */ + void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle); + + /** + * A callback function for RocksDB which will be called after an external + * file is ingested using IngestExternalFile. + * + * Note that the this function will run on the same thread as + * IngestExternalFile(), if this function is blocked, IngestExternalFile() + * will be blocked from finishing. + * + * @param db the database + * @param externalFileIngestionInfo the external file ingestion info, + * contains data copied from respective native structure. + */ + void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo); + + /** + * A callback function for RocksDB which will be called before setting the + * background error status to a non-OK value. The new background error status + * is provided in `bg_error` and can be modified by the callback. E.g., a + * callback can suppress errors by resetting it to Status::OK(), thus + * preventing the database from entering read-only mode. We do not provide any + * guarantee when failed flushes/compactions will be rescheduled if the user + * suppresses an error. 
+ * + * Note that this function can run on the same threads as flush, compaction, + * and user writes. So, it is extremely important not to perform heavy + * computations or blocking calls in this function. + * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + */ + void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called whenever a change + * of superversion triggers a change of the stall conditions. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param writeStallInfo write stall info, + * contains data copied from respective native structure. + */ + void onStallConditionsChanged(final WriteStallInfo writeStallInfo); + + /** + * A callback function for RocksDB which will be called whenever a file read + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileReadFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file write + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileWriteFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file flush + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileFlushFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file sync + * operation finishes. 
+ * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * rangeSync operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * truncate operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileTruncateFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file close + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileCloseFinish(final FileOperationInfo fileOperationInfo); + + /** + * If true, the {@link #onFileReadFinish(FileOperationInfo)} + * and {@link #onFileWriteFinish(FileOperationInfo)} will be called. If + * false, then they won't be called. + * + * Default: false + * + * @return whether to callback when file read/write is finished + */ + boolean shouldBeNotifiedOnFileIO(); + + /** + * A callback function for RocksDB which will be called just before + * starting the automatic recovery process for recoverable background + * errors, such as NoSpace(). The callback can suppress the automatic + * recovery by setting returning false. The database will then + * have to be transitioned out of read-only mode by calling + * RocksDB#resume(). 
+ * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + * @return return {@code false} if the automatic recovery should be suppressed + */ + boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called once the database + * is recovered from read-only mode after an error. When this is called, it + * means normal writes to the database can be issued and the user can + * initiate any further recovery actions needed + * + * @param oldBackgroundError old background error codes + */ + void onErrorRecoveryCompleted(final Status oldBackgroundError); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class ExternalFileIngestionInfo { + private final String columnFamilyName; + private final String externalFilePath; + private final String internalFilePath; + private final long globalSeqno; + private final TableProperties tableProperties; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. 
+ */ + ExternalFileIngestionInfo(final String columnFamilyName, final String externalFilePath, + final String internalFilePath, final long globalSeqno, + final TableProperties tableProperties) { + this.columnFamilyName = columnFamilyName; + this.externalFilePath = externalFilePath; + this.internalFilePath = internalFilePath; + this.globalSeqno = globalSeqno; + this.tableProperties = tableProperties; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path of the file outside the DB. + * + * @return the path of the file outside the DB. + */ + public String getExternalFilePath() { + return externalFilePath; + } + + /** + * Get the path of the file inside the DB. + * + * @return the path of the file inside the DB. + */ + public String getInternalFilePath() { + return internalFilePath; + } + + /** + * Get the global sequence number assigned to keys in this file. + * + * @return the global sequence number. + */ + public long getGlobalSeqno() { + return globalSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the table properties. 
+ */ + public TableProperties getTableProperties() { + return tableProperties; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExternalFileIngestionInfo that = (ExternalFileIngestionInfo) o; + return globalSeqno == that.globalSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(externalFilePath, that.externalFilePath) + && Objects.equals(internalFilePath, that.internalFilePath) + && Objects.equals(tableProperties, that.tableProperties); + } + + @Override + public int hashCode() { + return Objects.hash( + columnFamilyName, externalFilePath, internalFilePath, globalSeqno, tableProperties); + } + + @Override + public String toString() { + return "ExternalFileIngestionInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", externalFilePath='" + externalFilePath + + '\'' + ", internalFilePath='" + internalFilePath + '\'' + ", globalSeqno=" + globalSeqno + + ", tableProperties=" + tableProperties + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FileOperationInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.util.Objects; + +/** + * Java representation of FileOperationInfo struct from include/rocksdb/listener.h + */ +public class FileOperationInfo { + private final String path; + private final long offset; + private final long length; + private final long startTimestamp; + private final long duration; + private final Status status; + + /** + * Access is private as this will only be constructed from + * C++ via JNI. + */ + FileOperationInfo(final String path, final long offset, final long length, + final long startTimestamp, final long duration, final Status status) { + this.path = path; + this.offset = offset; + this.length = length; + this.startTimestamp = startTimestamp; + this.duration = duration; + this.status = status; + } + + /** + * Get the file path. + * + * @return the file path. + */ + public String getPath() { + return path; + } + + /** + * Get the offset. + * + * @return the offset. + */ + public long getOffset() { + return offset; + } + + /** + * Get the length. + * + * @return the length. + */ + public long getLength() { + return length; + } + + /** + * Get the start timestamp (in nanoseconds). + * + * @return the start timestamp. + */ + public long getStartTimestamp() { + return startTimestamp; + } + + /** + * Get the operation duration (in nanoseconds). + * + * @return the operation duration. + */ + public long getDuration() { + return duration; + } + + /** + * Get the status. + * + * @return the status. 
+ */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FileOperationInfo that = (FileOperationInfo) o; + return offset == that.offset && length == that.length && startTimestamp == that.startTimestamp + && duration == that.duration && Objects.equals(path, that.path) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(path, offset, length, startTimestamp, duration, status); + } + + @Override + public String toString() { + return "FileOperationInfo{" + + "path='" + path + '\'' + ", offset=" + offset + ", length=" + length + ", startTimestamp=" + + startTimestamp + ", duration=" + duration + ", status=" + status + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushJobInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,186 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.util.Objects; + +public class FlushJobInfo { + private final long columnFamilyId; + private final String columnFamilyName; + private final String filePath; + private final long threadId; + private final int jobId; + private final boolean triggeredWritesSlowdown; + private final boolean triggeredWritesStop; + private final long smallestSeqno; + private final long largestSeqno; + private final TableProperties tableProperties; + private final FlushReason flushReason; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + FlushJobInfo(final long columnFamilyId, final String columnFamilyName, final String filePath, + final long threadId, final int jobId, final boolean triggeredWritesSlowdown, + final boolean triggeredWritesStop, final long smallestSeqno, final long largestSeqno, + final TableProperties tableProperties, final byte flushReasonValue) { + this.columnFamilyId = columnFamilyId; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.threadId = threadId; + this.jobId = jobId; + this.triggeredWritesSlowdown = triggeredWritesSlowdown; + this.triggeredWritesStop = triggeredWritesStop; + this.smallestSeqno = smallestSeqno; + this.largestSeqno = largestSeqno; + this.tableProperties = tableProperties; + this.flushReason = FlushReason.fromValue(flushReasonValue); + } + + /** + * Get the id of the column family. + * + * @return the id of the column family + */ + public long getColumnFamilyId() { + return columnFamilyId; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the newly created file. + * + * @return the path to the newly created file + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the thread that completed this flush job. 
+ * + * @return the id of the thread that completed this flush job + */ + public long getThreadId() { + return threadId; + } + + /** + * Get the job id, which is unique in the same thread. + * + * @return the job id + */ + public int getJobId() { + return jobId; + } + + /** + * Determine if rocksdb is currently slowing-down all writes to prevent + * creating too many Level 0 files as compaction seems not able to + * catch up the write request speed. + * + * This indicates that there are too many files in Level 0. + * + * @return true if rocksdb is currently slowing-down all writes, + * false otherwise + */ + public boolean isTriggeredWritesSlowdown() { + return triggeredWritesSlowdown; + } + + /** + * Determine if rocksdb is currently blocking any writes to prevent + * creating more L0 files. + * + * This indicates that there are too many files in level 0. + * Compactions should try to compact L0 files down to lower levels as soon + * as possible. + * + * @return true if rocksdb is currently blocking any writes, false otherwise + */ + public boolean isTriggeredWritesStop() { + return triggeredWritesStop; + } + + /** + * Get the smallest sequence number in the newly created file. + * + * @return the smallest sequence number + */ + public long getSmallestSeqno() { + return smallestSeqno; + } + + /** + * Get the largest sequence number in the newly created file. + * + * @return the largest sequence number + */ + public long getLargestSeqno() { + return largestSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the Table properties of the table being flushed + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the reason for initiating the flush. + * + * @return the reason for initiating the flush. 
+ */ + public FlushReason getFlushReason() { + return flushReason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FlushJobInfo that = (FlushJobInfo) o; + return columnFamilyId == that.columnFamilyId && threadId == that.threadId && jobId == that.jobId + && triggeredWritesSlowdown == that.triggeredWritesSlowdown + && triggeredWritesStop == that.triggeredWritesStop && smallestSeqno == that.smallestSeqno + && largestSeqno == that.largestSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) + && Objects.equals(tableProperties, that.tableProperties) && flushReason == that.flushReason; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyId, columnFamilyName, filePath, threadId, jobId, + triggeredWritesSlowdown, triggeredWritesStop, smallestSeqno, largestSeqno, tableProperties, + flushReason); + } + + @Override + public String toString() { + return "FlushJobInfo{" + + "columnFamilyId=" + columnFamilyId + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", threadId=" + threadId + ", jobId=" + jobId + + ", triggeredWritesSlowdown=" + triggeredWritesSlowdown + + ", triggeredWritesStop=" + triggeredWritesStop + ", smallestSeqno=" + smallestSeqno + + ", largestSeqno=" + largestSeqno + ", tableProperties=" + tableProperties + + ", flushReason=" + flushReason + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/FlushReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,53 @@ +// Copyright (c) 
2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum FlushReason { + OTHERS((byte) 0x00), + GET_LIVE_FILES((byte) 0x01), + SHUTDOWN((byte) 0x02), + EXTERNAL_FILE_INGESTION((byte) 0x03), + MANUAL_COMPACTION((byte) 0x04), + WRITE_BUFFER_MANAGER((byte) 0x05), + WRITE_BUFFER_FULL((byte) 0x06), + TEST((byte) 0x07), + DELETE_FILES((byte) 0x08), + AUTO_COMPACTION((byte) 0x09), + MANUAL_FLUSH((byte) 0x0a), + ERROR_RECOVERY((byte) 0xb); + + private final byte value; + + FlushReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the FlushReason from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static FlushReason fromValue(final byte value) { + for (final FlushReason flushReason : FlushReason.values()) { + if (flushReason.value == value) { + return flushReason; + } + } + + throw new IllegalArgumentException("Illegal value provided for FlushReason: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java 2025-05-19 16:14:27.000000000 +0000 @@ -159,6 +159,27 @@ */ BLOB_DB_DECOMPRESSION_MICROS((byte) 0x2E), + /** + * Num of Index and Filter blocks read from file system per level in MultiGet + * request + */ + NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL((byte) 0x2F), + + /** + * Num of Data blocks read from file system per level in MultiGet request. + */ + NUM_DATA_BLOCKS_READ_PER_LEVEL((byte) 0x30), + + /** + * Num of SST files read from file system per level in MultiGet request. + */ + NUM_SST_READ_PER_LEVEL((byte) 0x31), + + /** + * The number of retry in auto resume + */ + ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexShorteningMode.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,60 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +/** + * This enum allows trading off increased index size for improved iterator + * seek performance in some situations, particularly when block cache is + * disabled ({@link ReadOptions#fillCache()} == false and direct IO is + * enabled ({@link DBOptions#useDirectReads()} == true). + * The default mode is the best tradeoff for most use cases. + * This option only affects newly written tables. + * + * The index contains a key separating each pair of consecutive blocks. + * Let A be the highest key in one block, B the lowest key in the next block, + * and I the index entry separating these two blocks: + * [ ... A] I [B ...] + * I is allowed to be anywhere in [A, B). + * If an iterator is seeked to a key in (A, I], we'll unnecessarily read the + * first block, then immediately fall through to the second block. + * However, if I=A, this can't happen, and we'll read only the second block. + * In kNoShortening mode, we use I=A. In other modes, we use the shortest + * key in [A, B), which usually significantly reduces index size. + * + * There's a similar story for the last index entry, which is an upper bound + * of the highest key in the file. If it's shortened and therefore + * overestimated, iterator is likely to unnecessarily read the last data block + * from each file on each seek. + */ +public enum IndexShorteningMode { + /** + * Use full keys. + */ + kNoShortening((byte) 0), + /** + * Shorten index keys between blocks, but use full key for the last index + * key, which is the upper bound of the whole file. + */ + kShortenSeparators((byte) 1), + /** + * Shorten both keys between blocks and key after last block. 
+ */ + kShortenSeparatorsAndSuccessor((byte) 2); + + private final byte value; + + IndexShorteningMode(final byte value) { + this.value = value; + } + + /** + * Returns the byte value of the enumerations value. + * + * @return byte representation + */ + byte getValue() { + return value; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/IndexType.java 2025-05-19 16:14:27.000000000 +0000 @@ -22,7 +22,21 @@ /** * A two-level index implementation. Both levels are binary search indexes. */ - kTwoLevelIndexSearch((byte) 2); + kTwoLevelIndexSearch((byte) 2), + /** + * Like {@link #kBinarySearch}, but index also contains first key of each block. + * This allows iterators to defer reading the block until it's actually + * needed. May significantly reduce read amplification of short range scans. + * Without it, iterator seek usually reads one block from each level-0 file + * and from each level, which may be expensive. + * Works best in combination with: + * - IndexShorteningMode::kNoShortening, + * - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + * e.g. when prefix changes. + * Makes the index significantly bigger (2x or more), especially when keys + * are long. 
+ */ + kBinarySearchWithFirstKey((byte) 3); /** * Returns the byte value of the enumerations value diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/KeyMayExist.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,36 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class KeyMayExist { + @Override + public boolean equals(final Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + final KeyMayExist that = (KeyMayExist) o; + return (valueLength == that.valueLength && exists == that.exists); + } + + @Override + public int hashCode() { + return Objects.hash(exists, valueLength); + } + + public enum KeyMayExistEnum { kNotExist, kExistsWithoutValue, kExistsWithValue } + ; + + public KeyMayExist(final KeyMayExistEnum exists, final int valueLength) { + this.exists = exists; + this.valueLength = valueLength; + } + + public final KeyMayExistEnum exists; + public final int valueLength; +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MemTableInfo.java 2025-05-19 
16:14:27.000000000 +0000 @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class MemTableInfo { + private final String columnFamilyName; + private final long firstSeqno; + private final long earliestSeqno; + private final long numEntries; + private final long numDeletes; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + MemTableInfo(final String columnFamilyName, final long firstSeqno, final long earliestSeqno, + final long numEntries, final long numDeletes) { + this.columnFamilyName = columnFamilyName; + this.firstSeqno = firstSeqno; + this.earliestSeqno = earliestSeqno; + this.numEntries = numEntries; + this.numDeletes = numDeletes; + } + + /** + * Get the name of the column family to which memtable belongs. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the Sequence number of the first element that was inserted into the + * memtable. + * + * @return the sequence number of the first inserted element. + */ + public long getFirstSeqno() { + return firstSeqno; + } + + /** + * Get the Sequence number that is guaranteed to be smaller than or equal + * to the sequence number of any key that could be inserted into this + * memtable. It can then be assumed that any write with a larger(or equal) + * sequence number will be present in this memtable or a later memtable. + * + * @return the earliest sequence number. + */ + public long getEarliestSeqno() { + return earliestSeqno; + } + + /** + * Get the total number of entries in memtable. + * + * @return the total number of entries. 
+ */ + public long getNumEntries() { + return numEntries; + } + + /** + * Get the total number of deletes in memtable. + * + * @return the total number of deletes. + */ + public long getNumDeletes() { + return numDeletes; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + MemTableInfo that = (MemTableInfo) o; + return firstSeqno == that.firstSeqno && earliestSeqno == that.earliestSeqno + && numEntries == that.numEntries && numDeletes == that.numDeletes + && Objects.equals(columnFamilyName, that.columnFamilyName); + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, firstSeqno, earliestSeqno, numEntries, numDeletes); + } + + @Override + public String toString() { + return "MemTableInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", firstSeqno=" + firstSeqno + + ", earliestSeqno=" + earliestSeqno + ", numEntries=" + numEntries + + ", numDeletes=" + numDeletes + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,42 +39,25 @@ * * The format is: key1=value1;key2=value2;key3=value3 etc * - * For int[] values, each int should be separated by a comma, e.g. + * For int[] values, each int should be separated by a colon, e.g. 
* - * key1=value1;intArrayKey1=1,2,3 + * key1=value1;intArrayKey1=1:2:3 * * @param str The string representation of the mutable column family options + * @param ignoreUnknown what to do if the key is not one of the keys we expect * * @return A builder for the mutable column family options */ - public static MutableColumnFamilyOptionsBuilder parse(final String str) { + public static MutableColumnFamilyOptionsBuilder parse( + final String str, final boolean ignoreUnknown) { Objects.requireNonNull(str); - final MutableColumnFamilyOptionsBuilder builder = - new MutableColumnFamilyOptionsBuilder(); - - final String[] options = str.trim().split(KEY_VALUE_PAIR_SEPARATOR); - for(final String option : options) { - final int equalsOffset = option.indexOf(KEY_VALUE_SEPARATOR); - if(equalsOffset <= 0) { - throw new IllegalArgumentException( - "options string has an invalid key=value pair"); - } - - final String key = option.substring(0, equalsOffset); - if(key.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - final String value = option.substring(equalsOffset + 1); - if(value.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - builder.fromString(key, value); - } + final List parsedOptions = OptionString.Parser.parse(str); + return new MutableColumnFamilyOptionsBuilder().fromParsed(parsedOptions, ignoreUnknown); + } - return builder; + public static MutableColumnFamilyOptionsBuilder parse(final String str) { + return parse(str, false); } private interface MutableColumnFamilyOptionKey extends MutableOptionKey {} @@ -117,7 +100,8 @@ max_bytes_for_level_base(ValueType.LONG), max_bytes_for_level_multiplier(ValueType.INT), max_bytes_for_level_multiplier_additional(ValueType.INT_ARRAY), - ttl(ValueType.LONG); + ttl(ValueType.LONG), + periodic_compaction_seconds(ValueType.LONG); private final ValueType valueType; CompactionOption(final ValueType valueType) { @@ -130,11 +114,31 @@ } } + public enum BlobOption 
implements MutableColumnFamilyOptionKey { + enable_blob_files(ValueType.BOOLEAN), + min_blob_size(ValueType.LONG), + blob_file_size(ValueType.LONG), + blob_compression_type(ValueType.ENUM), + enable_blob_garbage_collection(ValueType.BOOLEAN), + blob_garbage_collection_age_cutoff(ValueType.DOUBLE), + blob_garbage_collection_force_threshold(ValueType.DOUBLE); + + private final ValueType valueType; + BlobOption(final ValueType valueType) { + this.valueType = valueType; + } + + @Override + public ValueType getValueType() { + return valueType; + } + } + public enum MiscOption implements MutableColumnFamilyOptionKey { max_sequential_skip_in_iterations(ValueType.LONG), paranoid_file_checks(ValueType.BOOLEAN), report_bg_io_stats(ValueType.BOOLEAN), - compression_type(ValueType.ENUM); + compression(ValueType.ENUM); private final ValueType valueType; MiscOption(final ValueType valueType) { @@ -164,6 +168,10 @@ for(final MutableColumnFamilyOptionKey key : MiscOption.values()) { ALL_KEYS_LOOKUP.put(key.name(), key); } + + for (final MutableColumnFamilyOptionKey key : BlobOption.values()) { + ALL_KEYS_LOOKUP.put(key.name(), key); + } } private MutableColumnFamilyOptionsBuilder() { @@ -437,12 +445,12 @@ @Override public MutableColumnFamilyOptionsBuilder setCompressionType( final CompressionType compressionType) { - return setEnum(MiscOption.compression_type, compressionType); + return setEnum(MiscOption.compression, compressionType); } @Override public CompressionType compressionType() { - return (CompressionType)getEnum(MiscOption.compression_type); + return (CompressionType) getEnum(MiscOption.compression); } @Override @@ -465,5 +473,92 @@ public long ttl() { return getLong(CompactionOption.ttl); } + + @Override + public MutableColumnFamilyOptionsBuilder setPeriodicCompactionSeconds( + final long periodicCompactionSeconds) { + return setLong(CompactionOption.periodic_compaction_seconds, periodicCompactionSeconds); + } + + @Override + public long periodicCompactionSeconds() { + 
return getLong(CompactionOption.periodic_compaction_seconds); + } + + @Override + public MutableColumnFamilyOptionsBuilder setEnableBlobFiles(final boolean enableBlobFiles) { + return setBoolean(BlobOption.enable_blob_files, enableBlobFiles); + } + + @Override + public boolean enableBlobFiles() { + return getBoolean(BlobOption.enable_blob_files); + } + + @Override + public MutableColumnFamilyOptionsBuilder setMinBlobSize(final long minBlobSize) { + return setLong(BlobOption.min_blob_size, minBlobSize); + } + + @Override + public long minBlobSize() { + return getLong(BlobOption.min_blob_size); + } + + @Override + public MutableColumnFamilyOptionsBuilder setBlobFileSize(final long blobFileSize) { + return setLong(BlobOption.blob_file_size, blobFileSize); + } + + @Override + public long blobFileSize() { + return getLong(BlobOption.blob_file_size); + } + + @Override + public MutableColumnFamilyOptionsBuilder setBlobCompressionType( + final CompressionType compressionType) { + return setEnum(BlobOption.blob_compression_type, compressionType); + } + + @Override + public CompressionType blobCompressionType() { + return (CompressionType) getEnum(BlobOption.blob_compression_type); + } + + @Override + public MutableColumnFamilyOptionsBuilder setEnableBlobGarbageCollection( + final boolean enableBlobGarbageCollection) { + return setBoolean(BlobOption.enable_blob_garbage_collection, enableBlobGarbageCollection); + } + + @Override + public boolean enableBlobGarbageCollection() { + return getBoolean(BlobOption.enable_blob_garbage_collection); + } + + @Override + public MutableColumnFamilyOptionsBuilder setBlobGarbageCollectionAgeCutoff( + final double blobGarbageCollectionAgeCutoff) { + return setDouble( + BlobOption.blob_garbage_collection_age_cutoff, blobGarbageCollectionAgeCutoff); + } + + @Override + public double blobGarbageCollectionAgeCutoff() { + return getDouble(BlobOption.blob_garbage_collection_age_cutoff); + } + + @Override + public MutableColumnFamilyOptionsBuilder 
setBlobGarbageCollectionForceThreshold( + final double blobGarbageCollectionForceThreshold) { + return setDouble( + BlobOption.blob_garbage_collection_force_threshold, blobGarbageCollectionForceThreshold); + } + + @Override + public double blobGarbageCollectionForceThreshold() { + return getDouble(BlobOption.blob_garbage_collection_force_threshold); + } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -26,7 +26,7 @@ * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms * while overflowing the underlying platform specific value. */ - MutableColumnFamilyOptionsInterface setWriteBufferSize(long writeBufferSize); + T setWriteBufferSize(long writeBufferSize); /** * Return size of write buffer size. @@ -43,8 +43,7 @@ * @param disableAutoCompactions true if auto-compactions are disabled. * @return the reference to the current option. */ - MutableColumnFamilyOptionsInterface setDisableAutoCompactions( - boolean disableAutoCompactions); + T setDisableAutoCompactions(boolean disableAutoCompactions); /** * Disable automatic compactions. Manual compactions can still @@ -64,8 +63,7 @@ * level-0 compaction * @return the reference to the current option. */ - MutableColumnFamilyOptionsInterface setLevel0FileNumCompactionTrigger( - int level0FileNumCompactionTrigger); + T setLevel0FileNumCompactionTrigger(int level0FileNumCompactionTrigger); /** * Number of files to trigger level-0 compaction. 
A value < 0 means that @@ -86,7 +84,7 @@ * @return the reference to the current option. * @see #maxCompactionBytes() */ - MutableColumnFamilyOptionsInterface setMaxCompactionBytes(final long maxCompactionBytes); + T setMaxCompactionBytes(final long maxCompactionBytes); /** * We try to limit number of bytes in one compaction to be lower than this diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ package org.rocksdb; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Objects; @@ -41,40 +42,22 @@ * * For int[] values, each int should be separated by a comma, e.g. 
* - * key1=value1;intArrayKey1=1,2,3 + * key1=value1;intArrayKey1=1:2:3 * * @param str The string representation of the mutable db options + * @param ignoreUnknown what to do if the key is not one of the keys we expect * * @return A builder for the mutable db options */ - public static MutableDBOptionsBuilder parse(final String str) { + public static MutableDBOptionsBuilder parse(final String str, boolean ignoreUnknown) { Objects.requireNonNull(str); - final MutableDBOptionsBuilder builder = - new MutableDBOptionsBuilder(); - - final String[] options = str.trim().split(KEY_VALUE_PAIR_SEPARATOR); - for(final String option : options) { - final int equalsOffset = option.indexOf(KEY_VALUE_SEPARATOR); - if(equalsOffset <= 0) { - throw new IllegalArgumentException( - "options string has an invalid key=value pair"); - } - - final String key = option.substring(0, equalsOffset); - if(key.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - final String value = option.substring(equalsOffset + 1); - if(value.isEmpty()) { - throw new IllegalArgumentException("options string is invalid"); - } - - builder.fromString(key, value); - } + final List parsedOptions = OptionString.Parser.parse(str); + return new MutableDBOptions.MutableDBOptionsBuilder().fromParsed(parsedOptions, ignoreUnknown); + } - return builder; + public static MutableDBOptionsBuilder parse(final String str) { + return parse(str, false); } private interface MutableDBOptionKey extends MutableOptionKey {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java 2025-05-19 
16:14:27.000000000 +0000 @@ -202,12 +202,24 @@ long delayedWriteRate(); /** - *

      Once write-ahead logs exceed this size, we will start forcing the - * flush of column families whose memtables are backed by the oldest live - * WAL file (i.e. the ones that are causing all the space amplification). + *

      Set the max total write-ahead log size. Once write-ahead logs exceed this size, we will + * start forcing the flush of column families whose memtables are backed by the oldest live WAL + * file *

      + *

      The oldest WAL files are the ones that are causing all the space amplification. + *

      + * For example, with 15 column families, each with + * write_buffer_size = 128 MB + * max_write_buffer_number = 6 + * max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = + * 45GB + *

      + * The RocksDB wiki has some discussion about how the WAL interacts + * with memtables and flushing of column families, at + * ... + *

      *

      If set to 0 (default), we will dynamically choose the WAL size limit to - * be [sum of all write_buffer_size * max_write_buffer_number] * 2

      + * be [sum of all write_buffer_size * max_write_buffer_number] * 4

      *

      This option takes effect only when there are more than one column family as * otherwise the wal size is dictated by the write_buffer_size.

      *

      Default: 0

      @@ -218,13 +230,30 @@ T setMaxTotalWalSize(long maxTotalWalSize); /** - *

      Returns the max total wal size. Once write-ahead logs exceed this size, + *

      Returns the max total write-ahead log size. Once write-ahead logs exceed this size, * we will start forcing the flush of column families whose memtables are - * backed by the oldest live WAL file (i.e. the ones that are causing all - * the space amplification).

      + * backed by the oldest live WAL file.

      + *

      The oldest WAL files are the ones that are causing all the space amplification. + *

      + * For example, with 15 column families, each with + * write_buffer_size = 128 MB + * max_write_buffer_number = 6 + * max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = + * 45GB + *

      + * The RocksDB wiki has some discussion about how the WAL interacts + * with memtables and flushing of column families, at + * ... + *

      + *

      If set to 0 (default), we will dynamically choose the WAL size limit to + * be [sum of all write_buffer_size * max_write_buffer_number] * 4

      + *

      This option takes effect only when there are more than one column family as + * otherwise the wal size is dictated by the write_buffer_size.

      + *

      Default: 0

      + * * *

      If set to 0 (default), we will dynamically choose the WAL size limit - * to be [sum of all write_buffer_size * max_write_buffer_number] * 2 + * to be [sum of all write_buffer_size * max_write_buffer_number] * 4 *

      * * @return max total wal size diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java 2025-05-19 16:14:27.000000000 +0000 @@ -326,7 +326,7 @@ String asString() { final StringBuilder builder = new StringBuilder(); for(int i = 0; i < value.length; i++) { - builder.append(i); + builder.append(value[i]); if(i + 1 < value.length) { builder.append(INT_ARRAY_INT_SEPARATOR); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java 2025-05-19 16:14:27.000000000 +0000 @@ -18,7 +18,11 @@ private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb"); private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final /* @Nullable */ String fallbackJniLibraryName = + Environment.getFallbackJniLibraryName("rocksdb"); private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); + private static final /* @Nullable */ String fallbackJniLibraryFileName = + Environment.getFallbackJniLibraryFileName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; private static final String tempFileSuffix = Environment.getJniLibraryExtension(); @@ -49,14 +53,33 @@ */ public synchronized void loadLibrary(final 
String tmpDir) throws IOException { try { - System.loadLibrary(sharedLibraryName); - } catch(final UnsatisfiedLinkError ule1) { + // try dynamic library + System.loadLibrary(sharedLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - try from static library + } + + try { + // try static library + System.loadLibrary(jniLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - then try static library fallback or from jar + } + + if (fallbackJniLibraryName != null) { try { - System.loadLibrary(jniLibraryName); - } catch(final UnsatisfiedLinkError ule2) { - loadLibraryFromJar(tmpDir); + // try static library fallback + System.loadLibrary(fallbackJniLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - then try from jar } } + + // try jar + loadLibraryFromJar(tmpDir); } /** @@ -83,38 +106,62 @@ File loadLibraryFromJarToTemp(final String tmpDir) throws IOException { - final File temp; - if (tmpDir == null || tmpDir.isEmpty()) { - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - } else { - temp = new File(tmpDir, jniLibraryFileName); - if (temp.exists() && !temp.delete()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " already exists and cannot be removed."); + InputStream is = null; + try { + // attempt to look up the static library in the jar file + String libraryFileName = jniLibraryFileName; + is = getClass().getClassLoader().getResourceAsStream(libraryFileName); + + if (is == null) { + // is there a fallback we can try + if (fallbackJniLibraryFileName == null) { + throw new RuntimeException(libraryFileName + " was not found inside JAR."); + } + + // attempt to look up the fallback static library in the jar file + libraryFileName = fallbackJniLibraryFileName; + is = getClass().getClassLoader().getResourceAsStream(libraryFileName); + if (is == null) { + throw new RuntimeException(libraryFileName + " was not found inside JAR."); + } } - if 
(!temp.createNewFile()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " could not be created."); + + // create a temporary file to copy the library to + final File temp; + if (tmpDir == null || tmpDir.isEmpty()) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + final File parentDir = new File(tmpDir); + if (!parentDir.exists()) { + throw new RuntimeException( + "Directory: " + parentDir.getAbsolutePath() + " does not exist!"); + } + temp = new File(parentDir, libraryFileName); + if (temp.exists() && !temp.delete()) { + throw new RuntimeException( + "File: " + temp.getAbsolutePath() + " already exists and cannot be removed."); + } + if (!temp.createNewFile()) { + throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created."); + } + } + if (!temp.exists()) { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } else { + temp.deleteOnExit(); } - } - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } else { - temp.deleteOnExit(); - } + // copy the library from the Jar file to the temp destination + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); - // attempt to copy the library from the Jar file to the temp destination - try (final InputStream is = getClass().getClassLoader(). 
- getResourceAsStream(jniLibraryFileName)) { - if (is == null) { - throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); - } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + // return the temporary library file + return temp; + + } finally { + if (is != null) { + is.close(); } } - - return temp; } /** diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionString.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,256 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +public class OptionString { + private final static char kvPairSeparator = ';'; + private final static char kvSeparator = '='; + private final static char complexValueBegin = '{'; + private final static char complexValueEnd = '}'; + private final static char wrappedValueBegin = '{'; + private final static char wrappedValueEnd = '}'; + private final static char arrayValueSeparator = ':'; + + static class Value { + final List list; + final List complex; + + public Value(final List list, final List complex) { + this.list = list; + this.complex = complex; + } + + public boolean isList() { + return (this.list != null && this.complex == null); + } + + public static Value fromList(final List list) { + return new Value(list, null); + } + + public static Value fromComplex(final List complex) { + return new Value(null, complex); + } + + public String toString() { + final StringBuilder sb = new StringBuilder(); + if (isList()) { + for (final String item : list) { + sb.append(item).append(arrayValueSeparator); + } + // remove the final separator + if (sb.length() > 0) + sb.delete(sb.length() - 1, sb.length()); + } else { + sb.append('['); + for (final Entry entry : complex) { + sb.append(entry.toString()).append(';'); + } + sb.append(']'); + } + return sb.toString(); + } + } + + static class Entry { + public final String key; + public final Value value; + + private Entry(final String key, final Value value) { + this.key = key; + this.value = value; + } + + public String toString() { + return "" + key + "=" + value; + } + } + + static class Parser { + static class Exception extends RuntimeException { + public Exception(final String s) { + super(s); + } + } + + final String str; + final StringBuilder sb; + + private Parser(final String str) { + this.str = str; + this.sb = new StringBuilder(str); + } + + private void exception(final String message) { + final int pos = 
str.length() - sb.length(); + final int before = Math.min(pos, 64); + final int after = Math.min(64, str.length() - pos); + final String here = + str.substring(pos - before, pos) + "__*HERE*__" + str.substring(pos, pos + after); + + throw new Parser.Exception(message + " at [" + here + "]"); + } + + private void skipWhite() { + while (sb.length() > 0 && Character.isWhitespace(sb.charAt(0))) { + sb.delete(0, 1); + } + } + + private char first() { + if (sb.length() == 0) + exception("Unexpected end of input"); + return sb.charAt(0); + } + + private char next() { + if (sb.length() == 0) + exception("Unexpected end of input"); + final char c = sb.charAt(0); + sb.delete(0, 1); + return c; + } + + private boolean hasNext() { + return (sb.length() > 0); + } + + private boolean is(final char c) { + return (sb.length() > 0 && sb.charAt(0) == c); + } + + private boolean isKeyChar() { + if (!hasNext()) + return false; + final char c = first(); + return (Character.isAlphabetic(c) || Character.isDigit(c) || "_".indexOf(c) != -1); + } + + private boolean isValueChar() { + if (!hasNext()) + return false; + final char c = first(); + return (Character.isAlphabetic(c) || Character.isDigit(c) || "_-+.[]".indexOf(c) != -1); + } + + private String parseKey() { + final StringBuilder sbKey = new StringBuilder(); + sbKey.append(next()); + while (isKeyChar()) { + sbKey.append(next()); + } + + return sbKey.toString(); + } + + private String parseSimpleValue() { + if (is(wrappedValueBegin)) { + next(); + final String result = parseSimpleValue(); + if (!is(wrappedValueEnd)) { + exception("Expected to end a wrapped value with " + wrappedValueEnd); + } + next(); + + return result; + } else { + final StringBuilder sbValue = new StringBuilder(); + while (isValueChar()) sbValue.append(next()); + + return sbValue.toString(); + } + } + + private List parseList() { + final List list = new ArrayList<>(1); + while (true) { + list.add(parseSimpleValue()); + if (!is(arrayValueSeparator)) + break; + + 
next(); + } + + return list; + } + + private Entry parseOption() { + skipWhite(); + if (!isKeyChar()) { + exception("No valid key character(s) for key in key=value "); + } + final String key = parseKey(); + skipWhite(); + if (is(kvSeparator)) { + next(); + } else { + exception("Expected = separating key and value"); + } + skipWhite(); + final Value value = parseValue(); + return new Entry(key, value); + } + + private Value parseValue() { + skipWhite(); + if (is(complexValueBegin)) { + next(); + skipWhite(); + final Value value = Value.fromComplex(parseComplex()); + skipWhite(); + if (is(complexValueEnd)) { + next(); + skipWhite(); + } else { + exception("Expected } ending complex value"); + } + return value; + } else if (isValueChar()) { + return Value.fromList(parseList()); + } + + exception("No valid value character(s) for value in key=value"); + return null; + } + + private List parseComplex() { + final List entries = new ArrayList<>(); + + skipWhite(); + if (hasNext()) { + entries.add(parseOption()); + skipWhite(); + while (is(kvPairSeparator)) { + next(); + skipWhite(); + if (!isKeyChar()) { + // the separator was a terminator + break; + } + entries.add(parseOption()); + skipWhite(); + } + } + return entries; + } + + public static List parse(final String str) { + Objects.requireNonNull(str); + + final Parser parser = new Parser(str); + final List result = parser.parseComplex(); + if (parser.hasNext()) { + parser.exception("Unexpected end of parsing "); + } + + return result; + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Options.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,10 +6,7 @@ package org.rocksdb; import 
java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Options to control the behavior of a database. It will be used @@ -28,6 +25,25 @@ } /** + * Converts the input properties into a Options-style formatted string + * @param properties The set of properties to convert + * @return The Options-style representation of those properties. + */ + public static String getOptionStringFromProps(final Properties properties) { + if (properties == null || properties.size() == 0) { + throw new IllegalArgumentException("Properties value must contain at least one value."); + } + StringBuilder stringBuilder = new StringBuilder(); + for (final String name : properties.stringPropertyNames()) { + stringBuilder.append(name); + stringBuilder.append("="); + stringBuilder.append(properties.getProperty(name)); + stringBuilder.append(";"); + } + return stringBuilder.toString(); + } + + /** * Construct options for opening a RocksDB. 
* * This constructor will create (by allocating a block of memory) @@ -75,6 +91,10 @@ this.compressionOptions_ = other.compressionOptions_; this.rowCache_ = other.rowCache_; this.writeBufferManager_ = other.writeBufferManager_; + this.compactionThreadLimiter_ = other.compactionThreadLimiter_; + this.bottommostCompressionOptions_ = other.bottommostCompressionOptions_; + this.walFilter_ = other.walFilter_; + this.sstPartitionerFactory_ = other.sstPartitionerFactory_; } @Override @@ -141,12 +161,24 @@ } @Override + public Options oldDefaults(final int majorVersion, final int minorVersion) { + oldDefaults(nativeHandle_, majorVersion, minorVersion); + return this; + } + + @Override public Options optimizeForSmallDb() { optimizeForSmallDb(nativeHandle_); return this; } @Override + public Options optimizeForSmallDb(final Cache cache) { + optimizeForSmallDb(nativeHandle_, cache.getNativeHandle()); + return this; + } + + @Override public Options optimizeForPointLookup( long blockCacheSizeMb) { optimizeForPointLookup(nativeHandle_, @@ -633,6 +665,18 @@ } @Override + public Options setMaxWriteBatchGroupSizeBytes(long maxWriteBatchGroupSizeBytes) { + setMaxWriteBatchGroupSizeBytes(nativeHandle_, maxWriteBatchGroupSizeBytes); + return this; + } + + @Override + public long maxWriteBatchGroupSizeBytes() { + assert (isOwningHandle()); + return maxWriteBatchGroupSizeBytes(nativeHandle_); + } + + @Override public Options setWalSizeLimitMB(final long sizeLimitMB) { assert(isOwningHandle()); setWalSizeLimitMB(nativeHandle_, sizeLimitMB); @@ -930,6 +974,19 @@ } @Override + public Options setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } + + @Override public Options setEnableThreadTracking(final boolean enableThreadTracking) { 
assert(isOwningHandle()); setEnableThreadTracking(nativeHandle_, enableThreadTracking); @@ -1038,6 +1095,18 @@ } @Override + public Options setSkipCheckingSstFileSizesOnDbOpen(boolean skipCheckingSstFileSizesOnDbOpen) { + setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen); + return this; + } + + @Override + public boolean skipCheckingSstFileSizesOnDbOpen() { + assert (isOwningHandle()); + return skipCheckingSstFileSizesOnDbOpen(nativeHandle_); + } + + @Override public Options setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) { assert(isOwningHandle()); setWalRecoveryMode(nativeHandle_, walRecoveryMode.getValue()); @@ -1268,6 +1337,45 @@ } @Override + public Options setCfPaths(final Collection cfPaths) { + assert (isOwningHandle()); + + final int len = cfPaths.size(); + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + int i = 0; + for (final DbPath dbPath : cfPaths) { + paths[i] = dbPath.path.toString(); + targetSizes[i] = dbPath.targetSize; + i++; + } + setCfPaths(nativeHandle_, paths, targetSizes); + return this; + } + + @Override + public List cfPaths() { + final int len = (int) cfPathsLen(nativeHandle_); + + if (len == 0) { + return Collections.emptyList(); + } + + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; + + cfPaths(nativeHandle_, paths, targetSizes); + + final List cfPaths = new ArrayList<>(); + for (int i = 0; i < len; i++) { + cfPaths.add(new DbPath(Paths.get(paths[i]), targetSizes[i])); + } + + return cfPaths; + } + + @Override public Options useFixedLengthPrefixExtractor(final int n) { assert(isOwningHandle()); useFixedLengthPrefixExtractor(nativeHandle_, n); @@ -1303,7 +1411,7 @@ final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { 
compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } @@ -1744,6 +1852,17 @@ } @Override + public Options setPeriodicCompactionSeconds(final long periodicCompactionSeconds) { + setPeriodicCompactionSeconds(nativeHandle_, periodicCompactionSeconds); + return this; + } + + @Override + public long periodicCompactionSeconds() { + return periodicCompactionSeconds(nativeHandle_); + } + + @Override public Options setCompactionOptionsUniversal( final CompactionOptionsUniversal compactionOptionsUniversal) { setCompactionOptionsUniversal(nativeHandle_, @@ -1792,6 +1911,201 @@ return atomicFlush(nativeHandle_); } + @Override + public Options setAvoidUnnecessaryBlockingIO(boolean avoidUnnecessaryBlockingIO) { + setAvoidUnnecessaryBlockingIO(nativeHandle_, avoidUnnecessaryBlockingIO); + return this; + } + + @Override + public boolean avoidUnnecessaryBlockingIO() { + assert (isOwningHandle()); + return avoidUnnecessaryBlockingIO(nativeHandle_); + } + + @Override + public Options setPersistStatsToDisk(boolean persistStatsToDisk) { + setPersistStatsToDisk(nativeHandle_, persistStatsToDisk); + return this; + } + + @Override + public boolean persistStatsToDisk() { + assert (isOwningHandle()); + return persistStatsToDisk(nativeHandle_); + } + + @Override + public Options setWriteDbidToManifest(boolean writeDbidToManifest) { + setWriteDbidToManifest(nativeHandle_, writeDbidToManifest); + return this; + } + + @Override + public boolean writeDbidToManifest() { + assert (isOwningHandle()); + return writeDbidToManifest(nativeHandle_); + } + + @Override + public Options setLogReadaheadSize(long logReadaheadSize) { + setLogReadaheadSize(nativeHandle_, logReadaheadSize); + return this; + } + + @Override + public long logReadaheadSize() { + assert (isOwningHandle()); + return logReadaheadSize(nativeHandle_); + } + + @Override + public Options setBestEffortsRecovery(boolean bestEffortsRecovery) { + setBestEffortsRecovery(nativeHandle_, bestEffortsRecovery); + 
return this; + } + + @Override + public boolean bestEffortsRecovery() { + assert (isOwningHandle()); + return bestEffortsRecovery(nativeHandle_); + } + + @Override + public Options setMaxBgErrorResumeCount(int maxBgerrorResumeCount) { + setMaxBgErrorResumeCount(nativeHandle_, maxBgerrorResumeCount); + return this; + } + + @Override + public int maxBgerrorResumeCount() { + assert (isOwningHandle()); + return maxBgerrorResumeCount(nativeHandle_); + } + + @Override + public Options setBgerrorResumeRetryInterval(long bgerrorResumeRetryInterval) { + setBgerrorResumeRetryInterval(nativeHandle_, bgerrorResumeRetryInterval); + return this; + } + + @Override + public long bgerrorResumeRetryInterval() { + assert (isOwningHandle()); + return bgerrorResumeRetryInterval(nativeHandle_); + } + + @Override + public Options setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) { + setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_); + this.sstPartitionerFactory_ = sstPartitionerFactory; + return this; + } + + @Override + public SstPartitionerFactory sstPartitionerFactory() { + return sstPartitionerFactory_; + } + + @Override + public Options setCompactionThreadLimiter(final ConcurrentTaskLimiter compactionThreadLimiter) { + setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_); + this.compactionThreadLimiter_ = compactionThreadLimiter; + return this; + } + + @Override + public ConcurrentTaskLimiter compactionThreadLimiter() { + assert (isOwningHandle()); + return this.compactionThreadLimiter_; + } + + // + // BEGIN options for blobs (integrated BlobDB) + // + + @Override + public Options setEnableBlobFiles(final boolean enableBlobFiles) { + setEnableBlobFiles(nativeHandle_, enableBlobFiles); + return this; + } + + @Override + public boolean enableBlobFiles() { + return enableBlobFiles(nativeHandle_); + } + + @Override + public Options setMinBlobSize(final long minBlobSize) { + setMinBlobSize(nativeHandle_, 
minBlobSize); + return this; + } + + @Override + public long minBlobSize() { + return minBlobSize(nativeHandle_); + } + + @Override + public Options setBlobFileSize(final long blobFileSize) { + setBlobFileSize(nativeHandle_, blobFileSize); + return this; + } + + @Override + public long blobFileSize() { + return blobFileSize(nativeHandle_); + } + + @Override + public Options setBlobCompressionType(CompressionType compressionType) { + setBlobCompressionType(nativeHandle_, compressionType.getValue()); + return this; + } + + @Override + public CompressionType blobCompressionType() { + return CompressionType.values()[blobCompressionType(nativeHandle_)]; + } + + @Override + public Options setEnableBlobGarbageCollection(final boolean enableBlobGarbageCollection) { + setEnableBlobGarbageCollection(nativeHandle_, enableBlobGarbageCollection); + return this; + } + + @Override + public boolean enableBlobGarbageCollection() { + return enableBlobGarbageCollection(nativeHandle_); + } + + @Override + public Options setBlobGarbageCollectionAgeCutoff(final double blobGarbageCollectionAgeCutoff) { + setBlobGarbageCollectionAgeCutoff(nativeHandle_, blobGarbageCollectionAgeCutoff); + return this; + } + + @Override + public double blobGarbageCollectionAgeCutoff() { + return blobGarbageCollectionAgeCutoff(nativeHandle_); + } + + @Override + public Options setBlobGarbageCollectionForceThreshold( + final double blobGarbageCollectionForceThreshold) { + setBlobGarbageCollectionForceThreshold(nativeHandle_, blobGarbageCollectionForceThreshold); + return this; + } + + @Override + public double blobGarbageCollectionForceThreshold() { + return blobGarbageCollectionForceThreshold(nativeHandle_); + } + + // + // END options for blobs (integrated BlobDB) + // + private native static long newOptions(); private native static long newOptions(long dbOptHandle, long cfOptHandle); @@ -1881,6 +2195,9 @@ private native long walTtlSeconds(long handle); private native void setWalSizeLimitMB(long handle, 
long sizeLimitMB); private native long walSizeLimitMB(long handle); + private static native void setMaxWriteBatchGroupSizeBytes( + final long handle, final long maxWriteBatchGroupSizeBytes); + private static native long maxWriteBatchGroupSizeBytes(final long handle); private native void setManifestPreallocationSize( long handle, long size) throws IllegalArgumentException; private native long manifestPreallocationSize(long handle); @@ -1947,6 +2264,9 @@ final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); @@ -1973,6 +2293,9 @@ private native void setSkipStatsUpdateOnDbOpen(final long handle, final boolean skipStatsUpdateOnDbOpen); private native boolean skipStatsUpdateOnDbOpen(final long handle); + private static native void setSkipCheckingSstFileSizesOnDbOpen( + final long handle, final boolean skipChecking); + private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle); private native void setWalRecoveryMode(final long handle, final byte walRecoveryMode); private native byte walRecoveryMode(final long handle); @@ -2010,7 +2333,10 @@ // CF native handles + private static native void oldDefaults( + final long handle, final int majorVersion, final int minorVersion); private native void optimizeForSmallDb(final long handle); + private static native void optimizeForSmallDb(final long handle, final long cacheHandle); private native void optimizeForPointLookup(long handle, long blockCacheSizeMb); private native void optimizeLevelStyleCompaction(long handle, @@ -2097,6 +2423,11 @@ private native String memTableFactoryName(long handle); private native void 
setTableFactory(long handle, long factoryHandle); private native String tableFactoryName(long handle); + private static native void setCfPaths( + final long handle, final String[] paths, final long[] targetSizes); + private static native long cfPathsLen(final long handle); + private static native void cfPaths( + final long handle, final String[] paths, final long[] targetSizes); private native void setInplaceUpdateSupport( long handle, boolean inplaceUpdateSupport); private native boolean inplaceUpdateSupport(long handle); @@ -2152,6 +2483,9 @@ private native boolean reportBgIoStats(final long handle); private native void setTtl(final long handle, final long ttl); private native long ttl(final long handle); + private native void setPeriodicCompactionSeconds( + final long handle, final long periodicCompactionSeconds); + private native long periodicCompactionSeconds(final long handle); private native void setCompactionOptionsUniversal(final long handle, final long compactionOptionsUniversalHandle); private native void setCompactionOptionsFIFO(final long handle, @@ -2162,6 +2496,47 @@ private native void setAtomicFlush(final long handle, final boolean atomicFlush); private native boolean atomicFlush(final long handle); + private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); + private static native void setCompactionThreadLimiter( + final long nativeHandle_, final long newLimiterHandle); + private static native void setAvoidUnnecessaryBlockingIO( + final long handle, final boolean avoidBlockingIO); + private static native boolean avoidUnnecessaryBlockingIO(final long handle); + private static native void setPersistStatsToDisk( + final long handle, final boolean persistStatsToDisk); + private static native boolean persistStatsToDisk(final long handle); + private static native void setWriteDbidToManifest( + final long handle, final boolean writeDbidToManifest); + private static native boolean writeDbidToManifest(final long handle); + 
private static native void setLogReadaheadSize(final long handle, final long logReadaheadSize); + private static native long logReadaheadSize(final long handle); + private static native void setBestEffortsRecovery( + final long handle, final boolean bestEffortsRecovery); + private static native boolean bestEffortsRecovery(final long handle); + private static native void setMaxBgErrorResumeCount( + final long handle, final int maxBgerrorRecumeCount); + private static native int maxBgerrorResumeCount(final long handle); + private static native void setBgerrorResumeRetryInterval( + final long handle, final long bgerrorResumeRetryInterval); + private static native long bgerrorResumeRetryInterval(final long handle); + + private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); + private native boolean enableBlobFiles(final long nativeHandle_); + private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); + private native long minBlobSize(final long nativeHandle_); + private native void setBlobFileSize(final long nativeHandle_, final long blobFileSize); + private native long blobFileSize(final long nativeHandle_); + private native void setBlobCompressionType(final long nativeHandle_, final byte compressionType); + private native byte blobCompressionType(final long nativeHandle_); + private native void setEnableBlobGarbageCollection( + final long nativeHandle_, final boolean enableBlobGarbageCollection); + private native boolean enableBlobGarbageCollection(final long nativeHandle_); + private native void setBlobGarbageCollectionAgeCutoff( + final long nativeHandle_, final double blobGarbageCollectionAgeCutoff); + private native double blobGarbageCollectionAgeCutoff(final long nativeHandle_); + private native void setBlobGarbageCollectionForceThreshold( + final long nativeHandle_, final double blobGarbageCollectionForceThreshold); + private native double blobGarbageCollectionForceThreshold(final long 
nativeHandle_); // instance variables // NOTE: If you add new member variables, please update the copy constructor above! @@ -2180,4 +2555,6 @@ private Cache rowCache_; private WalFilter walFilter_; private WriteBufferManager writeBufferManager_; + private SstPartitionerFactory sstPartitionerFactory_; + private ConcurrentTaskLimiter compactionThreadLimiter_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,6 @@ package org.rocksdb; -import java.util.ArrayList; import java.util.List; public class OptionsUtil { @@ -59,7 +58,7 @@ * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be * returned. * @param ignoreUnknownOptions this flag can be set to true if you want to - * ignore options that are from a newer version of the db, esentially for + * ignore options that are from a newer version of the db, essentially for * forward compatibility. * * @throws RocksDBException thrown if error happens in underlying @@ -76,6 +75,25 @@ * and ColumnFamilyDescriptors based on the specified RocksDB Options file. * See LoadLatestOptions above. * + * @param dbPath the path to the RocksDB. + * @param configOptions {@link org.rocksdb.ConfigOptions} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public static void loadLatestOptions(ConfigOptions configOptions, String dbPath, + DBOptions dbOptions, List cfDescs) throws RocksDBException { + loadLatestOptions(configOptions.nativeHandle_, dbPath, dbOptions.nativeHandle_, cfDescs); + } + + /** + * Similar to LoadLatestOptions, this function constructs the DBOptions + * and ColumnFamilyDescriptors based on the specified RocksDB Options file. + * See LoadLatestOptions above. + * * @param optionsFileName the RocksDB options file path. * @param env {@link org.rocksdb.Env} instance. * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be @@ -112,6 +130,26 @@ } /** + * Similar to LoadLatestOptions, this function constructs the DBOptions + * and ColumnFamilyDescriptors based on the specified RocksDB Options file. + * See LoadLatestOptions above. + * + * @param optionsFileName the RocksDB options file path. + * @param configOptions {@link org.rocksdb.ConfigOptions} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static void loadOptionsFromFile(ConfigOptions configOptions, String optionsFileName, + DBOptions dbOptions, List cfDescs) throws RocksDBException { + loadOptionsFromFile( + configOptions.nativeHandle_, optionsFileName, dbOptions.nativeHandle_, cfDescs); + } + + /** * Returns the latest options file name under the specified RocksDB path. * * @param dbPath the path to the RocksDB. 
@@ -134,9 +172,13 @@ // native methods private native static void loadLatestOptions(String dbPath, long envHandle, long dbOptionsHandle, List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException; + private native static void loadLatestOptions(long cfgHandle, String dbPath, long dbOptionsHandle, + List cfDescs) throws RocksDBException; private native static void loadOptionsFromFile(String optionsFileName, long envHandle, long dbOptionsHandle, List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException; + private native static void loadOptionsFromFile(long cfgHandle, String optionsFileName, + long dbOptionsHandle, List cfDescs) throws RocksDBException; private native static String getLatestOptionsFileName(String dbPath, long envHandle) throws RocksDBException; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -37,6 +37,8 @@ super(copyReadOptions(other.nativeHandle_)); this.iterateLowerBoundSlice_ = other.iterateLowerBoundSlice_; this.iterateUpperBoundSlice_ = other.iterateUpperBoundSlice_; + this.timestampSlice_ = other.timestampSlice_; + this.iterStartTs_ = other.iterStartTs_; } /** @@ -437,16 +439,15 @@ * * Default: null * - * @param iterateLowerBound Slice representing the upper bound + * @param iterateLowerBound Slice representing the lower bound * @return the reference to the current ReadOptions. 
*/ - public ReadOptions setIterateLowerBound(final Slice iterateLowerBound) { + public ReadOptions setIterateLowerBound(final AbstractSlice iterateLowerBound) { assert(isOwningHandle()); - if (iterateLowerBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. - iterateLowerBoundSlice_ = iterateLowerBound; - setIterateLowerBound(nativeHandle_, iterateLowerBoundSlice_.getNativeHandle()); - } + setIterateLowerBound( + nativeHandle_, iterateLowerBound == null ? 0 : iterateLowerBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateLowerBoundSlice_ = iterateLowerBound; return this; } @@ -485,13 +486,12 @@ * @param iterateUpperBound Slice representing the upper bound * @return the reference to the current ReadOptions. */ - public ReadOptions setIterateUpperBound(final Slice iterateUpperBound) { + public ReadOptions setIterateUpperBound(final AbstractSlice iterateUpperBound) { assert(isOwningHandle()); - if (iterateUpperBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. - iterateUpperBoundSlice_ = iterateUpperBound; - setIterateUpperBound(nativeHandle_, iterateUpperBoundSlice_.getNativeHandle()); - } + setIterateUpperBound( + nativeHandle_, iterateUpperBound == null ? 0 : iterateUpperBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateUpperBoundSlice_ = iterateUpperBound; return this; } @@ -562,6 +562,233 @@ return iterStartSeqnum(nativeHandle_); } + /** + * When true, by default use total_order_seek = true, and RocksDB can + * selectively enable prefix seek mode if won't generate a different result + * from total_order_seek, based on seek key, and iterator upper bound. + * Not supported in ROCKSDB_LITE mode, in the way that even with value true + * prefix mode is not used. + * Default: false + * + * @return true if auto prefix mode is set. 
+ * + */ + public boolean autoPrefixMode() { + assert (isOwningHandle()); + return autoPrefixMode(nativeHandle_); + } + + /** + * When true, by default use total_order_seek = true, and RocksDB can + * selectively enable prefix seek mode if won't generate a different result + * from total_order_seek, based on seek key, and iterator upper bound. + * Not supported in ROCKSDB_LITE mode, in the way that even with value true + * prefix mode is not used. + * Default: false + * @param mode auto prefix mode + * @return the reference to the current ReadOptions. + */ + public ReadOptions setAutoPrefixMode(final boolean mode) { + assert (isOwningHandle()); + setAutoPrefixMode(nativeHandle_, mode); + return this; + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. The user is responsible for providing a customized + * compare function via Comparator to order >key, timestamp> tuples. + * For iterator, iter_start_ts is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * @see #iterStartTs() + * @return Reference to timestamp or null if there is no timestamp defined. + */ + public Slice timestamp() { + assert (isOwningHandle()); + final long timestampSliceHandle = timestamp(nativeHandle_); + if (timestampSliceHandle != 0) { + return new Slice(timestampSliceHandle); + } else { + return null; + } + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. 
The user is responsible for providing a customized + * compare function via Comparator to order {@code } tuples. + * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * @see #setIterStartTs(AbstractSlice) + * @param timestamp Slice representing the timestamp + * @return the reference to the current ReadOptions. + */ + public ReadOptions setTimestamp(final AbstractSlice timestamp) { + assert (isOwningHandle()); + setTimestamp(nativeHandle_, timestamp == null ? 0 : timestamp.getNativeHandle()); + timestampSlice_ = timestamp; + return this; + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. The user is responsible for providing a customized + * compare function via Comparator to order {@code } tuples. + * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * @return Reference to lower bound timestamp or null if there is no lower bound timestamp + * defined. 
+ */ + public Slice iterStartTs() { + assert (isOwningHandle()); + final long iterStartTsHandle = iterStartTs(nativeHandle_); + if (iterStartTsHandle != 0) { + return new Slice(iterStartTsHandle); + } else { + return null; + } + } + + /** + * Timestamp of operation. Read should return the latest data visible to the + * specified timestamp. All timestamps of the same database must be of the + * same length and format. The user is responsible for providing a customized + * compare function via Comparator to order {@code } tuples. + * For iterator, {@code iter_start_ts} is the lower bound (older) and timestamp + * serves as the upper bound. Versions of the same record that fall in + * the timestamp range will be returned. If iter_start_ts is nullptr, + * only the most recent version visible to timestamp is returned. + * The user-specified timestamp feature is still under active development, + * and the API is subject to change. + * + * Default: null + * + * @param iterStartTs Reference to lower bound timestamp or null if there is no lower bound + * timestamp defined + * @return the reference to the current ReadOptions. + */ + public ReadOptions setIterStartTs(final AbstractSlice iterStartTs) { + assert (isOwningHandle()); + setIterStartTs(nativeHandle_, iterStartTs == null ? 0 : iterStartTs.getNativeHandle()); + iterStartTs_ = iterStartTs; + return this; + } + + /** + * Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + * in microseconds. + * It should be set to microseconds since epoch, i.e, {@code gettimeofday} or + * equivalent plus allowed duration in microseconds. The best way is to use + * {@code env->NowMicros() + some timeout}. + * This is best efforts. 
The call may exceed the deadline if there is IO + * involved and the file system doesn't support deadlines, or due to + * checking for deadline periodically rather than for every key if + * processing a batch + * + * @return deadline time in microseconds + */ + public long deadline() { + assert (isOwningHandle()); + return deadline(nativeHandle_); + } + + /** + * Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + * in microseconds. + * It should be set to microseconds since epoch, i.e, {@code gettimeofday} or + * equivalent plus allowed duration in microseconds. The best way is to use + * {@code env->NowMicros() + some timeout}. + * This is best efforts. The call may exceed the deadline if there is IO + * involved and the file system doesn't support deadlines, or due to + * checking for deadline periodically rather than for every key if + * processing a batch + * + * @param deadlineTime deadline time in microseconds. + * @return the reference to the current ReadOptions. + */ + public ReadOptions setDeadline(final long deadlineTime) { + assert (isOwningHandle()); + setDeadline(nativeHandle_, deadlineTime); + return this; + } + + /** + * A timeout in microseconds to be passed to the underlying FileSystem for + * reads. As opposed to deadline, this determines the timeout for each + * individual file read request. If a MultiGet/Get/Seek/Next etc call + * results in multiple reads, each read can last up to io_timeout us. + * @return ioTimeout time in microseconds + */ + public long ioTimeout() { + assert (isOwningHandle()); + return ioTimeout(nativeHandle_); + } + + /** + * A timeout in microseconds to be passed to the underlying FileSystem for + * reads. As opposed to deadline, this determines the timeout for each + * individual file read request. If a MultiGet/Get/Seek/Next etc call + * results in multiple reads, each read can last up to io_timeout us. + * + * @param ioTimeout time in microseconds. 
+ * @return the reference to the current ReadOptions. + */ + public ReadOptions setIoTimeout(final long ioTimeout) { + assert (isOwningHandle()); + setIoTimeout(nativeHandle_, ioTimeout); + return this; + } + + /** + * It limits the maximum cumulative value size of the keys in batch while + * reading through MultiGet. Once the cumulative value size exceeds this + * soft limit then all the remaining keys are returned with status Aborted. + * + * Default: {@code std::numeric_limits::max()} + * @return actual valueSizeSofLimit + */ + public long valueSizeSoftLimit() { + assert (isOwningHandle()); + return valueSizeSoftLimit(nativeHandle_); + } + + /** + * It limits the maximum cumulative value size of the keys in batch while + * reading through MultiGet. Once the cumulative value size exceeds this + * soft limit then all the remaining keys are returned with status Aborted. + * + * Default: {@code std::numeric_limits::max()} + * + * @param valueSizeSofLimit + * @return the reference to the current ReadOptions + */ + public ReadOptions setValueSizeSoftLimit(final long valueSizeSofLimit) { + assert (isOwningHandle()); + setValueSizeSoftLimit(nativeHandle_, valueSizeSofLimit); + return this; + } + // instance variables // NOTE: If you add new member variables, please update the copy constructor above! // @@ -570,8 +797,10 @@ // freely leave scope without us losing the Java Slice object, which during // close() would also reap its associated rocksdb::Slice native object since // it's possibly (likely) to be an owning handle. 
- private Slice iterateLowerBoundSlice_; - private Slice iterateUpperBoundSlice_; + private AbstractSlice iterateLowerBoundSlice_; + private AbstractSlice iterateUpperBoundSlice_; + private AbstractSlice timestampSlice_; + private AbstractSlice iterStartTs_; private native static long newReadOptions(); private native static long newReadOptions(final boolean verifyChecksums, @@ -619,4 +848,16 @@ final long tableFilterHandle); private native void setIterStartSeqnum(final long handle, final long seqNum); private native long iterStartSeqnum(final long handle); + private native boolean autoPrefixMode(final long handle); + private native void setAutoPrefixMode(final long handle, final boolean autoPrefixMode); + private native long timestamp(final long handle); + private native void setTimestamp(final long handle, final long timestampSliceHandle); + private native long iterStartTs(final long handle); + private native void setIterStartTs(final long handle, final long iterStartTsHandle); + private native long deadline(final long handle); + private native void setDeadline(final long handle, final long deadlineTime); + private native long ioTimeout(final long handle); + private native void setIoTimeout(final long handle, final long ioTimeout); + private native long valueSizeSoftLimit(final long handle); + private native void setValueSizeSoftLimit(final long handle, final long softLimit); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.List; + /** * RocksCallbackObject is similar to {@link 
RocksObject} but varies * in its construction as it is designed for Java objects which have functions @@ -27,6 +29,27 @@ } /** + * Given a list of RocksCallbackObjects, it returns a list + * of the native handles of the underlying objects. + * + * @param objectList the rocks callback objects + * + * @return the native handles + */ + static /* @Nullable */ long[] toNativeHandleList( + /* @Nullable */ final List objectList) { + if (objectList == null) { + return null; + } + final int len = objectList.size(); + final long[] handleList = new long[len]; + for (int i = 0; i < len; i++) { + handleList[i] = objectList.get(i).nativeHandle_; + } + return handleList; + } + + /** * Construct the Native C++ object which will callback * to our object methods * diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -31,13 +31,15 @@ LOADED } - private static AtomicReference libraryLoaded - = new AtomicReference<>(LibraryState.NOT_LOADED); + private static final AtomicReference libraryLoaded = + new AtomicReference<>(LibraryState.NOT_LOADED); static { RocksDB.loadLibrary(); } + private final List ownedColumnFamilyHandles = new ArrayList<>(); + /** * Loads the necessary library files. * Calling this method twice will have no effect. @@ -59,18 +61,21 @@ if (compressionType.getLibraryName() != null) { System.loadLibrary(compressionType.getLibraryName()); } - } catch (UnsatisfiedLinkError e) { + } catch (final UnsatisfiedLinkError e) { // since it may be optional, we ignore its loading failure here. 
} } try { NativeLibraryLoader.getInstance().loadLibrary(tmpDir); - } catch (IOException e) { + } catch (final IOException e) { libraryLoaded.set(LibraryState.NOT_LOADED); throw new RuntimeException("Unable to load the RocksDB shared library", e); } + final int encodedVersion = version(); + version = Version.fromEncodedVersion(encodedVersion); + libraryLoaded.set(LibraryState.LOADED); return; } @@ -107,7 +112,7 @@ System.load(path + "/" + Environment.getSharedLibraryFileName( compressionType.getLibraryName())); break; - } catch (UnsatisfiedLinkError e) { + } catch (final UnsatisfiedLinkError e) { // since they are optional, we ignore loading fails. } } @@ -120,7 +125,7 @@ Environment.getJniLibraryFileName("rocksdbjni")); success = true; break; - } catch (UnsatisfiedLinkError e) { + } catch (final UnsatisfiedLinkError e) { err = e; } } @@ -129,6 +134,9 @@ throw err; } + final int encodedVersion = version(); + version = Version.fromEncodedVersion(encodedVersion); + libraryLoaded.set(LibraryState.LOADED); return; } @@ -142,6 +150,10 @@ } } + public static Version rocksdbVersion() { + return version; + } + /** * Private constructor. * @@ -297,9 +309,12 @@ db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -319,12 +334,63 @@ throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. - Options options = new Options(); + final Options options = new Options(); return openReadOnly(options, path); } /** * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. 
+ * + * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically. + * + * @param options {@link Options} instance. + * @param path the path to the RocksDB. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(final Options options, final String path) + throws RocksDBException { + return openReadOnly(options, path, false); + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. + * + * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically. + * + * @param options {@link Options} instance. + * @param path the path to the RocksDB. + * @param errorIfWalFileExists true to raise an error when opening the db + * if a Write Ahead Log file exists, false otherwise. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(final Options options, final String path, + final boolean errorIfWalFileExists) throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. 
+ final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path, errorIfWalFileExists)); + db.storeOptionsInstance(options); + return db; + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the default * options. * @@ -345,8 +411,7 @@ // This allows to use the rocksjni default Options instead of // the c++ one. final DBOptions options = new DBOptions(); - return openReadOnly(options, path, columnFamilyDescriptors, - columnFamilyHandles); + return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); } /** @@ -354,26 +419,27 @@ * Read-Only mode given the path to the database using the specified * options and db path. * - * Options instance *should* not be disposed before all DBs using this options - * instance have been closed. If user doesn't call options dispose explicitly, - * then this options instance will be GC'd automatically. + *

      This open method allows to open RocksDB using a subset of available + * column families

      + *

      Options instance *should* not be disposed before all DBs using this + * options instance have been closed. If user doesn't call options dispose + * explicitly,then this options instance will be GC'd automatically.

      * - * @param options {@link Options} instance. + * @param options {@link DBOptions} instance. * @param path the path to the RocksDB. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public static RocksDB openReadOnly(final Options options, final String path) - throws RocksDBException { - // when non-default Options is used, keeping an Options reference - // in RocksDB can prevent Java to GC during the life-time of - // the currently-created RocksDB. - final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path)); - db.storeOptionsInstance(options); - return db; + public static RocksDB openReadOnly(final DBOptions options, final String path, + final List columnFamilyDescriptors, + final List columnFamilyHandles) throws RocksDBException { + return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); } /** @@ -392,6 +458,8 @@ * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. + * @param errorIfWalFileExists true to raise an error when opening the db + * if a Write Ahead Log file exists, false otherwise. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. 
* @@ -400,7 +468,7 @@ */ public static RocksDB openReadOnly(final DBOptions options, final String path, final List columnFamilyDescriptors, - final List columnFamilyHandles) + final List columnFamilyHandles, final boolean errorIfWalFileExists) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of @@ -415,15 +483,114 @@ cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; } - final long[] handles = openROnly(options.nativeHandle_, path, cfNames, - cfOptionHandles); + final long[] handles = + openROnly(options.nativeHandle_, path, cfNames, cfOptionHandles, errorIfWalFileExists); final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + + return db; + } + + /** + * Open DB as secondary instance with only the default column family. + * + * The secondary instance can dynamically tail the MANIFEST of + * a primary that must have already been created. User can call + * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up + * with primary (WAL tailing is NOT supported now) whenever the user feels + * necessary. Column families created by the primary after the secondary + * instance starts are currently ignored by the secondary instance. + * Column families opened by secondary and dropped by the primary will be + * dropped by secondary as well. However the user of the secondary instance + * can still access the data of such dropped column family as long as they + * do not destroy the corresponding column family handle. + * WAL tailing is not supported at present, but will arrive soon. 
+ * + * @param options the options to open the secondary instance. + * @param path the path to the primary RocksDB instance. + * @param secondaryPath points to a directory where the secondary instance + * stores its info log + * + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openAsSecondary(final Options options, final String path, + final String secondaryPath) throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + final RocksDB db = new RocksDB(openAsSecondary(options.nativeHandle_, path, secondaryPath)); + db.storeOptionsInstance(options); + return db; + } + + /** + * Open DB as secondary instance with column families. + * You can open a subset of column families in secondary mode. + * + * The secondary instance can dynamically tail the MANIFEST of + * a primary that must have already been created. User can call + * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up + * with primary (WAL tailing is NOT supported now) whenever the user feels + * necessary. Column families created by the primary after the secondary + * instance starts are currently ignored by the secondary instance. + * Column families opened by secondary and dropped by the primary will be + * dropped by secondary as well. However the user of the secondary instance + * can still access the data of such dropped column family as long as they + * do not destroy the corresponding column family handle. + * WAL tailing is not supported at present, but will arrive soon. + * + * @param options the options to open the secondary instance. + * @param path the path to the primary RocksDB instance. 
+ * @param secondaryPath points to a directory where the secondary instance + * stores its info log. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openAsSecondary(final DBOptions options, final String path, + final String secondaryPath, final List columnFamilyDescriptors, + final List columnFamilyHandles) throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + + final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; + final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; + for (int i = 0; i < columnFamilyDescriptors.size(); i++) { + final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors.get(i); + cfNames[i] = cfDescriptor.getName(); + cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; + } + + final long[] handles = + openAsSecondary(options.nativeHandle_, path, secondaryPath, cfNames, cfOptionHandles); + final RocksDB db = new RocksDB(handles[0]); + db.storeOptionsInstance(options); + + for (int i = 1; i < handles.length; i++) { + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); + } + + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -441,6 +608,11 @@ * @throws RocksDBException if an error occurs whilst closing. 
*/ public void closeE() throws RocksDBException { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -463,6 +635,11 @@ */ @Override public void close() { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -505,10 +682,12 @@ public ColumnFamilyHandle createColumnFamily( final ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException { - return new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_, - columnFamilyDescriptor.getName(), - columnFamilyDescriptor.getName().length, - columnFamilyDescriptor.getOptions().nativeHandle_)); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, + createColumnFamily(nativeHandle_, columnFamilyDescriptor.getName(), + columnFamilyDescriptor.getName().length, + columnFamilyDescriptor.getOptions().nativeHandle_)); + ownedColumnFamilyHandles.add(columnFamilyHandle); + return columnFamilyHandle; } /** @@ -532,8 +711,10 @@ final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -563,8 +744,10 @@ final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + 
columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -597,7 +780,22 @@ dropColumnFamilies(nativeHandle_, cfHandles); } - //TODO(AR) what about DestroyColumnFamilyHandle + /** + * Deletes native column family handle of given {@link ColumnFamilyHandle} Java object + * and removes reference from {@link RocksDB#ownedColumnFamilyHandles}. + * + * @param columnFamilyHandle column family handle object. + */ + public void destroyColumnFamilyHandle(final ColumnFamilyHandle columnFamilyHandle) { + for (int i = 0; i < ownedColumnFamilyHandles.size(); ++i) { + final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i); + if (ownedHandle.equals(columnFamilyHandle)) { + columnFamilyHandle.close(); + ownedColumnFamilyHandles.remove(i); + return; + } + } + } /** * Set the database entry for "key" to "value". @@ -2020,8 +2218,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2080,8 +2278,8 @@ } final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2119,8 +2317,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new 
int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2180,8 +2378,8 @@ } final byte[][] keysArray = keys.toArray(new byte[0][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2217,8 +2415,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2263,8 +2461,8 @@ } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2290,8 +2488,8 @@ assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2336,8 +2534,8 @@ } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; + final int[] keyOffsets = new int[keysArray.length]; + final int[] keyLengths = new 
int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } @@ -2347,8 +2545,158 @@ } /** + * Fetches a list of values for the given list of keys, all from the default column family. + * + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @return list of number of bytes in DB for each requested key + * this can be more than the size of the corresponding buffer; then the buffer will be filled + * with the appropriate truncation of the database value. + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys and passed values + * do not match. + */ + public List multiGetByteBuffers( + final List keys, final List values) throws RocksDBException { + final ReadOptions readOptions = new ReadOptions(); + final List columnFamilyHandleList = new ArrayList<>(1); + columnFamilyHandleList.add(getDefaultColumnFamily()); + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } + + /** + * Fetches a list of values for the given list of keys, all from the default column family. + * + * @param readOptions Read options + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys and passed values + * do not match. + */ + public List multiGetByteBuffers(final ReadOptions readOptions, + final List keys, final List values) throws RocksDBException { + final List columnFamilyHandleList = new ArrayList<>(1); + columnFamilyHandleList.add(getDefaultColumnFamily()); + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } + + /** + * Fetches a list of values for the given list of keys. + *

      + * Note: Every key needs to have a related column family name in + * {@code columnFamilyHandleList}. + *

      + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys, passed values and + * passed column family handles do not match. + */ + public List multiGetByteBuffers( + final List columnFamilyHandleList, final List keys, + final List values) throws RocksDBException { + final ReadOptions readOptions = new ReadOptions(); + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } + + /** + * Fetches a list of values for the given list of keys. + *

      + * Note: Every key needs to have a related column family name in + * {@code columnFamilyHandleList}. + *

      + * + * @param readOptions Read options + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys list of keys for which values need to be retrieved. + * @param values list of buffers to return retrieved values in + * @throws RocksDBException if error happens in underlying native library. + * @throws IllegalArgumentException thrown if the number of passed keys, passed values and + * passed column family handles do not match. + */ + public List multiGetByteBuffers(final ReadOptions readOptions, + final List columnFamilyHandleList, final List keys, + final List values) throws RocksDBException { + assert (keys.size() != 0); + + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size() != columnFamilyHandleList.size() && columnFamilyHandleList.size() > 1) { + throw new IllegalArgumentException( + "Wrong number of ColumnFamilyHandle(s) supplied. Provide 0, 1, or as many as there are key/value(s)"); + } + + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. 
+ if (values.size() != keys.size()) { + throw new IllegalArgumentException("For each key there must be a corresponding value."); + } + + // TODO (AP) support indirect buffers + for (final ByteBuffer key : keys) { + if (!key.isDirect()) { + throw new IllegalArgumentException("All key buffers must be direct byte buffers"); + } + } + + // TODO (AP) support indirect buffers, though probably via a less efficient code path + for (final ByteBuffer value : values) { + if (!value.isDirect()) { + throw new IllegalArgumentException("All value buffers must be direct byte buffers"); + } + } + + final int numCFHandles = columnFamilyHandleList.size(); + final long[] cfHandles = new long[numCFHandles]; + for (int i = 0; i < numCFHandles; i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + + final int numValues = keys.size(); + + final ByteBuffer[] keysArray = keys.toArray(new ByteBuffer[0]); + final int[] keyOffsets = new int[numValues]; + final int[] keyLengths = new int[numValues]; + for (int i = 0; i < numValues; i++) { + // TODO (AP) add keysArray[i].arrayOffset() if the buffer is indirect + // TODO (AP) because in that case we have to pass the array directly, + // so that the JNI C++ code will not know to compensate for the array offset + keyOffsets[i] = keysArray[i].position(); + keyLengths[i] = keysArray[i].limit(); + } + final ByteBuffer[] valuesArray = values.toArray(new ByteBuffer[0]); + final int[] valuesSizeArray = new int[numValues]; + final Status[] statusArray = new Status[numValues]; + + multiGet(nativeHandle_, readOptions.nativeHandle_, cfHandles, keysArray, keyOffsets, keyLengths, + valuesArray, valuesSizeArray, statusArray); + + final List results = new ArrayList<>(); + for (int i = 0; i < numValues; i++) { + final Status status = statusArray[i]; + if (status.getCode() == Status.Code.Ok) { + final ByteBuffer value = valuesArray[i]; + value.position(Math.min(valuesSizeArray[i], value.capacity())); + value.flip(); // prepare for read out + 
results.add(new ByteBufferGetStatus(status, valuesSizeArray[i], value)); + } else { + results.add(new ByteBufferGetStatus(status)); + } + } + + return results; + } + + /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2372,7 +2720,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2401,7 +2751,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2428,7 +2780,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. 
* * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2460,7 +2814,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2487,7 +2843,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2519,7 +2877,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2548,7 +2908,9 @@ /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. 
@@ -2602,6 +2964,159 @@ } /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. + * + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. + */ + public boolean keyMayExist(final ByteBuffer key) { + return keyMayExist(null, (ReadOptions) null, key); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. + * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. + */ + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) { + return keyMayExist(columnFamilyHandle, (ReadOptions) null, key); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. + * + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. 
+ */ + public boolean keyMayExist(final ReadOptions readOptions, final ByteBuffer key) { + return keyMayExist(null, readOptions, key); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. + * + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist(final ByteBuffer key, final ByteBuffer value) { + return keyMayExist(null, null, key, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. 
+ * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist( + final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value) { + return keyMayExist(columnFamilyHandle, null, key, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. + * + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist( + final ReadOptions readOptions, final ByteBuffer key, final ByteBuffer value) { + return keyMayExist(null, readOptions, key, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a false negative. 
+ * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @return false if the key definitely does not exist in the database, + * otherwise true. + */ + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final ByteBuffer key) { + assert key != null : "key ByteBuffer parameter cannot be null"; + assert key.isDirect() : "key parameter must be a direct ByteBuffer"; + return keyMayExistDirect(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.limit()); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns {@link KeyMayExist.KeyMayExistEnum#kNotExist}, + * otherwise if it can with best effort retreive the value, it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithValue} otherwise it returns {@link + * KeyMayExist.KeyMayExistEnum#kExistsWithoutValue}. The choice not to return a value which might + * exist is at the discretion of the implementation; the only guarantee is that {@link + * KeyMayExist.KeyMayExistEnum#kNotExist} is an assurance that the key does not exist. 
+ * + * @param columnFamilyHandle the {@link ColumnFamilyHandle} to look for the key in + * @param readOptions the {@link ReadOptions} to use when reading the key/value + * @param key bytebuffer containing the value of the key + * @param value bytebuffer which will receive a value if the key exists and a value is known + * @return a {@link KeyMayExist} object reporting if key may exist and if a value is provided + */ + public KeyMayExist keyMayExist(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final ByteBuffer key, final ByteBuffer value) { + assert key != null : "key ByteBuffer parameter cannot be null"; + assert key.isDirect() : "key parameter must be a direct ByteBuffer"; + assert value + != null + : "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method"; + assert value.isDirect() : "value parameter must be a direct ByteBuffer"; + + final int[] result = keyMayExistDirectFoundValue(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.remaining(), + value, value.position(), value.remaining()); + final int valueLength = result[1]; + value.limit(value.position() + Math.min(valueLength, value.remaining())); + return new KeyMayExist(KeyMayExist.KeyMayExistEnum.values()[result[0]], valueLength); + } + + /** *

      Return a heap-allocated iterator over the contents of the * database. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator @@ -2636,8 +3151,8 @@ } /** - *

      Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *

      Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).

      * @@ -2656,8 +3171,8 @@ } /** - *

      Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *

      Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).

      * @@ -3376,9 +3891,52 @@ /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, final MutableColumnFamilyOptions mutableColumnFamilyOptions) throws RocksDBException { - setOptions(nativeHandle_, columnFamilyHandle.nativeHandle_, - mutableColumnFamilyOptions.getKeys(), - mutableColumnFamilyOptions.getValues()); + setOptions(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues()); + } + + /** + * Get the options for the column family handle + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family. + * + * @return the options parsed from the options string return by RocksDB + * + * @throws RocksDBException if an error occurs while getting the options string, or parsing the + * resulting options string into options + */ + public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { + String optionsString = getOptions( + nativeHandle_, columnFamilyHandle == null ? 
0 : columnFamilyHandle.nativeHandle_); + return MutableColumnFamilyOptions.parse(optionsString, true); + } + + /** + * Default column family options + * + * @return the options parsed from the options string return by RocksDB + * + * @throws RocksDBException if an error occurs while getting the options string, or parsing the + * resulting options string into options + */ + public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions() + throws RocksDBException { + return getOptions(null); + } + + /** + * Get the database options + * + * @return the DB options parsed from the options string return by RocksDB + * + * @throws RocksDBException if an error occurs while getting the options string, or parsing the + * resulting options string into options + */ + public MutableDBOptions.MutableDBOptionsBuilder getDBOptions() throws RocksDBException { + String optionsString = getDBOptions(nativeHandle_); + return MutableDBOptions.parse(optionsString, true); } /** @@ -3480,6 +4038,17 @@ } /** + * This function will cancel all currently running background processes. + * + * @param wait if true, wait for all background work to be cancelled before + * returning. + * + */ + public void cancelAllBackgroundWork(boolean wait) { + cancelAllBackgroundWork(nativeHandle_, wait); + } + + /** * This function will wait until all currently running background processes * finish. After it returns, no background process will be run until * {@link #continueBackgroundWork()} is called @@ -3914,7 +4483,7 @@ * * @return the column family metadata */ - public ColumnFamilyMetaData GetColumnFamilyMetaData() { + public ColumnFamilyMetaData getColumnFamilyMetaData() { return getColumnFamilyMetaData(null); } @@ -4146,6 +4715,25 @@ } /** + * Make the secondary instance catch up with the primary by tailing and + * replaying the MANIFEST and WAL of the primary. 
+ * Column families created by the primary after the secondary instance starts + * will be ignored unless the secondary instance closes and restarts with the + * newly created column families. + * Column families that exist before secondary instance starts and dropped by + * the primary afterwards will be marked as dropped. However, as long as the + * secondary instance does not delete the corresponding column family + * handles, the data of the column family is still accessible to the + * secondary. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void tryCatchUpWithPrimary() throws RocksDBException { + tryCatchUpWithPrimary(nativeHandle_); + } + + /** * Delete files in multiple ranges at once. * Delete files in a lot of ranges one at a time can be slow, use this API for * better performance in that case. @@ -4212,7 +4800,7 @@ return rangeSliceHandles; } - protected void storeOptionsInstance(DBOptionsInterface options) { + protected void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } @@ -4248,8 +4836,8 @@ final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; - private native static long openROnly(final long optionsHandle, - final String path) throws RocksDBException; + private native static long openROnly(final long optionsHandle, final String path, + final boolean errorIfWalFileExists) throws RocksDBException; /** * @param optionsHandle Native handle pointing to an Options object @@ -4263,10 +4851,16 @@ * * @throws RocksDBException thrown if the database could not be opened */ - private native static long[] openROnly(final long optionsHandle, - final String path, final byte[][] columnFamilyNames, - final long[] columnFamilyOptions - ) throws RocksDBException; + private native static long[] openROnly(final long optionsHandle, final String path, + final byte[][] columnFamilyNames, final long[] columnFamilyOptions, + final 
boolean errorIfWalFileExists) throws RocksDBException; + + private native static long openAsSecondary(final long optionsHandle, final String path, + final String secondaryPath) throws RocksDBException; + + private native static long[] openAsSecondary(final long optionsHandle, final String path, + final String secondaryPath, final byte[][] columnFamilyNames, + final long[] columnFamilyOptions) throws RocksDBException; @Override protected native void disposeInternal(final long handle); @@ -4287,7 +4881,6 @@ final long handle, final long cfHandle) throws RocksDBException; private native void dropColumnFamilies(final long handle, final long[] cfHandles) throws RocksDBException; - //TODO(AR) best way to express DestroyColumnFamilyHandle? ...maybe in ColumnFamilyHandle? private native void put(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, int valueLength) throws RocksDBException; @@ -4397,6 +4990,12 @@ private native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, final long[] columnFamilyHandles); + + private native void multiGet(final long dbHandle, final long rOptHandle, + final long[] columnFamilyHandles, final ByteBuffer[] keysArray, final int[] keyOffsets, + final int[] keyLengths, final ByteBuffer[] valuesArray, final int[] valuesSizeArray, + final Status[] statusArray); + private native boolean keyMayExist( final long handle, final long cfHandle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); @@ -4426,6 +5025,11 @@ private native int getDirect(long handle, long readOptHandle, ByteBuffer key, int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; + private native boolean keyMayExistDirect(final long handle, final long cfHhandle, + final long readOptHandle, final ByteBuffer key, final int keyOffset, 
final int keyLength); + private native int[] keyMayExistDirectFoundValue(final long handle, final long cfHhandle, + final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength, + final ByteBuffer value, final int valueOffset, final int valueLength); private native void deleteDirect(long handle, long optHandle, ByteBuffer key, int keyOffset, int keyLength, long cfHandle) throws RocksDBException; private native long getLongProperty(final long nativeHandle, @@ -4438,9 +5042,9 @@ private native long[] getApproximateSizes(final long nativeHandle, final long columnFamilyHandle, final long[] rangeSliceHandles, final byte includeFlags); - private final native long[] getApproximateMemTableStats( - final long nativeHandle, final long columnFamilyHandle, - final long rangeStartSliceHandle, final long rangeLimitSliceHandle); + private native long[] getApproximateMemTableStats(final long nativeHandle, + final long columnFamilyHandle, final long rangeStartSliceHandle, + final long rangeLimitSliceHandle); private native void compactRange(final long handle, /* @Nullable */ final byte[] begin, final int beginLen, /* @Nullable */ final byte[] end, final int endLen, @@ -4448,8 +5052,10 @@ throws RocksDBException; private native void setOptions(final long handle, final long cfHandle, final String[] keys, final String[] values) throws RocksDBException; + private native String getOptions(final long handle, final long cfHandle); private native void setDBOptions(final long handle, final String[] keys, final String[] values) throws RocksDBException; + private native String getDBOptions(final long handle); private native String[] compactFiles(final long handle, final long compactionOptionsHandle, final long columnFamilyHandle, @@ -4457,6 +5063,8 @@ final int outputLevel, final int outputPathId, final long compactionJobInfoHandle) throws RocksDBException; + private native void cancelAllBackgroundWork(final long handle, + final boolean wait); private native void 
pauseBackgroundWork(final long handle) throws RocksDBException; private native void continueBackgroundWork(final long handle) @@ -4512,11 +5120,54 @@ private native void startTrace(final long handle, final long maxTraceFileSize, final long traceWriterHandle) throws RocksDBException; private native void endTrace(final long handle) throws RocksDBException; + private native void tryCatchUpWithPrimary(final long handle) throws RocksDBException; private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, boolean include_end) throws RocksDBException; private native static void destroyDB(final String path, final long optionsHandle) throws RocksDBException; - protected DBOptionsInterface options_; + private native static int version(); + + protected DBOptionsInterface options_; + private static Version version; + + public static class Version { + private final byte major; + private final byte minor; + private final byte patch; + + public Version(final byte major, final byte minor, final byte patch) { + this.major = major; + this.minor = minor; + this.patch = patch; + } + + public int getMajor() { + return major; + } + + public int getMinor() { + return minor; + } + + public int getPatch() { + return patch; + } + + @Override + public String toString() { + return getMajor() + "." + getMinor() + "." 
+ getPatch(); + } + + private static Version fromEncodedVersion(int encodedVersion) { + final byte patch = (byte) (encodedVersion & 0xff); + encodedVersion >>= 8; + final byte minor = (byte) (encodedVersion & 0xff); + encodedVersion >>= 8; + final byte major = (byte) (encodedVersion & 0xff); + + return new Version(major, minor, patch); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -102,6 +102,7 @@ @Override final native void seekToLast0(long handle); @Override final native void next0(long handle); @Override final native void prev0(long handle); + @Override final native void refresh0(long handle); @Override final native void seek0(long handle, byte[] target, int targetLen); @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); @Override diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -114,4 +114,14 @@ * native library. */ void status() throws RocksDBException; + + /** + *

      If supported, renew the iterator to represent the latest state. The iterator will be + * invalidated after the call. Not supported if {@link ReadOptions#setSnapshot(Snapshot)} was + * specified when creating the iterator.

      + * + * @throws RocksDBException thrown if the operation is not supported or an error happens in the + * underlying native library + */ + void refresh() throws RocksDBException; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java 2025-05-19 16:14:27.000000000 +0000 @@ -38,4 +38,8 @@ } protected abstract void disposeInternal(final long handle); + + public long getNativeHandle() { + return nativeHandle_; + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SanityLevel.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,41 @@ +package org.rocksdb; + +public enum SanityLevel { + NONE((byte) 0x0), + LOOSELY_COMPATIBLE((byte) 0x1), + EXACT_MATCH((byte) 0xFF); + + private final byte value; + + SanityLevel(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value. + */ + byte getValue() { + return value; + } + + /** + * Get the SanityLevel from the internal representation value. + * + * @param value the internal representation value. 
+ * + * @return the SanityLevel + * + * @throws IllegalArgumentException if the value does not match a + * SanityLevel + */ + static SanityLevel fromValue(final byte value) throws IllegalArgumentException { + for (final SanityLevel level : SanityLevel.values()) { + if (level.value == value) { + return level; + } + } + throw new IllegalArgumentException("Unknown value for SanityLevel: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileReaderIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -102,6 +102,7 @@ @Override final native void seekToLast0(long handle); @Override final native void next0(long handle); @Override final native void prev0(long handle); + @Override final native void refresh0(long handle) throws RocksDBException; @Override final native void seek0(long handle, byte[] target, int targetLen); @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); @Override final native void status0(long handle) throws RocksDBException; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java 2025-05-19 16:14:27.000000000 +0000 @@ -244,6 +244,7 @@ /** * Return the current file size. * + * @return the current file size. 
* @throws RocksDBException thrown if error happens in underlying * native library. */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFactory.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,15 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Handle to factory for SstPartitioner. It is used in {@link ColumnFamilyOptions} + */ +public abstract class SstPartitionerFactory extends RocksObject { + protected SstPartitionerFactory(final long nativeHandle) { + super(nativeHandle); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Fixed prefix factory. 
It partitions SST files using fixed prefix of the key. + */ +public class SstPartitionerFixedPrefixFactory extends SstPartitionerFactory { + public SstPartitionerFixedPrefixFactory(long prefixLength) { + super(newSstPartitionerFixedPrefixFactory0(prefixLength)); + } + + private native static long newSstPartitionerFixedPrefixFactory0(long prefixLength); + + @Override protected final native void disposeInternal(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Status.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Objects; + /** * Represents the status returned by a function call in RocksDB. * @@ -135,4 +137,19 @@ return value; } } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Status status = (Status) o; + return code == status.code && subCode == status.subCode && Objects.equals(state, status.state); + } + + @Override + public int hashCode() { + return Objects.hash(code, subCode, state); + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,11 @@ super(newSharedStringAppendOperator(delim)); } + public StringAppendOperator(String delim) { 
+ super(newSharedStringAppendOperator(delim)); + } + private native static long newSharedStringAppendOperator(final char delim); + private native static long newSharedStringAppendOperator(final String delim); @Override protected final native void disposeInternal(final long handle); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationBriefInfo { + private final String dbName; + private final String columnFamilyName; + private final String filePath; + private final int jobId; + private final TableFileCreationReason reason; + + /** + * Access is private as this will only be constructed from + * C++ via JNI, either directly of via + * {@link TableFileCreationInfo#TableFileCreationInfo(long, TableProperties, Status, String, + * String, String, int, byte)}. 
+ * + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationBriefInfo(final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + this.dbName = dbName; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.jobId = jobId; + this.reason = TableFileCreationReason.fromValue(tableFileCreationReasonValue); + } + + /** + * Get the name of the database where the file was created. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the name of the column family where the file was created. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the created file. + * + * @return the path. + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job (which could be flush or compaction) that + * created the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the reason for creating the table. + * + * @return the reason for creating the table. 
+ */ + public TableFileCreationReason getReason() { + return reason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationBriefInfo that = (TableFileCreationBriefInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) && reason == that.reason; + } + + @Override + public int hashCode() { + return Objects.hash(dbName, columnFamilyName, filePath, jobId, reason); + } + + @Override + public String toString() { + return "TableFileCreationBriefInfo{" + + "dbName='" + dbName + '\'' + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + ", reason=" + reason + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationInfo extends TableFileCreationBriefInfo { + private final long fileSize; + private final TableProperties tableProperties; + private final Status status; + + /** + * Access is protected as this will only be constructed from + * C++ via JNI. 
+ * + * @param fileSize the size of the table file + * @param tableProperties the properties of the table file + * @param status the status of the creation operation + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationInfo(final long fileSize, final TableProperties tableProperties, + final Status status, final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + super(dbName, columnFamilyName, filePath, jobId, tableFileCreationReasonValue); + this.fileSize = fileSize; + this.tableProperties = tableProperties; + this.status = status; + } + + /** + * Get the size of the file. + * + * @return the size. + */ + public long getFileSize() { + return fileSize; + } + + /** + * Get the detailed properties of the created file. + * + * @return the properties. + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the status indicating whether the creation was successful or not. + * + * @return the status. 
+ */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationInfo that = (TableFileCreationInfo) o; + return fileSize == that.fileSize && Objects.equals(tableProperties, that.tableProperties) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(fileSize, tableProperties, status); + } + + @Override + public String toString() { + return "TableFileCreationInfo{" + + "fileSize=" + fileSize + ", tableProperties=" + tableProperties + ", status=" + status + + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileCreationReason.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum TableFileCreationReason { + FLUSH((byte) 0x00), + COMPACTION((byte) 0x01), + RECOVERY((byte) 0x02), + MISC((byte) 0x03); + + private final byte value; + + TableFileCreationReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the TableFileCreationReason from the internal representation value. + * + * @return the table file creation reason. 
+ * + * @throws IllegalArgumentException if the value is unknown. + */ + static TableFileCreationReason fromValue(final byte value) { + for (final TableFileCreationReason tableFileCreationReason : TableFileCreationReason.values()) { + if (tableFileCreationReason.value == value) { + return tableFileCreationReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for TableFileCreationReason: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileDeletionInfo { + private final String dbName; + private final String filePath; + private final int jobId; + private final Status status; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + TableFileDeletionInfo( + final String dbName, final String filePath, final int jobId, final Status status) { + this.dbName = dbName; + this.filePath = filePath; + this.jobId = jobId; + this.status = status; + } + + /** + * Get the name of the database where the file was deleted. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the path to the deleted file. + * + * @return the path. 
+ */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job which deleted the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the status indicating whether the deletion was successful or not. + * + * @return the status + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileDeletionInfo that = (TableFileDeletionInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(filePath, that.filePath) && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(dbName, filePath, jobId, status); + } + + @Override + public String toString() { + return "TableFileDeletionInfo{" + + "dbName='" + dbName + '\'' + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + + ", status=" + status + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java 2025-05-19 16:14:27.000000000 +0000 @@ -1,7 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
package org.rocksdb; +import java.util.Arrays; import java.util.Map; +import java.util.Objects; /** * TableProperties contains read-only properties of its associated @@ -27,6 +29,9 @@ private final long columnFamilyId; private final long creationTime; private final long oldestKeyTime; + private final long slowCompressionEstimatedDataSize; + private final long fastCompressionEstimatedDataSize; + private final long externalSstFileGlobalSeqnoOffset; private final byte[] columnFamilyName; private final String filterPolicyName; private final String comparatorName; @@ -36,27 +41,24 @@ private final String compressionName; private final Map userCollectedProperties; private final Map readableProperties; - private final Map propertiesOffsets; /** - * Access is private as this will only be constructed from - * C++ via JNI. + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. */ - private TableProperties(final long dataSize, final long indexSize, - final long indexPartitions, final long topLevelIndexSize, - final long indexKeyIsUserKey, final long indexValueIsDeltaEncoded, - final long filterSize, final long rawKeySize, final long rawValueSize, - final long numDataBlocks, final long numEntries, final long numDeletions, - final long numMergeOperands, final long numRangeDeletions, - final long formatVersion, final long fixedKeyLen, - final long columnFamilyId, final long creationTime, - final long oldestKeyTime, final byte[] columnFamilyName, - final String filterPolicyName, final String comparatorName, - final String mergeOperatorName, final String prefixExtractorName, - final String propertyCollectorsNames, final String compressionName, - final Map userCollectedProperties, - final Map readableProperties, - final Map propertiesOffsets) { + TableProperties(final long dataSize, final long indexSize, final long indexPartitions, + final long topLevelIndexSize, final long indexKeyIsUserKey, + final long indexValueIsDeltaEncoded, final 
long filterSize, final long rawKeySize, + final long rawValueSize, final long numDataBlocks, final long numEntries, + final long numDeletions, final long numMergeOperands, final long numRangeDeletions, + final long formatVersion, final long fixedKeyLen, final long columnFamilyId, + final long creationTime, final long oldestKeyTime, + final long slowCompressionEstimatedDataSize, final long fastCompressionEstimatedDataSize, + final long externalSstFileGlobalSeqnoOffset, final byte[] columnFamilyName, + final String filterPolicyName, final String comparatorName, final String mergeOperatorName, + final String prefixExtractorName, final String propertyCollectorsNames, + final String compressionName, final Map userCollectedProperties, + final Map readableProperties) { this.dataSize = dataSize; this.indexSize = indexSize; this.indexPartitions = indexPartitions; @@ -76,6 +78,9 @@ this.columnFamilyId = columnFamilyId; this.creationTime = creationTime; this.oldestKeyTime = oldestKeyTime; + this.slowCompressionEstimatedDataSize = slowCompressionEstimatedDataSize; + this.fastCompressionEstimatedDataSize = fastCompressionEstimatedDataSize; + this.externalSstFileGlobalSeqnoOffset = externalSstFileGlobalSeqnoOffset; this.columnFamilyName = columnFamilyName; this.filterPolicyName = filterPolicyName; this.comparatorName = comparatorName; @@ -85,7 +90,6 @@ this.compressionName = compressionName; this.userCollectedProperties = userCollectedProperties; this.readableProperties = readableProperties; - this.propertiesOffsets = propertiesOffsets; } /** @@ -269,6 +273,26 @@ } /** + * Get the estimated size of data blocks compressed with a relatively slower + * compression algorithm. + * + * @return 0 means unknown, otherwise the timestamp. + */ + public long getSlowCompressionEstimatedDataSize() { + return slowCompressionEstimatedDataSize; + } + + /** + * Get the estimated size of data blocks compressed with a relatively faster + * compression algorithm. 
+ * + * @return 0 means unknown, otherwise the timestamp. + */ + public long getFastCompressionEstimatedDataSize() { + return fastCompressionEstimatedDataSize; + } + + /** * Get the name of the column family with which this * SST file is associated. * @@ -355,12 +379,48 @@ return readableProperties; } - /** - * The offset of the value of each property in the file. - * - * @return the offset of each property. - */ - public Map getPropertiesOffsets() { - return propertiesOffsets; + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableProperties that = (TableProperties) o; + return dataSize == that.dataSize && indexSize == that.indexSize + && indexPartitions == that.indexPartitions && topLevelIndexSize == that.topLevelIndexSize + && indexKeyIsUserKey == that.indexKeyIsUserKey + && indexValueIsDeltaEncoded == that.indexValueIsDeltaEncoded + && filterSize == that.filterSize && rawKeySize == that.rawKeySize + && rawValueSize == that.rawValueSize && numDataBlocks == that.numDataBlocks + && numEntries == that.numEntries && numDeletions == that.numDeletions + && numMergeOperands == that.numMergeOperands && numRangeDeletions == that.numRangeDeletions + && formatVersion == that.formatVersion && fixedKeyLen == that.fixedKeyLen + && columnFamilyId == that.columnFamilyId && creationTime == that.creationTime + && oldestKeyTime == that.oldestKeyTime + && slowCompressionEstimatedDataSize == that.slowCompressionEstimatedDataSize + && fastCompressionEstimatedDataSize == that.fastCompressionEstimatedDataSize + && externalSstFileGlobalSeqnoOffset == that.externalSstFileGlobalSeqnoOffset + && Arrays.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filterPolicyName, that.filterPolicyName) + && Objects.equals(comparatorName, that.comparatorName) + && Objects.equals(mergeOperatorName, that.mergeOperatorName) + && Objects.equals(prefixExtractorName, that.prefixExtractorName) + 
&& Objects.equals(propertyCollectorsNames, that.propertyCollectorsNames) + && Objects.equals(compressionName, that.compressionName) + && Objects.equals(userCollectedProperties, that.userCollectedProperties) + && Objects.equals(readableProperties, that.readableProperties); + } + + @Override + public int hashCode() { + int result = Objects.hash(dataSize, indexSize, indexPartitions, topLevelIndexSize, + indexKeyIsUserKey, indexValueIsDeltaEncoded, filterSize, rawKeySize, rawValueSize, + numDataBlocks, numEntries, numDeletions, numMergeOperands, numRangeDeletions, formatVersion, + fixedKeyLen, columnFamilyId, creationTime, oldestKeyTime, slowCompressionEstimatedDataSize, + fastCompressionEstimatedDataSize, externalSstFileGlobalSeqnoOffset, filterPolicyName, + comparatorName, mergeOperatorName, prefixExtractorName, propertyCollectorsNames, + compressionName, userCollectedProperties, readableProperties); + result = 31 * result + Arrays.hashCode(columnFamilyName); + return result; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TickerType.java 2025-05-19 16:14:27.000000000 +0000 @@ -722,6 +722,80 @@ */ TXN_GET_TRY_AGAIN((byte) -0x0D), + /** + * # of files marked as trash by delete scheduler + */ + FILES_MARKED_TRASH((byte) -0x0E), + + /** + * # of files deleted immediately by delete scheduler + */ + FILES_DELETED_IMMEDIATELY((byte) -0x0f), + + /** + * Compaction read and write statistics broken down by CompactionReason + */ + COMPACT_READ_BYTES_MARKED((byte) -0x10), + COMPACT_READ_BYTES_PERIODIC((byte) -0x11), + COMPACT_READ_BYTES_TTL((byte) -0x12), + COMPACT_WRITE_BYTES_MARKED((byte) -0x13), + COMPACT_WRITE_BYTES_PERIODIC((byte) 
-0x14), + COMPACT_WRITE_BYTES_TTL((byte) -0x15), + + /** + * DB error handler statistics + */ + ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16), + ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18), + ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B), + + /** + * Bytes of raw data (payload) found on memtable at flush time. + * Contains the sum of garbage payload (bytes that are discarded + * at flush time) and useful payload (bytes of data that will + * eventually be written to SSTable). + */ + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH((byte) -0x1C), + /** + * Outdated bytes of data present on memtable at flush time. + */ + MEMTABLE_GARBAGE_BYTES_AT_FLUSH((byte) -0x1D), + + /** + * Number of secondary cache hits + */ + SECONDARY_CACHE_HITS((byte) -0x1E), + + /** + * Bytes read by `VerifyChecksum()` and `VerifyFileChecksums()` APIs. + */ + VERIFY_CHECKSUM_READ_BYTES((byte) -0x1F), + + /** + * Bytes read/written while creating backups + */ + BACKUP_READ_BYTES((byte) -0x20), + BACKUP_WRITE_BYTES((byte) -0x21), + + /** + * Remote compaction read/write statistics + */ + REMOTE_COMPACT_READ_BYTES((byte) -0x22), + REMOTE_COMPACT_WRITE_BYTES((byte) -0x23), + + /** + * Tiered storage related statistics + */ + HOT_FILE_READ_BYTES((byte) -0x24), + WARM_FILE_READ_BYTES((byte) -0x25), + COLD_FILE_READ_BYTES((byte) -0x26), + HOT_FILE_READ_COUNT((byte) -0x27), + WARM_FILE_READ_COUNT((byte) -0x28), + COLD_FILE_READ_COUNT((byte) -0x29), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,7 @@ private final long maxTraceFileSize; public TraceOptions() { - this.maxTraceFileSize = 64 * 1024 * 1024 * 1024; // 64 GB + this.maxTraceFileSize = 64L * 1024L * 1024L * 1024L; // 64 GB } public TraceOptions(final long maxTraceFileSize) { @@ -21,8 +21,8 @@ } /** - * To avoid the trace file size grows large than the storage space, - * user can set the max trace file size in Bytes. Default is 64GB + * To avoid the trace file size grows larger than the storage space, + * user can set the max trace file size in Bytes. Default is 64 GB. * * @return the max trace size */ diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/Transaction.java 2025-05-19 16:14:27.000000000 +0000 @@ -45,7 +45,7 @@ /** * If a transaction has a snapshot set, the transaction will ensure that - * any keys successfully written(or fetched via {@link #getForUpdate}) have + * any keys successfully written (or fetched via {@link #getForUpdate}) have * not been modified outside of this transaction since the time the snapshot * was set. * @@ -611,9 +611,9 @@ } /** - * Returns an iterator that will iterate on all keys in the default - * column family including both keys in the DB and uncommitted keys in this - * transaction. + * Returns an iterator that will iterate on all keys in the column family + * specified by {@code columnFamilyHandle} including both keys in the DB + * and uncommitted keys in this transaction. 
* * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read * from the DB but will NOT change which keys are read from this transaction @@ -1068,7 +1068,7 @@ * @param columnFamilyHandle The column family to delete the key/value from * @param key the specified key to be deleted. * @param assumeTracked true when it is expected that the key is already - * tracked. More specifically, it means the the key was previous tracked + * tracked. More specifically, it means the key was previously tracked * in the same savepoint, with the same exclusive flag, and at a lower * sequence number. If valid then it skips ValidateSnapshot, * throws an error otherwise. @@ -1152,7 +1152,7 @@ * @param columnFamilyHandle The column family to delete the key/value from * @param keyParts the specified key to be deleted. * @param assumeTracked true when it is expected that the key is already - * tracked. More specifically, it means the the key was previous tracked + * tracked. More specifically, it means the key was previously tracked * in the same savepoint, with the same exclusive flag, and at a lower * sequence number. If valid then it skips ValidateSnapshot, * throws an error otherwise. 
@@ -1788,11 +1788,17 @@ AWAITING_PREPARE((byte)1), PREPARED((byte)2), AWAITING_COMMIT((byte)3), - COMMITED((byte)4), + COMMITTED((byte)4), AWAITING_ROLLBACK((byte)5), ROLLEDBACK((byte)6), LOCKS_STOLEN((byte)7); + /* + * Keep old misspelled variable as alias + * Tip from https://stackoverflow.com/a/37092410/454544 + */ + public static final TransactionState COMMITED = COMMITTED; + private final byte value; TransactionState(final byte value) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -6,7 +6,6 @@ package org.rocksdb; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,7 @@ package org.rocksdb; - -interface TransactionalDB - extends AutoCloseable { - +interface TransactionalDB> extends AutoCloseable { /** * Starts a new Transaction. 
* diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java 2025-05-19 16:14:27.000000000 +0000 @@ -113,7 +113,7 @@ throws RocksDBException { if (columnFamilyDescriptors.size() != ttlValues.size()) { throw new IllegalArgumentException("There must be a ttl value per column" - + "family handle."); + + " family handle."); } final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java 2025-05-19 16:14:27.000000000 +0000 @@ -46,6 +46,7 @@ @Override final native void seekToLast0(long handle); @Override final native void next0(long handle); @Override final native void prev0(long handle); + @Override final native void refresh0(final long handle) throws RocksDBException; @Override final native void seek0(long handle, byte[] target, int targetLen); @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); @Override final native void status0(long handle) throws RocksDBException; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java 2025-05-19 16:14:27.000000000 +0000 @@ -243,7 +243,7 @@ @Override final native void singleDelete(final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; @Override - final native void removeDirect(final long handle, final ByteBuffer key, final int keyOffset, + final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; @Override final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, @@ -321,6 +321,8 @@ throws RocksDBException; public abstract void markCommit(final byte[] xid) throws RocksDBException; + public abstract void markCommitWithTimestamp(final byte[] xid, final byte[] ts) + throws RocksDBException; /** * shouldContinue is called by the underlying iterator diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,8 +39,8 @@ * @param value the value associated with the specified key. * @throws RocksDBException thrown if error happens in underlying native library. */ - void put(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value) throws RocksDBException; + void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) + throws RocksDBException; /** *

      Store the mapping "key->value" within given column @@ -50,9 +50,9 @@ * Supports direct buffer only. * @param value the value associated with the specified key. It is using position and limit. * Supports direct buffer only. - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying native library. */ - void put(ByteBuffer key, ByteBuffer value) throws RocksDBException; + void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBException; /** *

      Store the mapping "key->value" within given column @@ -64,9 +64,9 @@ * Supports direct buffer only. * @param value the value associated with the specified key. It is using position and limit. * Supports direct buffer only. - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying native library. */ - void put(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key, ByteBuffer value) + void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value) throws RocksDBException; /** @@ -90,8 +90,8 @@ * the specified key. * @throws RocksDBException thrown if error happens in underlying native library. */ - void merge(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value) throws RocksDBException; + void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) + throws RocksDBException; /** *

      If the database contains a mapping for "key", erase it. Else do nothing.

      @@ -114,7 +114,31 @@ * @throws RocksDBException thrown if error happens in underlying native library. */ @Deprecated - void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @deprecated Use {@link #delete(ByteBuffer)} + * @throws RocksDBException thrown if error happens in underlying native library. + */ + @Deprecated void remove(final ByteBuffer key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @deprecated Use {@link #delete(ColumnFamilyHandle, ByteBuffer)} + * @throws RocksDBException thrown if error happens in underlying native library. + */ + @Deprecated + void remove(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) throws RocksDBException; /** @@ -132,7 +156,28 @@ * @param key Key to delete within database * @throws RocksDBException thrown if error happens in underlying native library. */ - void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @throws RocksDBException thrown if error happens in underlying native library. + */ + void delete(final ByteBuffer key) throws RocksDBException; + + /** + *

      If column family contains a mapping for "key", erase it. Else do nothing.

      + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database. It is using position and limit. + * Supports direct buffer only. + * + * @throws RocksDBException thrown if error happens in underlying native library. + */ + void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) throws RocksDBException; /** @@ -182,27 +227,8 @@ * native library. */ @Experimental("Performance optimization for a very specific workload") - void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key) throws RocksDBException; - - /** - *

      If column family contains a mapping for "key", erase it. Else do nothing.

      - * - * @param key Key to delete within database. It is using position and limit. - * Supports direct buffer only. - * @throws RocksDBException - */ - void remove(ByteBuffer key) throws RocksDBException; - - /** - *

      If column family contains a mapping for "key", erase it. Else do nothing.

      - * - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param key Key to delete within database. It is using position and limit. - * Supports direct buffer only. - * @throws RocksDBException - */ - void remove(ColumnFamilyHandle columnFamilyHandle, ByteBuffer key) throws RocksDBException; + void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) + throws RocksDBException; /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., @@ -237,8 +263,8 @@ * Last key to delete within database (excluded) * @throws RocksDBException thrown if error happens in underlying native library. */ - void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, - byte[] endKey) throws RocksDBException; + void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] endKey) + throws RocksDBException; /** * Append a blob of arbitrary size to the records in this batch. The blob will diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java 2025-05-19 16:14:27.000000000 +0000 @@ -117,7 +117,7 @@ * as a delta and baseIterator as a base * * Updating write batch with the current key of the iterator is not safe. - * We strongly recommand users not to do it. It will invalidate the current + * We strongly recommend users not to do it. It will invalidate the current * key() and value() of the iterator. This invalidation happens even before * the write batch update finishes. The state may recover after Next() is * called. 
@@ -131,11 +131,36 @@ public RocksIterator newIteratorWithBase( final ColumnFamilyHandle columnFamilyHandle, final RocksIterator baseIterator) { - RocksIterator iterator = new RocksIterator(baseIterator.parent_, - iteratorWithBase( - nativeHandle_, columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_)); + return newIteratorWithBase(columnFamilyHandle, baseIterator, null); + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base + * + * Updating write batch with the current key of the iterator is not safe. + * We strongly recommend users not to do it. It will invalidate the current + * key() and value() of the iterator. This invalidation happens even before + * the write batch update finishes. The state may recover after Next() is + * called. + * + * @param columnFamilyHandle The column family to iterate over + * @param baseIterator The base iterator, + * e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @param readOptions the read options, or null + * @return An iterator which shows a view comprised of both the database + * point-in-time from baseIterator and modifications made in this write batch. + */ + public RocksIterator newIteratorWithBase(final ColumnFamilyHandle columnFamilyHandle, + final RocksIterator baseIterator, /* @Nullable */ final ReadOptions readOptions) { + final RocksIterator iterator = new RocksIterator(baseIterator.parent_, + iteratorWithBase(nativeHandle_, columnFamilyHandle.nativeHandle_, + baseIterator.nativeHandle_, readOptions == null ? 0 : readOptions.nativeHandle_)); + // when the iterator is deleted it will also delete the baseIterator baseIterator.disOwnNativeHandle(); + return iterator; } @@ -151,7 +176,25 @@ * point-in-timefrom baseIterator and modifications made in this write batch. 
*/ public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) { - return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator); + return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator, null); + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base. Operates on the default column + * family. + * + * @param baseIterator The base iterator, + * e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @param readOptions the read options, or null + * @return An iterator which shows a view comprised of both the database + * point-in-timefrom baseIterator and modifications made in this write batch. + */ + public RocksIterator newIteratorWithBase(final RocksIterator baseIterator, + /* @Nullable */ final ReadOptions readOptions) { + return newIteratorWithBase( + baseIterator.parent_.getDefaultColumnFamily(), baseIterator, readOptions); } /** @@ -200,7 +243,7 @@ * the results using the DB's merge operator (if the batch contains any * merge requests). * - * Setting {@link ReadOptions#setSnapshot(long, long)} will affect what is + * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is * read from the DB but will NOT change which keys are read from the batch * (the keys in this batch do not yet belong to any snapshot and will be * fetched regardless). @@ -230,7 +273,7 @@ * the results using the DB's merge operator (if the batch contains any * merge requests). * - * Setting {@link ReadOptions#setSnapshot(long, long)} will affect what is + * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is * read from the DB but will NOT change which keys are read from the batch * (the keys in this batch do not yet belong to any snapshot and will be * fetched regardless). 
@@ -275,7 +318,7 @@ @Override final native void singleDelete(final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; @Override - final native void removeDirect(final long handle, final ByteBuffer key, final int keyOffset, + final native void deleteDirect(final long handle, final ByteBuffer key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; // DO NOT USE - `WriteBatchWithIndex::deleteRange` is not yet supported @Override @@ -303,8 +346,8 @@ final boolean overwriteKey); private native long iterator0(final long handle); private native long iterator1(final long handle, final long cfHandle); - private native long iteratorWithBase( - final long handle, final long baseIteratorHandle, final long cfHandle); + private native long iteratorWithBase(final long handle, final long baseIteratorHandle, + final long cfHandle, final long readOptionsHandle); private native byte[] getFromBatch(final long handle, final long optHandle, final byte[] key, final int keyLen); private native byte[] getFromBatch(final long handle, final long optHandle, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java 2025-05-19 16:14:27.000000000 +0000 @@ -22,12 +22,29 @@ * * @param bufferSizeBytes buffer size(in bytes) to use for native write_buffer_manager * @param cache cache whose memory should be bounded by this write buffer manager + * @param allowStall if set true, it will enable stalling of writes when memory_usage() exceeds + * buffer_size. + * It will wait for flush to complete and memory usage to drop down. 
*/ + public WriteBufferManager( + final long bufferSizeBytes, final Cache cache, final boolean allowStall) { + super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_, allowStall)); + this.allowStall_ = allowStall; + } + public WriteBufferManager(final long bufferSizeBytes, final Cache cache){ - super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_)); + this(bufferSizeBytes, cache, false); + } + + public boolean allowStall() { + return allowStall_; } - private native static long newWriteBufferManager(final long bufferSizeBytes, final long cacheHandle); + private native static long newWriteBufferManager( + final long bufferSizeBytes, final long cacheHandle, final boolean allowStall); + @Override protected native void disposeInternal(final long handle); + + private boolean allowStall_; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java 2025-05-19 16:14:27.000000000 +0000 @@ -171,7 +171,7 @@ /** * If true, this write request is of lower priority if compaction is - * behind. In this case that, {@link #noSlowdown()} == true, the request + * behind. In the case that, {@link #noSlowdown()} == true, the request * will be cancelled immediately with {@link Status.Code#Incomplete} returned. * Otherwise, it will be slowed down. The slowdown value is determined by * RocksDB to guarantee it introduces minimum impacts to high priority writes. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallCondition.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum WriteStallCondition { + NORMAL((byte) 0x0), + DELAYED((byte) 0x1), + STOPPED((byte) 0x2); + + private final byte value; + + WriteStallCondition(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the WriteStallCondition from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static WriteStallCondition fromValue(final byte value) { + for (final WriteStallCondition writeStallCondition : WriteStallCondition.values()) { + if (writeStallCondition.value == value) { + return writeStallCondition; + } + } + + throw new IllegalArgumentException("Illegal value provided for WriteStallCondition: " + value); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/WriteStallInfo.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class WriteStallInfo { + private final String columnFamilyName; + private final WriteStallCondition currentCondition; + private final WriteStallCondition previousCondition; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + WriteStallInfo(final String columnFamilyName, final byte currentConditionValue, + final byte previousConditionValue) { + this.columnFamilyName = columnFamilyName; + this.currentCondition = WriteStallCondition.fromValue(currentConditionValue); + this.previousCondition = WriteStallCondition.fromValue(previousConditionValue); + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the current state of the write controller. 
+ * + * @return the current state. + */ + public WriteStallCondition getCurrentCondition() { + return currentCondition; + } + + /** + * Get the previous state of the write controller. + * + * @return the previous state. + */ + public WriteStallCondition getPreviousCondition() { + return previousCondition; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + WriteStallInfo that = (WriteStallInfo) o; + return Objects.equals(columnFamilyName, that.columnFamilyName) + && currentCondition == that.currentCondition && previousCondition == that.previousCondition; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, currentCondition, previousCondition); + } + + @Override + public String toString() { + return "WriteStallInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", currentCondition=" + currentCondition + + ", previousCondition=" + previousCondition + '}'; + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java 2025-05-19 16:14:27.000000000 +0000 @@ -1,7 +1,6 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
package org.rocksdb.util; -import java.io.File; import java.io.IOException; public class Environment { @@ -106,12 +105,22 @@ if (isPowerPC() || isAarch64()) { return String.format("%sjni-linux-%s%s", name, ARCH, getLibcPostfix()); } else if (isS390x()) { - return String.format("%sjni-linux%s", name, ARCH); + return String.format("%sjni-linux-%s", name, ARCH); } else { return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix()); } } else if (isMac()) { - return String.format("%sjni-osx", name); + if (is64Bit()) { + final String arch; + if (isAarch64()) { + arch = "arm64"; + } else { + arch = "x86_64"; + } + return String.format("%sjni-osx-%s", name, arch); + } else { + return String.format("%sjni-osx", name); + } } else if (isFreeBSD()) { return String.format("%sjni-freebsd%s", name, is64Bit() ? "64" : "32"); } else if (isAix() && is64Bit()) { @@ -128,10 +137,25 @@ throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name)); } + public static /*@Nullable*/ String getFallbackJniLibraryName(final String name) { + if (isMac() && is64Bit()) { + return String.format("%sjni-osx", name); + } + return null; + } + public static String getJniLibraryFileName(final String name) { return appendLibOsSuffix("lib" + getJniLibraryName(name), false); } + public static /*@Nullable*/ String getFallbackJniLibraryFileName(final String name) { + final String fallbackJniLibraryName = getFallbackJniLibraryName(name); + if (fallbackJniLibraryName == null) { + return null; + } + return appendLibOsSuffix("lib" + fallbackJniLibraryName, false); + } + private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) { return libraryFileName + ".so"; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlobOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,313 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.FilenameFilter; +import java.util.*; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class BlobOptionsTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + final int minBlobSize = 65536; + final int largeBlobSize = 65536 * 2; + + /** + * Count the files in the temporary folder which end with a particular suffix + * Used to query the state of a test database to check if it is as the test expects + * + * @param endsWith the suffix to match + * @return the number of files with a matching suffix + */ + @SuppressWarnings("CallToStringConcatCanBeReplacedByOperator") + private int countDBFiles(final String endsWith) { + return Objects + .requireNonNull(dbFolder.getRoot().list(new FilenameFilter() { + @Override + public boolean accept(File dir, String name) { + return name.endsWith(endsWith); + } + })) + .length; + } + + @SuppressWarnings("SameParameterValue") + private byte[] small_key(String suffix) { + return ("small_key_" + 
suffix).getBytes(UTF_8); + } + + @SuppressWarnings("SameParameterValue") + private byte[] small_value(String suffix) { + return ("small_value_" + suffix).getBytes(UTF_8); + } + + private byte[] large_key(String suffix) { + return ("large_key_" + suffix).getBytes(UTF_8); + } + + private byte[] large_value(String repeat) { + final byte[] large_value = ("" + repeat + "_" + largeBlobSize + "b").getBytes(UTF_8); + final byte[] large_buffer = new byte[largeBlobSize]; + for (int pos = 0; pos < largeBlobSize; pos += large_value.length) { + int numBytes = Math.min(large_value.length, large_buffer.length - pos); + System.arraycopy(large_value, 0, large_buffer, pos, numBytes); + } + return large_buffer; + } + + @Test + public void blobOptions() { + try (final Options options = new Options()) { + assertThat(options.enableBlobFiles()).isEqualTo(false); + assertThat(options.minBlobSize()).isEqualTo(0); + assertThat(options.blobCompressionType()).isEqualTo(CompressionType.NO_COMPRESSION); + assertThat(options.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(options.blobFileSize()).isEqualTo(268435456L); + assertThat(options.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(options.blobGarbageCollectionForceThreshold()).isEqualTo(1.0); + + assertThat(options.setEnableBlobFiles(true)).isEqualTo(options); + assertThat(options.setMinBlobSize(132768L)).isEqualTo(options); + assertThat(options.setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION)) + .isEqualTo(options); + assertThat(options.setEnableBlobGarbageCollection(true)).isEqualTo(options); + assertThat(options.setBlobFileSize(132768L)).isEqualTo(options); + assertThat(options.setBlobGarbageCollectionAgeCutoff(0.89)).isEqualTo(options); + assertThat(options.setBlobGarbageCollectionForceThreshold(0.80)).isEqualTo(options); + + assertThat(options.enableBlobFiles()).isEqualTo(true); + assertThat(options.minBlobSize()).isEqualTo(132768L); + 
assertThat(options.blobCompressionType()).isEqualTo(CompressionType.BZLIB2_COMPRESSION); + assertThat(options.enableBlobGarbageCollection()).isEqualTo(true); + assertThat(options.blobFileSize()).isEqualTo(132768L); + assertThat(options.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89); + assertThat(options.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + } + } + + @Test + public void blobColumnFamilyOptions() { + try (final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions()) { + assertThat(columnFamilyOptions.enableBlobFiles()).isEqualTo(false); + assertThat(columnFamilyOptions.minBlobSize()).isEqualTo(0); + assertThat(columnFamilyOptions.blobCompressionType()) + .isEqualTo(CompressionType.NO_COMPRESSION); + assertThat(columnFamilyOptions.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(columnFamilyOptions.blobFileSize()).isEqualTo(268435456L); + assertThat(columnFamilyOptions.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(columnFamilyOptions.blobGarbageCollectionForceThreshold()).isEqualTo(1.0); + + assertThat(columnFamilyOptions.setEnableBlobFiles(true)).isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setMinBlobSize(132768L)).isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION)) + .isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setEnableBlobGarbageCollection(true)) + .isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobFileSize(132768L)).isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobGarbageCollectionAgeCutoff(0.89)) + .isEqualTo(columnFamilyOptions); + assertThat(columnFamilyOptions.setBlobGarbageCollectionForceThreshold(0.80)) + .isEqualTo(columnFamilyOptions); + + assertThat(columnFamilyOptions.enableBlobFiles()).isEqualTo(true); + assertThat(columnFamilyOptions.minBlobSize()).isEqualTo(132768L); + assertThat(columnFamilyOptions.blobCompressionType()) + 
.isEqualTo(CompressionType.BZLIB2_COMPRESSION); + assertThat(columnFamilyOptions.enableBlobGarbageCollection()).isEqualTo(true); + assertThat(columnFamilyOptions.blobFileSize()).isEqualTo(132768L); + assertThat(columnFamilyOptions.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89); + assertThat(columnFamilyOptions.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + } + } + + @Test + public void blobMutableColumnFamilyOptionsBuilder() { + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder = + MutableColumnFamilyOptions.builder(); + builder.setEnableBlobFiles(true) + .setMinBlobSize(1024) + .setBlobCompressionType(CompressionType.BZLIB2_COMPRESSION) + .setEnableBlobGarbageCollection(true) + .setBlobGarbageCollectionAgeCutoff(0.89) + .setBlobGarbageCollectionForceThreshold(0.80) + .setBlobFileSize(132768); + + assertThat(builder.enableBlobFiles()).isEqualTo(true); + assertThat(builder.minBlobSize()).isEqualTo(1024); + assertThat(builder.blobCompressionType()).isEqualTo(CompressionType.BZLIB2_COMPRESSION); + assertThat(builder.enableBlobGarbageCollection()).isEqualTo(true); + assertThat(builder.blobGarbageCollectionAgeCutoff()).isEqualTo(0.89); + assertThat(builder.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder.blobFileSize()).isEqualTo(132768); + + builder.setEnableBlobFiles(false) + .setMinBlobSize(4096) + .setBlobCompressionType(CompressionType.LZ4_COMPRESSION) + .setEnableBlobGarbageCollection(false) + .setBlobGarbageCollectionAgeCutoff(0.91) + .setBlobGarbageCollectionForceThreshold(0.96) + .setBlobFileSize(2048); + + assertThat(builder.enableBlobFiles()).isEqualTo(false); + assertThat(builder.minBlobSize()).isEqualTo(4096); + assertThat(builder.blobCompressionType()).isEqualTo(CompressionType.LZ4_COMPRESSION); + assertThat(builder.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(builder.blobGarbageCollectionAgeCutoff()).isEqualTo(0.91); + 
assertThat(builder.blobGarbageCollectionForceThreshold()).isEqualTo(0.96); + assertThat(builder.blobFileSize()).isEqualTo(2048); + + final MutableColumnFamilyOptions options = builder.build(); + assertThat(options.getKeys()) + .isEqualTo(new String[] {"enable_blob_files", "min_blob_size", "blob_compression_type", + "enable_blob_garbage_collection", "blob_garbage_collection_age_cutoff", + "blob_garbage_collection_force_threshold", "blob_file_size"}); + assertThat(options.getValues()) + .isEqualTo( + new String[] {"false", "4096", "LZ4_COMPRESSION", "false", "0.91", "0.96", "2048"}); + } + + /** + * Configure the default column family with BLOBs. + * Confirm that BLOBs are generated when appropriately-sized writes are flushed. + * + * @throws RocksDBException if a db access throws an exception + */ + @Test + public void testBlobWriteAboveThreshold() throws RocksDBException { + try (final Options options = new Options() + .setCreateIfMissing(true) + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true); + + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put(small_key("default"), small_value("default")); + db.flush(new FlushOptions().setWaitForFlush(true)); + + // check there are no blobs in the database + assertThat(countDBFiles(".sst")).isEqualTo(1); + assertThat(countDBFiles(".blob")).isEqualTo(0); + + db.put(large_key("default"), large_value("default")); + db.flush(new FlushOptions().setWaitForFlush(true)); + + // wrote and flushed a value larger than the blobbing threshold + // check there is a single blob in the database + assertThat(countDBFiles(".sst")).isEqualTo(2); + assertThat(countDBFiles(".blob")).isEqualTo(1); + + assertThat(db.get(small_key("default"))).isEqualTo(small_value("default")); + assertThat(db.get(large_key("default"))).isEqualTo(large_value("default")); + + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder fetchOptions = + db.getOptions(null); + 
assertThat(fetchOptions.minBlobSize()).isEqualTo(minBlobSize); + assertThat(fetchOptions.enableBlobFiles()).isEqualTo(true); + assertThat(fetchOptions.writeBufferSize()).isEqualTo(64 << 20); + } + } + + /** + * Configure 2 column families respectively with and without BLOBs. + * Confirm that BLOB files are generated (once the DB is flushed) only for the appropriate column + * family. + * + * @throws RocksDBException if a db access throws an exception + */ + @Test + public void testBlobWriteAboveThresholdCF() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + db.put(columnFamilyHandles.get(0), small_key("default"), small_value("default")); + db.flush(new FlushOptions().setWaitForFlush(true)); + + assertThat(countDBFiles(".blob")).isEqualTo(0); + + try (final ColumnFamilyOptions columnFamilyOptions1 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(true); + + final ColumnFamilyOptions columnFamilyOptions2 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(false)) { + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + + // Create the first column family with blob options + db.createColumnFamily(columnFamilyDescriptor1); + + // Create the second column 
family with not-blob options + db.createColumnFamily(columnFamilyDescriptor2); + } + } + + // Now re-open after auto-close - at this point the CF options we use are recognized. + try (final ColumnFamilyOptions columnFamilyOptions1 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(true); + + final ColumnFamilyOptions columnFamilyOptions2 = + new ColumnFamilyOptions().setMinBlobSize(minBlobSize).setEnableBlobFiles(false)) { + assertThat(columnFamilyOptions1.enableBlobFiles()).isEqualTo(true); + assertThat(columnFamilyOptions1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(columnFamilyOptions2.enableBlobFiles()).isEqualTo(false); + assertThat(columnFamilyOptions1.minBlobSize()).isEqualTo(minBlobSize); + + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + columnFamilyDescriptors = new ArrayList<>(); + columnFamilyDescriptors.add(columnFamilyDescriptor0); + columnFamilyDescriptors.add(columnFamilyDescriptor1); + columnFamilyDescriptors.add(columnFamilyDescriptor2); + columnFamilyHandles = new ArrayList<>(); + + assertThat(columnFamilyDescriptor1.getOptions().enableBlobFiles()).isEqualTo(true); + assertThat(columnFamilyDescriptor2.getOptions().enableBlobFiles()).isEqualTo(false); + + try (final DBOptions dbOptions = new DBOptions(); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = + db.getOptions(columnFamilyHandles.get(1)); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 = + 
db.getOptions(columnFamilyHandles.get(2)); + assertThat(builder2.enableBlobFiles()).isEqualTo(false); + assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize); + + db.put(columnFamilyHandles.get(1), large_key("column_family_1_k2"), + large_value("column_family_1_k2")); + db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(1)); + assertThat(countDBFiles(".blob")).isEqualTo(1); + + db.put(columnFamilyHandles.get(2), large_key("column_family_2_k2"), + large_value("column_family_2_k2")); + db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(2)); + assertThat(countDBFiles(".blob")).isEqualTo(1); + } + } + } +} \ No newline at end of file diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,16 +5,16 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.fail; + +import java.nio.charset.StandardCharsets; import org.junit.ClassRule; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.nio.charset.StandardCharsets; - -import static org.assertj.core.api.Assertions.assertThat; - public class BlockBasedTableConfigTest { @ClassRule @@ -35,9 +35,10 @@ @Test public void cacheIndexAndFilterBlocksWithHighPriority() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(true); assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()). 
isTrue(); + blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(false); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()).isFalse(); } @Test @@ -59,7 +60,7 @@ @Test public void indexType() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assertThat(IndexType.values().length).isEqualTo(3); + assertThat(IndexType.values().length).isEqualTo(4); blockBasedTableConfig.setIndexType(IndexType.kHashSearch); assertThat(blockBasedTableConfig.indexType().equals( IndexType.kHashSearch)); @@ -83,7 +84,7 @@ @Test public void checksumType() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assertThat(ChecksumType.values().length).isEqualTo(3); + assertThat(ChecksumType.values().length).isEqualTo(4); assertThat(ChecksumType.valueOf("kxxHash")). isEqualTo(ChecksumType.kxxHash); blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); @@ -259,6 +260,13 @@ } @Test + public void optimizeFiltersForMemory() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setOptimizeFiltersForMemory(true); + assertThat(blockBasedTableConfig.optimizeFiltersForMemory()).isTrue(); + } + + @Test public void useDeltaEncoding() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setUseDeltaEncoding(false); @@ -296,6 +304,7 @@ @Test public void verifyCompression() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(blockBasedTableConfig.verifyCompression()).isFalse(); blockBasedTableConfig.setVerifyCompression(true); assertThat(blockBasedTableConfig.verifyCompression()). 
isTrue(); @@ -312,7 +321,7 @@ @Test public void formatVersion() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - for (int version = 0; version < 5; version++) { + for (int version = 0; version <= 5; version++) { blockBasedTableConfig.setFormatVersion(version); assertThat(blockBasedTableConfig.formatVersion()).isEqualTo(version); } @@ -324,10 +333,15 @@ blockBasedTableConfig.setFormatVersion(-1); } - @Test(expected = AssertionError.class) - public void formatVersionFailIllegalVersion() { - final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setFormatVersion(99); + @Test(expected = RocksDBException.class) + public void invalidFormatVersion() throws RocksDBException { + final BlockBasedTableConfig blockBasedTableConfig = + new BlockBasedTableConfig().setFormatVersion(99999); + + try (final Options options = new Options().setTableFormatConfig(blockBasedTableConfig); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + fail("Opening the database with an invalid format_version should have raised an exception"); + } } @Test @@ -346,6 +360,14 @@ isTrue(); } + @Test + public void indexShortening() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparatorsAndSuccessor); + assertThat(blockBasedTableConfig.indexShortening()) + .isEqualTo(IndexShorteningMode.kShortenSeparatorsAndSuccessor); + } + @Deprecated @Test public void hashIndexAllowCollision() { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java 1970-01-01 00:00:00.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.junit.Assert.assertArrayEquals; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.BytewiseComparator; + +/** + * This test confirms that the following issues were in fact resolved + * by a change made between 6.2.2 and 6.22.1, + * to wit {@link ...} + * which as part of its effect, changed the Java bytewise comparators. + * + * {@link ...} + * {@link ...} + */ +public class BytewiseComparatorRegressionTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule public TemporaryFolder temporarySSTFolder = new TemporaryFolder(); + + private final static byte[][] testData = {{10, -11, 13}, {10, 11, 12}, {10, 11, 14}}; + private final static byte[][] orderedData = {{10, 11, 12}, {10, 11, 14}, {10, -11, 13}}; + + /** + * {@link ...} + */ + @Test + public void testJavaComparator() throws RocksDBException { + final BytewiseComparator comparator = new BytewiseComparator(new ComparatorOptions()); + performTest(new Options().setCreateIfMissing(true).setComparator(comparator)); + } + + @Test + public void testDefaultComparator() throws RocksDBException { + performTest(new Options().setCreateIfMissing(true)); + } + + /** + * {@link ...} + */ + @Test + public void testCppComparator() throws 
RocksDBException { + performTest(new Options().setCreateIfMissing(true).setComparator( + BuiltinComparator.BYTEWISE_COMPARATOR)); + } + + private void performTest(final Options options) throws RocksDBException { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + for (final byte[] item : testData) { + db.put(item, item); + } + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + final ArrayList result = new ArrayList<>(); + while (iterator.isValid()) { + result.add(iterator.key()); + iterator.next(); + } + assertArrayEquals(orderedData, result.toArray()); + } + } + } + + private byte[] hexToByte(final String hexString) { + final byte[] bytes = new byte[hexString.length() / 2]; + if (bytes.length * 2 < hexString.length()) { + throw new RuntimeException("Hex string has odd length: " + hexString); + } + + for (int i = 0; i < bytes.length; i++) { + final int firstDigit = toDigit(hexString.charAt(i + i)); + final int secondDigit = toDigit(hexString.charAt(i + i + 1)); + bytes[i] = (byte) ((firstDigit << 4) + secondDigit); + } + + return bytes; + } + + private int toDigit(final char hexChar) { + final int digit = Character.digit(hexChar, 16); + if (digit == -1) { + throw new IllegalArgumentException("Invalid Hexadecimal Character: " + hexChar); + } + return digit; + } + + /** + * {@link ...} + * + * @throws RocksDBException if something goes wrong, or if the regression occurs + * @throws IOException if we can't make the temporary file + */ + @Test + public void testSST() throws RocksDBException, IOException { + final File tempSSTFile = temporarySSTFolder.newFile("test_file_with_weird_keys.sst"); + + final EnvOptions envOpts = new EnvOptions(); + final Options opts = new Options(); + final SstFileWriter writer = + new SstFileWriter(envOpts, opts, new BytewiseComparator(new ComparatorOptions())); + writer.open(tempSSTFile.getAbsolutePath()); + final byte[] gKey = + 
hexToByte("000000293030303030303030303030303030303030303032303736343730696E666F33"); + final byte[] wKey = + hexToByte("0000008d3030303030303030303030303030303030303030303437363433696e666f34"); + writer.add(new Slice(gKey), new Slice("dummyV1")); + writer.add(new Slice(wKey), new Slice("dummyV2")); + writer.finish(); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,17 +5,17 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.*; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; -import java.util.Random; - -import static org.assertj.core.api.Assertions.assertThat; - public class ColumnFamilyOptionsTest { @ClassRule @@ -55,6 +55,27 @@ } @Test + public void getColumnFamilyOptionsFromPropsWithIgnoreIllegalValue() { + // setup sample properties + final Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + properties.put("write_buffer_size", "112"); + properties.put("max_write_buffer_number", "13"); + + try (final ConfigOptions cfgOpts = new ConfigOptions().setIgnoreUnknownOptions(true); + final ColumnFamilyOptions opt = + ColumnFamilyOptions.getColumnFamilyOptionsFromProps(cfgOpts, properties)) { + // setup sample properties + 
assertThat(opt).isNotNull(); + assertThat(String.valueOf(opt.writeBufferSize())) + .isEqualTo(properties.get("write_buffer_size")); + assertThat(String.valueOf(opt.maxWriteBufferNumber())) + .isEqualTo(properties.get("max_write_buffer_number")); + } + } + + @Test public void failColumnFamilyOptionsFromPropsWithIllegalValue() { // setup sample properties final Properties properties = new Properties(); @@ -569,6 +590,14 @@ } @Test + public void periodicCompactionSeconds() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + options.setPeriodicCompactionSeconds(1000 * 60); + assertThat(options.periodicCompactionSeconds()).isEqualTo(1000 * 60); + } + } + + @Test public void compactionOptionsUniversal() { try (final ColumnFamilyOptions opt = new ColumnFamilyOptions(); final CompactionOptionsUniversal optUni = new CompactionOptionsUniversal() @@ -622,4 +651,46 @@ } } + @Test + public void compactionThreadLimiter() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions(); + final ConcurrentTaskLimiter compactionThreadLimiter = + new ConcurrentTaskLimiterImpl("name", 3)) { + options.setCompactionThreadLimiter(compactionThreadLimiter); + assertThat(options.compactionThreadLimiter()).isEqualTo(compactionThreadLimiter); + } + } + + @Test + public void oldDefaults() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + options.oldDefaults(4, 6); + assertEquals(4 << 20, options.writeBufferSize()); + assertThat(options.compactionPriority()).isEqualTo(CompactionPriority.ByCompensatedSize); + assertThat(options.targetFileSizeBase()).isEqualTo(2 * 1048576); + assertThat(options.maxBytesForLevelBase()).isEqualTo(10 * 1048576); + assertThat(options.softPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.hardPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.level0StopWritesTrigger()).isEqualTo(24); + } + } + + @Test + public void optimizeForSmallDbWithCache() { + try (final ColumnFamilyOptions 
options = new ColumnFamilyOptions(); + final Cache cache = new LRUCache(1024)) { + assertThat(options.optimizeForSmallDb(cache)).isEqualTo(options); + } + } + + @Test + public void cfPaths() throws IOException { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + final List paths = Arrays.asList( + new DbPath(Paths.get("test1"), 2 << 25), new DbPath(Paths.get("/test2/path"), 2 << 25)); + assertThat(options.cfPaths()).isEqualTo(Collections.emptyList()); + assertThat(options.setCfPaths(paths)).isEqualTo(options); + assertThat(options.cfPaths()).isEqualTo(paths); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,16 +5,17 @@ package org.rocksdb; -import java.util.*; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import java.util.*; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; - public class ColumnFamilyTest { @ClassRule @@ -75,6 +76,7 @@ assertThat(cfh.getName()).isEqualTo("default".getBytes(UTF_8)); assertThat(cfh.getID()).isEqualTo(0); + assertThat(cfh.getDescriptor().getName()).isEqualTo("default".getBytes(UTF_8)); final byte[] key = "key".getBytes(); final byte[] value = "value".getBytes(); @@ -140,33 +142,19 @@ final RocksDB db = RocksDB.open(options, 
dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { - - try { - assertThat(columnFamilyHandleList.size()).isEqualTo(2); - db.put("dfkey1".getBytes(), "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), - "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), - "newcfvalue".getBytes()); - - String retVal = new String(db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes())); - assertThat(retVal).isEqualTo("newcfvalue"); - assertThat((db.get(columnFamilyHandleList.get(1), - "dfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); - assertThat((db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(0), new WriteOptions(), - "dfkey2".getBytes()); - assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "dfkey2".getBytes())).isNull(); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + db.put("dfkey1".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), "newcfvalue".getBytes()); + + String retVal = new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes())); + assertThat(retVal).isEqualTo("newcfvalue"); + assertThat((db.get(columnFamilyHandleList.get(1), "dfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); + assertThat((db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(0), new WriteOptions(), "dfkey2".getBytes()); + assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), "dfkey2".getBytes())) + .isNull(); } } @@ -183,30 +171,22 @@ final RocksDB db = 
RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), new WriteOptions(), - "key1".getBytes(), "value".getBytes()); - db.put("key2".getBytes(), "12345678".getBytes()); - final byte[] outValue = new byte[5]; - // not found value - int getResult = db.get("keyNotFound".getBytes(), outValue); - assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); - // found value which fits in outValue - getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), - outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("value".getBytes()); - // found value which fits partially - getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "key2".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("12345".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put( + columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + final byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = + db.get(columnFamilyHandleList.get(0), new ReadOptions(), "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); } } @@ -222,22 +202,12 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, 
columnFamilyHandleList)) { - ColumnFamilyHandle tmpColumnFamilyHandle = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.dropColumnFamily(tmpColumnFamilyHandle); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + ColumnFamilyHandle tmpColumnFamilyHandle; + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.dropColumnFamily(tmpColumnFamilyHandle); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); } } @@ -255,29 +225,15 @@ columnFamilyHandleList)) { ColumnFamilyHandle tmpColumnFamilyHandle = null; ColumnFamilyHandle tmpColumnFamilyHandle2 = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - tmpColumnFamilyHandle2 = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF2".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); - db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - if (tmpColumnFamilyHandle2 != null) { - tmpColumnFamilyHandle2.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } 
- } + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + tmpColumnFamilyHandle2 = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF2".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); + db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); + assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); } } @@ -299,36 +255,24 @@ cfDescriptors, columnFamilyHandleList); final WriteBatch writeBatch = new WriteBatch(); final WriteOptions writeOpt = new WriteOptions()) { - try { - writeBatch.put("key".getBytes(), "value".getBytes()); - writeBatch.put(db.getDefaultColumnFamily(), - "mergeKey".getBytes(), "merge".getBytes()); - writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), - "merge".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - writeBatch.delete("xyz".getBytes()); - writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); - db.write(writeOpt, writeBatch); - - assertThat(db.get(columnFamilyHandleList.get(1), - "xyz".getBytes()) == null); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey".getBytes()))).isEqualTo("value"); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey2".getBytes()))).isEqualTo("value2"); - assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); - // check if key is merged - assertThat(new String(db.get(db.getDefaultColumnFamily(), - "mergeKey".getBytes()))).isEqualTo("merge,merge"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - 
columnFamilyHandle.close(); - } - } + writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + writeBatch.delete("xyz".getBytes()); + writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); + db.write(writeOpt, writeBatch); + + assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes()) == null); + assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey".getBytes()))) + .isEqualTo("value"); + assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey2".getBytes()))) + .isEqualTo("value2"); + assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + // check if key is merged + assertThat(new String(db.get(db.getDefaultColumnFamily(), "mergeKey".getBytes()))) + .isEqualTo("merge,merge"); } } } @@ -345,32 +289,21 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - try (final RocksIterator rocksIterator = - db.newIterator(columnFamilyHandleList.get(1))) { - rocksIterator.seekToFirst(); - Map refMap = new HashMap<>(); - refMap.put("newcfkey", "value"); - refMap.put("newcfkey2", "value2"); - int i = 0; - while (rocksIterator.isValid()) { - i++; - assertThat(refMap.get(new String(rocksIterator.key()))). 
- isEqualTo(new String(rocksIterator.value())); - rocksIterator.next(); - } - assertThat(i).isEqualTo(2); - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + try (final RocksIterator rocksIterator = db.newIterator(columnFamilyHandleList.get(1))) { + rocksIterator.seekToFirst(); + Map refMap = new HashMap<>(); + refMap.put("newcfkey", "value"); + refMap.put("newcfkey2", "value2"); + int i = 0; + while (rocksIterator.isValid()) { + i++; + assertThat(refMap.get(new String(rocksIterator.key()))) + .isEqualTo(new String(rocksIterator.value())); + rocksIterator.next(); } + assertThat(i).isEqualTo(2); } } } @@ -387,35 +320,20 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - - List retValues = db.multiGetAsList(columnFamilyHandleList, keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + 
db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + + List retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -431,35 +349,19 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - List retValues = db.multiGetAsList(columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + List 
retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -475,30 +377,18 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - assertThat(db.getProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getLongProperty(columnFamilyHandleList.get(0), - "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); - assertThat(db.getProperty("rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(0), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.estimate-num-keys")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). 
- isGreaterThanOrEqualTo(0); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(db.getProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getLongProperty(columnFamilyHandleList.get(0), "rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); + assertThat(db.getProperty("rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(0), "rocksdb.sstables")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.estimate-num-keys")) + .isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); } } @@ -546,10 +436,6 @@ rocksIterator.close(); } } - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } } } } @@ -565,15 +451,8 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.put(columnFamilyHandleList.get(1), "key".getBytes(), - "value".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes()); } } @@ -588,15 +467,8 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.delete(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final 
ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.delete(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -611,15 +483,8 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.get(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.get(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -634,19 +499,11 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - final List keys = new ArrayList<>(); - keys.add("key".getBytes()); - keys.add("newcfkey".getBytes()); - final List cfCustomList = new ArrayList<>(); - db.multiGetAsList(cfCustomList, keys); - - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List keys = new ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + final List cfCustomList = new ArrayList<>(); + db.multiGetAsList(cfCustomList, keys); } } @@ -660,25 +517,12 @@ final byte[] b0 = new byte[]{(byte) 0x00}; final byte[] b1 = new byte[]{(byte) 0x01}; final byte[] b2 = new byte[]{(byte) 0x02}; - ColumnFamilyHandle cf1 = null, cf2 = null, cf3 = null; - try { - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - cf3 = db.createColumnFamily(new ColumnFamilyDescriptor(b2)); - } finally { - 
if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - if (cf3 != null) { - cf3.close(); - } - } + db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); + db.createColumnFamily(new ColumnFamilyDescriptor(b2)); } } @@ -689,22 +533,13 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { - try { - final byte[] b0 = new byte[]{0, 0}; - final byte[] b1 = new byte[]{0, 1}; - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - } finally { - if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - } + final byte[] b0 = new byte[] {0, 0}; + final byte[] b1 = new byte[] {0, 1}; + cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); } } @@ -715,17 +550,57 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { + final String simplifiedChinese = "\u7b80\u4f53\u5b57"; + columnFamilyHandle = + db.createColumnFamily(new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); + + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), simplifiedChinese.getBytes()); + } + } + + @Test + public void testDestroyColumnFamilyHandle() throws RocksDBException { + try (final Options options = new 
Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());) { + final byte[] name1 = "cf1".getBytes(); + final byte[] name2 = "cf2".getBytes(); + final ColumnFamilyDescriptor desc1 = new ColumnFamilyDescriptor(name1); + final ColumnFamilyDescriptor desc2 = new ColumnFamilyDescriptor(name2); + final ColumnFamilyHandle cf1 = db.createColumnFamily(desc1); + final ColumnFamilyHandle cf2 = db.createColumnFamily(desc2); + assertTrue(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + assertFalse(cf1.isDefaultColumnFamily()); + db.destroyColumnFamilyHandle(cf1); + // At this point cf1 should not be used! + assertFalse(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + } + } + + @Test + @Deprecated + /** + * @deprecated Now explicitly closing instances of ColumnFamilyHandle is not required. + * RocksDB instance will take care of closing its associated ColumnFamilyHandle objects. + */ + public void testColumnFamilyCloseBeforeDb() throws RocksDBException { + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { try { - final String simplifiedChinese = "\u7b80\u4f53\u5b57"; - columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); - - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), - simplifiedChinese.getBytes()); + db.put("testKey".getBytes(), "tstValue".getBytes()); + // Do something... 
} finally { - if (columnFamilyHandle != null) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { columnFamilyHandle.close(); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,29 +39,22 @@ final List cfHandles = new ArrayList<>(); - try (final RocksDB rocksDb = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles); - ) { - try { - final byte[] key1 = "key1".getBytes(); - final byte[] key2 = "key2".getBytes(); - - final byte[] value1 = "value1".getBytes(); - final byte[] value2 = new byte[0]; - - rocksDb.put(cfHandles.get(1), key1, value1); - rocksDb.put(cfHandles.get(1), key2, value2); - - rocksDb.compactRange(cfHandles.get(1)); - - assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); - final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), key2, null); - assertThat(exists).isFalse(); - } finally { - for (final ColumnFamilyHandle cfHandle : cfHandles) { - cfHandle.close(); - } - } + try (final RocksDB rocksDb = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles)) { + final byte[] key1 = "key1".getBytes(); + final byte[] key2 = "key2".getBytes(); + + final byte[] value1 = "value1".getBytes(); + final byte[] value2 = new byte[0]; + + rocksDb.put(cfHandles.get(1), key1, value1); + rocksDb.put(cfHandles.get(1), key2, value2); + + rocksDb.compactRange(cfHandles.get(1)); + + assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); + final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), 
key2, null); + assertThat(exists).isFalse(); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,50 @@ +package org.rocksdb; + +import static org.junit.Assert.assertEquals; + +import org.junit.After; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Test; + +public class ConcurrentTaskLimiterTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + private static final String NAME = "name"; + + private ConcurrentTaskLimiter concurrentTaskLimiter; + + @Before + public void beforeTest() { + concurrentTaskLimiter = new ConcurrentTaskLimiterImpl(NAME, 3); + } + + @Test + public void name() { + assertEquals(NAME, concurrentTaskLimiter.name()); + } + + @Test + public void outstandingTask() { + assertEquals(0, concurrentTaskLimiter.outstandingTask()); + } + + @Test + public void setMaxOutstandingTask() { + assertEquals(concurrentTaskLimiter, concurrentTaskLimiter.setMaxOutstandingTask(4)); + assertEquals(0, concurrentTaskLimiter.outstandingTask()); + } + + @Test + public void resetMaxOutstandingTask() { + assertEquals(concurrentTaskLimiter, concurrentTaskLimiter.resetMaxOutstandingTask()); + assertEquals(0, concurrentTaskLimiter.outstandingTask()); + } + + @After + public void afterTest() { + concurrentTaskLimiter.close(); + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,13 +5,16 @@ package org.rocksdb; -import org.junit.ClassRule; -import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import java.nio.file.Paths; import java.util.*; - -import static org.assertj.core.api.Assertions.assertThat; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Test; public class DBOptionsTest { @@ -810,4 +813,123 @@ assertThat(stats).isNotNull(); } } + + @Test + public void avoidUnnecessaryBlockingIO() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(false); + assertThat(options.setAvoidUnnecessaryBlockingIO(true)).isEqualTo(options); + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(true); + } + } + + @Test + public void persistStatsToDisk() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.persistStatsToDisk()).isEqualTo(false); + assertThat(options.setPersistStatsToDisk(true)).isEqualTo(options); + assertThat(options.persistStatsToDisk()).isEqualTo(true); + } + } + + @Test + public void writeDbidToManifest() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.writeDbidToManifest()).isEqualTo(false); + assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options); + assertThat(options.writeDbidToManifest()).isEqualTo(true); + } + } + + @Test + public void logReadaheadSize() { + try (final DBOptions options = new DBOptions()) { + 
assertThat(options.logReadaheadSize()).isEqualTo(0); + final int size = 1024 * 1024 * 100; + assertThat(options.setLogReadaheadSize(size)).isEqualTo(options); + assertThat(options.logReadaheadSize()).isEqualTo(size); + } + } + + @Test + public void bestEffortsRecovery() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.bestEffortsRecovery()).isEqualTo(false); + assertThat(options.setBestEffortsRecovery(true)).isEqualTo(options); + assertThat(options.bestEffortsRecovery()).isEqualTo(true); + } + } + + @Test + public void maxBgerrorResumeCount() { + try (final DBOptions options = new DBOptions()) { + final int INT_MAX = 2147483647; + assertThat(options.maxBgerrorResumeCount()).isEqualTo(INT_MAX); + assertThat(options.setMaxBgErrorResumeCount(-1)).isEqualTo(options); + assertThat(options.maxBgerrorResumeCount()).isEqualTo(-1); + } + } + + @Test + public void bgerrorResumeRetryInterval() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(1000000); + final long newRetryInterval = 24 * 3600 * 1000000L; + assertThat(options.setBgerrorResumeRetryInterval(newRetryInterval)).isEqualTo(options); + assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(newRetryInterval); + } + } + + @Test + public void maxWriteBatchGroupSizeBytes() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(1024 * 1024); + final long size = 1024 * 1024 * 1024 * 10L; + assertThat(options.setMaxWriteBatchGroupSizeBytes(size)).isEqualTo(options); + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(size); + } + } + + @Test + public void skipCheckingSstFileSizesOnDbOpen() { + try (final DBOptions options = new DBOptions()) { + assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false); + assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options); + 
assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); + } + } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final DBOptions options = new DBOptions(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/EventListenerTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,763 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.*; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.AbstractEventListener.EnabledEventCallback; +import org.rocksdb.test.TestableEventListener; + +public class EventListenerTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory(); + + void flushDb(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.flush(new FlushOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onFlushCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onFlushCompletedListener = new AbstractEventListener() { + @Override + public void onFlushCompleted(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushCompletedListener, wasCbCalled); + } + + @Test + public void onFlushBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener 
onFlushBeginListener = new AbstractEventListener() { + @Override + public void onFlushBegin(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushBeginListener, wasCbCalled); + } + + void deleteTableFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + final RocksDB.LiveFiles liveFiles = db.getLiveFiles(); + assertNotNull(liveFiles); + assertNotNull(liveFiles.files); + assertFalse(liveFiles.files.isEmpty()); + db.deleteFile(liveFiles.files.get(0)); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onTableFileDeleted() throws RocksDBException, InterruptedException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileDeletedListener = new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + assertNotNull(tableFileDeletionInfo.getDbName()); + wasCbCalled.set(true); + } + }; + deleteTableFile(onTableFileDeletedListener, wasCbCalled); + } + + void compactRange(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.compactRange(); + 
assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onCompactionBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionBeginListener = new AbstractEventListener() { + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionBeginListener, wasCbCalled); + } + + @Test + public void onCompactionCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionCompletedListener = new AbstractEventListener() { + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionCompletedListener, wasCbCalled); + } + + @Test + public void onTableFileCreated() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreatedListener = new AbstractEventListener() { + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationInfo.getReason()); + wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreatedListener, wasCbCalled); + } + + @Test + public void onTableFileCreationStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreationStartedListener = new AbstractEventListener() { + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationBriefInfo.getReason()); + 
wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreationStartedListener, wasCbCalled); + } + + void deleteColumnFamilyHandle(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + ColumnFamilyHandle columnFamilyHandle = db.getDefaultColumnFamily(); + columnFamilyHandle.close(); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onColumnFamilyHandleDeletionStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onColumnFamilyHandleDeletionStartedListener = + new AbstractEventListener() { + @Override + public void onColumnFamilyHandleDeletionStarted( + final ColumnFamilyHandle columnFamilyHandle) { + assertNotNull(columnFamilyHandle); + wasCbCalled.set(true); + } + }; + deleteColumnFamilyHandle(onColumnFamilyHandleDeletionStartedListener, wasCbCalled); + } + + void ingestExternalFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final String uuid = UUID.randomUUID().toString(); + final SstFileWriter sstFileWriter = new SstFileWriter(new EnvOptions(), opt); + final Path externalFilePath = Paths.get(db.getName(), uuid); + sstFileWriter.open(externalFilePath.toString()); + sstFileWriter.put("testKey".getBytes(), uuid.getBytes()); + sstFileWriter.finish(); + db.ingestExternalFile( + Collections.singletonList(externalFilePath.toString()), new 
IngestExternalFileOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onExternalFileIngested() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onExternalFileIngestedListener = new AbstractEventListener() { + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + assertNotNull(db); + wasCbCalled.set(true); + } + }; + ingestExternalFile(onExternalFileIngestedListener, wasCbCalled); + } + + @Test + public void testAllCallbacksInvocation() { + final int TEST_INT_VAL = -1; + final long TEST_LONG_VAL = -1; + // Expected test data objects + final Map userCollectedPropertiesTestData = + Collections.singletonMap("key", "value"); + final Map readablePropertiesTestData = Collections.singletonMap("key", "value"); + final TableProperties tablePropertiesTestData = new TableProperties(TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, "columnFamilyName".getBytes(), + "filterPolicyName", "comparatorName", "mergeOperatorName", "prefixExtractorName", + "propertyCollectorsNames", "compressionName", userCollectedPropertiesTestData, + readablePropertiesTestData); + final FlushJobInfo flushJobInfoTestData = new FlushJobInfo(Integer.MAX_VALUE, + "testColumnFamily", "/file/path", TEST_LONG_VAL, Integer.MAX_VALUE, true, true, + TEST_LONG_VAL, TEST_LONG_VAL, tablePropertiesTestData, (byte) 0x0a); + final Status statusTestData = new Status(Status.Code.Incomplete, Status.SubCode.NoSpace, null); + final TableFileDeletionInfo tableFileDeletionInfoTestData = + new TableFileDeletionInfo("dbName", "/file/path", Integer.MAX_VALUE, statusTestData); + 
final TableFileCreationInfo tableFileCreationInfoTestData = + new TableFileCreationInfo(TEST_LONG_VAL, tablePropertiesTestData, statusTestData, "dbName", + "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final TableFileCreationBriefInfo tableFileCreationBriefInfoTestData = + new TableFileCreationBriefInfo( + "dbName", "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final MemTableInfo memTableInfoTestData = new MemTableInfo( + "columnFamilyName", TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL); + final FileOperationInfo fileOperationInfoTestData = new FileOperationInfo("/file/path", + TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); + final WriteStallInfo writeStallInfoTestData = + new WriteStallInfo("columnFamilyName", (byte) 0x1, (byte) 0x2); + final ExternalFileIngestionInfo externalFileIngestionInfoTestData = + new ExternalFileIngestionInfo("columnFamilyName", "/external/file/path", + "/internal/file/path", TEST_LONG_VAL, tablePropertiesTestData); + + final CapturingTestableEventListener listener = new CapturingTestableEventListener() { + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushCompleted(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushBegin(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + super.onTableFileDeleted(tableFileDeletionInfo); + assertEquals(tableFileDeletionInfoTestData, tableFileDeletionInfo); + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionBegin(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), 
compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionCompleted(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo 
tableFileCreationInfo) { + super.onTableFileCreated(tableFileCreationInfo); + assertEquals(tableFileCreationInfoTestData, tableFileCreationInfo); + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + super.onTableFileCreationStarted(tableFileCreationBriefInfo); + assertEquals(tableFileCreationBriefInfoTestData, tableFileCreationBriefInfo); + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + super.onMemTableSealed(memTableInfo); + assertEquals(memTableInfoTestData, memTableInfo); + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + super.onColumnFamilyHandleDeletionStarted(columnFamilyHandle); + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + super.onExternalFileIngested(db, externalFileIngestionInfo); + assertEquals(externalFileIngestionInfoTestData, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onBackgroundError(backgroundErrorReason, backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + super.onStallConditionsChanged(writeStallInfo); + assertEquals(writeStallInfoTestData, writeStallInfo); + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + super.onFileReadFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + super.onFileWriteFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + super.onFileFlushFinish(fileOperationInfo); 
+ assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileRangeSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + assertEquals(fileOperationInfoTestData, fileOperationInfo); + super.onFileTruncateFinish(fileOperationInfo); + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + super.onFileCloseFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + super.shouldBeNotifiedOnFileIO(); + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onErrorRecoveryBegin(backgroundErrorReason, backgroundError); + assertEquals(BackgroundErrorReason.FLUSH, backgroundErrorReason); + assertEquals(statusTestData, backgroundError); + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + super.onErrorRecoveryCompleted(oldBackgroundError); + assertEquals(statusTestData, oldBackgroundError); + } + }; + + // test action + listener.invokeAllCallbacks(); + + // assert + assertAllEventsCalled(listener); + } + + @Test + public void testEnabledCallbacks() { + final EnabledEventCallback enabledEvents[] = { + EnabledEventCallback.ON_MEMTABLE_SEALED, EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED}; + + final CapturingTestableEventListener listener = + new CapturingTestableEventListener(enabledEvents); + + // test action + 
listener.invokeAllCallbacks(); + + // assert + assertEventsCalled(listener, enabledEvents); + } + + private static void assertAllEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener) { + assertEventsCalled(capturingTestableEventListener, EnumSet.allOf(EnabledEventCallback.class)); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnabledEventCallback[] expected) { + assertEventsCalled(capturingTestableEventListener, EnumSet.copyOf(Arrays.asList(expected))); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnumSet expected) { + final ListenerEvents capturedEvents = capturingTestableEventListener.capturedListenerEvents; + + if (expected.contains(EnabledEventCallback.ON_FLUSH_COMPLETED)) { + assertTrue("onFlushCompleted was not called", capturedEvents.flushCompleted); + } else { + assertFalse("onFlushCompleted was not called", capturedEvents.flushCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_FLUSH_BEGIN)) { + assertTrue("onFlushBegin was not called", capturedEvents.flushBegin); + } else { + assertFalse("onFlushBegin was called", capturedEvents.flushBegin); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_DELETED)) { + assertTrue("onTableFileDeleted was not called", capturedEvents.tableFileDeleted); + } else { + assertFalse("onTableFileDeleted was called", capturedEvents.tableFileDeleted); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_BEGIN)) { + assertTrue("onCompactionBegin was not called", capturedEvents.compactionBegin); + } else { + assertFalse("onCompactionBegin was called", capturedEvents.compactionBegin); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_COMPLETED)) { + assertTrue("onCompactionCompleted was not called", capturedEvents.compactionCompleted); + } else { + assertFalse("onCompactionCompleted was 
called", capturedEvents.compactionCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATED)) { + assertTrue("onTableFileCreated was not called", capturedEvents.tableFileCreated); + } else { + assertFalse("onTableFileCreated was called", capturedEvents.tableFileCreated); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATION_STARTED)) { + assertTrue( + "onTableFileCreationStarted was not called", capturedEvents.tableFileCreationStarted); + } else { + assertFalse("onTableFileCreationStarted was called", capturedEvents.tableFileCreationStarted); + } + + if (expected.contains(EnabledEventCallback.ON_MEMTABLE_SEALED)) { + assertTrue("onMemTableSealed was not called", capturedEvents.memTableSealed); + } else { + assertFalse("onMemTableSealed was called", capturedEvents.memTableSealed); + } + + if (expected.contains(EnabledEventCallback.ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED)) { + assertTrue("onColumnFamilyHandleDeletionStarted was not called", + capturedEvents.columnFamilyHandleDeletionStarted); + } else { + assertFalse("onColumnFamilyHandleDeletionStarted was called", + capturedEvents.columnFamilyHandleDeletionStarted); + } + + if (expected.contains(EnabledEventCallback.ON_EXTERNAL_FILE_INGESTED)) { + assertTrue("onExternalFileIngested was not called", capturedEvents.externalFileIngested); + } else { + assertFalse("onExternalFileIngested was called", capturedEvents.externalFileIngested); + } + + if (expected.contains(EnabledEventCallback.ON_BACKGROUND_ERROR)) { + assertTrue("onBackgroundError was not called", capturedEvents.backgroundError); + } else { + assertFalse("onBackgroundError was called", capturedEvents.backgroundError); + } + + if (expected.contains(EnabledEventCallback.ON_STALL_CONDITIONS_CHANGED)) { + assertTrue("onStallConditionsChanged was not called", capturedEvents.stallConditionsChanged); + } else { + assertFalse("onStallConditionsChanged was called", capturedEvents.stallConditionsChanged); + } + + if 
(expected.contains(EnabledEventCallback.ON_FILE_READ_FINISH)) { + assertTrue("onFileReadFinish was not called", capturedEvents.fileReadFinish); + } else { + assertFalse("onFileReadFinish was called", capturedEvents.fileReadFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_WRITE_FINISH)) { + assertTrue("onFileWriteFinish was not called", capturedEvents.fileWriteFinish); + } else { + assertFalse("onFileWriteFinish was called", capturedEvents.fileWriteFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_FLUSH_FINISH)) { + assertTrue("onFileFlushFinish was not called", capturedEvents.fileFlushFinish); + } else { + assertFalse("onFileFlushFinish was called", capturedEvents.fileFlushFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_SYNC_FINISH)) { + assertTrue("onFileSyncFinish was not called", capturedEvents.fileSyncFinish); + } else { + assertFalse("onFileSyncFinish was called", capturedEvents.fileSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_RANGE_SYNC_FINISH)) { + assertTrue("onFileRangeSyncFinish was not called", capturedEvents.fileRangeSyncFinish); + } else { + assertFalse("onFileRangeSyncFinish was called", capturedEvents.fileRangeSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_TRUNCATE_FINISH)) { + assertTrue("onFileTruncateFinish was not called", capturedEvents.fileTruncateFinish); + } else { + assertFalse("onFileTruncateFinish was called", capturedEvents.fileTruncateFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_CLOSE_FINISH)) { + assertTrue("onFileCloseFinish was not called", capturedEvents.fileCloseFinish); + } else { + assertFalse("onFileCloseFinish was called", capturedEvents.fileCloseFinish); + } + + if (expected.contains(EnabledEventCallback.SHOULD_BE_NOTIFIED_ON_FILE_IO)) { + assertTrue( + "shouldBeNotifiedOnFileIO was not called", capturedEvents.shouldBeNotifiedOnFileIO); + } else { + assertFalse("shouldBeNotifiedOnFileIO was 
called", capturedEvents.shouldBeNotifiedOnFileIO); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_BEGIN)) { + assertTrue("onErrorRecoveryBegin was not called", capturedEvents.errorRecoveryBegin); + } else { + assertFalse("onErrorRecoveryBegin was called", capturedEvents.errorRecoveryBegin); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED)) { + assertTrue("onErrorRecoveryCompleted was not called", capturedEvents.errorRecoveryCompleted); + } else { + assertFalse("onErrorRecoveryCompleted was called", capturedEvents.errorRecoveryCompleted); + } + } + + /** + * Members are volatile as they may be written + * and read by different threads. + */ + private static class ListenerEvents { + volatile boolean flushCompleted; + volatile boolean flushBegin; + volatile boolean tableFileDeleted; + volatile boolean compactionBegin; + volatile boolean compactionCompleted; + volatile boolean tableFileCreated; + volatile boolean tableFileCreationStarted; + volatile boolean memTableSealed; + volatile boolean columnFamilyHandleDeletionStarted; + volatile boolean externalFileIngested; + volatile boolean backgroundError; + volatile boolean stallConditionsChanged; + volatile boolean fileReadFinish; + volatile boolean fileWriteFinish; + volatile boolean fileFlushFinish; + volatile boolean fileSyncFinish; + volatile boolean fileRangeSyncFinish; + volatile boolean fileTruncateFinish; + volatile boolean fileCloseFinish; + volatile boolean shouldBeNotifiedOnFileIO; + volatile boolean errorRecoveryBegin; + volatile boolean errorRecoveryCompleted; + } + + private static class CapturingTestableEventListener extends TestableEventListener { + final ListenerEvents capturedListenerEvents = new ListenerEvents(); + + public CapturingTestableEventListener() {} + + public CapturingTestableEventListener(final EnabledEventCallback... 
enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushCompleted = true; + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushBegin = true; + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + capturedListenerEvents.tableFileDeleted = true; + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionBegin = true; + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionCompleted = true; + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + capturedListenerEvents.tableFileCreated = true; + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + capturedListenerEvents.tableFileCreationStarted = true; + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + capturedListenerEvents.memTableSealed = true; + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + capturedListenerEvents.columnFamilyHandleDeletionStarted = true; + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + capturedListenerEvents.externalFileIngested = true; + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.backgroundError = true; + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + 
capturedListenerEvents.stallConditionsChanged = true; + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileReadFinish = true; + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileWriteFinish = true; + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileFlushFinish = true; + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileSyncFinish = true; + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileRangeSyncFinish = true; + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileTruncateFinish = true; + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileCloseFinish = true; + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + capturedListenerEvents.shouldBeNotifiedOnFileIO = true; + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.errorRecoveryBegin = true; + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + capturedListenerEvents.errorRecoveryCompleted = true; + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -4,20 +4,19 @@ // (found in the LICENSE.Apache file in the root directory). package org.rocksdb; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import java.nio.BufferUnderflowException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; +import org.junit.*; +import org.junit.rules.ExpectedException; +import org.junit.rules.TemporaryFolder; public class KeyMayExistTest { - @ClassRule public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = new RocksNativeLibraryResource(); @@ -25,168 +24,505 @@ @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + @Rule public ExpectedException exceptionRule = ExpectedException.none(); + + List cfDescriptors; + List columnFamilyHandleList = new ArrayList<>(); + RocksDB db; + + // Slice key + int offset; + int len; + + byte[] sliceKey; + byte[] sliceValue; + + @Before + public void before() throws RocksDBException { + cfDescriptors = Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + + db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + + // Build the slice key + final StringBuilder builder = new StringBuilder("prefix"); + offset = builder.toString().length(); + builder.append("slice key 0"); + len = builder.toString().length() - offset; + builder.append("suffix"); + sliceKey = 
builder.toString().getBytes(UTF_8); + sliceValue = "slice value 0".getBytes(UTF_8); + } + + @After + public void after() { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + db.close(); + } + @Test public void keyMayExist() throws RocksDBException { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), - new ColumnFamilyDescriptor("new_cf".getBytes()) - ); - - final List columnFamilyHandleList = new ArrayList<>(); - try (final DBOptions options = new DBOptions() - .setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList)) { - try { - assertThat(columnFamilyHandleList.size()). - isEqualTo(2); - db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); - // Test without column family - final Holder holder = new Holder<>(); - boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist("key".getBytes(UTF_8), null); - assertThat(exists).isTrue(); - - // Slice key - final StringBuilder builder = new StringBuilder("prefix"); - final int offset = builder.toString().length(); - builder.append("slice key 0"); - final int len = builder.toString().length() - offset; - builder.append("suffix"); - - final byte[] sliceKey = builder.toString().getBytes(UTF_8); - final byte[] sliceValue = "slice value 0".getBytes(UTF_8); - db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); - - exists = db.keyMayExist(sliceKey, offset, len, holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(sliceKey, offset, len, null); - assertThat(exists).isTrue(); - - 
// Test without column family but with readOptions - try (final ReadOptions readOptions = new ReadOptions()) { - exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), null); - assertThat(exists).isTrue(); - - exists = db.keyMayExist(readOptions, sliceKey, offset, len, holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(readOptions, sliceKey, offset, len, null); - assertThat(exists).isTrue(); - } - - // Test with column family - exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), - holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), - null); - assertThat(exists).isTrue(); - - // Test slice sky with column family - exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, - holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, - null); - assertThat(exists).isTrue(); - - // Test with column family and readOptions - try (final ReadOptions readOptions = new ReadOptions()) { - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - "key".getBytes(UTF_8), holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - "key".getBytes(UTF_8), null); - 
assertThat(exists).isTrue(); - - // Test slice key with column family and read options - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - sliceKey, offset, len, holder); - assertThat(exists).isTrue(); - assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(sliceValue); - - exists = db.keyMayExist(columnFamilyHandleList.get(0), readOptions, - sliceKey, offset, len, null); - assertThat(exists).isTrue(); - } - - // KeyMayExist in CF1 must return null value - exists = db.keyMayExist(columnFamilyHandleList.get(1), - "key".getBytes(UTF_8), holder); - assertThat(exists).isFalse(); - assertThat(holder.getValue()).isNull(); - exists = db.keyMayExist(columnFamilyHandleList.get(1), - "key".getBytes(UTF_8), null); - assertThat(exists).isFalse(); - - // slice key - exists = db.keyMayExist(columnFamilyHandleList.get(1), - sliceKey, 1, 3, holder); - assertThat(exists).isFalse(); - assertThat(holder.getValue()).isNull(); - exists = db.keyMayExist(columnFamilyHandleList.get(1), - sliceKey, 1, 3, null); - assertThat(exists).isFalse(); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } - } + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Test without column family + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = db.keyMayExist("key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); } @Test - public void keyMayExistNonUnicodeString() throws RocksDBException { - try (final Options options = new Options() - .setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - final RocksDB db = RocksDB.open(options, - 
dbFolder.getRoot().getAbsolutePath())) { - final byte key[] = "key".getBytes(UTF_8); - final byte value[] = { (byte)0x80 }; // invalid unicode code-point - db.put(key, value); - - final byte buf[] = new byte[10]; - final int read = db.get(key, buf); - assertThat(read).isEqualTo(1); - assertThat(buf).startsWith(value); + public void keyMayExistReadOptions() throws RocksDBException { + // Test without column family but with readOptions + try (final ReadOptions readOptions = new ReadOptions()) { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); final Holder holder = new Holder<>(); - boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); + boolean exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = db.keyMayExist(readOptions, "key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + + exists = db.keyMayExist(readOptions, sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = db.keyMayExist(readOptions, sliceKey, offset, len, null); + assertThat(exists).isTrue(); + } + } + + @Test + public void keyMayExistColumnFamily() throws RocksDBException { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // Test slice key with column family + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = 
db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, null); + assertThat(exists).isTrue(); + } + + @Test + public void keyMayExistColumnFamilyReadOptions() throws RocksDBException { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // Test slice key with column family and read options + final Holder holder = new Holder<>(); + try (final ReadOptions readOptions = new ReadOptions()) { + boolean exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, "key".getBytes(UTF_8), holder); assertThat(exists).isTrue(); assertThat(holder.getValue()).isNotNull(); - assertThat(holder.getValue()).isEqualTo(value); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, "key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); - exists = db.keyMayExist("key".getBytes(UTF_8), null); + // Test slice key with column family and read options + exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, sliceKey, offset, len, null); assertThat(exists).isTrue(); } } + + @Test + public void keyMayExistSliceKey() throws RocksDBException { + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(sliceKey, offset, len, holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(sliceValue); + + exists = 
db.keyMayExist(sliceKey, offset, len, null); + assertThat(exists).isTrue(); + + exists = db.keyMayExist("slice key".getBytes(UTF_8), null); + assertThat(exists).isFalse(); + + exists = db.keyMayExist("slice key 0".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + + // Test with column family + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(new String(holder.getValue(), UTF_8)).isEqualTo("value"); + + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + + // KeyMayExist in CF1 must return null value + exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), null); + assertThat(exists).isFalse(); + + // slice key + exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, null); + assertThat(exists).isFalse(); + } + + @Test + public void keyMayExistCF1() throws RocksDBException { + // Standard key + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // KeyMayExist in CF1 must return null value + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(UTF_8), null); + assertThat(exists).isFalse(); + } + + @Test + public void keyMayExistCF1Slice() throws RocksDBException { + // Standard key + 
db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + + // Slice key + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + // slice key + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, holder); + assertThat(exists).isFalse(); + assertThat(holder.getValue()).isNull(); + exists = db.keyMayExist(columnFamilyHandleList.get(1), sliceKey, 1, 3, null); + assertThat(exists).isFalse(); + } + + @Test + public void keyMayExistBB() throws RocksDBException { + // Standard key + db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8)); + + final byte[] key = "keyBB".getBytes(UTF_8); + final byte[] value = "valueBB".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = db.keyMayExist(keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = db.keyMayExist(keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, 
value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + + @Test + public void keyMayExistBBReadOptions() throws RocksDBException { + // Standard key + db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8)); + + final byte[] key = "keyBB".getBytes(UTF_8); + final byte[] value = "valueBB".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + try (final ReadOptions readOptions = new ReadOptions()) { + assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = db.keyMayExist(readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + } + + @Test + public void keyMayExistBBNullValue() throws 
RocksDBException { + // Standard key + db.put("keyBB".getBytes(UTF_8), "valueBB".getBytes(UTF_8)); + + final byte[] key = "keyBB".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + exceptionRule.expect(AssertionError.class); + exceptionRule.expectMessage( + "value ByteBuffer parameter cannot be null. If you do not need the value, use a different version of the method"); + final KeyMayExist keyMayExist = db.keyMayExist(keyBuffer, null); + } + + @Test + public void keyMayExistBBCF() throws RocksDBException { + // Standard key + db.put(columnFamilyHandleList.get(0), "keyBBCF0".getBytes(UTF_8), "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + // 0 is the default CF + byte[] key = "keyBBCF0".getBytes(UTF_8); + ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(true); + + // 1 is just a CF + key = "keyBBCF1".getBytes(UTF_8); + keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer)).isEqualTo(false); + + exceptionRule.expect(AssertionError.class); + exceptionRule.expectMessage( + "value ByteBuffer parameter cannot be null. 
If you do not need the value, use a different version of the method"); + final KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(0), keyBuffer, null); + } + + @Test + public void keyMayExistBBCFReadOptions() throws RocksDBException { + // Standard key + db.put(columnFamilyHandleList.get(0), "keyBBCF0".getBytes(UTF_8), "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + // 0 is the default CF + byte[] key = "keyBBCF0".getBytes(UTF_8); + ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + try (final ReadOptions readOptions = new ReadOptions()) { + assertThat(db.keyMayExist(keyBuffer)).isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) + .isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer)) + .isEqualTo(true); + + // 1 is just a CF + key = "keyBBCF1".getBytes(UTF_8); + keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(readOptions, keyBuffer)).isEqualTo(false); + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) + .isEqualTo(true); + assertThat(db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer)) + .isEqualTo(false); + + exceptionRule.expect(AssertionError.class); + exceptionRule.expectMessage( + "value ByteBuffer parameter cannot be null. 
If you do not need the value, use a different version of the method"); + final KeyMayExist keyMayExist = + db.keyMayExist(columnFamilyHandleList.get(0), readOptions, keyBuffer, null); + } + } + + @Test + public void keyMayExistBBCFOffset() throws RocksDBException { + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + final byte[] key = "keyBBCF1".getBytes(UTF_8); + final byte[] value = "valueBBCF1".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer)).isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = db.keyMayExist(columnFamilyHandleList.get(1), keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + + @Test + 
public void keyMayExistBBCFOffsetReadOptions() throws RocksDBException { + db.put(columnFamilyHandleList.get(1), "keyBBCF1".getBytes(UTF_8), "valueBBCF1".getBytes(UTF_8)); + + final byte[] key = "keyBBCF1".getBytes(UTF_8); + final byte[] value = "valueBBCF1".getBytes(UTF_8); + + final ByteBuffer keyBuffer = ByteBuffer.allocateDirect(key.length); + keyBuffer.put(key, 0, key.length); + keyBuffer.flip(); + + try (final ReadOptions readOptions = new ReadOptions()) { + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer)) + .isEqualTo(true); + + final ByteBuffer valueBuffer = ByteBuffer.allocateDirect(value.length + 24); + valueBuffer.position(12); + KeyMayExist keyMayExist = + db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(12); + assertThat(valueBuffer.limit()).isEqualTo(12 + value.length); + byte[] valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(value); + + valueBuffer.limit(value.length + 24); + valueBuffer.position(25); + keyMayExist = + db.keyMayExist(columnFamilyHandleList.get(1), readOptions, keyBuffer, valueBuffer); + assertThat(keyMayExist.exists).isEqualTo(KeyMayExist.KeyMayExistEnum.kExistsWithValue); + assertThat(keyMayExist.valueLength).isEqualTo(value.length); + assertThat(valueBuffer.position()).isEqualTo(25); + assertThat(valueBuffer.limit()).isEqualTo(24 + value.length); + valueGet = new byte[value.length - 1]; + valueBuffer.get(valueGet); + assertThat(valueGet).isEqualTo(Arrays.copyOfRange(value, 0, value.length - 1)); + + exceptionRule.expect(BufferUnderflowException.class); + valueGet = new byte[value.length]; + valueBuffer.get(valueGet); + } + } + + @Test + public void keyMayExistNonUnicodeString() throws RocksDBException { + final byte[] 
key = "key".getBytes(UTF_8); + final byte[] value = {(byte) 0x80}; // invalid unicode code-point + db.put(key, value); + + final byte[] buf = new byte[10]; + final int read = db.get(key, buf); + assertThat(read).isEqualTo(1); + assertThat(buf).startsWith(value); + + final Holder holder = new Holder<>(); + boolean exists = db.keyMayExist("key".getBytes(UTF_8), holder); + assertThat(exists).isTrue(); + assertThat(holder.getValue()).isNotNull(); + assertThat(holder.getValue()).isEqualTo(value); + + exists = db.keyMayExist("key".getBytes(UTF_8), null); + assertThat(exists).isTrue(); + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/LRUCacheTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,23 +5,27 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.ClassRule; import org.junit.Test; public class LRUCacheTest { - - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); @Test public void newLRUCache() { - final long capacity = 1000; + final long capacity = 80000000; final int numShardBits = 16; final boolean strictCapacityLimit = true; - final double highPriPoolRatio = 5; + final double highPriPoolRatio = 0.05; try(final Cache lruCache = new LRUCache(capacity, numShardBits, strictCapacityLimit, highPriPoolRatio)) { //no op + assertThat(lruCache.getUsage()).isGreaterThanOrEqualTo(0); + assertThat(lruCache.getPinnedUsage()).isGreaterThanOrEqualTo(0); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -58,7 +58,8 @@ db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( db.getAggregatedLongProperty(TABLE_READERS)); - assertThat(usage.get(MemoryUsageType.kCacheTotal)).isEqualTo(0); + // TODO(peterd): disable block cache entry stats and check for 0 + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isLessThan(1024); db.put(key, value); db.flush(flushOptions); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,18 +5,18 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.ArrayList; - import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static org.assertj.core.api.Assertions.assertThat; - public class MergeTest { @ClassRule @@ -46,13 +46,13 @@ } private byte[] longToByteArray(long l) { - ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE); + ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.putLong(l); return buf.array(); } private long 
longFromByteArray(byte[] a) { - ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE); + ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.put(a); buf.flip(); return buf.getLong(); @@ -144,14 +144,13 @@ // writing (long)100 under key db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(100)); - // merge (long)1 under key - db.merge(columnFamilyHandleList.get(1), - "cfkey".getBytes(), longToByteArray(1)); + // merge (long)157 under key + db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(157)); byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); long longValue = longFromByteArray(value); - assertThat(longValue).isEqualTo(101); + assertThat(longValue).isEqualTo(257); } finally { for (final ColumnFamilyHandle handle : columnFamilyHandleList) { handle.close(); @@ -413,6 +412,32 @@ } } + @Test + public void emptyStringAsStringAppendDelimiter() throws RocksDBException { + try (final StringAppendOperator stringAppendOperator = new StringAppendOperator(""); + final Options opt = + new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key".getBytes(), "aa".getBytes()); + db.merge("key".getBytes(), "bb".getBytes()); + final byte[] value = db.get("key".getBytes()); + assertThat(new String(value)).isEqualTo("aabb"); + } + } + + @Test + public void multiCharStringAsStringAppendDelimiter() throws RocksDBException { + try (final StringAppendOperator stringAppendOperator = new StringAppendOperator("<>"); + final Options opt = + new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key".getBytes(), "aa".getBytes()); + db.merge("key".getBytes(), "bb".getBytes()); + final byte[] value = db.get("key".getBytes()); + assertThat(new 
String(value)).isEqualTo("aa<>bb"); + } + } + @Test public void emptyStringInSetMergeOperatorByName() { try (final Options opt = new Options() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.charset.StandardCharsets; +import java.util.*; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class MultiGetManyKeysTest { + @Parameterized.Parameters + public static List data() { + return Arrays.asList(3, 250, 60000, 70000, 150000, 750000); + } + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + private final int keySize; + + public MultiGetManyKeysTest(final Integer keySize) { + this.keySize = keySize; + } + + /** + * Test for https://github.com/facebook/rocksdb/issues/8039 + */ + @Test + public void multiGetAsListLarge() throws RocksDBException { + final Random rand = new Random(); + final List keys = new ArrayList<>(); + for (int i = 0; i < keySize; i++) { + final byte[] key = new byte[4]; + rand.nextBytes(key); + keys.add(key); + } + + try (final Options opt = new Options().setCreateIfMissing(true); 
+ final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List values = db.multiGetAsList(keys); + assertThat(values.size()).isEqualTo(keys.size()); + } + } + + @Test + public void multiGetAsListCheckResults() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List keys = new ArrayList<>(); + for (int i = 0; i < keySize; i++) { + byte[] key = ("key" + i + ":").getBytes(); + keys.add(key); + db.put(key, ("value" + i + ":").getBytes()); + } + + final List values = db.multiGetAsList(keys); + assertThat(values.size()).isEqualTo(keys.size()); + for (int i = 0; i < keySize; i++) { + assertThat(values.get(i)).isEqualTo(("value" + i + ":").getBytes()); + } + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MultiGetTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,525 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.TestUtil; + +public class MultiGetTest { + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void putNThenMultiGet() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + final List keys = + Arrays.asList("key1".getBytes(), "key2".getBytes(), "key3".getBytes()); + final List values = db.multiGetAsList(keys); + assertThat(values.size()).isEqualTo(keys.size()); + assertThat(values.get(0)).isEqualTo("value1ForKey1".getBytes()); + assertThat(values.get(1)).isEqualTo("value2ForKey2".getBytes()); + assertThat(values.get(2)).isEqualTo("value3ForKey3".getBytes()); + } + } + + @Test + public void putNThenMultiGetDirect() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key 
: keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + + { + final List results = + db.multiGetByteBuffers(new ReadOptions(), keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectSliced() throws RocksDBException { + try (final 
Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + keys.add( + ByteBuffer.allocateDirect(12).put("prefix1".getBytes()).slice().put("key1".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value3ForKey3".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value1ForKey1".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectBadValuesArray() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), 
"value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + + { + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + values.remove(0); + + try { + db.multiGetByteBuffers(keys, values); + fail("Expected exception when not enough value ByteBuffers supplied"); + } catch (final IllegalArgumentException e) { + assertThat(e.getMessage()).contains("For each key there must be a corresponding value"); + } + } + + { + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + values.add(ByteBuffer.allocateDirect(24)); + + try { + db.multiGetByteBuffers(keys, values); + fail("Expected exception when too many value ByteBuffers supplied"); + } catch (final IllegalArgumentException e) { + assertThat(e.getMessage()).contains("For each key there must be a corresponding value"); + } + } + } + } + + @Test + public void putNThenMultiGetDirectShortValueBuffers() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + 
keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + + { + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(4)); + } + + final List statii = db.multiGetByteBuffers(keys, values); + assertThat(statii.size()).isEqualTo(values.size()); + for (final ByteBufferGetStatus status : statii) { + assertThat(status.status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(status.requiredSize).isEqualTo("value3ForKey3".getBytes().length); + final ByteBuffer expected = + ByteBuffer.allocateDirect(24).put(Arrays.copyOf("valueX".getBytes(), 4)); + expected.flip(); + assertThat(status.value).isEqualTo(expected); + } + } + } + } + + @Test + public void putNThenMultiGetDirectNondefaultCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(0); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf2".getBytes())); + + final List cf = db.createColumnFamilies(cfDescriptors); + + db.put(cf.get(0), "key1".getBytes(), "value1ForKey1".getBytes()); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2".getBytes()); + db.put(cf.get(0), "key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new 
ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List results = db.multiGetByteBuffers(keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + columnFamilyHandles.add(cf.get(0)); + columnFamilyHandles.add(cf.get(0)); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + 
assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectCFParams() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put("key3".getBytes(), "value3ForKey3".getBytes()); + + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + try { + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + fail("Expected exception when 2 column families supplied"); + } catch (final IllegalArgumentException e) { + assertThat(e.getMessage()).contains("Wrong number of ColumnFamilyHandle(s) supplied"); + } + + columnFamilyHandles.clear(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, 
keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)).isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)).isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)).isEqualTo("value3ForKey3".getBytes()); + } + } + + @Test + public void putNThenMultiGetDirectMixedCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf2".getBytes())); + cfDescriptors.add(new ColumnFamilyDescriptor("cf3".getBytes())); + + final List cf = db.createColumnFamilies(cfDescriptors); + + db.put(cf.get(1), "key1".getBytes(), "value1ForKey1".getBytes()); + db.put("key2".getBytes(), "value2ForKey2".getBytes()); + db.put(cf.get(3), "key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values 
= new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound); + + assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(1)); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.NotFound); + + assertThat(results.get(0).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(1)); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + columnFamilyHandles.add(cf.get(3)); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + 
assertThat(results.get(1).requiredSize).isEqualTo("value2ForKey2".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("value2ForKey2".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(db.getDefaultColumnFamily()); + columnFamilyHandles.add(cf.get(1)); + columnFamilyHandles.add(cf.get(3)); + + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.NotFound); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } + + @Test + public void putNThenMultiGetDirectTruncateCF() throws RocksDBException { + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final List cfDescriptors = new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor("cf0".getBytes())); + + final List cf = db.createColumnFamilies(cfDescriptors); + + db.put(cf.get(0), "key1".getBytes(), "value1ForKey1".getBytes()); + db.put(cf.get(0), "key2".getBytes(), "value2ForKey2WithLotsOfTrailingGarbage".getBytes()); + db.put(cf.get(0), "key3".getBytes(), "value3ForKey3".getBytes()); + + final List keys = new ArrayList<>(); + keys.add(ByteBuffer.allocateDirect(12).put("key1".getBytes())); + 
keys.add(ByteBuffer.allocateDirect(12).put("key2".getBytes())); + keys.add(ByteBuffer.allocateDirect(12).put("key3".getBytes())); + // Java8 and lower flip() returns Buffer not ByteBuffer, so can't chain above /\/\ + for (final ByteBuffer key : keys) { + key.flip(); + } + final List values = new ArrayList<>(); + for (int i = 0; i < keys.size(); i++) { + values.add(ByteBuffer.allocateDirect(24)); + } + + { + final List columnFamilyHandles = new ArrayList<>(); + columnFamilyHandles.add(cf.get(0)); + final List results = + db.multiGetByteBuffers(columnFamilyHandles, keys, values); + + assertThat(results.get(0).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(1).status.getCode()).isEqualTo(Status.Code.Ok); + assertThat(results.get(2).status.getCode()).isEqualTo(Status.Code.Ok); + + assertThat(results.get(0).requiredSize).isEqualTo("value1ForKey1".getBytes().length); + assertThat(results.get(1).requiredSize) + .isEqualTo("value2ForKey2WithLotsOfTrailingGarbage".getBytes().length); + assertThat(results.get(2).requiredSize).isEqualTo("value3ForKey3".getBytes().length); + + assertThat(TestUtil.bufferBytes(results.get(0).value)) + .isEqualTo("value1ForKey1".getBytes()); + assertThat(TestUtil.bufferBytes(results.get(1).value)) + .isEqualTo("valu e2Fo rKey 2Wit hLot sOfT".replace(" ", "").getBytes()); + assertThat(TestUtil.bufferBytes(results.get(2).value)) + .isEqualTo("value3ForKey3".getBytes()); + } + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -59,23 +59,23 @@ @Test 
public void mutableColumnFamilyOptions_toString() { - final String str = MutableColumnFamilyOptions - .builder() - .setWriteBufferSize(10) - .setInplaceUpdateNumLocks(5) - .setDisableAutoCompactions(true) - .setParanoidFileChecks(true) - .build() - .toString(); + final String str = MutableColumnFamilyOptions.builder() + .setWriteBufferSize(10) + .setInplaceUpdateNumLocks(5) + .setDisableAutoCompactions(true) + .setParanoidFileChecks(true) + .setMaxBytesForLevelMultiplierAdditional(new int[] {2, 3, 5, 7, 11, 13}) + .build() + .toString(); assertThat(str).isEqualTo("write_buffer_size=10;inplace_update_num_locks=5;" - + "disable_auto_compactions=true;paranoid_file_checks=true"); + + "disable_auto_compactions=true;paranoid_file_checks=true;max_bytes_for_level_multiplier_additional=2:3:5:7:11:13"); } @Test public void mutableColumnFamilyOptions_parse() { final String str = "write_buffer_size=10;inplace_update_num_locks=5;" - + "disable_auto_compactions=true;paranoid_file_checks=true"; + + "disable_auto_compactions=true;paranoid_file_checks=true;max_bytes_for_level_multiplier_additional=2:{3}:{5}:{7}:{11}:{13}"; final MutableColumnFamilyOptionsBuilder builder = MutableColumnFamilyOptions.parse(str); @@ -84,5 +84,79 @@ assertThat(builder.inplaceUpdateNumLocks()).isEqualTo(5); assertThat(builder.disableAutoCompactions()).isEqualTo(true); assertThat(builder.paranoidFileChecks()).isEqualTo(true); + assertThat(builder.maxBytesForLevelMultiplierAdditional()) + .isEqualTo(new int[] {2, 3, 5, 7, 11, 13}); + } + + /** + * Extended parsing test to deal with all the options which C++ may return. 
+ * We have canned a set of options returned by {RocksDB#getOptions} + */ + @Test + public void mutableColumnFamilyOptions_parse_getOptions_output() { + final String optionsString = + "bottommost_compression=kDisableCompressionOption; sample_for_compression=0; " + + "blob_garbage_collection_age_cutoff=0.250000; blob_garbage_collection_force_threshold=0.800000; arena_block_size=1048576; enable_blob_garbage_collection=false; " + + "level0_stop_writes_trigger=36; min_blob_size=65536; " + + "compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;" + + "compression_size_percent=-1;max_size_amplification_percent=200;max_merge_width=4294967295;size_ratio=1;}; " + + "target_file_size_base=67108864; max_bytes_for_level_base=268435456; memtable_whole_key_filtering=false; " + + "soft_pending_compaction_bytes_limit=68719476736; blob_compression_type=kNoCompression; max_write_buffer_number=2; " + + "ttl=2592000; compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}; " + + "check_flush_compaction_key_order=true; max_successive_merges=0; inplace_update_num_locks=10000; " + + "bottommost_compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;" + + "strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;}; " + + "target_file_size_multiplier=1; max_bytes_for_level_multiplier_additional=5:{7}:{9}:{11}:{13}:{15}:{17}; " + + "enable_blob_files=true; level0_slowdown_writes_trigger=20; compression=kLZ4HCCompression; level0_file_num_compaction_trigger=4; " + + "blob_file_size=268435456; prefix_extractor=nullptr; max_bytes_for_level_multiplier=10.000000; write_buffer_size=67108864; " + + "disable_auto_compactions=false; max_compaction_bytes=1677721600; memtable_huge_page_size=0; " + + "compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;" + + "level=32767;window_bits=-14;}; 
" + + "hard_pending_compaction_bytes_limit=274877906944; periodic_compaction_seconds=0; paranoid_file_checks=true; " + + "memtable_prefix_bloom_size_ratio=7.500000; max_sequential_skip_in_iterations=8; report_bg_io_stats=true; " + + "compaction_pri=kMinOverlappingRatio; compaction_style=kCompactionStyleLevel; memtable_factory=SkipListFactory; " + + "comparator=leveldb.BytewiseComparator; bloom_locality=0; compaction_filter_factory=nullptr; " + + "min_write_buffer_number_to_merge=1; max_write_buffer_number_to_maintain=0; compaction_filter=nullptr; merge_operator=nullptr; " + + "num_levels=7; optimize_filters_for_hits=false; force_consistency_checks=true; table_factory=BlockBasedTable; " + + "max_write_buffer_size_to_maintain=0; memtable_insert_with_hint_prefix_extractor=nullptr; level_compaction_dynamic_level_bytes=false; " + + "inplace_update_support=false;"; + + MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder cf = + MutableColumnFamilyOptions.parse(optionsString, true); + + // Check the values from the parsed string which are column family options + assertThat(cf.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(cf.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(cf.arenaBlockSize()).isEqualTo(1048576); + assertThat(cf.enableBlobGarbageCollection()).isEqualTo(false); + assertThat(cf.level0StopWritesTrigger()).isEqualTo(36); + assertThat(cf.minBlobSize()).isEqualTo(65536); + assertThat(cf.targetFileSizeBase()).isEqualTo(67108864); + assertThat(cf.maxBytesForLevelBase()).isEqualTo(268435456); + assertThat(cf.softPendingCompactionBytesLimit()).isEqualTo(68719476736L); + assertThat(cf.blobCompressionType()).isEqualTo(CompressionType.NO_COMPRESSION); + assertThat(cf.maxWriteBufferNumber()).isEqualTo(2); + assertThat(cf.ttl()).isEqualTo(2592000); + assertThat(cf.maxSuccessiveMerges()).isEqualTo(0); + assertThat(cf.inplaceUpdateNumLocks()).isEqualTo(10000); + assertThat(cf.targetFileSizeMultiplier()).isEqualTo(1); + 
assertThat(cf.maxBytesForLevelMultiplierAdditional()) + .isEqualTo(new int[] {5, 7, 9, 11, 13, 15, 17}); + assertThat(cf.enableBlobFiles()).isEqualTo(true); + assertThat(cf.level0SlowdownWritesTrigger()).isEqualTo(20); + assertThat(cf.compressionType()).isEqualTo(CompressionType.LZ4HC_COMPRESSION); + assertThat(cf.level0FileNumCompactionTrigger()).isEqualTo(4); + assertThat(cf.blobFileSize()).isEqualTo(268435456); + assertThat(cf.maxBytesForLevelMultiplier()).isEqualTo(10.0); + assertThat(cf.writeBufferSize()).isEqualTo(67108864); + assertThat(cf.disableAutoCompactions()).isEqualTo(false); + assertThat(cf.maxCompactionBytes()).isEqualTo(1677721600); + assertThat(cf.memtableHugePageSize()).isEqualTo(0); + assertThat(cf.hardPendingCompactionBytesLimit()).isEqualTo(274877906944L); + assertThat(cf.periodicCompactionSeconds()).isEqualTo(0); + assertThat(cf.paranoidFileChecks()).isEqualTo(true); + assertThat(cf.memtablePrefixBloomSizeRatio()).isEqualTo(7.5); + assertThat(cf.maxSequentialSkipInIterations()).isEqualTo(8); + assertThat(cf.reportBgIoStats()).isEqualTo(true); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/MutableOptionsGetSetTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,397 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class MutableOptionsGetSetTest { + final int minBlobSize = 65536; + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + /** + * Validate the round-trip of blob options into and out of the C++ core of RocksDB + * From CF options on CF Creation to {RocksDB#getOptions} + * Uses 2x column families with different values for their options. + * NOTE that some constraints are applied to the options in the C++ core, + * e.g. on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio} + * + * @throws RocksDBException if the database throws an exception + */ + @Test + public void testGetMutableBlobOptionsAfterCreate() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try (final ColumnFamilyOptions columnFamilyOptions1 = + new ColumnFamilyOptions() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true) + .setBlobGarbageCollectionAgeCutoff(0.25) + .setBlobGarbageCollectionForceThreshold(0.80) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.17) + .setMemtableHugePageSize(3) + .setMaxSuccessiveMerges(4) + .setMaxWriteBufferNumber(12) + .setInplaceUpdateNumLocks(16) + 
.setDisableAutoCompactions(false) + .setSoftPendingCompactionBytesLimit(112) + .setHardPendingCompactionBytesLimit(280) + .setLevel0FileNumCompactionTrigger(200) + .setLevel0SlowdownWritesTrigger(312) + .setLevel0StopWritesTrigger(584) + .setMaxCompactionBytes(12) + .setTargetFileSizeBase(99) + .setTargetFileSizeMultiplier(112) + .setMaxSequentialSkipInIterations(50) + .setReportBgIoStats(true); + + final ColumnFamilyOptions columnFamilyOptions2 = + new ColumnFamilyOptions() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(false) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.236) + .setMemtableHugePageSize(8) + .setMaxSuccessiveMerges(12) + .setMaxWriteBufferNumber(22) + .setInplaceUpdateNumLocks(160) + .setDisableAutoCompactions(true) + .setSoftPendingCompactionBytesLimit(1124) + .setHardPendingCompactionBytesLimit(2800) + .setLevel0FileNumCompactionTrigger(2000) + .setLevel0SlowdownWritesTrigger(5840) + .setLevel0StopWritesTrigger(31200) + .setMaxCompactionBytes(112) + .setTargetFileSizeBase(999) + .setTargetFileSizeMultiplier(1120) + .setMaxSequentialSkipInIterations(24) + .setReportBgIoStats(true)) { + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + + // Create the column family with blob options + final ColumnFamilyHandle columnFamilyHandle1 = + db.createColumnFamily(columnFamilyDescriptor1); + final ColumnFamilyHandle columnFamilyHandle2 = + db.createColumnFamily(columnFamilyDescriptor2); + + // Check the getOptions() brings back the creation options for CF1 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = + db.getOptions(columnFamilyHandle1); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + 
assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder1.arenaBlockSize()).isEqualTo(42); + assertThat(builder1.memtableHugePageSize()).isEqualTo(3); + assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17); + assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4); + assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12); + assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16); + assertThat(builder1.disableAutoCompactions()).isEqualTo(false); + assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112); + assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280); + assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200); + assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312); + assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584); + assertThat(builder1.maxCompactionBytes()).isEqualTo(12); + assertThat(builder1.targetFileSizeBase()).isEqualTo(99); + assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112); + assertThat(builder1.maxSequentialSkipInIterations()).isEqualTo(50); + assertThat(builder1.reportBgIoStats()).isEqualTo(true); + + // Check the getOptions() brings back the creation options for CF2 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 = + db.getOptions(columnFamilyHandle2); + assertThat(builder2.enableBlobFiles()).isEqualTo(false); + assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder2.arenaBlockSize()).isEqualTo(42); + assertThat(builder2.memtableHugePageSize()).isEqualTo(8); + assertThat(builder2.memtablePrefixBloomSizeRatio()).isEqualTo(0.236); + assertThat(builder2.maxSuccessiveMerges()).isEqualTo(12); + assertThat(builder2.maxWriteBufferNumber()).isEqualTo(22); + assertThat(builder2.inplaceUpdateNumLocks()).isEqualTo(160); + assertThat(builder2.disableAutoCompactions()).isEqualTo(true); + 
assertThat(builder2.softPendingCompactionBytesLimit()).isEqualTo(1124); + assertThat(builder2.hardPendingCompactionBytesLimit()).isEqualTo(2800); + assertThat(builder2.level0FileNumCompactionTrigger()).isEqualTo(2000); + assertThat(builder2.level0SlowdownWritesTrigger()).isEqualTo(5840); + assertThat(builder2.level0StopWritesTrigger()).isEqualTo(31200); + assertThat(builder2.maxCompactionBytes()).isEqualTo(112); + assertThat(builder2.targetFileSizeBase()).isEqualTo(999); + assertThat(builder2.targetFileSizeMultiplier()).isEqualTo(1120); + assertThat(builder2.maxSequentialSkipInIterations()).isEqualTo(24); + assertThat(builder2.reportBgIoStats()).isEqualTo(true); + } + } + } + + /** + * Validate the round-trip of blob options into and out of the C++ core of RocksDB + * From {RocksDB#setOptions} to {RocksDB#getOptions} + * Uses 2x column families with different values for their options. + * NOTE that some constraints are applied to the options in the C++ core, + * e.g. on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio} + * + * @throws RocksDBException if a database access has an error + */ + @Test + public void testGetMutableBlobOptionsAfterSetCF() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try (final ColumnFamilyOptions columnFamilyOptions1 = new ColumnFamilyOptions(); + + final ColumnFamilyOptions columnFamilyOptions2 = new ColumnFamilyOptions()) { + final ColumnFamilyDescriptor columnFamilyDescriptor1 = + new 
ColumnFamilyDescriptor("column_family_1".getBytes(UTF_8), columnFamilyOptions1); + final ColumnFamilyDescriptor columnFamilyDescriptor2 = + new ColumnFamilyDescriptor("column_family_2".getBytes(UTF_8), columnFamilyOptions2); + + // Create the column family with blob options + final ColumnFamilyHandle columnFamilyHandle1 = + db.createColumnFamily(columnFamilyDescriptor1); + final ColumnFamilyHandle columnFamilyHandle2 = + db.createColumnFamily(columnFamilyDescriptor2); + db.flush(new FlushOptions().setWaitForFlush(true)); + + final MutableColumnFamilyOptions + .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions1 = + MutableColumnFamilyOptions.builder() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true) + .setBlobGarbageCollectionAgeCutoff(0.25) + .setBlobGarbageCollectionForceThreshold(0.80) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.17) + .setMemtableHugePageSize(3) + .setMaxSuccessiveMerges(4) + .setMaxWriteBufferNumber(12) + .setInplaceUpdateNumLocks(16) + .setDisableAutoCompactions(false) + .setSoftPendingCompactionBytesLimit(112) + .setHardPendingCompactionBytesLimit(280) + .setLevel0FileNumCompactionTrigger(200) + .setLevel0SlowdownWritesTrigger(312) + .setLevel0StopWritesTrigger(584) + .setMaxCompactionBytes(12) + .setTargetFileSizeBase(99) + .setTargetFileSizeMultiplier(112); + db.setOptions(columnFamilyHandle1, mutableColumnFamilyOptions1.build()); + + // Check the getOptions() brings back the creation options for CF1 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = + db.getOptions(columnFamilyHandle1); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder1.arenaBlockSize()).isEqualTo(42); + assertThat(builder1.memtableHugePageSize()).isEqualTo(3); + 
assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17); + assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4); + assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12); + assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16); + assertThat(builder1.disableAutoCompactions()).isEqualTo(false); + assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112); + assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280); + assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200); + assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312); + assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584); + assertThat(builder1.maxCompactionBytes()).isEqualTo(12); + assertThat(builder1.targetFileSizeBase()).isEqualTo(99); + assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112); + + final MutableColumnFamilyOptions + .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions2 = + MutableColumnFamilyOptions.builder() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(false) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.236) + .setMemtableHugePageSize(8) + .setMaxSuccessiveMerges(12) + .setMaxWriteBufferNumber(22) + .setInplaceUpdateNumLocks(160) + .setDisableAutoCompactions(true) + .setSoftPendingCompactionBytesLimit(1124) + .setHardPendingCompactionBytesLimit(2800) + .setLevel0FileNumCompactionTrigger(2000) + .setLevel0SlowdownWritesTrigger(5840) + .setLevel0StopWritesTrigger(31200) + .setMaxCompactionBytes(112) + .setTargetFileSizeBase(999) + .setTargetFileSizeMultiplier(1120); + db.setOptions(columnFamilyHandle2, mutableColumnFamilyOptions2.build()); + + // Check the getOptions() brings back the creation options for CF2 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder2 = + db.getOptions(columnFamilyHandle2); + assertThat(builder2.enableBlobFiles()).isEqualTo(false); + assertThat(builder2.minBlobSize()).isEqualTo(minBlobSize); + 
assertThat(builder2.arenaBlockSize()).isEqualTo(42); + assertThat(builder2.memtableHugePageSize()).isEqualTo(8); + assertThat(builder2.memtablePrefixBloomSizeRatio()).isEqualTo(0.236); + assertThat(builder2.maxSuccessiveMerges()).isEqualTo(12); + assertThat(builder2.maxWriteBufferNumber()).isEqualTo(22); + assertThat(builder2.inplaceUpdateNumLocks()).isEqualTo(160); + assertThat(builder2.disableAutoCompactions()).isEqualTo(true); + assertThat(builder2.softPendingCompactionBytesLimit()).isEqualTo(1124); + assertThat(builder2.hardPendingCompactionBytesLimit()).isEqualTo(2800); + assertThat(builder2.level0FileNumCompactionTrigger()).isEqualTo(2000); + assertThat(builder2.level0SlowdownWritesTrigger()).isEqualTo(5840); + assertThat(builder2.level0StopWritesTrigger()).isEqualTo(31200); + assertThat(builder2.maxCompactionBytes()).isEqualTo(112); + assertThat(builder2.targetFileSizeBase()).isEqualTo(999); + assertThat(builder2.targetFileSizeMultiplier()).isEqualTo(1120); + } + } + } + + /** + * Validate the round-trip of blob options into and out of the C++ core of RocksDB + * From {RocksDB#setOptions} to {RocksDB#getOptions} + * Uses 2x column families with different values for their options. + * NOTE that some constraints are applied to the options in the C++ core, + * e.g. 
on {ColumnFamilyOptions#setMemtablePrefixBloomSizeRatio} + * + * @throws RocksDBException if a database access has an error + */ + @Test + public void testGetMutableBlobOptionsAfterSet() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + final MutableColumnFamilyOptions + .MutableColumnFamilyOptionsBuilder mutableColumnFamilyOptions = + MutableColumnFamilyOptions.builder() + .setMinBlobSize(minBlobSize) + .setEnableBlobFiles(true) + .setBlobGarbageCollectionAgeCutoff(0.25) + .setBlobGarbageCollectionForceThreshold(0.80) + .setArenaBlockSize(42) + .setMemtablePrefixBloomSizeRatio(0.17) + .setMemtableHugePageSize(3) + .setMaxSuccessiveMerges(4) + .setMaxWriteBufferNumber(12) + .setInplaceUpdateNumLocks(16) + .setDisableAutoCompactions(false) + .setSoftPendingCompactionBytesLimit(112) + .setHardPendingCompactionBytesLimit(280) + .setLevel0FileNumCompactionTrigger(200) + .setLevel0SlowdownWritesTrigger(312) + .setLevel0StopWritesTrigger(584) + .setMaxCompactionBytes(12) + .setTargetFileSizeBase(99) + .setTargetFileSizeMultiplier(112); + db.setOptions(mutableColumnFamilyOptions.build()); + + // Check the getOptions() brings back the creation options for CF1 + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder builder1 = db.getOptions(); + assertThat(builder1.enableBlobFiles()).isEqualTo(true); + assertThat(builder1.blobGarbageCollectionAgeCutoff()).isEqualTo(0.25); + 
assertThat(builder1.blobGarbageCollectionForceThreshold()).isEqualTo(0.80); + assertThat(builder1.minBlobSize()).isEqualTo(minBlobSize); + assertThat(builder1.arenaBlockSize()).isEqualTo(42); + assertThat(builder1.memtableHugePageSize()).isEqualTo(3); + assertThat(builder1.memtablePrefixBloomSizeRatio()).isEqualTo(0.17); + assertThat(builder1.maxSuccessiveMerges()).isEqualTo(4); + assertThat(builder1.maxWriteBufferNumber()).isEqualTo(12); + assertThat(builder1.inplaceUpdateNumLocks()).isEqualTo(16); + assertThat(builder1.disableAutoCompactions()).isEqualTo(false); + assertThat(builder1.softPendingCompactionBytesLimit()).isEqualTo(112); + assertThat(builder1.hardPendingCompactionBytesLimit()).isEqualTo(280); + assertThat(builder1.level0FileNumCompactionTrigger()).isEqualTo(200); + assertThat(builder1.level0SlowdownWritesTrigger()).isEqualTo(312); + assertThat(builder1.level0StopWritesTrigger()).isEqualTo(584); + assertThat(builder1.maxCompactionBytes()).isEqualTo(12); + assertThat(builder1.targetFileSizeBase()).isEqualTo(99); + assertThat(builder1.targetFileSizeMultiplier()).isEqualTo(112); + } + } + + @Test + public void testGetMutableDBOptionsAfterSet() throws RocksDBException { + final ColumnFamilyOptions columnFamilyOptions0 = new ColumnFamilyOptions(); + final ColumnFamilyDescriptor columnFamilyDescriptor0 = + new ColumnFamilyDescriptor("default".getBytes(UTF_8), columnFamilyOptions0); + final List columnFamilyDescriptors = + Collections.singletonList(columnFamilyDescriptor0); + final List columnFamilyHandles = new ArrayList<>(); + + try (final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + final MutableDBOptions.MutableDBOptionsBuilder mutableDBOptions = + MutableDBOptions.builder() + .setMaxBackgroundJobs(16) + .setAvoidFlushDuringShutdown(true) + .setWritableFileMaxBufferSize(2097152) + 
.setDelayedWriteRate(67108864) + .setMaxTotalWalSize(16777216) + .setDeleteObsoleteFilesPeriodMicros(86400000000L) + .setStatsDumpPeriodSec(1200) + .setStatsPersistPeriodSec(7200) + .setStatsHistoryBufferSize(6291456) + .setMaxOpenFiles(8) + .setBytesPerSync(4194304) + .setWalBytesPerSync(1048576) + .setStrictBytesPerSync(true) + .setCompactionReadaheadSize(1024); + + db.setDBOptions(mutableDBOptions.build()); + + final MutableDBOptions.MutableDBOptionsBuilder getBuilder = db.getDBOptions(); + assertThat(getBuilder.maxBackgroundJobs()).isEqualTo(16); // 4 + assertThat(getBuilder.avoidFlushDuringShutdown()).isEqualTo(true); // false + assertThat(getBuilder.writableFileMaxBufferSize()).isEqualTo(2097152); // 1048576 + assertThat(getBuilder.delayedWriteRate()).isEqualTo(67108864); // 16777216 + assertThat(getBuilder.maxTotalWalSize()).isEqualTo(16777216); + assertThat(getBuilder.deleteObsoleteFilesPeriodMicros()) + .isEqualTo(86400000000L); // 21600000000 + assertThat(getBuilder.statsDumpPeriodSec()).isEqualTo(1200); // 600 + assertThat(getBuilder.statsPersistPeriodSec()).isEqualTo(7200); // 600 + assertThat(getBuilder.statsHistoryBufferSize()).isEqualTo(6291456); // 1048576 + assertThat(getBuilder.maxOpenFiles()).isEqualTo(8); //-1 + assertThat(getBuilder.bytesPerSync()).isEqualTo(4194304); // 1048576 + assertThat(getBuilder.walBytesPerSync()).isEqualTo(1048576); // 0 + assertThat(getBuilder.strictBytesPerSync()).isEqualTo(true); // false + assertThat(getBuilder.compactionReadaheadSize()).isEqualTo(1024); // 0 + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -15,6 +15,9 @@ import static org.junit.Assert.assertEquals; public class NativeComparatorWrapperTest { + static { + RocksDB.loadLibrary(); + } @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,16 +5,18 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.*; + +import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; - +import java.util.concurrent.atomic.AtomicBoolean; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; -import static org.assertj.core.api.Assertions.assertThat; - - public class OptionsTest { @ClassRule @@ -685,6 +687,16 @@ } @Test + public void setWriteBufferManagerWithAllowStall() throws RocksDBException { + try (final Options opt = new Options(); final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache, true)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + assertThat(opt.writeBufferManager().allowStall()).isEqualTo(true); + } + } + + @Test public void accessHintOnCompactionStart() { try (final Options opt = new Options()) { final AccessHint accessHint = AccessHint.SEQUENTIAL; @@ -1255,6 +1267,14 @@ } @Test + public void 
periodicCompactionSeconds() { + try (final Options options = new Options()) { + options.setPeriodicCompactionSeconds(1000 * 60); + assertThat(options.periodicCompactionSeconds()).isEqualTo(1000 * 60); + } + } + + @Test public void compactionOptionsUniversal() { try (final Options options = new Options(); final CompactionOptionsUniversal optUni = new CompactionOptionsUniversal() @@ -1308,4 +1328,164 @@ } } + @Test + public void compactionThreadLimiter() { + try (final Options options = new Options(); + final ConcurrentTaskLimiter compactionThreadLimiter = + new ConcurrentTaskLimiterImpl("name", 3)) { + options.setCompactionThreadLimiter(compactionThreadLimiter); + assertThat(options.compactionThreadLimiter()).isEqualTo(compactionThreadLimiter); + } + } + + @Test + public void oldDefaults() { + try (final Options options = new Options()) { + options.oldDefaults(4, 6); + assertThat(options.writeBufferSize()).isEqualTo(4 << 20); + assertThat(options.compactionPriority()).isEqualTo(CompactionPriority.ByCompensatedSize); + assertThat(options.targetFileSizeBase()).isEqualTo(2 * 1048576); + assertThat(options.maxBytesForLevelBase()).isEqualTo(10 * 1048576); + assertThat(options.softPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.hardPendingCompactionBytesLimit()).isEqualTo(0); + assertThat(options.level0StopWritesTrigger()).isEqualTo(24); + } + } + + @Test + public void optimizeForSmallDbWithCache() { + try (final Options options = new Options(); final Cache cache = new LRUCache(1024)) { + assertThat(options.optimizeForSmallDb(cache)).isEqualTo(options); + } + } + + @Test + public void cfPaths() { + try (final Options options = new Options()) { + final List paths = Arrays.asList( + new DbPath(Paths.get("test1"), 2 << 25), new DbPath(Paths.get("/test2/path"), 2 << 25)); + assertThat(options.cfPaths()).isEqualTo(Collections.emptyList()); + assertThat(options.setCfPaths(paths)).isEqualTo(options); + assertThat(options.cfPaths()).isEqualTo(paths); + } + } + + 
@Test + public void avoidUnnecessaryBlockingIO() { + try (final Options options = new Options()) { + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(false); + assertThat(options.setAvoidUnnecessaryBlockingIO(true)).isEqualTo(options); + assertThat(options.avoidUnnecessaryBlockingIO()).isEqualTo(true); + } + } + + @Test + public void persistStatsToDisk() { + try (final Options options = new Options()) { + assertThat(options.persistStatsToDisk()).isEqualTo(false); + assertThat(options.setPersistStatsToDisk(true)).isEqualTo(options); + assertThat(options.persistStatsToDisk()).isEqualTo(true); + } + } + + @Test + public void writeDbidToManifest() { + try (final Options options = new Options()) { + assertThat(options.writeDbidToManifest()).isEqualTo(false); + assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options); + assertThat(options.writeDbidToManifest()).isEqualTo(true); + } + } + + @Test + public void logReadaheadSize() { + try (final Options options = new Options()) { + assertThat(options.logReadaheadSize()).isEqualTo(0); + final int size = 1024 * 1024 * 100; + assertThat(options.setLogReadaheadSize(size)).isEqualTo(options); + assertThat(options.logReadaheadSize()).isEqualTo(size); + } + } + + @Test + public void bestEffortsRecovery() { + try (final Options options = new Options()) { + assertThat(options.bestEffortsRecovery()).isEqualTo(false); + assertThat(options.setBestEffortsRecovery(true)).isEqualTo(options); + assertThat(options.bestEffortsRecovery()).isEqualTo(true); + } + } + + @Test + public void maxBgerrorResumeCount() { + try (final Options options = new Options()) { + final int INT_MAX = 2147483647; + assertThat(options.maxBgerrorResumeCount()).isEqualTo(INT_MAX); + assertThat(options.setMaxBgErrorResumeCount(-1)).isEqualTo(options); + assertThat(options.maxBgerrorResumeCount()).isEqualTo(-1); + } + } + + @Test + public void bgerrorResumeRetryInterval() { + try (final Options options = new Options()) { + 
assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(1000000); + final long newRetryInterval = 24 * 3600 * 1000000L; + assertThat(options.setBgerrorResumeRetryInterval(newRetryInterval)).isEqualTo(options); + assertThat(options.bgerrorResumeRetryInterval()).isEqualTo(newRetryInterval); + } + } + + @Test + public void maxWriteBatchGroupSizeBytes() { + try (final Options options = new Options()) { + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(1024 * 1024); + final long size = 1024 * 1024 * 1024 * 10L; + assertThat(options.setMaxWriteBatchGroupSizeBytes(size)).isEqualTo(options); + assertThat(options.maxWriteBatchGroupSizeBytes()).isEqualTo(size); + } + } + + @Test + public void skipCheckingSstFileSizesOnDbOpen() { + try (final Options options = new Options()) { + assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false); + assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options); + assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); + } + } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final Options options = new Options(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + 
assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -31,115 +31,60 @@ final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath())) { - assertThat("value"). - isEqualTo(new String(db2.get("key".getBytes()))); - } + } + try (final RocksDB db = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath())) { + assertThat("value").isEqualTo(new String(db.get("key".getBytes()))); } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { final List cfDescriptors = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List columnFamilyHandleList = new ArrayList<>(); - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts = new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts = new ColumnFamilyOptions() - ) { - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), newCfOpts))); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2".getBytes(), newCf2Opts))); - db.put(columnFamilyHandleList.get(2), "key2".getBytes(), - 
"value2".getBytes()); - - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts2 = - new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts2 = - new ColumnFamilyOptions() - ) { - assertThat(db2.get("key2".getBytes())).isNull(); - assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), - "key2".getBytes())). - isNull(); - cfDescriptors.clear(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - newCfOpts2)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), - newCf2Opts2)); - - final List readOnlyColumnFamilyHandleList2 - = new ArrayList<>(); - try (final RocksDB db3 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList2)) { - try { - assertThat(new String(db3.get( - readOnlyColumnFamilyHandleList2.get(1), - "key2".getBytes()))).isEqualTo("value2"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList2) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + try (final RocksDB db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpts))); + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes()); + } + + columnFamilyHandleList.clear(); + try (final RocksDB db = 
RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(db.get("key2".getBytes())).isNull(); + assertThat(db.get(columnFamilyHandleList.get(0), "key2".getBytes())).isNull(); + } + + cfDescriptors.clear(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts)); + columnFamilyHandleList.clear(); + try (final RocksDB db = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(new String(db.get(columnFamilyHandleList.get(1), "key2".getBytes()))) + .isEqualTo("value2"); } } } @Test(expected = RocksDBException.class) public void failToWriteInReadOnly() throws RocksDBException { - try (final Options options = new Options() - .setCreateIfMissing(true)) { - - try (final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { - //no-op + try (final Options options = new Options().setCreateIfMissing(true)) { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + // no-op } } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try { - // test that put fails in readonly mode - rDb.put("key".getBytes(), "value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List readOnlyColumnFamilyHandleList = new ArrayList<>(); + try (final 
RocksDB rDb = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, readOnlyColumnFamilyHandleList)) { + // test that put fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); } } } @@ -161,15 +106,7 @@ try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), "value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); } } } @@ -193,14 +130,7 @@ try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.delete("key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.delete("key".getBytes()); } } } @@ -223,15 +153,8 @@ try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { rDb.delete(readOnlyColumnFamilyHandleList.get(0), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -256,15 +179,8 @@ readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { wb.put("key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -289,16 +205,29 @@ readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { 
wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + } + } + } + + @Test(expected = RocksDBException.class) + public void errorIfWalFileExists() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + // no-op + } + + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + + final List readOnlyColumnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions(); + final RocksDB rDb = RocksDB.openReadOnly(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, readOnlyColumnFamilyHandleList, true);) { + // no-op... 
should have raised an error as errorIfWalFileExists=true } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -39,11 +39,15 @@ opt.setFillCache(false); opt.setIterateUpperBound(buildRandomSlice()); opt.setIterateLowerBound(buildRandomSlice()); + opt.setTimestamp(buildRandomSlice()); + opt.setIterStartTs(buildRandomSlice()); try (final ReadOptions other = new ReadOptions(opt)) { assertThat(opt.verifyChecksums()).isEqualTo(other.verifyChecksums()); assertThat(opt.fillCache()).isEqualTo(other.fillCache()); assertThat(Arrays.equals(opt.iterateUpperBound().data(), other.iterateUpperBound().data())).isTrue(); assertThat(Arrays.equals(opt.iterateLowerBound().data(), other.iterateLowerBound().data())).isTrue(); + assertThat(Arrays.equals(opt.timestamp().data(), other.timestamp().data())).isTrue(); + assertThat(Arrays.equals(opt.iterStartTs().data(), other.iterStartTs().data())).isTrue(); } } } @@ -159,6 +163,8 @@ Slice upperBound = buildRandomSlice(); opt.setIterateUpperBound(upperBound); assertThat(Arrays.equals(upperBound.data(), opt.iterateUpperBound().data())).isTrue(); + opt.setIterateUpperBound(null); + assertThat(opt.iterateUpperBound()).isNull(); } } @@ -175,6 +181,8 @@ Slice lowerBound = buildRandomSlice(); opt.setIterateLowerBound(lowerBound); assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue(); + opt.setIterateLowerBound(null); + assertThat(opt.iterateLowerBound()).isNull(); } } @@ -203,6 +211,60 @@ } } + @Test + public void autoPrefixMode() { + try (final ReadOptions opt = new ReadOptions()) { + 
opt.setAutoPrefixMode(true); + assertThat(opt.autoPrefixMode()).isTrue(); + } + } + + @Test + public void timestamp() { + try (final ReadOptions opt = new ReadOptions()) { + Slice timestamp = buildRandomSlice(); + opt.setTimestamp(timestamp); + assertThat(Arrays.equals(timestamp.data(), opt.timestamp().data())).isTrue(); + opt.setTimestamp(null); + assertThat(opt.timestamp()).isNull(); + } + } + + @Test + public void iterStartTs() { + try (final ReadOptions opt = new ReadOptions()) { + Slice itertStartTsSlice = buildRandomSlice(); + opt.setIterStartTs(itertStartTsSlice); + assertThat(Arrays.equals(itertStartTsSlice.data(), opt.iterStartTs().data())).isTrue(); + opt.setIterStartTs(null); + assertThat(opt.iterStartTs()).isNull(); + } + } + + @Test + public void deadline() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setDeadline(1999l); + assertThat(opt.deadline()).isEqualTo(1999l); + } + } + + @Test + public void ioTimeout() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setIoTimeout(34555l); + assertThat(opt.ioTimeout()).isEqualTo(34555l); + } + } + + @Test + public void valueSizeSoftLimit() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setValueSizeSoftLimit(12134324l); + assertThat(opt.valueSizeSoftLimit()).isEqualTo(12134324l); + } + } + @Test public void failSetVerifyChecksumUninitialized() { try (final ReadOptions readOptions = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -1085,6 +1085,57 @@ } @Test + public void continueBackgroundWorkAfterCancelAllBackgroundWork() throws RocksDBException { + final int KEY_SIZE = 20; + final 
int VALUE_SIZE = 300; + try (final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions() + ) { + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts) + ); + + final List columnFamilyHandles = new ArrayList<>(); + // open the database + try (final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles)) { + try { + db.cancelAllBackgroundWork(true); + try { + db.put(new byte[KEY_SIZE], new byte[VALUE_SIZE]); + db.flush(new FlushOptions().setWaitForFlush(true)); + fail("Expected RocksDBException to be thrown if we attempt to trigger a flush after" + + " all background work is cancelled."); + } catch (RocksDBException ignored) { } + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } + } + } + } + + @Test + public void cancelAllBackgroundWorkTwice() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()) + ) { + // Cancel all background work synchronously + db.cancelAllBackgroundWork(true); + // Cancel all background work asynchronously + db.cancelAllBackgroundWork(false); + } + } + + @Test public void pauseContinueBackgroundWork() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, @@ -1170,7 +1221,6 @@ } } - @Ignore("This test crashes. 
Re-enable after fixing.") @Test public void getApproximateSizes() throws RocksDBException { final byte key1[] = "key1".getBytes(UTF_8); @@ -1185,7 +1235,7 @@ final long[] sizes = db.getApproximateSizes( Arrays.asList( - new Range(new Slice(key1), new Slice(key2)), + new Range(new Slice(key1), new Slice(key1)), new Range(new Slice(key2), new Slice(key3)) ), SizeApproximationFlag.INCLUDE_FILES, @@ -1221,6 +1271,26 @@ } } + @Test + public void getApproximateMemTableStatsSingleKey() throws RocksDBException { + final byte key1[] = "key1".getBytes(UTF_8); + final byte key2[] = "key2".getBytes(UTF_8); + final byte key3[] = "key3".getBytes(UTF_8); + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put(key1, key1); + + final RocksDB.CountAndSize stats = + db.getApproximateMemTableStats(new Range(new Slice(key1), new Slice(key3))); + + assertThat(stats).isNotNull(); + assertThat(stats.count).isEqualTo(1); + assertThat(stats.size).isGreaterThan(1); + } + } + } + @Ignore("TODO(AR) re-enable when ready!") @Test public void compactFiles() throws RocksDBException { @@ -1406,11 +1476,11 @@ try (final RocksDB db = RocksDB.open(options, dbPath)) { final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); assertThat(livefiles).isNotNull(); - assertThat(livefiles.manifestFileSize).isEqualTo(13); + assertThat(livefiles.manifestFileSize).isEqualTo(59); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); - assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000001"); - assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000005"); + assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000004"); + assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007"); } } } @@ -1633,6 +1703,13 @@ } } + @Test + public void rocksdbVersion() { + final RocksDB.Version version = 
RocksDB.rocksdbVersion(); + assertThat(version).isNotNull(); + assertThat(version.getMajor()).isGreaterThan(1); + } + private static class InMemoryTraceWriter extends AbstractTraceWriter { private final List writes = new ArrayList<>(); private volatile boolean closed = false; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -147,6 +147,27 @@ assertThat(iterator.isValid()).isTrue(); assertThat(iterator.key()).isEqualTo("key2".getBytes()); } + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + + byte[] lastKey; + do { + lastKey = iterator.key(); + iterator.next(); + } while (iterator.isValid()); + + db.put("key3".getBytes(), "value3".getBytes()); + assertThat(iterator.isValid()).isFalse(); + iterator.refresh(); + iterator.seek(lastKey); + assertThat(iterator.isValid()).isTrue(); + + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key3".getBytes()); + } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SecondaryDBTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class SecondaryDBTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule public TemporaryFolder secondaryDbFolder = new TemporaryFolder(); + + @Test + public void openAsSecondary() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + db.put("key3".getBytes(), "value3".getBytes()); + + // open secondary + try (final Options secondaryOptions = new Options(); + final RocksDB secondaryDb = + RocksDB.openAsSecondary(secondaryOptions, dbFolder.getRoot().getAbsolutePath(), + secondaryDbFolder.getRoot().getAbsolutePath())) { + assertThat(secondaryDb.get("key1".getBytes())).isEqualTo("value1".getBytes()); + assertThat(secondaryDb.get("key2".getBytes())).isEqualTo("value2".getBytes()); + assertThat(secondaryDb.get("key3".getBytes())).isEqualTo("value3".getBytes()); + + // write to primary + db.put("key4".getBytes(), "value4".getBytes()); + db.put("key5".getBytes(), "value5".getBytes()); + db.put("key6".getBytes(), "value6".getBytes()); + + // tell secondary to catch up + secondaryDb.tryCatchUpWithPrimary(); + + db.put("key7".getBytes(), "value7".getBytes()); + + // check secondary + 
assertThat(secondaryDb.get("key4".getBytes())).isEqualTo("value4".getBytes()); + assertThat(secondaryDb.get("key5".getBytes())).isEqualTo("value5".getBytes()); + assertThat(secondaryDb.get("key6".getBytes())).isEqualTo("value6".getBytes()); + + assertThat(secondaryDb.get("key7".getBytes())).isNull(); + } + } + } + + @Test + public void openAsSecondaryColumnFamilies() throws RocksDBException { + try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { + final List cfDescriptors = new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + cfDescriptors.add(new ColumnFamilyDescriptor("cf1".getBytes(), cfOpts)); + + final List cfHandles = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, cfHandles)) { + try { + final ColumnFamilyHandle cf1 = cfHandles.get(1); + + db.put(cf1, "key1".getBytes(), "value1".getBytes()); + db.put(cf1, "key2".getBytes(), "value2".getBytes()); + db.put(cf1, "key3".getBytes(), "value3".getBytes()); + + final List secondaryCfHandles = new ArrayList<>(); + + // open secondary + try (final DBOptions secondaryOptions = new DBOptions(); + final RocksDB secondaryDb = + RocksDB.openAsSecondary(secondaryOptions, dbFolder.getRoot().getAbsolutePath(), + secondaryDbFolder.getRoot().getAbsolutePath(), cfDescriptors, + secondaryCfHandles)) { + try { + final ColumnFamilyHandle secondaryCf1 = secondaryCfHandles.get(1); + + assertThat(secondaryDb.get(secondaryCf1, "key1".getBytes())) + .isEqualTo("value1".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key2".getBytes())) + .isEqualTo("value2".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key3".getBytes())) + .isEqualTo("value3".getBytes()); + + // write to primary + db.put(cf1, "key4".getBytes(), "value4".getBytes()); + db.put(cf1, 
"key5".getBytes(), "value5".getBytes()); + db.put(cf1, "key6".getBytes(), "value6".getBytes()); + + // tell secondary to catch up + secondaryDb.tryCatchUpWithPrimary(); + + db.put(cf1, "key7".getBytes(), "value7".getBytes()); + + // check secondary + assertThat(secondaryDb.get(secondaryCf1, "key4".getBytes())) + .isEqualTo("value4".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key5".getBytes())) + .isEqualTo("value5".getBytes()); + assertThat(secondaryDb.get(secondaryCf1, "key6".getBytes())) + .isEqualTo("value6".getBytes()); + + assertThat(secondaryDb.get(secondaryCf1, "key7".getBytes())).isNull(); + + } finally { + for (final ColumnFamilyHandle secondaryCfHandle : secondaryCfHandles) { + secondaryCfHandle.close(); + } + } + } + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/SstPartitionerTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class SstPartitionerTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void sstFixedPrefix() throws RocksDBException { + try (SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4); + final Options opt = + new Options().setCreateIfMissing(true).setSstPartitionerFactory(factory); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + // writing (long)100 under key + db.put("aaaa1".getBytes(), "A".getBytes()); + db.put("bbbb1".getBytes(), "B".getBytes()); + db.flush(new FlushOptions()); + + db.put("aaaa0".getBytes(), "A2".getBytes()); + db.put("aaaa2".getBytes(), "A2".getBytes()); + db.flush(new FlushOptions()); + + db.compactRange(); + + List metadata = db.getLiveFilesMetaData(); + assertThat(metadata.size()).isEqualTo(2); + } + } + + @Test + public void sstFixedPrefixFamily() throws RocksDBException { + final byte[] cfName = "new_cf".getBytes(UTF_8); + final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName, + new ColumnFamilyOptions().setSstPartitionerFactory( + new SstPartitionerFixedPrefixFactory(4))); + + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor); + + // writing (long)100 under key + db.put(columnFamilyHandle, "aaaa1".getBytes(), "A".getBytes()); + db.put(columnFamilyHandle, "bbbb1".getBytes(), "B".getBytes()); + db.flush(new FlushOptions(), 
columnFamilyHandle); + + db.put(columnFamilyHandle, "aaaa0".getBytes(), "A2".getBytes()); + db.put(columnFamilyHandle, "aaaa2".getBytes(), "A2".getBytes()); + db.flush(new FlushOptions(), columnFamilyHandle); + + db.compactRange(columnFamilyHandle); + + List metadata = db.getLiveFilesMetaData(); + assertThat(metadata.size()).isEqualTo(2); + } + } +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -209,7 +209,7 @@ .isSameAs(Transaction.TransactionState.STARTED); txn.commit(); assertThat(txn.getState()) - .isSameAs(Transaction.TransactionState.COMMITED); + .isSameAs(Transaction.TransactionState.COMMITTED); } try(final Transaction txn = dbContainer.beginTransaction()) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -100,7 +100,7 @@ key.clear(); key.put("box".getBytes("US-ASCII")).flip(); - batch.remove(key); + batch.delete(key); assertThat(key.position()).isEqualTo(3); assertThat(key.limit()).isEqualTo(3); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,9 @@ import static org.assertj.core.api.Assertions.assertThat; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -102,6 +104,95 @@ } @Test + public void readYourOwnWritesCf() throws RocksDBException { + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + + final List columnFamilyHandleList = new ArrayList<>(); + + // Test open database with column family names + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + final ColumnFamilyHandle newCf = columnFamilyHandleList.get(1); + + try { + final byte[] k1 = "key1".getBytes(); + final byte[] v1 = "value1".getBytes(); + final byte[] k2 = "key2".getBytes(); + final byte[] v2 = "value2".getBytes(); + + db.put(newCf, k1, v1); + db.put(newCf, k2, v2); + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator base = db.newIterator(newCf, readOptions); + final RocksIterator it = wbwi.newIteratorWithBase(newCf, base, readOptions)) { + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1); + + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2); + + // put data to the write batch and make sure we can 
read it. + final byte[] k3 = "key3".getBytes(); + final byte[] v3 = "value3".getBytes(); + wbwi.put(newCf, k3, v3); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3); + + // update k2 in the write batch and check the value + final byte[] v2Other = "otherValue2".getBytes(); + wbwi.put(newCf, k2, v2Other); + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2Other); + + // delete k1 and make sure we can read back the write + wbwi.delete(newCf, k1); + it.seek(k1); + assertThat(it.key()).isNotEqualTo(k1); + + // reinsert k1 and make sure we see the new value + final byte[] v1Other = "otherValue1".getBytes(); + wbwi.put(newCf, k1, v1Other); + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1Other); + + // single remove k3 and make sure we can read back the write + wbwi.singleDelete(newCf, k3); + it.seek(k3); + assertThat(it.isValid()).isEqualTo(false); + + // reinsert k3 and make sure we see the new value + final byte[] v3Other = "otherValue3".getBytes(); + wbwi.put(newCf, k3, v3Other); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3Other); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + + @Test public void writeBatchWithIndex() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, @@ -563,4 +654,106 @@ assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes()); } } + + @Test + public void iteratorWithBaseOverwriteTrue() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, 
dbFolder.getRoot().getAbsolutePath())) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + } + + @Test + public void iteratorWithBaseOverwriteFalse() throws 
RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = wbwi.newIteratorWithBase(baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter)) { + assertThat(wbwiIter).isNotNull(); + assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(false); + final RocksIterator baseIter = db.newIterator(); + final ReadOptions readOptions = new ReadOptions(); + final RocksIterator wbwiIter = + wbwi.newIteratorWithBase(columnFamilyHandleList.get(1), baseIter, readOptions)) { + assertThat(wbwiIter).isNotNull(); + 
assertThat(wbwiIter.nativeHandle_).isGreaterThan(0); + wbwiIter.status(); + } + } + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/test/TestableEventListener.java 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb.test; + +import org.rocksdb.AbstractEventListener; + +public class TestableEventListener extends AbstractEventListener { + public TestableEventListener() { + super(); + } + + public TestableEventListener(final EnabledEventCallback... 
enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + public void invokeAllCallbacks() { + invokeAllCallbacks(nativeHandle_); + } + + private static native void invokeAllCallbacks(final long handle); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -263,7 +263,7 @@ for (int i = 0; i < num_iter_ops; i++) { // Random walk and make sure iter and result_iter returns the // same key and value - final int type = rnd.nextInt(7); + final int type = rnd.nextInt(8); iter.status(); switch (type) { case 0: @@ -310,8 +310,15 @@ continue; } break; + case 6: + // Refresh + iter.refresh(); + result_iter.refresh(); + iter.seekToFirst(); + result_iter.seekToFirst(); + break; default: { - assert (type == 6); + assert (type == 7); final int key_idx = rnd.nextInt(source_strings.size()); final String key = source_strings.get(key_idx); final byte[] result = db.get(readOptions, bytes(key)); @@ -473,6 +480,11 @@ } @Override + public void refresh() throws RocksDBException { + offset = -1; + } + + @Override public void status() throws RocksDBException { if(offset < 0 || offset >= entries.size()) { throw new RocksDBException("Index out of bounds. 
Size is: " + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java 2025-05-19 16:14:27.000000000 +0000 @@ -119,6 +119,11 @@ events.add(new Event(Action.MARK_COMMIT, (byte[])null, (byte[])null)); } + @Override + public void markCommitWithTimestamp(final byte[] xid, final byte[] ts) throws RocksDBException { + events.add(new Event(Action.MARK_COMMIT_WITH_TIMESTAMP, (byte[]) null, (byte[]) null)); + } + public static class Event { public final Action action; public final int columnFamilyId; @@ -156,8 +161,10 @@ @Override public int hashCode() { - - return Objects.hash(action, columnFamilyId, key, value); + int result = Objects.hash(action, columnFamilyId); + result = 31 * result + Arrays.hashCode(key); + result = 31 * result + Arrays.hashCode(value); + return result; } } @@ -166,7 +173,18 @@ * event actions */ public enum Action { - PUT, MERGE, DELETE, SINGLE_DELETE, DELETE_RANGE, LOG, PUT_BLOB_INDEX, - MARK_BEGIN_PREPARE, MARK_END_PREPARE, MARK_NOOP, MARK_COMMIT, - MARK_ROLLBACK } + PUT, + MERGE, + DELETE, + SINGLE_DELETE, + DELETE_RANGE, + LOG, + PUT_BLOB_INDEX, + MARK_BEGIN_PREPARE, + MARK_END_PREPARE, + MARK_NOOP, + MARK_COMMIT, + MARK_ROLLBACK, + MARK_COMMIT_WITH_TIMESTAMP + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java 2025-05-19 16:14:27.000000000 +0000 @@ -9,7 +9,6 @@ import org.junit.Test; import java.lang.reflect.Field; -import java.lang.reflect.Modifier; import static org.assertj.core.api.Assertions.assertThat; @@ -37,23 +36,38 @@ isEqualTo(".jnilib"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dylib"); } @Test - public void mac64() { - setEnvironmentClassFields("mac", "64"); + public void mac64_x86_64() { + setEnvironmentClassFields("mac", "x86_64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".jnilib"); - assertThat(Environment.getJniLibraryFileName("rocksdb")). - isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx-x86_64.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx.jnilib"); assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
isEqualTo("librocksdbjni.dylib"); } @Test + public void macAarch64() { + setEnvironmentClassFields("mac", "aarch64"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".jnilib"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx-arm64.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.dylib"); + } + + @Test public void nix32() { // Linux setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); @@ -63,6 +77,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) @@ -93,7 +108,8 @@ assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - Environment.getJniLibraryFileName("rocksdb"); + assertThat(Environment.getJniLibraryFileName("rocksdb")).isEqualTo("blah"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); } @Test @@ -105,6 +121,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) @@ -114,6 +131,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
isEqualTo("librocksdbjni.so"); // UNIX @@ -124,6 +142,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // AIX @@ -133,6 +152,7 @@ isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-aix64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); } @@ -151,6 +171,7 @@ isEqualTo(".dll"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-win64.dll"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dll"); } @@ -167,6 +188,7 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-ppc64le.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); @@ -179,12 +201,13 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-ppc64le-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); } @Test - public void aarch64() { + public void linuxArch64() { setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); 
setEnvironmentClassFields("Linux", "aarch64"); assertThat(Environment.isUnix()).isTrue(); @@ -195,6 +218,7 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-aarch64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); @@ -207,6 +231,7 @@ assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-aarch64-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java 2025-05-19 16:14:27.000000000 +0000 @@ -5,14 +5,14 @@ package org.rocksdb.util; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.nio.ByteBuffer; +import java.util.Random; import org.rocksdb.CompactionPriority; import org.rocksdb.Options; import org.rocksdb.WALRecoveryMode; -import java.util.Random; - -import static java.nio.charset.StandardCharsets.UTF_8; - /** * General test utilities. 
*/ @@ -58,4 +58,15 @@ random.nextBytes(str); return str; } + + /** + * Copy a {@link ByteBuffer} into an array for shorthand ease of test coding + * @param byteBuffer the buffer to copy + * @return a {@link byte[]} containing the same bytes as the input + */ + public static byte[] bufferBytes(final ByteBuffer byteBuffer) { + final byte[] result = new byte[byteBuffer.limit()]; + byteBuffer.get(result); + return result; + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java 2025-05-19 16:14:27.000000000 +0000 @@ -131,4 +131,9 @@ public void markCommit(final byte[] xid) throws RocksDBException { throw new UnsupportedOperationException(); } + + @Override + public void markCommitWithTimestamp(final byte[] xid, final byte[] ts) throws RocksDBException { + throw new UnsupportedOperationException(); + } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/java/understanding_options.md mariadb-10.11.13/storage/rocksdb/rocksdb/java/understanding_options.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/java/understanding_options.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/java/understanding_options.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,79 @@ +# How RocksDB Options and their Java Wrappers Work + +Options in RocksDB come in many different flavours. This is an attempt at a taxonomy and explanation. + +## RocksDB Options + +Initially, I believe, RocksDB had only database options. I don't know if any of these were mutable. Column families came later. Read on to understand the terminology. 
+ +So to begin, one sets up a collection of options and starts/creates a database with these options. That's a useful way to think about it, because from a Java point-of-view (and I didn't realise this initially and got very confused), despite making native calls to C++, the `API`s are just manipulating a native C++ configuration object. This object is just a record of configuration, and it must later be passed to the database (at create or open time) in order to apply the options. + +### Database versus Column Family + +The concept of the *column family* or `CF` is widespread within RocksDB. I think of it as a data namespace, but conveniently transactions can operate across these namespaces. The concept of a default column family exists, and when operations do not refer to a particular `CF`, it refers to the default. + +We raise this w.r.t. options because many options, perhaps most that users encounter, are *column family options*. That is to say they apply individually to a particular column family, or to the default column family. Crucially also, many/most/all of these same options are exposed as *database options* and then apply as the default for column families which do not have the option set explicitly. Obviously some database options are naturally database-wide; they apply to the operation of the database and don't make any sense applied to a column family. + +### Mutability + +There are 2 kinds of options + +- Mutable options +- Immutable options. We name these in contrast to the mutable ones, but they are usually referred to unqualified. + +Mutable options are those which can be changed on a running `RocksDB` instance. Immutable options can only be configured prior to the start of a database. Of course, we can configure the immutable options at this time too; The entirety of options is a strict superset of the mutable options. 
+ +Mutable options (whether column-family specific or database-wide) are manipulated at runtime with builders, so we have `MutableDBOptions.MutableDBOptionsBuilder` and `MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder` which share tooling classes/hierarchy and maintain and manipulate the relevant options as a `(key,value)` map. + +Mutable options are then passed using `setOptions()` and `setDBOptions()` methods on the live RocksDB, and then take effect immediately (depending on the semantics of the option) on the database. + +### Advanced + +There are 2 classes of options + +- Advanced options +- Non-advanced options + +It's not clear to me what the conceptual distinction is between advanced and not. However, the Java code takes care to reflect it from the underlying C++. + +This leads to 2 separate type hierarchies within column family options, one for each `class` of options. The `kind`s are represented by where the options appear in their hierarchy. + +```java +interface ColumnFamilyOptionsInterface> + extends AdvancedColumnFamilyOptionsInterface +interface MutableColumnFamilyOptionsInterface> + extends AdvancedMutableColumnFamilyOptionsInterface +``` + +And then there is ultimately a single concrete implementation class for CF options: + +```java +class ColumnFamilyOptions extends RocksObject + implements ColumnFamilyOptionsInterface, + MutableColumnFamilyOptionsInterface +``` + +as there is a single concrete implementation class for DB options: + +```java +class DBOptions extends RocksObject + implements DBOptionsInterface, + MutableDBOptionsInterface +``` + +Interestingly `DBOptionsInterface` doesn't extend `MutableDBOptionsInterface`, if only in order to disrupt our belief in consistent basic laws of the Universe. 
+ +## Startup/Creation Options + +```java +class Options extends RocksObject + implements DBOptionsInterface, + MutableDBOptionsInterface, + ColumnFamilyOptionsInterface, + MutableColumnFamilyOptionsInterface +``` + +### Example - Blob Options + +The `enable_blob_files` and `min_blob_size` options are per-column-family, and are mutable. The options also appear in the unqualified database options. So by initial configuration, we can set up a RocksDB database where for every `(key,value)` with a value of size at least `min_blob_size`, the value is written (indirected) to a blob file. Blobs may share a blob file, subject to the configuration values set. Later, using the `MutableColumnFamilyOptionsInterface` of the `ColumnFamilyOptions`, we can choose to turn this off (`enable_blob_files=false`) , or alter the `min_blob_size` for the default column family, or any other column family. It seems to me that we cannot, though, mutate the column family options for all column families using the +`setOptions()` mechanism, either for all existing column families or for all future column families; but maybe we can do the latter on a re-`open()/create()' diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,8 +6,12 @@ #include "logging/auto_roll_logger.h" #include + #include "file/filename.h" #include "logging/logging.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -15,7 +19,9 @@ #ifndef ROCKSDB_LITE // -- AutoRollLogger -AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, +AutoRollLogger::AutoRollLogger(const std::shared_ptr& fs, + const 
std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, size_t log_file_time_to_roll, @@ -24,36 +30,38 @@ : Logger(log_level), dbname_(dbname), db_log_dir_(db_log_dir), - env_(env), + fs_(fs), + clock_(clock), status_(Status::OK()), kMaxLogFileSize(log_max_size), kLogFileTimeToRoll(log_file_time_to_roll), kKeepLogFileNum(keep_log_file_num), - cached_now(static_cast(env_->NowMicros() * 1e-6)), + cached_now(static_cast(clock_->NowMicros() * 1e-6)), ctime_(cached_now), cached_now_access_count(0), call_NowMicros_every_N_records_(100), mutex_() { - Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + Status s = fs->GetAbsolutePath(dbname, io_options_, &db_absolute_path_, + &io_context_); if (s.IsNotSupported()) { db_absolute_path_ = dbname; } else { status_ = s; } log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - if (env_->FileExists(log_fname_).ok()) { + if (fs_->FileExists(log_fname_, io_options_, &io_context_).ok()) { RollLogFile(); } GetExistingFiles(); - ResetLogger(); - if (status_.ok()) { + s = ResetLogger(); + if (s.ok() && status_.ok()) { status_ = TrimOldLogFiles(); } } Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); - status_ = env_->NewLogger(log_fname_, &logger_); + status_ = fs_->NewLogger(log_fname_, io_options_, &logger_, &io_context_); TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); if (!status_.ok()) { @@ -67,7 +75,7 @@ "The underlying logger doesn't support GetLogFileSize()"); } if (status_.ok()) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = static_cast(clock_->NowMicros() * 1e-6); ctime_ = cached_now; cached_now_access_count = 0; } @@ -79,14 +87,17 @@ // This function is called when log is rotating. Two rotations // can happen quickly (NowMicro returns same value). To not overwrite // previous log file we increment by one micro second and try again. 
- uint64_t now = env_->NowMicros(); + uint64_t now = clock_->NowMicros(); std::string old_fname; do { old_fname = OldInfoLogFileName( dbname_, now, db_absolute_path_, db_log_dir_); now++; - } while (env_->FileExists(old_fname).ok()); - env_->RenameFile(log_fname_, old_fname); + } while (fs_->FileExists(old_fname, io_options_, &io_context_).ok()); + Status s = fs_->RenameFile(log_fname_, old_fname, io_options_, &io_context_); + if (!s.ok()) { + // What should we do on error? + } old_log_files_.push(old_fname); } @@ -100,7 +111,7 @@ std::string parent_dir; std::vector info_log_files; Status s = - GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + GetInfoLogFiles(fs_, db_log_dir_, dbname_, &parent_dir, &info_log_files); if (status_.ok()) { status_ = s; } @@ -114,7 +125,7 @@ } Status AutoRollLogger::TrimOldLogFiles() { - // Here we directly list info files and delete them through Env. + // Here we directly list info files and delete them through FileSystem. // The deletion isn't going through DB, so there are shortcomes: // 1. the deletion is not rate limited by SstFileManager // 2. there is a chance that an I/O will be issued here @@ -127,7 +138,8 @@ // it's essentially the same thing, and checking empty before accessing // the queue feels safer. while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { - Status s = env_->DeleteFile(old_log_files_.front()); + Status s = + fs_->DeleteFile(old_log_files_.front(), io_options_, &io_context_); // Remove the file from the tracking anyway. It's possible that // DB cleaned up the old log file, or people cleaned it up manually. 
old_log_files_.pop(); @@ -238,7 +250,7 @@ bool AutoRollLogger::LogExpired() { if (cached_now_access_count >= call_NowMicros_every_N_records_) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = static_cast(clock_->NowMicros() * 1e-6); cached_now_access_count = 0; } @@ -257,19 +269,24 @@ Env* env = options.env; std::string db_absolute_path; - env->GetAbsolutePath(dbname, &db_absolute_path); + Status s = env->GetAbsolutePath(dbname, &db_absolute_path); + if (!s.ok()) { + return s; + } std::string fname = InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); - env->CreateDirIfMissing(dbname); // In case it does not exist + const auto& clock = env->GetSystemClock(); + env->CreateDirIfMissing(dbname) + .PermitUncheckedError(); // In case it does not exist // Currently we only support roll by time-to-roll and log size #ifndef ROCKSDB_LITE if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( - env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.keep_log_file_num, - options.info_log_level); - Status s = result->GetStatus(); + env->GetFileSystem(), clock, dbname, options.db_log_dir, + options.max_log_file_size, options.log_file_time_to_roll, + options.keep_log_file_num, options.info_log_level); + s = result->GetStatus(); if (!s.ok()) { delete result; } else { @@ -279,11 +296,19 @@ } #endif // !ROCKSDB_LITE // Open a log file in the same directory as the db - env->RenameFile(fname, - OldInfoLogFileName(dbname, env->NowMicros(), db_absolute_path, - options.db_log_dir)); - auto s = env->NewLogger(fname, logger); - if (logger->get() != nullptr) { + s = env->FileExists(fname); + if (s.ok()) { + s = env->RenameFile( + fname, OldInfoLogFileName(dbname, clock->NowMicros(), db_absolute_path, + options.db_log_dir)); + } else if (s.IsNotFound()) { + // "LOG" is not required to exist since this could be a new DB. 
+ s = Status::OK(); + } + if (s.ok()) { + s = env->NewLogger(fname, logger); + } + if (s.ok() && logger->get() != nullptr) { (*logger)->SetInfoLogLevel(options.info_log_level); } return s; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -18,14 +18,18 @@ #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +class FileSystem; +class SystemClock; #ifndef ROCKSDB_LITE // Rolls the log file by size and/or time class AutoRollLogger : public Logger { public: - AutoRollLogger(Env* env, const std::string& dbname, - const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, size_t keep_log_file_num, + AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, + size_t log_max_size, size_t log_file_time_to_roll, + size_t keep_log_file_num, const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; @@ -69,8 +73,9 @@ virtual ~AutoRollLogger() { if (logger_ && !closed_) { - logger_->Close(); + logger_->Close().PermitUncheckedError(); } + status_.PermitUncheckedError(); } using Logger::GetInfoLogLevel; @@ -133,7 +138,8 @@ std::string dbname_; std::string db_log_dir_; std::string db_absolute_path_; - Env* env_; + std::shared_ptr fs_; + std::shared_ptr clock_; std::shared_ptr logger_; // current status of the logger Status status_; @@ -147,11 +153,13 @@ // Full path is stored here. It consumes signifianctly more memory // than only storing file name. Can optimize if it causes a problem. 
std::queue old_log_files_; - // to avoid frequent env->NowMicros() calls, we cached the current time + // to avoid frequent clock->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; uint64_t cached_now_access_count; uint64_t call_NowMicros_every_N_records_; + IOOptions io_options_; + IODebugContext io_context_; mutable port::Mutex mutex_; }; #endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/auto_roll_logger_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,9 @@ #ifndef ROCKSDB_LITE #include "logging/auto_roll_logger.h" -#include + #include + #include #include #include @@ -17,30 +18,19 @@ #include #include #include + +#include "db/db_test_util.h" +#include "env/emulated_clock.h" #include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" namespace ROCKSDB_NAMESPACE { -namespace { -class NoSleepEnv : public EnvWrapper { - public: - NoSleepEnv(Env* base) : EnvWrapper(base) {} - void SleepForMicroseconds(int micros) override { - fake_time_ += static_cast(micros); - } - - uint64_t NowNanos() override { return fake_time_ * 1000; } - - uint64_t NowMicros() override { return fake_time_; } - - private: - uint64_t fake_time_ = 6666666666; -}; -} // namespace // In this test we only want to Log some simple log message with // no format. 
LogMessage() provides such a simple interface and @@ -71,12 +61,14 @@ std::string deleteCmd = "rm -rf " + kTestDir; #endif ASSERT_TRUE(system(deleteCmd.c_str()) == 0); - Env::Default()->CreateDir(kTestDir); + ASSERT_OK(Env::Default()->CreateDir(kTestDir)); } void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message); - void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, + void RollLogFileByTimeTest(const std::shared_ptr& fs, + const std::shared_ptr& sc, + AutoRollLogger* logger, size_t time, const std::string& log_message); // return list of files under kTestDir that contains "LOG" std::vector GetLogFiles() { @@ -157,21 +149,22 @@ ASSERT_TRUE(message_size == logger->GetLogFileSize()); } -void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, - size_t time, - const std::string& log_message) { +void AutoRollLoggerTest::RollLogFileByTimeTest( + const std::shared_ptr& fs, + const std::shared_ptr& sc, AutoRollLogger* logger, size_t time, + const std::string& log_message) { uint64_t expected_ctime; uint64_t actual_ctime; uint64_t total_log_size; - EXPECT_OK(env->GetFileSize(kLogFile, &total_log_size)); + EXPECT_OK(fs->GetFileSize(kLogFile, IOOptions(), &total_log_size, nullptr)); expected_ctime = logger->TEST_ctime(); logger->SetCallNowMicrosEveryNRecords(0); // -- Write to the log for several times, which is supposed // to be finished before time. for (int i = 0; i < 10; ++i) { - env->SleepForMicroseconds(50000); + sc->SleepForMicroseconds(50000); LogMessage(logger, log_message.c_str()); EXPECT_OK(logger->GetStatus()); // Make sure we always write to the same log file (by @@ -186,7 +179,7 @@ } // -- Make the log file expire - env->SleepForMicroseconds(static_cast(time * 1000000)); + sc->SleepForMicroseconds(static_cast(time * 1000000)); LogMessage(logger, log_message.c_str()); // At this time, the new log file should be created. 
@@ -200,15 +193,16 @@ size_t log_max_size = 1024 * 5; size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, - keep_log_file_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_max_size, 0, keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); } TEST_F(AutoRollLoggerTest, RollLogFileByTime) { - NoSleepEnv nse(Env::Default()); + auto nsc = + std::make_shared(SystemClock::Default(), true); size_t time = 2; size_t log_size = 1024 * 5; @@ -217,10 +211,11 @@ InitTestDb(); // -- Test the existence of file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); + AutoRollLogger logger(default_env->GetFileSystem(), nsc, kTestDir, "", + log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(default_env->GetFileSystem(), nsc, &logger, time, kSampleMessage + ":RollLogFileByTime"); } @@ -255,15 +250,17 @@ size_t log_size = 1024; size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", - log_size, 0, keep_log_file_num); + AutoRollLogger* logger = + new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. 
- logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); + logger = new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } @@ -274,16 +271,17 @@ InitTestDb(); - NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, - keep_log_file_num); + auto nsc = + std::make_shared(SystemClock::Default(), true); + AutoRollLogger logger(FileSystem::Default(), nsc, kTestDir, "", log_max_size, + time, keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); // Test the ability to roll by Time - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(FileSystem::Default(), nsc, &logger, time, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); } @@ -292,7 +290,10 @@ // port TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { DBOptions options; - NoSleepEnv nse(Env::Default()); + auto nsc = + std::make_shared(SystemClock::Default(), true); + std::unique_ptr nse(new CompositeEnvWrapper(Env::Default(), nsc)); + std::shared_ptr logger; // Normal logger @@ -311,14 +312,15 @@ kSampleMessage + ":CreateLoggerFromOptions - size"); // Only roll by Time - options.env = &nse; + options.env = nse.get(); InitTestDb(); options.max_log_file_size = 0; options.log_file_time_to_roll = 2; ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); auto_roll_logger = dynamic_cast(logger.get()); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - time"); // roll by both Time and size @@ -330,7 +332,8 @@ dynamic_cast(logger.get()); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, kSampleMessage + ":CreateLoggerFromOptions - 
both"); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); // Set keep_log_file_num @@ -403,8 +406,8 @@ const size_t kMaxFileSize = 512; { size_t log_num = 8; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); RollNTimesBySize(&logger, log_num, kMaxFileSize); ASSERT_EQ(log_num, GetLogFiles().size()); @@ -412,8 +415,8 @@ // Shrink number of files { size_t log_num = 5; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(log_num, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -423,8 +426,8 @@ // Increase number of files again. { size_t log_num = 7; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(6, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -486,7 +489,8 @@ // an extra-scope to force the AutoRollLogger to flush the log file when it // becomes out of scope. 
{ - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -524,7 +528,8 @@ size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), kTestDir, + "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -567,7 +572,7 @@ const std::string fname = path.substr(path.find_last_of("/") + 1); std::vector children; - Env::Default()->GetChildren(dirname, &children); + EXPECT_OK(Env::Default()->GetChildren(dirname, &children)); // We know that the old log files are named [path] // Return all entities that match the pattern @@ -591,8 +596,9 @@ InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, /*db_log_dir=*/"", LOG_MAX_SIZE, + /*log_file_time_to_roll=*/0, /*keep_log_file_num=*/10); if (test_num == 0) { @@ -666,6 +672,50 @@ ASSERT_NOK(CreateLoggerFromOptions("", options, &logger)); ASSERT_TRUE(!logger); } + +TEST_F(AutoRollLoggerTest, RenameOnlyWhenExists) { + InitTestDb(); + SpecialEnv env(Env::Default()); + Options options; + options.env = &env; + + // Originally no LOG exists. Should not see a rename. + { + std::shared_ptr logger; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_EQ(0, env.rename_count_); + } + + // Now a LOG exists. Create a new one should see a rename. 
+ { + std::shared_ptr logger; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_EQ(1, env.rename_count_); + } +} + +TEST_F(AutoRollLoggerTest, RenameError) { + InitTestDb(); + SpecialEnv env(Env::Default()); + env.rename_error_ = true; + Options options; + options.env = &env; + + // Originally no LOG exists. Should not be impacted by rename error. + { + std::shared_ptr logger; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_TRUE(logger != nullptr); + } + + // Now a LOG exists. Rename error should cause failure. + { + std::shared_ptr logger; + ASSERT_NOK(CreateLoggerFromOptions(kTestDir, options, &logger)); + ASSERT_TRUE(logger == nullptr); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -31,15 +31,16 @@ const std::string& fname, const EnvOptions& options, Env* env, InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), - file_(std::move(writable_file), fname, options, env), - last_flush_micros_(0), env_(env), + clock_(env_->GetSystemClock().get()), + file_(std::move(writable_file), fname, options, clock_), + last_flush_micros_(0), flush_pending_(false) {} ~EnvLogger() { if (!closed_) { closed_ = true; - CloseHelper(); + CloseHelper().PermitUncheckedError(); } } @@ -48,9 +49,9 @@ mutex_.AssertHeld(); if (flush_pending_) { flush_pending_ = false; - file_.Flush(); + file_.Flush().PermitUncheckedError(); } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void Flush() override { @@ -134,9 +135,9 @@ assert(p <= limit); mutex_.Lock(); // We will ignore any error returned by Append(). 
- file_.Append(Slice(base, p - base)); + file_.Append(Slice(base, p - base)).PermitUncheckedError(); flush_pending_ = true; - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = clock_->NowMicros(); if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { FlushLocked(); } @@ -154,11 +155,12 @@ } private: + Env* env_; + SystemClock* clock_; WritableFileWriter file_; mutable port::Mutex mutex_; // Mutex to protect the shared variables below. const static uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; std::atomic flush_pending_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/env_logger_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/env_logger_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,6 @@ // #include "logging/env_logger.h" -#include "env/mock_env.h" #include "test_util/testharness.h" #include "test_util/testutil.h" diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/event_logger.cc mariadb-10.11.13/storage/rocksdb/rocksdb/logging/event_logger.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/event_logger.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/event_logger.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include #include -#include "logging/logging.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/logging.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/logging.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/logging.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/logging.h 2025-05-19 16:14:27.000000000 +0000 @@ -19,9 +19,9 @@ inline const char* RocksLogShorterFileName(const 
char* file) { - // 15 is the length of "logging/logging.h". + // 18 is the length of "logging/logging.h". // If the name of this file changed, please change this number, too. - return file + (sizeof(__FILE__) > 15 ? sizeof(__FILE__) - 15 : 0); + return file + (sizeof(__FILE__) > 18 ? sizeof(__FILE__) - 18 : 0); } // Don't inclide file/line info in HEADER level diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/logging/posix_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/logging/posix_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/logging/posix_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/logging/posix_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ virtual ~PosixLogger() { if (!closed_) { closed_ = true; - PosixCloseHelper(); + PosixCloseHelper().PermitUncheckedError(); } } virtual void Flush() override { @@ -108,15 +108,9 @@ const time_t seconds = now_tv.tv_sec; struct tm t; localtime_r(&seconds, &t); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec, - static_cast(now_tv.tv_usec), + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llu ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), static_cast(thread_id)); // Print the message diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,11 +12,13 @@ #include #endif #include + #include "logging/logging.h" #include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -160,7 +162,7 @@ #ifdef 
MAP_HUGETLB if (huge_page_size > 0 && bytes > 0) { - // Allocate from a huge page TBL table. + // Allocate from a huge page TLB table. assert(logger != nullptr); // logger need to be passed in. size_t reserved_size = ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; @@ -170,7 +172,7 @@ if (addr == nullptr) { ROCKS_LOG_WARN(logger, "AllocateAligned fail to allocate huge TLB pages: %s", - strerror(errno)); + errnoStr(errno).c_str()); // fail back to malloc } else { return addr; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/arena.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/arena.h 2025-05-19 16:14:27.000000000 +0000 @@ -86,7 +86,7 @@ // Number of bytes allocated in one block const size_t kBlockSize; // Array of new[] allocated memory blocks - typedef std::vector Blocks; + using Blocks = std::vector; Blocks blocks_; struct MmapInfo { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/concurrent_arena.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/concurrent_arena.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/concurrent_arena.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/concurrent_arena.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include #include "memory/allocator.h" #include "memory/arena.h" +#include "port/lang.h" #include "port/likely.h" #include "util/core_local.h" #include "util/mutexlock.h" @@ -49,7 +50,7 @@ char* Allocate(size_t bytes) override { return AllocateImpl(bytes, false /*force_arena*/, - [=]() { return arena_.Allocate(bytes); }); + [this, bytes]() { return arena_.Allocate(bytes); }); } char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, @@ -58,9 +59,11 @@ assert(rounded_up >= bytes && rounded_up < bytes + sizeof(void*) && (rounded_up % sizeof(void*)) == 0); - return AllocateImpl(rounded_up, 
huge_page_size != 0 /*force_arena*/, [=]() { - return arena_.AllocateAligned(rounded_up, huge_page_size, logger); - }); + return AllocateImpl(rounded_up, huge_page_size != 0 /*force_arena*/, + [this, rounded_up, huge_page_size, logger]() { + return arena_.AllocateAligned(rounded_up, + huge_page_size, logger); + }); } size_t ApproximateMemoryUsage() const { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,22 +10,175 @@ #include "port/likely.h" #include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - std::atomic JemallocNodumpAllocator::original_alloc_{nullptr}; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +static std::unordered_map jemalloc_type_info = { +#ifndef ROCKSDB_LITE + {"limit_tcache_size", + {offsetof(struct JemallocAllocatorOptions, limit_tcache_size), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_lower_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_lower_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tcache_size_upper_bound", + {offsetof(struct JemallocAllocatorOptions, tcache_size_upper_bound), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +bool JemallocNodumpAllocator::IsSupported(std::string* why) { +#ifndef ROCKSDB_JEMALLOC + *why = "Not compiled with ROCKSDB_JEMALLOC"; + 
return false; +#else + static const std::string unsupported = + "JemallocNodumpAllocator only available with jemalloc version >= 5 " + "and MADV_DONTDUMP is available."; + if (!HasJemalloc()) { + *why = unsupported; + return false; + } +#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + *why = unsupported; + return false; +#else + return true; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +#endif // ROCKSDB_MALLOC +} JemallocNodumpAllocator::JemallocNodumpAllocator( - JemallocAllocatorOptions& options, - std::unique_ptr&& arena_hooks, unsigned arena_index) + JemallocAllocatorOptions& options) : options_(options), - arena_hooks_(std::move(arena_hooks)), - arena_index_(arena_index), - tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache) {} +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + tcache_(&JemallocNodumpAllocator::DestroyThreadSpecificCache), +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + arena_index_(0) { + RegisterOptions(&options_, &jemalloc_type_info); +} + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +JemallocNodumpAllocator::~JemallocNodumpAllocator() { + // Destroy tcache before destroying arena. + autovector tcache_list; + tcache_.Scrape(&tcache_list, nullptr); + for (void* tcache_index : tcache_list) { + DestroyThreadSpecificCache(tcache_index); + } + if (arena_index_ > 0) { + // Destroy arena. Silently ignore error. + Status s = DestroyArena(arena_index_); + assert(s.ok()); + s.PermitUncheckedError(); + } +} + +size_t JemallocNodumpAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return malloc_usable_size(static_cast(p)); +} + +void* JemallocNodumpAllocator::Allocate(size_t size) { + int tcache_flag = GetThreadSpecificCache(size); + return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); +} + +void JemallocNodumpAllocator::Deallocate(void* p) { + // Obtain tcache. 
+ size_t size = 0; + if (options_.limit_tcache_size) { + size = malloc_usable_size(p); + } + int tcache_flag = GetThreadSpecificCache(size); + // No need to pass arena index to dallocx(). Jemalloc will find arena index + // from its own metadata. + dallocx(p, tcache_flag); +} + +Status JemallocNodumpAllocator::InitializeArenas() { + // Create arena. + size_t arena_index_size = sizeof(arena_index_); + int ret = + mallctl("arenas.create", &arena_index_, &arena_index_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to create jemalloc arena, error code: " + + ROCKSDB_NAMESPACE::ToString(ret)); + } + assert(arena_index_ != 0); + + // Read existing hooks. + std::string key = + "arena." + ROCKSDB_NAMESPACE::ToString(arena_index_) + ".extent_hooks"; + extent_hooks_t* hooks; + size_t hooks_size = sizeof(hooks); + ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); + if (ret != 0) { + return Status::Incomplete("Failed to read existing hooks, error code: " + + ROCKSDB_NAMESPACE::ToString(ret)); + } + + // Store existing alloc. + extent_alloc_t* original_alloc = hooks->alloc; + extent_alloc_t* expected = nullptr; + bool success = + JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( + expected, original_alloc); + if (!success && original_alloc != expected) { + return Status::Incomplete("Original alloc conflict."); + } + // Set the custom hook. 
+ arena_hooks_.reset(new extent_hooks_t(*hooks)); + arena_hooks_->alloc = &JemallocNodumpAllocator::Alloc; + extent_hooks_t* hooks_ptr = arena_hooks_.get(); + ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); + if (ret != 0) { + return Status::Incomplete("Failed to set custom hook, error code: " + + ROCKSDB_NAMESPACE::ToString(ret)); + } + return Status::OK(); +} + +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + +Status JemallocNodumpAllocator::PrepareOptions( + const ConfigOptions& config_options) { + std::string message; + + if (!IsSupported(&message)) { + return Status::NotSupported(message); + } else if (options_.limit_tcache_size && + options_.tcache_size_lower_bound >= + options_.tcache_size_upper_bound) { + return Status::InvalidArgument( + "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); + } else if (IsMutable()) { + Status s = MemoryAllocator::PrepareOptions(config_options); +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + if (s.ok()) { + s = InitializeArenas(); + } +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + return s; + } else { + // Already prepared + return Status::OK(); + } +} + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR int JemallocNodumpAllocator::GetThreadSpecificCache(size_t size) { // We always enable tcache. The only corner case is when there are a ton of // threads accessing with low frequency, then it could consume a lot of @@ -50,24 +203,6 @@ } return MALLOCX_TCACHE(*tcache_index); } - -void* JemallocNodumpAllocator::Allocate(size_t size) { - int tcache_flag = GetThreadSpecificCache(size); - return mallocx(size, MALLOCX_ARENA(arena_index_) | tcache_flag); -} - -void JemallocNodumpAllocator::Deallocate(void* p) { - // Obtain tcache. - size_t size = 0; - if (options_.limit_tcache_size) { - size = malloc_usable_size(p); - } - int tcache_flag = GetThreadSpecificCache(size); - // No need to pass arena index to dallocx(). Jemalloc will find arena index - // from its own metadata. 
- dallocx(p, tcache_flag); -} - void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr, size_t size, size_t alignment, bool* zero, bool* commit, unsigned arena_ind) { @@ -91,11 +226,12 @@ Status JemallocNodumpAllocator::DestroyArena(unsigned arena_index) { assert(arena_index != 0); - std::string key = "arena." + ToString(arena_index) + ".destroy"; + std::string key = + "arena." + ROCKSDB_NAMESPACE::ToString(arena_index) + ".destroy"; int ret = mallctl(key.c_str(), nullptr, 0, nullptr, 0); if (ret != 0) { return Status::Incomplete("Failed to destroy jemalloc arena, error code: " + - ToString(ret)); + ROCKSDB_NAMESPACE::ToString(ret)); } return Status::OK(); } @@ -111,96 +247,25 @@ delete tcache_index; } -JemallocNodumpAllocator::~JemallocNodumpAllocator() { - // Destroy tcache before destroying arena. - autovector tcache_list; - tcache_.Scrape(&tcache_list, nullptr); - for (void* tcache_index : tcache_list) { - DestroyThreadSpecificCache(tcache_index); - } - // Destroy arena. Silently ignore error. 
- Status s __attribute__((__unused__)) = DestroyArena(arena_index_); - assert(s.ok()); -} - -size_t JemallocNodumpAllocator::UsableSize(void* p, - size_t /*allocation_size*/) const { - return malloc_usable_size(static_cast(p)); -} #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { - *memory_allocator = nullptr; - Status unsupported = Status::NotSupported( - "JemallocNodumpAllocator only available with jemalloc version >= 5 " - "and MADV_DONTDUMP is available."); -#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - (void)options; - return unsupported; -#else - if (!HasJemalloc()) { - return unsupported; - } if (memory_allocator == nullptr) { return Status::InvalidArgument("memory_allocator must be non-null."); } - if (options.limit_tcache_size && - options.tcache_size_lower_bound >= options.tcache_size_upper_bound) { - return Status::InvalidArgument( - "tcache_size_lower_bound larger or equal to tcache_size_upper_bound."); - } - - // Create arena. - unsigned arena_index = 0; - size_t arena_index_size = sizeof(arena_index); - int ret = - mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0); - if (ret != 0) { - return Status::Incomplete("Failed to create jemalloc arena, error code: " + - ToString(ret)); - } - assert(arena_index != 0); - - // Read existing hooks. - std::string key = "arena." + ToString(arena_index) + ".extent_hooks"; - extent_hooks_t* hooks; - size_t hooks_size = sizeof(hooks); - ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0); - if (ret != 0) { - JemallocNodumpAllocator::DestroyArena(arena_index); - return Status::Incomplete("Failed to read existing hooks, error code: " + - ToString(ret)); - } - - // Store existing alloc. 
- extent_alloc_t* original_alloc = hooks->alloc; - extent_alloc_t* expected = nullptr; - bool success = - JemallocNodumpAllocator::original_alloc_.compare_exchange_strong( - expected, original_alloc); - if (!success && original_alloc != expected) { - JemallocNodumpAllocator::DestroyArena(arena_index); - return Status::Incomplete("Original alloc conflict."); - } - - // Set the custom hook. - std::unique_ptr new_hooks(new extent_hooks_t(*hooks)); - new_hooks->alloc = &JemallocNodumpAllocator::Alloc; - extent_hooks_t* hooks_ptr = new_hooks.get(); - ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr)); - if (ret != 0) { - JemallocNodumpAllocator::DestroyArena(arena_index); - return Status::Incomplete("Failed to set custom hook, error code: " + - ToString(ret)); +#ifndef ROCKSDB_JEMALLOC + (void)options; + return Status::NotSupported("Not compiled with JEMALLOC"); +#else + std::unique_ptr allocator( + new JemallocNodumpAllocator(options)); + Status s = allocator->PrepareOptions(ConfigOptions()); + if (s.ok()) { + memory_allocator->reset(allocator.release()); } - - // Create cache allocator. 
- memory_allocator->reset( - new JemallocNodumpAllocator(options, std::move(new_hooks), arena_index)); - return Status::OK(); -#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + return s; +#endif } - } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/jemalloc_nodump_allocator.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include "port/port.h" #include "rocksdb/memory_allocator.h" #include "util/thread_local.h" +#include "utilities/memory_allocators.h" #if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX) @@ -19,22 +20,38 @@ #if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP) #define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR +#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP +#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX namespace ROCKSDB_NAMESPACE { - -class JemallocNodumpAllocator : public MemoryAllocator { +class JemallocNodumpAllocator : public BaseMemoryAllocator { public: - JemallocNodumpAllocator(JemallocAllocatorOptions& options, - std::unique_ptr&& arena_hooks, - unsigned arena_index); + explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options); +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR ~JemallocNodumpAllocator(); +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + + static const char* kClassName() { return "JemallocNodumpAllocator"; } + const char* Name() const override { return kClassName(); } + static bool IsSupported() { + std::string unused; + return IsSupported(&unused); + } + static bool IsSupported(std::string* why); + bool IsMutable() const { return arena_index_ == 0; } - const char* Name() const override { return "JemallocNodumpAllocator"; } + Status PrepareOptions(const ConfigOptions& config_options) 
override; + +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR void* Allocate(size_t size) override; void Deallocate(void* p) override; size_t UsableSize(void* p, size_t allocation_size) const override; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR private: +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + Status InitializeArenas(); + friend Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); @@ -53,7 +70,10 @@ // Get or create tcache. Return flag suitable to use with `mallocx`: // either MALLOCX_TCACHE_NONE or MALLOCX_TCACHE(tc). int GetThreadSpecificCache(size_t size); +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + JemallocAllocatorOptions options_; +#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR // A function pointer to jemalloc default alloc. Use atomic to make sure // NewJemallocNodumpAllocator is thread-safe. // @@ -61,18 +81,14 @@ // alloc needs to be static to pass to jemalloc as function pointer. static std::atomic original_alloc_; - const JemallocAllocatorOptions options_; - // Custom hooks has to outlive corresponding arena. - const std::unique_ptr arena_hooks_; - - // Arena index. - const unsigned arena_index_; + std::unique_ptr arena_hooks_; // Hold thread-local tcache index. ThreadLocalPtr tcache_; -}; +#endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + // Arena index. + unsigned arena_index_; +}; } // namespace ROCKSDB_NAMESPACE -#endif // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP -#endif // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef MEMKIND +#include +#endif // MEMKIND + +#include "memory/memkind_kmem_allocator.h" + +namespace ROCKSDB_NAMESPACE { +Status MemkindKmemAllocator::PrepareOptions(const ConfigOptions& options) { + std::string message; + if (!IsSupported(&message)) { + return Status::NotSupported(message); + } else { + return MemoryAllocator::PrepareOptions(options); + } +} + +#ifdef MEMKIND +void* MemkindKmemAllocator::Allocate(size_t size) { + void* p = memkind_malloc(MEMKIND_DAX_KMEM, size); + if (p == NULL) { + throw std::bad_alloc(); + } + return p; +} + +void MemkindKmemAllocator::Deallocate(void* p) { + memkind_free(MEMKIND_DAX_KMEM, p); +} + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +size_t MemkindKmemAllocator::UsableSize(void* p, + size_t /*allocation_size*/) const { + return memkind_malloc_usable_size(MEMKIND_DAX_KMEM, p); +} +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#endif // MEMKIND + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memkind_kmem_allocator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "rocksdb/memory_allocator.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { + +class MemkindKmemAllocator : public BaseMemoryAllocator { + public: + static const char* kClassName() { return "MemkindKmemAllocator"; } + const char* Name() const override { return kClassName(); } + static bool IsSupported() { + std::string unused; + return IsSupported(&unused); + } + + static bool IsSupported(std::string* msg) { +#ifdef MEMKIND + (void)msg; + return true; +#else + *msg = "Not compiled with MemKind"; + return false; +#endif + } + Status PrepareOptions(const ConfigOptions& options) override; + +#ifdef MEMKIND + void* Allocate(size_t size) override; + void Deallocate(void* p) override; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + size_t UsableSize(void* p, size_t /*allocation_size*/) const override; +#endif +#endif // MEMKIND +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,91 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/memory_allocator.h" + +#include "memory/jemalloc_nodump_allocator.h" +#include "memory/memkind_kmem_allocator.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static std::unordered_map ma_wrapper_type_info = { +#ifndef ROCKSDB_LITE + {"target", OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinAllocators(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + DefaultMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new DefaultMemoryAllocator()); + return guard->get(); + }); + library.AddFactory( + CountedMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new CountedMemoryAllocator( + std::make_shared())); + return guard->get(); + }); + library.AddFactory( + JemallocNodumpAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + if (JemallocNodumpAllocator::IsSupported(errmsg)) { + JemallocAllocatorOptions options; + guard->reset(new JemallocNodumpAllocator(options)); + } + return guard->get(); + }); + library.AddFactory( + MemkindKmemAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* errmsg) { + if (MemkindKmemAllocator::IsSupported(errmsg)) { + guard->reset(new MemkindKmemAllocator()); + } + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE +} // namespace + +MemoryAllocatorWrapper::MemoryAllocatorWrapper( + const std::shared_ptr& t) + : target_(t) { + RegisterOptions("", 
&target_, &ma_wrapper_type_info); +} + +Status MemoryAllocator::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinAllocators(*(ObjectLibrary::Default().get()), ""); + }); +#else + if (value == DefaultMemoryAllocator::kClassName()) { + result->reset(new DefaultMemoryAllocator()); + return Status::OK(); + } +#endif // ROCKSDB_LITE + ConfigOptions copy = options; + copy.invoke_prepare_options = true; + return LoadManagedObject(copy, value, result); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_allocator_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,243 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2019 Intel Corporation +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "memory/jemalloc_nodump_allocator.h" +#include "memory/memkind_kmem_allocator.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "table/block_based/block_based_table_factory.h" +#include "test_util/testharness.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO: the tests do not work in LITE mode due to relying on +// `CreateFromString()` to create non-default memory allocators. 
+#ifndef ROCKSDB_LITE + +class MemoryAllocatorTest + : public testing::Test, + public ::testing::WithParamInterface> { + public: + MemoryAllocatorTest() { + std::tie(id_, supported_) = GetParam(); + Status s = + MemoryAllocator::CreateFromString(ConfigOptions(), id_, &allocator_); + if (supported_) { + EXPECT_OK(s); + } else if (!s.ok()) { + EXPECT_TRUE(s.IsNotSupported()); + } + } + bool IsSupported() { return supported_; } + + std::shared_ptr allocator_; + std::string id_; + + private: + bool supported_; +}; + +TEST_P(MemoryAllocatorTest, Allocate) { + if (!IsSupported()) { + return; + } + void* p = allocator_->Allocate(1024); + ASSERT_NE(p, nullptr); + size_t size = allocator_->UsableSize(p, 1024); + ASSERT_GE(size, 1024); + allocator_->Deallocate(p); +} + +TEST_P(MemoryAllocatorTest, CreateAllocator) { + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.ignore_unsupported_options = false; + std::shared_ptr orig, copy; + Status s = MemoryAllocator::CreateFromString(config_options, id_, &orig); + if (!IsSupported()) { + ASSERT_TRUE(s.IsNotSupported()); + } else { + ASSERT_OK(s); + ASSERT_NE(orig, nullptr); +#ifndef ROCKSDB_LITE + std::string str = orig->ToString(config_options); + ASSERT_OK(MemoryAllocator::CreateFromString(config_options, str, ©)); + ASSERT_EQ(orig, copy); +#endif // ROCKSDB_LITE + } +} + +TEST_P(MemoryAllocatorTest, DatabaseBlockCache) { + if (!IsSupported()) { + // Check if a memory node is available for allocation + } + + // Create database with block cache using the MemoryAllocator + Options options; + std::string dbname = test::PerThreadDBPath("allocator_test"); + ASSERT_OK(DestroyDB(dbname, options)); + + options.create_if_missing = true; + BlockBasedTableOptions table_options; + auto cache = NewLRUCache(1024 * 1024, 6, false, false, allocator_); + table_options.block_cache = cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DB* db = nullptr; + Status s = 
DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + ASSERT_LE(cache->GetUsage(), 104); // Cache will contain stats + + // Write 2kB (200 values, each 10 bytes) + int num_keys = 200; + WriteOptions wo; + std::string val = "0123456789"; + for (int i = 0; i < num_keys; i++) { + std::string key = std::to_string(i); + s = db->Put(wo, Slice(key), Slice(val)); + ASSERT_OK(s); + } + ASSERT_OK(db->Flush(FlushOptions())); // Flush all data from memtable so that + // reads are from block cache + + // Read and check block cache usage + ReadOptions ro; + std::string result; + for (int i = 0; i < num_keys; i++) { + std::string key = std::to_string(i); + s = db->Get(ro, key, &result); + ASSERT_OK(s); + ASSERT_EQ(result, val); + } + ASSERT_GT(cache->GetUsage(), 2000); + + // Close database + s = db->Close(); + ASSERT_OK(s); + delete db; + ASSERT_OK(DestroyDB(dbname, options)); +} + +class CreateMemoryAllocatorTest : public testing::Test { + public: + CreateMemoryAllocatorTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = false; + } + ConfigOptions config_options_; +}; + +TEST_F(CreateMemoryAllocatorTest, JemallocOptionsTest) { + std::shared_ptr allocator; + std::string id = std::string("id=") + JemallocNodumpAllocator::kClassName(); + Status s = MemoryAllocator::CreateFromString(config_options_, id, &allocator); + if (!JemallocNodumpAllocator::IsSupported()) { + ASSERT_TRUE(s.IsNotSupported()); + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + return; + } + ASSERT_OK(s); + ASSERT_NE(allocator, nullptr); + JemallocAllocatorOptions jopts; + auto opts = allocator->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + + ASSERT_NOK(MemoryAllocator::CreateFromString( + config_options_, + id + "; 
limit_tcache_size=true; tcache_size_lower_bound=4096; " + "tcache_size_upper_bound=1024", + &allocator)); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, + id + "; limit_tcache_size=false; tcache_size_lower_bound=4096; " + "tcache_size_upper_bound=1024", + &allocator)); + opts = allocator->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, false); + ASSERT_EQ(opts->tcache_size_lower_bound, 4096U); + ASSERT_EQ(opts->tcache_size_upper_bound, 1024U); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, + id + "; limit_tcache_size=true; tcache_size_upper_bound=4096; " + "tcache_size_lower_bound=1024", + &allocator)); + opts = allocator->GetOptions(); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->limit_tcache_size, true); + ASSERT_EQ(opts->tcache_size_lower_bound, 1024U); + ASSERT_EQ(opts->tcache_size_upper_bound, 4096U); +} + +TEST_F(CreateMemoryAllocatorTest, NewJemallocNodumpAllocator) { + JemallocAllocatorOptions jopts; + std::shared_ptr allocator; + + jopts.limit_tcache_size = true; + jopts.tcache_size_lower_bound = 2 * 1024; + jopts.tcache_size_upper_bound = 1024; + + ASSERT_NOK(NewJemallocNodumpAllocator(jopts, nullptr)); + Status s = NewJemallocNodumpAllocator(jopts, &allocator); + std::string msg; + if (!JemallocNodumpAllocator::IsSupported(&msg)) { + ASSERT_TRUE(s.IsNotSupported()); + ROCKSDB_GTEST_BYPASS("JEMALLOC not supported"); + return; + } + ASSERT_NOK(s); // Invalid options + ASSERT_EQ(allocator, nullptr); + + jopts.tcache_size_upper_bound = 4 * 1024; + ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator)); + ASSERT_NE(allocator, nullptr); + auto opts = allocator->GetOptions(); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); + + jopts.limit_tcache_size = false; + ASSERT_OK(NewJemallocNodumpAllocator(jopts, &allocator)); + 
ASSERT_NE(allocator, nullptr); + opts = allocator->GetOptions(); + ASSERT_EQ(opts->tcache_size_upper_bound, jopts.tcache_size_upper_bound); + ASSERT_EQ(opts->tcache_size_lower_bound, jopts.tcache_size_lower_bound); + ASSERT_EQ(opts->limit_tcache_size, jopts.limit_tcache_size); +} + +INSTANTIATE_TEST_CASE_P(DefaultMemoryAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple( + DefaultMemoryAllocator::kClassName(), true))); +#ifdef MEMKIND +INSTANTIATE_TEST_CASE_P( + MemkindkMemAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple(MemkindKmemAllocator::kClassName(), + MemkindKmemAllocator::IsSupported()))); +#endif // MEMKIND + +#ifdef ROCKSDB_JEMALLOC +INSTANTIATE_TEST_CASE_P( + JemallocNodumpAllocator, MemoryAllocatorTest, + ::testing::Values(std::make_tuple(JemallocNodumpAllocator::kClassName(), + JemallocNodumpAllocator::IsSupported()))); +#endif // ROCKSDB_JEMALLOC + +#endif // ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_usage.h mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_usage.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memory/memory_usage.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memory/memory_usage.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,11 @@ #pragma once +#include #include +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // Helper methods to estimate memroy usage by std containers. @@ -14,7 +17,7 @@ template size_t ApproximateMemoryUsage( const std::unordered_map& umap) { - typedef std::unordered_map Map; + using Map = std::unordered_map; return sizeof(umap) + // Size of all items plus a next pointer for each item. 
(sizeof(typename Map::value_type) + sizeof(void*)) * umap.size() + diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,10 @@ // #ifndef ROCKSDB_LITE -#include "memtable/hash_linklist_rep.h" #include #include + #include "db/memtable.h" #include "memory/arena.h" #include "memtable/skiplist.h" @@ -17,14 +17,15 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/utilities/options_type.h" #include "util/hash.h" namespace ROCKSDB_NAMESPACE { namespace { -typedef const char* Key; -typedef SkipList MemtableSkipList; -typedef std::atomic Pointer; +using Key = const char*; +using MemtableSkipList = SkipList; +using Pointer = std::atomic; // A data structure used as the header of a link list of a hash bucket. 
struct BucketHeader { @@ -218,7 +219,7 @@ } size_t GetHash(const Slice& slice) const { - return fastrange64(GetSliceNPHash64(slice), bucket_size_); + return GetSliceRangedNPHash(slice, bucket_size_); } Pointer* GetBucket(size_t i) const { @@ -820,15 +821,77 @@ return x; } -} // anon namespace +struct HashLinkListRepOptions { + static const char* kName() { return "HashLinkListRepFactoryOptions"; } + size_t bucket_count; + uint32_t threshold_use_skiplist; + size_t huge_page_tlb_size; + int bucket_entries_logging_threshold; + bool if_log_bucket_dist_when_flash; +}; + +static std::unordered_map hash_linklist_info = { + {"bucket_count", + {offsetof(struct HashLinkListRepOptions, bucket_count), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"threshold", + {offsetof(struct HashLinkListRepOptions, threshold_use_skiplist), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"huge_page_size", + {offsetof(struct HashLinkListRepOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"logging_threshold", + {offsetof(struct HashLinkListRepOptions, bucket_entries_logging_threshold), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_when_flash", + {offsetof(struct HashLinkListRepOptions, if_log_bucket_dist_when_flash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +class HashLinkListRepFactory : public MemTableRepFactory { + public: + explicit HashLinkListRepFactory(size_t bucket_count, + uint32_t threshold_use_skiplist, + size_t huge_page_tlb_size, + int bucket_entries_logging_threshold, + bool if_log_bucket_dist_when_flash) { + options_.bucket_count = bucket_count; + options_.threshold_use_skiplist = threshold_use_skiplist; + options_.huge_page_tlb_size = huge_page_tlb_size; + options_.bucket_entries_logging_threshold = + bucket_entries_logging_threshold; + 
options_.if_log_bucket_dist_when_flash = if_log_bucket_dist_when_flash; + RegisterOptions(&options_, &hash_linklist_info); + } + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* logger) override; + + static const char* kClassName() { return "HashLinkListRepFactory"; } + static const char* kNickName() { return "hash_linkedlist"; } + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + + private: + HashLinkListRepOptions options_; +}; + +} // namespace MemTableRep* HashLinkListRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform* transform, Logger* logger) { - return new HashLinkListRep(compare, allocator, transform, bucket_count_, - threshold_use_skiplist_, huge_page_tlb_size_, - logger, bucket_entries_logging_threshold_, - if_log_bucket_dist_when_flash_); + return new HashLinkListRep( + compare, allocator, transform, options_.bucket_count, + options_.threshold_use_skiplist, options_.huge_page_tlb_size, logger, + options_.bucket_entries_logging_threshold, + options_.if_log_bucket_dist_when_flash); } MemTableRepFactory* NewHashLinkListRepFactory( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_linklist_rep.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,49 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#ifndef ROCKSDB_LITE -#include "rocksdb/slice_transform.h" -#include "rocksdb/memtablerep.h" - -namespace ROCKSDB_NAMESPACE { - -class HashLinkListRepFactory : public MemTableRepFactory { - public: - explicit HashLinkListRepFactory(size_t bucket_count, - uint32_t threshold_use_skiplist, - size_t huge_page_tlb_size, - int bucket_entries_logging_threshold, - bool if_log_bucket_dist_when_flash) - : bucket_count_(bucket_count), - threshold_use_skiplist_(threshold_use_skiplist), - huge_page_tlb_size_(huge_page_tlb_size), - bucket_entries_logging_threshold_(bucket_entries_logging_threshold), - if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {} - - virtual ~HashLinkListRepFactory() {} - - using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override; - - virtual const char* Name() const override { - return "HashLinkListRepFactory"; - } - - private: - const size_t bucket_count_; - const uint32_t threshold_use_skiplist_; - const size_t huge_page_tlb_size_; - int bucket_entries_logging_threshold_; - bool if_log_bucket_dist_when_flash_; -}; - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,6 @@ // #ifndef ROCKSDB_LITE -#include "memtable/hash_skiplist_rep.h" - #include #include "db/memtable.h" @@ -16,6 +14,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/utilities/options_type.h" #include "util/murmurhash.h" namespace ROCKSDB_NAMESPACE { @@ -46,7 +45,7 @@ private: friend class DynamicIterator; - typedef SkipList Bucket; + using Bucket = SkipList; size_t bucket_size_; @@ -329,13 +328,60 @@ } } -} // anon namespace +struct HashSkipListRepOptions { + static const char* kName() { return "HashSkipListRepFactoryOptions"; } + size_t bucket_count; + int32_t skiplist_height; + int32_t skiplist_branching_factor; +}; + +static std::unordered_map hash_skiplist_info = { + {"bucket_count", + {offsetof(struct HashSkipListRepOptions, bucket_count), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"skiplist_height", + {offsetof(struct HashSkipListRepOptions, skiplist_height), + OptionType::kInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"branching_factor", + {offsetof(struct HashSkipListRepOptions, skiplist_branching_factor), + OptionType::kInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +class HashSkipListRepFactory : public MemTableRepFactory { + public: + explicit HashSkipListRepFactory(size_t bucket_count, int32_t skiplist_height, + int32_t skiplist_branching_factor) { + options_.bucket_count = bucket_count; + options_.skiplist_height = skiplist_height; + options_.skiplist_branching_factor = skiplist_branching_factor; + RegisterOptions(&options_, &hash_skiplist_info); + } + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* logger) override; + + 
static const char* kClassName() { return "HashSkipListRepFactory"; } + static const char* kNickName() { return "prefix_hash"; } + + virtual const char* Name() const override { return kClassName(); } + virtual const char* NickName() const override { return kNickName(); } + + private: + HashSkipListRepOptions options_; +}; + +} // namespace MemTableRep* HashSkipListRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform* transform, Logger* /*logger*/) { - return new HashSkipListRep(compare, allocator, transform, bucket_count_, - skiplist_height_, skiplist_branching_factor_); + return new HashSkipListRep(compare, allocator, transform, + options_.bucket_count, options_.skiplist_height, + options_.skiplist_branching_factor); } MemTableRepFactory* NewHashSkipListRepFactory( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/hash_skiplist_rep.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#pragma once -#ifndef ROCKSDB_LITE -#include "rocksdb/slice_transform.h" -#include "rocksdb/memtablerep.h" - -namespace ROCKSDB_NAMESPACE { - -class HashSkipListRepFactory : public MemTableRepFactory { - public: - explicit HashSkipListRepFactory( - size_t bucket_count, - int32_t skiplist_height, - int32_t skiplist_branching_factor) - : bucket_count_(bucket_count), - skiplist_height_(skiplist_height), - skiplist_branching_factor_(skiplist_branching_factor) { } - - virtual ~HashSkipListRepFactory() {} - - using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override; - - virtual const char* Name() const override { - return "HashSkipListRepFactory"; - } - - private: - const size_t bucket_count_; - const int32_t skiplist_height_; - const int32_t skiplist_branching_factor_; -}; - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist.h mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist.h 2025-05-19 16:14:27.000000000 +0000 @@ -177,6 +177,9 @@ // Retreat to the last entry with a key <= target void SeekForPrev(const char* target); + // Advance to a random entry in the list. + void RandomSeek(); + // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. void SeekToFirst(); @@ -252,6 +255,9 @@ // Return head_ if list is empty. Node* FindLast() const; + // Returns a random entry. + Node* FindRandomEntry() const; + // Traverses a single level of the list, setting *out_prev to the last // node before the key and *out_next to the first node after. Assumes // that the key is not present in the skip list. 
On entry, before should @@ -413,6 +419,11 @@ } template +inline void InlineSkipList::Iterator::RandomSeek() { + node_ = list_->FindRandomEntry(); +} + +template inline void InlineSkipList::Iterator::SeekToFirst() { node_ = list_->head_->Next(0); } @@ -559,6 +570,48 @@ } template +typename InlineSkipList::Node* +InlineSkipList::FindRandomEntry() const { + // TODO(bjlemaire): consider adding PREFETCH calls. + Node *x = head_, *scan_node = nullptr, *limit_node = nullptr; + + // We start at the max level. + // FOr each level, we look at all the nodes at the level, and + // we randomly pick one of them. Then decrement the level + // and reiterate the process. + // eg: assume GetMaxHeight()=5, and there are #100 elements (nodes). + // level 4 nodes: lvl_nodes={#1, #15, #67, #84}. Randomly pick #15. + // We will consider all the nodes between #15 (inclusive) and #67 + // (exclusive). #67 is called 'limit_node' here. + // level 3 nodes: lvl_nodes={#15, #21, #45, #51}. Randomly choose + // #51. #67 remains 'limit_node'. + // [...] + // level 0 nodes: lvl_nodes={#56,#57,#58,#59}. Randomly pick $57. + // Return Node #57. + std::vector lvl_nodes; + Random* rnd = Random::GetTLSInstance(); + int level = GetMaxHeight() - 1; + + while (level >= 0) { + lvl_nodes.clear(); + scan_node = x; + while (scan_node != limit_node) { + lvl_nodes.push_back(scan_node); + scan_node = scan_node->Next(level); + } + uint32_t rnd_idx = rnd->Next() % lvl_nodes.size(); + x = lvl_nodes[rnd_idx]; + if (rnd_idx + 1 < lvl_nodes.size()) { + limit_node = lvl_nodes[rnd_idx + 1]; + } + level--; + } + // There is a special case where x could still be the head_ + // (note that the head_ contains no key). + return x == head_ ? 
head_->Next(0) : x; +} + +template uint64_t InlineSkipList::EstimateCount(const char* key) const { uint64_t count = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/inlineskiplist_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,7 +19,7 @@ namespace ROCKSDB_NAMESPACE { // Our test skip list stores 8-byte unsigned integers -typedef uint64_t Key; +using Key = uint64_t; static const char* Encode(const uint64_t* key) { return reinterpret_cast(key); @@ -32,7 +32,7 @@ } struct TestComparator { - typedef Key DecodedType; + using DecodedType = Key; static DecodedType decode_key(const char* b) { return Decode(b); @@ -59,7 +59,7 @@ } }; -typedef InlineSkipList TestInlineSkipList; +using TestInlineSkipList = InlineSkipList; class InlineSkipTest : public testing::Test { public: @@ -309,7 +309,7 @@ Validate(&list); } -#ifndef ROCKSDB_VALGRIND_RUN +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) // We want to make sure that with a single writer and multiple // concurrent readers (with no synchronization other than when a // reader's iterator is created), the reader always observes all the @@ -654,7 +654,7 @@ RunConcurrentInsert(3, true); } -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/memtablerep_bench.cc 2025-05-19 16:14:27.000000000 
+0000 @@ -28,9 +28,11 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" #include "rocksdb/memtablerep.h" #include "rocksdb/options.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/write_buffer_manager.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" @@ -141,7 +143,7 @@ RandomGenerator() { Random rnd(301); auto size = (unsigned)std::max(1048576, FLAGS_item_size); - test::RandomString(&rnd, size, &data_); + data_ = rnd.RandomString(size); pos_ = 0; } @@ -170,9 +172,8 @@ for (uint64_t i = 0; i < num_; ++i) { values_[i] = i; } - std::shuffle( - values_.begin(), values_.end(), - std::default_random_engine(static_cast(FLAGS_seed))); + RandomShuffle(values_.begin(), values_.end(), + static_cast(FLAGS_seed)); } } @@ -418,7 +419,7 @@ uint64_t bytes_written = 0; uint64_t bytes_read = 0; uint64_t read_hits = 0; - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits); auto elapsed_time = static_cast(timer.ElapsedNanos() / 1000); std::cout << "Elapsed time: " << static_cast(elapsed_time) << " us" @@ -453,8 +454,8 @@ MemTableRep* table_; KeyGenerator* key_gen_; uint64_t* sequence_; - uint64_t num_write_ops_per_thread_; - uint64_t num_read_ops_per_thread_; + uint64_t num_write_ops_per_thread_ = 0; + uint64_t num_read_ops_per_thread_ = 0; const uint32_t num_threads_; }; @@ -581,13 +582,15 @@ #ifndef ROCKSDB_LITE } else if (FLAGS_memtablerep == "vector") { factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory); - } else if (FLAGS_memtablerep == "hashskiplist") { + } else if (FLAGS_memtablerep == "hashskiplist" || + FLAGS_memtablerep == "prefix_hash") { factory.reset(ROCKSDB_NAMESPACE::NewHashSkipListRepFactory( FLAGS_bucket_count, FLAGS_hashskiplist_height, FLAGS_hashskiplist_branching_factor)); options.prefix_extractor.reset( 
ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_length)); - } else if (FLAGS_memtablerep == "hashlinklist") { + } else if (FLAGS_memtablerep == "hashlinklist" || + FLAGS_memtablerep == "hash_linkedlist") { factory.reset(ROCKSDB_NAMESPACE::NewHashLinkListRepFactory( FLAGS_bucket_count, FLAGS_huge_page_tlb_size, FLAGS_bucket_entries_logging_threshold, @@ -596,8 +599,16 @@ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_length)); #endif // ROCKSDB_LITE } else { - fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str()); - exit(1); + ROCKSDB_NAMESPACE::ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + + ROCKSDB_NAMESPACE::Status s = + ROCKSDB_NAMESPACE::MemTableRepFactory::CreateFromString( + config_options, FLAGS_memtablerep, &factory); + if (!s.ok()) { + fprintf(stdout, "Unknown memtablerep: %s\n", s.ToString().c_str()); + exit(1); + } } ROCKSDB_NAMESPACE::InternalKeyComparator internal_key_comp( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplist_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplist_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplist_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplist_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE { -typedef uint64_t Key; +using Key = uint64_t; struct TestComparator { int operator()(const Key& a, const Key& b) const { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplistrep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplistrep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/skiplistrep.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/skiplistrep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,10 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// +#include + #include "db/memtable.h" #include "memory/arena.h" #include "memtable/inlineskiplist.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace { @@ -95,6 +99,66 @@ return (end_count >= start_count) ? (end_count - start_count) : 0; } + void UniqueRandomSample(const uint64_t num_entries, + const uint64_t target_sample_size, + std::unordered_set* entries) override { + entries->clear(); + // Avoid divide-by-0. + assert(target_sample_size > 0); + assert(num_entries > 0); + // NOTE: the size of entries is not enforced to be exactly + // target_sample_size at the end of this function, it might be slightly + // greater or smaller. + SkipListRep::Iterator iter(&skip_list_); + // There are two methods to create the subset of samples (size m) + // from the table containing N elements: + // 1-Iterate linearly through the N memtable entries. For each entry i, + // add it to the sample set with a probability + // (target_sample_size - entries.size() ) / (N-i). + // + // 2-Pick m random elements without repetition. + // We pick Option 2 when m sqrt(N). + if (target_sample_size > + static_cast(std::sqrt(1.0 * num_entries))) { + Random* rnd = Random::GetTLSInstance(); + iter.SeekToFirst(); + uint64_t counter = 0, num_samples_left = target_sample_size; + for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) { + // Add entry to sample set with probability + // num_samples_left/(num_entries - counter). + if (rnd->Next() % (num_entries - counter) < num_samples_left) { + entries->insert(iter.key()); + num_samples_left--; + } + } + } else { + // Option 2: pick m random elements with no duplicates. + // If Option 2 is picked, then target_sample_size99.9% for N>4. + // At worst, for the final pick , when m=sqrt(N) there is + // a probability of p= 1/sqrt(N) chances to find a duplicate. 
+ for (uint64_t j = 0; j < 5; j++) { + iter.RandomSeek(); + // unordered_set::insert returns pair. + // The second element is true if an insert successfully happened. + // If element is already in the set, this bool will be false, and + // true otherwise. + if ((entries->insert(iter.key())).second) { + break; + } + } + } + } + } + ~SkipListRep() override {} // Iteration over the contents of a skip list @@ -143,6 +207,8 @@ } } + void RandomSeek() override { iter_.RandomSeek(); } + // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. void SeekToFirst() override { iter_.SeekToFirst(); } @@ -271,6 +337,27 @@ }; } +static std::unordered_map skiplist_factory_info = { +#ifndef ROCKSDB_LITE + {"lookahead", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}}, +#endif +}; + +SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) { + RegisterOptions("SkipListFactoryOptions", &lookahead_, + &skiplist_factory_info); +} + +std::string SkipListFactory::GetId() const { + std::string id = Name(); + if (lookahead_ > 0) { + id.append(":").append(ROCKSDB_NAMESPACE::ToString(lookahead_)); + } + return id; +} + MemTableRep* SkipListFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform* transform, Logger* /*logger*/) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/vectorrep.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/vectorrep.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/vectorrep.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/vectorrep.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,25 +4,23 @@ // (found in the LICENSE.Apache file in the root directory). 
// #ifndef ROCKSDB_LITE -#include "rocksdb/memtablerep.h" - -#include -#include -#include #include +#include +#include #include +#include #include "db/memtable.h" #include "memory/arena.h" #include "memtable/stl_wrappers.h" #include "port/port.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/utilities/options_type.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { namespace { -using namespace stl_wrappers; - class VectorRep : public MemTableRep { public: VectorRep(const KeyComparator& compare, Allocator* allocator, size_t count); @@ -98,7 +96,7 @@ private: friend class Iterator; - typedef std::vector Bucket; + using Bucket = std::vector; std::shared_ptr bucket_; mutable port::RWMutex rwlock_; bool immutable_; @@ -157,14 +155,16 @@ if (!sorted_ && vrep_ != nullptr) { WriteLock l(&vrep_->rwlock_); if (!vrep_->sorted_) { - std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + std::sort(bucket_->begin(), bucket_->end(), + stl_wrappers::Compare(compare_)); cit_ = bucket_->begin(); vrep_->sorted_ = true; } sorted_ = true; } if (!sorted_) { - std::sort(bucket_->begin(), bucket_->end(), Compare(compare_)); + std::sort(bucket_->begin(), bucket_->end(), + stl_wrappers::Compare(compare_)); cit_ = bucket_->begin(); sorted_ = true; } @@ -292,6 +292,16 @@ } } // anon namespace +static std::unordered_map vector_rep_table_info = { + {"count", + {0, OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +VectorRepFactory::VectorRepFactory(size_t count) : count_(count) { + RegisterOptions("VectorRepFactoryOptions", &count_, &vector_rep_table_info); +} + MemTableRep* VectorRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, const SliceTransform*, Logger* /*logger*/) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,57 +8,31 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "db/db_impl/db_impl.h" +#include "rocksdb/status.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE -namespace { -const size_t kSizeDummyEntry = 256 * 1024; -// The key will be longer than keys for blocks in SST files so they won't -// conflict. -const size_t kCacheKeyPrefix = kMaxVarint64Length * 4 + 1; -} // namespace - -struct WriteBufferManager::CacheRep { - std::shared_ptr cache_; - std::mutex cache_mutex_; - std::atomic cache_allocated_size_; - // The non-prefix part will be updated according to the ID to use. 
- char cache_key_[kCacheKeyPrefix + kMaxVarint64Length]; - uint64_t next_cache_key_id_ = 0; - std::vector dummy_handles_; - - explicit CacheRep(std::shared_ptr cache) - : cache_(cache), cache_allocated_size_(0) { - memset(cache_key_, 0, kCacheKeyPrefix); - size_t pointer_size = sizeof(const void*); - assert(pointer_size <= kCacheKeyPrefix); - memcpy(cache_key_, static_cast(this), pointer_size); - } - - Slice GetNextCacheKey() { - memset(cache_key_ + kCacheKeyPrefix, 0, kMaxVarint64Length); - char* end = - EncodeVarint64(cache_key_ + kCacheKeyPrefix, next_cache_key_id_++); - return Slice(cache_key_, static_cast(end - cache_key_)); - } -}; -#else -struct WriteBufferManager::CacheRep {}; -#endif // ROCKSDB_LITE - WriteBufferManager::WriteBufferManager(size_t _buffer_size, - std::shared_ptr cache) + std::shared_ptr cache, + bool allow_stall) : buffer_size_(_buffer_size), mutable_limit_(buffer_size_ * 7 / 8), memory_used_(0), memory_active_(0), - cache_rep_(nullptr) { + cache_res_mgr_(nullptr), + allow_stall_(allow_stall), + stall_active_(false) { #ifndef ROCKSDB_LITE if (cache) { - // Construct the cache key using the pointer to this. 
- cache_rep_.reset(new CacheRep(cache)); + // Memtable's memory usage tends to fluctuate frequently + // therefore we set delayed_decrease = true to save some dummy entry + // insertion on memory increase right after memory decrease + cache_res_mgr_.reset( + new CacheReservationManager(cache, true /* delayed_decrease */)); } #else (void)cache; @@ -66,65 +40,164 @@ } WriteBufferManager::~WriteBufferManager() { -#ifndef ROCKSDB_LITE - if (cache_rep_) { - for (auto* handle : cache_rep_->dummy_handles_) { - cache_rep_->cache_->Release(handle, true); - } +#ifndef NDEBUG + std::unique_lock lock(mu_); + assert(queue_.empty()); +#endif +} + +std::size_t WriteBufferManager::dummy_entries_in_cache_usage() const { + if (cache_res_mgr_ != nullptr) { + return cache_res_mgr_->GetTotalReservedCacheSize(); + } else { + return 0; + } +} + +void WriteBufferManager::ReserveMem(size_t mem) { + if (cache_res_mgr_ != nullptr) { + ReserveMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_add(mem, std::memory_order_relaxed); + } + if (enabled()) { + memory_active_.fetch_add(mem, std::memory_order_relaxed); } -#endif // ROCKSDB_LITE } // Should only be called from write thread void WriteBufferManager::ReserveMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE - assert(cache_rep_ != nullptr); + assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. - std::lock_guard lock(cache_rep_->cache_mutex_); + std::lock_guard lock(cache_res_mgr_mu_); size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) + mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); - while (new_mem_used > cache_rep_->cache_allocated_size_) { - // Expand size by at least 256KB. 
- // Add a dummy record to the cache - Cache::Handle* handle; - cache_rep_->cache_->Insert(cache_rep_->GetNextCacheKey(), nullptr, - kSizeDummyEntry, nullptr, &handle); - cache_rep_->dummy_handles_.push_back(handle); - cache_rep_->cache_allocated_size_ += kSizeDummyEntry; - } + Status s = + cache_res_mgr_->UpdateCacheReservation( + new_mem_used); + + // We absorb the error since WriteBufferManager is not able to handle + // this failure properly. Ideallly we should prevent this allocation + // from happening if this cache reservation fails. + // [TODO] We'll need to improve it in the future and figure out what to do on + // error + s.PermitUncheckedError(); #else (void)mem; #endif // ROCKSDB_LITE } +void WriteBufferManager::ScheduleFreeMem(size_t mem) { + if (enabled()) { + memory_active_.fetch_sub(mem, std::memory_order_relaxed); + } +} + +void WriteBufferManager::FreeMem(size_t mem) { + if (cache_res_mgr_ != nullptr) { + FreeMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + } + // Check if stall is active and can be ended. + MaybeEndWriteStall(); +} + void WriteBufferManager::FreeMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE - assert(cache_rep_ != nullptr); + assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. - std::lock_guard lock(cache_rep_->cache_mutex_); + std::lock_guard lock(cache_res_mgr_mu_); size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); - // Gradually shrink memory costed in the block cache if the actual - // usage is less than 3/4 of what we reserve from the block cache. - // We do this because: - // 1. we don't pay the cost of the block cache immediately a memtable is - // freed, as block cache insert is expensive; - // 2. 
eventually, if we walk away from a temporary memtable size increase, - // we make sure shrink the memory costed in block cache over time. - // In this way, we only shrink costed memory showly even there is enough - // margin. - if (new_mem_used < cache_rep_->cache_allocated_size_ / 4 * 3 && - cache_rep_->cache_allocated_size_ - kSizeDummyEntry > new_mem_used) { - assert(!cache_rep_->dummy_handles_.empty()); - cache_rep_->cache_->Release(cache_rep_->dummy_handles_.back(), true); - cache_rep_->dummy_handles_.pop_back(); - cache_rep_->cache_allocated_size_ -= kSizeDummyEntry; - } + Status s = + cache_res_mgr_->UpdateCacheReservation( + new_mem_used); + + // We absorb the error since WriteBufferManager is not able to handle + // this failure properly. + // [TODO] We'll need to improve it in the future and figure out what to do on + // error + s.PermitUncheckedError(); #else (void)mem; #endif // ROCKSDB_LITE } + +void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + assert(allow_stall_); + + // Allocate outside of the lock. + std::list new_node = {wbm_stall}; + + { + std::unique_lock lock(mu_); + // Verify if the stall conditions are stil active. + if (ShouldStall()) { + stall_active_.store(true, std::memory_order_relaxed); + queue_.splice(queue_.end(), std::move(new_node)); + } + } + + // If the node was not consumed, the stall has ended already and we can signal + // the caller. + if (!new_node.empty()) { + new_node.front()->Signal(); + } +} + +// Called when memory is freed in FreeMem or the buffer size has changed. +void WriteBufferManager::MaybeEndWriteStall() { + // Cannot early-exit on !enabled() because SetBufferSize(0) needs to unblock + // the writers. + if (!allow_stall_) { + return; + } + + if (IsStallThresholdExceeded()) { + return; // Stall conditions have not resolved. + } + + // Perform all deallocations outside of the lock. 
+ std::list cleanup; + + std::unique_lock lock(mu_); + if (!stall_active_.load(std::memory_order_relaxed)) { + return; // Nothing to do. + } + + // Unblock new writers. + stall_active_.store(false, std::memory_order_relaxed); + + // Unblock the writers in the queue. + for (StallInterface* wbm_stall : queue_) { + wbm_stall->Signal(); + } + cleanup = std::move(queue_); +} + +void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + + // Deallocate the removed nodes outside of the lock. + std::list cleanup; + + if (enabled() && allow_stall_) { + std::unique_lock lock(mu_); + for (auto it = queue_.begin(); it != queue_.end();) { + auto next = std::next(it); + if (*it == wbm_stall) { + cleanup.splice(cleanup.end(), queue_, std::move(it)); + } + it = next; + } + } + wbm_stall->Signal(); +} + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/memtable/write_buffer_manager_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,10 +11,11 @@ #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - class WriteBufferManagerTest : public testing::Test {}; #ifndef ROCKSDB_LITE +const size_t kSizeDummyEntry = 256 * 1024; + TEST_F(WriteBufferManagerTest, ShouldFlush) { // A write buffer manager of size 10MB std::unique_ptr wbf( @@ -46,11 +47,39 @@ ASSERT_TRUE(wbf->ShouldFlush()); wbf->FreeMem(7 * 1024 * 1024); - // 9MB total, 8MB mutable. + // 8MB total, 8MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + // change size: 8M limit, 7M mutable limit + wbf->SetBufferSize(8 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(2 * 1024 * 1024); + // 8MB total, 6MB mutable. 
+ ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->FreeMem(2 * 1024 * 1024); + // 6MB total, 6MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(1 * 1024 * 1024); + wbf->FreeMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. ASSERT_FALSE(wbf->ShouldFlush()); } TEST_F(WriteBufferManagerTest, CacheCost) { + constexpr std::size_t kMetaDataChargeOverhead = 10000; + LRUCacheOptions co; // 1GB cache co.capacity = 1024 * 1024 * 1024; @@ -61,91 +90,208 @@ std::unique_ptr wbf( new WriteBufferManager(50 * 1024 * 1024, cache)); - // Allocate 333KB will allocate 512KB + // Allocate 333KB will allocate 512KB, memory_used_ = 333KB wbf->ReserveMem(333 * 1024); + // 2 dummy entries are added for size 333 KB + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 2 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 2 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 2 * 256 * 1024 + 10000); + ASSERT_LT(cache->GetPinnedUsage(), 2 * 256 * 1024 + kMetaDataChargeOverhead); - // Allocate another 512KB + // Allocate another 512KB, memory_used_ = 845KB wbf->ReserveMem(512 * 1024); + // 2 more dummy entries are added for size 512 KB + // since ceil((memory_used_ - dummy_entries_in_cache_usage) % kSizeDummyEntry) + // = 2 + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + 10000); + ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead); - // Allocate another 10MB + // Allocate another 10MB, memory_used_ = 11085KB wbf->ReserveMem(10 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); - - // Free 1MB will not cause any change in cache cost - 
wbf->FreeMem(1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); - + // 40 more entries are added for size 10 * 1024 * 1024 KB + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 1MB, memory_used_ = 10061KB + // It will not cause any change in cache cost + // since memory_used_ > dummy_entries_in_cache_usage * (3/4) + wbf->FreeMem(1 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); ASSERT_FALSE(wbf->ShouldFlush()); - // Allocate another 41MB + // Allocate another 41MB, memory_used_ = 52045KB wbf->ReserveMem(41 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 204 * 256 * 1024 + kMetaDataChargeOverhead); ASSERT_TRUE(wbf->ShouldFlush()); ASSERT_TRUE(wbf->ShouldFlush()); + // Schedule free 20MB, memory_used_ = 52045KB + // It will not cause any change in memory_used and cache cost wbf->ScheduleFreeMem(20 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); - + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 204 * 256 * 1024 + kMetaDataChargeOverhead); // Still need flush as the hard limit hits ASSERT_TRUE(wbf->ShouldFlush()); - // Free 20MB will releae 256KB from cache + // Free 20MB, memory_used_ = 
31565KB + // It will releae 80 dummy entries from cache since + // since memory_used_ < dummy_entries_in_cache_usage * (3/4) + // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry) + // = 80 wbf->FreeMem(20 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 124 * 256 * 1024 + kMetaDataChargeOverhead); ASSERT_FALSE(wbf->ShouldFlush()); - // Every free will release 256KB if still not hit 3/4 - wbf->FreeMem(16 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024 + 10000); - + // Free 16KB, memory_used_ = 31549KB + // It will not release any dummy entry since memory_used_ >= + // dummy_entries_in_cache_usage * (3/4) wbf->FreeMem(16 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); - - // Reserve 512KB will not cause any change in cache cost + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 124 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 124 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), + 124 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 20MB, memory_used_ = 11069KB + // It will releae 80 dummy entries from cache + // since memory_used_ < dummy_entries_in_cache_usage * (3/4) + // and floor((dummy_entries_in_cache_usage - memory_used_) % kSizeDummyEntry) + // = 80 + wbf->FreeMem(20 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 1MB, memory_used_ = 10045KB 
+ // It will not cause any change in cache cost + // since memory_used_ > dummy_entries_in_cache_usage * (3/4) + wbf->FreeMem(1 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); + + // Reserve 512KB, memory_used_ = 10557KB + // It will not casue any change in cache cost + // since memory_used_ > dummy_entries_in_cache_usage * (3/4) + // which reflects the benefit of saving dummy entry insertion on memory + // reservation after delay decrease wbf->ReserveMem(512 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); - - wbf->FreeMem(16 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); // Destory write buffer manger should free everything wbf.reset(); - ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024); + ASSERT_EQ(cache->GetPinnedUsage(), 0); } TEST_F(WriteBufferManagerTest, NoCapCacheCost) { + constexpr std::size_t kMetaDataChargeOverhead = 10000; // 1GB cache std::shared_ptr cache = NewLRUCache(1024 * 1024 * 1024, 4); // A write buffer manager of size 256MB std::unique_ptr wbf(new WriteBufferManager(0, cache)); - // Allocate 1.5MB will allocate 2MB + + // Allocate 10MB, memory_used_ = 10240KB + // It will allocate 40 dummy entries wbf->ReserveMem(10 * 1024 * 1024); - ASSERT_GE(cache->GetPinnedUsage(), 10 * 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 10 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); + 
ASSERT_GE(cache->GetPinnedUsage(), 40 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 40 * 256 * 1024 + kMetaDataChargeOverhead); + ASSERT_FALSE(wbf->ShouldFlush()); + // Free 9MB, memory_used_ = 1024KB + // It will free 36 dummy entries wbf->FreeMem(9 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead); + + // Free 160KB gradually, memory_used_ = 864KB + // It will not cause any change + // since memory_used_ > dummy_entries_in_cache_usage * 3/4 for (int i = 0; i < 40; i++) { wbf->FreeMem(4 * 1024); } - ASSERT_GE(cache->GetPinnedUsage(), 1024 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); + ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + kMetaDataChargeOverhead); +} + +TEST_F(WriteBufferManagerTest, CacheFull) { + constexpr std::size_t kMetaDataChargeOverhead = 20000; + + // 12MB cache size with strict capacity + LRUCacheOptions lo; + lo.capacity = 12 * 1024 * 1024; + lo.num_shard_bits = 0; + lo.strict_capacity_limit = true; + std::shared_ptr cache = NewLRUCache(lo); + std::unique_ptr wbf(new WriteBufferManager(0, cache)); + + // Allocate 10MB, memory_used_ = 10240KB + wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 40 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 40 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Allocate 10MB, memory_used_ = 20480KB + // Some dummy entry insertion will fail due to full cache + wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_GE(cache->GetPinnedUsage(), 40 * kSizeDummyEntry); + ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024); + ASSERT_LT(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry); + + 
// Free 15MB after encoutering cache full, memory_used_ = 5120KB + wbf->FreeMem(15 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 20 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 20 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 20 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Reserve 15MB, creating cache full again, memory_used_ = 20480KB + wbf->ReserveMem(15 * 1024 * 1024); + ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024); + ASSERT_LT(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry); + + // Increase capacity so next insert will fully succeed + cache->SetCapacity(40 * 1024 * 1024); + + // Allocate 10MB, memory_used_ = 30720KB + wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 120 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 120 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 120 * kSizeDummyEntry + kMetaDataChargeOverhead); + + // Gradually release 20 MB + // It ended up sequentially releasing 32, 24, 18 dummy entries when + // memory_used_ decreases to 22528KB, 16384KB, 11776KB. 
+ // In total, it releases 74 dummy entries + for (int i = 0; i < 40; i++) { + wbf->FreeMem(512 * 1024); + } + + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 46 * kSizeDummyEntry); + ASSERT_GE(cache->GetPinnedUsage(), 46 * kSizeDummyEntry); + ASSERT_LT(cache->GetPinnedUsage(), + 46 * kSizeDummyEntry + kMetaDataChargeOverhead); } + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/CMakeLists.txt mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/CMakeLists.txt --- mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/CMakeLists.txt 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/CMakeLists.txt 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,16 @@ +find_package(benchmark REQUIRED) +find_package(Threads REQUIRED) + +file(GLOB_RECURSE ALL_BENCH_CPP *.cc) +foreach(ONE_BENCH_CPP ${ALL_BENCH_CPP}) + get_filename_component(TARGET_NAME ${ONE_BENCH_CPP} NAME_WE) + add_executable(${TARGET_NAME} ${ONE_BENCH_CPP}) + target_link_libraries(${TARGET_NAME} ${ROCKSDB_LIB} benchmark::benchmark + ${CMAKE_THREAD_LIBS_INIT}) + # run benchmark like a test, if added, the benchmark tests could be run by `ctest -R Bench_` + # add_test(Bench_${TARGET_NAME} ${TARGET_NAME}) + list(APPEND ALL_BENCH_TARGETS ${TARGET_NAME}) +endforeach() +add_custom_target(microbench + COMMAND for t in ${ALL_BENCH_TARGETS}\; do \.\/$$t \|\| exit 1\; done + DEPENDS ${ALL_BENCH_TARGETS}) diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/db_basic_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,134 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// this is a simple micro-benchmark for compare ribbon filter vs. other filter +// for more comprehensive, please check the dedicate util/filter_bench. +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +static void DBOpen(benchmark::State& state) { + // create DB + DB* db; + Options options; + auto env = Env::Default(); + std::string db_path; + auto s = env->GetTestDirectory(&db_path); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + std::string db_name = db_path + "/bench_dbopen"; + + DestroyDB(db_name, options); + + options.create_if_missing = true; + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + db->Close(); + + options.create_if_missing = false; + + auto rnd = Random(12345); + + for (auto _ : state) { + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.PauseTiming(); + auto wo = WriteOptions(); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 100; j++) { + s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100)); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + s = db->Flush(FlushOptions()); + } + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + s = db->Close(); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.ResumeTiming(); + } + DestroyDB(db_name, options); +} + +BENCHMARK(DBOpen)->Iterations(200); // specify iteration number as the db size + // is impacted by iteration number + +static void DBClose(benchmark::State& state) { + // create DB + DB* db; + Options options; + auto env = Env::Default(); + std::string db_path; + auto s = 
env->GetTestDirectory(&db_path); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + std::string db_name = db_path + "/bench_dbclose"; + + DestroyDB(db_name, options); + + options.create_if_missing = true; + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + return; + } + db->Close(); + + options.create_if_missing = false; + + auto rnd = Random(12345); + + for (auto _ : state) { + state.PauseTiming(); + s = DB::Open(options, db_name, &db); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + auto wo = WriteOptions(); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 100; j++) { + s = db->Put(wo, rnd.RandomString(10), rnd.RandomString(100)); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + s = db->Flush(FlushOptions()); + } + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + state.ResumeTiming(); + s = db->Close(); + if (!s.ok()) { + state.SkipWithError(s.ToString().c_str()); + } + } + DestroyDB(db_name, options); +} + +BENCHMARK(DBClose)->Iterations(200); // specify iteration number as the db size + // is impacted by iteration number + +} // namespace ROCKSDB_NAMESPACE + +BENCHMARK_MAIN(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/microbench/ribbon_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// this is a simple micro-benchmark for compare ribbon filter vs. 
other filter +// for more comprehensive, please check the dedicate util/filter_bench. +#include + +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/mock_block_based_table.h" + +namespace ROCKSDB_NAMESPACE { + +struct KeyMaker { + explicit KeyMaker(size_t avg_size) + : smallest_size_(avg_size), + buf_size_(avg_size + 11), // pad to vary key size and alignment + buf_(new char[buf_size_]) { + memset(buf_.get(), 0, buf_size_); + assert(smallest_size_ > 8); + } + size_t smallest_size_; + size_t buf_size_; + std::unique_ptr buf_; + + // Returns a unique(-ish) key based on the given parameter values. Each + // call returns a Slice from the same buffer so previously returned + // Slices should be considered invalidated. + Slice Get(uint32_t filter_num, uint32_t val_num) const { + size_t start = val_num % 4; + size_t len = smallest_size_; + // To get range [avg_size - 2, avg_size + 2] + // use range [smallest_size, smallest_size + 4] + len += FastRange32((val_num >> 5) * 1234567891, 5); + char *data = buf_.get() + start; + // Populate key data such that all data makes it into a key of at + // least 8 bytes. We also don't want all the within-filter key + // variance confined to a contiguous 32 bits, because then a 32 bit + // hash function can "cheat" the false positive rate by + // approximating a perfect hash. + EncodeFixed32(data, val_num); + EncodeFixed32(data + 4, filter_num + val_num); + // ensure clearing leftovers from different alignment + EncodeFixed32(data + 8, 0); + return {data, len}; + } +}; + +// benchmark arguments: +// 0. filter mode +// 1. filter config bits_per_key +// 2. average data key length +// 3. 
data entry number +static void CustomArguments(benchmark::internal::Benchmark *b) { + for (int filterMode : + {BloomFilterPolicy::kLegacyBloom, BloomFilterPolicy::kFastLocalBloom, + BloomFilterPolicy::kStandard128Ribbon}) { + // for (int bits_per_key : {4, 10, 20, 30}) { + for (int bits_per_key : {10, 20}) { + for (int key_len_avg : {10, 100}) { + for (int64_t entry_num : {1 << 10, 1 << 20}) { + b->Args({filterMode, bits_per_key, key_len_avg, entry_num}); + } + } + } + } +} + +static void FilterBuild(benchmark::State &state) { + // setup data + auto filter = new BloomFilterPolicy( + static_cast(state.range(1)), + static_cast(state.range(0))); + auto tester = new mock::MockBlockBasedTableTester(filter); + KeyMaker km(state.range(2)); + std::unique_ptr owner; + const int64_t kEntryNum = state.range(3); + auto rnd = Random32(12345); + uint32_t filter_num = rnd.Next(); + // run the test + for (auto _ : state) { + std::unique_ptr builder(tester->GetBuilder()); + for (uint32_t i = 0; i < kEntryNum; i++) { + builder->AddKey(km.Get(filter_num, i)); + } + auto ret = builder->Finish(&owner); + state.counters["size"] = static_cast(ret.size()); + } +} +BENCHMARK(FilterBuild)->Apply(CustomArguments); + +static void FilterQueryPositive(benchmark::State &state) { + // setup data + auto filter = new BloomFilterPolicy( + static_cast(state.range(1)), + static_cast(state.range(0))); + auto tester = new mock::MockBlockBasedTableTester(filter); + KeyMaker km(state.range(2)); + std::unique_ptr owner; + const int64_t kEntryNum = state.range(3); + auto rnd = Random32(12345); + uint32_t filter_num = rnd.Next(); + std::unique_ptr builder(tester->GetBuilder()); + for (uint32_t i = 0; i < kEntryNum; i++) { + builder->AddKey(km.Get(filter_num, i)); + } + auto data = builder->Finish(&owner); + auto reader = filter->GetFilterBitsReader(data); + + // run test + uint32_t i = 0; + for (auto _ : state) { + i++; + i = i % kEntryNum; + reader->MayMatch(km.Get(filter_num, i)); + } +} 
+BENCHMARK(FilterQueryPositive)->Apply(CustomArguments); + +static void FilterQueryNegative(benchmark::State &state) { + // setup data + auto filter = new BloomFilterPolicy( + static_cast(state.range(1)), + static_cast(state.range(0))); + auto tester = new mock::MockBlockBasedTableTester(filter); + KeyMaker km(state.range(2)); + std::unique_ptr owner; + const int64_t kEntryNum = state.range(3); + auto rnd = Random32(12345); + uint32_t filter_num = rnd.Next(); + std::unique_ptr builder(tester->GetBuilder()); + for (uint32_t i = 0; i < kEntryNum; i++) { + builder->AddKey(km.Get(filter_num, i)); + } + auto data = builder->Finish(&owner); + auto reader = filter->GetFilterBitsReader(data); + + // run test + uint32_t i = 0; + double fp_cnt = 0; + for (auto _ : state) { + i++; + auto result = reader->MayMatch(km.Get(filter_num + 1, i)); + if (result) { + fp_cnt++; + } + } + state.counters["FP %"] = + benchmark::Counter(fp_cnt * 100, benchmark::Counter::kAvgIterations); +} +BENCHMARK(FilterQueryNegative)->Apply(CustomArguments); + +} // namespace ROCKSDB_NAMESPACE + +BENCHMARK_MAIN(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,6 +10,8 @@ #include "monitoring/histogram.h" #include + +#include #include #include #include @@ -23,7 +25,6 @@ // If you change this, you also need to change // size of array buckets_ in HistogramImpl bucketValues_ = {1, 2}; - valueIndexMap_ = {{1, 0}, {2, 1}}; double bucket_val = static_cast(bucketValues_.back()); while ((bucket_val = 1.5 * bucket_val) <= static_cast(port::kMaxUint64)) { bucketValues_.push_back(static_cast(bucket_val)); @@ -35,26 +36,18 @@ pow_of_ten *= 10; } bucketValues_.back() *= pow_of_ten; - 
valueIndexMap_[bucketValues_.back()] = bucketValues_.size() - 1; } maxBucketValue_ = bucketValues_.back(); minBucketValue_ = bucketValues_.front(); } size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { - if (value >= maxBucketValue_) { - return bucketValues_.size() - 1; - } else if ( value >= minBucketValue_ ) { - std::map::const_iterator lowerBound = - valueIndexMap_.lower_bound(value); - if (lowerBound != valueIndexMap_.end()) { - return static_cast(lowerBound->second); - } else { - return 0; - } - } else { - return 0; - } + auto beg = bucketValues_.begin(); + auto end = bucketValues_.end(); + if (value >= maxBucketValue_) + return end - beg - 1; // bucketValues_.size() - 1 + else + return std::lower_bound(beg, end, value) - beg; } namespace { @@ -251,8 +244,7 @@ void HistogramImpl::Merge(const Histogram& other) { if (strcmp(Name(), other.Name()) == 0) { - Merge( - *static_cast_with_check(&other)); + Merge(*static_cast_with_check(&other)); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram.h 2025-05-19 16:14:27.000000000 +0000 @@ -48,7 +48,6 @@ std::vector bucketValues_; uint64_t maxBucketValue_; uint64_t minBucketValue_; - std::map valueIndexMap_; }; struct HistogramStat { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,11 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// +#include "monitoring/histogram.h" + #include -#include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" +#include "rocksdb/system_clock.h" +#include "test_util/mock_time_env.h" #include "test_util/testharness.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -16,16 +20,22 @@ namespace { const double kIota = 0.1; const HistogramBucketMapper bucketMapper; - Env* env = Env::Default(); + std::shared_ptr clock = + std::make_shared(SystemClock::Default()); } void PopulateHistogram(Histogram& histogram, uint64_t low, uint64_t high, uint64_t loop = 1) { + Random rnd(test::RandomSeed()); for (; loop > 0; loop--) { for (uint64_t i = low; i <= high; i++) { histogram.Add(i); + // sleep a random microseconds [0-10) + clock->SleepForMicroseconds(rnd.Uniform(10)); } } + // make sure each data population at least take some time + clock->SleepForMicroseconds(1); } void BasicOperation(Histogram& histogram) { @@ -131,23 +141,23 @@ HistogramWindowingImpl histogramWindowing(num_windows, micros_per_window, min_num_per_window); - + histogramWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 100); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 1); ASSERT_EQ(histogramWindowing.Average(), 1); PopulateHistogram(histogramWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 200); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 2); ASSERT_EQ(histogramWindowing.Average(), 1.5); PopulateHistogram(histogramWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 1); 
ASSERT_EQ(histogramWindowing.max(), 3); @@ -155,7 +165,7 @@ // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 2); ASSERT_EQ(histogramWindowing.max(), 4); @@ -163,7 +173,7 @@ // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); @@ -179,18 +189,20 @@ histogramWindowing(num_windows, micros_per_window, min_num_per_window); HistogramWindowingImpl otherWindowing(num_windows, micros_per_window, min_num_per_window); + histogramWindowing.TEST_UpdateClock(clock); + otherWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); PopulateHistogram(otherWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 2, 2, 100); PopulateHistogram(otherWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 3, 3, 100); PopulateHistogram(otherWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); histogramWindowing.Merge(otherWindowing); ASSERT_EQ(histogramWindowing.num(), 600); @@ -200,14 +212,14 @@ // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 500); ASSERT_EQ(histogramWindowing.min(), 2); 
ASSERT_EQ(histogramWindowing.max(), 4); // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->SleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 400); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,15 +8,17 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "monitoring/histogram_windowing.h" -#include "monitoring/histogram.h" -#include "util/cast_util.h" #include +#include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" +#include "util/cast_util.h" + namespace ROCKSDB_NAMESPACE { HistogramWindowingImpl::HistogramWindowingImpl() { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -28,7 +30,7 @@ num_windows_(num_windows), micros_per_window_(micros_per_window), min_num_per_window_(min_num_per_window) { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -44,7 +46,7 @@ window_stats_[i].Clear(); } current_window_.store(0, std::memory_order_relaxed); - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); } bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); } @@ -65,9 +67,7 @@ void HistogramWindowingImpl::Merge(const Histogram& other) { if (strcmp(Name(), other.Name()) == 0) { - Merge( - 
*static_cast_with_check( - &other)); + Merge(*static_cast_with_check(&other)); } } @@ -131,7 +131,7 @@ } void HistogramWindowingImpl::TimerTick() { - uint64_t curr_time = env_->NowMicros(); + uint64_t curr_time = clock_->NowMicros(); size_t curr_window_ = static_cast(current_window()); if (curr_time - last_swap_time() > micros_per_window_ && window_stats_[curr_window_].num() >= min_num_per_window_) { @@ -146,7 +146,7 @@ // If mutex is held by Merge() or Clear(), next Add() will take care of the // swap, if needed. if (mutex_.try_lock()) { - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); uint64_t curr_window = current_window(); uint64_t next_window = (curr_window == num_windows_ - 1) ? diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/histogram_windowing.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,9 @@ #pragma once #include "monitoring/histogram.h" -#include "rocksdb/env.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class HistogramWindowingImpl : public Histogram { @@ -44,7 +44,13 @@ virtual double StandardDeviation() const override; virtual void Data(HistogramData* const data) const override; -private: +#ifndef NDEBUG + void TEST_UpdateClock(const std::shared_ptr& clock) { + clock_ = clock; + } +#endif // NDEBUG + + private: void TimerTick(); void SwapHistoryBucket(); inline uint64_t current_window() const { @@ -54,7 +60,7 @@ return last_swap_time_.load(std::memory_order_relaxed); } - Env* env_; + std::shared_ptr clock_; std::mutex mutex_; // Aggregated stats over windows_stats_, all the computation is done diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,26 +4,30 @@ // (found in the LICENSE.Apache file in the root directory). #include "monitoring/instrumented_mutex.h" + #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" namespace ROCKSDB_NAMESPACE { namespace { -Statistics* stats_for_report(Env* env, Statistics* stats) { - if (env != nullptr && stats != nullptr && +#ifndef NPERF_CONTEXT +Statistics* stats_for_report(SystemClock* clock, Statistics* stats) { + if (clock != nullptr && stats != nullptr && stats->get_stats_level() > kExceptTimeForMutex) { return stats; } else { return nullptr; } } +#endif // NPERF_CONTEXT } // namespace void InstrumentedMutex::Lock() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_mutex_lock_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); LockInternal(); } @@ -37,7 +41,7 @@ void InstrumentedCondVar::Wait() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); WaitInternal(); } @@ -51,7 +55,7 @@ bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); return TimedWaitInternal(abs_time_us); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/instrumented_mutex.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,8 @@ #include "monitoring/statistics.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/thread_status.h" #include "util/stop_watch.h" @@ -20,13 +20,16 @@ class InstrumentedMutex { public: explicit InstrumentedMutex(bool adaptive = false) - : mutex_(adaptive), stats_(nullptr), env_(nullptr), - stats_code_(0) {} + : mutex_(adaptive), stats_(nullptr), clock_(nullptr), stats_code_(0) {} - InstrumentedMutex( - Statistics* stats, Env* env, - int stats_code, bool adaptive = false) - : mutex_(adaptive), stats_(stats), env_(env), + explicit InstrumentedMutex(SystemClock* clock, bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), clock_(clock), stats_code_(0) {} + + InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code, + bool adaptive = false) + : mutex_(adaptive), + stats_(stats), + clock_(clock), stats_code_(stats_code) {} void Lock(); @@ -44,12 +47,11 @@ friend class InstrumentedCondVar; port::Mutex mutex_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; -// A wrapper class for port::Mutex that provides additional layer -// for collecting stats and instrumentation. 
+// RAII wrapper for InstrumentedMutex class InstrumentedMutexLock { public: explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) { @@ -66,12 +68,28 @@ void operator=(const InstrumentedMutexLock&) = delete; }; +// RAII wrapper for temporary releasing InstrumentedMutex inside +// InstrumentedMutexLock +class InstrumentedMutexUnlock { + public: + explicit InstrumentedMutexUnlock(InstrumentedMutex* mutex) : mutex_(mutex) { + mutex_->Unlock(); + } + + ~InstrumentedMutexUnlock() { mutex_->Lock(); } + + private: + InstrumentedMutex* const mutex_; + InstrumentedMutexUnlock(const InstrumentedMutexUnlock&) = delete; + void operator=(const InstrumentedMutexUnlock&) = delete; +}; + class InstrumentedCondVar { public: explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex) : cond_(&(instrumented_mutex->mutex_)), stats_(instrumented_mutex->stats_), - env_(instrumented_mutex->env_), + clock_(instrumented_mutex->clock_), stats_code_(instrumented_mutex->stats_code_) {} void Wait(); @@ -91,7 +109,7 @@ bool TimedWaitInternal(uint64_t abs_time_us); port::CondVar cond_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,23 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#ifdef NIOSTATS_CONTEXT +// Should not be used because the counters are not thread-safe. +// Put here just to make get_iostats_context() simple without ifdef. +static IOStatsContext iostats_context; +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) __thread IOStatsContext iostats_context; +#else +#error \ + "No thread-local support. 
Disable iostats context with -DNIOSTATS_CONTEXT." #endif IOStatsContext* get_iostats_context() { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL return &iostats_context; -#else - return nullptr; -#endif } void IOStatsContext::Reset() { +#ifndef NIOSTATS_CONTEXT thread_pool_id = Env::Priority::TOTAL; bytes_read = 0; bytes_written = 0; @@ -33,6 +37,10 @@ prepare_write_nanos = 0; fsync_nanos = 0; logger_nanos = 0; + cpu_write_nanos = 0; + cpu_read_nanos = 0; + file_io_stats_by_temperature.Reset(); +#endif //! NIOSTATS_CONTEXT } #define IOSTATS_CONTEXT_OUTPUT(counter) \ @@ -41,6 +49,10 @@ } std::string IOStatsContext::ToString(bool exclude_zero_counters) const { +#ifdef NIOSTATS_CONTEXT + (void)exclude_zero_counters; + return ""; +#else std::ostringstream ss; IOSTATS_CONTEXT_OUTPUT(thread_pool_id); IOSTATS_CONTEXT_OUTPUT(bytes_read); @@ -53,10 +65,18 @@ IOSTATS_CONTEXT_OUTPUT(fsync_nanos); IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos); IOSTATS_CONTEXT_OUTPUT(logger_nanos); - + IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos); + IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count); + IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count); std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); return str; +#endif //! 
NIOSTATS_CONTEXT } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/iostats_context_imp.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,7 @@ #include "monitoring/perf_step_timer.h" #include "rocksdb/iostats_context.h" -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) && !defined(NIOSTATS_CONTEXT) namespace ROCKSDB_NAMESPACE { extern __thread IOStatsContext iostats_context; } // namespace ROCKSDB_NAMESPACE @@ -15,10 +15,6 @@ // increment a specific counter by the specified value #define IOSTATS_ADD(metric, value) (iostats_context.metric += value) -// Increase metric value only when it is positive -#define IOSTATS_ADD_IF_POSITIVE(metric, value) \ - if (value > 0) { IOSTATS_ADD(metric, value); } - // reset a specific counter to zero #define IOSTATS_RESET(metric) (iostats_context.metric = 0) @@ -38,13 +34,13 @@ iostats_step_timer_##metric.Start(); // Declare and set start time of the timer -#define IOSTATS_CPU_TIMER_GUARD(metric, env) \ +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer iostats_step_timer_##metric( \ - &(iostats_context.metric), env, true, \ + &(iostats_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ iostats_step_timer_##metric.Start(); -#else // ROCKSDB_SUPPORT_THREAD_LOCAL +#else // ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT #define IOSTATS_ADD(metric, value) #define IOSTATS_ADD_IF_POSITIVE(metric, value) @@ -55,6 +51,6 @@ #define IOSTATS(metric) 0 #define IOSTATS_TIMER_GUARD(metric) -#define IOSTATS_CPU_TIMER_GUARD(metric, env) static_cast(env) +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) static_cast(clock) -#endif // ROCKSDB_SUPPORT_THREAD_LOCAL +#endif 
// ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,26 +9,22 @@ namespace ROCKSDB_NAMESPACE { -#if defined(NPERF_CONTEXT) || !defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(NPERF_CONTEXT) +// Should not be used because the counters are not thread-safe. +// Put here just to make get_perf_context() simple without ifdef. PerfContext perf_context; -#else +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) #if defined(OS_SOLARIS) -__thread PerfContext perf_context_; -#else +__thread PerfContext perf_context; +#else // OS_SOLARIS thread_local PerfContext perf_context; -#endif +#endif // OS_SOLARIS +#else +#error "No thread-local support. Disable perf context with -DNPERF_CONTEXT." 
#endif PerfContext* get_perf_context() { -#if defined(NPERF_CONTEXT) || !defined(ROCKSDB_SUPPORT_THREAD_LOCAL) - return &perf_context; -#else -#if defined(OS_SOLARIS) - return &perf_context_; -#else return &perf_context; -#endif -#endif } PerfContext::~PerfContext() { @@ -38,7 +34,9 @@ } PerfContext::PerfContext(const PerfContext& other) { -#ifndef NPERF_CONTEXT +#ifdef NPERF_CONTEXT + (void)other; +#else user_key_comparison_count = other.user_key_comparison_count; block_cache_hit_count = other.block_cache_hit_count; block_read_count = other.block_read_count; @@ -49,6 +47,7 @@ block_cache_filter_hit_count = other.block_cache_filter_hit_count; filter_block_read_count = other.filter_block_read_count; compression_dict_block_read_count = other.compression_dict_block_read_count; + secondary_cache_hit_count = other.secondary_cache_hit_count; block_checksum_time = other.block_checksum_time; block_decompress_time = other.block_decompress_time; get_read_bytes = other.get_read_bytes; @@ -133,7 +132,9 @@ } PerfContext::PerfContext(PerfContext&& other) noexcept { -#ifndef NPERF_CONTEXT +#ifdef NPERF_CONTEXT + (void)other; +#else user_key_comparison_count = other.user_key_comparison_count; block_cache_hit_count = other.block_cache_hit_count; block_read_count = other.block_read_count; @@ -144,6 +145,7 @@ block_cache_filter_hit_count = other.block_cache_filter_hit_count; filter_block_read_count = other.filter_block_read_count; compression_dict_block_read_count = other.compression_dict_block_read_count; + secondary_cache_hit_count = other.secondary_cache_hit_count; block_checksum_time = other.block_checksum_time; block_decompress_time = other.block_decompress_time; get_read_bytes = other.get_read_bytes; @@ -230,7 +232,9 @@ // TODO(Zhongyi): reduce code duplication between copy constructor and // assignment operator PerfContext& PerfContext::operator=(const PerfContext& other) { -#ifndef NPERF_CONTEXT +#ifdef NPERF_CONTEXT + (void)other; +#else user_key_comparison_count = 
other.user_key_comparison_count; block_cache_hit_count = other.block_cache_hit_count; block_read_count = other.block_read_count; @@ -241,6 +245,7 @@ block_cache_filter_hit_count = other.block_cache_filter_hit_count; filter_block_read_count = other.filter_block_read_count; compression_dict_block_read_count = other.compression_dict_block_read_count; + secondary_cache_hit_count = other.secondary_cache_hit_count; block_checksum_time = other.block_checksum_time; block_decompress_time = other.block_decompress_time; get_read_bytes = other.get_read_bytes; @@ -337,6 +342,7 @@ block_cache_filter_hit_count = 0; filter_block_read_count = 0; compression_dict_block_read_count = 0; + secondary_cache_hit_count = 0; block_checksum_time = 0; block_decompress_time = 0; get_read_bytes = 0; @@ -443,6 +449,7 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { #ifdef NPERF_CONTEXT + (void)exclude_zero_counters; return ""; #else std::ostringstream ss; @@ -456,6 +463,7 @@ PERF_CONTEXT_OUTPUT(block_cache_filter_hit_count); PERF_CONTEXT_OUTPUT(filter_block_read_count); PERF_CONTEXT_OUTPUT(compression_dict_block_read_count); + PERF_CONTEXT_OUTPUT(secondary_cache_hit_count); PERF_CONTEXT_OUTPUT(block_checksum_time); PERF_CONTEXT_OUTPUT(block_decompress_time); PERF_CONTEXT_OUTPUT(get_read_bytes); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_context_imp.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,8 +25,8 @@ #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) #define PERF_TIMER_GUARD(metric) -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) -#define PERF_CPU_TIMER_GUARD(metric, env) +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) +#define PERF_CPU_TIMER_GUARD(metric, clock) #define 
PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ ticker_type) #define PERF_TIMER_MEASURE(metric) @@ -46,14 +46,14 @@ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), clock); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_CPU_TIMER_GUARD(metric, env) \ +#define PERF_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer perf_step_timer_##metric( \ - &(perf_context.metric), env, true, \ + &(perf_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ perf_step_timer_##metric.Start(); @@ -77,20 +77,19 @@ } // Increase metric value -#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ - if (perf_level >= PerfLevel::kEnableCount && \ - perf_context.per_level_perf_context_enabled && \ - perf_context.level_to_perf_context) { \ - if ((*(perf_context.level_to_perf_context)).find(level) != \ - (*(perf_context.level_to_perf_context)).end()) { \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - else { \ - PerfContextByLevel empty_context; \ - (*(perf_context.level_to_perf_context))[level] = empty_context; \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - } \ +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ + if (perf_level >= PerfLevel::kEnableCount && \ + perf_context.per_level_perf_context_enabled && \ + perf_context.level_to_perf_context) { \ + if ((*(perf_context.level_to_perf_context)).find(level) != \ + (*(perf_context.level_to_perf_context)).end()) { \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } else { \ + PerfContextByLevel empty_context; \ + (*(perf_context.level_to_perf_context))[level] = empty_context; \ + 
(*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + } #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/perf_step_timer.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,26 +5,26 @@ // #pragma once #include "monitoring/perf_level_imp.h" -#include "rocksdb/env.h" -#include "util/stop_watch.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { class PerfStepTimer { public: explicit PerfStepTimer( - uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + uint64_t* metric, SystemClock* clock = nullptr, bool use_cpu_time = false, PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, Statistics* statistics = nullptr, uint32_t ticker_type = 0) : perf_counter_enabled_(perf_level >= enable_level), use_cpu_time_(use_cpu_time), - env_((perf_counter_enabled_ || statistics != nullptr) - ? ((env != nullptr) ? env : Env::Default()) - : nullptr), + ticker_type_(ticker_type), + clock_((perf_counter_enabled_ || statistics != nullptr) + ? (clock ? 
clock : SystemClock::Default().get()) + : nullptr), start_(0), metric_(metric), - statistics_(statistics), - ticker_type_(ticker_type) {} + statistics_(statistics) {} ~PerfStepTimer() { Stop(); @@ -36,14 +36,6 @@ } } - uint64_t time_now() { - if (!use_cpu_time_) { - return env_->NowNanos(); - } else { - return env_->NowCPUNanos(); - } - } - void Measure() { if (start_) { uint64_t now = time_now(); @@ -67,13 +59,21 @@ } private: + uint64_t time_now() { + if (!use_cpu_time_) { + return clock_->NowNanos(); + } else { + return clock_->CPUNanos(); + } + } + const bool perf_counter_enabled_; const bool use_cpu_time_; - Env* const env_; + uint32_t ticker_type_; + SystemClock* const clock_; uint64_t start_; uint64_t* metric_; Statistics* statistics_; - uint32_t ticker_type_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/persistent_stats_history.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include #include "db/db_impl/db_impl.h" -#include "port/likely.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,8 +8,12 @@ #include #include #include -#include "port/likely.h" + +#include "rocksdb/convenience.h" #include "rocksdb/statistics.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" 
namespace ROCKSDB_NAMESPACE { @@ -105,6 +109,12 @@ {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, + {COMPACT_READ_BYTES_MARKED, "rocksdb.compact.read.marked.bytes"}, + {COMPACT_READ_BYTES_PERIODIC, "rocksdb.compact.read.periodic.bytes"}, + {COMPACT_READ_BYTES_TTL, "rocksdb.compact.read.ttl.bytes"}, + {COMPACT_WRITE_BYTES_MARKED, "rocksdb.compact.write.marked.bytes"}, + {COMPACT_WRITE_BYTES_PERIODIC, "rocksdb.compact.write.periodic.bytes"}, + {COMPACT_WRITE_BYTES_TTL, "rocksdb.compact.write.ttl.bytes"}, {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, "rocksdb.number.direct.load.table.properties"}, {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, @@ -176,6 +186,42 @@ "rocksdb.block.cache.compression.dict.bytes.insert"}, {BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, "rocksdb.block.cache.compression.dict.bytes.evict"}, + {BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"}, + {BLOCK_CACHE_INDEX_ADD_REDUNDANT, + "rocksdb.block.cache.index.add.redundant"}, + {BLOCK_CACHE_FILTER_ADD_REDUNDANT, + "rocksdb.block.cache.filter.add.redundant"}, + {BLOCK_CACHE_DATA_ADD_REDUNDANT, "rocksdb.block.cache.data.add.redundant"}, + {BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + "rocksdb.block.cache.compression.dict.add.redundant"}, + {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"}, + {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"}, + {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"}, + {ERROR_HANDLER_BG_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.io.errro.count"}, + {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.retryable.io.errro.count"}, + {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"}, + {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + "rocksdb.error.handler.autoresume.retry.total.count"}, + {ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + 
"rocksdb.error.handler.autoresume.success.count"}, + {MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + "rocksdb.memtable.payload.bytes.at.flush"}, + {MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + "rocksdb.memtable.garbage.bytes.at.flush"}, + {SECONDARY_CACHE_HITS, "rocksdb.secondary.cache.hits"}, + {VERIFY_CHECKSUM_READ_BYTES, "rocksdb.verify_checksum.read.bytes"}, + {BACKUP_READ_BYTES, "rocksdb.backup.read.bytes"}, + {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"}, + {REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"}, + {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"}, + {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"}, + {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"}, + {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"}, + {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"}, + {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"}, + {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"}, }; const std::vector> HistogramsNameMap = { @@ -227,14 +273,64 @@ {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, {FLUSH_TIME, "rocksdb.db.flush.micros"}, {SST_BATCH_SIZE, "rocksdb.sst.batch.size"}, + {NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, + "rocksdb.num.index.and.filter.blocks.read.per.level"}, + {NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"}, + {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, + {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + "rocksdb.error.handler.autoresume.retry.count"}, }; std::shared_ptr CreateDBStatistics() { return std::make_shared(nullptr); } +#ifndef ROCKSDB_LITE +static int RegisterBuiltinStatistics(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + StatisticsImpl::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new StatisticsImpl(nullptr)); + return guard->get(); + }); + return 1; +} +#endif // ROCKSDB_LITE + +Status Statistics::CreateFromString(const ConfigOptions& 
config_options, + const std::string& id, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinStatistics(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + Status s; + if (id == "" || id == StatisticsImpl::kClassName()) { + result->reset(new StatisticsImpl(nullptr)); + } else if (id == kNullptrString) { + result->reset(); + } else { + s = LoadSharedObject(config_options, id, nullptr, result); + } + return s; +} + +static std::unordered_map stats_type_info = { +#ifndef ROCKSDB_LITE + {"inner", OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kCompareNever)}, +#endif // !ROCKSDB_LITE +}; + StatisticsImpl::StatisticsImpl(std::shared_ptr stats) - : stats_(std::move(stats)) {} + : stats_(std::move(stats)) { + RegisterOptions("StatisticsOptions", &stats_, &stats_type_info); +} StatisticsImpl::~StatisticsImpl() {} @@ -313,11 +409,17 @@ } void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - per_core_stats_.Access()->tickers_[tickerType].fetch_add( - count, std::memory_order_relaxed); - if (stats_ && tickerType < TICKER_ENUM_MAX) { - stats_->recordTick(tickerType, count); + if (get_stats_level() <= StatsLevel::kExceptTickers) { + return; + } + if (tickerType < TICKER_ENUM_MAX) { + per_core_stats_.Access()->tickers_[tickerType].fetch_add( + count, std::memory_order_relaxed); + if (stats_) { + stats_->recordTick(tickerType, count); + } + } else { + assert(false); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.h mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics.h 2025-05-19 16:14:27.000000000 +0000 @@ -44,6 +44,8 @@ public: StatisticsImpl(std::shared_ptr 
stats); virtual ~StatisticsImpl(); + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "BasicStatistics"; } virtual uint64_t getTickerCount(uint32_t ticker_type) const override; virtual void histogramData(uint32_t histogram_type, @@ -68,6 +70,8 @@ virtual bool getTickerMap(std::map*) const override; virtual bool HistEnabledForType(uint32_t type) const override; + const Customizable* Inner() const override { return stats_.get(); } + private: // If non-nullptr, forwards updates to the object pointed to by `stats_`. std::shared_ptr stats_; @@ -96,7 +100,9 @@ void operator delete[](void *p) { port::cacheline_aligned_free(p); } }; +#ifndef TEST_CACHE_LINE_SIZE static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0, "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned"); +#endif CoreLocalArray per_core_stats_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/statistics_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/statistics_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,12 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). 
// +#include "rocksdb/statistics.h" + #include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/options_type.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "rocksdb/statistics.h" - namespace ROCKSDB_NAMESPACE { class StatisticsTest : public testing::Test {}; @@ -38,6 +40,49 @@ } } +TEST_F(StatisticsTest, NoNameStats) { + static std::unordered_map no_name_opt_info = { +#ifndef ROCKSDB_LITE + {"inner", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, + OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever)}, +#endif // ROCKSDB_LITE + }; + + class DefaultNameStatistics : public Statistics { + public: + DefaultNameStatistics(const std::shared_ptr& stats = nullptr) + : inner(stats) { + RegisterOptions("", &inner, &no_name_opt_info); + } + + uint64_t getTickerCount(uint32_t /*tickerType*/) const override { + return 0; + } + void histogramData(uint32_t /*type*/, + HistogramData* const /*data*/) const override {} + void recordTick(uint32_t /*tickerType*/, uint64_t /*count*/) override {} + void setTickerCount(uint32_t /*tickerType*/, uint64_t /*count*/) override {} + uint64_t getAndResetTickerCount(uint32_t /*tickerType*/) override { + return 0; + } + std::shared_ptr inner; + }; + ConfigOptions options; + options.ignore_unsupported_options = false; + auto stats = std::make_shared(); + ASSERT_STREQ(stats->Name(), ""); +#ifndef ROCKSDB_LITE + ASSERT_EQ("", stats->ToString( + options)); // A stats with no name with have no options... + ASSERT_OK(stats->ConfigureFromString(options, "inner=")); + ASSERT_EQ("", stats->ToString( + options)); // A stats with no name with have no options... + ASSERT_NE(stats->inner, nullptr); + ASSERT_NE("", stats->inner->ToString(options)); // ... even if it does... 
+#endif // ROCKSDB_LITE +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/stats_history_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,8 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "rocksdb/stats_history.h" + #include #include #include @@ -13,58 +15,70 @@ #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "db/periodic_work_scheduler.h" #include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" -#include "rocksdb/stats_history.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE class StatsHistoryTest : public DBTestBase { public: - StatsHistoryTest() : DBTestBase("/stats_history_test") {} + StatsHistoryTest() : DBTestBase("stats_history_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } + + protected: + std::shared_ptr mock_clock_; + std::unique_ptr mock_env_; + + void SetUp() override { + mock_clock_->InstallTimedWaitFixCallback(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { + auto* periodic_work_scheduler_ptr = + reinterpret_cast(arg); + 
*periodic_work_scheduler_ptr = + PeriodicWorkTestScheduler::Default(mock_clock_); + }); + } }; -#ifndef ROCKSDB_LITE TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_dump_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.stats_dump_period_sec = kPeriodSec; + options.env = mock_env_.get(); int counter = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DumpStats:1", [&](void* /*arg*/) { counter++; }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:1", + [&](void* /*arg*/) { counter++; }); Reopen(options); ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); + + // Wait for the first stats persist to finish, as the initial delay could be + // different. 
+ dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); - // Test cacel job through SetOptions + // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); int old_val = counter; - for (int i = 6; i < 20; ++i) { - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); + for (int i = 1; i < 20; ++i) { + mock_clock_->MockSleepForSeconds(kPeriodSec); } ASSERT_EQ(counter, old_val); Close(); @@ -72,120 +86,96 @@ // Test persistent stats background thread scheduling and cancelling TEST_F(StatsHistoryTest, StatsPersistScheduling) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG + options.stats_persist_period_sec = kPeriodSec; + options.env = mock_env_.get(); int counter = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:Entry", + [&](void* /*arg*/) { counter++; }); Reopen(options); ASSERT_EQ(5u, 
dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); - // Test cacel job through SetOptions - ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); + // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); + int old_val = counter; + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec * 2); }); + ASSERT_EQ(counter, old_val); + Close(); } // Test enabling persistent stats for the first time TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { + constexpr unsigned int kPeriodSec = 5; Options options; options.create_if_missing = true; options.stats_persist_period_sec = 0; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG + options.env = mock_env_.get(); int counter = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:Entry", + [&](void* /*arg*/) { counter++; }); Reopen(options); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); - ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_persist_period_sec", std::to_string(kPeriodSec)}})); + ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); Close(); } // TODO(Zhongyi): Move persistent stats related tests to a separate file TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); 
ReopenWithColumnFamilies({"default", "pikachu"}, options); - int mock_time = 1; + // make sure the first stats persist to finish + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 6 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); // disabled stats snapshots ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); size_t stats_count = 0; for (; stats_iter->Valid(); stats_iter->Next()) { auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5); + ASSERT_EQ(stats_iter->GetStatsTime(), mock_clock_->NowSeconds()); stats_count += stats_map.size(); } ASSERT_GT(stats_count, 0); // Wait a bit and verify no more stats are found - for (mock_time = 6; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); + for (int i = 0; i < 10; ++i) { + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(1); }); } - db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_new = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -196,26 +186,12 @@ } TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { + constexpr int kPeriodSec = 1; Options options; options.create_if_missing = true; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.stats_persist_period_sec = 1; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - 
mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG + options.statistics = CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); @@ -235,13 +211,7 @@ delete iterator; ASSERT_OK(Flush()); ASSERT_OK(Delete("sol")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - int mock_time = 1; - // Wait for stats persist to finish - for (; mock_time < 5; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // second round of ops ASSERT_OK(Put("saigon", "saigon")); @@ -253,13 +223,17 @@ } delete iterator; ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - for (; mock_time < 10; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + const int kIterations = 10; + for (int i = 0; i < kIterations; ++i) { + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } + std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 10 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); 
ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; @@ -269,17 +243,20 @@ stats_count += stats_map.size(); } size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); - ASSERT_GE(slice_count, 9); - ASSERT_GE(stats_history_size, 12000); - // capping memory cost at 12000 bytes since one slice is around 10000~12000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); - ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); + ASSERT_GE(slice_count, kIterations - 1); + ASSERT_GE(stats_history_size, 15000); + // capping memory cost at 15000 bytes since one slice is around 10000~15000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "15000"}})); + ASSERT_EQ(15000, dbfull()->GetDBOptions().stats_history_buffer_size); + // Wait for stats persist to finish - for (; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); + for (int i = 0; i < kIterations; ++i) { + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } - db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; slice_count = 0; @@ -292,7 +269,7 @@ dbfull()->TEST_EstimateInMemoryStatsHistorySize(); // only one slice can fit under the new stats_history_buffer_size ASSERT_LT(slice_count, 2); - ASSERT_TRUE(stats_history_size_reopen < 12000 && + ASSERT_TRUE(stats_history_size_reopen < 15000 && stats_history_size_reopen > 0); ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); Close(); @@ -309,34 +286,41 @@ } TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = 
ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(Get("foo"), "bar"); + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); + auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count1 = countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(10); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count2 = countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(15); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count3 = countkeys(iter); @@ -345,15 +329,16 @@ ASSERT_GE(key_count3, key_count2); ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); 
ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; int non_zero_count = 0; - for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) { slice_count++; auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1); for (auto& stat : stats_map) { if (stat.second != 0) { non_zero_count++; @@ -366,7 +351,8 @@ ASSERT_EQ(stats_count, key_count3 - 2); // verify reopen will not cause data loss ReopenWithColumnFamilies({"default", "pikachu"}, options); - db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; int slice_count_reopen = 0; @@ -381,6 +367,7 @@ } stats_count_reopen += stats_map.size(); } + ASSERT_EQ(non_zero_count, non_zero_count_recover); ASSERT_EQ(slice_count, slice_count_reopen); ASSERT_EQ(stats_count, stats_count_reopen); @@ -390,53 +377,61 @@ // Test persisted stats matches the value found in options.statistics and // the stats value retains after DB reopen TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); std::map stats_map_before; ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_before)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); 
ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(Get("foo"), "bar"); + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(10); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(15); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(20); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::map stats_map_after; ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); std::string sample = "rocksdb.num.iterator.deleted"; uint64_t recovered_value = 0; - for (int i = 1; stats_iter->Valid(); stats_iter->Next(), ++i) { + for (int i = 2; stats_iter->Valid(); stats_iter->Next(), ++i) { auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5 
* i); + ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1); for (const auto& stat : stats_map) { if (sample.compare(stat.first) == 0) { recovered_value += stat.second; @@ -447,12 +442,13 @@ // test stats value retains after recovery ReopenWithColumnFamilies({"default", "pikachu"}, options); - db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); uint64_t new_recovered_value = 0; - for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) { auto stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + ASSERT_EQ(stats_iter->GetStatsTime(), kPeriodSec * i - 1); for (const auto& stat : stats_map) { if (sample.compare(stat.first) == 0) { new_recovered_value += stat.second; @@ -469,15 +465,13 @@ // TODO(Zhongyi): add test for different format versions TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); ASSERT_OK(TryReopen(options)); CreateColumnFamilies({"one", "two", "three"}, options); ASSERT_OK(Put(1, "foo", "bar")); @@ -486,7 +480,13 @@ CreateColumnFamilies({"four"}, options); ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); ASSERT_EQ(Get(2, "foo"), "bar"); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + + // make sure the first stats 
persist to finish + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count = countkeys(iter); @@ -495,7 +495,7 @@ uint64_t num_write_wal = 0; std::string sample = "rocksdb.write.wal"; std::unique_ptr stats_iter; - db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); for (; stats_iter->Valid(); stats_iter->Next()) { auto stats_map = stats_iter->GetStatsMap(); @@ -506,7 +506,7 @@ } } stats_iter.reset(); - ASSERT_EQ(num_write_wal, 2); + ASSERT_EQ(num_write_wal, 1); options.persist_stats_to_disk = false; ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); @@ -531,7 +531,7 @@ ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, &handle)); // verify stats is not affected by prior failed CF creation - db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); num_write_wal = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -542,7 +542,7 @@ } } } - ASSERT_EQ(num_write_wal, 2); + ASSERT_EQ(num_write_wal, 1); Close(); Destroy(options); @@ -562,25 +562,29 @@ // Reopen and flush memtable. ASSERT_OK(TryReopen(options)); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. 
ASSERT_OK(ReadOnlyReopen(options)); } TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { + constexpr int kPeriodSec = 5; Options options; options.create_if_missing = true; options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb - options.stats_persist_period_sec = 5; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.stats_persist_period_sec = kPeriodSec; + options.statistics = CreateDBStatistics(); options.persist_stats_to_disk = true; - std::unique_ptr mock_env; - mock_env.reset(new ROCKSDB_NAMESPACE::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); + options.env = mock_env_.get(); CreateColumnFamilies({"pikachu"}, options); ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Wait for the first stats persist to finish, as the initial delay could be + // different. + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); + ColumnFamilyData* cfd_default = static_cast(dbfull()->DefaultColumnFamily()) ->cfd(); @@ -596,7 +600,9 @@ ASSERT_EQ("v0", Get("foo")); ASSERT_OK(Put(1, "Eevee", "v0")); ASSERT_EQ("v0", Get(1, "Eevee")); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flush default cf // LogNumbers: default: 14, stats: 4, pikachu: 4 ASSERT_OK(Flush()); @@ -619,8 +625,9 @@ ASSERT_OK(Put("bar2", "v2")); ASSERT_EQ("v2", Get("bar2")); ASSERT_EQ("v2", Get("foo2")); - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(10); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to default and stats cf, flushing default cf // LogNumbers: default: 19, stats: 19, pikachu: 19 ASSERT_OK(Flush()); @@ -633,8 +640,9 @@ ASSERT_EQ("v3", Get("foo3")); ASSERT_OK(Put(1, "Jolteon", "v3")); ASSERT_EQ("v3", Get(1, 
"Jolteon")); - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(15); }); + + dbfull()->TEST_WaitForStatsDumpRun( + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flushing test cf // LogNumbers: default: 19, stats: 19, pikachu: 22 ASSERT_OK(Flush(1)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,9 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). #include "monitoring/thread_status_updater.h" + #include + #include "port/likely.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -159,7 +162,7 @@ std::vector* thread_list) { thread_list->clear(); std::vector> valid_list; - uint64_t now_micros = Env::Default()->NowMicros(); + uint64_t now_micros = SystemClock::Default()->NowMicros(); std::lock_guard lck(thread_list_mutex_); for (auto* thread_data : thread_data_set_) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ #include "db/column_family.h" #include "monitoring/thread_status_updater.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { @@ -19,7 +20,7 @@ assert(cf_info_map_.size() == handles.size()); } for (auto* handle : handles) { - auto* cfd = 
reinterpret_cast(handle)->cfd(); + auto* cfd = static_cast_with_check(handle)->cfd(); auto iter __attribute__((__unused__)) = cf_info_map_.find(cfd); if (check_exist) { assert(iter != cf_info_map_.end()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -57,7 +58,7 @@ } if (op != ThreadStatus::OP_UNKNOWN) { - uint64_t current_time = Env::Default()->NowMicros(); + uint64_t current_time = SystemClock::Default()->NowMicros(); thread_updater_local_cache_->SetOperationStartTime(current_time); } else { // TDOO(yhchiang): we could report the time when we set operation to diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/monitoring/thread_status_util_debug.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +23,7 @@ void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { auto delay = states_delay[state].load(std::memory_order_relaxed); if (delay > 0) { - Env::Default()->SleepForMicroseconds(delay); + SystemClock::Default()->SleepForMicroseconds(delay); } } diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,27 +9,833 @@ #include #include #include + +#include "logging/logging.h" +#include "options/configurable_helper.h" #include "options/db_options.h" +#include "options/options_helper.h" +#include "options/options_parser.h" #include "port/port.h" +#include "rocksdb/compaction_filter.h" #include "rocksdb/concurrent_task_limiter.h" +#include "rocksdb/configurable.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyOptions::num_levels) +// This call will return the offset of num_levels in ColumnFamilyOptions class +// +// This is the same as offsetof() but allow us to work with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +#ifndef ROCKSDB_LITE +static ImmutableCFOptions dummy_cf_options; +template +int offset_of(T1 ImmutableCFOptions::*member) { + return int(size_t(&(dummy_cf_options.*member)) - size_t(&dummy_cf_options)); +} + +static Status ParseCompressionOptions(const std::string& value, + const std::string& name, + CompressionOptions& compression_opts) { + const char kDelimiter = ':'; + std::istringstream field_stream(value); + std::string field; + + if (!std::getline(field_stream, field, kDelimiter)) { + return 
Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.window_bits = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.level = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + compression_opts.strategy = ParseInt(field); + + // max_dict_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_bytes = ParseInt(field); + } + + // zstd_max_train_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.zstd_max_train_bytes = ParseInt(field); + } + + // parallel_threads is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + // Since parallel_threads comes before enabled but was added optionally + // later, we need to check if this is the final token (meaning it is the + // enabled bit), or if there are more tokens (meaning this one is + // parallel_threads). 
+ if (!field_stream.eof()) { + compression_opts.parallel_threads = ParseInt(field); + } else { + // parallel_threads is not serialized with this format, but enabled is + compression_opts.enabled = ParseBoolean("", field); + } + } + + // enabled is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.enabled = ParseBoolean("", field); + } + + // max_dict_buffer_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_buffer_bytes = ParseUint64(field); + } + + if (!field_stream.eof()) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); + } + return Status::OK(); +} + +const std::string kOptNameBMCompOpts = "bottommost_compression_opts"; +const std::string kOptNameCompOpts = "compression_opts"; + +// OptionTypeInfo map for CompressionOptions +static std::unordered_map + compression_options_type_info = { + {"window_bits", + {offsetof(struct CompressionOptions, window_bits), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"level", + {offsetof(struct CompressionOptions, level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"strategy", + {offsetof(struct CompressionOptions, strategy), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_dict_bytes", + {offsetof(struct CompressionOptions, max_dict_bytes), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"zstd_max_train_bytes", + {offsetof(struct CompressionOptions, zstd_max_train_bytes), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, 
+ {"parallel_threads", + {offsetof(struct CompressionOptions, parallel_threads), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enabled", + {offsetof(struct CompressionOptions, enabled), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_dict_buffer_bytes", + {offsetof(struct CompressionOptions, max_dict_buffer_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + fifo_compaction_options_type_info = { + {"max_table_files_size", + {offsetof(struct CompactionOptionsFIFO, max_table_files_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"age_for_warm", + {offsetof(struct CompactionOptionsFIFO, age_for_warm), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"ttl", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"allow_compaction", + {offsetof(struct CompactionOptionsFIFO, allow_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + universal_compaction_options_type_info = { + {"size_ratio", + {offsetof(class CompactionOptionsUniversal, size_ratio), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_merge_width", + {offsetof(class CompactionOptionsUniversal, min_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_merge_width", + {offsetof(class CompactionOptionsUniversal, max_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_size_amplification_percent", + {offsetof(class CompactionOptionsUniversal, + max_size_amplification_percent), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + 
{"compression_size_percent", + {offsetof(class CompactionOptionsUniversal, compression_size_percent), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stop_style", + {offsetof(class CompactionOptionsUniversal, stop_style), + OptionType::kCompactionStopStyle, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"incremental", + {offsetof(class CompactionOptionsUniversal, incremental), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"allow_trivial_move", + {offsetof(class CompactionOptionsUniversal, allow_trivial_move), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}}; + +static std::unordered_map + cf_mutable_options_type_info = { + {"report_bg_io_stats", + {offsetof(struct MutableCFOptions, report_bg_io_stats), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"disable_auto_compactions", + {offsetof(struct MutableCFOptions, disable_auto_compactions), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"filter_deletes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"check_flush_compaction_key_order", + {offsetof(struct MutableCFOptions, check_flush_compaction_key_order), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"paranoid_file_checks", + {offsetof(struct MutableCFOptions, paranoid_file_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"verify_checksums_in_compaction", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"soft_pending_compaction_bytes_limit", + {offsetof(struct MutableCFOptions, + soft_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"hard_pending_compaction_bytes_limit", 
+ {offsetof(struct MutableCFOptions, + hard_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"hard_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"soft_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_compaction_bytes", + {offsetof(struct MutableCFOptions, max_compaction_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"expanded_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"level0_file_num_compaction_trigger", + {offsetof(struct MutableCFOptions, level0_file_num_compaction_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"level0_slowdown_writes_trigger", + {offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"level0_stop_writes_trigger", + {offsetof(struct MutableCFOptions, level0_stop_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_grandparent_overlap_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_write_buffer_number", + {offsetof(struct MutableCFOptions, max_write_buffer_number), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"source_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"target_file_size_multiplier", + {offsetof(struct MutableCFOptions, target_file_size_multiplier), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"arena_block_size", + {offsetof(struct MutableCFOptions, arena_block_size), + OptionType::kSizeT, 
OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"inplace_update_num_locks", + {offsetof(struct MutableCFOptions, inplace_update_num_locks), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_successive_merges", + {offsetof(struct MutableCFOptions, max_successive_merges), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_huge_page_size", + {offsetof(struct MutableCFOptions, memtable_huge_page_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_huge_page_tlb_size", + {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"write_buffer_size", + {offsetof(struct MutableCFOptions, write_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_bits", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_size_ratio", + {offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"memtable_prefix_bloom_probes", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"memtable_whole_key_filtering", + {offsetof(struct MutableCFOptions, memtable_whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_partial_merge_operands", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_bytes_for_level_base", + {offsetof(struct MutableCFOptions, max_bytes_for_level_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"snap_refresh_nanos", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + 
{"max_bytes_for_level_multiplier", + {offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_bytes_for_level_multiplier_additional", + OptionTypeInfo::Vector( + offsetof(struct MutableCFOptions, + max_bytes_for_level_multiplier_additional), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable, + {0, OptionType::kInt})}, + {"max_sequential_skip_in_iterations", + {offsetof(struct MutableCFOptions, max_sequential_skip_in_iterations), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"target_file_size_base", + {offsetof(struct MutableCFOptions, target_file_size_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compression", + {offsetof(struct MutableCFOptions, compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prefix_extractor", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct MutableCFOptions, prefix_extractor), + OptionVerificationType::kByNameAllowNull, + (OptionTypeFlags::kMutable | OptionTypeFlags::kAllowNull))}, + {"compaction_options_fifo", + OptionTypeInfo::Struct( + "compaction_options_fifo", &fifo_compaction_options_type_info, + offsetof(struct MutableCFOptions, compaction_options_fifo), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable, + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compaction_options_fifo could be assigned a single scalar + // value, say, like "23", which would be assigned to + // max_table_files_size. + if (name == "compaction_options_fifo" && + value.find("=") == std::string::npos) { + // Old format. Parse just a single uint64_t value. 
+ auto options = static_cast(addr); + options->max_table_files_size = ParseUint64(value); + return Status::OK(); + } else { + return OptionTypeInfo::ParseStruct( + opts, "compaction_options_fifo", + &fifo_compaction_options_type_info, name, value, addr); + } + })}, + {"compaction_options_universal", + OptionTypeInfo::Struct( + "compaction_options_universal", + &universal_compaction_options_type_info, + offsetof(struct MutableCFOptions, compaction_options_universal), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}, + {"ttl", + {offsetof(struct MutableCFOptions, ttl), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"periodic_compaction_seconds", + {offsetof(struct MutableCFOptions, periodic_compaction_seconds), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enable_blob_files", + {offsetof(struct MutableCFOptions, enable_blob_files), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"min_blob_size", + {offsetof(struct MutableCFOptions, min_blob_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_file_size", + {offsetof(struct MutableCFOptions, blob_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_compression_type", + {offsetof(struct MutableCFOptions, blob_compression_type), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"enable_blob_garbage_collection", + {offsetof(struct MutableCFOptions, enable_blob_garbage_collection), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_age_cutoff", + {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_force_threshold", + 
{offsetof(struct MutableCFOptions, + blob_garbage_collection_force_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_compaction_readahead_size", + {offsetof(struct MutableCFOptions, blob_compaction_readahead_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"sample_for_compression", + {offsetof(struct MutableCFOptions, sample_for_compression), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"bottommost_compression", + {offsetof(struct MutableCFOptions, bottommost_compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {kOptNameCompOpts, + OptionTypeInfo::Struct( + kOptNameCompOpts, &compression_options_type_info, + offsetof(struct MutableCFOptions, compression_opts), + OptionVerificationType::kNormal, + (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compression_options was a ":" separated list. + if (name == kOptNameCompOpts && + value.find("=") == std::string::npos) { + auto* compression = static_cast(addr); + return ParseCompressionOptions(value, name, *compression); + } else { + return OptionTypeInfo::ParseStruct( + opts, kOptNameCompOpts, &compression_options_type_info, + name, value, addr); + } + })}, + {kOptNameBMCompOpts, + OptionTypeInfo::Struct( + kOptNameBMCompOpts, &compression_options_type_info, + offsetof(struct MutableCFOptions, bottommost_compression_opts), + OptionVerificationType::kNormal, + (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + // This is to handle backward compatibility, where + // compression_options was a ":" separated list. 
+ if (name == kOptNameBMCompOpts && + value.find("=") == std::string::npos) { + auto* compression = static_cast(addr); + return ParseCompressionOptions(value, name, *compression); + } else { + return OptionTypeInfo::ParseStruct( + opts, kOptNameBMCompOpts, &compression_options_type_info, + name, value, addr); + } + })}, + // End special case properties +}; + +static std::unordered_map + cf_immutable_options_type_info = { + /* not yet supported + CompressionOptions compression_opts; + TablePropertiesCollectorFactories table_properties_collector_factories; + using TablePropertiesCollectorFactories = + std::vector>; + UpdateStatus (*inplace_callback)(char* existing_value, + uint34_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + std::vector cf_paths; + */ + {"compaction_measure_io_stats", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"inplace_update_support", + {offset_of(&ImmutableCFOptions::inplace_update_support), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"level_compaction_dynamic_level_bytes", + {offset_of(&ImmutableCFOptions::level_compaction_dynamic_level_bytes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"optimize_filters_for_hits", + {offset_of(&ImmutableCFOptions::optimize_filters_for_hits), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"force_consistency_checks", + {offset_of(&ImmutableCFOptions::force_consistency_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"purge_redundant_kvs_while_flush", + {offset_of(&ImmutableCFOptions::purge_redundant_kvs_while_flush), + OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"max_mem_compaction_level", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + 
{"max_write_buffer_number_to_maintain", + {offset_of(&ImmutableCFOptions::max_write_buffer_number_to_maintain), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, 0}}, + {"max_write_buffer_size_to_maintain", + {offset_of(&ImmutableCFOptions::max_write_buffer_size_to_maintain), + OptionType::kInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"min_write_buffer_number_to_merge", + {offset_of(&ImmutableCFOptions::min_write_buffer_number_to_merge), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, 0}}, + {"num_levels", + {offset_of(&ImmutableCFOptions::num_levels), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bloom_locality", + {offset_of(&ImmutableCFOptions::bloom_locality), OptionType::kUInt32T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"rate_limit_delay_max_milliseconds", + {0, OptionType::kUInt, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"compression_per_level", + OptionTypeInfo::Vector( + offset_of(&ImmutableCFOptions::compression_per_level), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kCompressionType})}, + {"comparator", + OptionTypeInfo::AsCustomRawPtr( + offset_of(&ImmutableCFOptions::user_comparator), + OptionVerificationType::kByName, OptionTypeFlags::kCompareLoose, + // Serializes a Comparator + [](const ConfigOptions& opts, const std::string&, const void* addr, + std::string* value) { + // it's a const pointer of const Comparator* + const auto* ptr = static_cast(addr); + + // Since the user-specified comparator will be wrapped by + // InternalKeyComparator, we should persist the user-specified + // one instead of InternalKeyComparator. 
+ if (*ptr == nullptr) { + *value = kNullptrString; + } else if (opts.mutable_options_only) { + *value = ""; + } else { + const Comparator* root_comp = (*ptr)->GetRootComparator(); + if (root_comp == nullptr) { + root_comp = (*ptr); + } + *value = root_comp->ToString(opts); + } + return Status::OK(); + }, + /* Use the default match function*/ nullptr)}, + {"memtable_insert_with_hint_prefix_extractor", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions:: + memtable_insert_with_hint_prefix_extractor), + OptionVerificationType::kByNameAllowNull, OptionTypeFlags::kNone)}, + {"memtable_factory", + {offset_of(&ImmutableCFOptions::memtable_factory), + OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + std::unique_ptr factory; + auto* shared = + static_cast*>(addr); + Status s = + MemTableRepFactory::CreateFromString(opts, value, &factory); + if (factory && s.ok()) { + shared->reset(factory.release()); + } + return s; + }}}, + {"memtable", + {offset_of(&ImmutableCFOptions::memtable_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + std::unique_ptr factory; + auto* shared = + static_cast*>(addr); + Status s = + MemTableRepFactory::CreateFromString(opts, value, &factory); + if (factory && s.ok()) { + shared->reset(factory.release()); + } + return s; + }}}, + {"table_factory", OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::table_factory), + OptionVerificationType::kByName, + (OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kStringNameOnly | + OptionTypeFlags::kDontPrepare))}, + {"block_based_table_factory", + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared | 
OptionTypeFlags::kCompareLoose, + // Parses the input value and creates a BlockBasedTableFactory + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + BlockBasedTableOptions* old_opts = nullptr; + auto table_factory = + static_cast*>(addr); + if (table_factory->get() != nullptr) { + old_opts = + table_factory->get()->GetOptions(); + } + if (name == "block_based_table_factory") { + std::unique_ptr new_factory; + if (old_opts != nullptr) { + new_factory.reset(NewBlockBasedTableFactory(*old_opts)); + } else { + new_factory.reset(NewBlockBasedTableFactory()); + } + Status s = new_factory->ConfigureFromString(opts, value); + if (s.ok()) { + table_factory->reset(new_factory.release()); + } + return s; + } else if (old_opts != nullptr) { + return table_factory->get()->ConfigureOption(opts, name, value); + } else { + return Status::NotFound("Mismatched table option: ", name); + } + }}}, + {"plain_table_factory", + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, + // Parses the input value and creates a PlainTableFactory + [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + PlainTableOptions* old_opts = nullptr; + auto table_factory = + static_cast*>(addr); + if (table_factory->get() != nullptr) { + old_opts = table_factory->get()->GetOptions(); + } + if (name == "plain_table_factory") { + std::unique_ptr new_factory; + if (old_opts != nullptr) { + new_factory.reset(NewPlainTableFactory(*old_opts)); + } else { + new_factory.reset(NewPlainTableFactory()); + } + Status s = new_factory->ConfigureFromString(opts, value); + if (s.ok()) { + table_factory->reset(new_factory.release()); + } + return s; + } else if (old_opts != nullptr) { + return table_factory->get()->ConfigureOption(opts, name, value); + } else { + return Status::NotFound("Mismatched table option: ", 
name); + } + }}}, + {"table_properties_collectors", + OptionTypeInfo::Vector< + std::shared_ptr>( + offset_of( + &ImmutableCFOptions::table_properties_collector_factories), + OptionVerificationType::kByName, OptionTypeFlags::kNone, + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByName, OptionTypeFlags::kNone))}, + {"compaction_filter", + OptionTypeInfo::AsCustomRawPtr( + offset_of(&ImmutableCFOptions::compaction_filter), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"compaction_filter_factory", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::compaction_filter_factory), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"merge_operator", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::merge_operator), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kCompareLoose | OptionTypeFlags::kAllowNull)}, + {"compaction_style", + {offset_of(&ImmutableCFOptions::compaction_style), + OptionType::kCompactionStyle, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"compaction_pri", + {offset_of(&ImmutableCFOptions::compaction_pri), + OptionType::kCompactionPri, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"sst_partitioner_factory", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::sst_partitioner_factory), + OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, +}; -ImmutableCFOptions::ImmutableCFOptions(const Options& options) - : ImmutableCFOptions(ImmutableDBOptions(options), options) {} +const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; -ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options) +class ConfigurableMutableCFOptions : public Configurable { + public: + explicit ConfigurableMutableCFOptions(const MutableCFOptions& mcf) { + mutable_ = mcf; + RegisterOptions(&mutable_, &cf_mutable_options_type_info); + 
} + + protected: + MutableCFOptions mutable_; +}; + +class ConfigurableCFOptions : public ConfigurableMutableCFOptions { + public: + ConfigurableCFOptions(const ColumnFamilyOptions& opts, + const std::unordered_map* map) + : ConfigurableMutableCFOptions(MutableCFOptions(opts)), + immutable_(opts), + cf_options_(opts), + opt_map_(map) { + RegisterOptions(&immutable_, &cf_immutable_options_type_info); + } + + protected: + Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) override { + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); + if (s.ok()) { + UpdateColumnFamilyOptions(mutable_, &cf_options_); + UpdateColumnFamilyOptions(immutable_, &cf_options_); + s = PrepareOptions(config_options); + } + return s; + } + + virtual const void* GetOptionsPtr(const std::string& name) const override { + if (name == OptionsHelper::kCFOptionsName) { + return &cf_options_; + } else { + return ConfigurableMutableCFOptions::GetOptionsPtr(name); + } + } + + bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const override { + bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr, + that_ptr, mismatch); + if (!equals && opt_info.IsByName()) { + if (opt_map_ == nullptr) { + equals = true; + } else { + const auto& iter = opt_map_->find(opt_name); + if (iter == opt_map_->end()) { + equals = true; + } else { + equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr, + iter->second); + } + } + if (equals) { // False alarm, clear mismatch + *mismatch = ""; + } + } + if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) { + const auto* this_config = opt_info.AsRawPointer(this_ptr); + if (this_config == nullptr) { + const auto& iter = opt_map_->find(opt_name); + // If the name exists in the map and 
is not empty/null, + // then the this_config should be set. + if (iter != opt_map_->end() && !iter->second.empty() && + iter->second != kNullptrString) { + *mismatch = opt_name; + equals = false; + } + } + } + return equals; + } + + private: + ImmutableCFOptions immutable_; + ColumnFamilyOptions cf_options_; + const std::unordered_map* opt_map_; +}; + +std::unique_ptr CFOptionsAsConfigurable( + const MutableCFOptions& opts) { + std::unique_ptr ptr(new ConfigurableMutableCFOptions(opts)); + return ptr; +} +std::unique_ptr CFOptionsAsConfigurable( + const ColumnFamilyOptions& opts, + const std::unordered_map* opt_map) { + std::unique_ptr ptr(new ConfigurableCFOptions(opts, opt_map)); + return ptr; +} +#endif // ROCKSDB_LITE + +ImmutableCFOptions::ImmutableCFOptions() : ImmutableCFOptions(Options()) {} + +ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) : compaction_style(cf_options.compaction_style), compaction_pri(cf_options.compaction_pri), user_comparator(cf_options.comparator), internal_comparator(InternalKeyComparator(cf_options.comparator)), - merge_operator(cf_options.merge_operator.get()), + merge_operator(cf_options.merge_operator), compaction_filter(cf_options.compaction_filter), - compaction_filter_factory(cf_options.compaction_filter_factory.get()), + compaction_filter_factory(cf_options.compaction_filter_factory), min_write_buffer_number_to_merge( cf_options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( @@ -38,47 +844,45 @@ cf_options.max_write_buffer_size_to_maintain), inplace_update_support(cf_options.inplace_update_support), inplace_callback(cf_options.inplace_callback), - info_log(db_options.info_log.get()), - statistics(db_options.statistics.get()), - rate_limiter(db_options.rate_limiter.get()), - info_log_level(db_options.info_log_level), - env(db_options.env), - fs(db_options.fs.get()), - allow_mmap_reads(db_options.allow_mmap_reads), - allow_mmap_writes(db_options.allow_mmap_writes), - 
db_paths(db_options.db_paths), - memtable_factory(cf_options.memtable_factory.get()), - table_factory(cf_options.table_factory.get()), + memtable_factory(cf_options.memtable_factory), + table_factory(cf_options.table_factory), table_properties_collector_factories( cf_options.table_properties_collector_factories), - advise_random_on_open(db_options.advise_random_on_open), bloom_locality(cf_options.bloom_locality), purge_redundant_kvs_while_flush( cf_options.purge_redundant_kvs_while_flush), - use_fsync(db_options.use_fsync), compression_per_level(cf_options.compression_per_level), - bottommost_compression(cf_options.bottommost_compression), - bottommost_compression_opts(cf_options.bottommost_compression_opts), - compression_opts(cf_options.compression_opts), level_compaction_dynamic_level_bytes( cf_options.level_compaction_dynamic_level_bytes), - access_hint_on_compaction_start( - db_options.access_hint_on_compaction_start), - new_table_reader_for_compaction_inputs( - db_options.new_table_reader_for_compaction_inputs), num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), - allow_ingest_behind(db_options.allow_ingest_behind), - preserve_deletes(db_options.preserve_deletes), - listeners(db_options.listeners), - row_cache(db_options.row_cache), - max_subcompactions(db_options.max_subcompactions), memtable_insert_with_hint_prefix_extractor( - cf_options.memtable_insert_with_hint_prefix_extractor.get()), + cf_options.memtable_insert_with_hint_prefix_extractor), cf_paths(cf_options.cf_paths), compaction_thread_limiter(cf_options.compaction_thread_limiter), - sst_file_checksum_func(db_options.sst_file_checksum_func.get()) {} + sst_partitioner_factory(cf_options.sst_partitioner_factory) {} + +ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} + +ImmutableOptions::ImmutableOptions(const Options& options) + : ImmutableOptions(options, options) {} + 
+ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} // Multiple two operands. If they overflow, return op1. uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { @@ -109,6 +913,17 @@ } } +size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options) { + // We do not want to pin meta-blocks that almost certainly came from intra-L0 + // or a former larger `write_buffer_size` value to avoid surprising users with + // pinned memory usage. We use a factor of 1.5 to account for overhead + // introduced during flush in most cases. + if (port::kMaxSizet / 3 < cf_options.write_buffer_size / 2) { + return port::kMaxSizet; + } + return cf_options.write_buffer_size / 2 * 3; +} + void MutableCFOptions::RefreshDerivedOptions(int num_levels, CompactionStyle compaction_style) { max_file_size.resize(num_levels); @@ -147,9 +962,10 @@ ROCKS_LOG_INFO(log, " inplace_update_num_locks: %" ROCKSDB_PRIszt, inplace_update_num_locks); - ROCKS_LOG_INFO( - log, " prefix_extractor: %s", - prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); + ROCKS_LOG_INFO(log, " prefix_extractor: %s", + prefix_extractor == nullptr + ? 
"nullptr" + : prefix_extractor->GetId().c_str()); ROCKS_LOG_INFO(log, " disable_auto_compactions: %d", disable_auto_compactions); ROCKS_LOG_INFO(log, " soft_pending_compaction_bytes_limit: %" PRIu64, @@ -192,6 +1008,8 @@ result.c_str()); ROCKS_LOG_INFO(log, " max_sequential_skip_in_iterations: %" PRIu64, max_sequential_skip_in_iterations); + ROCKS_LOG_INFO(log, " check_flush_compaction_key_order: %d", + check_flush_compaction_key_order); ROCKS_LOG_INFO(log, " paranoid_file_checks: %d", paranoid_file_checks); ROCKS_LOG_INFO(log, " report_bg_io_stats: %d", @@ -217,15 +1035,60 @@ ROCKS_LOG_INFO( log, "compaction_options_universal.allow_trivial_move : %d", static_cast(compaction_options_universal.allow_trivial_move)); + ROCKS_LOG_INFO(log, "compaction_options_universal.incremental : %d", + static_cast(compaction_options_universal.incremental)); // FIFO Compaction Options ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64, compaction_options_fifo.max_table_files_size); ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction : %d", compaction_options_fifo.allow_compaction); + + // Blob file related options + ROCKS_LOG_INFO(log, " enable_blob_files: %s", + enable_blob_files ? "true" : "false"); + ROCKS_LOG_INFO(log, " min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_INFO(log, " blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_INFO(log, " blob_compression_type: %s", + CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_INFO(log, " enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? 
"true" : "false"); + ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); + ROCKS_LOG_INFO(log, " blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); + ROCKS_LOG_INFO(log, " blob_compaction_readahead_size: %" PRIu64, + blob_compaction_readahead_size); } MutableCFOptions::MutableCFOptions(const Options& options) : MutableCFOptions(ColumnFamilyOptions(options)) {} +#ifndef ROCKSDB_LITE +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* /*info_log*/, MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, cf_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; +} + +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string) { + assert(opt_string); + opt_string->clear(); + return OptionTypeInfo::SerializeType( + config_options, cf_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/cf_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/cf_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,23 +20,23 @@ // of DB. Raw pointers defined in this struct do not have ownership to the data // they point to. Options contains std::shared_ptr to these data. 
struct ImmutableCFOptions { - explicit ImmutableCFOptions(const Options& options); - - ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options); + public: + static const char* kName() { return "ImmutableCFOptions"; } + explicit ImmutableCFOptions(); + explicit ImmutableCFOptions(const ColumnFamilyOptions& cf_options); CompactionStyle compaction_style; CompactionPri compaction_pri; const Comparator* user_comparator; - InternalKeyComparator internal_comparator; + InternalKeyComparator internal_comparator; // Only in Immutable - MergeOperator* merge_operator; + std::shared_ptr merge_operator; const CompactionFilter* compaction_filter; - CompactionFilterFactory* compaction_filter_factory; + std::shared_ptr compaction_filter_factory; int min_write_buffer_number_to_merge; @@ -51,85 +51,58 @@ Slice delta_value, std::string* merged_value); - Logger* info_log; - - Statistics* statistics; - - RateLimiter* rate_limiter; - - InfoLogLevel info_log_level; - - Env* env; - - FileSystem* fs; - - // Allow the OS to mmap file for reading sst tables. Default: false - bool allow_mmap_reads; - - // Allow the OS to mmap file for writing. Default: false - bool allow_mmap_writes; + std::shared_ptr memtable_factory; - std::vector db_paths; - - MemTableRepFactory* memtable_factory; - - TableFactory* table_factory; + std::shared_ptr table_factory; Options::TablePropertiesCollectorFactories table_properties_collector_factories; - bool advise_random_on_open; - // This options is required by PlainTableReader. 
May need to move it // to PlainTableOptions just like bloom_bits_per_key uint32_t bloom_locality; bool purge_redundant_kvs_while_flush; - bool use_fsync; - std::vector compression_per_level; - CompressionType bottommost_compression; - - CompressionOptions bottommost_compression_opts; - - CompressionOptions compression_opts; - bool level_compaction_dynamic_level_bytes; - Options::AccessHint access_hint_on_compaction_start; - - bool new_table_reader_for_compaction_inputs; - int num_levels; bool optimize_filters_for_hits; bool force_consistency_checks; - bool allow_ingest_behind; + std::shared_ptr + memtable_insert_with_hint_prefix_extractor; - bool preserve_deletes; + std::vector cf_paths; - // A vector of EventListeners which callback functions will be called - // when specific RocksDB event happens. - std::vector> listeners; + std::shared_ptr compaction_thread_limiter; - std::shared_ptr row_cache; + std::shared_ptr sst_partitioner_factory; +}; - uint32_t max_subcompactions; +struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions { + explicit ImmutableOptions(); + explicit ImmutableOptions(const Options& options); - const SliceTransform* memtable_insert_with_hint_prefix_extractor; + ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); - std::vector cf_paths; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options); - std::shared_ptr compaction_thread_limiter; + ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options); - FileChecksumFunc* sst_file_checksum_func; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options); }; struct MutableCFOptions { + static const char* kName() { return "MutableCFOptions"; } explicit MutableCFOptions(const ColumnFamilyOptions& options) : write_buffer_size(options.write_buffer_size), max_write_buffer_number(options.max_write_buffer_number), @@ -161,12 +134,29 @@ 
options.max_bytes_for_level_multiplier_additional), compaction_options_fifo(options.compaction_options_fifo), compaction_options_universal(options.compaction_options_universal), + enable_blob_files(options.enable_blob_files), + min_blob_size(options.min_blob_size), + blob_file_size(options.blob_file_size), + blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold), + blob_compaction_readahead_size(options.blob_compaction_readahead_size), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), + check_flush_compaction_key_order( + options.check_flush_compaction_key_order), paranoid_file_checks(options.paranoid_file_checks), report_bg_io_stats(options.report_bg_io_stats), compression(options.compression), - sample_for_compression(options.sample_for_compression) { + bottommost_compression(options.bottommost_compression), + compression_opts(options.compression_opts), + bottommost_compression_opts(options.bottommost_compression_opts), + bottommost_temperature(options.bottommost_temperature), + sample_for_compression( + options.sample_for_compression) { // TODO: is 0 fine here? RefreshDerivedOptions(options.num_levels, options.compaction_style); } @@ -194,10 +184,21 @@ ttl(0), periodic_compaction_seconds(0), compaction_options_fifo(), + enable_blob_files(false), + min_blob_size(0), + blob_file_size(0), + blob_compression_type(kNoCompression), + enable_blob_garbage_collection(false), + blob_garbage_collection_age_cutoff(0.0), + blob_garbage_collection_force_threshold(0.0), + blob_compaction_readahead_size(0), max_sequential_skip_in_iterations(0), + check_flush_compaction_key_order(true), paranoid_file_checks(false), report_bg_io_stats(false), compression(Snappy_Supported() ? 
kSnappyCompression : kNoCompression), + bottommost_compression(kDisableCompressionOption), + bottommost_temperature(Temperature::kUnknown), sample_for_compression(0) {} explicit MutableCFOptions(const Options& options); @@ -248,11 +249,29 @@ CompactionOptionsFIFO compaction_options_fifo; CompactionOptionsUniversal compaction_options_universal; + // Blob file related options + bool enable_blob_files; + uint64_t min_blob_size; + uint64_t blob_file_size; + CompressionType blob_compression_type; + bool enable_blob_garbage_collection; + double blob_garbage_collection_age_cutoff; + double blob_garbage_collection_force_threshold; + uint64_t blob_compaction_readahead_size; + // Misc options uint64_t max_sequential_skip_in_iterations; + bool check_flush_compaction_key_order; bool paranoid_file_checks; bool report_bg_io_stats; CompressionType compression; + CompressionType bottommost_compression; + CompressionOptions compression_opts; + CompressionOptions bottommost_compression_opts; + // TODO this experimental option isn't made configurable + // through strings yet. + Temperature bottommost_temperature; + uint64_t sample_for_compression; // Derived options @@ -266,4 +285,20 @@ uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options, int level, CompactionStyle compaction_style, int base_level = 1, bool level_compaction_dynamic_level_bytes = false); + +// Get the max size of an L0 file for which we will pin its meta-blocks when +// `pin_l0_filter_and_index_blocks_in_cache` is set. 
+size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options); + +#ifndef ROCKSDB_LITE +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* info_log, MutableCFOptions* new_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,785 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/configurable.h" + +#include "logging/logging.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +void Configurable::RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* type_map) { + RegisteredOptions opts; + opts.name = name; +#ifndef ROCKSDB_LITE + opts.type_map = type_map; +#else + (void)type_map; +#endif // ROCKSDB_LITE + opts.opt_ptr = opt_ptr; + options_.emplace_back(opts); +} + +//************************************************************************* +// +// Methods for Initializing and Validating Configurable Objects +// +//************************************************************************* + +Status Configurable::PrepareOptions(const ConfigOptions& opts) { + // We ignore the invoke_prepare_options here intentionally, + // as if you are here, you must have called PrepareOptions explicitly. 
+ Status status = Status::OK(); +#ifndef ROCKSDB_LITE + for (auto opt_iter : options_) { + if (opt_iter.type_map != nullptr) { + for (auto map_iter : *(opt_iter.type_map)) { + auto& opt_info = map_iter.second; + if (!opt_info.IsDeprecated() && !opt_info.IsAlias() && + opt_info.IsConfigurable()) { + if (!opt_info.IsEnabled(OptionTypeFlags::kDontPrepare)) { + Configurable* config = + opt_info.AsRawPointer(opt_iter.opt_ptr); + if (config != nullptr) { + status = config->PrepareOptions(opts); + } else if (!opt_info.CanBeNull()) { + status = Status::NotFound("Missing configurable object", + map_iter.first); + } + if (!status.ok()) { + return status; + } + } + } + } + } + } +#else + (void)opts; +#endif // ROCKSDB_LITE + return status; +} + +Status Configurable::ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const { + Status status; +#ifndef ROCKSDB_LITE + for (auto opt_iter : options_) { + if (opt_iter.type_map != nullptr) { + for (auto map_iter : *(opt_iter.type_map)) { + auto& opt_info = map_iter.second; + if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) { + if (opt_info.IsConfigurable()) { + const Configurable* config = + opt_info.AsRawPointer(opt_iter.opt_ptr); + if (config != nullptr) { + status = config->ValidateOptions(db_opts, cf_opts); + } else if (!opt_info.CanBeNull()) { + status = Status::NotFound("Missing configurable object", + map_iter.first); + } + if (!status.ok()) { + return status; + } + } + } + } + } + } +#else + (void)db_opts; + (void)cf_opts; +#endif // ROCKSDB_LITE + return status; +} + +/*********************************************************************************/ +/* */ +/* Methods for Retrieving Options from Configurables */ +/* */ +/*********************************************************************************/ + +const void* Configurable::GetOptionsPtr(const std::string& name) const { + for (auto o : options_) { + if (o.name == name) { + return o.opt_ptr; + } + } + return nullptr; +} + +std::string 
Configurable::GetOptionName(const std::string& opt_name) const { + return opt_name; +} + +#ifndef ROCKSDB_LITE +const OptionTypeInfo* ConfigurableHelper::FindOption( + const std::vector& options, + const std::string& short_name, std::string* opt_name, void** opt_ptr) { + for (auto iter : options) { + if (iter.type_map != nullptr) { + const auto opt_info = + OptionTypeInfo::Find(short_name, *(iter.type_map), opt_name); + if (opt_info != nullptr) { + *opt_ptr = iter.opt_ptr; + return opt_info; + } + } + } + return nullptr; +} +#endif // ROCKSDB_LITE + +//************************************************************************* +// +// Methods for Configuring Options from Strings/Name-Value Pairs/Maps +// +//************************************************************************* + +Status Configurable::ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opts_map) { + Status s = ConfigureFromMap(config_options, opts_map, nullptr); + return s; +} + +Status Configurable::ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + return ConfigureOptions(config_options, opts_map, unused); +} + +Status Configurable::ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + std::string curr_opts; + Status s; + if (!opts_map.empty()) { + // There are options in the map. + // Save the current configuration in curr_opts and then configure the + // options, but do not prepare them now. We will do all the prepare when + // the configuration is complete. 
+ ConfigOptions copy = config_options; + copy.invoke_prepare_options = false; +#ifndef ROCKSDB_LITE + if (!config_options.ignore_unknown_options) { + // If we are not ignoring unused, get the defaults in case we need to + // reset + copy.depth = ConfigOptions::kDepthDetailed; + copy.delimiter = "; "; + GetOptionString(copy, &curr_opts).PermitUncheckedError(); + } +#endif // ROCKSDB_LITE + + s = ConfigurableHelper::ConfigureOptions(copy, *this, opts_map, unused); + } + if (config_options.invoke_prepare_options && s.ok()) { + s = PrepareOptions(config_options); + } +#ifndef ROCKSDB_LITE + if (!s.ok() && !curr_opts.empty()) { + ConfigOptions reset = config_options; + reset.ignore_unknown_options = true; + reset.invoke_prepare_options = true; + reset.ignore_unsupported_options = true; + // There are some options to reset from this current error + ConfigureFromString(reset, curr_opts).PermitUncheckedError(); + } +#endif // ROCKSDB_LITE + return s; +} + +Status Configurable::ParseStringOptions(const ConfigOptions& /*config_options*/, + const std::string& /*opts_str*/) { + return Status::OK(); +} + +Status Configurable::ConfigureFromString(const ConfigOptions& config_options, + const std::string& opts_str) { + Status s; + if (!opts_str.empty()) { +#ifndef ROCKSDB_LITE + if (opts_str.find(';') != std::string::npos || + opts_str.find('=') != std::string::npos) { + std::unordered_map opt_map; + s = StringToMap(opts_str, &opt_map); + if (s.ok()) { + s = ConfigureFromMap(config_options, opt_map, nullptr); + } + } else { +#endif // ROCKSDB_LITE + s = ParseStringOptions(config_options, opts_str); + if (s.ok() && config_options.invoke_prepare_options) { + s = PrepareOptions(config_options); + } +#ifndef ROCKSDB_LITE + } +#endif // ROCKSDB_LITE + } else if (config_options.invoke_prepare_options) { + s = PrepareOptions(config_options); + } else { + s = Status::OK(); + } + return s; +} + +#ifndef ROCKSDB_LITE +/** + * Sets the value of the named property to the input value, 
returning OK on + * succcess. + */ +Status Configurable::ConfigureOption(const ConfigOptions& config_options, + const std::string& name, + const std::string& value) { + return ConfigurableHelper::ConfigureSingleOption(config_options, *this, name, + value); +} + +/** + * Looks for the named option amongst the options for this type and sets + * the value for it to be the input value. + * If the name was found, found_option will be set to true and the resulting + * status should be returned. + */ + +Status Configurable::ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, void* opt_ptr) { + if (opt_info.IsMutable()) { + if (config_options.mutable_options_only) { + // This option is mutable. Treat all of its children as mutable as well + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + return opt_info.Parse(copy, opt_name, opt_value, opt_ptr); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } + } else if (config_options.mutable_options_only) { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } +} + +#endif // ROCKSDB_LITE + +Status ConfigurableHelper::ConfigureOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& opts_map, + std::unordered_map* unused) { + std::unordered_map remaining = opts_map; + Status s = Status::OK(); + if (!opts_map.empty()) { +#ifndef ROCKSDB_LITE + for (const auto& iter : configurable.options_) { + if (iter.type_map != nullptr) { + s = ConfigureSomeOptions(config_options, configurable, *(iter.type_map), + &remaining, iter.opt_ptr); + if (remaining.empty()) { // Are there more options left? 
+ break; + } else if (!s.ok()) { + break; + } + } + } +#else + (void)configurable; + if (!config_options.ignore_unknown_options) { + s = Status::NotSupported("ConfigureFromMap not supported in LITE mode"); + } +#endif // ROCKSDB_LITE + } + if (unused != nullptr && !remaining.empty()) { + unused->insert(remaining.begin(), remaining.end()); + } + if (config_options.ignore_unknown_options) { + s = Status::OK(); + } else if (s.ok() && unused == nullptr && !remaining.empty()) { + s = Status::NotFound("Could not find option: ", remaining.begin()->first); + } + return s; +} + +#ifndef ROCKSDB_LITE +/** + * Updates the object with the named-value property values, returning OK on + * succcess. Any properties that were found are removed from the options list; + * upon return only options that were not found in this opt_map remain. + + * Returns: + * - OK if ignore_unknown_options is set + * - InvalidArgument, if any option was invalid + * - NotSupported, if any option is unsupported and ignore_unsupported_options + is OFF + * - OK, if no option was invalid or not supported (or ignored) + */ +Status ConfigurableHelper::ConfigureSomeOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& type_map, + std::unordered_map* options, void* opt_ptr) { + Status result = Status::OK(); // The last non-OK result (if any) + Status notsup = Status::OK(); // The last NotSupported result (if any) + std::string elem_name; + int found = 1; + std::unordered_set unsupported; + // While there are unused properties and we processed at least one, + // go through the remaining unused properties and attempt to configure them. 
+ while (found > 0 && !options->empty()) { + found = 0; + notsup = Status::OK(); + for (auto it = options->begin(); it != options->end();) { + const std::string& opt_name = configurable.GetOptionName(it->first); + const std::string& opt_value = it->second; + const auto opt_info = + OptionTypeInfo::Find(opt_name, type_map, &elem_name); + if (opt_info == nullptr) { // Did not find the option. Skip it + ++it; + } else { + Status s = ConfigureOption(config_options, configurable, *opt_info, + opt_name, elem_name, opt_value, opt_ptr); + if (s.IsNotFound()) { + ++it; + } else if (s.IsNotSupported()) { + notsup = s; + unsupported.insert(it->first); + ++it; // Skip it for now + } else { + found++; + it = options->erase(it); + if (!s.ok()) { + result = s; + } + } + } + } // End for all remaining options + } // End while found one or options remain + + // Now that we have been through the list, remove any unsupported + for (auto u : unsupported) { + auto it = options->find(u); + if (it != options->end()) { + options->erase(it); + } + } + if (config_options.ignore_unknown_options) { + if (!result.ok()) result.PermitUncheckedError(); + if (!notsup.ok()) notsup.PermitUncheckedError(); + return Status::OK(); + } else if (!result.ok()) { + if (!notsup.ok()) notsup.PermitUncheckedError(); + return result; + } else if (config_options.ignore_unsupported_options) { + if (!notsup.ok()) notsup.PermitUncheckedError(); + return Status::OK(); + } else { + return notsup; + } +} + +Status ConfigurableHelper::ConfigureSingleOption( + const ConfigOptions& config_options, Configurable& configurable, + const std::string& name, const std::string& value) { + const std::string& opt_name = configurable.GetOptionName(name); + std::string elem_name; + void* opt_ptr = nullptr; + const auto opt_info = + FindOption(configurable.options_, opt_name, &elem_name, &opt_ptr); + if (opt_info == nullptr) { + return Status::NotFound("Could not find option: ", name); + } else { + return 
ConfigureOption(config_options, configurable, *opt_info, opt_name, + elem_name, value, opt_ptr); + } +} +Status ConfigurableHelper::ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr) { + Customizable* custom = opt_info.AsRawPointer(opt_ptr); + ConfigOptions copy = config_options; + if (opt_info.IsMutable()) { + // This option is mutable. Pass that property on to any subsequent calls + copy.mutable_options_only = false; + } + + if (opt_info.IsMutable() || !config_options.mutable_options_only) { + // Either the option is mutable, or we are processing all of the options + if (opt_name == name || name == OptionTypeInfo::kIdPropName() || + EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix())) { + return configurable.ParseOption(copy, opt_info, name, value, opt_ptr); + } else if (value.empty()) { + return Status::OK(); + } else if (custom == nullptr || !StartsWith(name, custom->GetId() + ".")) { + return configurable.ParseOption(copy, opt_info, name, value, opt_ptr); + } else if (value.find("=") != std::string::npos) { + return custom->ConfigureFromString(copy, value); + } else { + return custom->ConfigureOption(copy, name, value); + } + } else { + // We are processing immutable options, which means that we cannot change + // the Customizable object itself, but could change its mutable properties. + // Check to make sure that nothing is trying to change the Customizable + if (custom == nullptr) { + // We do not have a Customizable to configure. 
This is OK if the + // value is empty (nothing being configured) but an error otherwise + if (value.empty()) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (EndsWith(opt_name, OptionTypeInfo::kIdPropSuffix()) || + name == OptionTypeInfo::kIdPropName()) { + // We have a property of the form "id=value" or "table.id=value" + // This is OK if we ID/value matches the current customizable object + if (custom->GetId() == value) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (opt_name == name) { + // The properties are of one of forms: + // name = { id = id; prop1 = value1; ... } + // name = { prop1=value1; prop2=value2; ... } + // name = ID + // Convert the value to a map and extract the ID + // If the ID does not match that of the current customizable, return an + // error. Otherwise, update the current customizable via the properties + // map + std::unordered_map props; + std::string id; + Status s = + Configurable::GetOptionsMap(value, custom->GetId(), &id, &props); + if (!s.ok()) { + return s; + } else if (custom->GetId() != id) { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } else if (props.empty()) { + return Status::OK(); + } else { + return custom->ConfigureFromMap(copy, props); + } + } else { + // Attempting to configure one of the properties of the customizable + // Let it through + return custom->ConfigureOption(copy, name, value); + } + } +} + +Status ConfigurableHelper::ConfigureOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr) { + if (opt_info.IsCustomizable()) { + return ConfigureCustomizableOption(config_options, configurable, opt_info, + opt_name, name, value, opt_ptr); + } else if (opt_name == name) { + return 
configurable.ParseOption(config_options, opt_info, opt_name, value, + opt_ptr); + } else if (opt_info.IsStruct() || opt_info.IsConfigurable()) { + return configurable.ParseOption(config_options, opt_info, name, value, + opt_ptr); + } else { + return Status::NotFound("Could not find option: ", name); + } +} +#endif // ROCKSDB_LITE + +//******************************************************************************* +// +// Methods for Converting Options into strings +// +//******************************************************************************* + +Status Configurable::GetOptionString(const ConfigOptions& config_options, + std::string* result) const { + assert(result); + result->clear(); +#ifndef ROCKSDB_LITE + return ConfigurableHelper::SerializeOptions(config_options, *this, "", + result); +#else + (void)config_options; + return Status::NotSupported("GetOptionString not supported in LITE mode"); +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +std::string Configurable::ToString(const ConfigOptions& config_options, + const std::string& prefix) const { + std::string result = SerializeOptions(config_options, prefix); + if (result.empty() || result.find('=') == std::string::npos) { + return result; + } else { + return "{" + result + "}"; + } +} + +std::string Configurable::SerializeOptions(const ConfigOptions& config_options, + const std::string& header) const { + std::string result; + Status s = ConfigurableHelper::SerializeOptions(config_options, *this, header, + &result); + assert(s.ok()); + return result; +} + +Status Configurable::GetOption(const ConfigOptions& config_options, + const std::string& name, + std::string* value) const { + return ConfigurableHelper::GetOption(config_options, *this, + GetOptionName(name), value); +} + +Status ConfigurableHelper::GetOption(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& short_name, + std::string* value) { + // Look for option directly + assert(value); + 
value->clear(); + + std::string opt_name; + void* opt_ptr = nullptr; + const auto opt_info = + FindOption(configurable.options_, short_name, &opt_name, &opt_ptr); + if (opt_info != nullptr) { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + if (short_name == opt_name) { + return opt_info->Serialize(embedded, opt_name, opt_ptr, value); + } else if (opt_info->IsStruct()) { + return opt_info->Serialize(embedded, opt_name, opt_ptr, value); + } else if (opt_info->IsConfigurable()) { + auto const* config = opt_info->AsRawPointer(opt_ptr); + if (config != nullptr) { + return config->GetOption(embedded, opt_name, value); + } + } + } + return Status::NotFound("Cannot find option: ", short_name); +} + +Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& prefix, + std::string* result) { + assert(result); + for (auto const& opt_iter : configurable.options_) { + if (opt_iter.type_map != nullptr) { + for (const auto& map_iter : *(opt_iter.type_map)) { + const auto& opt_name = map_iter.first; + const auto& opt_info = map_iter.second; + if (opt_info.ShouldSerialize()) { + std::string value; + Status s; + if (!config_options.mutable_options_only) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + s = opt_info.Serialize(copy, prefix + opt_name, opt_iter.opt_ptr, + &value); + } else if (opt_info.IsConfigurable()) { + // If it is a Configurable and we are either printing all of the + // details or not printing only the name, this option should be + // included in the list + if (config_options.IsDetailed() || + !opt_info.IsEnabled(OptionTypeFlags::kStringNameOnly)) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } + } + if (!s.ok()) { + return s; + } else if (!value.empty()) { + // 
= + result->append(prefix + opt_name + "=" + value + + config_options.delimiter); + } + } + } + } + } + return Status::OK(); +} +#endif // ROCKSDB_LITE + +//******************************************************************************** +// +// Methods for listing the options from Configurables +// +//******************************************************************************** +#ifndef ROCKSDB_LITE +Status Configurable::GetOptionNames( + const ConfigOptions& config_options, + std::unordered_set* result) const { + assert(result); + return ConfigurableHelper::ListOptions(config_options, *this, "", result); +} + +Status ConfigurableHelper::ListOptions( + const ConfigOptions& config_options, const Configurable& configurable, + const std::string& prefix, std::unordered_set* result) { + Status status; + for (auto const& opt_iter : configurable.options_) { + if (opt_iter.type_map != nullptr) { + for (const auto& map_iter : *(opt_iter.type_map)) { + const auto& opt_name = map_iter.first; + const auto& opt_info = map_iter.second; + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. 
+ if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) { + if (!config_options.mutable_options_only) { + result->emplace(prefix + opt_name); + } else if (opt_info.IsMutable()) { + result->emplace(prefix + opt_name); + } + } + } + } + } + return status; +} +#endif // ROCKSDB_LITE + +//******************************************************************************* +// +// Methods for Comparing Configurables +// +//******************************************************************************* + +bool Configurable::AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* name) const { + assert(name); + name->clear(); + if (this == other || config_options.IsCheckDisabled()) { + return true; + } else if (other != nullptr) { +#ifndef ROCKSDB_LITE + return ConfigurableHelper::AreEquivalent(config_options, *this, *other, + name); +#else + return true; +#endif // ROCKSDB_LITE + } else { + return false; + } +} + +#ifndef ROCKSDB_LITE +bool Configurable::OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const { + if (opt_info.AreEqual(config_options, opt_name, this_ptr, that_ptr, + mismatch)) { + return true; + } else if (opt_info.AreEqualByName(config_options, opt_name, this_ptr, + that_ptr)) { + *mismatch = ""; + return true; + } else { + return false; + } +} + +bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options, + const Configurable& this_one, + const Configurable& that_one, + std::string* mismatch) { + assert(mismatch != nullptr); + for (auto const& o : this_one.options_) { + const auto this_offset = this_one.GetOptionsPtr(o.name); + const auto that_offset = that_one.GetOptionsPtr(o.name); + if (this_offset != that_offset) { + if (this_offset == nullptr || that_offset == nullptr) { + return false; + } else if (o.type_map != nullptr) { + for (const auto& 
map_iter : *(o.type_map)) { + const auto& opt_info = map_iter.second; + if (config_options.IsCheckEnabled(opt_info.GetSanityLevel())) { + if (!config_options.mutable_options_only) { + if (!this_one.OptionsAreEqual(config_options, opt_info, + map_iter.first, this_offset, + that_offset, mismatch)) { + return false; + } + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + if (!this_one.OptionsAreEqual(copy, opt_info, map_iter.first, + this_offset, that_offset, + mismatch)) { + return false; + } + } + } + } + } + } + } + return true; +} +#endif // ROCKSDB_LITE + +Status Configurable::GetOptionsMap( + const std::string& value, const std::string& default_id, std::string* id, + std::unordered_map* props) { + assert(id); + assert(props); + Status status; + if (value.empty() || value == kNullptrString) { + *id = default_id; + } else if (value.find('=') == std::string::npos) { + *id = value; +#ifndef ROCKSDB_LITE + } else { + status = StringToMap(value, props); + if (!status.ok()) { // There was an error creating the map. 
+ *id = value; // Treat the value as id + props->clear(); // Clear the properties + status = Status::OK(); // and ignore the error + } else { + auto iter = props->find(OptionTypeInfo::kIdPropName()); + if (iter != props->end()) { + *id = iter->second; + props->erase(iter); + if (*id == kNullptrString) { + id->clear(); + } + } else if (!default_id.empty()) { + *id = default_id; + } else { // No id property and no default + *id = value; // Treat the value as id + props->clear(); // Clear the properties + } + } +#else + } else { + *id = value; + props->clear(); +#endif + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_helper.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/configurable.h" +#include "rocksdb/convenience.h" + +namespace ROCKSDB_NAMESPACE { +// Helper class defining static methods for supporting the Configurable +// class. The purpose of this class is to keep the Configurable class +// as tight as possible and provide methods for doing the actual work +// of configuring the objects. +class ConfigurableHelper { + public: + // Configures the input Configurable object based on the parameters. + // On successful completion, the Configurable is updated with the settings + // from the opt_map. 
+ // + // The acceptable values of the name/value pairs are documented with the + // specific class/instance. + // + // @param config_options Controls how the arguments are processed. + // @param opt_map Name/value pairs of the options to update + // @param unused If specified, this value will return the name/value + // pairs from opt_map that were NotFound for this object. + // @return OK If all values in the map were successfully updated + // @return NotFound If any of the names in the opt_map were not valid + // for this object. If unused is specified, it will contain the + // collection of NotFound entries + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. This can also be returned if PrepareOptions encounters an + // error. + static Status ConfigureOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& options, + std::unordered_map* unused); + +#ifndef ROCKSDB_LITE + // Internal method to configure a set of options for this object. + // Classes may override this value to change its behavior. + // @param config_options Controls how the options are being configured + // @param type_name The name that was registered for this set of options + // @param type_map The map of options for this name + // @param opt_ptr Pointer to the object being configured for this option set. + // @param options The option name/values being updated. On return, any + // option that was found is removed from the list. + // @return OK If all of the options were successfully updated. + // @return InvalidArgument If an option was found but the value could not + // be updated. 
+ // @return NotFound If an option name was not found in type_mape + // @return NotSupported If the option was found but no rule for converting + // the value could be found. + static Status ConfigureSomeOptions( + const ConfigOptions& config_options, Configurable& configurable, + const std::unordered_map& type_map, + std::unordered_map* options, void* opt_ptr); + + // Configures a single option in the input Configurable. + // This method will look through the set of option names for this + // Configurable searching for one with the input name. If such an option + // is found, it will be configured via the input value. + // + // @param config_options Controls how the option is being configured + // @param configurable The object to configure + // @param name For options with sub-options (like Structs or + // Configurables), + // this value may be the name of the sub-field of the option being + // updated. For example, if the option is + // "compaction_options_fifo.allow_compaction", then field name would be + // "allow_compaction". For most options, field_name and opt_name will be + // equivalent. + // @param value The new value for this option. + // @param See ConfigureOptions for the possible return values + static Status ConfigureSingleOption(const ConfigOptions& config_options, + Configurable& configurable, + const std::string& name, + const std::string& value); + + // Configures the option referenced by opt_info for this configurable + // This method configures the option based on opt_info for the input + // configurable. + // @param config_options Controls how the option is being configured + // @param configurable The object to configure + // @param opt_name The full option name + // @param name For options with sub-options (like Structs or + // Configurables), + // this value may be the name of the sub-field of the option being + // updated. 
For example, if the option is + // "compaction_options_fifo.allow_compaction", then field name would be + // "allow_compaction". For most options, field_name and opt_name will be + // equivalent. + // @param value The new value for this option. + // @param See ConfigureOptions for the possible return values + static Status ConfigureOption(const ConfigOptions& config_options, + Configurable& configurable, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& name, + const std::string& value, void* opt_ptr); + + // Returns the value of the option associated with the input name + // This method is the functional inverse of ConfigureOption + // @param config_options Controls how the value is returned + // @param configurable The object from which to get the option. + // @param name The name of the option to return a value for. + // @param value The returned value associated with the named option. + // Note that value will be only the serialized version + // of the option and not "name=value" + // @return OK If the named field was successfully updated to value. + // @return NotFound If the name is not valid for this object. + // @param InvalidArgument If the name is valid for this object but + // its value cannot be serialized. + static Status GetOption(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& name, std::string* value); + + // Serializes the input Configurable into the output result. + // This is the inverse of ConfigureOptions + // @param config_options Controls how serialization happens. + // @param configurable The object to serialize + // @param prefix A prefix to add to the each option as it is serialized. + // @param result The string representation of the configurable. + // @return OK If the options for this object wer successfully serialized. + // @return InvalidArgument If one or more of the options could not be + // serialized. 
+ static Status SerializeOptions(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& prefix, + std::string* result); + + // Internal method to list the option names for this object. + // Classes may override this value to change its behavior. + // @see ListOptions for more details + static Status ListOptions(const ConfigOptions& config_options, + const Configurable& configurable, + const std::string& prefix, + std::unordered_set* result); + + // Checks to see if the two configurables are equivalent to one other. + // This method assumes that the two objects are of the same class. + // @param config_options Controls how the options are compared. + // @param this_one The object to compare to. + // @param that_one The other object being compared. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. + static bool AreEquivalent(const ConfigOptions& config_options, + const Configurable& this_one, + const Configurable& that_one, + std::string* mismatch); + + private: + // Looks for the option specified by name in the RegisteredOptions. + // This method traverses the types in the input options vector. If an entry + // matching name is found, that entry, opt_name, and pointer are returned. + // @param options The vector of options to search through + // @param name The name of the option to search for in the OptionType map + // @param opt_name If the name was found, this value is set to the option name + // associated with the input name/type. 
+ // @param opt_ptr If the name was found, this value is set to the option + // pointer + // in the RegisteredOptions vector associated with this entry + // @return A pointer to the OptionTypeInfo from the options if found, + // nullptr if the name was not found in the input options + static const OptionTypeInfo* FindOption( + const std::vector& options, + const std::string& name, std::string* opt_name, void** opt_ptr); + + static Status ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr); +#endif // ROCKSDB_LITE +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,880 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "options/configurable_test.h" + +#include +#include +#include +#include + +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "rocksdb/configurable.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { +namespace test { +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + char buffer[1000]; + vsnprintf(buffer, sizeof(buffer), format, ap); + string_.append(buffer); + } + const std::string& str() const { return string_; } + void clear() { string_.clear(); } + + private: + std::string string_; +}; +static std::unordered_map struct_option_info = { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kMutable)}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map imm_struct_option_info = + { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +class SimpleConfigurable : public TestConfigurable { + public: + static SimpleConfigurable* Create( + const std::string& name = "simple", + int mode = TestConfigMode::kDefaultMode, + const std::unordered_map* map = + &simple_option_info) { + return new SimpleConfigurable(name, mode, map); + } + + SimpleConfigurable(const std::string& name, int mode, + const std::unordered_map* + map = &simple_option_info) + : TestConfigurable(name, mode, map) { + if ((mode & TestConfigMode::kUniqueMode) != 0) { + unique_.reset(SimpleConfigurable::Create("Unique" + name_)); + 
RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); + } + if ((mode & TestConfigMode::kSharedMode) != 0) { + shared_.reset(SimpleConfigurable::Create("Shared" + name_)); + RegisterOptions(name_ + "Shared", &shared_, &shared_option_info); + } + if ((mode & TestConfigMode::kRawPtrMode) != 0) { + pointer_ = SimpleConfigurable::Create("Pointer" + name_); + RegisterOptions(name_ + "Pointer", &pointer_, &pointer_option_info); + } + } + +}; // End class SimpleConfigurable + +using ConfigTestFactoryFunc = std::function; + +class ConfigurableTest : public testing::Test { + public: + ConfigurableTest() { config_options_.invoke_prepare_options = false; } + + ConfigOptions config_options_; +}; + +TEST_F(ConfigurableTest, GetOptionsPtrTest) { + std::string opt_str; + std::unique_ptr configurable(SimpleConfigurable::Create()); + ASSERT_NE(configurable->GetOptions("simple"), nullptr); + ASSERT_EQ(configurable->GetOptions("bad-opt"), nullptr); +} + +TEST_F(ConfigurableTest, ConfigureFromMapTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + auto* opts = configurable->GetOptions("simple"); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, {})); + ASSERT_NE(opts, nullptr); +#ifndef ROCKSDB_LITE + std::unordered_map options_map = { + {"int", "1"}, {"bool", "true"}, {"string", "string"}}; + ASSERT_OK(configurable->ConfigureFromMap(config_options_, options_map)); + ASSERT_EQ(opts->i, 1); + ASSERT_EQ(opts->b, true); + ASSERT_EQ(opts->s, "string"); +#endif +} + +TEST_F(ConfigurableTest, ConfigureFromStringTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + auto* opts = configurable->GetOptions("simple"); + ASSERT_OK(configurable->ConfigureFromString(config_options_, "")); + ASSERT_NE(opts, nullptr); +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE + ASSERT_OK(configurable->ConfigureFromString(config_options_, + "int=1;bool=true;string=s")); + ASSERT_EQ(opts->i, 1); + ASSERT_EQ(opts->b, true); + 
ASSERT_EQ(opts->s, "s"); +#endif +} + +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE +TEST_F(ConfigurableTest, ConfigureIgnoreTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + std::unordered_map options_map = {{"unused", "u"}}; + ConfigOptions ignore = config_options_; + ignore.ignore_unknown_options = true; + ASSERT_NOK(configurable->ConfigureFromMap(config_options_, options_map)); + ASSERT_OK(configurable->ConfigureFromMap(ignore, options_map)); + ASSERT_NOK(configurable->ConfigureFromString(config_options_, "unused=u")); + ASSERT_OK(configurable->ConfigureFromString(ignore, "unused=u")); +} + +TEST_F(ConfigurableTest, ConfigureNestedOptionsTest) { + std::unique_ptr base, copy; + std::string opt_str; + std::string mismatch; + + base.reset(SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode)); + copy.reset(SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode)); + ASSERT_OK(base->ConfigureFromString(config_options_, + "shared={int=10; string=10};" + "unique={int=20; string=20};" + "pointer={int=30; string=30};")); + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(ConfigurableTest, GetOptionsTest) { + std::unique_ptr simple; + + simple.reset( + SimpleConfigurable::Create("simple", TestConfigMode::kAllOptMode)); + int i = 11; + for (auto opt : {"", "shared.", "unique.", "pointer."}) { + std::string value; + std::string expected = ToString(i); + std::string opt_name = opt; + ASSERT_OK( + simple->ConfigureOption(config_options_, opt_name + "int", expected)); + ASSERT_OK(simple->GetOption(config_options_, opt_name + "int", &value)); + ASSERT_EQ(expected, value); + ASSERT_OK(simple->ConfigureOption(config_options_, opt_name + "string", + expected)); + ASSERT_OK(simple->GetOption(config_options_, opt_name + "string", &value)); + 
ASSERT_EQ(expected, value); + + ASSERT_NOK( + simple->ConfigureOption(config_options_, opt_name + "bad", expected)); + ASSERT_NOK(simple->GetOption(config_options_, "bad option", &value)); + ASSERT_TRUE(value.empty()); + i += 11; + } +} + +TEST_F(ConfigurableTest, ConfigureBadOptionsTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + auto* opts = configurable->GetOptions("simple"); + ASSERT_NE(opts, nullptr); + ASSERT_OK(configurable->ConfigureOption(config_options_, "int", "42")); + ASSERT_EQ(opts->i, 42); + ASSERT_NOK(configurable->ConfigureOption(config_options_, "int", "fred")); + ASSERT_NOK(configurable->ConfigureOption(config_options_, "bool", "fred")); + ASSERT_NOK( + configurable->ConfigureFromString(config_options_, "int=33;unused=u")); + ASSERT_EQ(opts->i, 42); +} + +TEST_F(ConfigurableTest, InvalidOptionTest) { + std::unique_ptr configurable(SimpleConfigurable::Create()); + std::unordered_map options_map = { + {"bad-option", "bad"}}; + ASSERT_NOK(configurable->ConfigureFromMap(config_options_, options_map)); + ASSERT_NOK( + configurable->ConfigureFromString(config_options_, "bad-option=bad")); + ASSERT_NOK( + configurable->ConfigureOption(config_options_, "bad-option", "bad")); +} + +static std::unordered_map validated_option_info = { +#ifndef ROCKSDB_LITE + {"validated", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map prepared_option_info = { +#ifndef ROCKSDB_LITE + {"prepared", + {0, OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map + dont_prepare_option_info = { +#ifndef ROCKSDB_LITE + {"unique", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + (OptionTypeFlags::kUnique | OptionTypeFlags::kDontPrepare)}}, + +#endif // ROCKSDB_LITE +}; + +class ValidatedConfigurable : public SimpleConfigurable { + public: + ValidatedConfigurable(const 
std::string& name, unsigned char mode, + bool dont_prepare = false) + : SimpleConfigurable(name, TestConfigMode::kDefaultMode), + validated(false), + prepared(0) { + RegisterOptions("Validated", &validated, &validated_option_info); + RegisterOptions("Prepared", &prepared, &prepared_option_info); + if ((mode & TestConfigMode::kUniqueMode) != 0) { + unique_.reset(new ValidatedConfigurable( + "Unique" + name_, TestConfigMode::kDefaultMode, false)); + if (dont_prepare) { + RegisterOptions(name_ + "Unique", &unique_, &dont_prepare_option_info); + } else { + RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); + } + } + } + + Status PrepareOptions(const ConfigOptions& config_options) override { + if (++prepared <= 0) { + return Status::InvalidArgument("Cannot prepare option"); + } else { + return SimpleConfigurable::PrepareOptions(config_options); + } + } + + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (!validated) { + return Status::InvalidArgument("Not Validated"); + } else { + return SimpleConfigurable::ValidateOptions(db_opts, cf_opts); + } + } + + private: + bool validated; + int prepared; +}; + +TEST_F(ConfigurableTest, ValidateOptionsTest) { + std::unique_ptr configurable( + new ValidatedConfigurable("validated", TestConfigMode::kDefaultMode)); + ColumnFamilyOptions cf_opts; + DBOptions db_opts; + ASSERT_OK( + configurable->ConfigureOption(config_options_, "validated", "false")); + ASSERT_NOK(configurable->ValidateOptions(db_opts, cf_opts)); + ASSERT_OK( + configurable->ConfigureOption(config_options_, "validated", "true")); + ASSERT_OK(configurable->ValidateOptions(db_opts, cf_opts)); +} + +TEST_F(ConfigurableTest, PrepareOptionsTest) { + std::unique_ptr c( + new ValidatedConfigurable("Simple", TestConfigMode::kUniqueMode, false)); + auto cp = c->GetOptions("Prepared"); + auto u = c->GetOptions>("SimpleUnique"); + auto up = u->get()->GetOptions("Prepared"); + 
config_options_.invoke_prepare_options = false; + + ASSERT_NE(cp, nullptr); + ASSERT_NE(up, nullptr); + ASSERT_EQ(*cp, 0); + ASSERT_EQ(*up, 0); + ASSERT_OK(c->ConfigureFromMap(config_options_, {})); + ASSERT_EQ(*cp, 0); + ASSERT_EQ(*up, 0); + config_options_.invoke_prepare_options = true; + ASSERT_OK(c->ConfigureFromMap(config_options_, {})); + ASSERT_EQ(*cp, 1); + ASSERT_EQ(*up, 1); + ASSERT_OK(c->ConfigureFromString(config_options_, "prepared=0")); + ASSERT_EQ(*up, 2); + ASSERT_EQ(*cp, 1); + + ASSERT_NOK(c->ConfigureFromString(config_options_, "prepared=-2")); + + c.reset( + new ValidatedConfigurable("Simple", TestConfigMode::kUniqueMode, true)); + cp = c->GetOptions("Prepared"); + u = c->GetOptions>("SimpleUnique"); + up = u->get()->GetOptions("Prepared"); + + ASSERT_OK(c->ConfigureFromString(config_options_, "prepared=0")); + ASSERT_EQ(*cp, 1); + ASSERT_EQ(*up, 0); +} + +TEST_F(ConfigurableTest, CopyObjectTest) { + class CopyConfigurable : public Configurable { + public: + CopyConfigurable() : prepared_(0), validated_(0) {} + Status PrepareOptions(const ConfigOptions& options) override { + prepared_++; + return Configurable::PrepareOptions(options); + } + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + validated_++; + return Configurable::ValidateOptions(db_opts, cf_opts); + } + int prepared_; + mutable int validated_; + }; + + CopyConfigurable c1; + ConfigOptions config_options; + Options options; + + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c1.prepared_, 1); + ASSERT_EQ(c1.validated_, 1); + CopyConfigurable c2 = c1; + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c2.prepared_, 1); + ASSERT_EQ(c2.validated_, 1); + ASSERT_EQ(c1.prepared_, 2); + ASSERT_EQ(c1.validated_, 2); +} + +TEST_F(ConfigurableTest, MutableOptionsTest) { + static std::unordered_map imm_option_info = { 
+#ifndef ROCKSDB_LITE + {"imm", OptionTypeInfo::Struct("imm", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE + }; + + class MutableConfigurable : public SimpleConfigurable { + public: + MutableConfigurable() + : SimpleConfigurable("mutable", TestConfigMode::kDefaultMode | + TestConfigMode::kUniqueMode | + TestConfigMode::kSharedMode) { + RegisterOptions("struct", &options_, &struct_option_info); + RegisterOptions("imm", &options_, &imm_option_info); + } + }; + MutableConfigurable mc; + ConfigOptions options = config_options_; + + ASSERT_OK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "42")); + auto* opts = mc.GetOptions("mutable"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 42); + ASSERT_EQ(opts->b, true); + ASSERT_OK(mc.ConfigureOption(options, "struct", "{bool=false;}")); + ASSERT_OK(mc.ConfigureOption(options, "imm", "{int=55;}")); + + options.mutable_options_only = true; + + // Now only mutable options should be settable. 
+ ASSERT_NOK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "24")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + ASSERT_NOK(mc.ConfigureFromString(options, "bool=false;int=33;")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an immutable struct fails + ASSERT_NOK(mc.ConfigureOption(options, "imm", "{int=55;}")); + ASSERT_NOK(mc.ConfigureOption(options, "imm.int", "55")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an mutable struct succeeds + ASSERT_OK(mc.ConfigureOption(options, "struct", "{int=44;}")); + ASSERT_EQ(opts->i, 44); + ASSERT_OK(mc.ConfigureOption(options, "struct.int", "55")); + ASSERT_EQ(opts->i, 55); + + // Setting nested immutable configurable options fail + ASSERT_NOK(mc.ConfigureOption(options, "shared", "{bool=true;}")); + ASSERT_NOK(mc.ConfigureOption(options, "shared.bool", "true")); + + // Setting nested mutable configurable options succeeds + ASSERT_OK(mc.ConfigureOption(options, "unique", "{bool=true}")); + ASSERT_OK(mc.ConfigureOption(options, "unique.bool", "true")); +} + +TEST_F(ConfigurableTest, DeprecatedOptionsTest) { + static std::unordered_map + deprecated_option_info = { + {"deprecated", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}}; + std::unique_ptr orig; + orig.reset(SimpleConfigurable::Create("simple", TestConfigMode::kDefaultMode, + &deprecated_option_info)); + auto* opts = orig->GetOptions("simple"); + ASSERT_NE(opts, nullptr); + opts->d = true; + ASSERT_OK(orig->ConfigureOption(config_options_, "deprecated", "false")); + ASSERT_TRUE(opts->d); + ASSERT_OK(orig->ConfigureFromString(config_options_, "deprecated=false")); + ASSERT_TRUE(opts->d); +} + +TEST_F(ConfigurableTest, AliasOptionsTest) { + static std::unordered_map alias_option_info = { + {"bool", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + 
OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"alias", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + OptionVerificationType::kAlias, OptionTypeFlags::kNone, 0}}}; + std::unique_ptr orig; + orig.reset(SimpleConfigurable::Create("simple", TestConfigMode::kDefaultMode, + &alias_option_info)); + auto* opts = orig->GetOptions("simple"); + ASSERT_NE(opts, nullptr); + ASSERT_OK(orig->ConfigureOption(config_options_, "bool", "false")); + ASSERT_FALSE(opts->b); + ASSERT_OK(orig->ConfigureOption(config_options_, "alias", "true")); + ASSERT_TRUE(opts->b); + std::string opts_str; + ASSERT_OK(orig->GetOptionString(config_options_, &opts_str)); + ASSERT_EQ(opts_str.find("alias"), std::string::npos); + + ASSERT_OK(orig->ConfigureOption(config_options_, "bool", "false")); + ASSERT_FALSE(opts->b); + ASSERT_OK(orig->GetOption(config_options_, "alias", &opts_str)); + ASSERT_EQ(opts_str, "false"); +} + +TEST_F(ConfigurableTest, NestedUniqueConfigTest) { + std::unique_ptr simple; + simple.reset( + SimpleConfigurable::Create("Outer", TestConfigMode::kAllOptMode)); + const auto outer = simple->GetOptions("Outer"); + const auto unique = + simple->GetOptions>("OuterUnique"); + ASSERT_NE(outer, nullptr); + ASSERT_NE(unique, nullptr); + ASSERT_OK( + simple->ConfigureFromString(config_options_, "int=24;string=outer")); + ASSERT_OK(simple->ConfigureFromString(config_options_, + "unique={int=42;string=nested}")); + const auto inner = unique->get()->GetOptions("UniqueOuter"); + ASSERT_NE(inner, nullptr); + ASSERT_EQ(outer->i, 24); + ASSERT_EQ(outer->s, "outer"); + ASSERT_EQ(inner->i, 42); + ASSERT_EQ(inner->s, "nested"); +} + +TEST_F(ConfigurableTest, NestedSharedConfigTest) { + std::unique_ptr simple; + simple.reset(SimpleConfigurable::Create( + "Outer", TestConfigMode::kDefaultMode | TestConfigMode::kSharedMode)); + ASSERT_OK( + simple->ConfigureFromString(config_options_, "int=24;string=outer")); + ASSERT_OK(simple->ConfigureFromString(config_options_, + 
"shared={int=42;string=nested}")); + const auto outer = simple->GetOptions("Outer"); + const auto shared = + simple->GetOptions>("OuterShared"); + ASSERT_NE(outer, nullptr); + ASSERT_NE(shared, nullptr); + const auto inner = shared->get()->GetOptions("SharedOuter"); + ASSERT_NE(inner, nullptr); + ASSERT_EQ(outer->i, 24); + ASSERT_EQ(outer->s, "outer"); + ASSERT_EQ(inner->i, 42); + ASSERT_EQ(inner->s, "nested"); +} + +TEST_F(ConfigurableTest, NestedRawConfigTest) { + std::unique_ptr simple; + simple.reset(SimpleConfigurable::Create( + "Outer", TestConfigMode::kDefaultMode | TestConfigMode::kRawPtrMode)); + ASSERT_OK( + simple->ConfigureFromString(config_options_, "int=24;string=outer")); + ASSERT_OK(simple->ConfigureFromString(config_options_, + "pointer={int=42;string=nested}")); + const auto outer = simple->GetOptions("Outer"); + const auto pointer = simple->GetOptions("OuterPointer"); + ASSERT_NE(outer, nullptr); + ASSERT_NE(pointer, nullptr); + const auto inner = (*pointer)->GetOptions("PointerOuter"); + ASSERT_NE(inner, nullptr); + ASSERT_EQ(outer->i, 24); + ASSERT_EQ(outer->s, "outer"); + ASSERT_EQ(inner->i, 42); + ASSERT_EQ(inner->s, "nested"); +} + +TEST_F(ConfigurableTest, MatchesTest) { + std::string mismatch; + std::unique_ptr base, copy; + base.reset(SimpleConfigurable::Create( + "simple", TestConfigMode::kDefaultMode | TestConfigMode::kNestedMode)); + copy.reset(SimpleConfigurable::Create( + "simple", TestConfigMode::kDefaultMode | TestConfigMode::kNestedMode)); + ASSERT_OK(base->ConfigureFromString( + config_options_, + "int=11;string=outer;unique={int=22;string=u};shared={int=33;string=s}")); + ASSERT_OK(copy->ConfigureFromString( + config_options_, + "int=11;string=outer;unique={int=22;string=u};shared={int=33;string=s}")); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->ConfigureOption(config_options_, "shared", "int=44")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + 
ASSERT_EQ(mismatch, "shared.int"); + std::string c1value, c2value; + ASSERT_OK(base->GetOption(config_options_, mismatch, &c1value)); + ASSERT_OK(copy->GetOption(config_options_, mismatch, &c2value)); + ASSERT_NE(c1value, c2value); +} + +static Configurable* SimpleStructFactory() { + return SimpleConfigurable::Create( + "simple-struct", TestConfigMode::kDefaultMode, &struct_option_info); +} + +TEST_F(ConfigurableTest, ConfigureStructTest) { + std::unique_ptr base(SimpleStructFactory()); + std::unique_ptr copy(SimpleStructFactory()); + std::string opt_str, value; + std::string mismatch; + std::unordered_set names; + + ASSERT_OK( + base->ConfigureFromString(config_options_, "struct={int=10; string=10}")); + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->GetOptionNames(config_options_, &names)); + ASSERT_EQ(names.size(), 1); + ASSERT_EQ(*(names.begin()), "struct"); + ASSERT_OK( + base->ConfigureFromString(config_options_, "struct={int=20; string=20}")); + ASSERT_OK(base->GetOption(config_options_, "struct", &value)); + ASSERT_OK(copy->ConfigureOption(config_options_, "struct", value)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + + ASSERT_NOK(base->ConfigureFromString(config_options_, + "struct={int=10; string=10; bad=11}")); + ASSERT_OK(base->ConfigureOption(config_options_, "struct.int", "42")); + ASSERT_NOK(base->ConfigureOption(config_options_, "struct.bad", "42")); + ASSERT_NOK(base->GetOption(config_options_, "struct.bad", &value)); + ASSERT_OK(base->GetOption(config_options_, "struct.int", &value)); + ASSERT_EQ(value, "42"); +} + +TEST_F(ConfigurableTest, ConfigurableEnumTest) { + std::unique_ptr base, copy; + base.reset(SimpleConfigurable::Create("e", TestConfigMode::kEnumMode)); + copy.reset(SimpleConfigurable::Create("e", TestConfigMode::kEnumMode)); + + 
std::string opts_str; + std::string mismatch; + + ASSERT_OK(base->ConfigureFromString(config_options_, "enum=B")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->GetOptionString(config_options_, &opts_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opts_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_NOK(base->ConfigureOption(config_options_, "enum", "bad")); + ASSERT_NOK(base->ConfigureOption(config_options_, "unknown", "bad")); +} + +#ifndef ROCKSDB_LITE +static std::unordered_map noserialize_option_info = + { + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kDontSerialize}}, +}; + +TEST_F(ConfigurableTest, TestNoSerialize) { + std::unique_ptr base; + base.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, + &noserialize_option_info)); + std::string opts_str, value; + ASSERT_OK(base->ConfigureFromString(config_options_, "int=10")); + ASSERT_OK(base->GetOptionString(config_options_, &opts_str)); + ASSERT_EQ(opts_str, ""); + ASSERT_NOK(base->GetOption(config_options_, "int", &value)); +} + +TEST_F(ConfigurableTest, TestNoCompare) { + std::unordered_map nocomp_option_info = { + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}}, + }; + std::unordered_map normal_option_info = { + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + }; + + std::unique_ptr base, copy; + base.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, + &nocomp_option_info)); + copy.reset(SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, + &normal_option_info)); + ASSERT_OK(base->ConfigureFromString(config_options_, "int=10")); + ASSERT_OK(copy->ConfigureFromString(config_options_, "int=20")); + std::string bvalue, cvalue, mismatch; 
+ ASSERT_OK(base->GetOption(config_options_, "int", &bvalue)); + ASSERT_OK(copy->GetOption(config_options_, "int", &cvalue)); + ASSERT_EQ(bvalue, "10"); + ASSERT_EQ(cvalue, "20"); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_FALSE(copy->AreEquivalent(config_options_, base.get(), &mismatch)); +} + +TEST_F(ConfigurableTest, NullOptionMapTest) { + std::unique_ptr base; + std::unordered_set names; + std::string str; + + base.reset( + SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, nullptr)); + ASSERT_NOK(base->ConfigureFromString(config_options_, "int=10")); + ASSERT_NOK(base->ConfigureFromString(config_options_, "int=20")); + ASSERT_NOK(base->ConfigureOption(config_options_, "int", "20")); + ASSERT_NOK(base->GetOption(config_options_, "int", &str)); + ASSERT_NE(base->GetOptions("c"), nullptr); + ASSERT_OK(base->GetOptionNames(config_options_, &names)); + ASSERT_EQ(names.size(), 0UL); + ASSERT_OK(base->PrepareOptions(config_options_)); + ASSERT_OK(base->ValidateOptions(DBOptions(), ColumnFamilyOptions())); + std::unique_ptr copy; + copy.reset( + SimpleConfigurable::Create("c", TestConfigMode::kDefaultMode, nullptr)); + ASSERT_OK(base->GetOptionString(config_options_, &str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &str)); +} +#endif + +static std::unordered_map TestFactories = { + {"Simple", []() { return SimpleConfigurable::Create("simple"); }}, + {"Struct", []() { return SimpleStructFactory(); }}, + {"Unique", + []() { + return SimpleConfigurable::Create( + "simple", TestConfigMode::kSimpleMode | TestConfigMode::kUniqueMode); + }}, + {"Shared", + []() { + return SimpleConfigurable::Create( + "simple", TestConfigMode::kSimpleMode | TestConfigMode::kSharedMode); + }}, + {"Nested", + []() { + return SimpleConfigurable::Create( + "simple", TestConfigMode::kSimpleMode | TestConfigMode::kNestedMode); + }}, + {"Mutable", + []() { + 
return SimpleConfigurable::Create("simple", + TestConfigMode::kMutableMode | + TestConfigMode::kSimpleMode | + TestConfigMode::kNestedMode); + }}, + {"ThreeDeep", + []() { + Configurable* simple = SimpleConfigurable::Create( + "Simple", + TestConfigMode::kUniqueMode | TestConfigMode::kDefaultMode); + auto* unique = + simple->GetOptions>("SimpleUnique"); + unique->reset(SimpleConfigurable::Create( + "Child", + TestConfigMode::kUniqueMode | TestConfigMode::kDefaultMode)); + unique = unique->get()->GetOptions>( + "ChildUnique"); + unique->reset( + SimpleConfigurable::Create("Child", TestConfigMode::kDefaultMode)); + return simple; + }}, + {"DBOptions", + []() { + auto config = DBOptionsAsConfigurable(DBOptions()); + return config.release(); + }}, + {"CFOptions", + []() { + auto config = CFOptionsAsConfigurable(ColumnFamilyOptions()); + return config.release(); + }}, + {"BlockBased", []() { return NewBlockBasedTableFactory(); }}, +}; + +class ConfigurableParamTest : public ConfigurableTest, + virtual public ::testing::WithParamInterface< + std::pair> { + public: + ConfigurableParamTest() { + type_ = GetParam().first; + configuration_ = GetParam().second; + assert(TestFactories.find(type_) != TestFactories.end()); + object_.reset(CreateConfigurable()); + } + + Configurable* CreateConfigurable() { + const auto& iter = TestFactories.find(type_); + return (iter->second)(); + } + + void TestConfigureOptions(const ConfigOptions& opts); + std::string type_; + std::string configuration_; + std::unique_ptr object_; +}; + +void ConfigurableParamTest::TestConfigureOptions( + const ConfigOptions& config_options) { + std::unique_ptr base, copy; + std::unordered_set names; + std::string opt_str, mismatch; + + base.reset(CreateConfigurable()); + copy.reset(CreateConfigurable()); + + ASSERT_OK(base->ConfigureFromString(config_options, configuration_)); + ASSERT_OK(base->GetOptionString(config_options, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options, opt_str)); + 
ASSERT_OK(copy->GetOptionString(config_options, &opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options, copy.get(), &mismatch)); + + copy.reset(CreateConfigurable()); + ASSERT_OK(base->GetOptionNames(config_options, &names)); + std::unordered_map unused; + bool found_one = false; + for (auto name : names) { + std::string value; + Status s = base->GetOption(config_options, name, &value); + if (s.ok()) { + s = copy->ConfigureOption(config_options, name, value); + if (s.ok() || s.IsNotSupported()) { + found_one = true; + } else { + unused[name] = value; + } + } else { + ASSERT_TRUE(s.IsNotSupported()); + } + } + ASSERT_TRUE(found_one || names.empty()); + while (found_one && !unused.empty()) { + found_one = false; + for (auto iter = unused.begin(); iter != unused.end();) { + if (copy->ConfigureOption(config_options, iter->first, iter->second) + .ok()) { + found_one = true; + iter = unused.erase(iter); + } else { + ++iter; + } + } + } + ASSERT_EQ(0, unused.size()); + ASSERT_TRUE(base->AreEquivalent(config_options, copy.get(), &mismatch)); +} + +TEST_P(ConfigurableParamTest, GetDefaultOptionsTest) { + TestConfigureOptions(config_options_); +} + +TEST_P(ConfigurableParamTest, ConfigureFromPropsTest) { + std::string opt_str, mismatch; + std::unordered_set names; + std::unique_ptr copy(CreateConfigurable()); + + ASSERT_OK(object_->ConfigureFromString(config_options_, configuration_)); + config_options_.delimiter = "\n"; + ASSERT_OK(object_->GetOptionString(config_options_, &opt_str)); + std::istringstream iss(opt_str); + std::unordered_map copy_map; + std::string line; + for (int line_num = 0; std::getline(iss, line); line_num++) { + std::string name; + std::string value; + ASSERT_OK( + RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num)); + copy_map[name] = value; + } + ASSERT_OK(copy->ConfigureFromMap(config_options_, copy_map)); + ASSERT_TRUE(object_->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +INSTANTIATE_TEST_CASE_P( + ParamTest, 
ConfigurableParamTest, + testing::Values( + std::pair("Simple", + "int=42;bool=true;string=s"), + std::pair( + "Mutable", "int=42;unique={int=33;string=unique}"), + std::pair( + "Struct", "struct={int=33;bool=true;string=s;}"), + std::pair("Shared", + "int=33;bool=true;string=outer;" + "shared={int=42;string=shared}"), + std::pair("Unique", + "int=33;bool=true;string=outer;" + "unique={int=42;string=unique}"), + std::pair("Nested", + "int=11;bool=true;string=outer;" + "pointer={int=22;string=pointer};" + "unique={int=33;string=unique};" + "shared={int=44;string=shared}"), + std::pair("ThreeDeep", + "int=11;bool=true;string=outer;" + "unique={int=22;string=inner;" + "unique={int=33;string=unique}};"), + std::pair("DBOptions", + "max_background_jobs=100;" + "max_open_files=200;"), + std::pair("CFOptions", + "table_factory=BlockBasedTable;" + "disable_auto_compactions=true;"), + std::pair("BlockBased", + "block_size=1024;" + "no_block_cache=true;"))); +#endif // ROCKSDB_LITE + +} // namespace test +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/configurable_test.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/configurable_test.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include + +#include "options/configurable_helper.h" +#include "rocksdb/configurable.h" +#include "rocksdb/utilities/options_type.h" + +namespace ROCKSDB_NAMESPACE { +struct ColumnFamilyOptions; +struct DBOptions; + +namespace test { +enum TestEnum { kTestA, kTestB }; + +static const std::unordered_map test_enum_map = { + {"A", TestEnum::kTestA}, + {"B", TestEnum::kTestB}, +}; + +struct TestOptions { + int i = 0; + bool b = false; + bool d = true; + TestEnum e = TestEnum::kTestA; + std::string s = ""; + std::string u = ""; +}; + +static std::unordered_map simple_option_info = { +#ifndef ROCKSDB_LITE + {"int", + {offsetof(struct TestOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct TestOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"string", + {offsetof(struct TestOptions, s), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map enum_option_info = { +#ifndef ROCKSDB_LITE + {"enum", + OptionTypeInfo::Enum(offsetof(struct TestOptions, e), &test_enum_map)} +#endif +}; + +static std::unordered_map unique_option_info = { +#ifndef ROCKSDB_LITE + {"unique", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + (OptionTypeFlags::kUnique | OptionTypeFlags::kMutable)}}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map shared_option_info = { +#ifndef ROCKSDB_LITE + {"shared", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + (OptionTypeFlags::kShared)}}, +#endif // ROCKSDB_LITE +}; +static std::unordered_map pointer_option_info = { +#ifndef ROCKSDB_LITE + {"pointer", + {0, OptionType::kConfigurable, OptionVerificationType::kNormal, + 
OptionTypeFlags::kRawPointer}}, +#endif // ROCKSDB_LITE +}; + +enum TestConfigMode { + kEmptyMode = 0x0, // Don't register anything + kMutableMode = 0x01, // Configuration is mutable + kSimpleMode = 0x02, // Use the simple options + kEnumMode = 0x04, // Use the enum options + kDefaultMode = kSimpleMode, // Use no inner nested configurations + kSharedMode = 0x10, // Use shared configuration + kUniqueMode = 0x20, // Use unique configuration + kRawPtrMode = 0x40, // Use pointer configuration + kNestedMode = (kSharedMode | kUniqueMode | kRawPtrMode), + kAllOptMode = (kNestedMode | kEnumMode | kSimpleMode), +}; + +template +class TestConfigurable : public Configurable { + protected: + std::string name_; + std::string prefix_; + TestOptions options_; + + public: + std::unique_ptr unique_; + std::shared_ptr shared_; + T* pointer_; + + TestConfigurable(const std::string& name, int mode, + const std::unordered_map* map = + &simple_option_info) + : name_(name), pointer_(nullptr) { + prefix_ = "test." + name + "."; + if ((mode & TestConfigMode::kSimpleMode) != 0) { + RegisterOptions(name_, &options_, map); + } + if ((mode & TestConfigMode::kEnumMode) != 0) { + RegisterOptions(name_ + "Enum", &options_, &enum_option_info); + } + } + + ~TestConfigurable() override { delete pointer_; } +}; + +} // namespace test +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/customizable.h" + +#include + +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string Customizable::GetOptionName(const std::string& long_name) const { + const std::string& name = Name(); + size_t name_len = name.size(); + if (long_name.size() > name_len + 1 && + long_name.compare(0, name_len, name) == 0 && + long_name.at(name_len) == '.') { + return long_name.substr(name_len + 1); + } else { + return Configurable::GetOptionName(long_name); + } +} + +std::string Customizable::GenerateIndividualId() const { + std::ostringstream ostr; + ostr << Name() << "@" << static_cast(this) << "#" + << port::GetProcessID(); + return ostr.str(); +} + +#ifndef ROCKSDB_LITE +Status Customizable::GetOption(const ConfigOptions& config_options, + const std::string& opt_name, + std::string* value) const { + if (opt_name == OptionTypeInfo::kIdPropName()) { + *value = GetId(); + return Status::OK(); + } else { + return Configurable::GetOption(config_options, opt_name, value); + } +} + +std::string Customizable::SerializeOptions(const ConfigOptions& config_options, + const std::string& prefix) const { + std::string result; + std::string parent; + std::string id = GetId(); + if (!config_options.IsShallow() && !id.empty()) { + parent = Configurable::SerializeOptions(config_options, ""); + } + if (parent.empty()) { + result = id; + } else { + result.append(prefix); + result.append(OptionTypeInfo::kIdPropName()); + result.append("="); + result.append(id); + result.append(config_options.delimiter); + result.append(parent); + } + return result; +} + +#endif // ROCKSDB_LITE + +bool Customizable::AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const { + if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && + this != 
other) { + const Customizable* custom = reinterpret_cast(other); + if (GetId() != custom->GetId()) { + *mismatch = OptionTypeInfo::kIdPropName(); + return false; + } else if (config_options.sanity_level > + ConfigOptions::kSanityLevelLooselyCompatible) { + bool matches = + Configurable::AreEquivalent(config_options, other, mismatch); + return matches; + } + } + return true; +} + +Status Customizable::GetOptionsMap( + const ConfigOptions& config_options, const Customizable* customizable, + const std::string& value, std::string* id, + std::unordered_map* props) { + Status status; + if (value.empty() || value == kNullptrString) { + *id = ""; + props->clear(); + } else if (customizable != nullptr) { + status = + Configurable::GetOptionsMap(value, customizable->GetId(), id, props); +#ifdef ROCKSDB_LITE + (void)config_options; +#else + if (status.ok() && customizable->IsInstanceOf(*id)) { + // The new ID and the old ID match, so the objects are the same type. + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + std::string curr_opts; + if (customizable->GetOptionString(embedded, &curr_opts).ok()) { + std::unordered_map curr_props; + if (StringToMap(curr_opts, &curr_props).ok()) { + props->insert(curr_props.begin(), curr_props.end()); + } + } + } +#endif // ROCKSDB_LITE + } else { + status = Configurable::GetOptionsMap(value, "", id, props); + } + return status; +} + +Status Customizable::ConfigureNewObject( + const ConfigOptions& config_options, Customizable* object, + const std::unordered_map& opt_map) { + Status status; + if (object != nullptr) { + status = object->ConfigureFromMap(config_options, opt_map); + } else if (!opt_map.empty()) { + status = Status::InvalidArgument("Cannot configure null object "); + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable_test.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/customizable_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/customizable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,2132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/customizable.h" + +#include +#include +#include +#include + +#include "db/db_test_util.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env_encryption.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/flush_block_policy.h" +#include "rocksdb/memory_allocator.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/sst_partitioner.h" +#include "rocksdb/statistics.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "table/block_based/flush_block_policy.h" +#include "table/mock_table.h" +#include "test_util/mock_time_env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/file_checksum_helper.h" +#include "util/rate_limiter.h" +#include "util/string_util.h" +#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" +#include "utilities/memory_allocators.h" +#include "utilities/merge_operators/bytesxor.h" +#include 
"utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { +namespace { +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + char buffer[1000]; + vsnprintf(buffer, sizeof(buffer), format, ap); + string_.append(buffer); + } + const std::string& str() const { return string_; } + void clear() { string_.clear(); } + + private: + std::string string_; +}; + +class TestCustomizable : public Customizable { + public: + TestCustomizable(const std::string& name) : name_(name) {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { + return "TestCustomizable"; + } + + const char* Name() const override { return name_.c_str(); } + static const char* Type() { return "test.custom"; } +#ifndef ROCKSDB_LITE + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::unique_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::shared_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + TestCustomizable** result); +#endif // ROCKSDB_LITE + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const std::string name_; +}; + +struct AOptions { + static const char* kName() { return "A"; } + int i = 0; + bool b = false; +}; + +static std::unordered_map a_option_info = { +#ifndef ROCKSDB_LITE + {"int", + {offsetof(struct AOptions, i), 
OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct AOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +class ACustomizable : public TestCustomizable { + public: + explicit ACustomizable(const std::string& id) + : TestCustomizable("A"), id_(id) { + RegisterOptions(&opts_, &a_option_info); + } + std::string GetId() const override { return id_; } + static const char* kClassName() { return "A"; } + + private: + AOptions opts_; + const std::string id_; +}; + +struct BOptions { + std::string s; + bool b = false; +}; + +static std::unordered_map b_option_info = { +#ifndef ROCKSDB_LITE + {"string", + {offsetof(struct BOptions, s), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bool", + {offsetof(struct BOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +class BCustomizable : public TestCustomizable { + private: + public: + explicit BCustomizable(const std::string& name) : TestCustomizable(name) { + RegisterOptions(name, &opts_, &b_option_info); + } + static const char* kClassName() { return "B"; } + + private: + BOptions opts_; +}; + +#ifndef ROCKSDB_LITE +static bool LoadSharedB(const std::string& id, + std::shared_ptr* result) { + if (id == "B") { + result->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + result->reset(); + return true; + } else { + return false; + } +} + +static int A_count = 0; +static int RegisterCustomTestObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + ObjectLibrary::PatternEntry("A", true).AddSeparator("_"), + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new ACustomizable(name)); + A_count++; + return guard->get(); + }); + + library.AddFactory( + "S", [](const std::string& name, + 
std::unique_ptr* /* guard */, + std::string* /* msg */) { return new BCustomizable(name); }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +struct SimpleOptions { + static const char* kName() { return "simple"; } + bool b = true; + std::unique_ptr cu; + std::shared_ptr cs; + TestCustomizable* cp = nullptr; +}; + +static SimpleOptions dummy_simple_options; +template +int offset_of(T1 SimpleOptions::*member) { + return static_cast( + reinterpret_cast( + std::addressof(dummy_simple_options.*member)) - + reinterpret_cast(std::addressof(dummy_simple_options))); +} + +static std::unordered_map simple_option_info = { +#ifndef ROCKSDB_LITE + {"bool", + {offset_of(&SimpleOptions::b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"unique", + OptionTypeInfo::AsCustomUniquePtr( + offset_of(&SimpleOptions::cu), OptionVerificationType::kNormal, + OptionTypeFlags::kAllowNull)}, + {"shared", + OptionTypeInfo::AsCustomSharedPtr( + offset_of(&SimpleOptions::cs), OptionVerificationType::kNormal, + OptionTypeFlags::kAllowNull)}, + {"pointer", + OptionTypeInfo::AsCustomRawPtr( + offset_of(&SimpleOptions::cp), OptionVerificationType::kNormal, + OptionTypeFlags::kAllowNull)}, +#endif // ROCKSDB_LITE +}; + +class SimpleConfigurable : public Configurable { + private: + SimpleOptions simple_; + + public: + SimpleConfigurable() { RegisterOptions(&simple_, &simple_option_info); } + + explicit SimpleConfigurable( + const std::unordered_map* map) { + RegisterOptions(&simple_, map); + } +}; + +#ifndef ROCKSDB_LITE +static void GetMapFromProperties( + const std::string& props, + std::unordered_map* map) { + std::istringstream iss(props); + std::unordered_map copy_map; + std::string line; + map->clear(); + for (int line_num = 0; std::getline(iss, line); line_num++) { + std::string name; + std::string value; + ASSERT_OK( + RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num)); + 
(*map)[name] = value; + } +} +#endif // ROCKSDB_LITE +} // namespace + +#ifndef ROCKSDB_LITE +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + return LoadSharedObject(config_options, value, LoadSharedB, + result); +} + +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr* result) { + return LoadUniqueObject( + config_options, value, + [](const std::string& id, std::unique_ptr* u) { + if (id == "B") { + u->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + u->reset(); + return true; + } else { + return false; + } + }, + result); +} + +Status TestCustomizable::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + TestCustomizable** result) { + return LoadStaticObject( + config_options, value, + [](const std::string& id, TestCustomizable** ptr) { + if (id == "B") { + *ptr = new BCustomizable(id); + return true; + } else if (id.empty()) { + *ptr = nullptr; + return true; + } else { + return false; + } + }, + result); +} +#endif // ROCKSDB_LITE + +class CustomizableTest : public testing::Test { + public: + CustomizableTest() { + config_options_.invoke_prepare_options = false; +#ifndef ROCKSDB_LITE + // GetOptionsFromMap is not supported in ROCKSDB_LITE + config_options_.registry->AddLibrary("CustomizableTest", + RegisterCustomTestObjects, ""); +#endif // ROCKSDB_LITE + } + + ConfigOptions config_options_; +}; + +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE +// Tests that a Customizable can be created by: +// - a simple name +// - a XXX.id option +// - a property with a name +TEST_F(CustomizableTest, CreateByNameTest) { + ObjectLibrary::Default()->AddFactory( + ObjectLibrary::PatternEntry("TEST", false).AddSeparator("_"), + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new 
TestCustomizable(name)); + return guard->get(); + }); + std::unique_ptr configurable(new SimpleConfigurable()); + SimpleOptions* simple = configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique={id=TEST_1}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_1"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique.id=TEST_2")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_2"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique=TEST_3")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_3"); +} + +TEST_F(CustomizableTest, ToStringTest) { + std::unique_ptr custom(new TestCustomizable("test")); + ASSERT_EQ(custom->ToString(config_options_), "test"); +} + +TEST_F(CustomizableTest, SimpleConfigureTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=B;string=s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CustomizableTest, ConfigureFromPropsTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = 
configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + config_options_.delimiter = "\n"; + std::unordered_map props; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + GetMapFromProperties(opt_str, &props); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromMap(config_options_, props)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CustomizableTest, ConfigureFromShortTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); +} + +TEST_F(CustomizableTest, AreEquivalentOptionsTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=A;int=1;bool=true"}, + }; + std::string mismatch; + ConfigOptions config_options = config_options_; + std::unique_ptr c1(new SimpleConfigurable()); + std::unique_ptr c2(new SimpleConfigurable()); + ASSERT_OK(c1->ConfigureFromMap(config_options, opt_map)); + ASSERT_OK(c2->ConfigureFromMap(config_options, opt_map)); + ASSERT_TRUE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + SimpleOptions* simple = c1->GetOptions(); + ASSERT_TRUE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_OK(simple->cu->ConfigureOption(config_options, "int", "2")); + ASSERT_FALSE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ConfigOptions loosely = config_options; + 
loosely.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_TRUE(simple->cu->AreEquivalent(loosely, simple->cs.get(), &mismatch)); + + ASSERT_OK(c1->ConfigureOption(config_options, "shared", "id=B;string=3")); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ASSERT_FALSE(simple->cs->AreEquivalent(loosely, simple->cu.get(), &mismatch)); + simple->cs.reset(); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); +} + +// Tests that we can initialize a customizable from its options +TEST_F(CustomizableTest, ConfigureStandaloneCustomTest) { + std::unique_ptr base, copy; + const auto& registry = config_options_.registry; + ASSERT_OK(registry->NewUniqueObject("A", &base)); + ASSERT_OK(registry->NewUniqueObject("A", ©)); + ASSERT_OK(base->ConfigureFromString(config_options_, "int=33;bool=true")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +// Tests that we fail appropriately if the pattern is not registered +TEST_F(CustomizableTest, BadNameTest) { + config_options_.ignore_unsupported_options = false; + std::unique_ptr c1(new SimpleConfigurable()); + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); + config_options_.ignore_unsupported_options = true; + ASSERT_OK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); +} + +// Tests that we fail appropriately if a bad option is passed to the underlying +// configurable +TEST_F(CustomizableTest, BadOptionTest) { + std::unique_ptr c1(new SimpleConfigurable()); + ConfigOptions ignore = config_options_; + 
ignore.ignore_unknown_options = true; + + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "shared={id=B;int=1}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared={id=A;string=s}")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "B.int=11")); + ASSERT_OK(c1->ConfigureFromString(ignore, "B.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.string=s")); + ASSERT_OK(c1->ConfigureFromString(ignore, "A.string=s")); + // Test as detached + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "shared.id=A;A.string=b}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=A;A.string=s}")); +} + +// Tests that different IDs lead to different objects +TEST_F(CustomizableTest, UniqueIdTest) { + std::unique_ptr base(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), std::string("A_1")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=true}")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_EQ(simple->cu->GetId(), std::string("A_2")); +} + +TEST_F(CustomizableTest, IsInstanceOfTest) { + std::shared_ptr tc = std::make_shared("A_1"); + + ASSERT_EQ(tc->GetId(), std::string("A_1")); + ASSERT_TRUE(tc->IsInstanceOf("A")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("B")); + ASSERT_FALSE(tc->IsInstanceOf("A_1")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + 
ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); + + tc.reset(new BCustomizable("B")); + ASSERT_TRUE(tc->IsInstanceOf("B")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("A")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); +} + +TEST_F(CustomizableTest, PrepareOptionsTest) { + static std::unordered_map p_option_info = { +#ifndef ROCKSDB_LITE + {"can_prepare", + {0, OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE + }; + + class PrepareCustomizable : public TestCustomizable { + public: + bool can_prepare_ = true; + + PrepareCustomizable() : TestCustomizable("P") { + RegisterOptions("Prepare", &can_prepare_, &p_option_info); + } + + Status PrepareOptions(const ConfigOptions& opts) override { + if (!can_prepare_) { + return Status::InvalidArgument("Cannot Prepare"); + } else { + return TestCustomizable::PrepareOptions(opts); + } + } + }; + + ObjectLibrary::Default()->AddFactory( + "P", + [](const std::string& /*name*/, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new PrepareCustomizable()); + return guard->get(); + }); + + std::unique_ptr base(new SimpleConfigurable()); + ConfigOptions prepared(config_options_); + prepared.invoke_prepare_options = true; + + ASSERT_OK(base->ConfigureFromString( + prepared, "unique=A_1; shared={id=B;string=s}; pointer.id=S")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_NE(simple->cs, nullptr); + ASSERT_NE(simple->cp, nullptr); + delete simple->cp; + base.reset(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString( + config_options_, "unique=A_1; shared={id=B;string=s}; pointer.id=S")); + + simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_NE(simple->cs, nullptr); + 
ASSERT_NE(simple->cp, nullptr); + + ASSERT_OK(base->PrepareOptions(config_options_)); + delete simple->cp; + base.reset(new SimpleConfigurable()); + simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + + ASSERT_NOK( + base->ConfigureFromString(prepared, "unique={id=P; can_prepare=false}")); + ASSERT_EQ(simple->cu, nullptr); + + ASSERT_OK( + base->ConfigureFromString(prepared, "unique={id=P; can_prepare=true}")); + ASSERT_NE(simple->cu, nullptr); + + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=P; can_prepare=true}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_OK(simple->cu->PrepareOptions(prepared)); + + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=P; can_prepare=false}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_NOK(simple->cu->PrepareOptions(prepared)); +} + +namespace { +static std::unordered_map inner_option_info = { +#ifndef ROCKSDB_LITE + {"inner", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kStringNameOnly)} +#endif // ROCKSDB_LITE +}; + +struct InnerOptions { + static const char* kName() { return "InnerOptions"; } + std::shared_ptr inner; +}; + +class InnerCustomizable : public Customizable { + public: + explicit InnerCustomizable(const std::shared_ptr& w) { + iopts_.inner = w; + RegisterOptions(&iopts_, &inner_option_info); + } + static const char* kClassName() { return "Inner"; } + const char* Name() const override { return kClassName(); } + + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const Customizable* Inner() const override { return iopts_.inner.get(); } + + private: + InnerOptions iopts_; +}; + +struct WrappedOptions1 { + static const char* kName() { return "WrappedOptions1"; } + int i = 42; +}; + +class WrappedCustomizable1 : public InnerCustomizable { + public: + explicit WrappedCustomizable1(const 
std::shared_ptr& w) + : InnerCustomizable(w) { + RegisterOptions(&wopts_, nullptr); + } + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped1"; } + + private: + WrappedOptions1 wopts_; +}; + +struct WrappedOptions2 { + static const char* kName() { return "WrappedOptions2"; } + std::string s = "42"; +}; +class WrappedCustomizable2 : public InnerCustomizable { + public: + explicit WrappedCustomizable2(const std::shared_ptr& w) + : InnerCustomizable(w) {} + const void* GetOptionsPtr(const std::string& name) const override { + if (name == WrappedOptions2::kName()) { + return &wopts_; + } else { + return InnerCustomizable::GetOptionsPtr(name); + } + } + + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped2"; } + + private: + WrappedOptions2 wopts_; +}; +} // namespace + +TEST_F(CustomizableTest, WrappedInnerTest) { + std::shared_ptr ac = + std::make_shared("A"); + + ASSERT_TRUE(ac->IsInstanceOf("A")); + ASSERT_TRUE(ac->IsInstanceOf("TestCustomizable")); + ASSERT_EQ(ac->CheckedCast(), ac.get()); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + std::shared_ptr wc1 = + std::make_shared(ac); + + ASSERT_TRUE(wc1->IsInstanceOf(WrappedCustomizable1::kClassName())); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), nullptr); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), ac.get()); + + std::shared_ptr wc2 = + std::make_shared(wc1); + ASSERT_TRUE(wc2->IsInstanceOf(WrappedCustomizable2::kClassName())); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), wc1.get()); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), ac.get()); +} + +TEST_F(CustomizableTest, CustomizableInnerTest) { + std::shared_ptr c = + std::make_shared(std::make_shared("a")); + std::shared_ptr wc1 = 
std::make_shared(c); + std::shared_ptr wc2 = std::make_shared(c); + auto inner = c->GetOptions(); + ASSERT_NE(inner, nullptr); + + auto aopts = c->GetOptions(); + ASSERT_NE(aopts, nullptr); + ASSERT_EQ(aopts, wc1->GetOptions()); + ASSERT_EQ(aopts, wc2->GetOptions()); + auto w1opts = wc1->GetOptions(); + ASSERT_NE(w1opts, nullptr); + ASSERT_EQ(c->GetOptions(), nullptr); + ASSERT_EQ(wc2->GetOptions(), nullptr); + + auto w2opts = wc2->GetOptions(); + ASSERT_NE(w2opts, nullptr); + ASSERT_EQ(c->GetOptions(), nullptr); + ASSERT_EQ(wc1->GetOptions(), nullptr); +} + +TEST_F(CustomizableTest, CopyObjectTest) { + class CopyCustomizable : public Customizable { + public: + CopyCustomizable() : prepared_(0), validated_(0) {} + const char* Name() const override { return "CopyCustomizable"; } + + Status PrepareOptions(const ConfigOptions& options) override { + prepared_++; + return Customizable::PrepareOptions(options); + } + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + validated_++; + return Customizable::ValidateOptions(db_opts, cf_opts); + } + int prepared_; + mutable int validated_; + }; + + CopyCustomizable c1; + ConfigOptions config_options; + Options options; + + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c1.prepared_, 1); + ASSERT_EQ(c1.validated_, 1); + CopyCustomizable c2 = c1; + ASSERT_OK(c1.PrepareOptions(config_options)); + ASSERT_OK(c1.ValidateOptions(options, options)); + ASSERT_EQ(c2.prepared_, 1); + ASSERT_EQ(c2.validated_, 1); + ASSERT_EQ(c1.prepared_, 2); + ASSERT_EQ(c1.validated_, 2); +} + +TEST_F(CustomizableTest, TestStringDepth) { + ConfigOptions shallow = config_options_; + std::unique_ptr c( + new InnerCustomizable(std::make_shared("a"))); + std::string opt_str; + shallow.depth = ConfigOptions::Depth::kDepthShallow; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_EQ(opt_str, "inner=a;"); + shallow.depth = 
ConfigOptions::Depth::kDepthDetailed; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_NE(opt_str, "inner=a;"); +} + +// Tests that we only get a new customizable when it changes +TEST_F(CustomizableTest, NewUniqueCustomizableTest) { + std::unique_ptr base(new SimpleConfigurable()); + A_count = 0; + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(A_count, 1); // Created one A + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=false}")); + ASSERT_EQ(A_count, 2); // Create another A_1 + ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=}")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_EQ(A_count, 2); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=false}")); + ASSERT_EQ(A_count, 3); // Created another A + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "unique=nullptr")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=nullptr")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_EQ(A_count, 3); +} + +TEST_F(CustomizableTest, NewEmptyUniqueTest) { + std::unique_ptr base(new SimpleConfigurable()); + SimpleOptions* simple = base->GetOptions(); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=}")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique={id=nullptr}")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new 
BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique=nullptr")); + ASSERT_EQ(simple->cu, nullptr); + simple->cu.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=nullptr")); + ASSERT_EQ(simple->cu, nullptr); +} + +TEST_F(CustomizableTest, NewEmptySharedTest) { + std::unique_ptr base(new SimpleConfigurable()); + + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared={id=}")); + ASSERT_NE(simple, nullptr); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared={id=nullptr}")); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared.id=")); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared.id=nullptr")); + ASSERT_EQ(simple->cs, nullptr); + simple->cs.reset(new BCustomizable("B")); + + ASSERT_OK(base->ConfigureFromString(config_options_, "shared=nullptr")); + ASSERT_EQ(simple->cs, nullptr); +} + +TEST_F(CustomizableTest, NewEmptyStaticTest) { + std::unique_ptr base(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer={id=}")); + SimpleOptions* simple = base->GetOptions(); + ASSERT_NE(simple, nullptr); + ASSERT_EQ(simple->cp, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer={id=nullptr}")); + ASSERT_EQ(simple->cp, nullptr); + + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer=")); + ASSERT_EQ(simple->cp, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer=nullptr")); + ASSERT_EQ(simple->cp, nullptr); + + ASSERT_OK(base->ConfigureFromString(config_options_, 
"pointer.id=")); + ASSERT_EQ(simple->cp, nullptr); + ASSERT_OK(base->ConfigureFromString(config_options_, "pointer.id=nullptr")); + ASSERT_EQ(simple->cp, nullptr); +} + +namespace { +#ifndef ROCKSDB_LITE +static std::unordered_map vector_option_info = { + {"vector", + OptionTypeInfo::Vector>( + 0, OptionVerificationType::kNormal, + + OptionTypeFlags::kNone, + + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone))}, +}; +class VectorConfigurable : public SimpleConfigurable { + public: + VectorConfigurable() { RegisterOptions("vector", &cv, &vector_option_info); } + std::vector> cv; +}; +} // namespace + +TEST_F(CustomizableTest, VectorConfigTest) { + VectorConfigurable orig, copy; + std::shared_ptr c1, c2; + ASSERT_OK(TestCustomizable::CreateFromString(config_options_, "A", &c1)); + ASSERT_OK(TestCustomizable::CreateFromString(config_options_, "B", &c2)); + orig.cv.push_back(c1); + orig.cv.push_back(c2); + ASSERT_OK(orig.ConfigureFromString(config_options_, "unique=A2")); + std::string opt_str, mismatch; + ASSERT_OK(orig.GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy.ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(orig.AreEquivalent(config_options_, ©, &mismatch)); +} + +TEST_F(CustomizableTest, NoNameTest) { + // If Customizables are created without names, they are not + // part of the serialization (since they cannot be recreated) + VectorConfigurable orig, copy; + auto sopts = orig.GetOptions(); + auto copts = copy.GetOptions(); + sopts->cu.reset(new ACustomizable("")); + orig.cv.push_back(std::make_shared("")); + orig.cv.push_back(std::make_shared("A_1")); + std::string opt_str, mismatch; + ASSERT_OK(orig.GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy.ConfigureFromString(config_options_, opt_str)); + ASSERT_EQ(copy.cv.size(), 1U); + ASSERT_EQ(copy.cv[0]->GetId(), "A_1"); + ASSERT_EQ(copts->cu, nullptr); +} + +#endif // ROCKSDB_LITE + +TEST_F(CustomizableTest, 
IgnoreUnknownObjects) { + ConfigOptions ignore = config_options_; + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ignore.ignore_unsupported_options = false; + ASSERT_NOK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + ASSERT_NOK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_NOK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ignore.ignore_unsupported_options = true; + ASSERT_OK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + ASSERT_OK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_OK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown", nullptr, + &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown", nullptr, + &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown", nullptr, + &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown;option=bad", + nullptr, &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown;option=bad", + nullptr, &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown;option=bad", + nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, FactoryFunctionTest) { + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ConfigOptions ignore = config_options_; + ignore.ignore_unsupported_options = false; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", 
&pointer)); + ASSERT_NE(shared.get(), nullptr); + ASSERT_NE(unique.get(), nullptr); + ASSERT_NE(pointer, nullptr); + delete pointer; + pointer = nullptr; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &shared)); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &unique)); + ASSERT_NOK( + TestCustomizable::CreateFromString(ignore, "option=bad", &pointer)); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, URLFactoryTest) { + std::unique_ptr unique; + config_options_.registry->AddLibrary("URL")->AddFactory( + ObjectLibrary::PatternEntry("Z", false).AddSeparator(""), + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new TestCustomizable(name)); + return guard->get(); + }); + + ConfigOptions ignore = config_options_; + ignore.ignore_unsupported_options = false; + ignore.ignore_unsupported_options = false; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1;x=y", &unique)); + ASSERT_NE(unique, nullptr); + ASSERT_EQ(unique->GetId(), "Z=1;x=y"); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z;x=y", &unique)); + ASSERT_NE(unique, nullptr); + ASSERT_EQ(unique->GetId(), "Z;x=y"); + unique.reset(); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "Z=1?x=y", &unique)); + ASSERT_NE(unique, nullptr); + ASSERT_EQ(unique->GetId(), "Z=1?x=y"); +} + +TEST_F(CustomizableTest, MutableOptionsTest) { + static std::unordered_map mutable_option_info = { + {"mutable", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}}; + static std::unordered_map immutable_option_info = + {{"immutable", + 
OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kAllowNull)}}; + + class MutableCustomizable : public Customizable { + private: + std::shared_ptr mutable_; + std::shared_ptr immutable_; + + public: + MutableCustomizable() { + RegisterOptions("mutable", &mutable_, &mutable_option_info); + RegisterOptions("immutable", &immutable_, &immutable_option_info); + } + const char* Name() const override { return "MutableCustomizable"; } + }; + MutableCustomizable mc, mc2; + std::string mismatch; + std::string opt_str; + + ConfigOptions options = config_options_; + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=B;}")); + options.mutable_options_only = true; + ASSERT_OK(mc.GetOptionString(options, &opt_str)); + ASSERT_OK(mc2.ConfigureFromString(options, opt_str)); + ASSERT_TRUE(mc.AreEquivalent(options, &mc2, &mismatch)); + + options.mutable_options_only = false; + ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=A; int=10}")); + auto* mm = mc.GetOptions>("mutable"); + auto* im = mc.GetOptions>("immutable"); + ASSERT_NE(mm, nullptr); + ASSERT_NE(mm->get(), nullptr); + ASSERT_NE(im, nullptr); + ASSERT_NE(im->get(), nullptr); + + // Now only deal with mutable options + options.mutable_options_only = true; + + // Setting nested immutable customizable options fails + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{id=B;}")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.id", "B")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.bool", "true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "bool=true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{int=11;bool=true}")); + auto* im_a = im->get()->GetOptions("A"); + ASSERT_NE(im_a, nullptr); + ASSERT_EQ(im_a->i, 10); + ASSERT_EQ(im_a->b, false); + + // Setting nested mutable customizable options succeeds but the object did not + // change + ASSERT_OK(mc.ConfigureOption(options, "immutable.int", "11")); + ASSERT_EQ(im_a->i, 11); + 
ASSERT_EQ(im_a, im->get()->GetOptions("A")); + + // The mutable configurable itself can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable.id", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=A}")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + + // The Nested options in the mutable object can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + auto* mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->b, true); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{int=22;bool=false}")); + mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->i, 22); + ASSERT_EQ(mm_a->b, false); + + // Only the mutable options should get serialized + options.mutable_options_only = false; + ASSERT_OK(mc.GetOptionString(options, &opt_str)); + ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=B;}")); + options.mutable_options_only = true; + + ASSERT_OK(mc.GetOptionString(options, &opt_str)); + ASSERT_OK(mc2.ConfigureFromString(options, opt_str)); + ASSERT_TRUE(mc.AreEquivalent(options, &mc2, &mismatch)); + options.mutable_options_only = false; + ASSERT_FALSE(mc.AreEquivalent(options, &mc2, &mismatch)); + ASSERT_EQ(mismatch, "immutable"); +} + +TEST_F(CustomizableTest, CustomManagedObjects) { + std::shared_ptr object1, object2; + ASSERT_OK(LoadManagedObject( + config_options_, "id=A_1;int=1;bool=true", &object1)); + ASSERT_NE(object1, nullptr); + ASSERT_OK( + LoadManagedObject(config_options_, "A_1", &object2)); + ASSERT_EQ(object1, object2); + auto* opts = object2->GetOptions("A"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 1); + ASSERT_EQ(opts->b, true); + ASSERT_OK( + LoadManagedObject(config_options_, "A_2", &object2)); + ASSERT_NE(object1, object2); + object1.reset(); + ASSERT_OK(LoadManagedObject( + config_options_, "id=A_1;int=2;bool=false", &object1)); + opts = object1->GetOptions("A"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 
2); + ASSERT_EQ(opts->b, false); +} + +TEST_F(CustomizableTest, CreateManagedObjects) { + class ManagedCustomizable : public Customizable { + public: + static const char* Type() { return "ManagedCustomizable"; } + static const char* kClassName() { return "Managed"; } + const char* Name() const override { return kClassName(); } + std::string GetId() const override { return id_; } + ManagedCustomizable() { id_ = GenerateIndividualId(); } + static Status CreateFromString( + const ConfigOptions& opts, const std::string& value, + std::shared_ptr* result) { + return LoadManagedObject(opts, value, result); + } + + private: + std::string id_; + }; + + config_options_.registry->AddLibrary("Managed") + ->AddFactory( + ObjectLibrary::PatternEntry::AsIndividualId( + ManagedCustomizable::kClassName()), + [](const std::string& /*name*/, + std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new ManagedCustomizable()); + return guard->get(); + }); + + std::shared_ptr mc1, mc2, mc3, obj; + // Create a "deadbeef" customizable + std::string deadbeef = + std::string(ManagedCustomizable::kClassName()) + "@0xdeadbeef#0001"; + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &mc1)); + // Create an object with the base/class name + ASSERT_OK(ManagedCustomizable::CreateFromString( + config_options_, ManagedCustomizable::kClassName(), &mc2)); + // Creating another with the base name returns a different object + ASSERT_OK(ManagedCustomizable::CreateFromString( + config_options_, ManagedCustomizable::kClassName(), &mc3)); + // At this point, there should be 4 managed objects (deadbeef, mc1, 2, and 3) + std::vector> objects; + ASSERT_OK(config_options_.registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 4U); + objects.clear(); + // Three separate object, none of them equal + ASSERT_NE(mc1, mc2); + ASSERT_NE(mc1, mc3); + ASSERT_NE(mc2, mc3); + + // Creating another object with "deadbeef" object + ASSERT_OK( + 
ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj)); + ASSERT_EQ(mc1, obj); + // Create another with the IDs of the instances + ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc1->GetId(), + &obj)); + ASSERT_EQ(mc1, obj); + ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc2->GetId(), + &obj)); + ASSERT_EQ(mc2, obj); + ASSERT_OK(ManagedCustomizable::CreateFromString(config_options_, mc3->GetId(), + &obj)); + ASSERT_EQ(mc3, obj); + + // Now get rid of deadbeef. 2 Objects left (m2+m3) + mc1.reset(); + ASSERT_EQ( + config_options_.registry->GetManagedObject(deadbeef), + nullptr); + ASSERT_OK(config_options_.registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 2U); + objects.clear(); + + // Associate deadbeef with #2 + ASSERT_OK(config_options_.registry->SetManagedObject(deadbeef, mc2)); + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj)); + ASSERT_EQ(mc2, obj); + obj.reset(); + + // Get the ID of mc2 and then reset it. 1 Object left + std::string mc2id = mc2->GetId(); + mc2.reset(); + ASSERT_EQ( + config_options_.registry->GetManagedObject(mc2id), + nullptr); + ASSERT_OK(config_options_.registry->ListManagedObjects(&objects)); + ASSERT_EQ(objects.size(), 1U); + objects.clear(); + + // Create another object with the old mc2id. 
+ ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, mc2id, &mc2)); + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, mc2id, &obj)); + ASSERT_EQ(mc2, obj); + + // For good measure, create another deadbeef object + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &mc1)); + ASSERT_OK( + ManagedCustomizable::CreateFromString(config_options_, deadbeef, &obj)); + ASSERT_EQ(mc1, obj); +} + +#endif // !ROCKSDB_LITE + +namespace { +class TestSecondaryCache : public SecondaryCache { + public: + static const char* kClassName() { return "Test"; } + const char* Name() const override { return kClassName(); } + Status Insert(const Slice& /*key*/, void* /*value*/, + const Cache::CacheItemHelper* /*helper*/) override { + return Status::NotSupported(); + } + std::unique_ptr Lookup( + const Slice& /*key*/, const Cache::CreateCallback& /*create_cb*/, + bool /*wait*/) override { + return nullptr; + } + void Erase(const Slice& /*key*/) override {} + + // Wait for a collection of handles to become ready + void WaitAll(std::vector /*handles*/) override {} + + std::string GetPrintableOptions() const override { return ""; } +}; + +class TestStatistics : public StatisticsImpl { + public: + TestStatistics() : StatisticsImpl(nullptr) {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Test"; } +}; + +class TestFlushBlockPolicyFactory : public FlushBlockPolicyFactory { + public: + TestFlushBlockPolicyFactory() {} + + static const char* kClassName() { return "TestFlushBlockPolicyFactory"; } + const char* Name() const override { return kClassName(); } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { + return nullptr; + } +}; + +class MockSliceTransform : public SliceTransform { + public: + const char* Name() const override { return kClassName(); } + static const char* 
kClassName() { return "Mock"; } + + Slice Transform(const Slice& /*key*/) const override { return Slice(); } + + bool InDomain(const Slice& /*key*/) const override { return false; } + + bool InRange(const Slice& /*key*/) const override { return false; } +}; + +class MockMemoryAllocator : public BaseMemoryAllocator { + public: + static const char* kClassName() { return "MockMemoryAllocator"; } + const char* Name() const override { return kClassName(); } +}; + +#ifndef ROCKSDB_LITE +class MockEncryptionProvider : public EncryptionProvider { + public: + explicit MockEncryptionProvider(const std::string& id) : id_(id) {} + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + size_t GetPrefixLength() const override { return 0; } + Status CreateNewPrefix(const std::string& /*fname*/, char* /*prefix*/, + size_t /*prefixLength*/) const override { + return Status::NotSupported(); + } + + Status AddCipher(const std::string& /*descriptor*/, const char* /*cipher*/, + size_t /*len*/, bool /*for_write*/) override { + return Status::NotSupported(); + } + + Status CreateCipherStream( + const std::string& /*fname*/, const EnvOptions& /*options*/, + Slice& /*prefix*/, + std::unique_ptr* /*result*/) override { + return Status::NotSupported(); + } + Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (EndsWith(id_, "://test")) { + return EncryptionProvider::ValidateOptions(db_opts, cf_opts); + } else { + return Status::InvalidArgument("MockProvider not initialized"); + } + } + + private: + std::string id_; +}; + +class MockCipher : public BlockCipher { + public: + const char* Name() const override { return "Mock"; } + size_t BlockSize() override { return 0; } + Status Encrypt(char* /*data*/) override { return Status::NotSupported(); } + Status Decrypt(char* data) override { return Encrypt(data); } +}; +#endif // ROCKSDB_LITE + +class DummyFileSystem : public 
FileSystemWrapper { + public: + explicit DummyFileSystem(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + static const char* kClassName() { return "DummyFileSystem"; } + const char* Name() const override { return kClassName(); } +}; + +#ifndef ROCKSDB_LITE + +#endif // ROCKSDB_LITE + +class MockTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + private: + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return nullptr; + } + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } +}; + +class MockSstPartitionerFactory : public SstPartitionerFactory { + public: + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override { + return nullptr; + } +}; + +class MockFileChecksumGenFactory : public FileChecksumGenFactory { + public: + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& /*context*/) override { + return nullptr; + } +}; + +class MockRateLimiter : public RateLimiter { + public: + static const char* kClassName() { return "MockRateLimiter"; } + const char* Name() const override { return kClassName(); } + void SetBytesPerSecond(int64_t /*bytes_per_second*/) override {} + int64_t GetBytesPerSecond() const override { return 0; } + int64_t GetSingleBurstBytes() const override { return 0; } + int64_t GetTotalBytesThrough(const Env::IOPriority /*pri*/) const override { + return 0; + } + int64_t GetTotalRequests(const Env::IOPriority /*pri*/) const override { + return 0; + } +}; + +#ifndef ROCKSDB_LITE +static int RegisterLocalObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t 
num_types; + library.AddFactory( + mock::MockTableFactory::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new mock::MockTableFactory()); + return guard->get(); + }); + library.AddFactory( + OnFileDeletionListener::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new OnFileDeletionListener()); + return guard->get(); + }); + library.AddFactory( + FlushCounterListener::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FlushCounterListener()); + return guard->get(); + }); + // Load any locally defined objects here + library.AddFactory( + MockSliceTransform::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSliceTransform()); + return guard->get(); + }); + library.AddFactory( + TestStatistics::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TestStatistics()); + return guard->get(); + }); + + library.AddFactory( + ObjectLibrary::PatternEntry(MockEncryptionProvider::kClassName(), true) + .AddSuffix("://test"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockEncryptionProvider(uri)); + return guard->get(); + }); + library.AddFactory( + "Mock", + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockCipher()); + return guard->get(); + }); + library.AddFactory( + MockMemoryAllocator::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockMemoryAllocator()); + return guard->get(); + }); + library.AddFactory( + TestFlushBlockPolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + 
guard->reset(new TestFlushBlockPolicyFactory()); + return guard->get(); + }); + + library.AddFactory( + TestSecondaryCache::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TestSecondaryCache()); + return guard->get(); + }); + + library.AddFactory( + DummyFileSystem::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new DummyFileSystem(nullptr)); + return guard->get(); + }); + + library.AddFactory( + MockSstPartitionerFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSstPartitionerFactory()); + return guard->get(); + }); + + library.AddFactory( + MockFileChecksumGenFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockFileChecksumGenFactory()); + return guard->get(); + }); + + library.AddFactory( + MockTablePropertiesCollectorFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockTablePropertiesCollectorFactory()); + return guard->get(); + }); + + library.AddFactory( + MockRateLimiter::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockRateLimiter()); + return guard->get(); + }); + + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // !ROCKSDB_LITE +} // namespace + +class LoadCustomizableTest : public testing::Test { + public: + LoadCustomizableTest() { + config_options_.ignore_unsupported_options = false; + config_options_.invoke_prepare_options = false; + } + bool RegisterTests(const std::string& arg) { +#ifndef ROCKSDB_LITE + config_options_.registry->AddLibrary("custom-tests", + test::RegisterTestObjects, arg); + config_options_.registry->AddLibrary("local-tests", RegisterLocalObjects, + 
arg); + return true; +#else + (void)arg; + return false; +#endif // !ROCKSDB_LITE + } + + protected: + DBOptions db_opts_; + ColumnFamilyOptions cf_opts_; + ConfigOptions config_options_; +}; + +TEST_F(LoadCustomizableTest, LoadTableFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(TableFactory::CreateFromString( + config_options_, mock::MockTableFactory::kClassName(), &factory)); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, TableFactory::kBlockBasedTableName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), TableFactory::kBlockBasedTableName()); +#ifndef ROCKSDB_LITE + std::string opts_str = "table_factory="; + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options_, cf_opts_, + opts_str + TableFactory::kBlockBasedTableName(), &cf_opts_)); + ASSERT_NE(cf_opts_.table_factory.get(), nullptr); + ASSERT_STREQ(cf_opts_.table_factory->Name(), + TableFactory::kBlockBasedTableName()); +#endif // ROCKSDB_LITE + if (RegisterTests("Test")) { + ASSERT_OK(TableFactory::CreateFromString( + config_options_, mock::MockTableFactory::kClassName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), mock::MockTableFactory::kClassName()); +#ifndef ROCKSDB_LITE + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options_, cf_opts_, + opts_str + mock::MockTableFactory::kClassName(), &cf_opts_)); + ASSERT_NE(cf_opts_.table_factory.get(), nullptr); + ASSERT_STREQ(cf_opts_.table_factory->Name(), + mock::MockTableFactory::kClassName()); +#endif // ROCKSDB_LITE + } +} + +TEST_F(LoadCustomizableTest, LoadFileSystemTest) { + ColumnFamilyOptions cf_opts; + std::shared_ptr result; + ASSERT_NOK(FileSystem::CreateFromString( + config_options_, DummyFileSystem::kClassName(), &result)); + ASSERT_OK(FileSystem::CreateFromString(config_options_, + FileSystem::kDefaultName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_TRUE(result->IsInstanceOf(FileSystem::kDefaultName())); + if (RegisterTests("Test")) { + 
ASSERT_OK(FileSystem::CreateFromString( + config_options_, DummyFileSystem::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), DummyFileSystem::kClassName()); + ASSERT_FALSE(result->IsInstanceOf(FileSystem::kDefaultName())); + } +} + +TEST_F(LoadCustomizableTest, LoadSecondaryCacheTest) { + std::shared_ptr result; + ASSERT_NOK(SecondaryCache::CreateFromString( + config_options_, TestSecondaryCache::kClassName(), &result)); + if (RegisterTests("Test")) { + ASSERT_OK(SecondaryCache::CreateFromString( + config_options_, TestSecondaryCache::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), TestSecondaryCache::kClassName()); + } +} + +#ifndef ROCKSDB_LITE +TEST_F(LoadCustomizableTest, LoadSstPartitionerFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(SstPartitionerFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_OK(SstPartitionerFactory::CreateFromString( + config_options_, SstPartitionerFixedPrefixFactory::kClassName(), + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), SstPartitionerFixedPrefixFactory::kClassName()); + + if (RegisterTests("Test")) { + ASSERT_OK(SstPartitionerFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), "Mock"); + } +} +#endif // ROCKSDB_LITE + +TEST_F(LoadCustomizableTest, LoadChecksumGenFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(FileChecksumGenFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_OK(FileChecksumGenFactory::CreateFromString( + config_options_, FileChecksumGenCrc32cFactory::kClassName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), FileChecksumGenCrc32cFactory::kClassName()); + + if (RegisterTests("Test")) { + ASSERT_OK(FileChecksumGenFactory::CreateFromString(config_options_, "Mock", + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), 
"Mock"); + } +} + +TEST_F(LoadCustomizableTest, LoadTablePropertiesCollectorFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK(TablePropertiesCollectorFactory::CreateFromString( + config_options_, MockTablePropertiesCollectorFactory::kClassName(), + &factory)); + if (RegisterTests("Test")) { + ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString( + config_options_, MockTablePropertiesCollectorFactory::kClassName(), + &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), + MockTablePropertiesCollectorFactory::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadComparatorTest) { + const Comparator* bytewise = BytewiseComparator(); + const Comparator* reverse = ReverseBytewiseComparator(); + + const Comparator* result = nullptr; + ASSERT_NOK(Comparator::CreateFromString( + config_options_, test::SimpleSuffixReverseComparator::kClassName(), + &result)); + ASSERT_OK( + Comparator::CreateFromString(config_options_, bytewise->Name(), &result)); + ASSERT_EQ(result, bytewise); + ASSERT_OK( + Comparator::CreateFromString(config_options_, reverse->Name(), &result)); + ASSERT_EQ(result, reverse); + + if (RegisterTests("Test")) { + ASSERT_OK(Comparator::CreateFromString( + config_options_, test::SimpleSuffixReverseComparator::kClassName(), + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), + test::SimpleSuffixReverseComparator::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadSliceTransformFactoryTest) { + std::shared_ptr result; + ASSERT_NOK( + SliceTransform::CreateFromString(config_options_, "Mock", &result)); + ASSERT_OK( + SliceTransform::CreateFromString(config_options_, "fixed:16", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("fixed")); + ASSERT_OK(SliceTransform::CreateFromString( + config_options_, "rocksdb.FixedPrefix.22", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("fixed")); + + ASSERT_OK( + 
SliceTransform::CreateFromString(config_options_, "capped:16", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("capped")); + + ASSERT_OK(SliceTransform::CreateFromString( + config_options_, "rocksdb.CappedPrefix.11", &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf("capped")); + + if (RegisterTests("Test")) { + ASSERT_OK( + SliceTransform::CreateFromString(config_options_, "Mock", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + } +} + +TEST_F(LoadCustomizableTest, LoadStatisticsTest) { + std::shared_ptr stats; + ASSERT_NOK(Statistics::CreateFromString( + config_options_, TestStatistics::kClassName(), &stats)); + ASSERT_OK( + Statistics::CreateFromString(config_options_, "BasicStatistics", &stats)); + ASSERT_NE(stats, nullptr); + ASSERT_EQ(stats->Name(), std::string("BasicStatistics")); +#ifndef ROCKSDB_LITE + ASSERT_NOK(GetDBOptionsFromString(config_options_, db_opts_, + "statistics=Test", &db_opts_)); + ASSERT_OK(GetDBOptionsFromString(config_options_, db_opts_, + "statistics=BasicStatistics", &db_opts_)); + ASSERT_NE(db_opts_.statistics, nullptr); + ASSERT_STREQ(db_opts_.statistics->Name(), "BasicStatistics"); + + if (RegisterTests("test")) { + ASSERT_OK(Statistics::CreateFromString( + config_options_, TestStatistics::kClassName(), &stats)); + ASSERT_NE(stats, nullptr); + ASSERT_STREQ(stats->Name(), TestStatistics::kClassName()); + + ASSERT_OK(GetDBOptionsFromString(config_options_, db_opts_, + "statistics=Test", &db_opts_)); + ASSERT_NE(db_opts_.statistics, nullptr); + ASSERT_STREQ(db_opts_.statistics->Name(), TestStatistics::kClassName()); + + ASSERT_OK(GetDBOptionsFromString( + config_options_, db_opts_, "statistics={id=Test;inner=BasicStatistics}", + &db_opts_)); + ASSERT_NE(db_opts_.statistics, nullptr); + ASSERT_STREQ(db_opts_.statistics->Name(), TestStatistics::kClassName()); + auto* inner = db_opts_.statistics->GetOptions>( + "StatisticsOptions"); + 
ASSERT_NE(inner, nullptr); + ASSERT_NE(inner->get(), nullptr); + ASSERT_STREQ(inner->get()->Name(), "BasicStatistics"); + + ASSERT_OK(Statistics::CreateFromString( + config_options_, "id=BasicStatistics;inner=Test", &stats)); + ASSERT_NE(stats, nullptr); + ASSERT_STREQ(stats->Name(), "BasicStatistics"); + inner = stats->GetOptions>("StatisticsOptions"); + ASSERT_NE(inner, nullptr); + ASSERT_NE(inner->get(), nullptr); + ASSERT_STREQ(inner->get()->Name(), TestStatistics::kClassName()); + } +#endif +} + +TEST_F(LoadCustomizableTest, LoadMemTableRepFactoryTest) { + std::unique_ptr result; + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options_, "SpecialSkipListFactory", &result)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options_, SkipListFactory::kClassName(), &result)); + ASSERT_NE(result.get(), nullptr); + ASSERT_TRUE(result->IsInstanceOf(SkipListFactory::kClassName())); + + if (RegisterTests("Test")) { + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options_, "SpecialSkipListFactory", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "SpecialSkipListFactory"); + } +} + +TEST_F(LoadCustomizableTest, LoadMergeOperatorTest) { + std::shared_ptr result; + + ASSERT_NOK( + MergeOperator::CreateFromString(config_options_, "Changling", &result)); + //**TODO: MJR: Use the constants when these names are in public classes + ASSERT_OK(MergeOperator::CreateFromString(config_options_, "put", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "PutOperator"); + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "PutOperator", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "PutOperator"); + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "put_v1", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "PutOperator"); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "uint64add", &result)); + ASSERT_NE(result, 
nullptr); + ASSERT_STREQ(result->Name(), "UInt64AddOperator"); + ASSERT_OK(MergeOperator::CreateFromString(config_options_, + "UInt64AddOperator", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "UInt64AddOperator"); + + ASSERT_OK(MergeOperator::CreateFromString(config_options_, "max", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "MaxOperator"); + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "MaxOperator", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "MaxOperator"); +#ifndef ROCKSDB_LITE + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendOperator::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendOperator::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendOperator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendOperator::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendTESTOperator::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendTESTOperator::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, StringAppendTESTOperator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), StringAppendTESTOperator::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString(config_options_, + SortList::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), SortList::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString(config_options_, + SortList::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), SortList::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, BytesXOROperator::kNickName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), 
BytesXOROperator::kClassName()); + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, BytesXOROperator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), BytesXOROperator::kClassName()); +#endif // ROCKSDB_LITE + ASSERT_NOK( + MergeOperator::CreateFromString(config_options_, "Changling", &result)); + if (RegisterTests("Test")) { + ASSERT_OK( + MergeOperator::CreateFromString(config_options_, "Changling", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ChanglingMergeOperator"); + } +} + +TEST_F(LoadCustomizableTest, LoadCompactionFilterFactoryTest) { + std::shared_ptr result; + + ASSERT_NOK(CompactionFilterFactory::CreateFromString(config_options_, + "Changling", &result)); + if (RegisterTests("Test")) { + ASSERT_OK(CompactionFilterFactory::CreateFromString(config_options_, + "Changling", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ChanglingCompactionFilterFactory"); + } +} + +TEST_F(LoadCustomizableTest, LoadCompactionFilterTest) { + const CompactionFilter* result = nullptr; + + ASSERT_NOK(CompactionFilter::CreateFromString(config_options_, "Changling", + &result)); +#ifndef ROCKSDB_LITE + ASSERT_OK(CompactionFilter::CreateFromString( + config_options_, RemoveEmptyValueCompactionFilter::kClassName(), + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), RemoveEmptyValueCompactionFilter::kClassName()); + delete result; + result = nullptr; + if (RegisterTests("Test")) { + ASSERT_OK(CompactionFilter::CreateFromString(config_options_, "Changling", + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ChanglingCompactionFilter"); + delete result; + } +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +TEST_F(LoadCustomizableTest, LoadEventListenerTest) { + std::shared_ptr result; + + ASSERT_NOK(EventListener::CreateFromString( + config_options_, OnFileDeletionListener::kClassName(), &result)); + 
ASSERT_NOK(EventListener::CreateFromString( + config_options_, FlushCounterListener::kClassName(), &result)); + if (RegisterTests("Test")) { + ASSERT_OK(EventListener::CreateFromString( + config_options_, OnFileDeletionListener::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), OnFileDeletionListener::kClassName()); + ASSERT_OK(EventListener::CreateFromString( + config_options_, FlushCounterListener::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushCounterListener::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadEncryptionProviderTest) { + std::shared_ptr result; + ASSERT_NOK( + EncryptionProvider::CreateFromString(config_options_, "Mock", &result)); + ASSERT_OK( + EncryptionProvider::CreateFromString(config_options_, "CTR", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "CTR"); + ASSERT_NOK(result->ValidateOptions(db_opts_, cf_opts_)); + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, "CTR://test", + &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "CTR"); + ASSERT_OK(result->ValidateOptions(db_opts_, cf_opts_)); + + if (RegisterTests("Test")) { + ASSERT_OK( + EncryptionProvider::CreateFromString(config_options_, "Mock", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + ASSERT_OK(EncryptionProvider::CreateFromString(config_options_, + "Mock://test", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + ASSERT_OK(result->ValidateOptions(db_opts_, cf_opts_)); + } +} + +TEST_F(LoadCustomizableTest, LoadEncryptionCipherTest) { + std::shared_ptr result; + ASSERT_NOK(BlockCipher::CreateFromString(config_options_, "Mock", &result)); + ASSERT_OK(BlockCipher::CreateFromString(config_options_, "ROT13", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "ROT13"); + if (RegisterTests("Test")) { + 
ASSERT_OK(BlockCipher::CreateFromString(config_options_, "Mock", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), "Mock"); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(LoadCustomizableTest, LoadSystemClockTest) { + std::shared_ptr result; + ASSERT_NOK(SystemClock::CreateFromString( + config_options_, MockSystemClock::kClassName(), &result)); + ASSERT_OK(SystemClock::CreateFromString( + config_options_, SystemClock::kDefaultName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_TRUE(result->IsInstanceOf(SystemClock::kDefaultName())); + if (RegisterTests("Test")) { + ASSERT_OK(SystemClock::CreateFromString( + config_options_, MockSystemClock::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), MockSystemClock::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadMemoryAllocatorTest) { + std::shared_ptr result; + ASSERT_NOK(MemoryAllocator::CreateFromString( + config_options_, MockMemoryAllocator::kClassName(), &result)); + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, DefaultMemoryAllocator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), DefaultMemoryAllocator::kClassName()); + if (RegisterTests("Test")) { + ASSERT_OK(MemoryAllocator::CreateFromString( + config_options_, MockMemoryAllocator::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), MockMemoryAllocator::kClassName()); + } +} + +TEST_F(LoadCustomizableTest, LoadRateLimiterTest) { + std::shared_ptr result; + ASSERT_NOK(RateLimiter::CreateFromString( + config_options_, MockRateLimiter::kClassName(), &result)); + ASSERT_OK(RateLimiter::CreateFromString( + config_options_, std::string(GenericRateLimiter::kClassName()) + ":1234", + &result)); + ASSERT_NE(result, nullptr); +#ifndef ROCKSDB_LITE + ASSERT_OK(RateLimiter::CreateFromString( + config_options_, GenericRateLimiter::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_OK(GetDBOptionsFromString( 
+ config_options_, db_opts_, + std::string("rate_limiter=") + GenericRateLimiter::kClassName(), + &db_opts_)); + ASSERT_NE(db_opts_.rate_limiter, nullptr); + if (RegisterTests("Test")) { + ASSERT_OK(RateLimiter::CreateFromString( + config_options_, MockRateLimiter::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_OK(GetDBOptionsFromString( + config_options_, db_opts_, + std::string("rate_limiter=") + MockRateLimiter::kClassName(), + &db_opts_)); + ASSERT_NE(db_opts_.rate_limiter, nullptr); + } +#endif // ROCKSDB_LITE +} + +TEST_F(LoadCustomizableTest, LoadFlushBlockPolicyFactoryTest) { + std::shared_ptr table; + std::shared_ptr result; + ASSERT_NOK(FlushBlockPolicyFactory::CreateFromString( + config_options_, TestFlushBlockPolicyFactory::kClassName(), &result)); + + ASSERT_OK( + FlushBlockPolicyFactory::CreateFromString(config_options_, "", &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushBlockBySizePolicyFactory::kClassName()); + + ASSERT_OK(FlushBlockPolicyFactory::CreateFromString( + config_options_, FlushBlockEveryKeyPolicyFactory::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushBlockEveryKeyPolicyFactory::kClassName()); + + ASSERT_OK(FlushBlockPolicyFactory::CreateFromString( + config_options_, FlushBlockBySizePolicyFactory::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), FlushBlockBySizePolicyFactory::kClassName()); +#ifndef ROCKSDB_LITE + std::string table_opts = "id=BlockBasedTable; flush_block_policy_factory="; + ASSERT_OK(TableFactory::CreateFromString( + config_options_, + table_opts + FlushBlockEveryKeyPolicyFactory::kClassName(), &table)); + auto bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->flush_block_policy_factory.get(), nullptr); + ASSERT_STREQ(bbto->flush_block_policy_factory->Name(), + FlushBlockEveryKeyPolicyFactory::kClassName()); + if (RegisterTests("Test")) { + 
ASSERT_OK(FlushBlockPolicyFactory::CreateFromString( + config_options_, TestFlushBlockPolicyFactory::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), TestFlushBlockPolicyFactory::kClassName()); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + TestFlushBlockPolicyFactory::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->flush_block_policy_factory.get(), nullptr); + ASSERT_STREQ(bbto->flush_block_policy_factory->Name(), + TestFlushBlockPolicyFactory::kClassName()); + } +#endif // ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,16 +7,650 @@ #include -#include "db/version_edit.h" #include "logging/logging.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" +#include "options/options_parser.h" #include "port/port.h" -#include "rocksdb/cache.h" +#include "rocksdb/configurable.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/options_type.h" #include "rocksdb/wal_filter.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map + wal_recovery_mode_string_map = { + 
{"kTolerateCorruptedTailRecords", + WALRecoveryMode::kTolerateCorruptedTailRecords}, + {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency}, + {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery}, + {"kSkipAnyCorruptedRecords", + WALRecoveryMode::kSkipAnyCorruptedRecords}}; + +static std::unordered_map + access_hint_string_map = {{"NONE", DBOptions::AccessHint::NONE}, + {"NORMAL", DBOptions::AccessHint::NORMAL}, + {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, + {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; + +static std::unordered_map cache_tier_string_map = { + {"kVolatileTier", CacheTier::kVolatileTier}, + {"kNonVolatileBlockTier", CacheTier::kNonVolatileBlockTier}}; + +static std::unordered_map info_log_level_string_map = + {{"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL}, + {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL}, + {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL}, + {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL}, + {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, + {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; + +static std::unordered_map + db_mutable_options_type_info = { + {"allow_os_buffer", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kMutable}}, + {"max_background_jobs", + {offsetof(struct MutableDBOptions, max_background_jobs), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_background_compactions", + {offsetof(struct MutableDBOptions, max_background_compactions), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"base_background_compactions", + {offsetof(struct MutableDBOptions, base_background_compactions), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_subcompactions", + {offsetof(struct MutableDBOptions, max_subcompactions), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"avoid_flush_during_shutdown", + {offsetof(struct 
MutableDBOptions, avoid_flush_during_shutdown), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"writable_file_max_buffer_size", + {offsetof(struct MutableDBOptions, writable_file_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"delayed_write_rate", + {offsetof(struct MutableDBOptions, delayed_write_rate), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_total_wal_size", + {offsetof(struct MutableDBOptions, max_total_wal_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"delete_obsolete_files_period_micros", + {offsetof(struct MutableDBOptions, + delete_obsolete_files_period_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_dump_period_sec", + {offsetof(struct MutableDBOptions, stats_dump_period_sec), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_persist_period_sec", + {offsetof(struct MutableDBOptions, stats_persist_period_sec), + OptionType::kUInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"stats_history_buffer_size", + {offsetof(struct MutableDBOptions, stats_history_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_open_files", + {offsetof(struct MutableDBOptions, max_open_files), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bytes_per_sync", + {offsetof(struct MutableDBOptions, bytes_per_sync), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"wal_bytes_per_sync", + {offsetof(struct MutableDBOptions, wal_bytes_per_sync), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"strict_bytes_per_sync", + {offsetof(struct MutableDBOptions, strict_bytes_per_sync), + 
OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"compaction_readahead_size", + {offsetof(struct MutableDBOptions, compaction_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"max_background_flushes", + {offsetof(struct MutableDBOptions, max_background_flushes), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + db_immutable_options_type_info = { + /* + // not yet supported + std::shared_ptr row_cache; + std::shared_ptr delete_scheduler; + std::shared_ptr info_log; + std::shared_ptr rate_limiter; + std::shared_ptr statistics; + std::vector db_paths; + FileTypeSet checksum_handoff_file_types; + */ + {"advise_random_on_open", + {offsetof(struct ImmutableDBOptions, advise_random_on_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_mmap_reads", + {offsetof(struct ImmutableDBOptions, allow_mmap_reads), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_fallocate", + {offsetof(struct ImmutableDBOptions, allow_fallocate), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_mmap_writes", + {offsetof(struct ImmutableDBOptions, allow_mmap_writes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_direct_reads", + {offsetof(struct ImmutableDBOptions, use_direct_reads), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_direct_writes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"use_direct_io_for_flush_and_compaction", + {offsetof(struct ImmutableDBOptions, + use_direct_io_for_flush_and_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_2pc", + {offsetof(struct ImmutableDBOptions, 
allow_2pc), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"wal_filter", + OptionTypeInfo::AsCustomRawPtr( + offsetof(struct ImmutableDBOptions, wal_filter), + OptionVerificationType::kByName, + (OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever))}, + {"create_if_missing", + {offsetof(struct ImmutableDBOptions, create_if_missing), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"create_missing_column_families", + {offsetof(struct ImmutableDBOptions, create_missing_column_families), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"disableDataSync", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"disable_data_sync", // for compatibility + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"enable_thread_tracking", + {offsetof(struct ImmutableDBOptions, enable_thread_tracking), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"error_if_exists", + {offsetof(struct ImmutableDBOptions, error_if_exists), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"experimental_allow_mempurge", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"experimental_mempurge_policy", + {0, OptionType::kString, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"experimental_mempurge_threshold", + {offsetof(struct ImmutableDBOptions, experimental_mempurge_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_fd_close_on_exec", + {offsetof(struct ImmutableDBOptions, is_fd_close_on_exec), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_checks", + {offsetof(struct ImmutableDBOptions, paranoid_checks), + OptionType::kBoolean, 
OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"flush_verify_memtable_count", + {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"track_and_verify_wals_in_manifest", + {offsetof(struct ImmutableDBOptions, + track_and_verify_wals_in_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"skip_log_error_on_recovery", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"skip_stats_update_on_db_open", + {offsetof(struct ImmutableDBOptions, skip_stats_update_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"skip_checking_sst_file_sizes_on_db_open", + {offsetof(struct ImmutableDBOptions, + skip_checking_sst_file_sizes_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"new_table_reader_for_compaction_inputs", + {offsetof(struct ImmutableDBOptions, + new_table_reader_for_compaction_inputs), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"random_access_max_buffer_size", + {offsetof(struct ImmutableDBOptions, random_access_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_adaptive_mutex", + {offsetof(struct ImmutableDBOptions, use_adaptive_mutex), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_fsync", + {offsetof(struct ImmutableDBOptions, use_fsync), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"max_file_opening_threads", + {offsetof(struct ImmutableDBOptions, max_file_opening_threads), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"table_cache_numshardbits", + {offsetof(struct ImmutableDBOptions, table_cache_numshardbits), + OptionType::kInt, 
OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_write_buffer_size", + {offsetof(struct ImmutableDBOptions, db_write_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"keep_log_file_num", + {offsetof(struct ImmutableDBOptions, keep_log_file_num), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"recycle_log_file_num", + {offsetof(struct ImmutableDBOptions, recycle_log_file_num), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_file_time_to_roll", + {offsetof(struct ImmutableDBOptions, log_file_time_to_roll), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"manifest_preallocation_size", + {offsetof(struct ImmutableDBOptions, manifest_preallocation_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_log_file_size", + {offsetof(struct ImmutableDBOptions, max_log_file_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_log_dir", + {offsetof(struct ImmutableDBOptions, db_log_dir), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"wal_dir", + {offsetof(struct ImmutableDBOptions, wal_dir), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"WAL_size_limit_MB", + {offsetof(struct ImmutableDBOptions, WAL_size_limit_MB), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"WAL_ttl_seconds", + {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_manifest_file_size", + {offsetof(struct ImmutableDBOptions, max_manifest_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"persist_stats_to_disk", + {offsetof(struct ImmutableDBOptions, persist_stats_to_disk), + 
OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"fail_if_options_file_error", + {offsetof(struct ImmutableDBOptions, fail_if_options_file_error), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"enable_pipelined_write", + {offsetof(struct ImmutableDBOptions, enable_pipelined_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"unordered_write", + {offsetof(struct ImmutableDBOptions, unordered_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_concurrent_memtable_write", + {offsetof(struct ImmutableDBOptions, allow_concurrent_memtable_write), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"wal_recovery_mode", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, wal_recovery_mode), + &wal_recovery_mode_string_map)}, + {"enable_write_thread_adaptive_yield", + {offsetof(struct ImmutableDBOptions, + enable_write_thread_adaptive_yield), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_thread_slow_yield_usec", + {offsetof(struct ImmutableDBOptions, write_thread_slow_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_write_batch_group_size_bytes", + {offsetof(struct ImmutableDBOptions, max_write_batch_group_size_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_thread_max_yield_usec", + {offsetof(struct ImmutableDBOptions, write_thread_max_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"access_hint_on_compaction_start", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, + access_hint_on_compaction_start), + &access_hint_string_map)}, + {"info_log_level", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, info_log_level), + 
&info_log_level_string_map)}, + {"dump_malloc_stats", + {offsetof(struct ImmutableDBOptions, dump_malloc_stats), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"avoid_flush_during_recovery", + {offsetof(struct ImmutableDBOptions, avoid_flush_during_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"allow_ingest_behind", + {offsetof(struct ImmutableDBOptions, allow_ingest_behind), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"preserve_deletes", + {offsetof(struct ImmutableDBOptions, preserve_deletes), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"concurrent_prepare", // Deprecated by two_write_queues + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"two_write_queues", + {offsetof(struct ImmutableDBOptions, two_write_queues), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"manual_wal_flush", + {offsetof(struct ImmutableDBOptions, manual_wal_flush), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"seq_per_batch", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"atomic_flush", + {offsetof(struct ImmutableDBOptions, atomic_flush), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"avoid_unnecessary_blocking_io", + {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"write_dbid_to_manifest", + {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"log_readahead_size", + {offsetof(struct ImmutableDBOptions, log_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + 
OptionTypeFlags::kNone}}, + {"best_efforts_recovery", + {offsetof(struct ImmutableDBOptions, best_efforts_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_bgerror_resume_count", + {offsetof(struct ImmutableDBOptions, max_bgerror_resume_count), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bgerror_resume_retry_interval", + {offsetof(struct ImmutableDBOptions, bgerror_resume_retry_interval), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_host_id", + {offsetof(struct ImmutableDBOptions, db_host_id), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}}, + {"rate_limiter", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableDBOptions, rate_limiter), + OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kAllowNull)}, + + // The following properties were handled as special cases in ParseOption + // This means that the properties could be read from the options file + // but never written to the file or compared to each other. 
+ {"rate_limiter_bytes_per_sec", + {offsetof(struct ImmutableDBOptions, rate_limiter), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), + // Parse the input value as a RateLimiter + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + auto limiter = static_cast*>(addr); + limiter->reset(NewGenericRateLimiter( + static_cast(ParseUint64(value)))); + return Status::OK(); + }}}, + {"env", + {offsetof(struct ImmutableDBOptions, env), OptionType::kUnknown, + OptionVerificationType::kNormal, + (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), + // Parse the input value as an Env + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto old_env = static_cast(addr); // Get the old value + Env* new_env = *old_env; // Set new to old + Status s = Env::CreateFromString(opts, value, + &new_env); // Update new value + if (s.ok()) { // It worked + *old_env = new_env; // Update the old one + } + return s; + }}}, + {"allow_data_in_errors", + {offsetof(struct ImmutableDBOptions, allow_data_in_errors), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_checksum_gen_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct ImmutableDBOptions, file_checksum_gen_factory), + OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kAllowNull)}, + {"statistics", + OptionTypeInfo::AsCustomSharedPtr( + // Statistics should not be compared and can be null + // Statistics are maked "don't serialize" until they can be shared + // between DBs + offsetof(struct ImmutableDBOptions, statistics), + OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize | + OptionTypeFlags::kAllowNull)}, + // Allow EventListeners that have a non-empty Name() to be read/written + // as options Each listener will either 
be + // - A simple name (e.g. "MyEventListener") + // - A name with properties (e.g. "{id=MyListener1; timeout=60}" + // Multiple listeners will be separated by a ":": + // - "MyListener0;{id=MyListener1; timeout=60} + {"listeners", + {offsetof(struct ImmutableDBOptions, listeners), OptionType::kVector, + OptionVerificationType::kByNameAllowNull, + OptionTypeFlags::kCompareNever, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + ConfigOptions embedded = opts; + embedded.ignore_unsupported_options = true; + std::vector> listeners; + Status s; + for (size_t start = 0, end = 0; + s.ok() && start < value.size() && end != std::string::npos; + start = end + 1) { + std::string token; + s = OptionTypeInfo::NextToken(value, ':', start, &end, &token); + if (s.ok() && !token.empty()) { + std::shared_ptr listener; + s = EventListener::CreateFromString(embedded, token, &listener); + if (s.ok() && listener != nullptr) { + listeners.push_back(listener); + } + } + } + if (s.ok()) { // It worked + *(static_cast>*>( + addr)) = listeners; + } + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto listeners = + static_cast>*>( + addr); + ConfigOptions embedded = opts; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& listener : *listeners) { + auto id = listener->GetId(); + if (!id.empty()) { + std::string elem_str = listener->ToString(embedded, ""); + if (printed++ == 0) { + value->append("{"); + } else { + value->append(":"); + } + value->append(elem_str); + } + } + if (printed > 0) { + value->append("}"); + } + return Status::OK(); + }, + nullptr}}, + {"lowest_used_cache_tier", + OptionTypeInfo::Enum( + offsetof(struct ImmutableDBOptions, lowest_used_cache_tier), + &cache_tier_string_map, OptionTypeFlags::kNone)}, +}; + +const std::string OptionsHelper::kDBOptionsName = "DBOptions"; + +class MutableDBConfigurable : public Configurable 
{ + public: + explicit MutableDBConfigurable( + const MutableDBOptions& mdb, + const std::unordered_map* map = nullptr) + : mutable_(mdb), opt_map_(map) { + RegisterOptions(&mutable_, &db_mutable_options_type_info); + } + + bool OptionsAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const override { + bool equals = opt_info.AreEqual(config_options, opt_name, this_ptr, + that_ptr, mismatch); + if (!equals && opt_info.IsByName()) { + if (opt_map_ == nullptr) { + equals = true; + } else { + const auto& iter = opt_map_->find(opt_name); + if (iter == opt_map_->end()) { + equals = true; + } else { + equals = opt_info.AreEqualByName(config_options, opt_name, this_ptr, + iter->second); + } + } + if (equals) { // False alarm, clear mismatch + *mismatch = ""; + } + } + if (equals && opt_info.IsConfigurable() && opt_map_ != nullptr) { + const auto* this_config = opt_info.AsRawPointer(this_ptr); + if (this_config == nullptr) { + const auto& iter = opt_map_->find(opt_name); + // If the name exists in the map and is not empty/null, + // then the this_config should be set. + if (iter != opt_map_->end() && !iter->second.empty() && + iter->second != kNullptrString) { + *mismatch = opt_name; + equals = false; + } + } + } + return equals; + } + + protected: + MutableDBOptions mutable_; + const std::unordered_map* opt_map_; +}; + +class DBOptionsConfigurable : public MutableDBConfigurable { + public: + explicit DBOptionsConfigurable( + const DBOptions& opts, + const std::unordered_map* map = nullptr) + : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) { + // The ImmutableDBOptions currently requires the env to be non-null. 
Make + // sure it is + if (opts.env != nullptr) { + immutable_ = ImmutableDBOptions(opts); + } else { + DBOptions copy = opts; + copy.env = Env::Default(); + immutable_ = ImmutableDBOptions(copy); + } + RegisterOptions(&immutable_, &db_immutable_options_type_info); + } + + protected: + Status ConfigureOptions( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + std::unordered_map* unused) override { + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); + if (s.ok()) { + db_options_ = BuildDBOptions(immutable_, mutable_); + s = PrepareOptions(config_options); + } + return s; + } + + const void* GetOptionsPtr(const std::string& name) const override { + if (name == OptionsHelper::kDBOptionsName) { + return &db_options_; + } else { + return MutableDBConfigurable::GetOptionsPtr(name); + } + } + + private: + ImmutableDBOptions immutable_; + DBOptions db_options_; +}; + +std::unique_ptr DBOptionsAsConfigurable( + const MutableDBOptions& opts) { + std::unique_ptr ptr(new MutableDBConfigurable(opts)); + return ptr; +} +std::unique_ptr DBOptionsAsConfigurable( + const DBOptions& opts, + const std::unordered_map* opt_map) { + std::unique_ptr ptr(new DBOptionsConfigurable(opts, opt_map)); + return ptr; +} +#endif // ROCKSDB_LITE ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {} @@ -25,8 +659,10 @@ create_missing_column_families(options.create_missing_column_families), error_if_exists(options.error_if_exists), paranoid_checks(options.paranoid_checks), + flush_verify_memtable_count(options.flush_verify_memtable_count), + track_and_verify_wals_in_manifest( + options.track_and_verify_wals_in_manifest), env(options.env), - fs(options.file_system), rate_limiter(options.rate_limiter), sst_file_manager(options.sst_file_manager), info_log(options.info_log), @@ -37,16 +673,14 @@ db_paths(options.db_paths), db_log_dir(options.db_log_dir), wal_dir(options.wal_dir), - 
max_subcompactions(options.max_subcompactions), - max_background_flushes(options.max_background_flushes), max_log_file_size(options.max_log_file_size), log_file_time_to_roll(options.log_file_time_to_roll), keep_log_file_num(options.keep_log_file_num), recycle_log_file_num(options.recycle_log_file_num), max_manifest_file_size(options.max_manifest_file_size), table_cache_numshardbits(options.table_cache_numshardbits), - wal_ttl_seconds(options.WAL_ttl_seconds), - wal_size_limit_mb(options.WAL_size_limit_MB), + WAL_ttl_seconds(options.WAL_ttl_seconds), + WAL_size_limit_MB(options.WAL_size_limit_MB), max_write_batch_group_size_bytes( options.max_write_batch_group_size_bytes), manifest_preallocation_size(options.manifest_preallocation_size), @@ -58,6 +692,7 @@ allow_fallocate(options.allow_fallocate), is_fd_close_on_exec(options.is_fd_close_on_exec), advise_random_on_open(options.advise_random_on_open), + experimental_mempurge_threshold(options.experimental_mempurge_threshold), db_write_buffer_size(options.db_write_buffer_size), write_buffer_manager(options.write_buffer_manager), access_hint_on_compaction_start(options.access_hint_on_compaction_start), @@ -95,7 +730,24 @@ persist_stats_to_disk(options.persist_stats_to_disk), write_dbid_to_manifest(options.write_dbid_to_manifest), log_readahead_size(options.log_readahead_size), - sst_file_checksum_func(options.sst_file_checksum_func) { + file_checksum_gen_factory(options.file_checksum_gen_factory), + best_efforts_recovery(options.best_efforts_recovery), + max_bgerror_resume_count(options.max_bgerror_resume_count), + bgerror_resume_retry_interval(options.bgerror_resume_retry_interval), + allow_data_in_errors(options.allow_data_in_errors), + db_host_id(options.db_host_id), + checksum_handoff_file_types(options.checksum_handoff_file_types), + lowest_used_cache_tier(options.lowest_used_cache_tier), + compaction_service(options.compaction_service) { + stats = statistics.get(); + fs = env->GetFileSystem(); + if (env != 
nullptr) { + clock = env->GetSystemClock().get(); + } else { + clock = SystemClock::Default().get(); + } + logger = info_log.get(); + stats = statistics.get(); } void ImmutableDBOptions::Dump(Logger* log) const { @@ -105,6 +757,12 @@ create_if_missing); ROCKS_LOG_HEADER(log, " Options.paranoid_checks: %d", paranoid_checks); + ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d", + flush_verify_memtable_count); + ROCKS_LOG_HEADER(log, + " " + "Options.track_and_verify_wals_in_manifest: %d", + track_and_verify_wals_in_manifest); ROCKS_LOG_HEADER(log, " Options.env: %p", env); ROCKS_LOG_HEADER(log, " Options.fs: %s", @@ -114,7 +772,7 @@ ROCKS_LOG_HEADER(log, " Options.max_file_opening_threads: %d", max_file_opening_threads); ROCKS_LOG_HEADER(log, " Options.statistics: %p", - statistics.get()); + stats); ROCKS_LOG_HEADER(log, " Options.use_fsync: %d", use_fsync); ROCKS_LOG_HEADER( @@ -153,16 +811,11 @@ ROCKS_LOG_HEADER(log, " Options.table_cache_numshardbits: %d", table_cache_numshardbits); ROCKS_LOG_HEADER(log, - " Options.max_subcompactions: %" PRIu32, - max_subcompactions); - ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d", - max_background_flushes); - ROCKS_LOG_HEADER(log, " Options.WAL_ttl_seconds: %" PRIu64, - wal_ttl_seconds); + WAL_ttl_seconds); ROCKS_LOG_HEADER(log, " Options.WAL_size_limit_MB: %" PRIu64, - wal_size_limit_mb); + WAL_size_limit_MB); ROCKS_LOG_HEADER(log, " " "Options.max_write_batch_group_size_bytes: %" PRIu64, @@ -175,6 +828,9 @@ ROCKS_LOG_HEADER(log, " Options.advise_random_on_open: %d", advise_random_on_open); ROCKS_LOG_HEADER( + log, " Options.experimental_mempurge_threshold: %f", + experimental_mempurge_threshold); + ROCKS_LOG_HEADER( log, " Options.db_write_buffer_size: %" ROCKSDB_PRIszt, db_write_buffer_size); ROCKS_LOG_HEADER(log, " Options.write_buffer_manager: %p", @@ -246,16 +902,62 @@ ROCKS_LOG_HEADER( log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, log_readahead_size); - ROCKS_LOG_HEADER(log, " 
Options.sst_file_checksum_func: %s", - sst_file_checksum_func - ? sst_file_checksum_func->Name() - : kUnknownFileChecksumFuncName.c_str()); + ROCKS_LOG_HEADER(log, " Options.file_checksum_gen_factory: %s", + file_checksum_gen_factory ? file_checksum_gen_factory->Name() + : kUnknownFileChecksumFuncName); + ROCKS_LOG_HEADER(log, " Options.best_efforts_recovery: %d", + static_cast(best_efforts_recovery)); + ROCKS_LOG_HEADER(log, " Options.max_bgerror_resume_count: %d", + max_bgerror_resume_count); + ROCKS_LOG_HEADER(log, + " Options.bgerror_resume_retry_interval: %" PRIu64, + bgerror_resume_retry_interval); + ROCKS_LOG_HEADER(log, " Options.allow_data_in_errors: %d", + allow_data_in_errors); + ROCKS_LOG_HEADER(log, " Options.db_host_id: %s", + db_host_id.c_str()); +} + +bool ImmutableDBOptions::IsWalDirSameAsDBPath() const { + assert(!db_paths.empty()); + return IsWalDirSameAsDBPath(db_paths[0].path); +} + +bool ImmutableDBOptions::IsWalDirSameAsDBPath( + const std::string& db_path) const { + bool same = wal_dir.empty(); + if (!same) { + Status s = env->AreFilesSame(wal_dir, db_path, &same); + if (s.IsNotSupported()) { + same = wal_dir == db_path; + } + } + return same; +} + +const std::string& ImmutableDBOptions::GetWalDir() const { + if (wal_dir.empty()) { + assert(!db_paths.empty()); + return db_paths[0].path; + } else { + return wal_dir; + } +} + +const std::string& ImmutableDBOptions::GetWalDir( + const std::string& path) const { + if (wal_dir.empty()) { + return path; + } else { + return wal_dir; + } } MutableDBOptions::MutableDBOptions() : max_background_jobs(2), base_background_compactions(-1), max_background_compactions(-1), + max_subcompactions(0), avoid_flush_during_shutdown(false), writable_file_max_buffer_size(1024 * 1024), delayed_write_rate(2 * 1024U * 1024U), @@ -268,12 +970,14 @@ bytes_per_sync(0), wal_bytes_per_sync(0), strict_bytes_per_sync(false), - compaction_readahead_size(0) {} + compaction_readahead_size(0), + max_background_flushes(-1) {} 
MutableDBOptions::MutableDBOptions(const DBOptions& options) : max_background_jobs(options.max_background_jobs), base_background_compactions(options.base_background_compactions), max_background_compactions(options.max_background_compactions), + max_subcompactions(options.max_subcompactions), avoid_flush_during_shutdown(options.avoid_flush_during_shutdown), writable_file_max_buffer_size(options.writable_file_max_buffer_size), delayed_write_rate(options.delayed_write_rate), @@ -287,13 +991,16 @@ bytes_per_sync(options.bytes_per_sync), wal_bytes_per_sync(options.wal_bytes_per_sync), strict_bytes_per_sync(options.strict_bytes_per_sync), - compaction_readahead_size(options.compaction_readahead_size) {} + compaction_readahead_size(options.compaction_readahead_size), + max_background_flushes(options.max_background_flushes) {} void MutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.max_background_jobs: %d", max_background_jobs); ROCKS_LOG_HEADER(log, " Options.max_background_compactions: %d", max_background_compactions); + ROCKS_LOG_HEADER(log, " Options.max_subcompactions: %" PRIu32, + max_subcompactions); ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d", avoid_flush_during_shutdown); ROCKS_LOG_HEADER( @@ -328,6 +1035,40 @@ ROCKS_LOG_HEADER(log, " Options.compaction_readahead_size: %" ROCKSDB_PRIszt, compaction_readahead_size); + ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d", + max_background_flushes); +} + +#ifndef ROCKSDB_LITE +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, db_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; } +bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options, + 
const MutableDBOptions& that_options) { + ConfigOptions config_options; + std::string mismatch; + return OptionTypeInfo::StructsAreEqual( + config_options, "MutableDBOptions", &db_mutable_options_type_info, + "MutableDBOptions", &this_options, &that_options, &mismatch); +} + +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string) { + return OptionTypeInfo::SerializeType( + config_options, db_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/db_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/db_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -11,8 +11,10 @@ #include "rocksdb/options.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; struct ImmutableDBOptions { + static const char* kName() { return "ImmutableDBOptions"; } ImmutableDBOptions(); explicit ImmutableDBOptions(const DBOptions& options); @@ -22,8 +24,9 @@ bool create_missing_column_families; bool error_if_exists; bool paranoid_checks; + bool flush_verify_memtable_count; + bool track_and_verify_wals_in_manifest; Env* env; - std::shared_ptr fs; std::shared_ptr rate_limiter; std::shared_ptr sst_file_manager; std::shared_ptr info_log; @@ -33,17 +36,18 @@ bool use_fsync; std::vector db_paths; std::string db_log_dir; + // The wal_dir option from the file. To determine the + // directory in use, the GetWalDir or IsWalDirSameAsDBPath + // methods should be used instead of accessing this variable directly. 
std::string wal_dir; - uint32_t max_subcompactions; - int max_background_flushes; size_t max_log_file_size; size_t log_file_time_to_roll; size_t keep_log_file_num; size_t recycle_log_file_num; uint64_t max_manifest_file_size; int table_cache_numshardbits; - uint64_t wal_ttl_seconds; - uint64_t wal_size_limit_mb; + uint64_t WAL_ttl_seconds; + uint64_t WAL_size_limit_MB; uint64_t max_write_batch_group_size_bytes; size_t manifest_preallocation_size; bool allow_mmap_reads; @@ -53,6 +57,7 @@ bool allow_fallocate; bool is_fd_close_on_exec; bool advise_random_on_open; + double experimental_mempurge_threshold; size_t db_write_buffer_size; std::shared_ptr write_buffer_manager; DBOptions::AccessHint access_hint_on_compaction_start; @@ -87,12 +92,30 @@ bool persist_stats_to_disk; bool write_dbid_to_manifest; size_t log_readahead_size; - std::shared_ptr sst_file_checksum_func; + std::shared_ptr file_checksum_gen_factory; + bool best_efforts_recovery; + int max_bgerror_resume_count; + uint64_t bgerror_resume_retry_interval; + bool allow_data_in_errors; + std::string db_host_id; + FileTypeSet checksum_handoff_file_types; + CacheTier lowest_used_cache_tier; + // Convenience/Helper objects that are not part of the base DBOptions + std::shared_ptr fs; + SystemClock* clock; + Statistics* stats; + Logger* logger; + std::shared_ptr compaction_service; + + bool IsWalDirSameAsDBPath() const; + bool IsWalDirSameAsDBPath(const std::string& path) const; + const std::string& GetWalDir() const; + const std::string& GetWalDir(const std::string& path) const; }; struct MutableDBOptions { + static const char* kName() { return "MutableDBOptions"; } MutableDBOptions(); - explicit MutableDBOptions(const MutableDBOptions& options) = default; explicit MutableDBOptions(const DBOptions& options); void Dump(Logger* log) const; @@ -100,6 +123,7 @@ int max_background_jobs; int base_background_compactions; int max_background_compactions; + uint32_t max_subcompactions; bool avoid_flush_during_shutdown; 
size_t writable_file_max_buffer_size; uint64_t delayed_write_rate; @@ -113,6 +137,21 @@ uint64_t wal_bytes_per_sync; bool strict_bytes_per_sync; size_t compaction_readahead_size; + int max_background_flushes; }; +#ifndef ROCKSDB_LITE +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options); + +bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options, + const MutableDBOptions& that_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options.cc 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include #include +#include "logging/logging.h" #include "monitoring/statistics.h" #include "options/db_options.h" #include "options/options_helper.h" @@ -19,11 +20,13 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/sst_partitioner.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/wal_filter.h" @@ -87,7 +90,17 @@ report_bg_io_stats(options.report_bg_io_stats), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), - sample_for_compression(options.sample_for_compression) { + sample_for_compression(options.sample_for_compression), + enable_blob_files(options.enable_blob_files), + 
min_blob_size(options.min_blob_size), + blob_file_size(options.blob_file_size), + blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold), + blob_compaction_readahead_size(options.blob_compaction_readahead_size) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -122,12 +135,15 @@ ROCKS_LOG_HEADER( log, " Options.compaction_filter_factory: %s", compaction_filter_factory ? compaction_filter_factory->Name() : "None"); + ROCKS_LOG_HEADER( + log, " Options.sst_partitioner_factory: %s", + sst_partitioner_factory ? sst_partitioner_factory->Name() : "None"); ROCKS_LOG_HEADER(log, " Options.memtable_factory: %s", memtable_factory->Name()); ROCKS_LOG_HEADER(log, " Options.table_factory: %s", table_factory->Name()); ROCKS_LOG_HEADER(log, " table_factory options: %s", - table_factory->GetPrintableTableOptions().c_str()); + table_factory->GetPrintableOptions().c_str()); ROCKS_LOG_HEADER(log, " Options.write_buffer_size: %" ROCKSDB_PRIszt, write_buffer_size); ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number: %d", @@ -183,8 +199,18 @@ "%" PRIu32, bottommost_compression_opts.zstd_max_train_bytes); ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.parallel_threads: " + "%" PRIu32, + bottommost_compression_opts.parallel_threads); + ROCKS_LOG_HEADER( log, " Options.bottommost_compression_opts.enabled: %s", bottommost_compression_opts.enabled ? 
"true" : "false"); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + bottommost_compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.compression_opts.window_bits: %d", compression_opts.window_bits); ROCKS_LOG_HEADER(log, " Options.compression_opts.level: %d", @@ -200,8 +226,16 @@ "%" PRIu32, compression_opts.zstd_max_train_bytes); ROCKS_LOG_HEADER(log, + " Options.compression_opts.parallel_threads: " + "%" PRIu32, + compression_opts.parallel_threads); + ROCKS_LOG_HEADER(log, " Options.compression_opts.enabled: %s", compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.level0_file_num_compaction_trigger: %d", level0_file_num_compaction_trigger); ROCKS_LOG_HEADER(log, " Options.level0_slowdown_writes_trigger: %d", @@ -310,14 +344,13 @@ ROCKS_LOG_HEADER(log, "Options.compaction_options_fifo.allow_compaction: %d", compaction_options_fifo.allow_compaction); - std::string collector_names; + std::ostringstream collector_info; for (const auto& collector_factory : table_properties_collector_factories) { - collector_names.append(collector_factory->Name()); - collector_names.append("; "); + collector_info << collector_factory->ToString() << ';'; } ROCKS_LOG_HEADER( log, " Options.table_properties_collectors: %s", - collector_names.c_str()); + collector_info.str().c_str()); ROCKS_LOG_HEADER(log, " Options.inplace_update_support: %d", inplace_update_support); @@ -357,6 +390,25 @@ ROCKS_LOG_HEADER(log, " Options.periodic_compaction_seconds: %" PRIu64, periodic_compaction_seconds); + ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s", + enable_blob_files ? 
"true" : "false"); + ROCKS_LOG_HEADER( + log, " Options.min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_HEADER( + log, " Options.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", + CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? "true" : "false"); + ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); + ROCKS_LOG_HEADER(log, "Options.blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); + ROCKS_LOG_HEADER( + log, " Options.blob_compaction_readahead_size: %" PRIu64, + blob_compaction_readahead_size); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { @@ -422,6 +474,19 @@ return this; } +Options* Options::DisableExtraChecks() { + // See https://github.com/facebook/rocksdb/issues/9354 + force_consistency_checks = false; + // Considered but no clear performance impact seen: + // * check_flush_compaction_key_order + // * paranoid_checks + // * flush_verify_memtable_count + // By current API contract, not including + // * verify_checksums + // because checking storage data integrity is a more standard practice. 
+ return this; +} + Options* Options::OldDefaults(int rocksdb_major_version, int rocksdb_minor_version) { ColumnFamilyOptions::OldDefaults(rocksdb_major_version, @@ -598,7 +663,12 @@ background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), iter_start_seqnum(0), - timestamp(nullptr) {} + timestamp(nullptr), + iter_start_ts(nullptr), + deadline(std::chrono::microseconds::zero()), + io_timeout(std::chrono::microseconds::zero()), + value_size_soft_limit(std::numeric_limits::max()), + adaptive_readahead(false) {} ReadOptions::ReadOptions(bool cksum, bool cache) : snapshot(nullptr), @@ -618,6 +688,11 @@ background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), iter_start_seqnum(0), - timestamp(nullptr) {} + timestamp(nullptr), + iter_start_ts(nullptr), + deadline(std::chrono::microseconds::zero()), + io_timeout(std::chrono::microseconds::zero()), + value_size_soft_limit(std::numeric_limits::max()), + adaptive_readahead(false) {} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,13 +7,17 @@ #include #include #include +#include #include #include +#include "options/cf_options.h" +#include "options/db_options.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/flush_block_policy.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" @@ -21,12 +25,37 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/utilities/object_registry.h" -#include "table/block_based/block_based_table_factory.h" -#include 
"table/plain/plain_table_factory.h" -#include "util/cast_util.h" +#include "rocksdb/utilities/options_type.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +ConfigOptions::ConfigOptions() +#ifndef ROCKSDB_LITE + : registry(ObjectRegistry::NewInstance()) +#endif +{ + env = Env::Default(); +} + +ConfigOptions::ConfigOptions(const DBOptions& db_opts) : env(db_opts.env) { +#ifndef ROCKSDB_LITE + registry = ObjectRegistry::NewInstance(); +#endif +} + +Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) { + Status s; +#ifndef ROCKSDB_LITE + auto db_cfg = DBOptionsAsConfigurable(db_opts); + auto cf_cfg = CFOptionsAsConfigurable(cf_opts); + s = db_cfg->ValidateOptions(db_opts, cf_opts); + if (s.ok()) s = cf_cfg->ValidateOptions(db_opts, cf_opts); +#else + s = cf_opts.table_factory->ValidateOptions(db_opts, cf_opts); +#endif + return s; +} DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, const MutableDBOptions& mutable_db_options) { @@ -37,8 +66,11 @@ immutable_db_options.create_missing_column_families; options.error_if_exists = immutable_db_options.error_if_exists; options.paranoid_checks = immutable_db_options.paranoid_checks; + options.flush_verify_memtable_count = + immutable_db_options.flush_verify_memtable_count; + options.track_and_verify_wals_in_manifest = + immutable_db_options.track_and_verify_wals_in_manifest; options.env = immutable_db_options.env; - options.file_system = immutable_db_options.fs; options.rate_limiter = immutable_db_options.rate_limiter; options.sst_file_manager = immutable_db_options.sst_file_manager; options.info_log = immutable_db_options.info_log; @@ -62,8 +94,8 @@ options.bytes_per_sync = mutable_db_options.bytes_per_sync; options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync; options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync; - options.max_subcompactions = immutable_db_options.max_subcompactions; - options.max_background_flushes = 
immutable_db_options.max_background_flushes; + options.max_subcompactions = mutable_db_options.max_subcompactions; + options.max_background_flushes = mutable_db_options.max_background_flushes; options.max_log_file_size = immutable_db_options.max_log_file_size; options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll; options.keep_log_file_num = immutable_db_options.keep_log_file_num; @@ -71,8 +103,8 @@ options.max_manifest_file_size = immutable_db_options.max_manifest_file_size; options.table_cache_numshardbits = immutable_db_options.table_cache_numshardbits; - options.WAL_ttl_seconds = immutable_db_options.wal_ttl_seconds; - options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb; + options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds; + options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB; options.manifest_preallocation_size = immutable_db_options.manifest_preallocation_size; options.allow_mmap_reads = immutable_db_options.allow_mmap_reads; @@ -144,7 +176,18 @@ options.avoid_unnecessary_blocking_io = immutable_db_options.avoid_unnecessary_blocking_io; options.log_readahead_size = immutable_db_options.log_readahead_size; - options.sst_file_checksum_func = immutable_db_options.sst_file_checksum_func; + options.file_checksum_gen_factory = + immutable_db_options.file_checksum_gen_factory; + options.best_efforts_recovery = immutable_db_options.best_efforts_recovery; + options.max_bgerror_resume_count = + immutable_db_options.max_bgerror_resume_count; + options.bgerror_resume_retry_interval = + immutable_db_options.bgerror_resume_retry_interval; + options.db_host_id = immutable_db_options.db_host_id; + options.allow_data_in_errors = immutable_db_options.allow_data_in_errors; + options.checksum_handoff_file_types = + immutable_db_options.checksum_handoff_file_types; + options.lowest_used_cache_tier = immutable_db_options.lowest_used_cache_tier; return options; } @@ -152,69 +195,119 @@ const ColumnFamilyOptions& 
options, const MutableCFOptions& mutable_cf_options) { ColumnFamilyOptions cf_opts(options); + UpdateColumnFamilyOptions(mutable_cf_options, &cf_opts); + // TODO(yhchiang): find some way to handle the following derived options + // * max_file_size + return cf_opts; +} +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts) { // Memtable related options - cf_opts.write_buffer_size = mutable_cf_options.write_buffer_size; - cf_opts.max_write_buffer_number = mutable_cf_options.max_write_buffer_number; - cf_opts.arena_block_size = mutable_cf_options.arena_block_size; - cf_opts.memtable_prefix_bloom_size_ratio = - mutable_cf_options.memtable_prefix_bloom_size_ratio; - cf_opts.memtable_whole_key_filtering = - mutable_cf_options.memtable_whole_key_filtering; - cf_opts.memtable_huge_page_size = mutable_cf_options.memtable_huge_page_size; - cf_opts.max_successive_merges = mutable_cf_options.max_successive_merges; - cf_opts.inplace_update_num_locks = - mutable_cf_options.inplace_update_num_locks; - cf_opts.prefix_extractor = mutable_cf_options.prefix_extractor; + cf_opts->write_buffer_size = moptions.write_buffer_size; + cf_opts->max_write_buffer_number = moptions.max_write_buffer_number; + cf_opts->arena_block_size = moptions.arena_block_size; + cf_opts->memtable_prefix_bloom_size_ratio = + moptions.memtable_prefix_bloom_size_ratio; + cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; + cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; + cf_opts->max_successive_merges = moptions.max_successive_merges; + cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks; + cf_opts->prefix_extractor = moptions.prefix_extractor; // Compaction related options - cf_opts.disable_auto_compactions = - mutable_cf_options.disable_auto_compactions; - cf_opts.soft_pending_compaction_bytes_limit = - mutable_cf_options.soft_pending_compaction_bytes_limit; - 
cf_opts.hard_pending_compaction_bytes_limit = - mutable_cf_options.hard_pending_compaction_bytes_limit; - cf_opts.level0_file_num_compaction_trigger = - mutable_cf_options.level0_file_num_compaction_trigger; - cf_opts.level0_slowdown_writes_trigger = - mutable_cf_options.level0_slowdown_writes_trigger; - cf_opts.level0_stop_writes_trigger = - mutable_cf_options.level0_stop_writes_trigger; - cf_opts.max_compaction_bytes = mutable_cf_options.max_compaction_bytes; - cf_opts.target_file_size_base = mutable_cf_options.target_file_size_base; - cf_opts.target_file_size_multiplier = - mutable_cf_options.target_file_size_multiplier; - cf_opts.max_bytes_for_level_base = - mutable_cf_options.max_bytes_for_level_base; - cf_opts.max_bytes_for_level_multiplier = - mutable_cf_options.max_bytes_for_level_multiplier; - cf_opts.ttl = mutable_cf_options.ttl; - cf_opts.periodic_compaction_seconds = - mutable_cf_options.periodic_compaction_seconds; - - cf_opts.max_bytes_for_level_multiplier_additional.clear(); - for (auto value : - mutable_cf_options.max_bytes_for_level_multiplier_additional) { - cf_opts.max_bytes_for_level_multiplier_additional.emplace_back(value); - } - - cf_opts.compaction_options_fifo = mutable_cf_options.compaction_options_fifo; - cf_opts.compaction_options_universal = - mutable_cf_options.compaction_options_universal; + cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; + cf_opts->soft_pending_compaction_bytes_limit = + moptions.soft_pending_compaction_bytes_limit; + cf_opts->hard_pending_compaction_bytes_limit = + moptions.hard_pending_compaction_bytes_limit; + cf_opts->level0_file_num_compaction_trigger = + moptions.level0_file_num_compaction_trigger; + cf_opts->level0_slowdown_writes_trigger = + moptions.level0_slowdown_writes_trigger; + cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; + cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; + cf_opts->target_file_size_base = moptions.target_file_size_base; 
+ cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; + cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; + cf_opts->max_bytes_for_level_multiplier = + moptions.max_bytes_for_level_multiplier; + cf_opts->ttl = moptions.ttl; + cf_opts->periodic_compaction_seconds = moptions.periodic_compaction_seconds; + + cf_opts->max_bytes_for_level_multiplier_additional.clear(); + for (auto value : moptions.max_bytes_for_level_multiplier_additional) { + cf_opts->max_bytes_for_level_multiplier_additional.emplace_back(value); + } + + cf_opts->compaction_options_fifo = moptions.compaction_options_fifo; + cf_opts->compaction_options_universal = moptions.compaction_options_universal; + + // Blob file related options + cf_opts->enable_blob_files = moptions.enable_blob_files; + cf_opts->min_blob_size = moptions.min_blob_size; + cf_opts->blob_file_size = moptions.blob_file_size; + cf_opts->blob_compression_type = moptions.blob_compression_type; + cf_opts->enable_blob_garbage_collection = + moptions.enable_blob_garbage_collection; + cf_opts->blob_garbage_collection_age_cutoff = + moptions.blob_garbage_collection_age_cutoff; + cf_opts->blob_garbage_collection_force_threshold = + moptions.blob_garbage_collection_force_threshold; + cf_opts->blob_compaction_readahead_size = + moptions.blob_compaction_readahead_size; // Misc options - cf_opts.max_sequential_skip_in_iterations = - mutable_cf_options.max_sequential_skip_in_iterations; - cf_opts.paranoid_file_checks = mutable_cf_options.paranoid_file_checks; - cf_opts.report_bg_io_stats = mutable_cf_options.report_bg_io_stats; - cf_opts.compression = mutable_cf_options.compression; - cf_opts.sample_for_compression = mutable_cf_options.sample_for_compression; + cf_opts->max_sequential_skip_in_iterations = + moptions.max_sequential_skip_in_iterations; + cf_opts->check_flush_compaction_key_order = + moptions.check_flush_compaction_key_order; + cf_opts->paranoid_file_checks = 
moptions.paranoid_file_checks; + cf_opts->report_bg_io_stats = moptions.report_bg_io_stats; + cf_opts->compression = moptions.compression; + cf_opts->compression_opts = moptions.compression_opts; + cf_opts->bottommost_compression = moptions.bottommost_compression; + cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts; + cf_opts->sample_for_compression = moptions.sample_for_compression; +} + +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts) { + cf_opts->compaction_style = ioptions.compaction_style; + cf_opts->compaction_pri = ioptions.compaction_pri; + cf_opts->comparator = ioptions.user_comparator; + cf_opts->merge_operator = ioptions.merge_operator; + cf_opts->compaction_filter = ioptions.compaction_filter; + cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory; + cf_opts->min_write_buffer_number_to_merge = + ioptions.min_write_buffer_number_to_merge; + cf_opts->max_write_buffer_number_to_maintain = + ioptions.max_write_buffer_number_to_maintain; + cf_opts->max_write_buffer_size_to_maintain = + ioptions.max_write_buffer_size_to_maintain; + cf_opts->inplace_update_support = ioptions.inplace_update_support; + cf_opts->inplace_callback = ioptions.inplace_callback; + cf_opts->memtable_factory = ioptions.memtable_factory; + cf_opts->table_factory = ioptions.table_factory; + cf_opts->table_properties_collector_factories = + ioptions.table_properties_collector_factories; + cf_opts->bloom_locality = ioptions.bloom_locality; + cf_opts->purge_redundant_kvs_while_flush = + ioptions.purge_redundant_kvs_while_flush; + cf_opts->compression_per_level = ioptions.compression_per_level; + cf_opts->level_compaction_dynamic_level_bytes = + ioptions.level_compaction_dynamic_level_bytes; + cf_opts->num_levels = ioptions.num_levels; + cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits; + cf_opts->force_consistency_checks = ioptions.force_consistency_checks; + 
cf_opts->memtable_insert_with_hint_prefix_extractor = + ioptions.memtable_insert_with_hint_prefix_extractor; + cf_opts->cf_paths = ioptions.cf_paths; + cf_opts->compaction_thread_limiter = ioptions.compaction_thread_limiter; + cf_opts->sst_partitioner_factory = ioptions.sst_partitioner_factory; - cf_opts.table_factory = options.table_factory; // TODO(yhchiang): find some way to handle the following derived options // * max_file_size - - return cf_opts; } std::map @@ -239,7 +332,8 @@ OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, {"kxxHash", kxxHash}, - {"kxxHash64", kxxHash64}}; + {"kxxHash64", kxxHash64}, + {"kXXH3", kXXH3}}; std::unordered_map OptionsHelper::compression_type_string_map = { @@ -253,604 +347,215 @@ {"kZSTD", kZSTD}, {"kZSTDNotFinalCompression", kZSTDNotFinalCompression}, {"kDisableCompressionOption", kDisableCompressionOption}}; -#ifndef ROCKSDB_LITE - -const std::string kNameComparator = "comparator"; -const std::string kNameEnv = "env"; -const std::string kNameMergeOperator = "merge_operator"; - -template -Status GetStringFromStruct( - std::string* opt_string, const T& options, - const std::unordered_map& type_info, - const std::string& delimiter); - -namespace { -template -bool ParseEnum(const std::unordered_map& type_map, - const std::string& type, T* value) { - auto iter = type_map.find(type); - if (iter != type_map.end()) { - *value = iter->second; - return true; - } - return false; -} -template -bool SerializeEnum(const std::unordered_map& type_map, - const T& type, std::string* value) { - for (const auto& pair : type_map) { - if (pair.second == type) { - *value = pair.first; - return true; - } - } - return false; -} - -bool SerializeVectorCompressionType(const std::vector& types, - std::string* value) { - std::stringstream ss; - bool result; - for (size_t i = 0; i < types.size(); ++i) { - if (i > 0) { - ss << ':'; - } - std::string string_type; - result = 
SerializeEnum(compression_type_string_map, - types[i], &string_type); - if (result == false) { - return result; - } - ss << string_type; - } - *value = ss.str(); - return true; -} - -bool ParseVectorCompressionType( - const std::string& value, - std::vector* compression_per_level) { - compression_per_level->clear(); - size_t start = 0; - while (start < value.size()) { - size_t end = value.find(':', start); - bool is_ok; - CompressionType type; - if (end == std::string::npos) { - is_ok = ParseEnum(compression_type_string_map, - value.substr(start), &type); - if (!is_ok) { - return false; - } - compression_per_level->emplace_back(type); - break; - } else { - is_ok = ParseEnum( - compression_type_string_map, value.substr(start, end - start), &type); - if (!is_ok) { - return false; - } - compression_per_level->emplace_back(type); - start = end + 1; +std::vector GetSupportedCompressions() { + // std::set internally to deduplicate potential name aliases + std::set supported_compressions; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && CompressionTypeSupported(t)) { + supported_compressions.insert(t); } } - return true; + return std::vector(supported_compressions.begin(), + supported_compressions.end()); } -// This is to handle backward compatibility, where compaction_options_fifo -// could be assigned a single scalar value, say, like "23", which would be -// assigned to max_table_files_size. -bool FIFOCompactionOptionsSpecialCase(const std::string& opt_str, - CompactionOptionsFIFO* options) { - if (opt_str.find("=") != std::string::npos) { - // New format. Go do your new parsing using ParseStructOptions. - return false; - } - - // Old format. Parse just a single uint64_t value. 
- options->max_table_files_size = ParseUint64(opt_str); - return true; -} - -template -bool SerializeStruct( - const T& options, std::string* value, - const std::unordered_map& type_info_map) { - std::string opt_str; - Status s = GetStringFromStruct(&opt_str, options, type_info_map, ";"); - if (!s.ok()) { - return false; - } - *value = "{" + opt_str + "}"; - return true; -} - -template -bool ParseSingleStructOption( - const std::string& opt_val_str, T* options, - const std::unordered_map& type_info_map) { - size_t end = opt_val_str.find('='); - std::string key = opt_val_str.substr(0, end); - std::string value = opt_val_str.substr(end + 1); - auto iter = type_info_map.find(key); - if (iter == type_info_map.end()) { - return false; - } - const auto& opt_info = iter->second; - if (opt_info.verification == OptionVerificationType::kDeprecated) { - // Should also skip deprecated sub-options such as - // fifo_compaction_options_type_info.ttl - return true; - } - return ParseOptionHelper( - reinterpret_cast(options) + opt_info.mutable_offset, opt_info.type, - value); -} - -template -bool ParseStructOptions( - const std::string& opt_str, T* options, - const std::unordered_map& type_info_map) { - assert(!opt_str.empty()); - - size_t start = 0; - if (opt_str[0] == '{') { - start++; - } - while ((start != std::string::npos) && (start < opt_str.size())) { - if (opt_str[start] == '}') { - break; - } - size_t end = opt_str.find(';', start); - size_t len = (end == std::string::npos) ? end : end - start; - if (!ParseSingleStructOption(opt_str.substr(start, len), options, - type_info_map)) { - return false; +std::vector GetSupportedDictCompressions() { + std::set dict_compression_types; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && DictCompressionTypeSupported(t)) { + dict_compression_types.insert(t); } - start = (end == std::string::npos) ? 
end : end + 1; } - return true; + return std::vector(dict_compression_types.begin(), + dict_compression_types.end()); } -} // anonymouse namespace -bool ParseSliceTransformHelper( - const std::string& kFixedPrefixName, const std::string& kCappedPrefixName, - const std::string& value, - std::shared_ptr* slice_transform) { - const char* no_op_name = "rocksdb.Noop"; - size_t no_op_length = strlen(no_op_name); - auto& pe_value = value; - if (pe_value.size() > kFixedPrefixName.size() && - pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == 0) { - int prefix_length = ParseInt(trim(value.substr(kFixedPrefixName.size()))); - slice_transform->reset(NewFixedPrefixTransform(prefix_length)); - } else if (pe_value.size() > kCappedPrefixName.size() && - pe_value.compare(0, kCappedPrefixName.size(), kCappedPrefixName) == - 0) { - int prefix_length = - ParseInt(trim(pe_value.substr(kCappedPrefixName.size()))); - slice_transform->reset(NewCappedPrefixTransform(prefix_length)); - } else if (pe_value.size() == no_op_length && - pe_value.compare(0, no_op_length, no_op_name) == 0) { - const SliceTransform* no_op_transform = NewNoopTransform(); - slice_transform->reset(no_op_transform); - } else if (value == kNullptrString) { - slice_transform->reset(); - } else { - return false; +std::vector GetSupportedChecksums() { + std::set checksum_types; + for (const auto& e : OptionsHelper::checksum_type_string_map) { + checksum_types.insert(e.second); } - - return true; + return std::vector(checksum_types.begin(), + checksum_types.end()); } -bool ParseSliceTransform( - const std::string& value, - std::shared_ptr* slice_transform) { - // While we normally don't convert the string representation of a - // pointer-typed option into its instance, here we do so for backward - // compatibility as we allow this action in SetOption(). 
- - // TODO(yhchiang): A possible better place for these serialization / - // deserialization is inside the class definition of pointer-typed - // option itself, but this requires a bigger change of public API. - bool result = - ParseSliceTransformHelper("fixed:", "capped:", value, slice_transform); - if (result) { - return result; - } - result = ParseSliceTransformHelper( - "rocksdb.FixedPrefix.", "rocksdb.CappedPrefix.", value, slice_transform); - if (result) { - return result; - } - // TODO(yhchiang): we can further support other default - // SliceTransforms here. - return false; -} - -bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, - const std::string& value) { +#ifndef ROCKSDB_LITE +static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type, + const std::string& value) { switch (opt_type) { case OptionType::kBoolean: - *reinterpret_cast(opt_address) = ParseBoolean("", value); + *static_cast(opt_address) = ParseBoolean("", value); break; case OptionType::kInt: - *reinterpret_cast(opt_address) = ParseInt(value); + *static_cast(opt_address) = ParseInt(value); break; case OptionType::kInt32T: - *reinterpret_cast(opt_address) = ParseInt32(value); + *static_cast(opt_address) = ParseInt32(value); break; case OptionType::kInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseInt64(value)); - break; - case OptionType::kVectorInt: - *reinterpret_cast*>(opt_address) = ParseVectorInt(value); + PutUnaligned(static_cast(opt_address), ParseInt64(value)); break; case OptionType::kUInt: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); + break; + case OptionType::kUInt8T: + *static_cast(opt_address) = ParseUint8(value); break; case OptionType::kUInt32T: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); break; case OptionType::kUInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseUint64(value)); + 
PutUnaligned(static_cast(opt_address), ParseUint64(value)); break; case OptionType::kSizeT: - PutUnaligned(reinterpret_cast(opt_address), ParseSizeT(value)); + PutUnaligned(static_cast(opt_address), ParseSizeT(value)); break; case OptionType::kString: - *reinterpret_cast(opt_address) = value; + *static_cast(opt_address) = value; break; case OptionType::kDouble: - *reinterpret_cast(opt_address) = ParseDouble(value); + *static_cast(opt_address) = ParseDouble(value); break; case OptionType::kCompactionStyle: return ParseEnum( compaction_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); case OptionType::kCompactionPri: - return ParseEnum( - compaction_pri_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(compaction_pri_string_map, value, + static_cast(opt_address)); case OptionType::kCompressionType: return ParseEnum( compression_type_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kVectorCompressionType: - return ParseVectorCompressionType( - value, reinterpret_cast*>(opt_address)); - case OptionType::kSliceTransform: - return ParseSliceTransform( - value, reinterpret_cast*>( - opt_address)); + static_cast(opt_address)); case OptionType::kChecksumType: - return ParseEnum( - checksum_type_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kBlockBasedTableIndexType: - return ParseEnum( - block_base_table_index_type_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kBlockBasedTableDataBlockIndexType: - return ParseEnum( - block_base_table_data_block_index_type_string_map, value, - reinterpret_cast( - opt_address)); - case OptionType::kBlockBasedTableIndexShorteningMode: - return ParseEnum( - block_base_table_index_shortening_mode_string_map, value, - reinterpret_cast( - opt_address)); + return ParseEnum(checksum_type_string_map, value, + static_cast(opt_address)); case OptionType::kEncodingType: - return ParseEnum( - encoding_type_string_map, 
value, - reinterpret_cast(opt_address)); - case OptionType::kWALRecoveryMode: - return ParseEnum( - wal_recovery_mode_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kAccessHint: - return ParseEnum( - access_hint_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kInfoLogLevel: - return ParseEnum( - info_log_level_string_map, value, - reinterpret_cast(opt_address)); - case OptionType::kCompactionOptionsFIFO: { - if (!FIFOCompactionOptionsSpecialCase( - value, reinterpret_cast(opt_address))) { - return ParseStructOptions( - value, reinterpret_cast(opt_address), - fifo_compaction_options_type_info); - } - return true; - } - case OptionType::kLRUCacheOptions: { - return ParseStructOptions(value, - reinterpret_cast(opt_address), - lru_cache_options_type_info); - } - case OptionType::kCompactionOptionsUniversal: - return ParseStructOptions( - value, reinterpret_cast(opt_address), - universal_compaction_options_type_info); + return ParseEnum(encoding_type_string_map, value, + static_cast(opt_address)); case OptionType::kCompactionStopStyle: return ParseEnum( compaction_stop_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); + case OptionType::kEncodedString: { + std::string* output_addr = static_cast(opt_address); + (Slice(value)).DecodeHex(output_addr); + break; + } default: return false; } return true; } -bool SerializeSingleOptionHelper(const char* opt_address, +bool SerializeSingleOptionHelper(const void* opt_address, const OptionType opt_type, std::string* value) { - assert(value); switch (opt_type) { case OptionType::kBoolean: - *value = *(reinterpret_cast(opt_address)) ? "true" : "false"; + *value = *(static_cast(opt_address)) ? 
"true" : "false"; break; case OptionType::kInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt64T: { int64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; - case OptionType::kVectorInt: - return SerializeIntVector( - *reinterpret_cast*>(opt_address), value); case OptionType::kUInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); + break; + case OptionType::kUInt8T: + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt64T: { uint64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kSizeT: { size_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kDouble: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kString: - *value = EscapeOptionString( - *(reinterpret_cast(opt_address))); + *value = + EscapeOptionString(*(static_cast(opt_address))); break; case OptionType::kCompactionStyle: return SerializeEnum( compaction_style_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompactionPri: return SerializeEnum( compaction_pri_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompressionType: return SerializeEnum( compression_type_string_map, - 
*(reinterpret_cast(opt_address)), value); - case OptionType::kVectorCompressionType: - return SerializeVectorCompressionType( - *(reinterpret_cast*>(opt_address)), - value); - break; - case OptionType::kSliceTransform: { - const auto* slice_transform_ptr = - reinterpret_cast*>( - opt_address); - *value = slice_transform_ptr->get() ? slice_transform_ptr->get()->Name() - : kNullptrString; - break; - } - case OptionType::kTableFactory: { - const auto* table_factory_ptr = - reinterpret_cast*>( - opt_address); - *value = table_factory_ptr->get() ? table_factory_ptr->get()->Name() - : kNullptrString; - break; - } - case OptionType::kComparator: { - // it's a const pointer of const Comparator* - const auto* ptr = reinterpret_cast(opt_address); - // Since the user-specified comparator will be wrapped by - // InternalKeyComparator, we should persist the user-specified one - // instead of InternalKeyComparator. - if (*ptr == nullptr) { - *value = kNullptrString; - } else { - const Comparator* root_comp = (*ptr)->GetRootComparator(); - if (root_comp == nullptr) { - root_comp = (*ptr); - } - *value = root_comp->Name(); - } + *(static_cast(opt_address)), value); break; - } - case OptionType::kCompactionFilter: { - // it's a const pointer of const CompactionFilter* - const auto* ptr = - reinterpret_cast(opt_address); - *value = *ptr ? (*ptr)->Name() : kNullptrString; - break; - } - case OptionType::kCompactionFilterFactory: { - const auto* ptr = - reinterpret_cast*>( - opt_address); - *value = ptr->get() ? ptr->get()->Name() : kNullptrString; - break; - } - case OptionType::kMemTableRepFactory: { - const auto* ptr = - reinterpret_cast*>( - opt_address); - *value = ptr->get() ? ptr->get()->Name() : kNullptrString; - break; - } - case OptionType::kMergeOperator: { - const auto* ptr = - reinterpret_cast*>(opt_address); - *value = ptr->get() ? 
ptr->get()->Name() : kNullptrString; - break; - } case OptionType::kFilterPolicy: { const auto* ptr = - reinterpret_cast*>(opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kChecksumType: return SerializeEnum( checksum_type_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kBlockBasedTableIndexType: - return SerializeEnum( - block_base_table_index_type_string_map, - *reinterpret_cast( - opt_address), - value); - case OptionType::kBlockBasedTableDataBlockIndexType: - return SerializeEnum( - block_base_table_data_block_index_type_string_map, - *reinterpret_cast( - opt_address), - value); - case OptionType::kBlockBasedTableIndexShorteningMode: - return SerializeEnum( - block_base_table_index_shortening_mode_string_map, - *reinterpret_cast( - opt_address), - value); - case OptionType::kFlushBlockPolicyFactory: { - const auto* ptr = - reinterpret_cast*>( - opt_address); - *value = ptr->get() ? 
ptr->get()->Name() : kNullptrString; - break; - } + *static_cast(opt_address), value); case OptionType::kEncodingType: return SerializeEnum( encoding_type_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kWALRecoveryMode: - return SerializeEnum( - wal_recovery_mode_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kAccessHint: - return SerializeEnum( - access_hint_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kInfoLogLevel: - return SerializeEnum( - info_log_level_string_map, - *reinterpret_cast(opt_address), value); - case OptionType::kCompactionOptionsFIFO: - return SerializeStruct( - *reinterpret_cast(opt_address), value, - fifo_compaction_options_type_info); - case OptionType::kCompactionOptionsUniversal: - return SerializeStruct( - *reinterpret_cast(opt_address), - value, universal_compaction_options_type_info); + *static_cast(opt_address), value); case OptionType::kCompactionStopStyle: return SerializeEnum( compaction_stop_style_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); + case OptionType::kEncodedString: { + const auto* ptr = static_cast(opt_address); + *value = (Slice(*ptr)).ToString(true); + break; + } default: return false; } return true; } -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* info_log, MutableCFOptions* new_options) { - assert(new_options); - *new_options = base_options; - for (const auto& o : options_map) { - try { - auto iter = cf_options_type_info.find(o.first); - if (iter == cf_options_type_info.end()) { - return Status::InvalidArgument("Unrecognized option: " + o.first); - } - const auto& opt_info = iter->second; - if (!opt_info.is_mutable) { - return Status::InvalidArgument("Option not changeable: " + o.first); - } - if (opt_info.verification == OptionVerificationType::kDeprecated) { - // log warning when user tries to set a deprecated 
option but don't fail - // the call for compatibility. - ROCKS_LOG_WARN(info_log, "%s is a deprecated option and cannot be set", - o.first.c_str()); - continue; - } - bool is_ok = ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.mutable_offset, - opt_info.type, o.second); - if (!is_ok) { - return Status::InvalidArgument("Error parsing " + o.first); - } - } catch (std::exception& e) { - return Status::InvalidArgument("Error parsing " + o.first + ":" + - std::string(e.what())); - } +template +Status ConfigureFromMap( + const ConfigOptions& config_options, + const std::unordered_map& opt_map, + const std::string& option_name, Configurable* config, T* new_opts) { + Status s = config->ConfigureFromMap(config_options, opt_map); + if (s.ok()) { + *new_opts = *(config->GetOptions(option_name)); } - return Status::OK(); + return s; } -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options) { - assert(new_options); - *new_options = base_options; - for (const auto& o : options_map) { - try { - auto iter = db_options_type_info.find(o.first); - if (iter == db_options_type_info.end()) { - return Status::InvalidArgument("Unrecognized option: " + o.first); - } - const auto& opt_info = iter->second; - if (!opt_info.is_mutable) { - return Status::InvalidArgument("Option not changeable: " + o.first); - } - bool is_ok = ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.mutable_offset, - opt_info.type, o.second); - if (!is_ok) { - return Status::InvalidArgument("Error parsing " + o.first); - } - } catch (std::exception& e) { - return Status::InvalidArgument("Error parsing " + o.first + ":" + - std::string(e.what())); - } - } - return Status::OK(); -} Status StringToMap(const std::string& opts_str, std::unordered_map* opts_map) { @@ -860,306 +565,74 @@ // "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100" size_t pos = 0; std::string opts = trim(opts_str); + // 
If the input string starts and ends with "{...}", strip off the brackets + while (opts.size() > 2 && opts[0] == '{' && opts[opts.size() - 1] == '}') { + opts = trim(opts.substr(1, opts.size() - 2)); + } + while (pos < opts.size()) { - size_t eq_pos = opts.find('=', pos); + size_t eq_pos = opts.find_first_of("={};", pos); if (eq_pos == std::string::npos) { return Status::InvalidArgument("Mismatched key value pair, '=' expected"); + } else if (opts[eq_pos] != '=') { + return Status::InvalidArgument("Unexpected char in key"); } + std::string key = trim(opts.substr(pos, eq_pos - pos)); if (key.empty()) { return Status::InvalidArgument("Empty key found"); } - // skip space after '=' and look for '{' for possible nested options - pos = eq_pos + 1; - while (pos < opts.size() && isspace(opts[pos])) { - ++pos; - } - // Empty value at the end - if (pos >= opts.size()) { - (*opts_map)[key] = ""; - break; - } - if (opts[pos] == '{') { - int count = 1; - size_t brace_pos = pos + 1; - while (brace_pos < opts.size()) { - if (opts[brace_pos] == '{') { - ++count; - } else if (opts[brace_pos] == '}') { - --count; - if (count == 0) { - break; - } - } - ++brace_pos; - } - // found the matching closing brace - if (count == 0) { - (*opts_map)[key] = trim(opts.substr(pos + 1, brace_pos - pos - 1)); - // skip all whitespace and move to the next ';' - // brace_pos points to the next position after the matching '}' - pos = brace_pos + 1; - while (pos < opts.size() && isspace(opts[pos])) { - ++pos; - } - if (pos < opts.size() && opts[pos] != ';') { - return Status::InvalidArgument( - "Unexpected chars after nested options"); - } - ++pos; - } else { - return Status::InvalidArgument( - "Mismatched curly braces for nested options"); - } + std::string value; + Status s = OptionTypeInfo::NextToken(opts, ';', eq_pos + 1, &pos, &value); + if (!s.ok()) { + return s; } else { - size_t sc_pos = opts.find(';', pos); - if (sc_pos == std::string::npos) { - (*opts_map)[key] = trim(opts.substr(pos)); - // 
It either ends with a trailing semi-colon or the last key-value pair + (*opts_map)[key] = value; + if (pos == std::string::npos) { break; } else { - (*opts_map)[key] = trim(opts.substr(pos, sc_pos - pos)); + pos++; } - pos = sc_pos + 1; } } return Status::OK(); } -Status ParseCompressionOptions(const std::string& value, const std::string& name, - CompressionOptions& compression_opts) { - size_t start = 0; - size_t end = value.find(':'); - if (end == std::string::npos) { - return Status::InvalidArgument("unable to parse the specified CF option " + - name); - } - compression_opts.window_bits = ParseInt(value.substr(start, end - start)); - start = end + 1; - end = value.find(':', start); - if (end == std::string::npos) { - return Status::InvalidArgument("unable to parse the specified CF option " + - name); - } - compression_opts.level = ParseInt(value.substr(start, end - start)); - start = end + 1; - if (start >= value.size()) { - return Status::InvalidArgument("unable to parse the specified CF option " + - name); - } - end = value.find(':', start); - compression_opts.strategy = - ParseInt(value.substr(start, value.size() - start)); - // max_dict_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - compression_opts.max_dict_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); - } - // zstd_max_train_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - compression_opts.zstd_max_train_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); - } - // enabled is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if 
(start >= value.size()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - compression_opts.enabled = - ParseBoolean("", value.substr(start, value.size() - start)); - } - return Status::OK(); -} - -Status ParseColumnFamilyOption(const std::string& name, - const std::string& org_value, - ColumnFamilyOptions* new_options, - bool input_strings_escaped = false) { - const std::string& value = - input_strings_escaped ? UnescapeOptionString(org_value) : org_value; - try { - if (name == "block_based_table_factory") { - // Nested options - BlockBasedTableOptions table_opt, base_table_options; - BlockBasedTableFactory* block_based_table_factory = - static_cast_with_check( - new_options->table_factory.get()); - if (block_based_table_factory != nullptr) { - base_table_options = block_based_table_factory->table_options(); - } - Status table_opt_s = GetBlockBasedTableOptionsFromString( - base_table_options, value, &table_opt); - if (!table_opt_s.ok()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - new_options->table_factory.reset(NewBlockBasedTableFactory(table_opt)); - } else if (name == "plain_table_factory") { - // Nested options - PlainTableOptions table_opt, base_table_options; - PlainTableFactory* plain_table_factory = - static_cast_with_check( - new_options->table_factory.get()); - if (plain_table_factory != nullptr) { - base_table_options = plain_table_factory->table_options(); - } - Status table_opt_s = GetPlainTableOptionsFromString( - base_table_options, value, &table_opt); - if (!table_opt_s.ok()) { - return Status::InvalidArgument( - "unable to parse the specified CF option " + name); - } - new_options->table_factory.reset(NewPlainTableFactory(table_opt)); - } else if (name == "memtable") { - std::unique_ptr new_mem_factory; - Status mem_factory_s = - GetMemTableRepFactoryFromString(value, &new_mem_factory); - if (!mem_factory_s.ok()) { - return Status::InvalidArgument( - 
"unable to parse the specified CF option " + name); - } - new_options->memtable_factory.reset(new_mem_factory.release()); - } else if (name == "bottommost_compression_opts") { - Status s = ParseCompressionOptions( - value, name, new_options->bottommost_compression_opts); - if (!s.ok()) { - return s; - } - } else if (name == "compression_opts") { - Status s = - ParseCompressionOptions(value, name, new_options->compression_opts); - if (!s.ok()) { - return s; - } - } else { - if (name == kNameComparator) { - // Try to get comparator from object registry first. - // Only support static comparator for now. - Status status = ObjectRegistry::NewInstance()->NewStaticObject( - value, &new_options->comparator); - if (status.ok()) { - return status; - } - } else if (name == kNameMergeOperator) { - // Try to get merge operator from object registry first. - std::shared_ptr mo; - Status status = - ObjectRegistry::NewInstance()->NewSharedObject( - value, &new_options->merge_operator); - // Only support static comparator for now. 
- if (status.ok()) { - return status; - } - } - - auto iter = cf_options_type_info.find(name); - if (iter == cf_options_type_info.end()) { - return Status::InvalidArgument( - "Unable to parse the specified CF option " + name); - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return Status::OK(); - } - switch (opt_info.verification) { - case OptionVerificationType::kByName: - case OptionVerificationType::kByNameAllowNull: - case OptionVerificationType::kByNameAllowFromNull: - return Status::NotSupported( - "Deserializing the specified CF option " + name + - " is not supported"); - case OptionVerificationType::kDeprecated: - return Status::OK(); - default: - return Status::InvalidArgument( - "Unable to parse the specified CF option " + name); - } - } - } catch (const std::exception&) { - return Status::InvalidArgument( - "unable to parse the specified option " + name); - } - return Status::OK(); -} -template -bool SerializeSingleStructOption( - std::string* opt_string, const T& options, - const std::unordered_map& type_info, - const std::string& name, const std::string& delimiter) { - auto iter = type_info.find(name); - if (iter == type_info.end()) { - return false; - } - auto& opt_info = iter->second; - const char* opt_address = - reinterpret_cast(&options) + opt_info.offset; - std::string value; - bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); - if (result) { - *opt_string = name + "=" + value + delimiter; - } - return result; +Status GetStringFromDBOptions(std::string* opt_string, + const DBOptions& db_options, + const std::string& delimiter) { + ConfigOptions config_options(db_options); + config_options.delimiter = delimiter; + return GetStringFromDBOptions(config_options, db_options, opt_string); } -template -Status GetStringFromStruct( - std::string* opt_string, const 
T& options, - const std::unordered_map& type_info, - const std::string& delimiter) { +Status GetStringFromDBOptions(const ConfigOptions& config_options, + const DBOptions& db_options, + std::string* opt_string) { assert(opt_string); opt_string->clear(); - for (auto iter = type_info.begin(); iter != type_info.end(); ++iter) { - if (iter->second.verification == OptionVerificationType::kDeprecated) { - // If the option is no longer used in rocksdb and marked as deprecated, - // we skip it in the serialization. - continue; - } - std::string single_output; - bool result = SerializeSingleStructOption( - &single_output, options, type_info, iter->first, delimiter); - if (result) { - opt_string->append(single_output); - } else { - return Status::InvalidArgument("failed to serialize %s\n", - iter->first.c_str()); - } - assert(result); - } - return Status::OK(); + auto config = DBOptionsAsConfigurable(db_options); + return config->GetOptionString(config_options, opt_string); } -Status GetStringFromDBOptions(std::string* opt_string, - const DBOptions& db_options, - const std::string& delimiter) { - return GetStringFromStruct(opt_string, db_options, - db_options_type_info, delimiter); -} Status GetStringFromColumnFamilyOptions(std::string* opt_string, const ColumnFamilyOptions& cf_options, const std::string& delimiter) { - return GetStringFromStruct( - opt_string, cf_options, cf_options_type_info, delimiter); + ConfigOptions config_options; + config_options.delimiter = delimiter; + return GetStringFromColumnFamilyOptions(config_options, cf_options, + opt_string); +} + +Status GetStringFromColumnFamilyOptions(const ConfigOptions& config_options, + const ColumnFamilyOptions& cf_options, + std::string* opt_string) { + const auto config = CFOptionsAsConfigurable(cf_options); + return config->GetOptionString(config_options, opt_string); } Status GetStringFromCompressionType(std::string* compression_str, @@ -1173,124 +646,62 @@ } } -std::vector GetSupportedCompressions() { - 
std::vector supported_compressions; - for (const auto& comp_to_name : compression_type_string_map) { - CompressionType t = comp_to_name.second; - if (t != kDisableCompressionOption && CompressionTypeSupported(t)) { - supported_compressions.push_back(t); - } - } - return supported_compressions; -} - -Status ParseDBOption(const std::string& name, - const std::string& org_value, - DBOptions* new_options, - bool input_strings_escaped = false) { - const std::string& value = - input_strings_escaped ? UnescapeOptionString(org_value) : org_value; - try { - if (name == "rate_limiter_bytes_per_sec") { - new_options->rate_limiter.reset( - NewGenericRateLimiter(static_cast(ParseUint64(value)))); - } else if (name == kNameEnv) { - // Currently `Env` can be deserialized from object registry only. - Env* env = new_options->env; - Status status = Env::LoadEnv(value, &env); - // Only support static env for now. - if (status.ok()) { - new_options->env = env; - } - } else { - auto iter = db_options_type_info.find(name); - if (iter == db_options_type_info.end()) { - return Status::InvalidArgument("Unrecognized option DBOptions:", name); - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - ParseOptionHelper( - reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return Status::OK(); - } - switch (opt_info.verification) { - case OptionVerificationType::kByName: - case OptionVerificationType::kByNameAllowNull: - return Status::NotSupported( - "Deserializing the specified DB option " + name + - " is not supported"); - case OptionVerificationType::kDeprecated: - return Status::OK(); - default: - return Status::InvalidArgument( - "Unable to parse the specified DB option " + name); - } - } - } catch (const std::exception&) { - return Status::InvalidArgument("Unable to parse DBOptions:", name); - } - return Status::OK(); -} - Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, 
const std::unordered_map& opts_map, ColumnFamilyOptions* new_options, bool input_strings_escaped, bool ignore_unknown_options) { - return GetColumnFamilyOptionsFromMapInternal( - base_options, opts_map, new_options, input_strings_escaped, nullptr, - ignore_unknown_options); + ConfigOptions config_options; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.input_strings_escaped = input_strings_escaped; + return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map, + new_options); } -Status GetColumnFamilyOptionsFromMapInternal( +Status GetColumnFamilyOptionsFromMap( + const ConfigOptions& config_options, const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, - ColumnFamilyOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names, - bool ignore_unknown_options) { + ColumnFamilyOptions* new_options) { assert(new_options); + *new_options = base_options; - if (unsupported_options_names) { - unsupported_options_names->clear(); - } - for (const auto& o : opts_map) { - auto s = ParseColumnFamilyOption(o.first, o.second, new_options, - input_strings_escaped); - if (!s.ok()) { - if (s.IsNotSupported()) { - // If the deserialization of the specified option is not supported - // and an output vector of unsupported_options is provided, then - // we log the name of the unsupported option and proceed. - if (unsupported_options_names != nullptr) { - unsupported_options_names->push_back(o.first); - } - // Note that we still return Status::OK in such case to maintain - // the backward compatibility in the old public API defined in - // rocksdb/convenience.h - } else if (s.IsInvalidArgument() && ignore_unknown_options) { - continue; - } else { - // Restore "new_options" to the default "base_options". 
- *new_options = base_options; - return s; - } - } + + const auto config = CFOptionsAsConfigurable(base_options); + Status s = ConfigureFromMap( + config_options, opts_map, OptionsHelper::kCFOptionsName, config.get(), + new_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); } - return Status::OK(); } Status GetColumnFamilyOptionsFromString( const ColumnFamilyOptions& base_options, const std::string& opts_str, ColumnFamilyOptions* new_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + return GetColumnFamilyOptionsFromString(config_options, base_options, + opts_str, new_options); +} + +Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options) { std::unordered_map opts_map; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { *new_options = base_options; return s; } - return GetColumnFamilyOptionsFromMap(base_options, opts_map, new_options); + return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map, + new_options); } Status GetDBOptionsFromMap( @@ -1298,417 +709,103 @@ const std::unordered_map& opts_map, DBOptions* new_options, bool input_strings_escaped, bool ignore_unknown_options) { - return GetDBOptionsFromMapInternal(base_options, opts_map, new_options, - input_strings_escaped, nullptr, - ignore_unknown_options); + ConfigOptions config_options(base_options); + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + return GetDBOptionsFromMap(config_options, base_options, opts_map, + new_options); } -Status GetDBOptionsFromMapInternal( - const DBOptions& base_options, +Status GetDBOptionsFromMap( + const 
ConfigOptions& config_options, const DBOptions& base_options, const std::unordered_map& opts_map, - DBOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names, - bool ignore_unknown_options) { + DBOptions* new_options) { assert(new_options); *new_options = base_options; - if (unsupported_options_names) { - unsupported_options_names->clear(); - } - for (const auto& o : opts_map) { - auto s = ParseDBOption(o.first, o.second, - new_options, input_strings_escaped); - if (!s.ok()) { - if (s.IsNotSupported()) { - // If the deserialization of the specified option is not supported - // and an output vector of unsupported_options is provided, then - // we log the name of the unsupported option and proceed. - if (unsupported_options_names != nullptr) { - unsupported_options_names->push_back(o.first); - } - // Note that we still return Status::OK in such case to maintain - // the backward compatibility in the old public API defined in - // rocksdb/convenience.h - } else if (s.IsInvalidArgument() && ignore_unknown_options) { - continue; - } else { - // Restore "new_options" to the default "base_options". 
- *new_options = base_options; - return s; - } - } + auto config = DBOptionsAsConfigurable(base_options); + Status s = ConfigureFromMap(config_options, opts_map, + OptionsHelper::kDBOptionsName, + config.get(), new_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); } - return Status::OK(); } -Status GetDBOptionsFromString( - const DBOptions& base_options, - const std::string& opts_str, - DBOptions* new_options) { +Status GetDBOptionsFromString(const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options) { + ConfigOptions config_options(base_options); + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + + return GetDBOptionsFromString(config_options, base_options, opts_str, + new_options); +} + +Status GetDBOptionsFromString(const ConfigOptions& config_options, + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options) { std::unordered_map opts_map; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { *new_options = base_options; return s; } - return GetDBOptionsFromMap(base_options, opts_map, new_options); + return GetDBOptionsFromMap(config_options, base_options, opts_map, + new_options); } Status GetOptionsFromString(const Options& base_options, const std::string& opts_str, Options* new_options) { + ConfigOptions config_options(base_options); + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + + return GetOptionsFromString(config_options, base_options, opts_str, + new_options); +} + +Status GetOptionsFromString(const ConfigOptions& config_options, + const Options& base_options, + const std::string& opts_str, Options* new_options) { + ColumnFamilyOptions new_cf_options; + std::unordered_map unused_opts; std::unordered_map opts_map; + + assert(new_options); + *new_options 
= base_options; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { return s; } - DBOptions new_db_options(base_options); - ColumnFamilyOptions new_cf_options(base_options); - for (const auto& o : opts_map) { - if (ParseDBOption(o.first, o.second, &new_db_options).ok()) { - } else if (ParseColumnFamilyOption( - o.first, o.second, &new_cf_options).ok()) { + auto config = DBOptionsAsConfigurable(base_options); + s = config->ConfigureFromMap(config_options, opts_map, &unused_opts); + + if (s.ok()) { + DBOptions* new_db_options = + config->GetOptions(OptionsHelper::kDBOptionsName); + if (!unused_opts.empty()) { + s = GetColumnFamilyOptionsFromMap(config_options, base_options, + unused_opts, &new_cf_options); + if (s.ok()) { + *new_options = Options(*new_db_options, new_cf_options); + } } else { - return Status::InvalidArgument("Can't parse option " + o.first); + *new_options = Options(*new_db_options, base_options); } } - *new_options = Options(new_db_options, new_cf_options); - return Status::OK(); -} - -Status GetTableFactoryFromMap( - const std::string& factory_name, - const std::unordered_map& opt_map, - std::shared_ptr* table_factory, bool ignore_unknown_options) { - Status s; - if (factory_name == BlockBasedTableFactory().Name()) { - BlockBasedTableOptions bbt_opt; - s = GetBlockBasedTableOptionsFromMap(BlockBasedTableOptions(), opt_map, - &bbt_opt, - true, /* input_strings_escaped */ - ignore_unknown_options); - if (!s.ok()) { - return s; - } - table_factory->reset(new BlockBasedTableFactory(bbt_opt)); - return Status::OK(); - } else if (factory_name == PlainTableFactory().Name()) { - PlainTableOptions pt_opt; - s = GetPlainTableOptionsFromMap(PlainTableOptions(), opt_map, &pt_opt, - true, /* input_strings_escaped */ - ignore_unknown_options); - if (!s.ok()) { - return s; - } - table_factory->reset(new PlainTableFactory(pt_opt)); - return Status::OK(); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || 
s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); } - // Return OK for not supported table factories as TableFactory - // Deserialization is optional. - table_factory->reset(); - return Status::OK(); } -std::unordered_map - OptionsHelper::db_options_type_info = { - /* - // not yet supported - std::shared_ptr row_cache; - std::shared_ptr delete_scheduler; - std::shared_ptr info_log; - std::shared_ptr rate_limiter; - std::shared_ptr statistics; - std::vector db_paths; - std::vector> listeners; - */ - {"advise_random_on_open", - {offsetof(struct DBOptions, advise_random_on_open), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"allow_mmap_reads", - {offsetof(struct DBOptions, allow_mmap_reads), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_fallocate", - {offsetof(struct DBOptions, allow_fallocate), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_mmap_writes", - {offsetof(struct DBOptions, allow_mmap_writes), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_direct_reads", - {offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_direct_writes", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"use_direct_io_for_flush_and_compaction", - {offsetof(struct DBOptions, use_direct_io_for_flush_and_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"allow_2pc", - {offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_os_buffer", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, - 0}}, - {"create_if_missing", - {offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"create_missing_column_families", - {offsetof(struct DBOptions, 
create_missing_column_families), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"disableDataSync", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"disable_data_sync", // for compatibility - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"enable_thread_tracking", - {offsetof(struct DBOptions, enable_thread_tracking), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"error_if_exists", - {offsetof(struct DBOptions, error_if_exists), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"is_fd_close_on_exec", - {offsetof(struct DBOptions, is_fd_close_on_exec), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"paranoid_checks", - {offsetof(struct DBOptions, paranoid_checks), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"skip_log_error_on_recovery", - {offsetof(struct DBOptions, skip_log_error_on_recovery), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_stats_update_on_db_open", - {offsetof(struct DBOptions, skip_stats_update_on_db_open), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_checking_sst_file_sizes_on_db_open", - {offsetof(struct DBOptions, skip_checking_sst_file_sizes_on_db_open), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"new_table_reader_for_compaction_inputs", - {offsetof(struct DBOptions, new_table_reader_for_compaction_inputs), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"compaction_readahead_size", - {offsetof(struct DBOptions, compaction_readahead_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, compaction_readahead_size)}}, - {"random_access_max_buffer_size", - {offsetof(struct DBOptions, random_access_max_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - 
{"use_adaptive_mutex", - {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_fsync", - {offsetof(struct DBOptions, use_fsync), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"max_background_jobs", - {offsetof(struct DBOptions, max_background_jobs), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_background_jobs)}}, - {"max_background_compactions", - {offsetof(struct DBOptions, max_background_compactions), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_background_compactions)}}, - {"base_background_compactions", - {offsetof(struct DBOptions, base_background_compactions), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, base_background_compactions)}}, - {"max_background_flushes", - {offsetof(struct DBOptions, max_background_flushes), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"max_file_opening_threads", - {offsetof(struct DBOptions, max_file_opening_threads), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"max_open_files", - {offsetof(struct DBOptions, max_open_files), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_open_files)}}, - {"table_cache_numshardbits", - {offsetof(struct DBOptions, table_cache_numshardbits), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"db_write_buffer_size", - {offsetof(struct DBOptions, db_write_buffer_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"keep_log_file_num", - {offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"recycle_log_file_num", - {offsetof(struct DBOptions, recycle_log_file_num), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - 
{"log_file_time_to_roll", - {offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"manifest_preallocation_size", - {offsetof(struct DBOptions, manifest_preallocation_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"max_log_file_size", - {offsetof(struct DBOptions, max_log_file_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"db_log_dir", - {offsetof(struct DBOptions, db_log_dir), OptionType::kString, - OptionVerificationType::kNormal, false, 0}}, - {"wal_dir", - {offsetof(struct DBOptions, wal_dir), OptionType::kString, - OptionVerificationType::kNormal, false, 0}}, - {"max_subcompactions", - {offsetof(struct DBOptions, max_subcompactions), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"WAL_size_limit_MB", - {offsetof(struct DBOptions, WAL_size_limit_MB), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"WAL_ttl_seconds", - {offsetof(struct DBOptions, WAL_ttl_seconds), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"bytes_per_sync", - {offsetof(struct DBOptions, bytes_per_sync), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, bytes_per_sync)}}, - {"delayed_write_rate", - {offsetof(struct DBOptions, delayed_write_rate), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, delayed_write_rate)}}, - {"delete_obsolete_files_period_micros", - {offsetof(struct DBOptions, delete_obsolete_files_period_micros), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, - delete_obsolete_files_period_micros)}}, - {"max_manifest_file_size", - {offsetof(struct DBOptions, max_manifest_file_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"max_total_wal_size", - {offsetof(struct DBOptions, max_total_wal_size), 
OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_total_wal_size)}}, - {"wal_bytes_per_sync", - {offsetof(struct DBOptions, wal_bytes_per_sync), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, wal_bytes_per_sync)}}, - {"strict_bytes_per_sync", - {offsetof(struct DBOptions, strict_bytes_per_sync), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, strict_bytes_per_sync)}}, - {"stats_dump_period_sec", - {offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, stats_dump_period_sec)}}, - {"stats_persist_period_sec", - {offsetof(struct DBOptions, stats_persist_period_sec), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, stats_persist_period_sec)}}, - {"persist_stats_to_disk", - {offsetof(struct DBOptions, persist_stats_to_disk), - OptionType::kBoolean, OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, persist_stats_to_disk)}}, - {"stats_history_buffer_size", - {offsetof(struct DBOptions, stats_history_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, stats_history_buffer_size)}}, - {"fail_if_options_file_error", - {offsetof(struct DBOptions, fail_if_options_file_error), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"enable_pipelined_write", - {offsetof(struct DBOptions, enable_pipelined_write), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"unordered_write", - {offsetof(struct DBOptions, unordered_write), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_concurrent_memtable_write", - {offsetof(struct DBOptions, allow_concurrent_memtable_write), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - 
{"wal_recovery_mode", - {offsetof(struct DBOptions, wal_recovery_mode), - OptionType::kWALRecoveryMode, OptionVerificationType::kNormal, false, - 0}}, - {"enable_write_thread_adaptive_yield", - {offsetof(struct DBOptions, enable_write_thread_adaptive_yield), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"write_thread_slow_yield_usec", - {offsetof(struct DBOptions, write_thread_slow_yield_usec), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"max_write_batch_group_size_bytes", - {offsetof(struct DBOptions, max_write_batch_group_size_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"write_thread_max_yield_usec", - {offsetof(struct DBOptions, write_thread_max_yield_usec), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"access_hint_on_compaction_start", - {offsetof(struct DBOptions, access_hint_on_compaction_start), - OptionType::kAccessHint, OptionVerificationType::kNormal, false, 0}}, - {"info_log_level", - {offsetof(struct DBOptions, info_log_level), OptionType::kInfoLogLevel, - OptionVerificationType::kNormal, false, 0}}, - {"dump_malloc_stats", - {offsetof(struct DBOptions, dump_malloc_stats), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"avoid_flush_during_recovery", - {offsetof(struct DBOptions, avoid_flush_during_recovery), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"avoid_flush_during_shutdown", - {offsetof(struct DBOptions, avoid_flush_during_shutdown), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}}, - {"writable_file_max_buffer_size", - {offsetof(struct DBOptions, writable_file_max_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, writable_file_max_buffer_size)}}, - {"allow_ingest_behind", - {offsetof(struct DBOptions, allow_ingest_behind), 
OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, allow_ingest_behind)}}, - {"preserve_deletes", - {offsetof(struct DBOptions, preserve_deletes), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, preserve_deletes)}}, - {"concurrent_prepare", // Deprecated by two_write_queues - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"two_write_queues", - {offsetof(struct DBOptions, two_write_queues), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, two_write_queues)}}, - {"manual_wal_flush", - {offsetof(struct DBOptions, manual_wal_flush), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, manual_wal_flush)}}, - {"seq_per_batch", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"atomic_flush", - {offsetof(struct DBOptions, atomic_flush), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, atomic_flush)}}, - {"avoid_unnecessary_blocking_io", - {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), - OptionType::kBoolean, OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}}, - {"write_dbid_to_manifest", - {offsetof(struct DBOptions, write_dbid_to_manifest), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"log_readahead_size", - {offsetof(struct DBOptions, log_readahead_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, -}; - -std::unordered_map - OptionsHelper::block_base_table_index_type_string_map = { - {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, - {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, - {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, - 
{"kBinarySearchWithFirstKey", - BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; - -std::unordered_map - OptionsHelper::block_base_table_data_block_index_type_string_map = { - {"kDataBlockBinarySearch", - BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, - {"kDataBlockBinaryAndHash", - BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; - -std::unordered_map - OptionsHelper::block_base_table_index_shortening_mode_string_map = { - {"kNoShortening", - BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, - {"kShortenSeparators", - BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, - {"kShortenSeparatorsAndSuccessor", - BlockBasedTableOptions::IndexShorteningMode:: - kShortenSeparatorsAndSuccessor}}; - std::unordered_map OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, {"kPrefix", kPrefix}}; @@ -1727,398 +824,574 @@ {"kOldestSmallestSeqFirst", kOldestSmallestSeqFirst}, {"kMinOverlappingRatio", kMinOverlappingRatio}}; -std::unordered_map - OptionsHelper::wal_recovery_mode_string_map = { - {"kTolerateCorruptedTailRecords", - WALRecoveryMode::kTolerateCorruptedTailRecords}, - {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency}, - {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery}, - {"kSkipAnyCorruptedRecords", - WALRecoveryMode::kSkipAnyCorruptedRecords}}; - -std::unordered_map - OptionsHelper::access_hint_string_map = { - {"NONE", DBOptions::AccessHint::NONE}, - {"NORMAL", DBOptions::AccessHint::NORMAL}, - {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, - {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; - -std::unordered_map - OptionsHelper::info_log_level_string_map = { - {"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL}, - {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL}, - {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL}, - {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL}, - {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, - {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; - 
-ColumnFamilyOptions OptionsHelper::dummy_cf_options; -CompactionOptionsFIFO OptionsHelper::dummy_comp_options; -LRUCacheOptions OptionsHelper::dummy_lru_cache_options; -CompactionOptionsUniversal OptionsHelper::dummy_comp_options_universal; - -// offset_of is used to get the offset of a class data member -// ex: offset_of(&ColumnFamilyOptions::num_levels) -// This call will return the offset of num_levels in ColumnFamilyOptions class -// -// This is the same as offsetof() but allow us to work with non standard-layout -// classes and structures -// refs: -// http://en.cppreference.com/w/cpp/concept/StandardLayoutType -// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 -template -int offset_of(T1 ColumnFamilyOptions::*member) { - return int(size_t(&(OptionsHelper::dummy_cf_options.*member)) - - size_t(&OptionsHelper::dummy_cf_options)); -} -template -int offset_of(T1 AdvancedColumnFamilyOptions::*member) { - return int(size_t(&(OptionsHelper::dummy_cf_options.*member)) - - size_t(&OptionsHelper::dummy_cf_options)); -} -template -int offset_of(T1 CompactionOptionsFIFO::*member) { - return int(size_t(&(OptionsHelper::dummy_comp_options.*member)) - - size_t(&OptionsHelper::dummy_comp_options)); -} -template -int offset_of(T1 LRUCacheOptions::*member) { - return int(size_t(&(OptionsHelper::dummy_lru_cache_options.*member)) - - size_t(&OptionsHelper::dummy_lru_cache_options)); -} -template -int offset_of(T1 CompactionOptionsUniversal::*member) { - return int(size_t(&(OptionsHelper::dummy_comp_options_universal.*member)) - - size_t(&OptionsHelper::dummy_comp_options_universal)); -} - -std::unordered_map - OptionsHelper::cf_options_type_info = { - /* not yet supported - CompressionOptions compression_opts; - TablePropertiesCollectorFactories table_properties_collector_factories; - typedef std::vector> - TablePropertiesCollectorFactories; - UpdateStatus (*inplace_callback)(char* existing_value, - uint34_t* existing_value_size, - Slice delta_value, - std::string* 
merged_value); - std::vector cf_paths; - */ - {"report_bg_io_stats", - {offset_of(&ColumnFamilyOptions::report_bg_io_stats), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, report_bg_io_stats)}}, - {"compaction_measure_io_stats", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"disable_auto_compactions", - {offset_of(&ColumnFamilyOptions::disable_auto_compactions), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, disable_auto_compactions)}}, - {"filter_deletes", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, - 0}}, - {"inplace_update_support", - {offset_of(&ColumnFamilyOptions::inplace_update_support), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"level_compaction_dynamic_level_bytes", - {offset_of(&ColumnFamilyOptions::level_compaction_dynamic_level_bytes), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"optimize_filters_for_hits", - {offset_of(&ColumnFamilyOptions::optimize_filters_for_hits), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"paranoid_file_checks", - {offset_of(&ColumnFamilyOptions::paranoid_file_checks), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, paranoid_file_checks)}}, - {"force_consistency_checks", - {offset_of(&ColumnFamilyOptions::force_consistency_checks), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"purge_redundant_kvs_while_flush", - {offset_of(&ColumnFamilyOptions::purge_redundant_kvs_while_flush), - OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, - {"verify_checksums_in_compaction", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, - 0}}, - {"soft_pending_compaction_bytes_limit", - {offset_of(&ColumnFamilyOptions::soft_pending_compaction_bytes_limit), - OptionType::kUInt64T, 
OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - soft_pending_compaction_bytes_limit)}}, - {"hard_pending_compaction_bytes_limit", - {offset_of(&ColumnFamilyOptions::hard_pending_compaction_bytes_limit), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - hard_pending_compaction_bytes_limit)}}, - {"hard_rate_limit", - {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, - 0}}, - {"soft_rate_limit", - {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, - 0}}, - {"max_compaction_bytes", - {offset_of(&ColumnFamilyOptions::max_compaction_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_compaction_bytes)}}, - {"expanded_compaction_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"level0_file_num_compaction_trigger", - {offset_of(&ColumnFamilyOptions::level0_file_num_compaction_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - level0_file_num_compaction_trigger)}}, - {"level0_slowdown_writes_trigger", - {offset_of(&ColumnFamilyOptions::level0_slowdown_writes_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger)}}, - {"level0_stop_writes_trigger", - {offset_of(&ColumnFamilyOptions::level0_stop_writes_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, level0_stop_writes_trigger)}}, - {"max_grandparent_overlap_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"max_mem_compaction_level", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, false, 0}}, - {"max_write_buffer_number", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, 
max_write_buffer_number)}}, - {"max_write_buffer_number_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"max_write_buffer_size_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_size_to_maintain), - OptionType::kInt64T, OptionVerificationType::kNormal, false, 0}}, - {"min_write_buffer_number_to_merge", - {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"num_levels", - {offset_of(&ColumnFamilyOptions::num_levels), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"source_compaction_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"target_file_size_multiplier", - {offset_of(&ColumnFamilyOptions::target_file_size_multiplier), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, target_file_size_multiplier)}}, - {"arena_block_size", - {offset_of(&ColumnFamilyOptions::arena_block_size), OptionType::kSizeT, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, arena_block_size)}}, - {"inplace_update_num_locks", - {offset_of(&ColumnFamilyOptions::inplace_update_num_locks), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, inplace_update_num_locks)}}, - {"max_successive_merges", - {offset_of(&ColumnFamilyOptions::max_successive_merges), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_successive_merges)}}, - {"memtable_huge_page_size", - {offset_of(&ColumnFamilyOptions::memtable_huge_page_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_huge_page_size)}}, - {"memtable_prefix_bloom_huge_page_tlb_size", - {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, true, 0}}, - 
{"write_buffer_size", - {offset_of(&ColumnFamilyOptions::write_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, write_buffer_size)}}, - {"bloom_locality", - {offset_of(&ColumnFamilyOptions::bloom_locality), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"memtable_prefix_bloom_bits", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, - 0}}, - {"memtable_prefix_bloom_size_ratio", - {offset_of(&ColumnFamilyOptions::memtable_prefix_bloom_size_ratio), - OptionType::kDouble, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio)}}, - {"memtable_prefix_bloom_probes", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, - 0}}, - {"memtable_whole_key_filtering", - {offset_of(&ColumnFamilyOptions::memtable_whole_key_filtering), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_whole_key_filtering)}}, - {"min_partial_merge_operands", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, - 0}}, - {"max_bytes_for_level_base", - {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, - {"snap_refresh_nanos", - {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, true, - 0}}, - {"max_bytes_for_level_multiplier", - {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), - OptionType::kDouble, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier)}}, - {"max_bytes_for_level_multiplier_additional", - {offset_of( - &ColumnFamilyOptions::max_bytes_for_level_multiplier_additional), - OptionType::kVectorInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - max_bytes_for_level_multiplier_additional)}}, - 
{"max_sequential_skip_in_iterations", - {offset_of(&ColumnFamilyOptions::max_sequential_skip_in_iterations), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - max_sequential_skip_in_iterations)}}, - {"target_file_size_base", - {offset_of(&ColumnFamilyOptions::target_file_size_base), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, target_file_size_base)}}, - {"rate_limit_delay_max_milliseconds", - {0, OptionType::kUInt, OptionVerificationType::kDeprecated, false, 0}}, - {"compression", - {offset_of(&ColumnFamilyOptions::compression), - OptionType::kCompressionType, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, compression)}}, - {"compression_per_level", - {offset_of(&ColumnFamilyOptions::compression_per_level), - OptionType::kVectorCompressionType, OptionVerificationType::kNormal, - false, 0}}, - {"bottommost_compression", - {offset_of(&ColumnFamilyOptions::bottommost_compression), - OptionType::kCompressionType, OptionVerificationType::kNormal, false, - 0}}, - {kNameComparator, - {offset_of(&ColumnFamilyOptions::comparator), OptionType::kComparator, - OptionVerificationType::kByName, false, 0}}, - {"prefix_extractor", - {offset_of(&ColumnFamilyOptions::prefix_extractor), - OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, - true, offsetof(struct MutableCFOptions, prefix_extractor)}}, - {"memtable_insert_with_hint_prefix_extractor", - {offset_of( - &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), - OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, - false, 0}}, - {"memtable_factory", - {offset_of(&ColumnFamilyOptions::memtable_factory), - OptionType::kMemTableRepFactory, OptionVerificationType::kByName, - false, 0}}, - {"table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kTableFactory, OptionVerificationType::kByName, false, - 0}}, - 
{"compaction_filter", - {offset_of(&ColumnFamilyOptions::compaction_filter), - OptionType::kCompactionFilter, OptionVerificationType::kByName, false, - 0}}, - {"compaction_filter_factory", - {offset_of(&ColumnFamilyOptions::compaction_filter_factory), - OptionType::kCompactionFilterFactory, OptionVerificationType::kByName, - false, 0}}, - {kNameMergeOperator, - {offset_of(&ColumnFamilyOptions::merge_operator), - OptionType::kMergeOperator, - OptionVerificationType::kByNameAllowFromNull, false, 0}}, - {"compaction_style", - {offset_of(&ColumnFamilyOptions::compaction_style), - OptionType::kCompactionStyle, OptionVerificationType::kNormal, false, - 0}}, - {"compaction_pri", - {offset_of(&ColumnFamilyOptions::compaction_pri), - OptionType::kCompactionPri, OptionVerificationType::kNormal, false, - 0}}, - {"compaction_options_fifo", - {offset_of(&ColumnFamilyOptions::compaction_options_fifo), - OptionType::kCompactionOptionsFIFO, OptionVerificationType::kNormal, - true, offsetof(struct MutableCFOptions, compaction_options_fifo)}}, - {"compaction_options_universal", - {offset_of(&ColumnFamilyOptions::compaction_options_universal), - OptionType::kCompactionOptionsUniversal, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, compaction_options_universal)}}, - {"ttl", - {offset_of(&ColumnFamilyOptions::ttl), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, ttl)}}, - {"periodic_compaction_seconds", - {offset_of(&ColumnFamilyOptions::periodic_compaction_seconds), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, periodic_compaction_seconds)}}, - {"sample_for_compression", - {offset_of(&ColumnFamilyOptions::sample_for_compression), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, sample_for_compression)}}}; - -std::unordered_map - OptionsHelper::fifo_compaction_options_type_info = { - 
{"max_table_files_size", - {offset_of(&CompactionOptionsFIFO::max_table_files_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct CompactionOptionsFIFO, max_table_files_size)}}, - {"ttl", - {0, OptionType::kUInt64T, - OptionVerificationType::kDeprecated, false, - 0}}, - {"allow_compaction", - {offset_of(&CompactionOptionsFIFO::allow_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct CompactionOptionsFIFO, allow_compaction)}}}; - -std::unordered_map - OptionsHelper::universal_compaction_options_type_info = { - {"size_ratio", - {offset_of(&CompactionOptionsUniversal::size_ratio), OptionType::kUInt, - OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, size_ratio)}}, - {"min_merge_width", - {offset_of(&CompactionOptionsUniversal::min_merge_width), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, min_merge_width)}}, - {"max_merge_width", - {offset_of(&CompactionOptionsUniversal::max_merge_width), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, max_merge_width)}}, - {"max_size_amplification_percent", - {offset_of( - &CompactionOptionsUniversal::max_size_amplification_percent), - OptionType::kUInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, - max_size_amplification_percent)}}, - {"compression_size_percent", - {offset_of(&CompactionOptionsUniversal::compression_size_percent), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, - compression_size_percent)}}, - {"stop_style", - {offset_of(&CompactionOptionsUniversal::stop_style), - OptionType::kCompactionStopStyle, OptionVerificationType::kNormal, - true, offsetof(class CompactionOptionsUniversal, stop_style)}}, - {"allow_trivial_move", - {offset_of(&CompactionOptionsUniversal::allow_trivial_move), - 
OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(class CompactionOptionsUniversal, allow_trivial_move)}}}; - std::unordered_map OptionsHelper::compaction_stop_style_string_map = { {"kCompactionStopStyleSimilarSize", kCompactionStopStyleSimilarSize}, {"kCompactionStopStyleTotalSize", kCompactionStopStyleTotalSize}}; -std::unordered_map - OptionsHelper::lru_cache_options_type_info = { - {"capacity", - {offset_of(&LRUCacheOptions::capacity), OptionType::kSizeT, - OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, capacity)}}, - {"num_shard_bits", - {offset_of(&LRUCacheOptions::num_shard_bits), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, num_shard_bits)}}, - {"strict_capacity_limit", - {offset_of(&LRUCacheOptions::strict_capacity_limit), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, strict_capacity_limit)}}, - {"high_pri_pool_ratio", - {offset_of(&LRUCacheOptions::high_pri_pool_ratio), OptionType::kDouble, - OptionVerificationType::kNormal, true, - offsetof(struct LRUCacheOptions, high_pri_pool_ratio)}}}; +Status OptionTypeInfo::NextToken(const std::string& opts, char delimiter, + size_t pos, size_t* end, std::string* token) { + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + // Empty value at the end + if (pos >= opts.size()) { + *token = ""; + *end = std::string::npos; + return Status::OK(); + } else if (opts[pos] == '{') { + int count = 1; + size_t brace_pos = pos + 1; + while (brace_pos < opts.size()) { + if (opts[brace_pos] == '{') { + ++count; + } else if (opts[brace_pos] == '}') { + --count; + if (count == 0) { + break; + } + } + ++brace_pos; + } + // found the matching closing brace + if (count == 0) { + *token = trim(opts.substr(pos + 1, brace_pos - pos - 1)); + // skip all whitespace and move to the next delimiter + // brace_pos points to the next position after the matching '}' + pos = brace_pos 
+ 1; + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + if (pos < opts.size() && opts[pos] != delimiter) { + return Status::InvalidArgument("Unexpected chars after nested options"); + } + *end = pos; + } else { + return Status::InvalidArgument( + "Mismatched curly braces for nested options"); + } + } else { + *end = opts.find(delimiter, pos); + if (*end == std::string::npos) { + // It either ends with a trailing semi-colon or the last key-value pair + *token = trim(opts.substr(pos)); + } else { + *token = trim(opts.substr(pos, *end - pos)); + } + } + return Status::OK(); +} + +Status OptionTypeInfo::Parse(const ConfigOptions& config_options, + const std::string& opt_name, + const std::string& value, void* opt_ptr) const { + if (IsDeprecated()) { + return Status::OK(); + } + try { + void* opt_addr = static_cast(opt_ptr) + offset_; + const std::string& opt_value = config_options.input_strings_escaped + ? UnescapeOptionString(value) + : value; + + if (opt_addr == nullptr) { + return Status::NotFound("Could not find option", opt_name); + } else if (parse_func_ != nullptr) { + ConfigOptions copy = config_options; + copy.invoke_prepare_options = false; + return parse_func_(copy, opt_name, opt_value, opt_addr); + } else if (ParseOptionHelper(opt_addr, type_, opt_value)) { + return Status::OK(); + } else if (IsConfigurable()) { + // The option is . 
+ Configurable* config = AsRawPointer(opt_ptr); + if (opt_value.empty()) { + return Status::OK(); + } else if (config == nullptr) { + return Status::NotFound("Could not find configurable: ", opt_name); + } else { + ConfigOptions copy = config_options; + copy.ignore_unknown_options = false; + copy.invoke_prepare_options = false; + if (opt_value.find("=") != std::string::npos) { + return config->ConfigureFromString(copy, opt_value); + } else { + return config->ConfigureOption(copy, opt_name, opt_value); + } + } + } else if (IsByName()) { + return Status::NotSupported("Deserializing the option " + opt_name + + " is not supported"); + } else { + return Status::InvalidArgument("Error parsing:", opt_name); + } + } catch (std::exception& e) { + return Status::InvalidArgument("Error parsing " + opt_name + ":" + + std::string(e.what())); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + std::unordered_map opts_map; + Status status = StringToMap(opts_str, &opts_map); + if (!status.ok()) { + return status; + } else { + return ParseType(config_options, opts_map, type_map, opt_addr, unused); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + for (const auto& opts_iter : opts_map) { + std::string opt_name; + const auto* opt_info = Find(opts_iter.first, type_map, &opt_name); + if (opt_info != nullptr) { + Status status = + opt_info->Parse(config_options, opt_name, opts_iter.second, opt_addr); + if (!status.ok()) { + return status; + } + } else if (unused != nullptr) { + (*unused)[opts_iter.first] = opts_iter.second; + } else if (!config_options.ignore_unknown_options) { + return Status::NotFound("Unrecognized option", opts_iter.first); + } + } + return Status::OK(); +} + 
+Status OptionTypeInfo::ParseStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const std::string& opt_value, void* opt_addr) { + assert(struct_map); + Status status; + if (opt_name == struct_name || EndsWith(opt_name, "." + struct_name)) { + // This option represents the entire struct + std::unordered_map unused; + status = + ParseType(config_options, opt_value, *struct_map, opt_addr, &unused); + if (status.ok() && !unused.empty()) { + status = Status::InvalidArgument( + "Unrecognized option", struct_name + "." + unused.begin()->first); + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr); + } else { + status = Status::InvalidArgument("Unrecognized option", opt_name); + } + } else { + // This option represents a field in the struct (e.g. field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Parse(config_options, elem_name, opt_value, opt_addr); + } else { + status = Status::InvalidArgument("Unrecognized option", + struct_name + "." + opt_name); + } + } + return status; +} +Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const opt_ptr, + std::string* opt_value) const { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. 
+ const void* opt_addr = static_cast(opt_ptr) + offset_; + if (opt_addr == nullptr || IsDeprecated()) { + return Status::OK(); + } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { + return Status::NotSupported("Cannot serialize option: ", opt_name); + } else if (serialize_func_ != nullptr) { + return serialize_func_(config_options, opt_name, opt_addr, opt_value); + } else if (IsCustomizable()) { + const Customizable* custom = AsRawPointer(opt_ptr); + opt_value->clear(); + if (custom == nullptr) { + // We do not have a custom object to serialize. + // If the option is not mutable and we are doing only mutable options, + // we return an empty string (which will cause the option not to be + // printed). Otherwise, we return the "nullptr" string, which will result + // in "option=nullptr" being printed. + if (IsMutable() || !config_options.mutable_options_only) { + *opt_value = kNullptrString; + } else { + *opt_value = ""; + } + } else if (IsEnabled(OptionTypeFlags::kStringNameOnly) && + !config_options.IsDetailed()) { + if (!config_options.mutable_options_only || IsMutable()) { + *opt_value = custom->GetId(); + } + } else { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + // If this option is mutable, everything inside it should be considered + // mutable + if (IsMutable()) { + embedded.mutable_options_only = false; + } + std::string value = custom->ToString(embedded); + if (!embedded.mutable_options_only || + value.find("=") != std::string::npos) { + *opt_value = value; + } else { + *opt_value = ""; + } + } + return Status::OK(); + } else if (IsConfigurable()) { + const Configurable* config = AsRawPointer(opt_ptr); + if (config != nullptr) { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + *opt_value = config->ToString(embedded); + } + return Status::OK(); + } else if (config_options.mutable_options_only && !IsMutable()) { + return Status::OK(); + } else if (SerializeSingleOptionHelper(opt_addr, type_, opt_value)) 
{ + return Status::OK(); + } else { + return Status::InvalidArgument("Cannot serialize option: ", opt_name); + } +} + +Status OptionTypeInfo::SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* result) { + Status status; + for (const auto& iter : type_map) { + std::string single; + const auto& opt_info = iter.second; + if (opt_info.ShouldSerialize()) { + status = + opt_info.Serialize(config_options, iter.first, opt_addr, &single); + if (!status.ok()) { + return status; + } else { + result->append(iter.first + "=" + single + config_options.delimiter); + } + } + } + return status; +} + +Status OptionTypeInfo::SerializeStruct( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const void* opt_addr, std::string* value) { + assert(struct_map); + Status status; + if (EndsWith(opt_name, struct_name)) { + // We are going to write the struct as "{ prop1=value1; prop2=value2;}. + // Set the delimiter to ";" so that the everything will be on one line. + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + + // This option represents the entire struct + std::string result; + status = SerializeType(embedded, *struct_map, opt_addr, &result); + if (!status.ok()) { + return status; + } else { + *value = "{" + result + "}"; + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + if (opt_info != nullptr) { + status = opt_info->Serialize(config_options, elem_name, opt_addr, value); + } else { + status = Status::InvalidArgument("Unrecognized option", opt_name); + } + } else { + // This option represents a field in the struct (e.g. 
field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + if (opt_info == nullptr) { + status = Status::InvalidArgument("Unrecognized option", opt_name); + } else if (opt_info->ShouldSerialize()) { + status = opt_info->Serialize(config_options, opt_name + "." + elem_name, + opt_addr, value); + } + } + return status; +} + +template +bool IsOptionEqual(const void* offset1, const void* offset2) { + return (*static_cast(offset1) == *static_cast(offset2)); +} + +static bool AreEqualDoubles(const double a, const double b) { + return (fabs(a - b) < 0.00001); +} + +static bool AreOptionsEqual(OptionType type, const void* this_offset, + const void* that_offset) { + switch (type) { + case OptionType::kBoolean: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kInt: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kUInt: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kInt32T: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kInt64T: { + int64_t v1, v2; + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); + return (v1 == v2); + } + case OptionType::kUInt8T: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kUInt32T: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kUInt64T: { + uint64_t v1, v2; + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); + return (v1 == v2); + } + case OptionType::kSizeT: { + size_t v1, v2; + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); + return (v1 == v2); + } + case OptionType::kString: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kDouble: + return AreEqualDoubles(*static_cast(this_offset), + *static_cast(that_offset)); + case OptionType::kCompactionStyle: + return IsOptionEqual(this_offset, that_offset); + case 
OptionType::kCompactionStopStyle: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kCompactionPri: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kCompressionType: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kChecksumType: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kEncodingType: + return IsOptionEqual(this_offset, that_offset); + case OptionType::kEncodedString: + return IsOptionEqual(this_offset, that_offset); + default: + return false; + } // End switch +} + +bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const this_ptr, + const void* const that_ptr, + std::string* mismatch) const { + auto level = GetSanityLevel(); + if (!config_options.IsCheckEnabled(level)) { + return true; // If the sanity level is not being checked, skip it + } + const void* this_addr = static_cast(this_ptr) + offset_; + const void* that_addr = static_cast(that_ptr) + offset_; + if (this_addr == nullptr || that_addr == nullptr) { + if (this_addr == that_addr) { + return true; + } + } else if (equals_func_ != nullptr) { + if (equals_func_(config_options, opt_name, this_addr, that_addr, + mismatch)) { + return true; + } + } else if (AreOptionsEqual(type_, this_addr, that_addr)) { + return true; + } else if (IsConfigurable()) { + const auto* this_config = AsRawPointer(this_ptr); + const auto* that_config = AsRawPointer(that_ptr); + if (this_config == that_config) { + return true; + } else if (this_config != nullptr && that_config != nullptr) { + std::string bad_name; + bool matches; + if (level < config_options.sanity_level) { + ConfigOptions copy = config_options; + copy.sanity_level = level; + matches = this_config->AreEquivalent(copy, that_config, &bad_name); + } else { + matches = + this_config->AreEquivalent(config_options, that_config, &bad_name); + } + if (!matches) { + *mismatch = opt_name + "." 
+ bad_name; + } + return matches; + } + } + if (mismatch->empty()) { + *mismatch = opt_name; + } + return false; +} + +bool OptionTypeInfo::TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* this_addr, const void* that_addr, std::string* mismatch) { + for (const auto& iter : type_map) { + const auto& opt_info = iter.second; + if (!opt_info.AreEqual(config_options, iter.first, this_addr, that_addr, + mismatch)) { + return false; + } + } + return true; +} + +bool OptionTypeInfo::StructsAreEqual( + const ConfigOptions& config_options, const std::string& struct_name, + const std::unordered_map* struct_map, + const std::string& opt_name, const void* this_addr, const void* that_addr, + std::string* mismatch) { + assert(struct_map); + bool matches = true; + std::string result; + if (EndsWith(opt_name, struct_name)) { + // This option represents the entire struct + matches = TypesAreEqual(config_options, *struct_map, this_addr, that_addr, + &result); + if (!matches) { + *mismatch = struct_name + "." + result; + return false; + } + } else if (StartsWith(opt_name, struct_name + ".")) { + // This option represents a nested field in the struct (e.g, struct.field) + std::string elem_name; + const auto opt_info = + Find(opt_name.substr(struct_name.size() + 1), *struct_map, &elem_name); + assert(opt_info); + if (opt_info == nullptr) { + *mismatch = opt_name; + matches = false; + } else if (!opt_info->AreEqual(config_options, elem_name, this_addr, + that_addr, &result)) { + matches = false; + *mismatch = struct_name + "." + result; + } + } else { + // This option represents a field in the struct (e.g. field) + std::string elem_name; + const auto opt_info = Find(opt_name, *struct_map, &elem_name); + assert(opt_info); + if (opt_info == nullptr) { + *mismatch = struct_name + "." 
+ opt_name; + matches = false; + } else if (!opt_info->AreEqual(config_options, elem_name, this_addr, + that_addr, &result)) { + matches = false; + *mismatch = struct_name + "." + result; + } + } + return matches; +} + +bool MatchesOptionsTypeFromMap( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* const this_ptr, const void* const that_ptr, + std::string* mismatch) { + for (auto& pair : type_map) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + if (config_options.IsCheckEnabled(pair.second.GetSanityLevel())) { + if (!pair.second.AreEqual(config_options, pair.first, this_ptr, that_ptr, + mismatch) && + !pair.second.AreEqualByName(config_options, pair.first, this_ptr, + that_ptr)) { + return false; + } + } + } + return true; +} + +bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const this_ptr, + const void* const that_ptr) const { + if (IsByName()) { + std::string that_value; + if (Serialize(config_options, opt_name, that_ptr, &that_value).ok()) { + return AreEqualByName(config_options, opt_name, this_ptr, that_value); + } + } + return false; +} + +bool OptionTypeInfo::AreEqualByName(const ConfigOptions& config_options, + const std::string& opt_name, + const void* const opt_ptr, + const std::string& that_value) const { + std::string this_value; + if (!IsByName()) { + return false; + } else if (!Serialize(config_options, opt_name, opt_ptr, &this_value).ok()) { + return false; + } else if (IsEnabled(OptionVerificationType::kByNameAllowFromNull)) { + if (that_value == kNullptrString) { + return true; + } + } else if (IsEnabled(OptionVerificationType::kByNameAllowNull)) { + if (that_value == kNullptrString) { + return true; + } + } + return (this_value == that_value); +} + +const OptionTypeInfo* OptionTypeInfo::Find( + const std::string& opt_name, + const std::unordered_map& opt_map, + 
std::string* elem_name) { + const auto iter = opt_map.find(opt_name); // Look up the value in the map + if (iter != opt_map.end()) { // Found the option in the map + *elem_name = opt_name; // Return the name + return &(iter->second); // Return the contents of the iterator + } else { + auto idx = opt_name.find("."); // Look for a separator + if (idx > 0 && idx != std::string::npos) { // We found a separator + auto siter = + opt_map.find(opt_name.substr(0, idx)); // Look for the short name + if (siter != opt_map.end()) { // We found the short name + if (siter->second.IsStruct() || // If the object is a struct + siter->second.IsConfigurable()) { // or a Configurable + *elem_name = opt_name.substr(idx + 1); // Return the rest + return &(siter->second); // Return the contents of the iterator + } + } + } + } + return nullptr; +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_helper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,14 +10,36 @@ #include #include -#include "options/cf_options.h" -#include "options/db_options.h" #include "rocksdb/options.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "rocksdb/universal_compaction.h" namespace ROCKSDB_NAMESPACE { +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; +struct ImmutableCFOptions; +struct ImmutableDBOptions; +struct MutableDBOptions; +struct MutableCFOptions; +struct Options; + +std::vector GetSupportedCompressions(); + +std::vector GetSupportedDictCompressions(); + +std::vector GetSupportedChecksums(); + +inline bool IsSupportedChecksumType(ChecksumType type) { + // Avoid annoying compiler warning-as-error (-Werror=type-limits) + auto min = kNoChecksum; + auto max = 
kXXH3; + return type >= min && type <= max; +} + +// Checks that the combination of DBOptions and ColumnFamilyOptions are valid +Status ValidateOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts); DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, const MutableDBOptions& mutable_db_options); @@ -26,128 +48,31 @@ const ColumnFamilyOptions& ioptions, const MutableCFOptions& mutable_cf_options); -#ifndef ROCKSDB_LITE - -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* info_log, MutableCFOptions* new_options); - -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options); - -Status GetTableFactoryFromMap( - const std::string& factory_name, - const std::unordered_map& opt_map, - std::shared_ptr* table_factory, - bool ignore_unknown_options = false); - -enum class OptionType { - kBoolean, - kInt, - kInt32T, - kInt64T, - kVectorInt, - kUInt, - kUInt32T, - kUInt64T, - kSizeT, - kString, - kDouble, - kCompactionStyle, - kCompactionPri, - kSliceTransform, - kCompressionType, - kVectorCompressionType, - kTableFactory, - kComparator, - kCompactionFilter, - kCompactionFilterFactory, - kCompactionOptionsFIFO, - kCompactionOptionsUniversal, - kCompactionStopStyle, - kMergeOperator, - kMemTableRepFactory, - kBlockBasedTableIndexType, - kBlockBasedTableDataBlockIndexType, - kBlockBasedTableIndexShorteningMode, - kFilterPolicy, - kFlushBlockPolicyFactory, - kChecksumType, - kEncodingType, - kWALRecoveryMode, - kAccessHint, - kInfoLogLevel, - kLRUCacheOptions, - kEnv, - kUnknown, -}; - -enum class OptionVerificationType { - kNormal, - kByName, // The option is pointer typed so we can only verify - // based on it's name. - kByNameAllowNull, // Same as kByName, but it also allows the case - // where one of them is a nullptr. 
- kByNameAllowFromNull, // Same as kByName, but it also allows the case - // where the old option is nullptr. - kDeprecated // The option is no longer used in rocksdb. The RocksDB - // OptionsParser will still accept this option if it - // happen to exists in some Options file. However, - // the parser will not include it in serialization - // and verification processes. -}; +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts); +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts); -// A struct for storing constant option information such as option name, -// option type, and offset. -struct OptionTypeInfo { - int offset; - OptionType type; - OptionVerificationType verification; - bool is_mutable; - int mutable_offset; -}; - -// A helper function that converts "opt_address" to a std::string -// based on the specified OptionType. -bool SerializeSingleOptionHelper(const char* opt_address, - const OptionType opt_type, std::string* value); - -// In addition to its public version defined in rocksdb/convenience.h, -// this further takes an optional output vector "unsupported_options_names", -// which stores the name of all the unsupported options specified in "opts_map". -Status GetDBOptionsFromMapInternal( - const DBOptions& base_options, - const std::unordered_map& opts_map, - DBOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names = nullptr, - bool ignore_unknown_options = false); - -// In addition to its public version defined in rocksdb/convenience.h, -// this further takes an optional output vector "unsupported_options_names", -// which stores the name of all the unsupported options specified in "opts_map". 
-Status GetColumnFamilyOptionsFromMapInternal( - const ColumnFamilyOptions& base_options, - const std::unordered_map& opts_map, - ColumnFamilyOptions* new_options, bool input_strings_escaped, - std::vector* unsupported_options_names = nullptr, - bool ignore_unknown_options = false); - -bool ParseSliceTransform( - const std::string& value, - std::shared_ptr* slice_transform); +#ifndef ROCKSDB_LITE +std::unique_ptr DBOptionsAsConfigurable( + const MutableDBOptions& opts); +std::unique_ptr DBOptionsAsConfigurable( + const DBOptions& opts, + const std::unordered_map* opt_map = nullptr); +std::unique_ptr CFOptionsAsConfigurable( + const MutableCFOptions& opts); +std::unique_ptr CFOptionsAsConfigurable( + const ColumnFamilyOptions& opts, + const std::unordered_map* opt_map = nullptr); extern Status StringToMap( const std::string& opts_str, std::unordered_map* opts_map); - -extern bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, - const std::string& value); #endif // !ROCKSDB_LITE struct OptionsHelper { + static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/; + static const std::string kDBOptionsName /*= "DBOptions" */; static std::map compaction_style_to_string; static std::map compaction_pri_to_string; static std::map @@ -156,39 +81,13 @@ static std::unordered_map compression_type_string_map; #ifndef ROCKSDB_LITE - static std::unordered_map cf_options_type_info; - static std::unordered_map - fifo_compaction_options_type_info; - static std::unordered_map - universal_compaction_options_type_info; static std::unordered_map compaction_stop_style_string_map; - static std::unordered_map db_options_type_info; - static std::unordered_map - lru_cache_options_type_info; - static std::unordered_map - block_base_table_index_type_string_map; - static std::unordered_map - block_base_table_data_block_index_type_string_map; - static std::unordered_map - block_base_table_index_shortening_mode_string_map; static std::unordered_map 
encoding_type_string_map; static std::unordered_map compaction_style_string_map; static std::unordered_map compaction_pri_string_map; - static std::unordered_map - wal_recovery_mode_string_map; - static std::unordered_map - access_hint_string_map; - static std::unordered_map - info_log_level_string_map; - static ColumnFamilyOptions dummy_cf_options; - static CompactionOptionsFIFO dummy_comp_options; - static LRUCacheOptions dummy_lru_cache_options; - static CompactionOptionsUniversal dummy_comp_options_universal; #endif // !ROCKSDB_LITE }; @@ -200,34 +99,15 @@ OptionsHelper::compaction_stop_style_to_string; static auto& checksum_type_string_map = OptionsHelper::checksum_type_string_map; #ifndef ROCKSDB_LITE -static auto& cf_options_type_info = OptionsHelper::cf_options_type_info; -static auto& fifo_compaction_options_type_info = - OptionsHelper::fifo_compaction_options_type_info; -static auto& universal_compaction_options_type_info = - OptionsHelper::universal_compaction_options_type_info; static auto& compaction_stop_style_string_map = OptionsHelper::compaction_stop_style_string_map; -static auto& db_options_type_info = OptionsHelper::db_options_type_info; -static auto& lru_cache_options_type_info = - OptionsHelper::lru_cache_options_type_info; static auto& compression_type_string_map = OptionsHelper::compression_type_string_map; -static auto& block_base_table_index_type_string_map = - OptionsHelper::block_base_table_index_type_string_map; -static auto& block_base_table_data_block_index_type_string_map = - OptionsHelper::block_base_table_data_block_index_type_string_map; -static auto& block_base_table_index_shortening_mode_string_map = - OptionsHelper::block_base_table_index_shortening_mode_string_map; static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map; static auto& compaction_style_string_map = OptionsHelper::compaction_style_string_map; static auto& compaction_pri_string_map = OptionsHelper::compaction_pri_string_map; -static auto& 
wal_recovery_mode_string_map = - OptionsHelper::wal_recovery_mode_string_map; -static auto& access_hint_string_map = OptionsHelper::access_hint_string_map; -static auto& info_log_level_string_map = - OptionsHelper::info_log_level_string_map; #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,17 +13,19 @@ #include #include -#include "file/read_write_util.h" +#include "file/line_file_reader.h" #include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "options/db_options.h" #include "options/options_helper.h" +#include "port/port.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "rocksdb/utilities/options_type.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/string_util.h" -#include "port/port.h" - namespace ROCKSDB_NAMESPACE { static const std::string option_file_header = @@ -38,6 +40,27 @@ const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs) { + ConfigOptions + config_options; // Use default for escaped(true) and check (exact) + config_options.delimiter = "\n "; + // Do not invoke PrepareOptions when we are doing validation. 
+ config_options.invoke_prepare_options = false; + // If a readahead size was set in the input options, use it + if (db_opt.log_readahead_size > 0) { + config_options.file_readahead_size = db_opt.log_readahead_size; + } + return PersistRocksDBOptions(config_options, db_opt, cf_names, cf_opts, + file_name, fs); +} + +Status PersistRocksDBOptions(const ConfigOptions& config_options_in, + const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs) { + ConfigOptions config_options = config_options_in; + config_options.delimiter = "\n "; // Override the default to nl + TEST_SYNC_POINT("PersistRocksDBOptions:start"); if (cf_names.size() != cf_opts.size()) { return Status::InvalidArgument( @@ -56,55 +79,68 @@ std::string options_file_content; - writable->Append(option_file_header + "[" + - opt_section_titles[kOptionSectionVersion] + - "]\n" - " rocksdb_version=" + - ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR) + - "." + ToString(ROCKSDB_PATCH) + "\n"); - writable->Append(" options_file_version=" + - ToString(ROCKSDB_OPTION_FILE_MAJOR) + "." + - ToString(ROCKSDB_OPTION_FILE_MINOR) + "\n"); - writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] + - "]\n "); + s = writable->Append(option_file_header + "[" + + opt_section_titles[kOptionSectionVersion] + + "]\n" + " rocksdb_version=" + + ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR) + + "." + ToString(ROCKSDB_PATCH) + "\n"); + if (s.ok()) { + s = writable->Append( + " options_file_version=" + ToString(ROCKSDB_OPTION_FILE_MAJOR) + "." 
+ + ToString(ROCKSDB_OPTION_FILE_MINOR) + "\n"); + } + if (s.ok()) { + s = writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] + + "]\n "); + } - s = GetStringFromDBOptions(&options_file_content, db_opt, "\n "); - if (!s.ok()) { - writable->Close(); - return s; + if (s.ok()) { + s = GetStringFromDBOptions(config_options, db_opt, &options_file_content); + } + if (s.ok()) { + s = writable->Append(options_file_content + "\n"); } - writable->Append(options_file_content + "\n"); - for (size_t i = 0; i < cf_opts.size(); ++i) { + for (size_t i = 0; s.ok() && i < cf_opts.size(); ++i) { // CFOptions section - writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] + - " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); - s = GetStringFromColumnFamilyOptions(&options_file_content, cf_opts[i], - "\n "); - if (!s.ok()) { - writable->Close(); - return s; + s = writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] + + " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + if (s.ok()) { + s = GetStringFromColumnFamilyOptions(config_options, cf_opts[i], + &options_file_content); + } + if (s.ok()) { + s = writable->Append(options_file_content + "\n"); } - writable->Append(options_file_content + "\n"); // TableOptions section auto* tf = cf_opts[i].table_factory.get(); if (tf != nullptr) { - writable->Append("[" + opt_section_titles[kOptionSectionTableOptions] + - tf->Name() + " \"" + EscapeOptionString(cf_names[i]) + - "\"]\n "); - options_file_content.clear(); - s = tf->GetOptionString(&options_file_content, "\n "); - if (!s.ok()) { - return s; + if (s.ok()) { + s = writable->Append( + "[" + opt_section_titles[kOptionSectionTableOptions] + tf->Name() + + " \"" + EscapeOptionString(cf_names[i]) + "\"]\n "); + } + if (s.ok()) { + options_file_content.clear(); + s = tf->GetOptionString(config_options, &options_file_content); + } + if (s.ok()) { + s = writable->Append(options_file_content + "\n"); } - writable->Append(options_file_content + 
"\n"); } } - writable->Sync(true /* use_fsync */); - writable->Close(); - - return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - db_opt, cf_names, cf_opts, file_name, fs); + if (s.ok()) { + s = writable->Sync(true /* use_fsync */); + } + if (s.ok()) { + s = writable->Close(); + } + if (s.ok()) { + return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( + config_options, db_opt, cf_names, cf_opts, file_name, fs); + } + return s; } RocksDBOptionsParser::RocksDBOptionsParser() { Reset(); } @@ -205,7 +241,20 @@ Status RocksDBOptionsParser::Parse(const std::string& file_name, FileSystem* fs, bool ignore_unknown_options, size_t file_readahead_size) { + ConfigOptions + config_options; // Use default for escaped(true) and check (exact) + config_options.ignore_unknown_options = ignore_unknown_options; + if (file_readahead_size > 0) { + config_options.file_readahead_size = file_readahead_size; + } + return Parse(config_options, file_name, fs); +} + +Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, + const std::string& file_name, + FileSystem* fs) { Reset(); + ConfigOptions config_options = config_options_in; std::unique_ptr seq_file; Status s = fs->NewSequentialFile(file_name, FileOptions(), &seq_file, @@ -213,29 +262,23 @@ if (!s.ok()) { return s; } - - SequentialFileReader sf_reader(std::move(seq_file), file_name, - file_readahead_size); + LineFileReader lf_reader(std::move(seq_file), file_name, + config_options.file_readahead_size); OptionSection section = kOptionSectionUnknown; std::string title; std::string argument; std::unordered_map opt_map; - std::istringstream iss; std::string line; - bool has_data = true; // we only support single-lined statement. 
- for (int line_num = 1; ReadOneLine(&iss, &sf_reader, &line, &has_data, &s); - ++line_num) { - if (!s.ok()) { - return s; - } + while (lf_reader.ReadLine(&line)) { + int line_num = static_cast(lf_reader.GetLineNumber()); line = TrimAndRemoveComment(line); if (line.empty()) { continue; } if (IsSection(line)) { - s = EndSection(section, title, argument, opt_map, ignore_unknown_options); + s = EndSection(config_options, section, title, argument, opt_map); opt_map.clear(); if (!s.ok()) { return s; @@ -243,10 +286,11 @@ // If the option file is not generated by a higher minor version, // there shouldn't be any unknown option. - if (ignore_unknown_options && section == kOptionSectionVersion) { + if (config_options.ignore_unknown_options && + section == kOptionSectionVersion) { if (db_version[0] < ROCKSDB_MAJOR || (db_version[0] == ROCKSDB_MAJOR && db_version[1] <= ROCKSDB_MINOR)) { - ignore_unknown_options = false; + config_options.ignore_unknown_options = false; } } @@ -264,8 +308,12 @@ opt_map.insert({name, value}); } } + s = lf_reader.GetStatus(); + if (!s.ok()) { + return s; + } - s = EndSection(section, title, argument, opt_map, ignore_unknown_options); + s = EndSection(config_options, section, title, argument, opt_map); opt_map.clear(); if (!s.ok()) { return s; @@ -372,14 +420,12 @@ } Status RocksDBOptionsParser::EndSection( - const OptionSection section, const std::string& section_title, - const std::string& section_arg, - const std::unordered_map& opt_map, - bool ignore_unknown_options) { + const ConfigOptions& config_options, const OptionSection section, + const std::string& section_title, const std::string& section_arg, + const std::unordered_map& opt_map) { Status s; if (section == kOptionSectionDBOptions) { - s = GetDBOptionsFromMap(DBOptions(), opt_map, &db_opt_, true, - ignore_unknown_options); + s = GetDBOptionsFromMap(config_options, DBOptions(), opt_map, &db_opt_); if (!s.ok()) { return s; } @@ -390,9 +436,8 @@ assert(GetCFOptions(section_arg) == 
nullptr); cf_names_.emplace_back(section_arg); cf_opts_.emplace_back(); - s = GetColumnFamilyOptionsFromMap(ColumnFamilyOptions(), opt_map, - &cf_opts_.back(), true, - ignore_unknown_options); + s = GetColumnFamilyOptionsFromMap(config_options, ColumnFamilyOptions(), + opt_map, &cf_opts_.back()); if (!s.ok()) { return s; } @@ -408,15 +453,27 @@ section_arg); } // Ignore error as table factory deserialization is optional - s = GetTableFactoryFromMap( + s = TableFactory::CreateFromString( + config_options, section_title.substr( opt_section_titles[kOptionSectionTableOptions].size()), - opt_map, &(cf_opt->table_factory), ignore_unknown_options); - if (!s.ok()) { - return s; + &(cf_opt->table_factory)); + if (s.ok()) { + s = cf_opt->table_factory->ConfigureFromMap(config_options, opt_map); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } + } else { + // Return OK for not supported table factories as TableFactory + // Deserialization is optional. 
+ cf_opt->table_factory.reset(); + return Status::OK(); } } else if (section == kOptionSectionVersion) { - for (const auto pair : opt_map) { + for (const auto& pair : opt_map) { if (pair.first == "rocksdb_version") { s = ParseVersionNumber(pair.first, pair.second, 3, db_version); if (!s.ok()) { @@ -434,7 +491,7 @@ } } } - return Status::OK(); + return s; } Status RocksDBOptionsParser::ValidityCheck() { @@ -487,204 +544,37 @@ return ""; } -namespace { -bool AreEqualDoubles(const double a, const double b) { - return (fabs(a - b) < 0.00001); -} -} // namespace - -bool AreEqualOptions( - const char* opt1, const char* opt2, const OptionTypeInfo& type_info, - const std::string& opt_name, - const std::unordered_map* opt_map) { - const char* offset1 = opt1 + type_info.offset; - const char* offset2 = opt2 + type_info.offset; - - switch (type_info.type) { - case OptionType::kBoolean: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInt: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInt32T: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInt64T: - { - int64_t v1, v2; - GetUnaligned(reinterpret_cast(offset1), &v1); - GetUnaligned(reinterpret_cast(offset2), &v2); - return (v1 == v2); - } - case OptionType::kVectorInt: - return (*reinterpret_cast*>(offset1) == - *reinterpret_cast*>(offset2)); - case OptionType::kUInt: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kUInt32T: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kUInt64T: - { - uint64_t v1, v2; - GetUnaligned(reinterpret_cast(offset1), &v1); - GetUnaligned(reinterpret_cast(offset2), &v2); - return (v1 == v2); - } - case OptionType::kSizeT: - { - size_t v1, v2; - GetUnaligned(reinterpret_cast(offset1), &v1); - GetUnaligned(reinterpret_cast(offset2), &v2); - return (v1 == v2); - } - case OptionType::kString: - 
return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kDouble: - return AreEqualDoubles(*reinterpret_cast(offset1), - *reinterpret_cast(offset2)); - case OptionType::kCompactionStyle: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kCompactionPri: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kCompressionType: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kVectorCompressionType: { - const auto* vec1 = - reinterpret_cast*>(offset1); - const auto* vec2 = - reinterpret_cast*>(offset2); - return (*vec1 == *vec2); - } - case OptionType::kChecksumType: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kBlockBasedTableIndexType: - return ( - *reinterpret_cast( - offset1) == - *reinterpret_cast(offset2)); - case OptionType::kBlockBasedTableDataBlockIndexType: - return ( - *reinterpret_cast( - offset1) == - *reinterpret_cast( - offset2)); - case OptionType::kBlockBasedTableIndexShorteningMode: - return ( - *reinterpret_cast( - offset1) == - *reinterpret_cast( - offset2)); - case OptionType::kWALRecoveryMode: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kAccessHint: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kInfoLogLevel: - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); - case OptionType::kCompactionOptionsFIFO: { - CompactionOptionsFIFO lhs = - *reinterpret_cast(offset1); - CompactionOptionsFIFO rhs = - *reinterpret_cast(offset2); - if (lhs.max_table_files_size == rhs.max_table_files_size && - lhs.allow_compaction == rhs.allow_compaction) { - return true; - } - return false; - } - case OptionType::kCompactionOptionsUniversal: { - CompactionOptionsUniversal lhs = - *reinterpret_cast(offset1); - CompactionOptionsUniversal rhs = - *reinterpret_cast(offset2); - if 
(lhs.size_ratio == rhs.size_ratio && - lhs.min_merge_width == rhs.min_merge_width && - lhs.max_merge_width == rhs.max_merge_width && - lhs.max_size_amplification_percent == - rhs.max_size_amplification_percent && - lhs.compression_size_percent == rhs.compression_size_percent && - lhs.stop_style == rhs.stop_style && - lhs.allow_trivial_move == rhs.allow_trivial_move) { - return true; - } - return false; - } - default: - if (type_info.verification == OptionVerificationType::kByName || - type_info.verification == - OptionVerificationType::kByNameAllowFromNull || - type_info.verification == OptionVerificationType::kByNameAllowNull) { - std::string value1; - bool result = - SerializeSingleOptionHelper(offset1, type_info.type, &value1); - if (result == false) { - return false; - } - if (opt_map == nullptr) { - return true; - } - auto iter = opt_map->find(opt_name); - if (iter == opt_map->end()) { - return true; - } else { - if (type_info.verification == - OptionVerificationType::kByNameAllowNull) { - if (iter->second == kNullptrString || value1 == kNullptrString) { - return true; - } - } else if (type_info.verification == - OptionVerificationType::kByNameAllowFromNull) { - if (iter->second == kNullptrString) { - return true; - } - } - return (value1 == iter->second); - } - } - return false; - } -} - Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - const DBOptions& db_opt, const std::vector& cf_names, + const ConfigOptions& config_options_in, const DBOptions& db_opt, + const std::vector& cf_names, const std::vector& cf_opts, - const std::string& file_name, FileSystem* fs, - OptionsSanityCheckLevel sanity_check_level, bool ignore_unknown_options) { - // We infer option file readhead size from log readahead size. - // If it is not given, use 512KB. 
- size_t file_readahead_size = db_opt.log_readahead_size; - if (file_readahead_size == 0) { - const size_t kDefaultOptionFileReadAheadSize = 512 * 1024; - file_readahead_size = kDefaultOptionFileReadAheadSize; - } - + const std::string& file_name, FileSystem* fs) { RocksDBOptionsParser parser; - Status s = - parser.Parse(file_name, fs, ignore_unknown_options, file_readahead_size); + ConfigOptions config_options = config_options_in; + config_options.invoke_prepare_options = + false; // No need to do a prepare for verify + if (config_options.sanity_level < ConfigOptions::kSanityLevelExactMatch) { + // If we are not doing an exact comparison, we should ignore + // unsupported options, as they may cause the Parse to fail + // (if the ObjectRegistry is not initialized) + config_options.ignore_unsupported_options = true; + } + Status s = parser.Parse(config_options, file_name, fs); if (!s.ok()) { return s; } // Verify DBOptions - s = VerifyDBOptions(db_opt, *parser.db_opt(), parser.db_opt_map(), - sanity_check_level); + s = VerifyDBOptions(config_options, db_opt, *parser.db_opt(), + parser.db_opt_map()); if (!s.ok()) { return s; } // Verify ColumnFamily Name if (cf_names.size() != parser.cf_names()->size()) { - if (sanity_check_level >= kSanityLevelLooselyCompatible) { + if (config_options.sanity_level >= + ConfigOptions::kSanityLevelLooselyCompatible) { return Status::InvalidArgument( "[RocksDBOptionParser Error] The persisted options does not have " "the same number of column family names as the db instance."); @@ -706,7 +596,8 @@ // Verify Column Family Options if (cf_opts.size() != parser.cf_opts()->size()) { - if (sanity_check_level >= kSanityLevelLooselyCompatible) { + if (config_options.sanity_level >= + ConfigOptions::kSanityLevelLooselyCompatible) { return Status::InvalidArgument( "[RocksDBOptionsParser Error]", "The persisted options does not have the same number of " @@ -719,14 +610,13 @@ } } for (size_t i = 0; i < cf_opts.size(); ++i) { - s = 
VerifyCFOptions(cf_opts[i], parser.cf_opts()->at(i), - &(parser.cf_opt_maps()->at(i)), sanity_check_level); + s = VerifyCFOptions(config_options, cf_opts[i], parser.cf_opts()->at(i), + &(parser.cf_opt_maps()->at(i))); if (!s.ok()) { return s; } - s = VerifyTableFactory(cf_opts[i].table_factory.get(), - parser.cf_opts()->at(i).table_factory.get(), - sanity_check_level); + s = VerifyTableFactory(config_options, cf_opts[i].table_factory.get(), + parser.cf_opts()->at(i).table_factory.get()); if (!s.ok()) { return s; } @@ -736,99 +626,96 @@ } Status RocksDBOptionsParser::VerifyDBOptions( - const DBOptions& base_opt, const DBOptions& persisted_opt, - const std::unordered_map* /*opt_map*/, - OptionsSanityCheckLevel sanity_check_level) { - for (auto pair : db_options_type_info) { - if (pair.second.verification == OptionVerificationType::kDeprecated) { - // We skip checking deprecated variables as they might - // contain random values since they might not be initialized - continue; - } - if (DBOptionSanityCheckLevel(pair.first) <= sanity_check_level) { - if (!AreEqualOptions(reinterpret_cast(&base_opt), - reinterpret_cast(&persisted_opt), - pair.second, pair.first, nullptr)) { - constexpr size_t kBufferSize = 2048; - char buffer[kBufferSize]; - std::string base_value; - std::string persisted_value; - SerializeSingleOptionHelper( - reinterpret_cast(&base_opt) + pair.second.offset, - pair.second.type, &base_value); - SerializeSingleOptionHelper( - reinterpret_cast(&persisted_opt) + pair.second.offset, - pair.second.type, &persisted_value); - snprintf(buffer, sizeof(buffer), - "[RocksDBOptionsParser]: " - "failed the verification on DBOptions::%s --- " - "The specified one is %s while the persisted one is %s.\n", - pair.first.c_str(), base_value.c_str(), - persisted_value.c_str()); - return Status::InvalidArgument(Slice(buffer, strlen(buffer))); - } + const ConfigOptions& config_options, const DBOptions& base_opt, + const DBOptions& file_opt, + const std::unordered_map* 
opt_map) { + auto base_config = DBOptionsAsConfigurable(base_opt, opt_map); + auto file_config = DBOptionsAsConfigurable(file_opt, opt_map); + std::string mismatch; + if (!base_config->AreEquivalent(config_options, file_config.get(), + &mismatch)) { + const size_t kBufferSize = 2048; + char buffer[kBufferSize]; + std::string base_value; + std::string file_value; + int offset = snprintf(buffer, sizeof(buffer), + "[RocksDBOptionsParser]: " + "failed the verification on DBOptions::%s -- ", + mismatch.c_str()); + Status s = base_config->GetOption(config_options, mismatch, &base_value); + if (s.ok()) { + s = file_config->GetOption(config_options, mismatch, &file_value); + } + assert(offset >= 0); + assert(static_cast(offset) < sizeof(buffer)); + if (s.ok()) { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "-- The specified one is %s while the persisted one is %s.\n", + base_value.c_str(), file_value.c_str()); + } else { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "-- Unable to re-serialize an option: %s.\n", + s.ToString().c_str()); } + return Status::InvalidArgument(Slice(buffer, strlen(buffer))); } return Status::OK(); } Status RocksDBOptionsParser::VerifyCFOptions( - const ColumnFamilyOptions& base_opt, - const ColumnFamilyOptions& persisted_opt, - const std::unordered_map* persisted_opt_map, - OptionsSanityCheckLevel sanity_check_level) { - for (auto& pair : cf_options_type_info) { - if (pair.second.verification == OptionVerificationType::kDeprecated) { - // We skip checking deprecated variables as they might - // contain random values since they might not be initialized - continue; - } - if (CFOptionSanityCheckLevel(pair.first) <= sanity_check_level) { - if (!AreEqualOptions(reinterpret_cast(&base_opt), - reinterpret_cast(&persisted_opt), - pair.second, pair.first, persisted_opt_map)) { - constexpr size_t kBufferSize = 2048; - char buffer[kBufferSize]; - std::string base_value; - std::string persisted_value; - 
SerializeSingleOptionHelper( - reinterpret_cast(&base_opt) + pair.second.offset, - pair.second.type, &base_value); - SerializeSingleOptionHelper( - reinterpret_cast(&persisted_opt) + pair.second.offset, - pair.second.type, &persisted_value); - snprintf(buffer, sizeof(buffer), - "[RocksDBOptionsParser]: " - "failed the verification on ColumnFamilyOptions::%s --- " - "The specified one is %s while the persisted one is %s.\n", - pair.first.c_str(), base_value.c_str(), - persisted_value.c_str()); - return Status::InvalidArgument(Slice(buffer, sizeof(buffer))); - } + const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt, + const ColumnFamilyOptions& file_opt, + const std::unordered_map* opt_map) { + auto base_config = CFOptionsAsConfigurable(base_opt, opt_map); + auto file_config = CFOptionsAsConfigurable(file_opt, opt_map); + std::string mismatch; + if (!base_config->AreEquivalent(config_options, file_config.get(), + &mismatch)) { + std::string base_value; + std::string file_value; + // The options do not match + const size_t kBufferSize = 2048; + char buffer[kBufferSize]; + Status s = base_config->GetOption(config_options, mismatch, &base_value); + if (s.ok()) { + s = file_config->GetOption(config_options, mismatch, &file_value); + } + int offset = snprintf(buffer, sizeof(buffer), + "[RocksDBOptionsParser]: " + "failed the verification on ColumnFamilyOptions::%s", + mismatch.c_str()); + assert(offset >= 0); + assert(static_cast(offset) < sizeof(buffer)); + if (s.ok()) { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "--- The specified one is %s while the persisted one is %s.\n", + base_value.c_str(), file_value.c_str()); + } else { + snprintf(buffer + offset, sizeof(buffer) - static_cast(offset), + "--- Unable to re-serialize an option: %s.\n", + s.ToString().c_str()); } - } + return Status::InvalidArgument(Slice(buffer, sizeof(buffer))); + } // For each option return Status::OK(); } Status 
RocksDBOptionsParser::VerifyTableFactory( - const TableFactory* base_tf, const TableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level) { + const ConfigOptions& config_options, const TableFactory* base_tf, + const TableFactory* file_tf) { + std::string mismatch; if (base_tf && file_tf) { - if (sanity_check_level > kSanityLevelNone && + if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && std::string(base_tf->Name()) != std::string(file_tf->Name())) { return Status::Corruption( "[RocksDBOptionsParser]: " "failed the verification on TableFactory->Name()"); + } else if (!base_tf->AreEquivalent(config_options, file_tf, &mismatch)) { + return Status::Corruption(std::string("[RocksDBOptionsParser]:" + "failed the verification on ") + + base_tf->Name() + "::", + mismatch); } - if (base_tf->Name() == BlockBasedTableFactory::kName) { - return VerifyBlockBasedTableFactory( - static_cast_with_check(base_tf), - static_cast_with_check(file_tf), - sanity_check_level); - } - // TODO(yhchiang): add checks for other table factory types } else { // TODO(yhchiang): further support sanity check here } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_parser.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_parser.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,14 +9,15 @@ #include #include -#include "options/options_sanity_check.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "table/block_based/block_based_table_factory.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE +struct ConfigOptions; +class OptionTypeInfo; +class TableFactory; #define ROCKSDB_OPTION_FILE_MAJOR 1 #define ROCKSDB_OPTION_FILE_MINOR 1 @@ -36,11 +37,11 @@ const std::vector& cf_names, const std::vector& cf_opts, const std::string& file_name, FileSystem* fs); - -extern 
bool AreEqualOptions( - const char* opt1, const char* opt2, const OptionTypeInfo& type_info, - const std::string& opt_name, - const std::unordered_map* opt_map); +Status PersistRocksDBOptions(const ConfigOptions& config_options, + const DBOptions& db_opt, + const std::vector& cf_names, + const std::vector& cf_opts, + const std::string& file_name, FileSystem* fs); class RocksDBOptionsParser { public: @@ -52,6 +53,10 @@ // If 0 is given, a default value will be used. Status Parse(const std::string& file_name, FileSystem* fs, bool ignore_unknown_options, size_t file_readahead_size); + + Status Parse(const ConfigOptions& config_options, + const std::string& file_name, FileSystem* fs); + static std::string TrimAndRemoveComment(const std::string& line, const bool trim_only = false); @@ -70,30 +75,32 @@ return GetCFOptionsImpl(name); } size_t NumColumnFamilies() { return cf_opts_.size(); } - static Status VerifyRocksDBOptionsFromFile( - const DBOptions& db_opt, const std::vector& cf_names, + const ConfigOptions& config_options, const DBOptions& db_opt, + const std::vector& cf_names, const std::vector& cf_opts, - const std::string& file_name, FileSystem* fs, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch, - bool ignore_unknown_options = false); - + const std::string& file_name, FileSystem* fs); static Status VerifyDBOptions( - const DBOptions& base_opt, const DBOptions& new_opt, - const std::unordered_map* new_opt_map = nullptr, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch); + const ConfigOptions& config_options, const DBOptions& base_opt, + const DBOptions& new_opt, + const std::unordered_map* new_opt_map = + nullptr); static Status VerifyCFOptions( - const ColumnFamilyOptions& base_opt, const ColumnFamilyOptions& new_opt, - const std::unordered_map* new_opt_map = nullptr, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch); - - static Status VerifyTableFactory( - const TableFactory* base_tf, const 
TableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch); + const ConfigOptions& config_options, const ColumnFamilyOptions& base_opt, + const ColumnFamilyOptions& new_opt, + const std::unordered_map* new_opt_map = + nullptr); + + static Status VerifyTableFactory(const ConfigOptions& config_options, + const TableFactory* base_tf, + const TableFactory* file_tf); static Status ExtraParserCheck(const RocksDBOptionsParser& input_parser); + static Status ParseStatement(std::string* name, std::string* value, + const std::string& line, const int line_num); + protected: bool IsSection(const std::string& line); Status ParseSection(OptionSection* section, std::string* title, @@ -103,17 +110,14 @@ Status CheckSection(const OptionSection section, const std::string& section_arg, const int line_num); - Status ParseStatement(std::string* name, std::string* value, - const std::string& line, const int line_num); - - Status EndSection(const OptionSection section, const std::string& title, - const std::string& section_arg, - const std::unordered_map& opt_map, - bool ignore_unknown_options); + Status EndSection( + const ConfigOptions& config_options, const OptionSection section, + const std::string& title, const std::string& section_arg, + const std::unordered_map& opt_map); Status ValidityCheck(); - Status InvalidArgument(const int line_num, const std::string& message); + static Status InvalidArgument(const int line_num, const std::string& message); Status ParseVersionNumber(const std::string& ver_name, const std::string& ver_string, const int max_count, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ 
-// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#ifndef ROCKSDB_LITE - -#include "options/options_sanity_check.h" - -namespace ROCKSDB_NAMESPACE { - -namespace { -OptionsSanityCheckLevel SanityCheckLevelHelper( - const std::unordered_map& smap, - const std::string& name) { - auto iter = smap.find(name); - return iter != smap.end() ? iter->second : kSanityLevelExactMatch; -} -} - -OptionsSanityCheckLevel DBOptionSanityCheckLevel( - const std::string& option_name) { - return SanityCheckLevelHelper(sanity_level_db_options, option_name); -} - -OptionsSanityCheckLevel CFOptionSanityCheckLevel( - const std::string& option_name) { - return SanityCheckLevelHelper(sanity_level_cf_options, option_name); -} - -OptionsSanityCheckLevel BBTOptionSanityCheckLevel( - const std::string& option_name) { - return SanityCheckLevelHelper(sanity_level_bbt_options, option_name); -} - -} // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.h mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_sanity_check.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_sanity_check.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,50 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include -#include - -#include "rocksdb/rocksdb_namespace.h" - -#ifndef ROCKSDB_LITE -namespace ROCKSDB_NAMESPACE { -// This enum defines the RocksDB options sanity level. 
-enum OptionsSanityCheckLevel : unsigned char { - // Performs no sanity check at all. - kSanityLevelNone = 0x00, - // Performs minimum check to ensure the RocksDB instance can be - // opened without corrupting / mis-interpreting the data. - kSanityLevelLooselyCompatible = 0x01, - // Perform exact match sanity check. - kSanityLevelExactMatch = 0xFF, -}; - -// The sanity check level for DB options -static const std::unordered_map - sanity_level_db_options {}; - -// The sanity check level for column-family options -static const std::unordered_map - sanity_level_cf_options = { - {"comparator", kSanityLevelLooselyCompatible}, - {"table_factory", kSanityLevelLooselyCompatible}, - {"merge_operator", kSanityLevelLooselyCompatible}}; - -// The sanity check level for block-based table options -static const std::unordered_map - sanity_level_bbt_options {}; - -OptionsSanityCheckLevel DBOptionSanityCheckLevel( - const std::string& options_name); -OptionsSanityCheckLevel CFOptionSanityCheckLevel( - const std::string& options_name); -OptionsSanityCheckLevel BBTOptionSanityCheckLevel( - const std::string& options_name); - -} // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_settable_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_settable_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_settable_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_settable_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,8 @@ #include +#include "options/cf_options.h" +#include "options/db_options.h" #include "options/options_helper.h" #include "rocksdb/convenience.h" #include "test_util/testharness.h" @@ -39,23 +41,24 @@ }; const char kSpecialChar = 'z'; -typedef std::vector> OffsetGap; +using OffsetGap = std::vector>; void FillWithSpecialChar(char* start_ptr, size_t total_size, - const OffsetGap& blacklist) { + const OffsetGap& 
excluded, + char special_char = kSpecialChar) { size_t offset = 0; - for (auto& pair : blacklist) { - std::memset(start_ptr + offset, kSpecialChar, pair.first - offset); + for (auto& pair : excluded) { + std::memset(start_ptr + offset, special_char, pair.first - offset); offset = pair.first + pair.second; } - std::memset(start_ptr + offset, kSpecialChar, total_size - offset); + std::memset(start_ptr + offset, special_char, total_size - offset); } int NumUnsetBytes(char* start_ptr, size_t total_size, - const OffsetGap& blacklist) { + const OffsetGap& excluded) { int total_unset_bytes_base = 0; size_t offset = 0; - for (auto& pair : blacklist) { + for (auto& pair : excluded) { for (char* ptr = start_ptr + offset; ptr < start_ptr + pair.first; ptr++) { if (*ptr == kSpecialChar) { total_unset_bytes_base++; @@ -71,6 +74,26 @@ return total_unset_bytes_base; } +// Return true iff two structs are the same except excluded fields. +bool CompareBytes(char* start_ptr1, char* start_ptr2, size_t total_size, + const OffsetGap& excluded) { + size_t offset = 0; + for (auto& pair : excluded) { + for (; offset < pair.first; offset++) { + if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) { + return false; + } + } + offset = pair.first + pair.second; + } + for (; offset < total_size; offset++) { + if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) { + return false; + } + } + return true; +} + // If the test fails, likely a new option is added to BlockBasedTableOptions // but it cannot be set through GetBlockBasedTableOptionsFromString(), or the // test is not updated accordingly. @@ -78,11 +101,11 @@ // GetBlockBasedTableOptionsFromString() and add the option to the input string // passed to the GetBlockBasedTableOptionsFromString() in this test. // If it is a complicated type, you also need to add the field to -// kBbtoBlacklist, and maybe add customized verification for it. +// kBbtoExcluded, and maybe add customized verification for it. 
TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { // Items in the form of . Need to be in ascending order // and not overlapping. Need to updated if new pointer-option is added. - const OffsetGap kBbtoBlacklist = { + const OffsetGap kBbtoExcluded = { {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), sizeof(std::shared_ptr)}, {offsetof(struct BlockBasedTableOptions, block_cache), @@ -107,20 +130,20 @@ // copy a well constructed struct to this memory and see how many special // bytes left. BlockBasedTableOptions* bbto = new (bbto_ptr) BlockBasedTableOptions(); - FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded); // It based on the behavior of compiler that padding bytes are not changed // when copying the struct. It's prone to failure when compiler behavior // changes. We verify there is unset bytes to detect the case. *bbto = BlockBasedTableOptions(); int unset_bytes_base = - NumUnsetBytes(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + NumUnsetBytes(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded); ASSERT_GT(unset_bytes_base, 0); bbto->~BlockBasedTableOptions(); // Construct the base option passed into // GetBlockBasedTableOptionsFromString(). bbto = new (bbto_ptr) BlockBasedTableOptions(); - FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded); // This option is not setable: bbto->use_delta_encoding = true; @@ -128,13 +151,16 @@ BlockBasedTableOptions* new_bbto = new (new_bbto_ptr) BlockBasedTableOptions(); FillWithSpecialChar(new_bbto_ptr, sizeof(BlockBasedTableOptions), - kBbtoBlacklist); + kBbtoExcluded); // Need to update the option string if a new option is added. 
ASSERT_OK(GetBlockBasedTableOptionsFromString( *bbto, "cache_index_and_filter_blocks=1;" "cache_index_and_filter_blocks_with_high_priority=true;" + "metadata_cache_options={top_level_index_pinning=kFallback;" + "partition_pinning=kAll;" + "unpartitioned_pinning=kFlushedAndSimilar;};" "pin_l0_filter_and_index_blocks_in_cache=1;" "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" @@ -146,18 +172,22 @@ "block_size_deviation=8;block_restart_interval=4; " "metadata_block_size=1024;" "partition_filters=false;" + "optimize_filters_for_memory=true;" "index_block_restart_interval=4;" "filter_policy=bloomfilter:4:true;whole_key_filtering=1;" + "reserve_table_builder_memory=false;" "format_version=1;" "hash_index_allow_collision=false;" "verify_compression=true;read_amp_bytes_per_bit=0;" "enable_index_compression=false;" - "block_align=true", + "block_align=true;" + "max_auto_readahead_size=0;" + "prepopulate_block_cache=kDisable", new_bbto)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_bbto_ptr, sizeof(BlockBasedTableOptions), - kBbtoBlacklist)); + kBbtoExcluded)); ASSERT_TRUE(new_bbto->block_cache.get() != nullptr); ASSERT_TRUE(new_bbto->block_cache_compressed.get() != nullptr); @@ -177,12 +207,10 @@ // GetDBOptionsFromString() and add the option to the input string passed to // DBOptionsFromString()in this test. // If it is a complicated type, you also need to add the field to -// kDBOptionsBlacklist, and maybe add customized verification for it. +// kDBOptionsExcluded, and maybe add customized verification for it. 
TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { - const OffsetGap kDBOptionsBlacklist = { + const OffsetGap kDBOptionsExcluded = { {offsetof(struct DBOptions, env), sizeof(Env*)}, - {offsetof(struct DBOptions, file_system), - sizeof(std::shared_ptr)}, {offsetof(struct DBOptions, rate_limiter), sizeof(std::shared_ptr)}, {offsetof(struct DBOptions, sst_file_manager), @@ -199,8 +227,13 @@ sizeof(std::vector>)}, {offsetof(struct DBOptions, row_cache), sizeof(std::shared_ptr)}, {offsetof(struct DBOptions, wal_filter), sizeof(const WalFilter*)}, - {offsetof(struct DBOptions, sst_file_checksum_func), - sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, file_checksum_gen_factory), + sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, db_host_id), sizeof(std::string)}, + {offsetof(struct DBOptions, checksum_handoff_file_types), + sizeof(FileTypeSet)}, + {offsetof(struct DBOptions, compaction_service), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(DBOptions)]; @@ -209,22 +242,22 @@ // copy a well constructed struct to this memory and see how many special // bytes left. DBOptions* options = new (options_ptr) DBOptions(); - FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); // It based on the behavior of compiler that padding bytes are not changed // when copying the struct. It's prone to failure when compiler behavior // changes. We verify there is unset bytes to detect the case. 
*options = DBOptions(); int unset_bytes_base = - NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); ASSERT_GT(unset_bytes_base, 0); options->~DBOptions(); options = new (options_ptr) DBOptions(); - FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); char* new_options_ptr = new char[sizeof(DBOptions)]; DBOptions* new_options = new (new_options_ptr) DBOptions(); - FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsExcluded); // Need to update the option string if a new option is added. ASSERT_OK( @@ -256,6 +289,8 @@ "skip_log_error_on_recovery=true;" "writable_file_max_buffer_size=1048576;" "paranoid_checks=true;" + "flush_verify_memtable_count=true;" + "track_and_verify_wals_in_manifest=true;" "is_fd_close_on_exec=false;" "bytes_per_sync=4295013613;" "strict_bytes_per_sync=true;" @@ -303,11 +338,17 @@ "atomic_flush=false;" "avoid_unnecessary_blocking_io=false;" "log_readahead_size=0;" - "write_dbid_to_manifest=false", + "write_dbid_to_manifest=false;" + "best_efforts_recovery=false;" + "max_bgerror_resume_count=2;" + "bgerror_resume_retry_interval=1000000" + "db_host_id=hostname;" + "lowest_used_cache_tier=kNonVolatileBlockTier;" + "allow_data_in_errors=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), - kDBOptionsBlacklist)); + kDBOptionsExcluded)); options->~DBOptions(); new_options->~DBOptions(); @@ -329,12 +370,12 @@ // GetColumnFamilyOptionsFromString() and add the option to the input // string passed to GetColumnFamilyOptionsFromString()in this test. 
// If it is a complicated type, you also need to add the field to -// kColumnFamilyOptionsBlacklist, and maybe add customized verification +// kColumnFamilyOptionsExcluded, and maybe add customized verification // for it. TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { - // options in the blacklist need to appear in the same order as in + // options in the excluded set need to appear in the same order as in // ColumnFamilyOptions. - const OffsetGap kColumnFamilyOptionsBlacklist = { + const OffsetGap kColumnFamilyOptionsExcluded = { {offset_of(&ColumnFamilyOptions::inplace_callback), sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))}, {offset_of( @@ -364,6 +405,8 @@ {offset_of(&ColumnFamilyOptions::cf_paths), sizeof(std::vector)}, {offset_of(&ColumnFamilyOptions::compaction_thread_limiter), sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::sst_partitioner_factory), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(ColumnFamilyOptions)]; @@ -371,44 +414,46 @@ // Count padding bytes by setting all bytes in the memory to a special char, // copy a well constructed struct to this memory and see how many special // bytes left. - ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); - // It based on the behavior of compiler that padding bytes are not changed - // when copying the struct. It's prone to failure when compiler behavior - // changes. We verify there is unset bytes to detect the case. - *options = ColumnFamilyOptions(); + kColumnFamilyOptionsExcluded); + + // Invoke a user-defined constructor in the hope that it does not overwrite + // padding bytes. Note that previously we relied on the implicitly-defined + // copy-assignment operator (i.e., `*options = ColumnFamilyOptions();`) here, + // which did in fact modify padding bytes. 
+ ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); // Deprecatd option which is not initialized. Need to set it to avoid // Valgrind error options->max_mem_compaction_level = 0; int unset_bytes_base = NumUnsetBytes(options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); + kColumnFamilyOptionsExcluded); ASSERT_GT(unset_bytes_base, 0); options->~ColumnFamilyOptions(); options = new (options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); + kColumnFamilyOptionsExcluded); // Following options are not settable through // GetColumnFamilyOptionsFromString(): options->rate_limit_delay_max_milliseconds = 33; options->compaction_options_universal = CompactionOptionsUniversal(); - options->compression_opts = CompressionOptions(); - options->bottommost_compression_opts = CompressionOptions(); options->hard_rate_limit = 0; options->soft_rate_limit = 0; + options->num_levels = 42; // Initialize options for MutableCF options->purge_redundant_kvs_while_flush = false; options->max_mem_compaction_level = 0; options->compaction_filter = nullptr; + options->sst_partitioner_factory = nullptr; + options->bottommost_temperature = Temperature::kUnknown; char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)]; ColumnFamilyOptions* new_options = new (new_options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(new_options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist); + kColumnFamilyOptionsExcluded); // Need to update the option string if a new option is added. 
ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -435,6 +480,8 @@ "max_bytes_for_level_multiplier=60;" "memtable_factory=SkipListFactory;" "compression=kNoCompression;" + "compression_opts=5:6:7:8:9:10:true:11;" + "bottommost_compression_opts=4:5:6:7:8:9:true:10;" "bottommost_compression=kDisableCompressionOption;" "level0_stop_writes_trigger=33;" "num_levels=99;" @@ -449,6 +496,7 @@ "memtable_prefix_bloom_size_ratio=0.4642;" "memtable_whole_key_filtering=true;" "memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;" + "check_flush_compaction_key_order=false;" "paranoid_file_checks=true;" "force_consistency_checks=true;" "inplace_update_num_locks=7429;" @@ -463,19 +511,74 @@ "ttl=60;" "periodic_compaction_seconds=3600;" "sample_for_compression=0;" + "enable_blob_files=true;" + "min_blob_size=256;" + "blob_file_size=1000000;" + "blob_compression_type=kBZip2Compression;" + "enable_blob_garbage_collection=true;" + "blob_garbage_collection_age_cutoff=0.5;" + "blob_garbage_collection_force_threshold=0.75;" + "blob_compaction_readahead_size=262144;" "compaction_options_fifo={max_table_files_size=3;allow_" - "compaction=false;};", + "compaction=false;age_for_warm=1;};", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(ColumnFamilyOptions), - kColumnFamilyOptionsBlacklist)); + kColumnFamilyOptionsExcluded)); + + ColumnFamilyOptions rnd_filled_options = *new_options; options->~ColumnFamilyOptions(); new_options->~ColumnFamilyOptions(); delete[] options_ptr; delete[] new_options_ptr; + + // Test copying to mutabable and immutable options and copy back the mutable + // part. + const OffsetGap kMutableCFOptionsExcluded = { + {offset_of(&MutableCFOptions::prefix_extractor), + sizeof(std::shared_ptr)}, + {offset_of(&MutableCFOptions::max_bytes_for_level_multiplier_additional), + sizeof(std::vector)}, + {offset_of(&MutableCFOptions::max_file_size), + sizeof(std::vector)}, + }; + + // For all memory used for options, pre-fill every char. 
Otherwise, the + // padding bytes might be different so that byte-wise comparison doesn't + // general equal results even if objects are equal. + const char kMySpecialChar = 'x'; + char* mcfo1_ptr = new char[sizeof(MutableCFOptions)]; + FillWithSpecialChar(mcfo1_ptr, sizeof(MutableCFOptions), + kMutableCFOptionsExcluded, kMySpecialChar); + char* mcfo2_ptr = new char[sizeof(MutableCFOptions)]; + FillWithSpecialChar(mcfo2_ptr, sizeof(MutableCFOptions), + kMutableCFOptionsExcluded, kMySpecialChar); + + // A clean column family options is constructed after filling the same special + // char as the initial one. So that the padding bytes are the same. + char* cfo_clean_ptr = new char[sizeof(ColumnFamilyOptions)]; + FillWithSpecialChar(cfo_clean_ptr, sizeof(ColumnFamilyOptions), + kColumnFamilyOptionsExcluded); + rnd_filled_options.num_levels = 66; + ColumnFamilyOptions* cfo_clean = new (cfo_clean_ptr) ColumnFamilyOptions(); + + MutableCFOptions* mcfo1 = + new (mcfo1_ptr) MutableCFOptions(rnd_filled_options); + ColumnFamilyOptions cfo_back = BuildColumnFamilyOptions(*cfo_clean, *mcfo1); + MutableCFOptions* mcfo2 = new (mcfo2_ptr) MutableCFOptions(cfo_back); + + ASSERT_TRUE(CompareBytes(mcfo1_ptr, mcfo2_ptr, sizeof(MutableCFOptions), + kMutableCFOptionsExcluded)); + + cfo_clean->~ColumnFamilyOptions(); + mcfo1->~MutableCFOptions(); + mcfo2->~MutableCFOptions(); + delete[] mcfo1_ptr; + delete[] mcfo2_ptr; + delete[] cfo_clean_ptr; } #endif // !__clang__ #endif // OS_LINUX || OS_WIN diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/options/options_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/options/options_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,13 +16,14 @@ #include "cache/sharded_cache.h" #include "options/options_helper.h" #include "options/options_parser.h" -#include 
"options/options_sanity_check.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" +#include "rocksdb/file_checksum.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/leveldb_options.h" #include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/filter_policy_internal.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -30,6 +31,9 @@ #include "util/stderr_logger.h" #include "util/string_util.h" #include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/merge_operators/string_append/stringappend2.h" #ifndef GFLAGS bool FLAGS_enable_print = false; @@ -63,7 +67,7 @@ "kZSTD:" "kZSTDNotFinalCompression"}, {"bottommost_compression", "kLZ4Compression"}, - {"bottommost_compression_opts", "5:6:7:8:9:true"}, + {"bottommost_compression_opts", "5:6:7:8:10:true"}, {"compression_opts", "4:5:6:7:8:true"}, {"num_levels", "8"}, {"level0_file_num_compaction_trigger", "8"}, @@ -98,6 +102,14 @@ {"min_partial_merge_operands", "31"}, {"prefix_extractor", "fixed:31"}, {"optimize_filters_for_hits", "true"}, + {"enable_blob_files", "true"}, + {"min_blob_size", "1K"}, + {"blob_file_size", "1G"}, + {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, + {"blob_compaction_readahead_size", "256K"}, }; std::unordered_map db_options_map = { @@ -105,6 +117,7 @@ {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, {"max_total_wal_size", "33"}, {"use_fsync", "true"}, @@ -133,6 +146,7 @@ {"persist_stats_to_disk", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, + 
{"experimental_mempurge_threshold", "0.0"}, {"use_adaptive_mutex", "false"}, {"new_table_reader_for_compaction_inputs", "true"}, {"compaction_readahead_size", "100"}, @@ -145,8 +159,16 @@ ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; - ASSERT_OK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ConfigOptions exact, loose; + exact.input_strings_escaped = false; + exact.ignore_unknown_options = false; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + loose.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + + loose.input_strings_escaped = false; + loose.ignore_unknown_options = true; + ASSERT_OK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); @@ -168,13 +190,17 @@ ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, + CompressionOptions().parallel_threads); ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); - ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 10u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + CompressionOptions().parallel_threads); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.num_levels, 8); 
ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); @@ -208,41 +234,49 @@ ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); - ASSERT_EQ(std::string(new_cf_opt.prefix_extractor->Name()), - "rocksdb.FixedPrefix.31"); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); + ASSERT_EQ(new_cf_opt.enable_blob_files, true); + ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); + ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); + ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); + ASSERT_EQ(new_cf_opt.blob_compaction_readahead_size, 262144); cf_options_map["write_buffer_size"] = "hello"; - ASSERT_NOK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); cf_options_map["write_buffer_size"] = "1"; - ASSERT_OK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); cf_options_map["unknown_option"] = "1"; - ASSERT_NOK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, - &new_cf_opt, - false, /* 
input_strings_escaped */ - true /* ignore_unknown_options */)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( - base_cf_opt, new_cf_opt, nullptr, /* new_opt_map */ - kSanityLevelLooselyCompatible /* from CheckOptionsCompatibility*/)); - ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - base_cf_opt, new_cf_opt, nullptr, /* new_opt_map */ - kSanityLevelExactMatch /* default for VerifyCFOptions */)); + // ignore_unknown_options=true;input_strings_escaped=false + ASSERT_OK(GetColumnFamilyOptionsFromMap(loose, base_cf_opt, cf_options_map, + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(loose, base_cf_opt, new_cf_opt)); + ASSERT_NOK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); DBOptions base_db_opt; DBOptions new_db_opt; - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK( + GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt)); ASSERT_EQ(new_db_opt.create_if_missing, false); ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); ASSERT_EQ(new_db_opt.use_fsync, true); @@ -272,6 +306,7 @@ ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); + ASSERT_EQ(new_db_opt.experimental_mempurge_threshold, 0.0); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true); ASSERT_EQ(new_db_opt.compaction_readahead_size, 100); @@ -282,26 +317,30 @@ ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true); db_options_map["max_open_files"] = "hello"; - ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); - 
ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_db_opt, new_db_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( - base_db_opt, new_db_opt, nullptr, /* new_opt_map */ - kSanityLevelLooselyCompatible /* from CheckOptionsCompatibility */)); + Status s = + GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); // unknow options should fail parsing without ignore_unknown_options = true db_options_map["unknown_db_option"] = "1"; - ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_db_opt, new_db_opt)); + s = GetDBOptionsFromMap(exact, base_db_opt, db_options_map, &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, - false, /* input_strings_escaped */ - true /* ignore_unknown_options */)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( - base_db_opt, new_db_opt, nullptr, /* new_opt_map */ - kSanityLevelLooselyCompatible /* from CheckOptionsCompatibility */)); - ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions( - base_db_opt, new_db_opt, nullptr, /* new_opt_mat */ - kSanityLevelExactMatch /* default for VerifyDBOptions */)); + ASSERT_OK( + GetDBOptionsFromMap(loose, base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); + ASSERT_NOK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); } #endif // !ROCKSDB_LITE @@ -310,77 +349,91 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; + 
ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + base_cf_opt.table_factory.reset(); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=5", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, "", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=5", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); ASSERT_TRUE(new_cf_opt.table_factory == nullptr); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=6;", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=6;", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, " write_buffer_size = 7 ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, " write_buffer_size = 8 ; ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=11; max_write_buffer_number = 12 ;", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + 
config_options, base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); // Wrong name "max_write_buffer_number_" - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number_=14;", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Comparator from object registry std::string kCompName = "reverse_comp"; - ObjectLibrary::Default()->Register( + ObjectLibrary::Default()->AddFactory( kCompName, [](const std::string& /*name*/, std::unique_ptr* /*guard*/, std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); - ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "comparator=" + kCompName + ";", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator()); // MergeOperator from object registry std::unique_ptr bxo(new BytesXOROperator()); std::string kMoName = bxo->Name(); - ObjectLibrary::Default()->Register( - kMoName, - [](const std::string& /*name*/, std::unique_ptr* guard, - std::string* /* errmsg */) { - guard->reset(new BytesXOROperator()); - return guard->get(); - }); - ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "merge_operator=" + kMoName + ";", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "merge_operator=" + kMoName + ";", + &new_cf_opt)); ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name())); // Wrong key/value pair - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - 
"write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); - - // Error Paring value - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + Status s = GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); + + // Error Parsing value + s = GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Missing option name - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13; =100;", &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + s = GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=13; =100;", &new_cf_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); const uint64_t kilo = 1024UL; const uint64_t mega = 1024 * kilo; @@ -389,17 +442,17 @@ // Units (k) ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); + config_options, base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo); // Units (m) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "max_write_buffer_number=16m;inplace_update_num_locks=17M", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + 
config_options, base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega); ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega); // Units (g) ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, + config_options, base_cf_opt, "write_buffer_size=18g;prefix_extractor=capped:8;" "arena_block_size=19G", &new_cf_opt)); @@ -407,129 +460,412 @@ ASSERT_EQ(new_cf_opt.write_buffer_size, 18 * giga); ASSERT_EQ(new_cf_opt.arena_block_size, 19 * giga); ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); - std::string prefix_name(new_cf_opt.prefix_extractor->Name()); - ASSERT_EQ(prefix_name, "rocksdb.CappedPrefix.8"); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.CappedPrefix.8"); // Units (t) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=20t;arena_block_size=21T", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera); ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera); // Nested block based table options // Empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={};arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={};arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Non-empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_cache=1M;block_size=4;};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + 
"write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Last one - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_cache=1M;block_size=4;}", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;}", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Mismatch curly braces - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={{{block_size=4;};" - "arena_block_size=1024", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={{{block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Unexpected chars after closing curly brace - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}};" - "arena_block_size=1024", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - 
"write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}xdfa;" - "arena_block_size=1024", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa;" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}xdfa", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Invalid block based table option - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={xx_block_size=4;}", - &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); - - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=true", - &new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=false", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=junk", 
- &new_cf_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=true", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=false", + &new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=junk", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opt, + new_cf_opt)); // Nested plain table options // Empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "plain_table_factory={};arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={};arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); // Non-empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); // memtable factory - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "memtable=skip_list:10;arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + 
"write_buffer_size=10;max_write_buffer_number=16;" + "memtable=skip_list:10;arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.memtable_factory->Name()), "SkipListFactory"); + ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory")); +} + +TEST_F(OptionsTest, CompressionOptionsFromString) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ConfigOptions config_options; + std::string opts_str; + config_options.ignore_unknown_options = false; + CompressionOptions dflt; + // Test with some optional values removed.... + ASSERT_OK( + GetColumnFamilyOptionsFromString(config_options, ColumnFamilyOptions(), + "compression_opts=3:4:5; " + "bottommost_compression_opts=4:5:6:7", + &base_cf_opt)); + ASSERT_EQ(base_cf_opt.compression_opts.window_bits, 3); + ASSERT_EQ(base_cf_opt.compression_opts.level, 4); + ASSERT_EQ(base_cf_opt.compression_opts.strategy, 5); + ASSERT_EQ(base_cf_opt.compression_opts.max_dict_bytes, dflt.max_dict_bytes); + ASSERT_EQ(base_cf_opt.compression_opts.zstd_max_train_bytes, + dflt.zstd_max_train_bytes); + ASSERT_EQ(base_cf_opt.compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(base_cf_opt.compression_opts.enabled, dflt.enabled); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.window_bits, 4); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.level, 5); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.strategy, 6); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, + dflt.zstd_max_train_bytes); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.enabled, dflt.enabled); + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=4:5:6:7:8:9:true; " + 
"bottommost_compression_opts=5:6:7:8:9:false", + &base_cf_opt)); + ASSERT_EQ(base_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(base_cf_opt.compression_opts.level, 5); + ASSERT_EQ(base_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(base_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(base_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(base_cf_opt.compression_opts.parallel_threads, 9u); + ASSERT_EQ(base_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(base_cf_opt.bottommost_compression_opts.enabled, false); + + ASSERT_OK( + GetStringFromColumnFamilyOptions(config_options, base_cf_opt, &opts_str)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), opts_str, &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 9u); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + 
ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + dflt.parallel_threads); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, false); + + // Test as struct values + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts={window_bits=5; level=6; strategy=7; max_dict_bytes=8;" + "zstd_max_train_bytes=9;parallel_threads=10;enabled=true}; " + "bottommost_compression_opts={window_bits=4; level=5; strategy=6;" + " max_dict_bytes=7;zstd_max_train_bytes=8;parallel_threads=9;" + "enabled=false}; ", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 5); + ASSERT_EQ(new_cf_opt.compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 7); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, 10u); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, 9u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, false); + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "compression_opts={window_bits=4; strategy=5;};" + "bottommost_compression_opts={level=6; strategy=7;}", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); + + ASSERT_EQ(new_cf_opt.compression_opts.level, + 
base_cf_opt.compression_opts.level); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, + base_cf_opt.compression_opts.max_dict_bytes); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, + base_cf_opt.compression_opts.zstd_max_train_bytes); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, + base_cf_opt.compression_opts.parallel_threads); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, + base_cf_opt.compression_opts.enabled); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, + base_cf_opt.bottommost_compression_opts.window_bits); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, + base_cf_opt.bottommost_compression_opts.max_dict_bytes); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, + base_cf_opt.bottommost_compression_opts.zstd_max_train_bytes); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + base_cf_opt.bottommost_compression_opts.parallel_threads); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, + base_cf_opt.bottommost_compression_opts.enabled); + + // Test a few individual struct values + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "compression_opts.enabled=false; " + "bottommost_compression_opts.enabled=true; ", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, false); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); + + // Now test some illegal values + ConfigOptions ignore; + ignore.ignore_unknown_options = true; + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=5:6:7:8:9:x:false", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + ignore, ColumnFamilyOptions(), "compression_opts=5:6:7:8:9:x:false", + &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=1:2:3:4:5:6:true:8", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + 
ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8", + &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=1:2:3:4:5:6:true:8:9", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8:9", + &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), "compression_opts={unknown=bad;}", + &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(ignore, ColumnFamilyOptions(), + "compression_opts={unknown=bad;}", + &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), "compression_opts.unknown=bad", + &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(ignore, ColumnFamilyOptions(), + "compression_opts.unknown=bad", + &base_cf_opt)); +} + +TEST_F(OptionsTest, OldInterfaceTest) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ConfigOptions exact; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=18;prefix_extractor=capped:8;" + "arena_block_size=19", + &new_cf_opt)); + + ASSERT_EQ(new_cf_opt.write_buffer_size, 18); + ASSERT_EQ(new_cf_opt.arena_block_size, 19); + ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); + + // And with a bad option + ASSERT_NOK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + std::unordered_map cf_options_map = { + {"write_buffer_size", "1"}, + {"max_write_buffer_number", "2"}, + {"min_write_buffer_number_to_merge", "3"}, + }; + ASSERT_OK( + GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt)); + cf_options_map["unknown_option"] = "1"; + ASSERT_NOK( + 
GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, + &new_cf_opt, true, true)); + + DBOptions base_db_opt; + DBOptions new_db_opt; + std::unordered_map db_options_map = { + {"create_if_missing", "false"}, + {"create_missing_column_families", "true"}, + {"error_if_exists", "false"}, + {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, + {"max_open_files", "32"}, + }; + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + ASSERT_EQ(new_db_opt.create_missing_column_families, true); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); + ASSERT_EQ(new_db_opt.max_open_files, 32); + db_options_map["unknown_option"] = "1"; + Status s = GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, true, + true)); + ASSERT_OK(GetDBOptionsFromString( + base_db_opt, + "create_if_missing=false;error_if_exists=false;max_open_files=42;", + &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.max_open_files, 42); + s = GetDBOptionsFromString( + base_db_opt, + "create_if_missing=false;error_if_exists=false;max_open_files=42;" + "unknown_option=1;", + &new_db_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); } + #endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE // GetBlockBasedTableOptionsFromString is not 
supported TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { BlockBasedTableOptions table_opt; BlockBasedTableOptions new_opt; + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + // make sure default values are overwritten by something else ASSERT_OK(GetBlockBasedTableOptionsFromString( - table_opt, + config_options, table_opt, "cache_index_and_filter_blocks=1;index_type=kHashSearch;" - "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "checksum=kxxHash;hash_index_allow_collision=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" "block_size_deviation=8;block_restart_interval=4;" "format_version=5;whole_key_filtering=1;" - "filter_policy=bloomfilter:4.567:false;", + "reserve_table_builder_memory=true;" + "filter_policy=bloomfilter:4.567:false;" + // A bug caused read_amp_bytes_per_bit to be a large integer in OPTIONS + // file generated by 6.10 to 6.14. Though bug is fixed in these releases, + // we need to handle the case of loading OPTIONS file generated before the + // fix. 
+ "read_amp_bytes_per_bit=17179869185;", &new_opt)); ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); ASSERT_TRUE(new_opt.hash_index_allow_collision); - ASSERT_TRUE(new_opt.no_block_cache); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); @@ -539,63 +875,130 @@ ASSERT_EQ(new_opt.block_restart_interval, 4); ASSERT_EQ(new_opt.format_version, 5U); ASSERT_EQ(new_opt.whole_key_filtering, true); + ASSERT_EQ(new_opt.reserve_table_builder_memory, true); ASSERT_TRUE(new_opt.filter_policy != nullptr); - const BloomFilterPolicy& bfp = - dynamic_cast(*new_opt.filter_policy); - EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); - EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); + const BloomFilterPolicy* bfp = + dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp->GetWholeBitsPerKey(), 5); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kAutoBloom); + // Verify that only the lower 32bits are stored in + // new_opt.read_amp_bytes_per_bit. 
+ EXPECT_EQ(1U, new_opt.read_amp_bytes_per_bit); // unknown option - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" - "bad_option=1", - &new_opt)); + Status s = GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" + "bad_option=1", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_EQ(static_cast(table_opt.cache_index_and_filter_blocks), new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized index type - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", - &new_opt)); + s = GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized checksum type - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;checksum=kxxHashXX", - &new_opt)); + ASSERT_NOK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;checksum=kxxHashXX", &new_opt)); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized filter policy name - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;" - "filter_policy=bloomfilterxx:4:true", - &new_opt)); + s = GetBlockBasedTableOptionsFromString(config_options, table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilterxx:4:true", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); 
ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); // unrecognized filter policy config - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;" - "filter_policy=bloomfilter:4", - &new_opt)); + s = GetBlockBasedTableOptionsFromString(config_options, table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilter:4", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + // Ribbon filter policy (no Bloom hybrid) + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:5.678:-1;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 5678); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + + // Ribbon filter policy (default Bloom hybrid) + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:6.789;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + auto ltfp = dynamic_cast( + new_opt.filter_policy.get()); + EXPECT_EQ(ltfp->TEST_GetStartingLevelForB(), 0); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyA()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kFastLocalBloom); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyB()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + + // Ribbon filter policy (custom Bloom hybrid) + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:6.789:5;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + ltfp = dynamic_cast( + 
new_opt.filter_policy.get()); + EXPECT_EQ(ltfp->TEST_GetStartingLevelForB(), 5); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyA()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kFastLocalBloom); + + bfp = dynamic_cast(ltfp->TEST_GetPolicyB()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + + // Old name + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=experimental_ribbon:6.789;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + // Check block cache options are overwritten when specified // in new format as a struct. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" - "block_cache_compressed={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( @@ -614,10 +1017,11 @@ // Set only block cache capacity. Check other values are // reset to default values. 
- ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=2M};" - "block_cache_compressed={capacity=2M}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=2M};" + "block_cache_compressed={capacity=2M}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); // Default values @@ -642,7 +1046,7 @@ // Set couple of block cache options. ASSERT_OK(GetBlockBasedTableOptionsFromString( - table_opt, + config_options, table_opt, "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" "block_cache_compressed={num_shard_bits=5;" "high_pri_pool_ratio=0.0;}", @@ -663,12 +1067,13 @@ 0.0); // Set couple of block cache options. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;};" - "block_cache_compressed={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( @@ -693,12 +1098,16 @@ TEST_F(OptionsTest, GetPlainTableOptionsFromString) { PlainTableOptions table_opt; PlainTableOptions new_opt; + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; // make sure default values are overwritten by something else - ASSERT_OK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" - "full_scan_mode=true;store_index_in_file=true", 
- &new_opt)); + ASSERT_OK(GetPlainTableOptionsFromString( + config_options, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" + "full_scan_mode=true;store_index_in_file=true", + &new_opt)); ASSERT_EQ(new_opt.user_key_len, 66u); ASSERT_EQ(new_opt.bloom_bits_per_key, 20); ASSERT_EQ(new_opt.hash_table_ratio, 0.5); @@ -709,16 +1118,22 @@ ASSERT_TRUE(new_opt.store_index_in_file); // unknown option - ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "bad_option=1", - &new_opt)); + Status s = GetPlainTableOptionsFromString( + config_options, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "bad_option=1", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); // unrecognized EncodingType - ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "encoding_type=kPrefixXX", - &new_opt)); + s = GetPlainTableOptionsFromString( + config_options, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "encoding_type=kPrefixXX", + &new_opt); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); } #endif // !ROCKSDB_LITE @@ -728,14 +1143,14 @@ ASSERT_OK(GetMemTableRepFactoryFromString("skip_list", &new_mem_factory)); ASSERT_OK(GetMemTableRepFactoryFromString("skip_list:16", &new_mem_factory)); - ASSERT_EQ(std::string(new_mem_factory->Name()), "SkipListFactory"); + ASSERT_STREQ(new_mem_factory->Name(), "SkipListFactory"); ASSERT_NOK(GetMemTableRepFactoryFromString("skip_list:16:invalid_opt", &new_mem_factory)); ASSERT_OK(GetMemTableRepFactoryFromString("prefix_hash", &new_mem_factory)); ASSERT_OK(GetMemTableRepFactoryFromString("prefix_hash:1000", &new_mem_factory)); - ASSERT_EQ(std::string(new_mem_factory->Name()), "HashSkipListRepFactory"); + ASSERT_STREQ(new_mem_factory->Name(), 
"HashSkipListRepFactory"); ASSERT_NOK(GetMemTableRepFactoryFromString("prefix_hash:1000:invalid_opt", &new_mem_factory)); @@ -761,9 +1176,113 @@ } #endif // !ROCKSDB_LITE +TEST_F(OptionsTest, MemTableRepFactoryCreateFromString) { + std::unique_ptr new_mem_factory = nullptr; + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "skip_list", + &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "skip_list:16", + &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "SkipListFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("skip_list")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("SkipListFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "skip_list:16:invalid_opt", &new_mem_factory)); + + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "invalid_opt=10", &new_mem_factory)); + + // Test a reset + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "", + &new_mem_factory)); + ASSERT_EQ(new_mem_factory, nullptr); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "invalid_opt=10", &new_mem_factory)); + +#ifndef ROCKSDB_LITE + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "id=skip_list; lookahead=32", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "prefix_hash", + &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "prefix_hash:1000", &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "HashSkipListRepFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("prefix_hash")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("HashSkipListRepFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "prefix_hash:1000:invalid_opt", &new_mem_factory)); + 
ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, + "id=prefix_hash; bucket_count=32; skiplist_height=64; " + "branching_factor=16", + &new_mem_factory)); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, + "id=prefix_hash; bucket_count=32; skiplist_height=64; " + "branching_factor=16; invalid=unknown", + &new_mem_factory)); + + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "hash_linkedlist", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "hash_linkedlist:1000", &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "HashLinkListRepFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("hash_linkedlist")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("HashLinkListRepFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "hash_linkedlist:1000:invalid_opt", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, + "id=hash_linkedlist; bucket_count=32; threshold=64; huge_page_size=16; " + "logging_threshold=12; log_when_flash=true", + &new_mem_factory)); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, + "id=hash_linkedlist; bucket_count=32; threshold=64; huge_page_size=16; " + "logging_threshold=12; log_when_flash=true; invalid=unknown", + &new_mem_factory)); + + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "vector", + &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "vector:1024", + &new_mem_factory)); + ASSERT_STREQ(new_mem_factory->Name(), "VectorRepFactory"); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("vector")); + ASSERT_TRUE(new_mem_factory->IsInstanceOf("VectorRepFactory")); + ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "vector:1024:invalid_opt", &new_mem_factory)); + ASSERT_OK(MemTableRepFactory::CreateFromString( + config_options, "id=vector; count=42", &new_mem_factory)); + 
ASSERT_NOK(MemTableRepFactory::CreateFromString( + config_options, "id=vector; invalid=unknown", &new_mem_factory)); +#endif // ROCKSDB_LITE + ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "cuckoo", + &new_mem_factory)); + // CuckooHash memtable is already removed. + ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "cuckoo:1024", + &new_mem_factory)); + + ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "bad_factory", + &new_mem_factory)); +} + #ifndef ROCKSDB_LITE // GetOptionsFromString is not supported in RocksDB Lite +class CustomEnv : public EnvWrapper { + public: + explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} + static const char* kClassName() { return "CustomEnv"; } + const char* Name() const override { return kClassName(); } +}; + TEST_F(OptionsTest, GetOptionsFromStringTest) { Options base_options, new_options; + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + base_options.write_buffer_size = 20; base_options.min_write_buffer_number_to_merge = 15; BlockBasedTableOptions block_based_table_options; @@ -772,14 +1291,8 @@ NewBlockBasedTableFactory(block_based_table_options)); // Register an Env with object registry. 
- const static char* kCustomEnvName = "CustomEnv"; - class CustomEnv : public EnvWrapper { - public: - explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} - }; - - ObjectLibrary::Default()->Register( - kCustomEnvName, + ObjectLibrary::Default()->AddFactory( + CustomEnv::kClassName(), [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, std::string* /* errmsg */) { static CustomEnv env(Env::Default()); @@ -787,7 +1300,7 @@ }); ASSERT_OK(GetOptionsFromString( - base_options, + config_options, base_options, "write_buffer_size=10;max_write_buffer_number=16;" "block_based_table_factory={block_cache=1M;block_size=4;};" "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;" @@ -801,6 +1314,7 @@ ASSERT_EQ(new_options.compression_opts.strategy, 6); ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u); ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.parallel_threads, 1u); ASSERT_EQ(new_options.compression_opts.enabled, false); ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption); ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5); @@ -808,41 +1322,97 @@ ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7); ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u); ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.parallel_threads, 1u); ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false); ASSERT_EQ(new_options.write_buffer_size, 10U); ASSERT_EQ(new_options.max_write_buffer_number, 16); - BlockBasedTableOptions new_block_based_table_options = - dynamic_cast(new_options.table_factory.get()) - ->table_options(); - ASSERT_EQ(new_block_based_table_options.block_cache->GetCapacity(), 1U << 20); - ASSERT_EQ(new_block_based_table_options.block_size, 4U); + const auto new_bbto = + new_options.table_factory->GetOptions(); + ASSERT_NE(new_bbto, 
nullptr); + ASSERT_EQ(new_bbto->block_cache->GetCapacity(), 1U << 20); + ASSERT_EQ(new_bbto->block_size, 4U); // don't overwrite block based table options - ASSERT_TRUE(new_block_based_table_options.cache_index_and_filter_blocks); + ASSERT_TRUE(new_bbto->cache_index_and_filter_blocks); ASSERT_EQ(new_options.create_if_missing, true); ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); Env* newEnv = new_options.env; - ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_OK(Env::LoadEnv(CustomEnv::kClassName(), &newEnv)); ASSERT_EQ(newEnv, new_options.env); + + config_options.ignore_unknown_options = false; + // Test a bad value for a DBOption returns a failure + base_options.dump_malloc_stats = false; + base_options.write_buffer_size = 1024; + Options bad_options = new_options; + Status s = GetOptionsFromString(config_options, base_options, + "create_if_missing=XX;dump_malloc_stats=true", + &bad_options); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(bad_options.dump_malloc_stats, false); + + bad_options = new_options; + s = GetOptionsFromString(config_options, base_options, + "write_buffer_size=XX;dump_malloc_stats=true", + &bad_options); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + + ASSERT_EQ(bad_options.dump_malloc_stats, false); + + // Test a bad value for a TableFactory Option returns a failure + bad_options = new_options; + s = GetOptionsFromString(config_options, base_options, + "write_buffer_size=16;dump_malloc_stats=true" + "block_based_table_factory={block_size=XX;};", + &bad_options); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_EQ(bad_options.dump_malloc_stats, false); + ASSERT_EQ(bad_options.write_buffer_size, 1024); + + config_options.ignore_unknown_options = true; + ASSERT_OK(GetOptionsFromString(config_options, base_options, + "create_if_missing=XX;dump_malloc_stats=true;" + "write_buffer_size=XX;" + "block_based_table_factory={block_size=XX;};", + &bad_options)); 
+ ASSERT_EQ(bad_options.create_if_missing, base_options.create_if_missing); + ASSERT_EQ(bad_options.dump_malloc_stats, true); + ASSERT_EQ(bad_options.write_buffer_size, base_options.write_buffer_size); + + // Test the old interface + ASSERT_OK(GetOptionsFromString( + base_options, + "write_buffer_size=22;max_write_buffer_number=33;max_open_files=44;", + &new_options)); + ASSERT_EQ(new_options.write_buffer_size, 22U); + ASSERT_EQ(new_options.max_write_buffer_number, 33); + ASSERT_EQ(new_options.max_open_files, 44); } TEST_F(OptionsTest, DBOptionsSerialization) { Options base_options, new_options; Random rnd(301); + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; // Phase 1: Make big change in base_options test::RandomInitDBOptions(&base_options, &rnd); // Phase 2: obtain a string from base_option std::string base_options_file_content; - ASSERT_OK(GetStringFromDBOptions(&base_options_file_content, base_options)); + ASSERT_OK(GetStringFromDBOptions(config_options, base_options, + &base_options_file_content)); // Phase 3: Set new_options from the derived string and expect // new_options == base_options - ASSERT_OK(GetDBOptionsFromString(DBOptions(), base_options_file_content, - &new_options)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_options, new_options)); + ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(), + base_options_file_content, &new_options)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_options, + new_options)); } TEST_F(OptionsTest, OptionsComposeDecompose) { @@ -850,6 +1420,9 @@ // we get same constituent options. 
DBOptions base_db_opts; ColumnFamilyOptions base_cf_opts; + ConfigOptions + config_options; // Use default for ignore(false) and check (exact) + config_options.input_strings_escaped = false; Random rnd(301); test::RandomInitDBOptions(&base_db_opts, &rnd); @@ -859,34 +1432,254 @@ DBOptions new_db_opts(base_opts); ColumnFamilyOptions new_cf_opts(base_opts); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_db_opts, new_db_opts)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opts, new_cf_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_db_opts, + new_db_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_cf_opts, + new_cf_opts)); delete new_cf_opts.compaction_filter; } +TEST_F(OptionsTest, DBOptionsComposeImmutable) { + // Build a DBOptions from an Immutable/Mutable one and verify that + // we get same constituent options. + ConfigOptions config_options; + Random rnd(301); + DBOptions base_opts, new_opts; + test::RandomInitDBOptions(&base_opts, &rnd); + MutableDBOptions m_opts(base_opts); + ImmutableDBOptions i_opts(base_opts); + new_opts = BuildDBOptions(i_opts, m_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_opts, + new_opts)); +} + +TEST_F(OptionsTest, GetMutableDBOptions) { + Random rnd(228); + DBOptions base_opts; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + + test::RandomInitDBOptions(&base_opts, &rnd); + ImmutableDBOptions i_opts(base_opts); + MutableDBOptions m_opts(base_opts); + MutableDBOptions new_opts; + ASSERT_OK(GetStringFromMutableDBOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableDBOptionsFromStrings(m_opts, opts_map, &new_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( + config_options, base_opts, BuildDBOptions(i_opts, new_opts))); +} + +TEST_F(OptionsTest, CFOptionsComposeImmutable) { + // Build a DBOptions from an 
Immutable/Mutable one and verify that + // we get same constituent options. + ConfigOptions config_options; + Random rnd(301); + ColumnFamilyOptions base_opts, new_opts; + DBOptions dummy; // Needed to create ImmutableCFOptions + test::RandomInitCFOptions(&base_opts, dummy, &rnd); + MutableCFOptions m_opts(base_opts); + ImmutableCFOptions i_opts(base_opts); + UpdateColumnFamilyOptions(i_opts, &new_opts); + UpdateColumnFamilyOptions(m_opts, &new_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opts, + new_opts)); + delete new_opts.compaction_filter; +} + +TEST_F(OptionsTest, GetMutableCFOptions) { + Random rnd(228); + ColumnFamilyOptions base, copy; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + DBOptions dummy; // Needed to create ImmutableCFOptions + + test::RandomInitCFOptions(&base, dummy, &rnd); + ColumnFamilyOptions result; + MutableCFOptions m_opts(base), new_opts; + + ASSERT_OK(GetStringFromMutableCFOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableOptionsFromStrings(m_opts, opts_map, nullptr, &new_opts)); + UpdateColumnFamilyOptions(ImmutableCFOptions(base), ©); + UpdateColumnFamilyOptions(new_opts, ©); + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base, copy)); + delete copy.compaction_filter; +} + TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { Options options; ColumnFamilyOptions base_opt, new_opt; Random rnd(302); + ConfigOptions config_options; + config_options.input_strings_escaped = false; + // Phase 1: randomly assign base_opt // custom type options test::RandomInitCFOptions(&base_opt, options, &rnd); // Phase 2: obtain a string from base_opt std::string base_options_file_content; - ASSERT_OK( - GetStringFromColumnFamilyOptions(&base_options_file_content, base_opt)); + ASSERT_OK(GetStringFromColumnFamilyOptions(config_options, base_opt, + &base_options_file_content)); // Phase 3: Set 
new_opt from the derived string and expect // new_opt == base_opt - ASSERT_OK(GetColumnFamilyOptionsFromString( - ColumnFamilyOptions(), base_options_file_content, &new_opt)); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_opt, new_opt)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(config_options, ColumnFamilyOptions(), + base_options_file_content, &new_opt)); + ASSERT_OK( + RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt)); if (base_opt.compaction_filter) { delete base_opt.compaction_filter; } } +TEST_F(OptionsTest, CheckBlockBasedTableOptions) { + ColumnFamilyOptions cf_opts; + DBOptions db_opts; + ConfigOptions config_opts; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_opts, cf_opts, "prefix_extractor=capped:8", &cf_opts)); + ASSERT_OK(TableFactory::CreateFromString(config_opts, "BlockBasedTable", + &cf_opts.table_factory)); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + ASSERT_TRUE(cf_opts.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())); + auto bbto = cf_opts.table_factory->GetOptions(); + ASSERT_OK(cf_opts.table_factory->ConfigureFromString( + config_opts, + "block_cache={capacity=1M;num_shard_bits=4;};" + "block_size_deviation=101;" + "block_restart_interval=0;" + "index_block_restart_interval=5;" + "partition_filters=true;" + "index_type=kHashSearch;" + "no_block_cache=1;")); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->block_cache.get(), nullptr); + ASSERT_EQ(bbto->block_size_deviation, 0); + ASSERT_EQ(bbto->block_restart_interval, 1); + ASSERT_EQ(bbto->index_block_restart_interval, 1); + ASSERT_FALSE(bbto->partition_filters); + ASSERT_OK(TableFactory::CreateFromString(config_opts, "BlockBasedTable", + &cf_opts.table_factory)); + bbto = cf_opts.table_factory->GetOptions(); + + ASSERT_OK(cf_opts.table_factory->ConfigureFromString(config_opts, + "no_block_cache=0;")); + ASSERT_NE(bbto->block_cache.get(), nullptr); + ASSERT_OK(cf_opts.table_factory->ValidateOptions(db_opts, cf_opts)); 
+} + +TEST_F(OptionsTest, MutableTableOptions) { + ConfigOptions config_options; + std::shared_ptr bbtf; + bbtf.reset(NewBlockBasedTableFactory()); + auto bbto = bbtf->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_align", "true")); + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); + ASSERT_EQ(bbto->block_align, true); + ASSERT_EQ(bbto->block_size, 1024); + ASSERT_OK(bbtf->PrepareOptions(config_options)); + config_options.mutable_options_only = true; + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); + ASSERT_EQ(bbto->block_align, true); + ASSERT_NOK(bbtf->ConfigureOption(config_options, "block_align", "false")); + ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "2048")); + ASSERT_EQ(bbto->block_align, true); + ASSERT_EQ(bbto->block_size, 2048); + + ColumnFamilyOptions cf_opts; + cf_opts.table_factory = bbtf; + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, "block_based_table_factory.block_align=false", + &cf_opts)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, "block_based_table_factory.block_size=8192", + &cf_opts)); + ASSERT_EQ(bbto->block_align, true); + ASSERT_EQ(bbto->block_size, 8192); +} + +TEST_F(OptionsTest, MutableCFOptions) { + ConfigOptions config_options; + ColumnFamilyOptions cf_opts; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, + "paranoid_file_checks=true; block_based_table_factory.block_align=false; " + "block_based_table_factory.block_size=8192;", + &cf_opts)); + ASSERT_TRUE(cf_opts.paranoid_file_checks); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + const auto bbto = cf_opts.table_factory->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->block_size, 8192); + ASSERT_EQ(bbto->block_align, false); + std::unordered_map unused_opts; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"paranoid_file_checks", "false"}}, 
&cf_opts)); + ASSERT_EQ(cf_opts.paranoid_file_checks, false); + + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "16384"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 16384); + + config_options.mutable_options_only = true; + // Force consistency checks is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"force_consistency_checks", "true"}}, + &cf_opts)); + + // Attempt to change the table. It is not mutable, so this should fail and + // leave the original intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory", "PlainTable"}}, &cf_opts)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory.id", "PlainTable"}}, &cf_opts)); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Change the block size. Should update the value in the current table + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "8192"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 8192); + + // Attempt to turn off block cache fails, as this option is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.no_block_cache", "true"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Attempt to change the block size via a config string/map. Should update + // the current value + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", "{block_size=32768}"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); + + // Attempt to change the block size and no cache through the map. 
Should + // fail, leaving the old values intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", + "{block_size=16384; no_block_cache=true}"}}, + &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); +} + #endif // !ROCKSDB_LITE Status StringToMap( @@ -1079,6 +1872,230 @@ ASSERT_NOK( GetStringFromCompressionType(&res, static_cast(-10))); } + +TEST_F(OptionsTest, OnlyMutableDBOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + DBOptions mdb_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitDBOptions(&db_opts, &rnd); + auto db_config = DBOptionsAsConfigurable(db_opts); + + // Get all of the DB Option names (mutable or not) + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from db_opts and set those in mdb_opts + cfg_opts.mutable_options_only = true; + + // Get only the Mutable DB Option names + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opt_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, mdb_opts, opt_str, &mdb_opts)); + std::string mismatch; + // Comparing only the mutable options, the two are equivalent + auto mdb_config = DBOptionsAsConfigurable(mdb_opts); + ASSERT_TRUE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + ASSERT_TRUE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mdb_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(db_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing all of the options, the two are not equivalent + ASSERT_FALSE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + 
ASSERT_FALSE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); + + // Make sure there are only mutable options being configured + ASSERT_OK(GetDBOptionsFromString(cfg_opts, DBOptions(), opt_str, &db_opts)); +} + +TEST_F(OptionsTest, OnlyMutableCFOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + ColumnFamilyOptions mcf_opts; + ColumnFamilyOptions cf_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitCFOptions(&cf_opts, db_opts, &rnd); + cf_opts.comparator = ReverseBytewiseComparator(); + auto cf_config = CFOptionsAsConfigurable(cf_opts); + + // Get all of the CF Option names (mutable or not) + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from cf_opts and set those in mcf_opts + cfg_opts.mutable_options_only = true; + // Get only the Mutable CF Option names + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opt_str)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(cfg_opts, mcf_opts, opt_str, &mcf_opts)); + std::string mismatch; + + auto mcf_config = CFOptionsAsConfigurable(mcf_opts); + // Comparing only the mutable options, the two are equivalent + ASSERT_TRUE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_TRUE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mcf_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(cf_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing all of the options, the two are not equivalent + ASSERT_FALSE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_FALSE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + delete cf_opts.compaction_filter; + + // Make sure 
the options string contains only mutable options + ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, ColumnFamilyOptions(), + opt_str, &cf_opts)); + delete cf_opts.compaction_filter; +} + +TEST_F(OptionsTest, SstPartitionerTest) { + ConfigOptions cfg_opts; + ColumnFamilyOptions cf_opts, new_opt; + std::string opts_str, mismatch; + + ASSERT_OK(SstPartitionerFactory::CreateFromString( + cfg_opts, SstPartitionerFixedPrefixFactory::kClassName(), + &cf_opts.sst_partitioner_factory)); + ASSERT_NE(cf_opts.sst_partitioner_factory, nullptr); + ASSERT_STREQ(cf_opts.sst_partitioner_factory->Name(), + SstPartitionerFixedPrefixFactory::kClassName()); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + cfg_opts, ColumnFamilyOptions(), + std::string("sst_partitioner_factory={id=") + + SstPartitionerFixedPrefixFactory::kClassName() + "; unknown=10;}", + &cf_opts)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + cfg_opts, ColumnFamilyOptions(), + std::string("sst_partitioner_factory={id=") + + SstPartitionerFixedPrefixFactory::kClassName() + "; length=10;}", + &cf_opts)); + ASSERT_NE(cf_opts.sst_partitioner_factory, nullptr); + ASSERT_STREQ(cf_opts.sst_partitioner_factory->Name(), + SstPartitionerFixedPrefixFactory::kClassName()); + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opts_str)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(cfg_opts, cf_opts, opts_str, &new_opt)); + ASSERT_NE(new_opt.sst_partitioner_factory, nullptr); + ASSERT_STREQ(new_opt.sst_partitioner_factory->Name(), + SstPartitionerFixedPrefixFactory::kClassName()); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, cf_opts, new_opt)); + ASSERT_TRUE(cf_opts.sst_partitioner_factory->AreEquivalent( + cfg_opts, new_opt.sst_partitioner_factory.get(), &mismatch)); +} + +TEST_F(OptionsTest, FileChecksumGenFactoryTest) { + ConfigOptions cfg_opts; + DBOptions db_opts, new_opt; + std::string opts_str, mismatch; + auto factory = GetFileChecksumGenCrc32cFactory(); + + 
cfg_opts.ignore_unsupported_options = false; + + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opts_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, db_opts, opts_str, &new_opt)); + + ASSERT_NE(factory, nullptr); + ASSERT_OK(FileChecksumGenFactory::CreateFromString( + cfg_opts, factory->Name(), &db_opts.file_checksum_gen_factory)); + ASSERT_NE(db_opts.file_checksum_gen_factory, nullptr); + ASSERT_STREQ(db_opts.file_checksum_gen_factory->Name(), factory->Name()); + ASSERT_NOK(GetDBOptionsFromString( + cfg_opts, DBOptions(), "file_checksum_gen_factory=unknown", &db_opts)); + ASSERT_OK(GetDBOptionsFromString( + cfg_opts, DBOptions(), + std::string("file_checksum_gen_factory=") + factory->Name(), &db_opts)); + ASSERT_NE(db_opts.file_checksum_gen_factory, nullptr); + ASSERT_STREQ(db_opts.file_checksum_gen_factory->Name(), factory->Name()); + + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opts_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, db_opts, opts_str, &new_opt)); + ASSERT_NE(new_opt.file_checksum_gen_factory, nullptr); + ASSERT_STREQ(new_opt.file_checksum_gen_factory->Name(), factory->Name()); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(cfg_opts, db_opts, new_opt)); + ASSERT_TRUE(factory->AreEquivalent( + cfg_opts, new_opt.file_checksum_gen_factory.get(), &mismatch)); + ASSERT_TRUE(db_opts.file_checksum_gen_factory->AreEquivalent( + cfg_opts, new_opt.file_checksum_gen_factory.get(), &mismatch)); +} + +class TestTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + private: + std::string id_; + + public: + explicit TestTablePropertiesCollectorFactory(const std::string& id) + : id_(id) {} + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { + return nullptr; + } + static const char* kClassName() { return "TestCollector"; } + const char* Name() const override { return kClassName(); } + std::string GetId() const override { + return 
std::string(kClassName()) + ":" + id_; + } +}; + +TEST_F(OptionsTest, OptionTablePropertiesTest) { + ConfigOptions cfg_opts; + ColumnFamilyOptions orig, copy; + orig.table_properties_collector_factories.push_back( + std::make_shared("1")); + orig.table_properties_collector_factories.push_back( + std::make_shared("2")); + + // Push two TablePropertiesCollectorFactories then create a new + // ColumnFamilyOptions based on those settings. The copy should + // have no properties but still match the original + std::string opts_str; + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, orig, &opts_str)); + ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, orig, opts_str, ©)); + ASSERT_EQ(copy.table_properties_collector_factories.size(), 0); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, orig, copy)); + + // Now register a TablePropertiesCollectorFactory + // Repeat the experiment. The copy should have the same + // properties as the original + cfg_opts.registry->AddLibrary("collector") + ->AddFactory( + ObjectLibrary::PatternEntry( + TestTablePropertiesCollectorFactory::kClassName(), false) + .AddSeparator(":"), + [](const std::string& name, + std::unique_ptr* guard, + std::string* /* errmsg */) { + std::string id = name.substr( + strlen(TestTablePropertiesCollectorFactory::kClassName()) + 1); + guard->reset(new TestTablePropertiesCollectorFactory(id)); + return guard->get(); + }); + + ASSERT_OK(GetColumnFamilyOptionsFromString(cfg_opts, orig, opts_str, ©)); + ASSERT_EQ(copy.table_properties_collector_factories.size(), 2); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, orig, copy)); +} #endif // !ROCKSDB_LITE TEST_F(OptionsTest, ConvertOptionsTest) { @@ -1094,32 +2111,967 @@ ASSERT_EQ(converted_opt.max_open_files, leveldb_opt.max_open_files); ASSERT_EQ(converted_opt.compression, leveldb_opt.compression); - std::shared_ptr tb_guard = converted_opt.table_factory; - BlockBasedTableFactory* table_factory = - 
dynamic_cast(converted_opt.table_factory.get()); + std::shared_ptr table_factory = converted_opt.table_factory; + const auto table_opt = table_factory->GetOptions(); + ASSERT_NE(table_opt, nullptr); + + ASSERT_EQ(table_opt->block_cache->GetCapacity(), 8UL << 20); + ASSERT_EQ(table_opt->block_size, leveldb_opt.block_size); + ASSERT_EQ(table_opt->block_restart_interval, + leveldb_opt.block_restart_interval); + ASSERT_EQ(table_opt->filter_policy.get(), leveldb_opt.filter_policy); +} +#ifndef ROCKSDB_LITE +class TestEventListener : public EventListener { + private: + std::string id_; - ASSERT_TRUE(table_factory != nullptr); + public: + explicit TestEventListener(const std::string& id) : id_("Test" + id) {} + const char* Name() const override { return id_.c_str(); } +}; - const BlockBasedTableOptions table_opt = table_factory->table_options(); +static std::unordered_map + test_listener_option_info = { + {"s", + {0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, - ASSERT_EQ(table_opt.block_cache->GetCapacity(), 8UL << 20); - ASSERT_EQ(table_opt.block_size, leveldb_opt.block_size); - ASSERT_EQ(table_opt.block_restart_interval, - leveldb_opt.block_restart_interval); - ASSERT_EQ(table_opt.filter_policy.get(), leveldb_opt.filter_policy); +}; + +class TestConfigEventListener : public TestEventListener { + private: + std::string s_; + + public: + explicit TestConfigEventListener(const std::string& id) + : TestEventListener("Config" + id) { + s_ = id; + RegisterOptions("Test", &s_, &test_listener_option_info); + } +}; + +static int RegisterTestEventListener(ObjectLibrary& library, + const std::string& arg) { + library.AddFactory( + "Test" + arg, + [](const std::string& name, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new TestEventListener(name.substr(4))); + return guard->get(); + }); + library.AddFactory( + "TestConfig" + arg, + [](const std::string& name, std::unique_ptr* guard, + std::string* /* errmsg */) { + 
guard->reset(new TestConfigEventListener(name.substr(10))); + return guard->get(); + }); + return 1; } +TEST_F(OptionsTest, OptionsListenerTest) { + DBOptions orig, copy; + orig.listeners.push_back(std::make_shared("1")); + orig.listeners.push_back(std::make_shared("2")); + orig.listeners.push_back(std::make_shared("")); + orig.listeners.push_back(std::make_shared("1")); + orig.listeners.push_back(std::make_shared("2")); + orig.listeners.push_back(std::make_shared("")); + ConfigOptions config_opts(orig); + config_opts.registry->AddLibrary("listener", RegisterTestEventListener, "1"); + std::string opts_str; + ASSERT_OK(GetStringFromDBOptions(config_opts, orig, &opts_str)); + ASSERT_OK(GetDBOptionsFromString(config_opts, orig, opts_str, ©)); + ASSERT_OK(GetStringFromDBOptions(config_opts, copy, &opts_str)); + ASSERT_EQ( + copy.listeners.size(), + 2); // The Test{Config}1 Listeners could be loaded but not the others + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts, orig, copy)); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +const static std::string kCustomEnvName = "Custom"; +const static std::string kCustomEnvProp = "env=" + kCustomEnvName; + +static int RegisterCustomEnv(ObjectLibrary& library, const std::string& arg) { + library.AddFactory( + arg, [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + static CustomEnv env(Env::Default()); + return &env; + }); + return 1; +} + +// This test suite tests the old APIs into the Configure options methods. +// Once those APIs are officially deprecated, this test suite can be deleted. 
+class OptionsOldApiTest : public testing::Test {}; + +TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { + std::unordered_map cf_options_map = { + {"write_buffer_size", "1"}, + {"max_write_buffer_number", "2"}, + {"min_write_buffer_number_to_merge", "3"}, + {"max_write_buffer_number_to_maintain", "99"}, + {"max_write_buffer_size_to_maintain", "-99999"}, + {"compression", "kSnappyCompression"}, + {"compression_per_level", + "kNoCompression:" + "kSnappyCompression:" + "kZlibCompression:" + "kBZip2Compression:" + "kLZ4Compression:" + "kLZ4HCCompression:" + "kXpressCompression:" + "kZSTD:" + "kZSTDNotFinalCompression"}, + {"bottommost_compression", "kLZ4Compression"}, + {"bottommost_compression_opts", "5:6:7:8:9:true"}, + {"compression_opts", "4:5:6:7:8:true"}, + {"num_levels", "8"}, + {"level0_file_num_compaction_trigger", "8"}, + {"level0_slowdown_writes_trigger", "9"}, + {"level0_stop_writes_trigger", "10"}, + {"target_file_size_base", "12"}, + {"target_file_size_multiplier", "13"}, + {"max_bytes_for_level_base", "14"}, + {"level_compaction_dynamic_level_bytes", "true"}, + {"max_bytes_for_level_multiplier", "15.0"}, + {"max_bytes_for_level_multiplier_additional", "16:17:18"}, + {"max_compaction_bytes", "21"}, + {"soft_rate_limit", "1.1"}, + {"hard_rate_limit", "2.1"}, + {"hard_pending_compaction_bytes_limit", "211"}, + {"arena_block_size", "22"}, + {"disable_auto_compactions", "true"}, + {"compaction_style", "kCompactionStyleLevel"}, + {"compaction_pri", "kOldestSmallestSeqFirst"}, + {"verify_checksums_in_compaction", "false"}, + {"compaction_options_fifo", "23"}, + {"max_sequential_skip_in_iterations", "24"}, + {"inplace_update_support", "true"}, + {"report_bg_io_stats", "true"}, + {"compaction_measure_io_stats", "false"}, + {"inplace_update_num_locks", "25"}, + {"memtable_prefix_bloom_size_ratio", "0.26"}, + {"memtable_whole_key_filtering", "true"}, + {"memtable_huge_page_size", "28"}, + {"bloom_locality", "29"}, + {"max_successive_merges", "30"}, + 
{"min_partial_merge_operands", "31"}, + {"prefix_extractor", "fixed:31"}, + {"optimize_filters_for_hits", "true"}, + {"enable_blob_files", "true"}, + {"min_blob_size", "1K"}, + {"blob_file_size", "1G"}, + {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, + {"blob_compaction_readahead_size", "256K"}, + }; + + std::unordered_map db_options_map = { + {"create_if_missing", "false"}, + {"create_missing_column_families", "true"}, + {"error_if_exists", "false"}, + {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, + {"max_open_files", "32"}, + {"max_total_wal_size", "33"}, + {"use_fsync", "true"}, + {"db_log_dir", "/db_log_dir"}, + {"wal_dir", "/wal_dir"}, + {"delete_obsolete_files_period_micros", "34"}, + {"max_background_compactions", "35"}, + {"max_background_flushes", "36"}, + {"max_log_file_size", "37"}, + {"log_file_time_to_roll", "38"}, + {"keep_log_file_num", "39"}, + {"recycle_log_file_num", "5"}, + {"max_manifest_file_size", "40"}, + {"table_cache_numshardbits", "41"}, + {"WAL_ttl_seconds", "43"}, + {"WAL_size_limit_MB", "44"}, + {"manifest_preallocation_size", "45"}, + {"allow_mmap_reads", "true"}, + {"allow_mmap_writes", "false"}, + {"use_direct_reads", "false"}, + {"use_direct_io_for_flush_and_compaction", "false"}, + {"is_fd_close_on_exec", "true"}, + {"skip_log_error_on_recovery", "false"}, + {"stats_dump_period_sec", "46"}, + {"stats_persist_period_sec", "57"}, + {"persist_stats_to_disk", "false"}, + {"stats_history_buffer_size", "69"}, + {"advise_random_on_open", "true"}, + {"experimental_mempurge_threshold", "0.0"}, + {"use_adaptive_mutex", "false"}, + {"new_table_reader_for_compaction_inputs", "true"}, + {"compaction_readahead_size", "100"}, + {"random_access_max_buffer_size", "3145728"}, + {"writable_file_max_buffer_size", "314159"}, + {"bytes_per_sync", "47"}, + {"wal_bytes_per_sync", "48"}, 
+ {"strict_bytes_per_sync", "true"}, + }; + + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); + ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); + ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99); + ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999); + ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level.size(), 9U); + ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[6], kXpressCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[7], kZSTD); + ASSERT_EQ(new_cf_opt.compression_per_level[8], kZSTDNotFinalCompression); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); + ASSERT_EQ(new_cf_opt.compression_opts.parallel_threads, + CompressionOptions().parallel_threads); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); 
+ ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.parallel_threads, + CompressionOptions().parallel_threads); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); + ASSERT_EQ(new_cf_opt.num_levels, 8); + ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); + ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9); + ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10); + ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); + ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); + ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[1], 17); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[2], 18); + ASSERT_EQ(new_cf_opt.max_compaction_bytes, 21); + ASSERT_EQ(new_cf_opt.hard_pending_compaction_bytes_limit, 211); + ASSERT_EQ(new_cf_opt.arena_block_size, 22U); + ASSERT_EQ(new_cf_opt.disable_auto_compactions, true); + ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel); + ASSERT_EQ(new_cf_opt.compaction_pri, kOldestSmallestSeqFirst); + ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size, + static_cast(23)); + ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations, + static_cast(24)); + ASSERT_EQ(new_cf_opt.inplace_update_support, true); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_size_ratio, 0.26); + ASSERT_EQ(new_cf_opt.memtable_whole_key_filtering, true); + ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U); + ASSERT_EQ(new_cf_opt.bloom_locality, 29U); + ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + 
ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); + ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31"); + ASSERT_EQ(new_cf_opt.enable_blob_files, true); + ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); + ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); + ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); + ASSERT_EQ(new_cf_opt.blob_compaction_readahead_size, 262144); + + cf_options_map["write_buffer_size"] = "hello"; + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ConfigOptions exact, loose; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + loose.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + cf_options_map["write_buffer_size"] = "1"; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + + cf_options_map["unknown_option"] = "1"; + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, + &new_cf_opt, + false, /* input_strings_escaped */ + true /* ignore_unknown_options */)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + loose, base_cf_opt, new_cf_opt, nullptr /* new_opt_map */)); + ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( + exact /* default for VerifyCFOptions */, base_cf_opt, new_cf_opt, nullptr)); + + DBOptions base_db_opt; + DBOptions new_db_opt; + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + 
ASSERT_EQ(new_db_opt.create_missing_column_families, true); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); + ASSERT_EQ(new_db_opt.max_open_files, 32); + ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); + ASSERT_EQ(new_db_opt.use_fsync, true); + ASSERT_EQ(new_db_opt.db_log_dir, "/db_log_dir"); + ASSERT_EQ(new_db_opt.wal_dir, "/wal_dir"); + ASSERT_EQ(new_db_opt.delete_obsolete_files_period_micros, + static_cast(34)); + ASSERT_EQ(new_db_opt.max_background_compactions, 35); + ASSERT_EQ(new_db_opt.max_background_flushes, 36); + ASSERT_EQ(new_db_opt.max_log_file_size, 37U); + ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U); + ASSERT_EQ(new_db_opt.keep_log_file_num, 39U); + ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U); + ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast(40)); + ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41); + ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast(43)); + ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast(44)); + ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U); + ASSERT_EQ(new_db_opt.allow_mmap_reads, true); + ASSERT_EQ(new_db_opt.allow_mmap_writes, false); + ASSERT_EQ(new_db_opt.use_direct_reads, false); + ASSERT_EQ(new_db_opt.use_direct_io_for_flush_and_compaction, false); + ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true); + ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); + ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); + ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); + ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); + ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); + ASSERT_EQ(new_db_opt.advise_random_on_open, true); + ASSERT_EQ(new_db_opt.experimental_mempurge_threshold, 0.0); + ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); + ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true); + ASSERT_EQ(new_db_opt.compaction_readahead_size, 100); + 
ASSERT_EQ(new_db_opt.random_access_max_buffer_size, 3145728); + ASSERT_EQ(new_db_opt.writable_file_max_buffer_size, 314159); + ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast(47)); + ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast(48)); + ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true); + + db_options_map["max_open_files"] = "hello"; + ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); + + // unknow options should fail parsing without ignore_unknown_options = true + db_options_map["unknown_db_option"] = "1"; + ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); + + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, + false, /* input_strings_escaped */ + true /* ignore_unknown_options */)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); + ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); +} + +TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + base_cf_opt.table_factory.reset(); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=5", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); + ASSERT_TRUE(new_cf_opt.table_factory == nullptr); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=6;", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); + 
ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); + // Wrong name "max_write_buffer_number_" + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", + &new_cf_opt)); + ConfigOptions exact; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Comparator from object registry + std::string kCompName = "reverse_comp"; + ObjectLibrary::Default()->AddFactory( + kCompName, + [](const std::string& /*name*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); + + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator()); + + // MergeOperator from object registry + std::unique_ptr bxo(new BytesXOROperator()); + std::string kMoName = bxo->Name(); + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "merge_operator=" + kMoName + ";", &new_cf_opt)); + ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name())); + + // Wrong key/value pair + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Error Paring value + 
ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Missing option name + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13; =100;", &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + const uint64_t kilo = 1024UL; + const uint64_t mega = 1024 * kilo; + const uint64_t giga = 1024 * mega; + const uint64_t tera = 1024 * giga; + + // Units (k) + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo); + // Units (m) + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega); + // Units (g) + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=18g;prefix_extractor=capped:8;" + "arena_block_size=19G", + &new_cf_opt)); + + ASSERT_EQ(new_cf_opt.write_buffer_size, 18 * giga); + ASSERT_EQ(new_cf_opt.arena_block_size, 19 * giga); + ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); + ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.CappedPrefix.8"); + + // Units (t) + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera); + ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera); + + // Nested block based table options + // Empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={};arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Non-empty + 
ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Last one + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;}", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Mismatch curly braces + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={{{block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Unexpected chars after closing curly brace + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa;" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Invalid block based table option + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); + 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "optimize_filters_for_hits=true", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "optimize_filters_for_hits=false", + &new_cf_opt)); + + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "optimize_filters_for_hits=junk", + &new_cf_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); + + // Nested plain table options + // Empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={};arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); + // Non-empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); + + // memtable factory + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "memtable=skip_list:10;arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr); + ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory")); +} + +TEST_F(OptionsTest, SliceTransformCreateFromString) { + std::shared_ptr transform = nullptr; + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + + ASSERT_OK( + SliceTransform::CreateFromString(config_options, "fixed:31", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_FALSE(transform->IsInstanceOf("capped")); + 
ASSERT_TRUE(transform->IsInstanceOf("fixed")); + ASSERT_TRUE(transform->IsInstanceOf("rocksdb.FixedPrefix")); + ASSERT_EQ(transform->GetId(), "rocksdb.FixedPrefix.31"); + ASSERT_OK(SliceTransform::CreateFromString( + config_options, "rocksdb.FixedPrefix.42", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_EQ(transform->GetId(), "rocksdb.FixedPrefix.42"); + + ASSERT_OK(SliceTransform::CreateFromString(config_options, "capped:16", + &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_FALSE(transform->IsInstanceOf("fixed")); + ASSERT_TRUE(transform->IsInstanceOf("capped")); + ASSERT_TRUE(transform->IsInstanceOf("rocksdb.CappedPrefix")); + ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.16"); + ASSERT_OK(SliceTransform::CreateFromString( + config_options, "rocksdb.CappedPrefix.42", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.42"); + + ASSERT_OK(SliceTransform::CreateFromString(config_options, "rocksdb.Noop", + &transform)); + ASSERT_NE(transform, nullptr); + + ASSERT_NOK(SliceTransform::CreateFromString(config_options, + "fixed:21:invalid", &transform)); + ASSERT_NOK(SliceTransform::CreateFromString(config_options, + "capped:21:invalid", &transform)); + ASSERT_NOK( + SliceTransform::CreateFromString(config_options, "fixed", &transform)); + ASSERT_NOK( + SliceTransform::CreateFromString(config_options, "capped", &transform)); + ASSERT_NOK(SliceTransform::CreateFromString( + config_options, "rocksdb.FixedPrefix:42", &transform)); + ASSERT_NOK(SliceTransform::CreateFromString( + config_options, "rocksdb.CappedPrefix:42", &transform)); + ASSERT_NOK( + SliceTransform::CreateFromString(config_options, "invalid", &transform)); + +#ifndef ROCKSDB_LITE + ASSERT_OK(SliceTransform::CreateFromString( + config_options, "id=rocksdb.CappedPrefix; length=11", &transform)); + ASSERT_NE(transform, nullptr); + ASSERT_EQ(transform->GetId(), "rocksdb.CappedPrefix.11"); + + ASSERT_NOK(SliceTransform::CreateFromString( 
+ config_options, "id=rocksdb.CappedPrefix; length=11; invalid=true", + &transform)); +#endif // ROCKSDB_LITE +} + +TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { + BlockBasedTableOptions table_opt; + BlockBasedTableOptions new_opt; + // make sure default values are overwritten by something else + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "cache_index_and_filter_blocks=1;index_type=kHashSearch;" + "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "block_cache=1M;block_cache_compressed=1k;block_size=1024;" + "block_size_deviation=8;block_restart_interval=4;" + "format_version=5;whole_key_filtering=1;" + "filter_policy=bloomfilter:4.567:false;", + &new_opt)); + ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); + ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); + ASSERT_TRUE(new_opt.hash_index_allow_collision); + ASSERT_TRUE(new_opt.no_block_cache); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL); + ASSERT_EQ(new_opt.block_size, 1024UL); + ASSERT_EQ(new_opt.block_size_deviation, 8); + ASSERT_EQ(new_opt.block_restart_interval, 4); + ASSERT_EQ(new_opt.format_version, 5U); + ASSERT_EQ(new_opt.whole_key_filtering, true); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + const BloomFilterPolicy& bfp = + dynamic_cast(*new_opt.filter_policy); + EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); + + // unknown option + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" + "bad_option=1", + &new_opt)); + ASSERT_EQ(static_cast(table_opt.cache_index_and_filter_blocks), + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.index_type, new_opt.index_type); 
+ + // unrecognized index type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.index_type, new_opt.index_type); + + // unrecognized checksum type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;checksum=kxxHashXX", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.index_type, new_opt.index_type); + + // unrecognized filter policy name + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilterxx:4:true", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + + // unrecognized filter policy config + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilter:4", + &new_opt)); + ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + + // Check block cache options are overwritten when specified + // in new format as a struct. 
+ ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetHighPriPoolRatio(), + 0.5); + + // Set only block cache capacity. Check other values are + // reset to default values. 
+ ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=2M};" + "block_cache_compressed={capacity=2M}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); + // Default values + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), + GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); + // Default values + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), + GetDefaultCacheShardBits( + new_opt.block_cache_compressed->GetCapacity())); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); + + // Set couple of block cache options. 
+ ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={num_shard_bits=5;" + "high_pri_pool_ratio=0.0;}", + &new_opt)); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 5); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 5); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.0); + + // Set couple of block cache options. + ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); +} + 
+TEST_F(OptionsOldApiTest, GetPlainTableOptionsFromString) { + PlainTableOptions table_opt; + PlainTableOptions new_opt; + // make sure default values are overwritten by something else + ASSERT_OK(GetPlainTableOptionsFromString(table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" + "full_scan_mode=true;store_index_in_file=true", + &new_opt)); + ASSERT_EQ(new_opt.user_key_len, 66u); + ASSERT_EQ(new_opt.bloom_bits_per_key, 20); + ASSERT_EQ(new_opt.hash_table_ratio, 0.5); + ASSERT_EQ(new_opt.index_sparseness, 8); + ASSERT_EQ(new_opt.huge_page_tlb_size, 4); + ASSERT_EQ(new_opt.encoding_type, EncodingType::kPrefix); + ASSERT_TRUE(new_opt.full_scan_mode); + ASSERT_TRUE(new_opt.store_index_in_file); + + std::unordered_map opt_map; + ASSERT_OK(StringToMap( + "user_key_len=55;bloom_bits_per_key=10;huge_page_tlb_size=8;", &opt_map)); + ASSERT_OK(GetPlainTableOptionsFromMap(table_opt, opt_map, &new_opt)); + ASSERT_EQ(new_opt.user_key_len, 55u); + ASSERT_EQ(new_opt.bloom_bits_per_key, 10); + ASSERT_EQ(new_opt.huge_page_tlb_size, 8); + + // unknown option + ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "bad_option=1", + &new_opt)); + + // unrecognized EncodingType + ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "encoding_type=kPrefixXX", + &new_opt)); +} + +TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { + Options base_options, new_options; + base_options.write_buffer_size = 20; + base_options.min_write_buffer_number_to_merge = 15; + BlockBasedTableOptions block_based_table_options; + block_based_table_options.cache_index_and_filter_blocks = true; + base_options.table_factory.reset( + NewBlockBasedTableFactory(block_based_table_options)); + + // Register an Env with object registry. 
+ ObjectLibrary::Default()->AddFactory( + "CustomEnvDefault", + [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + static CustomEnv env(Env::Default()); + return &env; + }); + + ASSERT_OK(GetOptionsFromString( + base_options, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;" + "bottommost_compression_opts=5:6:7;create_if_missing=true;max_open_files=" + "1;" + "rate_limiter_bytes_per_sec=1024;env=CustomEnvDefault", + &new_options)); + + ASSERT_EQ(new_options.compression_opts.window_bits, 4); + ASSERT_EQ(new_options.compression_opts.level, 5); + ASSERT_EQ(new_options.compression_opts.strategy, 6); + ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.parallel_threads, 1u); + ASSERT_EQ(new_options.compression_opts.enabled, false); + ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption); + ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_options.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.parallel_threads, 1u); + ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false); + ASSERT_EQ(new_options.write_buffer_size, 10U); + ASSERT_EQ(new_options.max_write_buffer_number, 16); + + auto new_block_based_table_options = + new_options.table_factory->GetOptions(); + ASSERT_NE(new_block_based_table_options, nullptr); + ASSERT_EQ(new_block_based_table_options->block_cache->GetCapacity(), + 1U << 20); + ASSERT_EQ(new_block_based_table_options->block_size, 4U); + // don't 
overwrite block based table options + ASSERT_TRUE(new_block_based_table_options->cache_index_and_filter_blocks); + + ASSERT_EQ(new_options.create_if_missing, true); + ASSERT_EQ(new_options.max_open_files, 1); + ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); + Env* newEnv = new_options.env; + ASSERT_OK(Env::LoadEnv("CustomEnvDefault", &newEnv)); + ASSERT_EQ(newEnv, new_options.env); +} + +TEST_F(OptionsOldApiTest, DBOptionsSerialization) { + Options base_options, new_options; + Random rnd(301); + + // Phase 1: Make big change in base_options + test::RandomInitDBOptions(&base_options, &rnd); + + // Phase 2: obtain a string from base_option + std::string base_options_file_content; + ASSERT_OK(GetStringFromDBOptions(&base_options_file_content, base_options)); + + // Phase 3: Set new_options from the derived string and expect + // new_options == base_options + ASSERT_OK(GetDBOptionsFromString(DBOptions(), base_options_file_content, + &new_options)); + ConfigOptions config_options; + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_options, new_options)); +} + +TEST_F(OptionsOldApiTest, ColumnFamilyOptionsSerialization) { + Options options; + ColumnFamilyOptions base_opt, new_opt; + Random rnd(302); + // Phase 1: randomly assign base_opt + // custom type options + test::RandomInitCFOptions(&base_opt, options, &rnd); + + // Phase 2: obtain a string from base_opt + std::string base_options_file_content; + ASSERT_OK( + GetStringFromColumnFamilyOptions(&base_options_file_content, base_opt)); + + // Phase 3: Set new_opt from the derived string and expect + // new_opt == base_opt + ASSERT_OK(GetColumnFamilyOptionsFromString( + ColumnFamilyOptions(), base_options_file_content, &new_opt)); + ConfigOptions config_options; + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt)); + if (base_opt.compaction_filter) { + delete base_opt.compaction_filter; + } +} +#endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE class 
OptionsParserTest : public testing::Test { public: - OptionsParserTest() { - env_.reset(new test::StringEnv(Env::Default())); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - } + OptionsParserTest() { fs_.reset(new test::StringFS(FileSystem::Default())); } protected: - std::unique_ptr env_; - std::unique_ptr fs_; + std::shared_ptr fs_; }; TEST_F(OptionsParserTest, Comment) { @@ -1148,15 +3100,19 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(*parser.db_opt(), db_opt)); + ConfigOptions exact; + exact.input_strings_escaped = false; + exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; + ASSERT_OK( + RocksDBOptionsParser::VerifyDBOptions(exact, *parser.db_opt(), db_opt)); ASSERT_EQ(parser.NumColumnFamilies(), 1U); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( - *parser.GetCFOptions("default"), cf_opt)); + exact, *parser.GetCFOptions("default"), cf_opt)); } TEST_F(OptionsParserTest, ExtraSpace) { @@ -1175,7 +3131,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1193,10 +3149,11 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( 
parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); + ; } TEST_F(OptionsParserTest, DoubleDBOptions) { @@ -1222,7 +3179,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1250,7 +3207,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1280,7 +3237,7 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1309,7 +3266,7 @@ "[CFOptions \"something_else\"]\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1377,8 +3334,12 @@ " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->DeleteFile(kTestFileName); - env_->WriteToNewFile(kTestFileName, options_file_content); + auto s = fs_->FileExists(kTestFileName, IOOptions(), nullptr); + ASSERT_TRUE(s.ok() || s.IsNotFound()); + 
if (s.ok()) { + ASSERT_OK(fs_->DeleteFile(kTestFileName, IOOptions(), nullptr)); + } + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -1426,7 +3387,7 @@ snprintf(buffer, kLength - 1, file_template.c_str(), iv.c_str()); parser.Reset(); - env_->WriteToNewFile(iv, buffer); + ASSERT_OK(fs_->WriteToNewFile(iv, buffer)); ASSERT_NOK(parser.Parse(iv, fs_.get(), false, 0 /* readahead_size */)); } @@ -1435,7 +3396,7 @@ for (auto vv : valid_versions) { snprintf(buffer, kLength - 1, file_template.c_str(), vv.c_str()); parser.Reset(); - env_->WriteToNewFile(vv, buffer); + ASSERT_OK(fs_->WriteToNewFile(vv, buffer)); ASSERT_OK(parser.Parse(vv, fs_.get(), false, 0 /* readahead_size */)); } } @@ -1444,41 +3405,43 @@ ColumnFamilyOptions* base_cf_opt, const ColumnFamilyOptions* new_cf_opt, const std::unordered_map* new_cf_opt_map) { std::string name_buffer; - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ConfigOptions config_options; + config_options.input_strings_escaped = false; + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, *base_cf_opt, + *new_cf_opt, new_cf_opt_map)); // change the name of merge operator back-and-forth { - auto* merge_operator = dynamic_cast( - base_cf_opt->merge_operator.get()); + auto* merge_operator = base_cf_opt->merge_operator + ->CheckedCast(); if (merge_operator != nullptr) { name_buffer = merge_operator->Name(); // change the name and expect non-ok status merge_operator->SetName("some-other-name"); ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // change the name back and expect ok status merge_operator->SetName(name_buffer); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } // change the name of the compaction filter factory back-and-forth { auto* compaction_filter_factory = - dynamic_cast( - base_cf_opt->compaction_filter_factory.get()); + base_cf_opt->compaction_filter_factory + ->CheckedCast(); if (compaction_filter_factory != nullptr) { name_buffer = compaction_filter_factory->Name(); // change the name and expect non-ok status compaction_filter_factory->SetName("some-other-name"); ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // change the name back and expect ok status compaction_filter_factory->SetName(name_buffer); - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } @@ -1489,11 +3452,11 @@ base_cf_opt->compaction_filter = nullptr; // set compaction_filter to nullptr and expect non-ok status ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // set the value back and expect ok status base_cf_opt->compaction_filter = tmp_compaction_filter; - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } @@ -1504,11 +3467,11 @@ base_cf_opt->table_factory.reset(); // set table_factory to nullptr and expect non-ok status ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // set the value back and expect ok status base_cf_opt->table_factory = tmp_table_factory; - 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } @@ -1519,11 +3482,11 @@ base_cf_opt->memtable_factory.reset(); // set memtable_factory to nullptr and expect non-ok status ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( - *base_cf_opt, *new_cf_opt, new_cf_opt_map)); + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); // set the value back and expect ok status base_cf_opt->memtable_factory = tmp_memtable_factory; - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt, - new_cf_opt_map)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( + config_options, *base_cf_opt, *new_cf_opt, new_cf_opt_map)); } } } @@ -1542,37 +3505,37 @@ kOptionsFileName, fs_.get())); uint64_t file_size = 0; - ASSERT_OK(env_->GetFileSize(kOptionsFileName, &file_size)); + ASSERT_OK( + fs_->GetFileSize(kOptionsFileName, IOOptions(), &file_size, nullptr)); assert(file_size > 0); - + RocksDBOptionsParser parser; - env_->num_seq_file_read_ = 0; + fs_->num_seq_file_read_ = 0; size_t readahead_size = 128 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); readahead_size = 1024 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); // Tiny readahead. 8 KB is read each time. 
- env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 1 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), file_size / (8 * 1024)); - ASSERT_LT(env_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); + ASSERT_GE(fs_->num_seq_file_read_.load(), file_size / (8 * 1024)); + ASSERT_LT(fs_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); // Disable readahead means 512KB readahead. - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), - (file_size - 1) / (512 * 1024) + 1); + ASSERT_GE(fs_->num_seq_file_read_.load(), (file_size - 1) / (512 * 1024) + 1); } TEST_F(OptionsParserTest, DumpAndParse) { @@ -1607,32 +3570,35 @@ } const std::string kOptionsFileName = "test-persisted-options.ini"; + // Use default for escaped(true), unknown(false) and check (exact) + ConfigOptions config_options; ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts, kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; - ASSERT_OK( - parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); + ASSERT_OK(parser.Parse(config_options, kOptionsFileName, fs_.get())); // Make sure block-based table factory options was deserialized correctly std::shared_ptr ttf = (*parser.cf_opts())[4].table_factory; - ASSERT_EQ(BlockBasedTableFactory::kName, std::string(ttf->Name())); - const BlockBasedTableOptions& parsed_bbto = - static_cast(ttf.get())->table_options(); - ASSERT_EQ(special_bbto.block_size, parsed_bbto.block_size); + ASSERT_EQ(TableFactory::kBlockBasedTableName(), std::string(ttf->Name())); + const auto parsed_bbto = ttf->GetOptions(); + ASSERT_NE(parsed_bbto, nullptr); + ASSERT_EQ(special_bbto.block_size, parsed_bbto->block_size); ASSERT_EQ(special_bbto.cache_index_and_filter_blocks, - 
parsed_bbto.cache_index_and_filter_blocks); + parsed_bbto->cache_index_and_filter_blocks); ASSERT_OK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - base_db_opt, cf_names, base_cf_opts, kOptionsFileName, fs_.get())); + config_options, base_db_opt, cf_names, base_cf_opts, kOptionsFileName, + fs_.get())); - ASSERT_OK( - RocksDBOptionsParser::VerifyDBOptions(*parser.db_opt(), base_db_opt)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( + config_options, *parser.db_opt(), base_db_opt)); for (int c = 0; c < num_cf; ++c) { const auto* cf_opt = parser.GetCFOptions(cf_names[c]); ASSERT_NE(cf_opt, nullptr); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( - base_cf_opts[c], *cf_opt, &(parser.cf_opt_maps()->at(c)))); + config_options, base_cf_opts[c], *cf_opt, + &(parser.cf_opt_maps()->at(c)))); } // Further verify pointer-typed options @@ -1647,7 +3613,8 @@ base_db_opt.max_open_files++; ASSERT_NOK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - base_db_opt, cf_names, base_cf_opts, kOptionsFileName, fs_.get())); + config_options, base_db_opt, cf_names, base_cf_opts, kOptionsFileName, + fs_.get())); for (int c = 0; c < num_cf; ++c) { if (base_cf_opts[c].compaction_filter) { @@ -1671,8 +3638,8 @@ kOptionsFileName, fs_.get())); RocksDBOptionsParser parser; - ASSERT_OK( - parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); + ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, + 4096 /* readahead_size */)); { Options old_default_opts; @@ -1747,38 +3714,94 @@ ASSERT_EQ(5000, small_opts.max_open_files); } -class OptionsSanityCheckTest : public OptionsParserTest { +class OptionsSanityCheckTest : public OptionsParserTest, + public ::testing::WithParamInterface { + protected: + ConfigOptions config_options_; + public: - OptionsSanityCheckTest() {} + OptionsSanityCheckTest() { + config_options_.ignore_unknown_options = false; + config_options_.ignore_unsupported_options = GetParam(); + config_options_.input_strings_escaped = true; + } 
protected: - Status SanityCheckCFOptions(const ColumnFamilyOptions& cf_opts, - OptionsSanityCheckLevel level) { + Status SanityCheckOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts, + ConfigOptions::SanityLevel level) { + config_options_.sanity_level = level; return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( - DBOptions(), {"default"}, {cf_opts}, kOptionsFileName, fs_.get(), - level); + config_options_, db_opts, {"default"}, {cf_opts}, kOptionsFileName, + fs_.get()); } - Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { - Status s = env_->DeleteFile(kOptionsFileName); + Status SanityCheckCFOptions(const ColumnFamilyOptions& cf_opts, + ConfigOptions::SanityLevel level) { + return SanityCheckOptions(DBOptions(), cf_opts, level); + } + + void SanityCheckCFOptions(const ColumnFamilyOptions& opts, bool exact) { + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); + if (exact) { + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } else { + ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } + } + + Status SanityCheckDBOptions(const DBOptions& db_opts, + ConfigOptions::SanityLevel level) { + return SanityCheckOptions(db_opts, ColumnFamilyOptions(), level); + } + + void SanityCheckDBOptions(const DBOptions& opts, bool exact) { + ASSERT_OK(SanityCheckDBOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelNone)); + if (exact) { + ASSERT_OK( + SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } else { + ASSERT_NOK( + SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + } + } + + Status PersistOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) { + Status s = fs_->DeleteFile(kOptionsFileName, IOOptions(), nullptr); if 
(!s.ok()) { return s; } - return PersistRocksDBOptions(DBOptions(), {"default"}, {cf_opts}, + return PersistRocksDBOptions(db_opts, {"default"}, {cf_opts}, kOptionsFileName, fs_.get()); } + Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { + return PersistOptions(DBOptions(), cf_opts); + } + + Status PersistDBOptions(const DBOptions& db_opts) { + return PersistOptions(db_opts, ColumnFamilyOptions()); + } + const std::string kOptionsFileName = "OPTIONS"; }; -TEST_F(OptionsSanityCheckTest, SanityCheck) { +TEST_P(OptionsSanityCheckTest, CFOptionsSanityCheck) { ColumnFamilyOptions opts; Random rnd(301); // default ColumnFamilyOptions { ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); } // prefix_extractor @@ -1786,59 +3809,69 @@ // Okay to change prefix_extractor form nullptr to non-nullptr ASSERT_EQ(opts.prefix_extractor.get(), nullptr); opts.prefix_extractor.reset(NewCappedPrefixTransform(10)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); // use same prefix extractor but with different parameter opts.prefix_extractor.reset(NewCappedPrefixTransform(15)); - // expect pass only in kSanityLevelLooselyCompatible - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + // expect pass only in + // ConfigOptions::kSanityLevelLooselyCompatible + 
ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // repeat the test with FixedPrefixTransform opts.prefix_extractor.reset(NewFixedPrefixTransform(10)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change of prefix_extractor ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); // use same prefix extractor but with different parameter opts.prefix_extractor.reset(NewFixedPrefixTransform(15)); - // expect pass only in kSanityLevelLooselyCompatible - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + // expect pass only in + // ConfigOptions::kSanityLevelLooselyCompatible + SanityCheckCFOptions(opts, false); // Change prefix extractor from non-nullptr to nullptr opts.prefix_extractor.reset(); // expect pass as it's safe to change prefix_extractor // from non-null to null - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); } // persist the change 
ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); // table_factory { for (int tb = 0; tb <= 2; ++tb) { // change the table factory opts.table_factory.reset(test::RandomTableFactory(&rnd, tb)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + ASSERT_OK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); } } @@ -1846,32 +3879,35 @@ { // Test when going from nullptr -> merge operator opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_OK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); for (int test = 0; test < 5; ++test) { // change the merge operator opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, 
kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); } // Test when going from merge operator -> nullptr opts.merge_operator = nullptr; - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelNone)); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, true); } // compaction_filter @@ -1879,12 +3915,11 @@ for (int test = 0; test < 5; ++test) { // change the compaction filter opts.compaction_filter = test::RandomCompactionFilter(&rnd); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + SanityCheckCFOptions(opts, false); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); delete opts.compaction_filter; opts.compaction_filter = nullptr; } @@ -1896,16 +3931,57 @@ // change the compaction filter factory opts.compaction_filter_factory.reset( test::RandomCompactionFilterFactory(&rnd)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + SanityCheckCFOptions(opts, false); // persist the change ASSERT_OK(PersistCFOptions(opts)); - ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); } } } +TEST_P(OptionsSanityCheckTest, DBOptionsSanityCheck) { + DBOptions opts; + Random rnd(301); + + // default DBOptions + { + ASSERT_OK(PersistDBOptions(opts)); + ASSERT_OK( + SanityCheckDBOptions(opts, 
ConfigOptions::kSanityLevelExactMatch)); + } + + // File checksum generator + { + class MockFileChecksumGenFactory : public FileChecksumGenFactory { + public: + static const char* kClassName() { return "Mock"; } + const char* Name() const override { return kClassName(); } + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& /*context*/) override { + return nullptr; + } + }; + + // Okay to change file_checksum_gen_factory form nullptr to non-nullptr + ASSERT_EQ(opts.file_checksum_gen_factory.get(), nullptr); + opts.file_checksum_gen_factory.reset(new MockFileChecksumGenFactory()); + + // persist the change + ASSERT_OK(PersistDBOptions(opts)); + SanityCheckDBOptions(opts, config_options_.ignore_unsupported_options); + + // Change file_checksum_gen_factory from non-nullptr to nullptr + opts.file_checksum_gen_factory.reset(); + // expect pass as it's safe to change file_checksum_gen_factory + // from non-null to null + SanityCheckDBOptions(opts, false); + } + // persist the change + ASSERT_OK(PersistDBOptions(opts)); + ASSERT_OK(SanityCheckDBOptions(opts, ConfigOptions::kSanityLevelExactMatch)); +} + namespace { bool IsEscapedString(const std::string& str) { for (size_t i = 0; i < str.size(); ++i) { @@ -1992,7 +4068,635 @@ "Escape \\# and # comment together ."), "Escape \\# and"); } + +static void TestAndCompareOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, void* base_ptr, + void* comp_ptr, bool strip = false) { + std::string result, mismatch; + ASSERT_OK(opt_info.Serialize(config_options, opt_name, base_ptr, &result)); + if (strip) { + ASSERT_EQ(result.at(0), '{'); + ASSERT_EQ(result.at(result.size() - 1), '}'); + result = result.substr(1, result.size() - 2); + } + ASSERT_OK(opt_info.Parse(config_options, opt_name, result, comp_ptr)); + ASSERT_TRUE(opt_info.AreEqual(config_options, opt_name, base_ptr, comp_ptr, + &mismatch)); +} + +static void TestParseAndCompareOption(const 
ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, + void* base_ptr, void* comp_ptr, + bool strip = false) { + ASSERT_OK(opt_info.Parse(config_options, opt_name, opt_value, base_ptr)); + TestAndCompareOption(config_options, opt_info, opt_name, base_ptr, comp_ptr, + strip); +} + +template +void TestOptInfo(const ConfigOptions& config_options, OptionType opt_type, + T* base, T* comp) { + std::string result; + OptionTypeInfo opt_info(0, opt_type); + ASSERT_FALSE(opt_info.AreEqual(config_options, "base", base, comp, &result)); + ASSERT_EQ(result, "base"); + ASSERT_NE(*base, *comp); + TestAndCompareOption(config_options, opt_info, "base", base, comp); + ASSERT_EQ(*base, *comp); +} + +class OptionTypeInfoTest : public testing::Test {}; + +TEST_F(OptionTypeInfoTest, BasicTypes) { + ConfigOptions config_options; + { + bool a = true, b = false; + TestOptInfo(config_options, OptionType::kBoolean, &a, &b); + } + { + int a = 100, b = 200; + TestOptInfo(config_options, OptionType::kInt, &a, &b); + } + { + int32_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kInt32T, &a, &b); + } + { + int64_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kInt64T, &a, &b); + } + { + unsigned int a = 100, b = 200; + TestOptInfo(config_options, OptionType::kUInt, &a, &b); + } + { + uint32_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kUInt32T, &a, &b); + } + { + uint64_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kUInt64T, &a, &b); + } + { + size_t a = 100, b = 200; + TestOptInfo(config_options, OptionType::kSizeT, &a, &b); + } + { + std::string a = "100", b = "200"; + TestOptInfo(config_options, OptionType::kString, &a, &b); + } + { + double a = 1.0, b = 2.0; + TestOptInfo(config_options, OptionType::kDouble, &a, &b); + } +} + +TEST_F(OptionTypeInfoTest, TestInvalidArgs) { + ConfigOptions config_options; + bool b; + int i; + int32_t i32; + int64_t i64; + 
unsigned int u; + int32_t u32; + int64_t u64; + size_t sz; + double d; + + ASSERT_NOK(OptionTypeInfo(0, OptionType::kBoolean) + .Parse(config_options, "b", "x", &b)); + ASSERT_NOK( + OptionTypeInfo(0, OptionType::kInt).Parse(config_options, "b", "x", &i)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kInt32T) + .Parse(config_options, "b", "x", &i32)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kInt64T) + .Parse(config_options, "b", "x", &i64)); + ASSERT_NOK( + OptionTypeInfo(0, OptionType::kUInt).Parse(config_options, "b", "x", &u)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kUInt32T) + .Parse(config_options, "b", "x", &u32)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kUInt64T) + .Parse(config_options, "b", "x", &u64)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kSizeT) + .Parse(config_options, "b", "x", &sz)); + ASSERT_NOK(OptionTypeInfo(0, OptionType::kDouble) + .Parse(config_options, "b", "x", &d)); + + // Don't know how to convert Unknowns to anything else + ASSERT_NOK(OptionTypeInfo(0, OptionType::kUnknown) + .Parse(config_options, "b", "x", &d)); + + // Verify that if the parse function throws an exception, it is also trapped + OptionTypeInfo func_info(0, OptionType::kUnknown, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions&, const std::string&, + const std::string& value, void* addr) { + auto ptr = static_cast(addr); + *ptr = ParseInt(value); + return Status::OK(); + }); + ASSERT_OK(func_info.Parse(config_options, "b", "1", &i)); + ASSERT_NOK(func_info.Parse(config_options, "b", "x", &i)); +} + +TEST_F(OptionTypeInfoTest, TestParseFunc) { + OptionTypeInfo opt_info( + 0, OptionType::kUnknown, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& /*opts*/, const std::string& name, + const std::string& value, void* addr) { + auto ptr = static_cast(addr); + if (name == "Oops") { + return Status::InvalidArgument(value); + } else { + *ptr = value + " " + name; + return Status::OK(); + } + }); + 
ConfigOptions config_options; + std::string base; + ASSERT_OK(opt_info.Parse(config_options, "World", "Hello", &base)); + ASSERT_EQ(base, "Hello World"); + ASSERT_NOK(opt_info.Parse(config_options, "Oops", "Hello", &base)); +} + +TEST_F(OptionTypeInfoTest, TestSerializeFunc) { + OptionTypeInfo opt_info( + 0, OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, nullptr, + [](const ConfigOptions& /*opts*/, const std::string& name, + const void* /*addr*/, std::string* value) { + if (name == "Oops") { + return Status::InvalidArgument(name); + } else { + *value = name; + return Status::OK(); + } + }, + nullptr); + ConfigOptions config_options; + std::string base; + std::string value; + ASSERT_OK(opt_info.Serialize(config_options, "Hello", &base, &value)); + ASSERT_EQ(value, "Hello"); + ASSERT_NOK(opt_info.Serialize(config_options, "Oops", &base, &value)); +} + +TEST_F(OptionTypeInfoTest, TestEqualsFunc) { + OptionTypeInfo opt_info( + 0, OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, nullptr, nullptr, + [](const ConfigOptions& /*opts*/, const std::string& name, + const void* addr1, const void* addr2, std::string* mismatch) { + auto i1 = *(static_cast(addr1)); + auto i2 = *(static_cast(addr2)); + if (name == "LT") { + return i1 < i2; + } else if (name == "GT") { + return i1 > i2; + } else if (name == "EQ") { + return i1 == i2; + } else { + *mismatch = name + "???"; + return false; + } + }); + + ConfigOptions config_options; + int int1 = 100; + int int2 = 200; + std::string mismatch; + ASSERT_TRUE(opt_info.AreEqual(config_options, "LT", &int1, &int2, &mismatch)); + ASSERT_EQ(mismatch, ""); + ASSERT_FALSE( + opt_info.AreEqual(config_options, "GT", &int1, &int2, &mismatch)); + ASSERT_EQ(mismatch, "GT"); + ASSERT_FALSE( + opt_info.AreEqual(config_options, "NO", &int1, &int2, &mismatch)); + ASSERT_EQ(mismatch, "NO???"); +} + +TEST_F(OptionTypeInfoTest, TestOptionFlags) { + OptionTypeInfo opt_none(0, OptionType::kString, 
+ OptionVerificationType::kNormal, + OptionTypeFlags::kDontSerialize); + OptionTypeInfo opt_never(0, OptionType::kString, + OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever); + OptionTypeInfo opt_alias(0, OptionType::kString, + OptionVerificationType::kAlias, + OptionTypeFlags::kNone); + OptionTypeInfo opt_deprecated(0, OptionType::kString, + OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone); + ConfigOptions config_options; + std::string opts_str; + std::string base = "base"; + std::string comp = "comp"; + + // If marked string none, the serialization returns not supported + ASSERT_NOK(opt_none.Serialize(config_options, "None", &base, &opts_str)); + // If marked never compare, they match even when they do not + ASSERT_TRUE(opt_never.AreEqual(config_options, "Never", &base, &comp, &base)); + ASSERT_FALSE(opt_none.AreEqual(config_options, "Never", &base, &comp, &base)); + + // An alias can change the value via parse, but does nothing on serialize on + // match + std::string result; + ASSERT_OK(opt_alias.Parse(config_options, "Alias", "Alias", &base)); + ASSERT_OK(opt_alias.Serialize(config_options, "Alias", &base, &result)); + ASSERT_TRUE( + opt_alias.AreEqual(config_options, "Alias", &base, &comp, &result)); + ASSERT_EQ(base, "Alias"); + ASSERT_NE(base, comp); + + // Deprecated options do nothing on any of the commands + ASSERT_OK(opt_deprecated.Parse(config_options, "Alias", "Deprecated", &base)); + ASSERT_OK(opt_deprecated.Serialize(config_options, "Alias", &base, &result)); + ASSERT_TRUE( + opt_deprecated.AreEqual(config_options, "Alias", &base, &comp, &result)); + ASSERT_EQ(base, "Alias"); + ASSERT_NE(base, comp); +} + +TEST_F(OptionTypeInfoTest, TestCustomEnum) { + enum TestEnum { kA, kB, kC }; + std::unordered_map enum_map = { + {"A", TestEnum::kA}, + {"B", TestEnum::kB}, + {"C", TestEnum::kC}, + }; + OptionTypeInfo opt_info = OptionTypeInfo::Enum(0, &enum_map); + TestEnum e1, e2; + ConfigOptions config_options; + std::string 
result, mismatch; + + e2 = TestEnum::kA; + + ASSERT_OK(opt_info.Parse(config_options, "", "B", &e1)); + ASSERT_OK(opt_info.Serialize(config_options, "", &e1, &result)); + ASSERT_EQ(e1, TestEnum::kB); + ASSERT_EQ(result, "B"); + + ASSERT_FALSE(opt_info.AreEqual(config_options, "Enum", &e1, &e2, &mismatch)); + ASSERT_EQ(mismatch, "Enum"); + + TestParseAndCompareOption(config_options, opt_info, "", "C", &e1, &e2); + ASSERT_EQ(e2, TestEnum::kC); + + ASSERT_NOK(opt_info.Parse(config_options, "", "D", &e1)); + ASSERT_EQ(e1, TestEnum::kC); +} + +TEST_F(OptionTypeInfoTest, TestBuiltinEnum) { + ConfigOptions config_options; + for (auto iter : OptionsHelper::compaction_style_string_map) { + CompactionStyle e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kCompactionStyle), + "CompactionStyle", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::compaction_pri_string_map) { + CompactionPri e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kCompactionPri), + "CompactionPri", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::compression_type_string_map) { + CompressionType e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kCompressionType), + "CompressionType", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::compaction_stop_style_string_map) { + CompactionStopStyle e1, e2; + TestParseAndCompareOption( + config_options, OptionTypeInfo(0, OptionType::kCompactionStopStyle), + "CompactionStopStyle", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : OptionsHelper::checksum_type_string_map) { + ChecksumType e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kChecksumType), + "CheckSumType", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } + for (auto iter : 
OptionsHelper::encoding_type_string_map) { + EncodingType e1, e2; + TestParseAndCompareOption(config_options, + OptionTypeInfo(0, OptionType::kEncodingType), + "EncodingType", iter.first, &e1, &e2); + ASSERT_EQ(e1, iter.second); + } +} + +TEST_F(OptionTypeInfoTest, TestStruct) { + struct Basic { + int i = 42; + std::string s = "Hello"; + }; + + struct Extended { + int j = 11; + Basic b; + }; + + std::unordered_map basic_type_map = { + {"i", {offsetof(struct Basic, i), OptionType::kInt}}, + {"s", {offsetof(struct Basic, s), OptionType::kString}}, + }; + OptionTypeInfo basic_info = OptionTypeInfo::Struct( + "b", &basic_type_map, 0, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable); + + std::unordered_map extended_type_map = { + {"j", {offsetof(struct Extended, j), OptionType::kInt}}, + {"b", OptionTypeInfo::Struct( + "b", &basic_type_map, offsetof(struct Extended, b), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"m", OptionTypeInfo::Struct( + "m", &basic_type_map, offsetof(struct Extended, b), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}, + }; + OptionTypeInfo extended_info = OptionTypeInfo::Struct( + "e", &extended_type_map, 0, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable); + Extended e1, e2; + ConfigOptions config_options; + std::string mismatch; + TestParseAndCompareOption(config_options, basic_info, "b", "{i=33;s=33}", + &e1.b, &e2.b); + ASSERT_EQ(e1.b.i, 33); + ASSERT_EQ(e1.b.s, "33"); + + TestParseAndCompareOption(config_options, basic_info, "b.i", "44", &e1.b, + &e2.b); + ASSERT_EQ(e1.b.i, 44); + + TestParseAndCompareOption(config_options, basic_info, "i", "55", &e1.b, + &e2.b); + ASSERT_EQ(e1.b.i, 55); + + e1.b.i = 0; + + ASSERT_FALSE( + basic_info.AreEqual(config_options, "b", &e1.b, &e2.b, &mismatch)); + ASSERT_EQ(mismatch, "b.i"); + mismatch.clear(); + ASSERT_FALSE( + basic_info.AreEqual(config_options, "b.i", &e1.b, &e2.b, &mismatch)); + ASSERT_EQ(mismatch, "b.i"); + mismatch.clear(); + 
ASSERT_FALSE( + basic_info.AreEqual(config_options, "i", &e1.b, &e2.b, &mismatch)); + ASSERT_EQ(mismatch, "b.i"); + mismatch.clear(); + + e1 = e2; + ASSERT_NOK(basic_info.Parse(config_options, "b", "{i=33;s=33;j=44}", &e1.b)); + ASSERT_NOK(basic_info.Parse(config_options, "b.j", "44", &e1.b)); + ASSERT_NOK(basic_info.Parse(config_options, "j", "44", &e1.b)); + + TestParseAndCompareOption(config_options, extended_info, "e", + "b={i=55;s=55}; j=22;", &e1, &e2); + ASSERT_EQ(e1.b.i, 55); + ASSERT_EQ(e1.j, 22); + ASSERT_EQ(e1.b.s, "55"); + TestParseAndCompareOption(config_options, extended_info, "e.b", + "{i=66;s=66;}", &e1, &e2); + ASSERT_EQ(e1.b.i, 66); + ASSERT_EQ(e1.j, 22); + ASSERT_EQ(e1.b.s, "66"); + TestParseAndCompareOption(config_options, extended_info, "e.b.i", "77", &e1, + &e2); + ASSERT_EQ(e1.b.i, 77); + ASSERT_EQ(e1.j, 22); + ASSERT_EQ(e1.b.s, "66"); +} + +TEST_F(OptionTypeInfoTest, TestVectorType) { + OptionTypeInfo vec_info = OptionTypeInfo::Vector( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kString}); + std::vector vec1, vec2; + std::string mismatch; + + ConfigOptions config_options; + TestParseAndCompareOption(config_options, vec_info, "v", "a:b:c:d", &vec1, + &vec2); + ASSERT_EQ(vec1.size(), 4); + ASSERT_EQ(vec1[0], "a"); + ASSERT_EQ(vec1[1], "b"); + ASSERT_EQ(vec1[2], "c"); + ASSERT_EQ(vec1[3], "d"); + vec1[3] = "e"; + ASSERT_FALSE(vec_info.AreEqual(config_options, "v", &vec1, &vec2, &mismatch)); + ASSERT_EQ(mismatch, "v"); + + // Test vectors with inner brackets + TestParseAndCompareOption(config_options, vec_info, "v", "a:{b}:c:d", &vec1, + &vec2); + ASSERT_EQ(vec1.size(), 4); + ASSERT_EQ(vec1[0], "a"); + ASSERT_EQ(vec1[1], "b"); + ASSERT_EQ(vec1[2], "c"); + ASSERT_EQ(vec1[3], "d"); + + OptionTypeInfo bar_info = OptionTypeInfo::Vector( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kString}, '|'); + TestParseAndCompareOption(config_options, vec_info, "v", "x|y|z", &vec1, + &vec2); 
+ // Test vectors with inner vector + TestParseAndCompareOption(config_options, bar_info, "v", + "a|{b1|b2}|{c1|c2|{d1|d2}}", &vec1, &vec2, false); + ASSERT_EQ(vec1.size(), 3); + ASSERT_EQ(vec1[0], "a"); + ASSERT_EQ(vec1[1], "b1|b2"); + ASSERT_EQ(vec1[2], "c1|c2|{d1|d2}"); + + TestParseAndCompareOption(config_options, bar_info, "v", + "{a1|a2}|{b1|{c1|c2}}|d1", &vec1, &vec2, true); + ASSERT_EQ(vec1.size(), 3); + ASSERT_EQ(vec1[0], "a1|a2"); + ASSERT_EQ(vec1[1], "b1|{c1|c2}"); + ASSERT_EQ(vec1[2], "d1"); + + TestParseAndCompareOption(config_options, bar_info, "v", "{a1}", &vec1, &vec2, + false); + ASSERT_EQ(vec1.size(), 1); + ASSERT_EQ(vec1[0], "a1"); + + TestParseAndCompareOption(config_options, bar_info, "v", "{a1|a2}|{b1|b2}", + &vec1, &vec2, true); + ASSERT_EQ(vec1.size(), 2); + ASSERT_EQ(vec1[0], "a1|a2"); + ASSERT_EQ(vec1[1], "b1|b2"); +} + +TEST_F(OptionTypeInfoTest, TestStaticType) { + struct SimpleOptions { + size_t size = 0; + bool verify = true; + }; + + static std::unordered_map type_map = { + {"size", {offsetof(struct SimpleOptions, size), OptionType::kSizeT}}, + {"verify", + {offsetof(struct SimpleOptions, verify), OptionType::kBoolean}}, + }; + + ConfigOptions config_options; + SimpleOptions opts, copy; + opts.size = 12345; + opts.verify = false; + std::string str, mismatch; + + ASSERT_OK( + OptionTypeInfo::SerializeType(config_options, type_map, &opts, &str)); + ASSERT_FALSE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); + ASSERT_OK(OptionTypeInfo::ParseType(config_options, str, type_map, ©)); + ASSERT_TRUE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); +} + +class ConfigOptionsTest : public testing::Test {}; + +TEST_F(ConfigOptionsTest, EnvFromConfigOptions) { + ConfigOptions config_options; + DBOptions db_opts; + Options opts; + Env* mem_env = NewMemEnv(Env::Default()); + config_options.registry->AddLibrary("custom-env", RegisterCustomEnv, + kCustomEnvName); + + config_options.env 
= mem_env; + // First test that we can get the env as expected + ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(), kCustomEnvProp, + &db_opts)); + ASSERT_OK( + GetOptionsFromString(config_options, Options(), kCustomEnvProp, &opts)); + ASSERT_NE(config_options.env, db_opts.env); + ASSERT_EQ(opts.env, db_opts.env); + Env* custom_env = db_opts.env; + + // Now try a "bad" env" and check that nothing changed + config_options.ignore_unsupported_options = true; + ASSERT_OK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_OK(GetOptionsFromString(config_options, opts, "env=unknown", &opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + // Now try a "bad" env" ignoring unknown objects + config_options.ignore_unsupported_options = false; + ASSERT_NOK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + delete mem_env; +} +TEST_F(ConfigOptionsTest, MergeOperatorFromString) { + ConfigOptions config_options; + std::shared_ptr merge_op; + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "put", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("put")); + ASSERT_STREQ(merge_op->Name(), "PutOperator"); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "put_v1", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("PutOperator")); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "uint64add", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("uint64add")); + ASSERT_STREQ(merge_op->Name(), "UInt64AddOperator"); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "max", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("max")); + 
ASSERT_STREQ(merge_op->Name(), "MaxOperator"); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "bytesxor", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("bytesxor")); + ASSERT_STREQ(merge_op->Name(), BytesXOROperator::kClassName()); + + ASSERT_OK( + MergeOperator::CreateFromString(config_options, "sortlist", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("sortlist")); + ASSERT_STREQ(merge_op->Name(), SortList::kClassName()); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "stringappend", + &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappend")); + ASSERT_STREQ(merge_op->Name(), StringAppendOperator::kClassName()); + auto delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, ","); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, "stringappendtest", + &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappendtest")); + ASSERT_STREQ(merge_op->Name(), StringAppendTESTOperator::kClassName()); + delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, ","); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options, "id=stringappend; delimiter=||", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappend")); + ASSERT_STREQ(merge_op->Name(), StringAppendOperator::kClassName()); + delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, "||"); + + ASSERT_OK(MergeOperator::CreateFromString( + config_options, "id=stringappendtest; delimiter=&&", &merge_op)); + ASSERT_NE(merge_op, nullptr); + ASSERT_TRUE(merge_op->IsInstanceOf("stringappendtest")); + ASSERT_STREQ(merge_op->Name(), StringAppendTESTOperator::kClassName()); + delimiter = merge_op->GetOptions("Delimiter"); + ASSERT_NE(delimiter, 
nullptr); + ASSERT_EQ(*delimiter, "&&"); + + std::shared_ptr copy; + std::string mismatch; + std::string opts_str = merge_op->ToString(config_options); + + ASSERT_OK(MergeOperator::CreateFromString(config_options, opts_str, ©)); + ASSERT_TRUE(merge_op->AreEquivalent(config_options, copy.get(), &mismatch)); + ASSERT_NE(copy, nullptr); + delimiter = copy->GetOptions("Delimiter"); + ASSERT_NE(delimiter, nullptr); + ASSERT_EQ(*delimiter, "&&"); +} + +INSTANTIATE_TEST_CASE_P(OptionsSanityCheckTest, OptionsSanityCheckTest, + ::testing::Bool()); #endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/plugin/README.md mariadb-10.11.13/storage/rocksdb/rocksdb/plugin/README.md --- mariadb-10.11.11/storage/rocksdb/rocksdb/plugin/README.md 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/plugin/README.md 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +## Building external plugins together with RocksDB + +RocksDB offers several plugin interfaces for developers to customize its behavior. One difficulty developers face is how to make their plugin available to end users. The approach discussed here involves building the external code together with the RocksDB code into a single binary. Note another approach we plan to support involves loading plugins dynamically from shared libraries. + +### Discovery + +We hope developers will mention their work in "PLUGINS.md" so users can easily discover and reuse solutions for customizing RocksDB. + +### Directory organization + +External plugins will be linked according to their name into a subdirectory of "plugin/". For example, a plugin called "dedupfs" would be linked into "plugin/dedupfs/". + +### Build standard + +Currently the only supported build system are make and cmake. + +For make, files in the plugin directory ending in the .mk extension can define the following variables. 
+ +* `$(PLUGIN_NAME)_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files. +* `$(PLUGIN_NAME)_HEADERS`: these files will be installed in the RocksDB header directory. Their paths will be prefixed by "rocksdb/plugin/$(PLUGIN_NAME)/". +* `$(PLUGIN_NAME)_LDFLAGS`: these flags will be passed to the final link step. For example, library dependencies can be propagated here, or symbols can be forcibly included, e.g., for static registration. +* `$(PLUGIN_NAME)_CXXFLAGS`: these flags will be passed to the compiler. For example, they can specify locations of header files in non-standard locations. + +Users will run the usual make commands from the RocksDB directory, specifying the plugins to include in a space-separated list in the variable `ROCKSDB_PLUGINS`. + +For CMake, the CMakeLists.txt file in the plugin directory can define the following variables. + +* `${PLUGIN_NAME}_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files. +* `${PLUGIN_NAME}_COMPILE_FLAGS`: these flags will be passed to the compiler. For example, they can specify locations of header files in non-standard locations. +* `${PLUGIN_NAME}_INCLUDE_PATHS`: paths to directories to search for plugin-specific header files during compilation. +* `${PLUGIN_NAME}_LIBS`: list of library names required to build the plugin, e.g. `dl`, `java`, `jvm`, `rados`, etc. CMake will generate proper flags for linking. +* `${PLUGIN_NAME}_LINK_PATHS`: list of paths for the linker to search for required libraries in additional to standard locations. +* `${PLUGIN_NAME}_CMAKE_SHARED_LINKER_FLAGS` additional linker flags used to generate shared libraries. For example, symbols can be forcibly included, e.g., for static registration. +* `${PLUGIN_NAME}_CMAKE_EXE_LINKER_FLAGS`: additional linker flags used to generate executables. For example, symbols can be forcibly included, e.g., for static registration. 
+ +Users will run the usual cmake commands, specifying the plugins to include in a space-separated list in the command line variable `ROCKSDB_PLUGINS` when invoking cmake. +``` +cmake .. -DROCKSDB_PLUGINS="dedupfs hdfs rados" +``` + +### Example + +For a working example, see [Dedupfs](https://github.com/ajkr/dedupfs). diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/jemalloc_helper.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/jemalloc_helper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/jemalloc_helper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/jemalloc_helper.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,7 +5,7 @@ #pragma once -#if defined(__clang__) +#if defined(__clang__) && defined(__GLIBC__) // glibc's `posix_memalign()` declaration specifies `throw()` while clang's // declaration does not. There is a hack in clang to make its re-declaration // compatible with glibc's if they are declared consecutively. That hack breaks @@ -38,25 +38,54 @@ #else +// definitions for compatibility with older versions of jemalloc +#if !defined(JEMALLOC_ALLOCATOR) +#define JEMALLOC_ALLOCATOR +#endif +#if !defined(JEMALLOC_RESTRICT_RETURN) +#define JEMALLOC_RESTRICT_RETURN +#endif +#if !defined(JEMALLOC_NOTHROW) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#endif +#if !defined(JEMALLOC_ALLOC_SIZE) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#endif +#endif + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. 
-extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); -extern "C" void* rallocx(void*, size_t, int) __attribute__((__weak__)); -extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__weak__)); -extern "C" size_t sallocx(const void*, int) __attribute__((__weak__)); -extern "C" void dallocx(void*, int) __attribute__((__weak__)); -extern "C" void sdallocx(void*, size_t, int) __attribute__((__weak__)); -extern "C" size_t nallocx(size_t, int) __attribute__((__weak__)); -extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) - __attribute__((__weak__)); -extern "C" int mallctlnametomib(const char*, size_t*, size_t*) - __attribute__((__weak__)); -extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, - size_t) __attribute__((__weak__)); -extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, - const char*) __attribute__((__weak__)); -extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) - JEMALLOC_CXX_THROW __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) + __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int) + JEMALLOC_ATTR(pure) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *, + size_t) __attribute__((__weak__)); +extern "C" int 
JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *, + size_t *) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *, + size_t *, void *, size_t) + __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW +malloc_stats_print(void (*)(void *, const char *), void *, const char *) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW +malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW + __attribute__((__weak__)); // Check if Jemalloc is linked with the binary. Note the main program might be // using a different memory allocator even this method return true. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/lang.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/lang.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/lang.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/lang.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifndef FALLTHROUGH_INTENDED +#if defined(__clang__) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#else +#define FALLTHROUGH_INTENDED do {} while (0) +#endif +#endif + +// ASAN (Address sanitizer) + +#if defined(__clang__) +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __has_feature(address_sanitizer) +#endif // defined(__has_feature) +#else // __clang__ +#ifdef __SANITIZE_ADDRESS__ +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __SANITIZE_ADDRESS__ +#endif // __clang__ + +#ifdef ROCKSDB_VALGRIND_RUN +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // ROCKSDB_VALGRIND_RUN + +// Coding guidelines say to avoid static objects with non-trivial destructors, +// because it's easy to cause trouble (UB) in static destruction. This +// macro makes it easier to define static objects that are normally never +// destructed, except are destructed when running under ASAN. This should +// avoid unexpected, unnecessary destruction behavior in production. 
+// Note that constructor arguments can be provided as in +// STATIC_AVOID_DESTRUCTION(Foo, foo)(arg1, arg2); +#ifdef MUST_FREE_HEAP_ALLOCATIONS +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type name +constexpr bool kMustFreeHeapAllocations = true; +#else +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type& name = *new Type +constexpr bool kMustFreeHeapAllocations = false; +#endif + +// TSAN (Thread sanitizer) + +// For simplicity, standardize on the GCC define +#if defined(__clang__) +#if defined(__has_feature) && __has_feature(thread_sanitizer) +#define __SANITIZE_THREAD__ 1 +#endif // __has_feature(thread_sanitizer) +#endif // __clang__ + +#ifdef __SANITIZE_THREAD__ +#define TSAN_SUPPRESSION __attribute__((no_sanitize("thread"))) +#else +#define TSAN_SUPPRESSION +#endif // TSAN_SUPPRESSION diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_example.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_example.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_example.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_example.h 2025-05-19 16:14:27.000000000 +0000 @@ -70,7 +70,7 @@ // static void Initializer() { ... do something ...; } // ... // port::InitOnce(&init_control, &Initializer); -typedef intptr_t OnceType; +using OnceType = intptr_t; #define LEVELDB_ONCE_INIT 0 extern void InitOnce(port::OnceType*, void (*initializer)()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if !defined(OS_WIN) + #include "port/port_posix.h" #include @@ -21,8 +23,12 @@ #include #include #include + #include -#include "logging/logging.h" +#include +#include + +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -43,8 +49,8 @@ namespace port { static int PthreadCall(const char* label, int result) { - if (result != 0 && result != ETIMEDOUT) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + if (result != 0 && result != ETIMEDOUT && result != EBUSY) { + fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str()); abort(); } return result; @@ -86,6 +92,16 @@ PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } +bool Mutex::TryLock() { + bool ret = PthreadCall("trylock", pthread_mutex_trylock(&mu_)) == 0; +#ifndef NDEBUG + if (ret) { + locked_ = true; + } +#endif + return ret; +} + void Mutex::AssertHeld() { #ifndef NDEBUG assert(locked_); @@ -230,5 +246,50 @@ const size_t kPageSize = GetPageSize(); +void SetCpuPriority(ThreadId id, CpuPriority priority) { +#ifdef OS_LINUX + sched_param param; + param.sched_priority = 0; + switch (priority) { + case CpuPriority::kHigh: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, -20); + break; + case CpuPriority::kNormal: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, 0); + break; + case CpuPriority::kLow: + sched_setscheduler(id, SCHED_OTHER, ¶m); + setpriority(PRIO_PROCESS, id, 19); + break; + case CpuPriority::kIdle: + sched_setscheduler(id, SCHED_IDLE, ¶m); + break; + default: + assert(false); + } +#else + (void)id; + (void)priority; +#endif +} + +int64_t GetProcessID() { return getpid(); } + +bool GenerateRfcUuid(std::string* output) { + output->clear(); + std::ifstream f("/proc/sys/kernel/random/uuid"); + std::getline(f, /*&*/ *output); + if (output->size() == 36) { + return true; + } else { + output->clear(); + return false; + } +} + } // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/port_posix.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/port_posix.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,6 +13,7 @@ #include +#include "rocksdb/options.h" #include "rocksdb/rocksdb_namespace.h" // size_t printf formatting named in the manner of C99 standard formatting @@ -115,6 +116,9 @@ void Lock(); void Unlock(); + + bool TryLock(); + // this will assert if the mutex is not locked // it does NOT verify that mutex is held by a calling thread void AssertHeld(); @@ -123,7 +127,7 @@ friend class CondVar; pthread_mutex_t mu_; #ifndef NDEBUG - bool locked_; + bool locked_ = false; #endif }; @@ -166,7 +170,7 @@ #if defined(__i386__) || defined(__x86_64__) asm volatile("pause"); #elif defined(__aarch64__) - asm volatile("wfe"); + asm volatile("yield"); #elif defined(__powerpc64__) asm volatile("or 27,27,27"); #endif @@ -176,7 +180,7 @@ // Returns -1 if not available on this platform extern int PhysicalCoreID(); -typedef pthread_once_t OnceType; +using OnceType = pthread_once_t; #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT extern void InitOnce(OnceType* once, void (*initializer)()); @@ -189,7 +193,11 @@ #define ALIGN_AS(n) /*empty*/ #else #if defined(__s390__) +#if defined(__GNUC__) && __GNUC__ < 7 +#define CACHE_LINE_SIZE 64U +#else #define CACHE_LINE_SIZE 256U +#endif #elif defined(__powerpc__) || defined(__aarch64__) #define CACHE_LINE_SIZE 128U #else @@ -214,5 +222,15 @@ extern const size_t kPageSize; +using ThreadId = pid_t; + +extern void SetCpuPriority(ThreadId id, CpuPriority priority); + +int64_t GetProcessID(); + +// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns +// true on success or false on failure. 
+bool GenerateRfcUuid(std::string* output); + } // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,8 +5,9 @@ // #include "port/stack_trace.h" -#if defined(ROCKSDB_LITE) || !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || \ - defined(CYGWIN) || defined(OS_FREEBSD) || defined(OS_SOLARIS) +#if defined(ROCKSDB_LITE) || \ + !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \ + defined(OS_SOLARIS) || defined(OS_WIN) // noop @@ -14,6 +15,10 @@ namespace port { void InstallStackTraceHandler() {} void PrintStack(int /*first_frames_to_skip*/) {} +void PrintAndFreeStack(void* /*callstack*/, int /*num_frames*/) {} +void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { + return nullptr; +} } // namespace port } // namespace ROCKSDB_NAMESPACE @@ -27,15 +32,22 @@ #include #include +#if defined(OS_FREEBSD) +#include +#endif + +#include "port/lang.h" + namespace ROCKSDB_NAMESPACE { namespace port { namespace { -#if defined(OS_LINUX) || defined(OS_FREEBSD) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) const char* GetExecutableName() { static char name[1024]; +#if !defined(OS_FREEBSD) char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); auto read = readlink(link, name, sizeof(name) - 1); @@ -45,6 +57,17 @@ name[read] = 0; return name; } +#else + int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; + size_t namesz = sizeof(name); + + auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0); + if (-1 == ret) { + return nullptr; + } else { + return name; + } +#endif } void PrintStackTraceLine(const char* symbol, void* frame) { @@ -99,18 +122,38 @@ } // 
namespace +void PrintStack(void* frames[], int num_frames) { + auto symbols = backtrace_symbols(frames, num_frames); + + for (int i = 0; i < num_frames; ++i) { + fprintf(stderr, "#%-2d ", i); + PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]); + } + free(symbols); +} + void PrintStack(int first_frames_to_skip) { const int kMaxFrames = 100; void* frames[kMaxFrames]; auto num_frames = backtrace(frames, kMaxFrames); - auto symbols = backtrace_symbols(frames, num_frames); + PrintStack(&frames[first_frames_to_skip], num_frames - first_frames_to_skip); +} - for (int i = first_frames_to_skip; i < num_frames; ++i) { - fprintf(stderr, "#%-2d ", i - first_frames_to_skip); - PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]); - } - free(symbols); +void PrintAndFreeStack(void* callstack, int num_frames) { + PrintStack(static_cast(callstack), num_frames); + free(callstack); +} + +void* SaveStack(int* num_frames, int first_frames_to_skip) { + const int kMaxFrames = 100; + void* frames[kMaxFrames]; + + auto count = backtrace(frames, kMaxFrames); + *num_frames = count - first_frames_to_skip; + void* callstack = malloc(sizeof(void*) * *num_frames); + memcpy(callstack, &frames[first_frames_to_skip], sizeof(void*) * *num_frames); + return callstack; } static void StackTraceHandler(int sig) { @@ -119,6 +162,20 @@ fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); // skip the top three signal handler related frames PrintStack(3); + + // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of + // a signal" have failed, so just warn the user about them. +#ifdef __SANITIZE_THREAD__ + fprintf(stderr, + "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" + "==> ignorable, as they are expected when generating a stack\n" + "==> trace because of a signal under TSAN. 
Consider why the\n" + "==> signal was generated to begin with, and the stack trace\n" + "==> in the TSAN warning can be useful for that. (The stack\n" + "==> trace printed by the signal handler is likely obscured\n" + "==> by TSAN output.)\n"); +#endif + // re-signal to default handler (so we still get core dump if needed...) raise(sig); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/stack_trace.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/stack_trace.h 2025-05-19 16:14:27.000000000 +0000 @@ -18,5 +18,11 @@ // Prints stack, skips skip_first_frames frames void PrintStack(int first_frames_to_skip = 0); +// Prints the given callstack +void PrintAndFreeStack(void* callstack, int num_frames); + +// Save the current callstack +void* SaveStack(int* num_frame, int first_frames_to_skip = 0); + } // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/sys_time.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/sys_time.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/sys_time.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/sys_time.h 2025-05-19 16:14:27.000000000 +0000 @@ -23,10 +23,10 @@ namespace port { // Avoid including winsock2.h for this definition -typedef struct timeval { +struct timeval { long tv_sec; long tv_usec; -} timeval; +}; void gettimeofday(struct timeval* tv, struct timezone* tz); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_default.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_default.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_default.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_default.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,12 @@ // Use of this source code is governed by a 
BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#if defined(OS_WIN) + #include -#include #include "port/win/env_win.h" +#include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/compression_context_cache.h" #include "util/thread_local.h" @@ -24,18 +26,20 @@ // dead-lock. // in this manner any remaining threads are terminated OK. namespace { - std::once_flag winenv_once_flag; - Env* envptr; -}; -} +std::once_flag winenv_once_flag; +Env* envptr; +}; // namespace +} // namespace port Env* Env::Default() { - using namespace port; ThreadLocalPtr::InitSingletons(); CompressionContextCache::InitSingleton(); INIT_SYNC_POINT_SINGLETONS(); - std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); }); - return envptr; + std::call_once(port::winenv_once_flag, + []() { port::envptr = new port::WinEnv(); }); + return port::envptr; } } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,38 +7,40 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #include "port/win/env_win.h" -#include "port/win/win_thread.h" -#include -#include -#include +#include // _rmdir, _mkdir, _getcwd #include -#include // _getpid -#include // _access -#include // _rmdir, _mkdir, _getcwd -#include +#include // _access +#include // for uuid generation +#include #include +#include +#include +#include -#include "rocksdb/env.h" -#include "rocksdb/slice.h" - -#include "port/port.h" -#include "port/port_dirent.h" -#include "port/win/win_logger.h" -#include "port/win/io_win.h" +#include +#include +#include #include "monitoring/iostats_context_imp.h" - #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" - -#include // for uuid generation -#include -#include +#include "port/port.h" +#include "port/port_dirent.h" +#include "port/win/io_win.h" +#include "port/win/win_logger.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" #include "strsafe.h" +#include "util/string_util.h" -#include +// Undefine the functions windows might use (again)... 
+#undef GetCurrentTime +#undef DeleteFile +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { @@ -53,36 +55,26 @@ // RAII helpers for HANDLEs const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; -typedef std::unique_ptr UniqueCloseHandlePtr; +using UniqueCloseHandlePtr = std::unique_ptr; const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); }; -typedef std::unique_ptr UniqueFindClosePtr; +using UniqueFindClosePtr = std::unique_ptr; void WinthreadCall(const char* label, std::error_code result) { if (0 != result.value()) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); + fprintf(stderr, "Winthread %s: %s\n", label, + errnoStr(result.value()).c_str()); abort(); } } -} +} // namespace namespace port { - -WinEnvIO::WinEnvIO(Env* hosted_env) - : hosted_env_(hosted_env), - page_size_(4 * 1024), - allocation_granularity_(page_size_), - perf_counter_frequency_(0), +WinClock::WinClock() + : perf_counter_frequency_(0), nano_seconds_per_period_(0), GetSystemTimePreciseAsFileTime_(NULL) { - - SYSTEM_INFO sinfo; - GetSystemInfo(&sinfo); - - page_size_ = sinfo.dwPageSize; - allocation_granularity_ = sinfo.dwAllocationGranularity; - { LARGE_INTEGER qpf; BOOL ret __attribute__((__unused__)); @@ -97,39 +89,90 @@ HMODULE module = GetModuleHandle("kernel32.dll"); if (module != NULL) { - GetSystemTimePreciseAsFileTime_ = - (FnGetSystemTimePreciseAsFileTime)GetProcAddress( - module, "GetSystemTimePreciseAsFileTime"); + GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)( + void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime"); } } -WinEnvIO::~WinEnvIO() { +void WinClock::SleepForMicroseconds(int micros) { + std::this_thread::sleep_for(std::chrono::microseconds(micros)); } -Status WinEnvIO::DeleteFile(const std::string& fname) { - Status result; +std::string WinClock::TimeToString(uint64_t secondsSince1970) { + std::string result; - BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + const time_t seconds = secondsSince1970; + 
const int maxsize = 64; - if(!ret) { - auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to delete: " + fname, - lastError); + struct tm t; + errno_t ret = localtime_s(&t, &seconds); + + if (ret) { + result = std::to_string(seconds); + } else { + result.resize(maxsize); + char* p = &result[0]; + + int len = + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + assert(len > 0); + + result.resize(len); } return result; } -Status WinEnvIO::Truncate(const std::string& fname, size_t size) { - Status s; - int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); - if (result != 0) { - s = IOError("Failed to truncate: " + fname, errno); +uint64_t WinClock::NowMicros() { + if (GetSystemTimePreciseAsFileTime_ != NULL) { + // all std::chrono clocks on windows proved to return + // values that may repeat that is not good enough for some uses. + const int64_t c_UnixEpochStartTicks = 116444736000000000LL; + const int64_t c_FtToMicroSec = 10; + + // This interface needs to return system time and not + // just any microseconds because it is often used as an argument + // to TimedWait() on condition variable + FILETIME ftSystemTime; + GetSystemTimePreciseAsFileTime_(&ftSystemTime); + + LARGE_INTEGER li; + li.LowPart = ftSystemTime.dwLowDateTime; + li.HighPart = ftSystemTime.dwHighDateTime; + // Subtract unix epoch start + li.QuadPart -= c_UnixEpochStartTicks; + // Convert to microsecs + li.QuadPart /= c_FtToMicroSec; + return li.QuadPart; } - return s; + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); } -Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { +uint64_t WinClock::NowNanos() { + if (nano_seconds_per_period_ != 0) { + // all std::chrono clocks on windows have the same resolution that is only + // good enough for microseconds but not nanoseconds + // On Windows 8 and Windows 2012 Server + // 
GetSystemTimePreciseAsFileTime(¤t_time) can be used + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + // Convert performance counter to nanoseconds by precomputed ratio. + // Directly multiply nano::den with li.QuadPart causes overflow. + // Only do this when nano::den is divisible by perf_counter_frequency_, + // which most likely is the case in reality. If it's not, fall back to + // high_resolution_clock, which may be less precise under old compilers. + li.QuadPart *= nano_seconds_per_period_; + return li.QuadPart; + } + return std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch()) + .count(); +} + +Status WinClock::GetCurrentTime(int64_t* unix_time) { time_t time = std::time(nullptr); if (time == (time_t)(-1)) { return Status::NotSupported("Failed to get time"); @@ -139,10 +182,55 @@ return Status::OK(); } -Status WinEnvIO::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status s; +WinFileSystem::WinFileSystem(const std::shared_ptr& clock) + : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) { + SYSTEM_INFO sinfo; + GetSystemInfo(&sinfo); + + page_size_ = sinfo.dwPageSize; + allocation_granularity_ = sinfo.dwAllocationGranularity; +} + +const std::shared_ptr& WinFileSystem::Default() { + static std::shared_ptr fs = + std::make_shared(WinClock::Default()); + return fs; +} + +WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {} + +WinEnvIO::~WinEnvIO() {} + +IOStatus WinFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError); + } + + return result; +} + +IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { 
+ IOStatus s; + int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); + if (result != 0) { + s = IOError("Failed to truncate: " + fname, errno); + } + return s; +} + +IOStatus WinFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -176,11 +264,11 @@ return s; } -Status WinEnvIO::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +IOStatus WinFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { result->reset(); - Status s; + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data @@ -197,10 +285,10 @@ HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = RX_CreateFile( - RX_FN(fname).c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, OPEN_EXISTING, fileFlags, NULL); + hFile = + RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -211,18 +299,18 @@ UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - // CAUTION! This will map the entire file into the process address space - if (options.use_mmap_reads && sizeof(void*) >= 8) { - // Use mmap when virtual address-space is plentiful. + // CAUTION! This will map the entire file into the process address space. + // Not recommended for 32-bit platforms. 
+ if (options.use_mmap_reads) { uint64_t fileSize; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, dbg); if (s.ok()) { // Will not map empty files if (fileSize == 0) { - return IOError( - "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); + return IOError("NewRandomAccessFile failed to map empty file: " + fname, + EINVAL); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, @@ -240,11 +328,11 @@ UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); const void* mapped_region = - MapViewOfFileEx(hMap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - static_cast(fileSize), - NULL); // Let the OS choose the mapping + MapViewOfFileEx(hMap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping if (!mapped_region) { auto lastError = GetLastError(); @@ -260,26 +348,21 @@ fileGuard.release(); } } else { - result->reset(new WinRandomAccessFile(fname, hFile, - std::max(GetSectorSize(fname), - page_size_), - options)); + result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options)); fileGuard.release(); } return s; } -Status WinEnvIO::OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen) { - +IOStatus WinFileSystem::OpenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, bool reopen) { const size_t c_BufferCapacity = 64 * 1024; EnvOptions local_options(options); result->reset(); - Status s; + IOStatus s; DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; @@ -316,11 +399,11 @@ RX_FN(fname).c_str(), desired_access, // Access desired shared_mode, - NULL, // Security attributes + NULL, // Security attributes // Posix env says (reopen) ? 
(O_CREATE | O_APPEND) : O_CREAT | O_TRUNC creation_disposition, - fileFlags, // Flags - NULL); // Template File + fileFlags, // Flags + NULL); // Template File } if (INVALID_HANDLE_VALUE == hFile) { @@ -350,25 +433,36 @@ } else { // Here we want the buffer allocation to be aligned by the SSD page size // and to be a multiple of it - result->reset(new WinWritableFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), + result->reset(new WinWritableFile(fname, hFile, GetPageSize(), c_BufferCapacity, local_options)); } return s; } -Status WinEnvIO::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { - - Status s; +IOStatus WinFileSystem::NewWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, false); +} + +IOStatus WinFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, true); +} + +IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data DWORD desired_access = GENERIC_READ | GENERIC_WRITE; DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist + DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; if (options.use_direct_reads && options.use_direct_writes) { @@ -380,36 +474,27 @@ HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = - RX_CreateFile(RX_FN(fname).c_str(), - desired_access, - shared_mode, - NULL, // Security attributes - 
creation_disposition, - file_flags, - NULL); + hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode, + NULL, // Security attributes + creation_disposition, file_flags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "NewRandomRWFile failed to Create/Open: " + fname, lastError); + "NewRandomRWFile failed to Create/Open: " + fname, lastError); } UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - result->reset(new WinRandomRWFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), - options)); + result->reset(new WinRandomRWFile(fname, hFile, GetPageSize(), options)); fileGuard.release(); return s; } -Status WinEnvIO::NewMemoryMappedFileBuffer( - const std::string & fname, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewMemoryMappedFileBuffer( + const std::string& fname, std::unique_ptr* result) { + IOStatus s; result->reset(); DWORD fileFlags = FILE_ATTRIBUTE_READONLY; @@ -419,11 +504,9 @@ IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile( RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, // Open only if it exists - fileFlags, - NULL); + fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -435,21 +518,21 @@ UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); uint64_t fileSize = 0; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, nullptr); if (!s.ok()) { return s; } // Will not map empty files if (fileSize == 0) { - return Status::NotSupported( + return IOStatus::NotSupported( "NewMemoryMappedFileBuffer can not map zero length files: " + fname); } // size_t is 32-bit with 32-bit builds if (fileSize > std::numeric_limits::max()) { - return Status::NotSupported( - "The specified file size does not fit into 32-bit memory addressing: " - + fname); + 
return IOStatus::NotSupported( + "The specified file size does not fit into 32-bit memory addressing: " + + fname); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, @@ -486,15 +569,16 @@ return s; } -Status WinEnvIO::NewDirectory(const std::string& name, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewDirectory(const std::string& name, + const IOOptions& /*options*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Must be nullptr on failure result->reset(); if (!DirExists(name)) { - s = IOErrorFromWindowsError( - "open folder: " + name, ERROR_DIRECTORY); + s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY); return s; } @@ -504,10 +588,9 @@ IOSTATS_TIMER_GUARD(open_nanos); handle = RX_CreateFile( RX_FN(name).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); } @@ -522,8 +605,10 @@ return s; } -Status WinEnvIO::FileExists(const std::string& fname) { - Status s; +IOStatus WinFileSystem::FileExists(const std::string& fname, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; // TODO: This does not follow symbolic links at this point // which is consistent with _access() impl on windows // but can be added @@ -532,70 +617,74 @@ GetFileExInfoStandard, &attrs)) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_ACCESS_DENIED: - case ERROR_NOT_FOUND: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - s = Status::NotFound(); - break; - default: - s = IOErrorFromWindowsError("Unexpected error for: " + fname, - lastError); - break; + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = IOStatus::NotFound(); + break; + default: + s = 
IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; } } return s; } -Status WinEnvIO::GetChildren(const std::string& dir, - std::vector* result) { - - Status status; +IOStatus WinFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*opts*/, + std::vector* result, + IODebugContext* /*dbg*/) { + IOStatus status; result->clear(); - std::vector output; RX_WIN32_FIND_DATA data; memset(&data, 0, sizeof(data)); std::string pattern(dir); pattern.append("\\").append("*"); - HANDLE handle = RX_FindFirstFileEx(RX_FN(pattern).c_str(), - // Do not want alternative name - FindExInfoBasic, - &data, - FindExSearchNameMatch, - NULL, // lpSearchFilter - 0); + HANDLE handle = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + // Do not want alternative name + FindExInfoBasic, &data, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); if (handle == INVALID_HANDLE_VALUE) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_NOT_FOUND: - case ERROR_ACCESS_DENIED: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - status = Status::NotFound(); - break; - default: - status = IOErrorFromWindowsError( - "Failed to GetChhildren for: " + dir, lastError); + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = IOStatus::NotFound(); + break; + default: + status = IOErrorFromWindowsError("Failed to GetChhildren for: " + dir, + lastError); } return status; } UniqueFindClosePtr fc(handle, FindCloseFunc); - if (result->capacity() > 0) { - output.reserve(result->capacity()); - } - // For safety data.cFileName[MAX_PATH - 1] = 0; while (true) { - auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); - output.emplace_back(FN_TO_RX(x)); - BOOL ret =- RX_FindNextFile(handle, &data); + // filter out '.' and '..' 
directory entries + // which appear only on some platforms + const bool ignore = + ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) && + (RX_FNCMP(data.cFileName, ".") == 0 || + RX_FNCMP(data.cFileName, "..") == 0); + if (!ignore) { + auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + result->push_back(FN_TO_RX(x)); + } + + BOOL ret = -RX_FindNextFile(handle, &data); // If the function fails the return value is zero // and non-zero otherwise. Not TRUE or FALSE. if (ret == FALSE) { @@ -604,24 +693,27 @@ } data.cFileName[MAX_PATH - 1] = 0; } - output.swap(*result); return status; } -Status WinEnvIO::CreateDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDir(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } return result; } -Status WinEnvIO::CreateDirIfMissing(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; if (DirExists(name)) { return result; @@ -631,30 +723,32 @@ if (!ret) { auto lastError = GetLastError(); if (lastError != ERROR_ALREADY_EXISTS) { - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } else { - result = - Status::IOError(name + ": exists but is not a directory"); + result = IOStatus::IOError(name + ": exists but is not a directory"); } } return result; } -Status WinEnvIO::DeleteDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::DeleteDir(const std::string& name, + const 
IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to remove dir: " + name, - lastError); + result = + IOErrorFromWindowsError("Failed to remove dir: " + name, lastError); } return result; } -Status WinEnvIO::GetFileSize(const std::string& fname, - uint64_t* size) { - Status s; +IOStatus WinFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* size, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, @@ -670,7 +764,7 @@ return s; } -uint64_t WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { +uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) { const uint64_t c_FileTimePerSecond = 10000000U; // UNIX epoch starts on 1970-01-01T00:00:00Z // Windows FILETIME starts on 1601-01-01T00:00:00Z @@ -684,31 +778,35 @@ li.LowPart = ftTime.dwLowDateTime; uint64_t result = - (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; + (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; return result; } -Status WinEnvIO::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - Status s; +IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*opts*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, - &attrs)) { + &attrs)) { *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); } else { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Can not get file modification time for: " + fname, lastError); + "Can not get file modification time for: " + fname, lastError); *file_mtime = 0; } return s; } -Status WinEnvIO::RenameFile(const std::string& src, - const 
std::string& target) { - Status result; +IOStatus WinFileSystem::RenameFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; // rename() is not capable of replacing the existing file as on Linux // so use OS API directly @@ -725,14 +823,16 @@ return result; } -Status WinEnvIO::LinkFile(const std::string& src, - const std::string& target) { - Status result; +IOStatus WinFileSystem::LinkFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; - if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { DWORD lastError = GetLastError(); if (lastError == ERROR_NOT_SAME_DEVICE) { - return Status::NotSupported("No cross FS links allowed"); + return IOStatus::NotSupported("No cross FS links allowed"); } std::string text("Failed to link: "); @@ -744,12 +844,14 @@ return result; } -Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { - Status s; - HANDLE handle = RX_CreateFile( - RX_FN(fname).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); +IOStatus WinFileSystem::NumFileLinks(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* count, + IODebugContext* /*dbg*/) { + IOStatus s; + HANDLE handle = + RX_CreateFile(RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); if (INVALID_HANDLE_VALUE == handle) { auto lastError = GetLastError(); @@ -770,26 +872,27 @@ return s; } -Status WinEnvIO::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { +IOStatus WinFileSystem::AreFilesSame(const std::string& first, + const std::string& second, + const IOOptions& /*opts*/, bool* res, + IODebugContext* /*dbg*/) { // For 
MinGW builds #if (_WIN32_WINNT == _WIN32_WINNT_VISTA) - Status s = Status::NotSupported(); + IOStatus s = IOStatus::NotSupported(); #else assert(res != nullptr); - Status s; + IOStatus s; if (res == nullptr) { - s = Status::InvalidArgument("res"); + s = IOStatus::InvalidArgument("res"); return s; } // 0 - for access means read metadata HANDLE file_1 = RX_CreateFile( RX_FN(first).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_1) { @@ -801,9 +904,9 @@ HANDLE file_2 = RX_CreateFile( RX_FN(second).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_2) { @@ -823,9 +926,9 @@ return s; } - FILE_ID_INFO FileInfo_2; - result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, - sizeof(FileInfo_2)); + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); if (!result) { auto lastError = GetLastError(); @@ -834,9 +937,9 @@ } if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { - *res = (0 == memcmp(FileInfo_1.FileId.Identifier, - FileInfo_2.FileId.Identifier, - sizeof(FileInfo_1.FileId.Identifier))); + *res = + (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); } else { *res = false; } @@ -844,12 +947,13 @@ return s; } -Status WinEnvIO::LockFile(const std::string& lockFname, - FileLock** lock) { +IOStatus WinFileSystem::LockFile(const std::string& lockFname, + const 
IOOptions& /*opts*/, FileLock** lock, + IODebugContext* /*dbg*/) { assert(lock != nullptr); *lock = NULL; - Status result; + IOStatus result; // No-sharing, this is a LOCK file const DWORD ExclusiveAccessON = 0; @@ -861,15 +965,14 @@ { IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile(RX_FN(lockFname).c_str(), - (GENERIC_READ | GENERIC_WRITE), - ExclusiveAccessON, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create lock file: " + lockFname, lastError); + result = IOErrorFromWindowsError("Failed to create lock file: " + lockFname, + lastError); } else { *lock = new WinFileLock(hFile); } @@ -877,8 +980,9 @@ return result; } -Status WinEnvIO::UnlockFile(FileLock* lock) { - Status result; +IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; assert(lock != nullptr); @@ -887,8 +991,9 @@ return result; } -Status WinEnvIO::GetTestDirectory(std::string* result) { - +IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts, + std::string* result, + IODebugContext* dbg) { std::string output; const char* env = getenv("TEST_TMPDIR"); @@ -903,21 +1008,23 @@ output = "c:\\tmp"; } } - CreateDir(output); + CreateDir(output, opts, dbg); output.append("\\testrocksdb-"); - output.append(std::to_string(_getpid())); + output.append(std::to_string(GetCurrentProcessId())); - CreateDir(output); + CreateDir(output, opts, dbg); output.swap(*result); - return Status::OK(); + return IOStatus::OK(); } -Status WinEnvIO::NewLogger(const std::string& fname, - std::shared_ptr* result) { - Status s; +IOStatus WinFileSystem::NewLogger(const std::string& fname, + const IOOptions& /*opts*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -950,64 
+1057,25 @@ // Set creation, last access and last write time to the same value SetFileTime(hFile, &ft, &ft, &ft); } - result->reset(new WinLogger(&WinEnvThreads::gettid, hosted_env_, hFile)); + result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile)); } return s; } -uint64_t WinEnvIO::NowMicros() { - - if (GetSystemTimePreciseAsFileTime_ != NULL) { - // all std::chrono clocks on windows proved to return - // values that may repeat that is not good enough for some uses. - const int64_t c_UnixEpochStartTicks = 116444736000000000LL; - const int64_t c_FtToMicroSec = 10; - - // This interface needs to return system time and not - // just any microseconds because it is often used as an argument - // to TimedWait() on condition variable - FILETIME ftSystemTime; - GetSystemTimePreciseAsFileTime_(&ftSystemTime); - - LARGE_INTEGER li; - li.LowPart = ftSystemTime.dwLowDateTime; - li.HighPart = ftSystemTime.dwHighDateTime; - // Subtract unix epoch start - li.QuadPart -= c_UnixEpochStartTicks; - // Convert to microsecs - li.QuadPart /= c_FtToMicroSec; - return li.QuadPart; +IOStatus WinFileSystem::IsDirectory(const std::string& path, + const IOOptions& /*opts*/, bool* is_dir, + IODebugContext* /*dbg*/) { + BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str()); + if (is_dir) { + *is_dir = ret ? true : false; } - using namespace std::chrono; - return duration_cast(system_clock::now().time_since_epoch()) - .count(); -} - -uint64_t WinEnvIO::NowNanos() { - if (nano_seconds_per_period_ != 0) { - // all std::chrono clocks on windows have the same resolution that is only - // good enough for microseconds but not nanoseconds - // On Windows 8 and Windows 2012 Server - // GetSystemTimePreciseAsFileTime(¤t_time) can be used - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - // Convert performance counter to nanoseconds by precomputed ratio. - // Directly multiply nano::den with li.QuadPart causes overflow. 
- // Only do this when nano::den is divisible by perf_counter_frequency_, - // which most likely is the case in reality. If it's not, fall back to - // high_resolution_clock, which may be less precise under old compilers. - li.QuadPart *= nano_seconds_per_period_; - return li.QuadPart; - } - using namespace std::chrono; - return duration_cast( - high_resolution_clock::now().time_since_epoch()).count(); + return IOStatus::OK(); } Status WinEnvIO::GetHostName(char* name, uint64_t len) { Status s; DWORD nSize = static_cast( - std::min(len, std::numeric_limits::max())); + std::min(len, std::numeric_limits::max())); if (!::GetComputerNameA(name, &nSize)) { auto lastError = GetLastError(); @@ -1019,15 +1087,17 @@ return s; } -Status WinEnvIO::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { +IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* dbg) { // Check if we already have an absolute path // For test compatibility we will consider starting slash as an // absolute path if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || - !RX_PathIsRelative(RX_FN(db_path).c_str())) { + !RX_PathIsRelative(RX_FN(db_path).c_str())) { *output_path = db_path; - return Status::OK(); + return IOStatus::OK(); } RX_FILESTRING result; @@ -1046,42 +1116,19 @@ std::string res = FN_TO_RX(result); res.swap(*output_path); - return Status::OK(); + return IOStatus::OK(); } -std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) { - std::string result; - - const time_t seconds = secondsSince1970; - const int maxsize = 64; - - struct tm t; - errno_t ret = localtime_s(&t, &seconds); - - if (ret) { - result = std::to_string(seconds); - } else { - result.resize(maxsize); - char* p = &result[0]; - - int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, - t.tm_min, t.tm_sec); - assert(len > 0); - - 
result.resize(len); - } - - return result; -} - -Status WinEnvIO::GetFreeSpace(const std::string& path, uint64_t* diskfree) { +IOStatus WinFileSystem::GetFreeSpace(const std::string& path, + const IOOptions& /*options*/, + uint64_t* diskfree, + IODebugContext* /*dbg*/) { assert(diskfree != nullptr); ULARGE_INTEGER freeBytes; BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL); if (f) { *diskfree = freeBytes.QuadPart; - return Status::OK(); + return IOStatus::OK(); } else { DWORD lastError = GetLastError(); return IOErrorFromWindowsError("Failed to get free space: " + path, @@ -1089,9 +1136,9 @@ } } -EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForLogWrite( + const FileOptions& file_options, const DBOptions& db_options) const { + FileOptions optimized(file_options); // These two the same as default optimizations optimized.bytes_per_sync = db_options.wal_bytes_per_sync; optimized.writable_file_max_buffer_size = @@ -1105,42 +1152,52 @@ return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestWrite( + const FileOptions& options) const { + FileOptions optimized(options); optimized.use_mmap_writes = false; optimized.use_direct_reads = false; return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestRead( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestRead( + const FileOptions& file_options) const { + FileOptions optimized(file_options); optimized.use_mmap_writes = false; optimized.use_direct_reads = false; return optimized; } // Returns true iff the named directory exists and is a directory. 
-bool WinEnvIO::DirExists(const std::string& dname) { +bool WinFileSystem::DirExists(const std::string& dname) { WIN32_FILE_ATTRIBUTE_DATA attrs; - if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), - GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard, + &attrs)) { return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); } return false; } -size_t WinEnvIO::GetSectorSize(const std::string& fname) { +size_t WinFileSystem::GetSectorSize(const std::string& fname) { size_t sector_size = kSectorSize; - if (RX_PathIsRelative(RX_FN(fname).c_str())) { - return sector_size; - } - // obtain device handle char devicename[7] = "\\\\.\\"; - int erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + int erresult = 0; + if (RX_PathIsRelative(RX_FN(fname).c_str())) { + RX_FILESTRING rx_current_dir; + rx_current_dir.resize(MAX_PATH); + DWORD len = RX_GetCurrentDirectory(MAX_PATH, &rx_current_dir[0]); + if (len == 0) { + return sector_size; + } + rx_current_dir.resize(len); + std::string current_dir = FN_TO_RX(rx_current_dir); + erresult = + strncat_s(devicename, sizeof(devicename), current_dir.c_str(), 2); + } else { + erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + } if (erresult) { assert(false); @@ -1161,21 +1218,21 @@ BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; DWORD output_bytes = 0; - BOOL ret = DeviceIoControl(hDevice, IOCTL_STORAGE_QUERY_PROPERTY, - &spropertyquery, sizeof(spropertyquery), - output_buffer, - sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), - &output_bytes, nullptr); + BOOL ret = DeviceIoControl( + hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery, + sizeof(spropertyquery), output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr); if (ret) { - sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR *)output_buffer)->BytesPerLogicalSector; + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer) 
+ ->BytesPerLogicalSector; } else { - // many devices do not support StorageProcessAlignmentProperty. Any failure here and we - // fall back to logical alignment + // many devices do not support StorageProcessAlignmentProperty. Any failure + // here and we fall back to logical alignment - DISK_GEOMETRY_EX geometry = { 0 }; - ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, - nullptr, 0, &geometry, sizeof(geometry), &output_bytes, nullptr); + DISK_GEOMETRY_EX geometry = {0}; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0, + &geometry, sizeof(geometry), &output_bytes, nullptr); if (ret) { sector_size = geometry.Geometry.BytesPerSector; } @@ -1193,17 +1250,15 @@ WinEnvThreads::WinEnvThreads(Env* hosted_env) : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { - for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( - static_cast(pool_id)); + static_cast(pool_id)); // This allows later initializing the thread-local-env of each thread. 
thread_pools_[pool_id].SetHostEnv(hosted_env); } } WinEnvThreads::~WinEnvThreads() { - WaitForJoin(); for (auto& thpool : thread_pools_) { @@ -1211,9 +1266,9 @@ } } -void WinEnvThreads::Schedule(void(*function)(void*), void* arg, +void WinEnvThreads::Schedule(void (*function)(void*), void* arg, Env::Priority pri, void* tag, - void(*unschedFunction)(void* arg)) { + void (*unschedFunction)(void* arg)) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); } @@ -1224,26 +1279,26 @@ namespace { - struct StartThreadState { - void(*user_function)(void*); - void* arg; - }; +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; - void* StartThreadWrapper(void* arg) { - std::unique_ptr state( +void* StartThreadWrapper(void* arg) { + std::unique_ptr state( reinterpret_cast(arg)); - state->user_function(state->arg); - return nullptr; - } - + state->user_function(state->arg); + return nullptr; } -void WinEnvThreads::StartThread(void(*function)(void* arg), void* arg) { +} // namespace + +void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) { std::unique_ptr state(new StartThreadState); state->user_function = function; state->arg = arg; try { - ROCKSDB_NAMESPACE::port::WindowsThread th(&StartThreadWrapper, state.get()); + Thread th(&StartThreadWrapper, state.get()); state.release(); std::lock_guard lg(mu_); @@ -1273,10 +1328,6 @@ uint64_t WinEnvThreads::GetThreadID() const { return gettid(); } -void WinEnvThreads::SleepForMicroseconds(int micros) { - std::this_thread::sleep_for(std::chrono::microseconds(micros)); -} - void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].SetBackgroundThreads(num); @@ -1295,12 +1346,14 @@ ///////////////////////////////////////////////////////////////////////// // WinEnv -WinEnv::WinEnv() : winenv_io_(this), 
winenv_threads_(this) { +WinEnv::WinEnv() + : CompositeEnv(WinFileSystem::Default(), WinClock::Default()), + winenv_io_(this), + winenv_threads_(this) { // Protected member of the base class thread_status_updater_ = CreateThreadStatusUpdater(); } - WinEnv::~WinEnv() { // All threads must be joined before the deletion of // thread_status_updater_. @@ -1312,151 +1365,12 @@ return thread_status_updater_->GetThreadList(thread_list); } -Status WinEnv::DeleteFile(const std::string& fname) { - return winenv_io_.DeleteFile(fname); -} - -Status WinEnv::Truncate(const std::string& fname, size_t size) { - return winenv_io_.Truncate(fname, size); -} - -Status WinEnv::GetCurrentTime(int64_t* unix_time) { - return winenv_io_.GetCurrentTime(unix_time); -} - -Status WinEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewSequentialFile(fname, result, options); -} - -Status WinEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewRandomAccessFile(fname, result, options); -} - -Status WinEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, false); -} - -Status WinEnv::ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, true); -} - -Status WinEnv::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { - return winenv_io_.NewRandomRWFile(fname, result, options); -} - -Status WinEnv::NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) { - return winenv_io_.NewMemoryMappedFileBuffer(fname, result); -} - -Status WinEnv::NewDirectory(const std::string& name, - std::unique_ptr* result) { - return winenv_io_.NewDirectory(name, result); -} - 
-Status WinEnv::FileExists(const std::string& fname) { - return winenv_io_.FileExists(fname); -} - -Status WinEnv::GetChildren(const std::string& dir, - std::vector* result) { - return winenv_io_.GetChildren(dir, result); -} - -Status WinEnv::CreateDir(const std::string& name) { - return winenv_io_.CreateDir(name); -} - -Status WinEnv::CreateDirIfMissing(const std::string& name) { - return winenv_io_.CreateDirIfMissing(name); -} - -Status WinEnv::DeleteDir(const std::string& name) { - return winenv_io_.DeleteDir(name); -} - -Status WinEnv::GetFileSize(const std::string& fname, - uint64_t* size) { - return winenv_io_.GetFileSize(fname, size); -} - -Status WinEnv::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - return winenv_io_.GetFileModificationTime(fname, file_mtime); -} - -Status WinEnv::RenameFile(const std::string& src, - const std::string& target) { - return winenv_io_.RenameFile(src, target); -} - -Status WinEnv::LinkFile(const std::string& src, - const std::string& target) { - return winenv_io_.LinkFile(src, target); -} - -Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { - return winenv_io_.NumFileLinks(fname, count); -} - -Status WinEnv::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { - return winenv_io_.AreFilesSame(first, second, res); -} - -Status WinEnv::LockFile(const std::string& lockFname, - FileLock** lock) { - return winenv_io_.LockFile(lockFname, lock); -} - -Status WinEnv::UnlockFile(FileLock* lock) { - return winenv_io_.UnlockFile(lock); -} - -Status WinEnv::GetTestDirectory(std::string* result) { - return winenv_io_.GetTestDirectory(result); -} - -Status WinEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - return winenv_io_.NewLogger(fname, result); -} - -uint64_t WinEnv::NowMicros() { - return winenv_io_.NowMicros(); -} - -uint64_t WinEnv::NowNanos() { - return winenv_io_.NowNanos(); -} - Status WinEnv::GetHostName(char* name, uint64_t 
len) { return winenv_io_.GetHostName(name, len); } -Status WinEnv::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { - return winenv_io_.GetAbsolutePath(db_path, output_path); -} - -std::string WinEnv::TimeToString(uint64_t secondsSince1970) { - return winenv_io_.TimeToString(secondsSince1970); -} - -void WinEnv::Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, - void(*unschedFunction)(void* arg)) { +void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction); } @@ -1464,32 +1378,20 @@ return winenv_threads_.UnSchedule(arg, pri); } -void WinEnv::StartThread(void(*function)(void* arg), void* arg) { +void WinEnv::StartThread(void (*function)(void* arg), void* arg) { return winenv_threads_.StartThread(function, arg); } -void WinEnv::WaitForJoin() { - return winenv_threads_.WaitForJoin(); -} +void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); } -unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { +unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { return winenv_threads_.GetThreadPoolQueueLen(pri); } -uint64_t WinEnv::GetThreadID() const { - return winenv_threads_.GetThreadID(); -} - -Status WinEnv::GetFreeSpace(const std::string& path, uint64_t* diskfree) { - return winenv_io_.GetFreeSpace(path, diskfree); -} - -void WinEnv::SleepForMicroseconds(int micros) { - return winenv_threads_.SleepForMicroseconds(micros); -} +uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); } // Allow increasing the number of worker threads. 
-void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { +void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { return winenv_threads_.SetBackgroundThreads(num, pri); } @@ -1497,44 +1399,21 @@ return winenv_threads_.GetBackgroundThreads(pri); } -void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { +void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); } -EnvOptions WinEnv::OptimizeForManifestRead( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestRead(env_options); -} - -EnvOptions WinEnv::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - return winenv_io_.OptimizeForLogWrite(env_options, db_options); -} - -EnvOptions WinEnv::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestWrite(env_options); -} - } // namespace port -std::string Env::GenerateUniqueId() { - std::string result; - - UUID uuid; - UuidCreateSequential(&uuid); - - RPC_CSTR rpc_str; - auto status = UuidToStringA(&uuid, &rpc_str); - (void)status; - assert(status == RPC_S_OK); - - result = reinterpret_cast(rpc_str); - - status = RpcStringFreeA(&rpc_str); - assert(status == RPC_S_OK); - - return result; +std::shared_ptr FileSystem::Default() { + return port::WinFileSystem::Default(); } +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr clock = + std::make_shared(); + return clock; +} } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/env_win.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/env_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,30 +15,30 @@ // multiple threads without any external synchronization. 
#pragma once - -#include "port/win/win_thread.h" -#include -#include "util/threadpool_imp.h" - #include #include #include -#include #include +#include +#include "env/composite_env_wrapper.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "util/threadpool_imp.h" #undef GetCurrentTime #undef DeleteFile -#undef GetTickCount +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { namespace port { // Currently not designed for inheritance but rather a replacement class WinEnvThreads { -public: - + public: explicit WinEnvThreads(Env* hosted_env); ~WinEnvThreads(); @@ -46,12 +46,12 @@ WinEnvThreads(const WinEnvThreads&) = delete; WinEnvThreads& operator=(const WinEnvThreads&) = delete; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)); + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)); int UnSchedule(void* arg, Env::Priority pri); - void StartThread(void(*function)(void* arg), void* arg); + void StartThread(void (*function)(void* arg), void* arg); void WaitForJoin(); @@ -61,287 +61,236 @@ uint64_t GetThreadID() const; - void SleepForMicroseconds(int micros); - // Allow increasing the number of worker threads. 
void SetBackgroundThreads(int num, Env::Priority pri); int GetBackgroundThreads(Env::Priority pri); void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri); -private: - + private: Env* hosted_env_; mutable std::mutex mu_; std::vector thread_pools_; - std::vector threads_to_join_; - + std::vector threads_to_join_; }; -// Designed for inheritance so can be re-used -// but certain parts replaced -class WinEnvIO { -public: - explicit WinEnvIO(Env* hosted_env); - - virtual ~WinEnvIO(); - - virtual Status DeleteFile(const std::string& fname); - - Status Truncate(const std::string& fname, size_t size); - - virtual Status GetCurrentTime(int64_t* unix_time); - - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // Helper for NewWritable and ReopenWritableFile - virtual Status OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen); - - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // The returned file will only be accessed by one thread at a time. 
- virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - virtual Status NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result); - - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result); - - virtual Status FileExists(const std::string& fname); - - virtual Status GetChildren(const std::string& dir, - std::vector* result); - - virtual Status CreateDir(const std::string& name); - - virtual Status CreateDirIfMissing(const std::string& name); - - virtual Status DeleteDir(const std::string& name); +class WinClock : public SystemClock { + public: + WinClock(); + virtual ~WinClock() {} + + static const char* kClassName() { return "WindowsClock"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } - virtual Status GetFileSize(const std::string& fname, uint64_t* size); - - static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); - - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime); - - virtual Status RenameFile(const std::string& src, const std::string& target); - - virtual Status LinkFile(const std::string& src, const std::string& target); - - virtual Status NumFileLinks(const std::string& /*fname*/, - uint64_t* /*count*/); - - virtual Status AreFilesSame(const std::string& first, - const std::string& second, bool* res); - - virtual Status LockFile(const std::string& lockFname, FileLock** lock); - - virtual Status UnlockFile(FileLock* lock); - - virtual Status GetTestDirectory(std::string* result); - - virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result); - - virtual uint64_t NowMicros(); - - virtual uint64_t NowNanos(); - - virtual Status GetHostName(char* name, uint64_t len); - - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path); - - // This seems to clash with a macro on 
Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - virtual Status GetFreeSpace(const std::string& path, uint64_t* diskfree); - - virtual std::string TimeToString(uint64_t secondsSince1970); - - virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const; - - virtual EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const; + uint64_t NowMicros() override; - virtual EnvOptions OptimizeForManifestRead( - const EnvOptions& env_options) const; + uint64_t NowNanos() override; - size_t GetPageSize() const { return page_size_; } + // 0 indicates not supported + uint64_t CPUMicros() override { return 0; } + void SleepForMicroseconds(int micros) override; - size_t GetAllocationGranularity() const { return allocation_granularity_; } + Status GetCurrentTime(int64_t* unix_time) override; + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time); uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } - static size_t GetSectorSize(const std::string& fname); + private: + using FnGetSystemTimePreciseAsFileTime = VOID(WINAPI*)(LPFILETIME); -private: - // Returns true iff the named directory exists and is a directory. 
- virtual bool DirExists(const std::string& dname); - - typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); - - Env* hosted_env_; - size_t page_size_; - size_t allocation_granularity_; uint64_t perf_counter_frequency_; uint64_t nano_seconds_per_period_; FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; -class WinEnv : public Env { -public: - WinEnv(); +class WinFileSystem : public FileSystem { + public: + static const std::shared_ptr& Default(); + WinFileSystem(const std::shared_ptr& clock); + ~WinFileSystem() {} + static const char* kClassName() { return "WinFS"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const { return kDefaultName(); } - ~WinEnv(); - - Status DeleteFile(const std::string& fname) override; - - Status Truncate(const std::string& fname, size_t size) override; - - Status GetCurrentTime(int64_t* unix_time) override; + static size_t GetSectorSize(const std::string& fname); + size_t GetPageSize() const { return page_size_; } + size_t GetAllocationGranularity() const { return allocation_granularity_; } - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. - // - // The returned file will only be accessed by one thread at a time. 
- Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // The returned file will only be accessed by one thread at a time. - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; - Status NewMemoryMappedFileBuffer( + // Truncate the named file to the specified size. + IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override; + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override; - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status FileExists(const std::string& fname) override; - - Status GetChildren(const std::string& dir, - std::vector* result) override; - - Status CreateDir(const std::string& name) override; - - Status CreateDirIfMissing(const std::string& name) override; - - Status DeleteDir(const std::string& name) override; - - Status GetFileSize(const std::string& fname, - uint64_t* size) override; - - Status GetFileModificationTime(const std::string& 
fname, - uint64_t* file_mtime) override; - - Status RenameFile(const std::string& src, - const std::string& target) override; + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& f, const IOOptions& io_opts, + IODebugContext* dbg) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + // Delete the specified directory. + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + // Store the size of fname in *file_size. + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + // Store the last modification time of fname in *file_mtime. + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + // Rename file src to target. + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + + // Hard Link file src to target. 
+ IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NumFileLinks(const std::string& /*fname*/, + const IOOptions& /*options*/, uint64_t* /*count*/, + IODebugContext* /*dbg*/) override; + IOStatus AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, + const IOOptions& /*options*/, bool* /*res*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can override to provide custom + // logger. + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. 
+ IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; + IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options, + bool* is_dir, IODebugContext* /*dgb*/) override; + // This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + IOStatus GetFreeSpace(const std::string& /*path*/, + const IOOptions& /*options*/, uint64_t* /*diskfree*/, + IODebugContext* /*dbg*/) override; + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override; + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override; + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override; - Status LinkFile(const std::string& src, - const std::string& target) override; + protected: + static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); + // Returns true iff the named directory exists and is a directory. 
- Status NumFileLinks(const std::string& fname, uint64_t* count) override; + virtual bool DirExists(const std::string& dname); + // Helper for NewWritable and ReopenWritableFile + virtual IOStatus OpenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + bool reopen); - Status AreFilesSame(const std::string& first, - const std::string& second, bool* res) override; + private: + std::shared_ptr clock_; + size_t page_size_; + size_t allocation_granularity_; +}; - Status LockFile(const std::string& lockFname, FileLock** lock) override; +// Designed for inheritance so can be re-used +// but certain parts replaced +class WinEnvIO { + public: + explicit WinEnvIO(Env* hosted_env); - Status UnlockFile(FileLock* lock) override; + virtual ~WinEnvIO(); - Status GetTestDirectory(std::string* result) override; + virtual Status GetHostName(char* name, uint64_t len); - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; + private: + Env* hosted_env_; +}; - uint64_t NowMicros() override; +class WinEnv : public CompositeEnv { + public: + WinEnv(); - uint64_t NowNanos() override; + ~WinEnv(); + static const char* kClassName() { return "WinEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kDefaultName(); } Status GetHostName(char* name, uint64_t len) override; - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override; - - std::string TimeToString(uint64_t secondsSince1970) override; - Status GetThreadList(std::vector* thread_list) override; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)) override; + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) override; int UnSchedule(void* arg, Env::Priority pri) override; - void StartThread(void(*function)(void* arg), void* arg) override; + 
void StartThread(void (*function)(void* arg), void* arg) override; - void WaitForJoin(); + void WaitForJoin() override; unsigned int GetThreadPoolQueueLen(Env::Priority pri) const override; uint64_t GetThreadID() const override; - // This seems to clash with a macro on Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override; - - void SleepForMicroseconds(int micros) override; - // Allow increasing the number of worker threads. void SetBackgroundThreads(int num, Env::Priority pri) override; int GetBackgroundThreads(Env::Priority pri) override; void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; - EnvOptions OptimizeForManifestRead( - const EnvOptions& env_options) const override; - - EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const override; - - EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const override; - - -private: - + private: WinEnvIO winenv_io_; WinEnvThreads winenv_threads_; }; -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,8 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #include "port/win/io_win.h" +#include "env_win.h" #include "monitoring/iostats_context_imp.h" #include "test_util/sync_point.h" #include "util/aligned_buffer.h" @@ -18,36 +21,28 @@ namespace port { /* -* DirectIOHelper -*/ + * DirectIOHelper + */ namespace { const size_t kSectorSize = 512; -inline -bool IsPowerOfTwo(const size_t alignment) { +inline bool IsPowerOfTwo(const size_t alignment) { return ((alignment) & (alignment - 1)) == 0; } -inline -bool IsSectorAligned(const size_t off) { - return (off & (kSectorSize - 1)) == 0; -} - -inline -bool IsAligned(size_t alignment, const void* ptr) { +inline bool IsAligned(size_t alignment, const void* ptr) { return ((uintptr_t(ptr)) & (alignment - 1)) == 0; } -} - +} // namespace std::string GetWindowsErrSz(DWORD err) { LPSTR lpMsgBuf; FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, - 0, // Default language - reinterpret_cast(&lpMsgBuf), 0, NULL); + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast(&lpMsgBuf), 0, NULL); std::string Err = lpMsgBuf; LocalFree(lpMsgBuf); @@ -67,21 +62,20 @@ // Because all the reads/writes happen by the specified offset, the caller in // theory should not // rely on the current file offset. 
-Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written) { - - Status s; +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + IOStatus s; bytes_written = 0; size_t num_bytes = data.size(); if (num_bytes > std::numeric_limits::max()) { // May happen in 64-bit builds where size_t is 64-bits but // long is still 32-bit, but that's the API here at the moment - return Status::InvalidArgument("num_bytes is too large for a single write: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single write: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -90,11 +84,12 @@ DWORD bytesWritten = 0; - if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast(num_bytes), - &bytesWritten, &overlapped)) { + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), + static_cast(num_bytes), &bytesWritten, + &overlapped)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), - lastError); + lastError); } else { bytes_written = bytesWritten; } @@ -103,18 +98,17 @@ } // See comments for pwrite above -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read) { - - Status s; +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + IOStatus s; bytes_read = 0; if (num_bytes > std::numeric_limits::max()) { - return Status::InvalidArgument("num_bytes is too large for a single read: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single read: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -123,13 
+117,14 @@ DWORD bytesRead = 0; - if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast(num_bytes), - &bytesRead, &overlapped)) { + if (FALSE == ReadFile(file_data->GetFileHandle(), src, + static_cast(num_bytes), &bytesRead, + &overlapped)) { auto lastError = GetLastError(); // EOF is OK with zero bytes read if (lastError != ERROR_HANDLE_EOF) { s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), - lastError); + lastError); } } else { bytes_read = bytesRead; @@ -141,35 +136,34 @@ // SetFileInformationByHandle() is capable of fast pre-allocates. // However, this does not change the file end position unless the file is // truncated and the pre-allocated space is not considered filled with zeros. -Status fallocate(const std::string& filename, HANDLE hFile, - uint64_t to_size) { - Status status; +IOStatus fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + IOStatus status; FILE_ALLOCATION_INFO alloc_info; alloc_info.AllocationSize.QuadPart = to_size; if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, - sizeof(FILE_ALLOCATION_INFO))) { + sizeof(FILE_ALLOCATION_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError( - "Failed to pre-allocate space: " + filename, lastError); + "Failed to pre-allocate space: " + filename, lastError); } return status; } -Status ftruncate(const std::string& filename, HANDLE hFile, - uint64_t toSize) { - Status status; +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) { + IOStatus status; FILE_END_OF_FILE_INFO end_of_file; end_of_file.EndOfFile.QuadPart = toSize; if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, - sizeof(FILE_END_OF_FILE_INFO))) { + sizeof(FILE_END_OF_FILE_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, - lastError); + lastError); } return status; @@ -189,6 +183,17 @@ return 0; } 
+WinFileData::WinFileData(const std::string& filename, HANDLE hFile, + bool direct_io) + : filename_(filename), + hFile_(hFile), + use_direct_io_(direct_io), + sector_size_(WinFileSystem::GetSectorSize(filename)) {} + +bool WinFileData::IsSectorAligned(const size_t off) const { + return (off & (sector_size_ - 1)) == 0; +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // WinMmapReadableFile @@ -210,9 +215,11 @@ assert(ret); } -Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; +IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + IOStatus s; if (offset > length_) { *result = Slice(); @@ -220,13 +227,12 @@ } else if (offset + n > length_) { n = length_ - static_cast(offset); } - *result = - Slice(reinterpret_cast(mapped_region_)+offset, n); + *result = Slice(reinterpret_cast(mapped_region_) + offset, n); return s; } -Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { @@ -236,20 +242,19 @@ /////////////////////////////////////////////////////////////////////////////// /// WinMmapFile - // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O -Status WinMmapFile::TruncateFile(uint64_t toSize) { +IOStatus WinMmapFile::TruncateFile(uint64_t toSize) { return ftruncate(filename_, hFile_, toSize); } -Status WinMmapFile::UnmapCurrentRegion() { - Status status; +IOStatus WinMmapFile::UnmapCurrentRegion() { + IOStatus status; if (mapped_begin_ != nullptr) { if (!::UnmapViewOfFile(mapped_begin_)) { status = IOErrorFromWindowsError( - "Failed to unmap file view: " + 
filename_, GetLastError()); + "Failed to unmap file view: " + filename_, GetLastError()); } // Move on to the next portion of the file @@ -269,16 +274,16 @@ return status; } -Status WinMmapFile::MapNewRegion() { - - Status status; +IOStatus WinMmapFile::MapNewRegion(const IOOptions& options, + IODebugContext* dbg) { + IOStatus status; assert(mapped_begin_ == nullptr); size_t minDiskSize = static_cast(file_offset_) + view_size_; if (minDiskSize > reserved_size_) { - status = Allocate(file_offset_, view_size_); + status = Allocate(file_offset_, view_size_, options, dbg); if (!status.ok()) { return status; } @@ -286,7 +291,6 @@ // Need to remap if (hMap_ == NULL || reserved_size_ > mapping_size_) { - if (hMap_ != NULL) { // Unmap the previous one BOOL ret __attribute__((__unused__)); @@ -299,18 +303,18 @@ mappingSize.QuadPart = reserved_size_; hMap_ = CreateFileMappingA( - hFile_, - NULL, // Security attributes - PAGE_READWRITE, // There is not a write only mode for mapping - mappingSize.HighPart, // Enable mapping the whole file but the actual - // amount mapped is determined by MapViewOfFile - mappingSize.LowPart, - NULL); // Mapping name + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name if (NULL == hMap_) { return IOErrorFromWindowsError( - "WindowsMmapFile failed to create file mapping for: " + filename_, - GetLastError()); + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); } mapping_size_ = reserved_size_; @@ -321,13 +325,13 @@ // View must begin at the granularity aligned offset mapped_begin_ = reinterpret_cast( - MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, - view_size_, NULL)); + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); if 
(!mapped_begin_) { status = IOErrorFromWindowsError( - "WindowsMmapFile failed to map file view: " + filename_, - GetLastError()); + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); } else { mapped_end_ = mapped_begin_ + view_size_; dst_ = mapped_begin_; @@ -337,15 +341,15 @@ return status; } -Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { +IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { return fallocate(filename_, hFile_, spaceToReserve); } WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, size_t allocation_granularity, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, false), - WritableFile(options), + FSWritableFile(options), hMap_(NULL), page_size_(page_size), allocation_granularity_(allocation_granularity), @@ -371,17 +375,19 @@ // View size must be both the multiple of allocation_granularity AND the // page size and the granularity is usually a multiple of a page size. 
- const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + const size_t viewSize = + 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode view_size_ = Roundup(viewSize, allocation_granularity_); } WinMmapFile::~WinMmapFile() { if (hFile_) { - this->Close(); + this->Close(IOOptions(), nullptr); } } -Status WinMmapFile::Append(const Slice& data) { +IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { const char* src = data.data(); size_t left = data.size(); @@ -390,9 +396,9 @@ size_t avail = mapped_end_ - dst_; if (avail == 0) { - Status s = UnmapCurrentRegion(); + IOStatus s = UnmapCurrentRegion(); if (s.ok()) { - s = MapNewRegion(); + s = MapNewRegion(options, dbg); } if (!s.ok()) { @@ -414,30 +420,31 @@ memset(dst_, 0, bytesToPad); } - return Status::OK(); + return IOStatus::OK(); } // Means Close() will properly take care of truncate // and it does not need any additional information -Status WinMmapFile::Truncate(uint64_t size) { - return Status::OK(); +IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinMmapFile::Close() { - Status s; +IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) { + IOStatus s; assert(NULL != hFile_); // We truncate to the precise size so no // uninitialized data at the end. SetEndOfFile // which we use does not write zeros and it is good. 
- uint64_t targetSize = GetFileSize(); + uint64_t targetSize = GetFileSize(options, dbg); if (mapped_begin_ != nullptr) { // Sync before unmapping to make sure everything // is on disk and there is not a lazy writing // so we are deterministic with the tests - Sync(); + Sync(options, dbg); s = UnmapCurrentRegion(); } @@ -446,14 +453,13 @@ if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to Close mapping for file: " + filename_, lastError); + "Failed to Close mapping for file: " + filename_, lastError); } hMap_ = NULL; } if (hFile_ != NULL) { - TruncateFile(targetSize); BOOL ret = ::CloseHandle(hFile_); @@ -462,18 +468,22 @@ if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to close file map handle: " + filename_, lastError); + "Failed to close file map handle: " + filename_, lastError); } } return s; } -Status WinMmapFile::Flush() { return Status::OK(); } +IOStatus WinMmapFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} // Flush only data -Status WinMmapFile::Sync() { - Status s; +IOStatus WinMmapFile::Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; // Some writes occurred since last sync if (dst_ > last_sync_) { @@ -483,15 +493,15 @@ assert(dst_ < mapped_end_); size_t page_begin = - TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); size_t page_end = - TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); // Flush only the amount of that is a multiple of pages if (!::FlushViewOfFile(mapped_begin_ + page_begin, - (page_end - page_begin) + page_size_)) { + (page_end - page_begin) + page_size_)) { s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, - GetLastError()); + GetLastError()); } else { last_sync_ = dst_; } @@ -501,16 +511,16 
@@ } /** -* Flush data as well as metadata to stable storage. -*/ -Status WinMmapFile::Fsync() { - Status s = Sync(); + * Flush data as well as metadata to stable storage. + */ +IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + IOStatus s = Sync(options, dbg); // Flush metadata if (s.ok() && pending_sync_) { if (!::FlushFileBuffers(hFile_)) { s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, - GetLastError()); + GetLastError()); } pending_sync_ = false; } @@ -519,27 +529,31 @@ } /** -* Get the size of valid data in the file. This will not match the -* size that is returned from the filesystem because we use mmap -* to extend file by map_size every time. -*/ -uint64_t WinMmapFile::GetFileSize() { + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ +uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { size_t used = dst_ - mapped_begin_; return file_offset_ + used; } -Status WinMmapFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } -Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds); +IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus status; + TEST_KILL_RANDOM("WinMmapFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); + size_t spaceToReserve = + Roundup(static_cast(offset + len), view_size_); // Nothing to do if (spaceToReserve <= reserved_size_) { 
return status; @@ -561,31 +575,34 @@ // WinSequentialFile WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, f, options.use_direct_reads) {} WinSequentialFile::~WinSequentialFile() { assert(hFile_ != INVALID_HANDLE_VALUE); } -Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { - Status s; +IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + IOStatus s; size_t r = 0; assert(result != nullptr); if (WinFileData::use_direct_io()) { - return Status::NotSupported("Read() does not support direct_io"); + return IOStatus::NotSupported("Read() does not support direct_io"); } // Windows ReadFile API accepts a DWORD. // While it is possible to read in a loop if n is too big // it is an unlikely case. if (n > std::numeric_limits::max()) { - return Status::InvalidArgument("n is too big for a single ReadFile: " - + filename_); + return IOStatus::InvalidArgument("n is too big for a single ReadFile: " + + filename_); } - DWORD bytesToRead = static_cast(n); //cast is safe due to the check above + DWORD bytesToRead = + static_cast(n); // cast is safe due to the check above DWORD bytesRead = 0; BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); if (ret != FALSE) { @@ -593,8 +610,7 @@ } else { auto lastError = GetLastError(); if (lastError != ERROR_HANDLE_EOF) { - s = IOErrorFromWindowsError("ReadFile failed: " + filename_, - lastError); + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError); } } @@ -602,99 +618,86 @@ return s; } -Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const { +IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { return pread(this, src, numBytes, offset, bytes_read); } -Status 
WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) { - - Status s; - +IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { if (!WinFileData::use_direct_io()) { - return Status::NotSupported("This function is only used for direct_io"); + return IOStatus::NotSupported("This function is only used for direct_io"); } - if (!IsSectorAligned(static_cast(offset)) || - !IsSectorAligned(n)) { - return Status::InvalidArgument( - "WinSequentialFile::PositionedRead: offset is not properly aligned"); - } + assert(IsSectorAligned(static_cast(offset))); + assert(IsSectorAligned(static_cast(n))); - size_t bytes_read = 0; // out param - s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); + size_t bytes_read = 0; // out param + IOStatus s = PositionedReadInternal(scratch, static_cast(n), offset, + bytes_read); *result = Slice(scratch, bytes_read); return s; } - -Status WinSequentialFile::Skip(uint64_t n) { - // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit - // integer. As such it is a highly unlikley case to have n so large. +IOStatus WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed + // 64-bit integer. As such it is a highly unlikley case to have n so large. 
if (n > static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" + - filename_); + return IOStatus::InvalidArgument( + "n is too large for a single SetFilePointerEx() call" + filename_); } LARGE_INTEGER li; - li.QuadPart = static_cast(n); //cast is safe due to the check above + li.QuadPart = static_cast(n); // cast is safe due to the check + // above BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); if (ret == FALSE) { auto lastError = GetLastError(); - return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, - lastError); + return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, + lastError); } - return Status::OK(); + return IOStatus::OK(); } -Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } ////////////////////////////////////////////////////////////////////////////////////////////////// /// WinRandomAccessBase -inline -Status WinRandomAccessImpl::PositionedReadInternal(char* src, - size_t numBytes, - uint64_t offset, - size_t& bytes_read) const { +inline IOStatus WinRandomAccessImpl::PositionedReadInternal( + char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const { return pread(file_base_, src, numBytes, offset, bytes_read); } -inline -WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, - size_t alignment, - const EnvOptions& options) : - file_base_(file_base), - alignment_(alignment) { - +inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const FileOptions& options) + : file_base_(file_base), + alignment_(std::max(alignment, file_base->GetSectorSize())) { assert(!options.use_mmap_reads); } -inline -Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - - Status s; - +inline 
IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, + Slice* result, + char* scratch) const { // Check buffer alignment if (file_base_->use_direct_io()) { - if (!IsSectorAligned(static_cast(offset)) || - !IsAligned(alignment_, scratch)) { - return Status::InvalidArgument( - "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); - } + assert(file_base_->IsSectorAligned(static_cast(offset))); + assert(IsAligned(alignment_, scratch)); } if (n == 0) { *result = Slice(scratch, 0); - return s; + return IOStatus::OK(); } size_t bytes_read = 0; - s = PositionedReadInternal(scratch, n, offset, bytes_read); + IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read); *result = Slice(scratch, bytes_read); return s; } @@ -704,20 +707,21 @@ WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads), WinRandomAccessImpl(this, alignment, options) {} -WinRandomAccessFile::~WinRandomAccessFile() { -} +WinRandomAccessFile::~WinRandomAccessFile() {} -Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { @@ -732,27 +736,26 @@ // WinWritableImpl // -inline -Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { - return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve); +inline IOStatus 
WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), + spaceToReserve); } -inline -WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) - : file_data_(file_data), - alignment_(alignment), - next_write_offset_(0), - reservedsize_(0) { - +inline WinWritableImpl::WinWritableImpl(WinFileData* file_data, + size_t alignment) + : file_data_(file_data), + alignment_(std::max(alignment, file_data->GetSectorSize())), + next_write_offset_(0), + reservedsize_(0) { // Query current position in case ReopenWritableFile is called // This position is only important for buffered writes // for unbuffered writes we explicitely specify the position. LARGE_INTEGER zero_move; - zero_move.QuadPart = 0; // Do not move + zero_move.QuadPart = 0; // Do not move LARGE_INTEGER pos; pos.QuadPart = 0; BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, - FILE_CURRENT); + FILE_CURRENT); // Querying no supped to fail if (ret != 0) { next_write_offset_ = pos.QuadPart; @@ -761,74 +764,62 @@ } } -inline -Status WinWritableImpl::AppendImpl(const Slice& data) { - - Status s; +inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) { + IOStatus s; if (data.size() > std::numeric_limits::max()) { - return Status::InvalidArgument("data is too long for a single write" + - file_data_->GetName()); + return IOStatus::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); } - size_t bytes_written = 0; // out param + size_t bytes_written = 0; // out param if (file_data_->use_direct_io()) { // With no offset specified we are appending // to the end of the file - assert(IsSectorAligned(next_write_offset_)); - if (!IsSectorAligned(data.size()) || - !IsAligned(static_cast(GetAlignement()), data.data())) { - s = Status::InvalidArgument( - "WriteData must be page aligned, size must be sector aligned"); - } else { - s = pwrite(file_data_, data, 
next_write_offset_, bytes_written); - } + assert(file_data_->IsSectorAligned(next_write_offset_)); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast(GetAlignment()), data.data())); + s = pwrite(file_data_, data, next_write_offset_, bytes_written); } else { - DWORD bytesWritten = 0; if (!WriteFile(file_data_->GetFileHandle(), data.data(), - static_cast(data.size()), &bytesWritten, NULL)) { + static_cast(data.size()), &bytesWritten, NULL)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to WriteFile: " + file_data_->GetName(), - lastError); + "Failed to WriteFile: " + file_data_->GetName(), lastError); } else { bytes_written = bytesWritten; } } - if(s.ok()) { + if (s.ok()) { if (bytes_written == data.size()) { // This matters for direct_io cases where // we rely on the fact that next_write_offset_ // is sector aligned next_write_offset_ += bytes_written; } else { - s = Status::IOError("Failed to write all bytes: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all bytes: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) { - - if(file_data_->use_direct_io()) { - if (!IsSectorAligned(static_cast(offset)) || - !IsSectorAligned(data.size()) || - !IsAligned(static_cast(GetAlignement()), data.data())) { - return Status::InvalidArgument( - "Data and offset must be page aligned, size must be sector aligned"); - } +inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data, + uint64_t offset) { + if (file_data_->use_direct_io()) { + assert(file_data_->IsSectorAligned(static_cast(offset))); + assert(file_data_->IsSectorAligned(data.size())); + assert(IsAligned(static_cast(GetAlignment()), data.data())); } size_t bytes_written = 0; - Status s = pwrite(file_data_, data, offset, bytes_written); + IOStatus s = pwrite(file_data_, data, offset, bytes_written); - if(s.ok()) { + if (s.ok()) { if 
(bytes_written == data.size()) { // For sequential write this would be simple // size extension by data.size() @@ -837,23 +828,21 @@ next_write_offset_ = write_end; } } else { - s = Status::IOError("Failed to write all of the requested data: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::TruncateImpl(uint64_t size) { - +inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) { // It is tempting to check for the size for sector alignment // but truncation may come at the end and there is not a requirement // for this to be sector aligned so long as we do not attempt to write // after that. The interface docs state that the behavior is undefined // in that case. - Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), - size); + IOStatus s = + ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size); if (s.ok()) { next_write_offset_ = size; @@ -861,50 +850,48 @@ return s; } -inline -Status WinWritableImpl::CloseImpl() { - - Status s; +inline IOStatus WinWritableImpl::CloseImpl() { + IOStatus s; auto hFile = file_data_->GetFileHandle(); assert(INVALID_HANDLE_VALUE != hFile); if (!::FlushFileBuffers(hFile)) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " + - file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Close() for: " + file_data_->GetName(), + lastError); } - if(!file_data_->CloseFile() && s.ok()) { + if (!file_data_->CloseFile() && s.ok()) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "CloseHandle failed for: " + file_data_->GetName(), lastError); } return s; } -inline -Status WinWritableImpl::SyncImpl() { - Status s; - if (!::FlushFileBuffers (file_data_->GetFileHandle())) { 
+inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!::FlushFileBuffers(file_data_->GetFileHandle())) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError); + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), + lastError); } return s; } - -inline -Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); +inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { + IOStatus status; + TEST_KILL_RANDOM("WinWritableFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); + size_t spaceToReserve = Roundup(static_cast(offset + len), + static_cast(alignment_)); // Nothing to do if (spaceToReserve <= reservedsize_) { return status; @@ -918,66 +905,78 @@ return status; } - //////////////////////////////////////////////////////////////////////////////// /// WinWritableFile WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, size_t /* capacity */, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_writes), WinWritableImpl(this, alignment), - WritableFile(options) { + FSWritableFile(options) { assert(!options.use_mmap_writes); } -WinWritableFile::~WinWritableFile() { -} +WinWritableFile::~WinWritableFile() {} // Indicates if the class makes use of direct I/O -bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinWritableFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinWritableFile::GetRequiredBufferAlignment() 
const { - return static_cast(GetAlignement()); + return static_cast(GetAlignment()); } -Status WinWritableFile::Append(const Slice& data) { +IOStatus WinWritableFile::Append(const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AppendImpl(data); } -Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { +IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode -Status WinWritableFile::Truncate(uint64_t size) { +IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return TruncateImpl(size); } -Status WinWritableFile::Close() { +IOStatus WinWritableFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return CloseImpl(); } - // write out the cached data to the OS cache - // This is now taken care of the WritableFileWriter -Status WinWritableFile::Flush() { - return Status::OK(); +// write out the cached data to the OS cache +// This is now taken care of the WritableFileWriter +IOStatus WinWritableFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinWritableFile::Sync() { - return SyncImpl(); +IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinWritableFile::Fsync() { return SyncImpl(); } +IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} bool WinWritableFile::IsSyncThreadSafe() const { return true; } -uint64_t WinWritableFile::GetFileSize() { +uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return GetFileNextWriteOffset(); } -Status WinWritableFile::Allocate(uint64_t offset, 
uint64_t len) { +IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AllocateImpl(offset, len); } @@ -989,36 +988,45 @@ /// WinRandomRWFile WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, - size_t alignment, const EnvOptions& options) + size_t alignment, const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads && options.use_direct_writes), WinRandomAccessImpl(this, alignment, options), WinWritableImpl(this, alignment) {} -bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinRandomRWFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinRandomRWFile::GetRequiredBufferAlignment() const { - return static_cast(GetAlignement()); + assert(WinRandomAccessImpl::GetAlignment() == + WinWritableImpl::GetAlignment()); + return static_cast(WinRandomAccessImpl::GetAlignment()); } -Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { +IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } -Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomRWFile::Flush() { - return Status::OK(); +IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinRandomRWFile::Sync() { - return SyncImpl(); +IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinRandomRWFile::Close() { +IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { 
return CloseImpl(); } @@ -1027,9 +1035,9 @@ WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { BOOL ret #if defined(_MSC_VER) - = FALSE; + = FALSE; #else - __attribute__((__unused__)); + __attribute__((__unused__)); #endif if (base_ != nullptr) { ret = ::UnmapViewOfFile(base_); @@ -1051,7 +1059,10 @@ ////////////////////////////////////////////////////////////////////////// /// WinDirectory -Status WinDirectory::Fsync() { return Status::OK(); } +IOStatus WinDirectory::Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { return GetUniqueIdFromFile(handle_, id, max_size); @@ -1065,5 +1076,7 @@ assert(ret); } -} +} // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/io_win.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/io_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,51 +9,53 @@ #pragma once #include +#include + #include #include +#include "rocksdb/file_system.h" #include "rocksdb/status.h" -#include "rocksdb/env.h" #include "util/aligned_buffer.h" - -#include +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace port { std::string GetWindowsErrSz(DWORD err); -inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { +inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) { return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) - ? Status::NoSpace(context, GetWindowsErrSz(err)) + ? IOStatus::NoSpace(context, GetWindowsErrSz(err)) : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) - ? Status::PathNotFound(context, GetWindowsErrSz(err)) - : Status::IOError(context, GetWindowsErrSz(err)); + ? 
IOStatus::PathNotFound(context, GetWindowsErrSz(err)) + : IOStatus::IOError(context, GetWindowsErrSz(err)); } -inline Status IOErrorFromLastWindowsError(const std::string& context) { +inline IOStatus IOErrorFromLastWindowsError(const std::string& context) { return IOErrorFromWindowsError(context, GetLastError()); } -inline Status IOError(const std::string& context, int err_number) { +inline IOStatus IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? IOStatus::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? IOStatus::PathNotFound(context, + errnoStr(err_number).c_str()) + : IOStatus::IOError(context, errnoStr(err_number).c_str()); } class WinFileData; -Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written); +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read); +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); -Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); +IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); -Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); @@ -65,12 +67,12 @@ // will need to be aligned (not sure there is a guarantee that the buffer // passed in is aligned). 
const bool use_direct_io_; + const size_t sector_size_; public: // We want this class be usable both for inheritance (prive // or protected) and for containment so __ctor and __dtor public - WinFileData(const std::string& filename, HANDLE hFile, bool direct_io) - : filename_(filename), hFile_(hFile), use_direct_io_(direct_io) {} + WinFileData(const std::string& filename, HANDLE hFile, bool direct_io); virtual ~WinFileData() { this->CloseFile(); } @@ -91,38 +93,46 @@ bool use_direct_io() const { return use_direct_io_; } + size_t GetSectorSize() const { return sector_size_; } + + bool IsSectorAligned(const size_t off) const; + WinFileData(const WinFileData&) = delete; WinFileData& operator=(const WinFileData&) = delete; }; -class WinSequentialFile : protected WinFileData, public SequentialFile { - +class WinSequentialFile : protected WinFileData, public FSSequentialFile { // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; -public: + public: WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options); + const FileOptions& options); ~WinSequentialFile(); WinSequentialFile(const WinSequentialFile&) = delete; WinSequentialFile& operator=(const WinSequentialFile&) = delete; - virtual Status Read(size_t n, Slice* result, char* scratch) override; - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override; + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; - virtual Status Skip(uint64_t n) override; + IOStatus Skip(uint64_t n) override; - virtual Status 
InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } }; // mmap() based random-access -class WinMmapReadableFile : private WinFileData, public RandomAccessFile { +class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { HANDLE hMap_; const void* mapped_region_; @@ -138,10 +148,11 @@ WinMmapReadableFile(const WinMmapReadableFile&) = delete; WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -150,7 +161,7 @@ // data to the file. This is safe since we either properly close the // file before reading from it, or for log files, the reading code // knows enough to skip zero suffixes. 
-class WinMmapFile : private WinFileData, public WritableFile { +class WinMmapFile : private WinFileData, public FSWritableFile { private: HANDLE hMap_; @@ -179,51 +190,59 @@ // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O - Status TruncateFile(uint64_t toSize); + IOStatus TruncateFile(uint64_t toSize); - Status UnmapCurrentRegion(); + IOStatus UnmapCurrentRegion(); - Status MapNewRegion(); + IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg); - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); public: WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, - size_t allocation_granularity, const EnvOptions& options); + size_t allocation_granularity, const FileOptions& options); ~WinMmapFile(); WinMmapFile(const WinMmapFile&) = delete; WinMmapFile& operator=(const WinMmapFile&) = delete; - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Means Close() will properly take care of truncate // and it does not need any additional information - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; // Flush only data - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; /** - * Flush data as well as metadata to stable storage. 
- */ - virtual Status Fsync() override; + * Flush data as well as metadata to stable storage. + */ + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; /** - * Get the size of valid data in the file. This will not match the - * size that is returned from the filesystem because we use mmap - * to extend file by map_size every time. - */ - virtual uint64_t GetFileSize() override; + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -231,24 +250,24 @@ class WinRandomAccessImpl { protected: WinFileData* file_base_; - size_t alignment_; + size_t alignment_; // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; WinRandomAccessImpl(WinFileData* file_base, size_t alignment, - const EnvOptions& options); + const FileOptions& options); virtual ~WinRandomAccessImpl() {} - Status ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const; + IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; size_t GetAlignment() const { return alignment_; } public: - WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; }; @@ 
-258,21 +277,24 @@ : private WinFileData, protected WinRandomAccessImpl, // Want to be able to override // PositionedReadInternal - public RandomAccessFile { + public FSRandomAccessFile { public: WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomAccessFile(); - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetRequiredBufferAlignment() const override; }; @@ -293,28 +315,29 @@ protected: WinFileData* file_data_; const uint64_t alignment_; - uint64_t next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t + next_write_offset_; // Needed because Windows does not support O_APPEND uint64_t reservedsize_; // how far we have reserved space - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); WinWritableImpl(WinFileData* file_data, size_t alignment); ~WinWritableImpl() {} - uint64_t GetAlignement() const { return alignment_; } + uint64_t GetAlignment() const { return alignment_; } - Status AppendImpl(const Slice& data); + IOStatus AppendImpl(const Slice& data); // Requires that the data is aligned as specified by // GetRequiredBufferAlignment() - Status PositionedAppendImpl(const Slice& data, uint64_t offset); + IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset); - Status 
TruncateImpl(uint64_t size); + IOStatus TruncateImpl(uint64_t size); - Status CloseImpl(); + IOStatus CloseImpl(); - Status SyncImpl(); + IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg); uint64_t GetFileNextWriteOffset() { // Double accounting now here with WritableFileWriter @@ -326,7 +349,7 @@ return next_write_offset_; } - Status AllocateImpl(uint64_t offset, uint64_t len); + IOStatus AllocateImpl(uint64_t offset, uint64_t len); public: WinWritableImpl(const WinWritableImpl&) = delete; @@ -335,32 +358,47 @@ class WinWritableFile : private WinFileData, protected WinWritableImpl, - public WritableFile { + public FSWritableFile { public: WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, - size_t capacity, const EnvOptions& options); + size_t capacity, const FileOptions& options); ~WinWritableFile(); - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Requires that the data is aligned as specified by // GetRequiredBufferAlignment() - virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + 
IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; // write out the cached data to the OS cache // This is now taken care of the WritableFileWriter - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override; + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; virtual bool IsSyncThreadSafe() const override; @@ -370,9 +408,10 @@ virtual size_t GetRequiredBufferAlignment() const override; - virtual uint64_t GetFileSize() override; + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -380,10 +419,10 @@ class WinRandomRWFile : private WinFileData, protected WinRandomAccessImpl, protected WinWritableImpl, - public RandomRWFile { + public FSRandomRWFile { public: WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomRWFile() {} @@ -397,45 +436,50 @@ // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - virtual Status Write(uint64_t offset, const Slice& data) override; + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. 
- virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() { return Sync(); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return Sync(options, dbg); + } - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { -private: - HANDLE file_handle_; - HANDLE map_handle_; -public: - WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, size_t size) : - MemoryMappedFileBuffer(base, size), - file_handle_(file_handle), - map_handle_(map_handle) {} + private: + HANDLE file_handle_; + HANDLE map_handle_; + + public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, + size_t size) + : MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} ~WinMemoryMappedBuffer() override; }; -class WinDirectory : public Directory { +class WinDirectory : public FSDirectory { HANDLE handle_; + public: explicit WinDirectory(HANDLE h) noexcept : handle_(h) { assert(handle_ != INVALID_HANDLE_VALUE); } - ~WinDirectory() { - ::CloseHandle(handle_); - } - virtual Status Fsync() override; + ~WinDirectory() { ::CloseHandle(handle_); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -452,5 +496,5 @@ private: HANDLE hFile_; }; -} +} // namespace port } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.cc 
mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,30 +7,29 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#if !defined(OS_WIN) && !defined(WIN32) && !defined(_WIN32) -#error Windows Specific Code -#endif +#if defined(OS_WIN) #include "port/win/port_win.h" +#include #include -#include "port/port_dirent.h" -#include "port/sys_time.h" - -#include +#include #include -#include #include -#include -#include #include +#include +#include +#include + +#include "port/port_dirent.h" +#include "port/sys_time.h" #ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES // utf8 <-> utf16 -#include -#include #include +#include +#include #endif #include "logging/logging.h" @@ -43,7 +42,7 @@ #ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES std::string utf16_to_utf8(const std::wstring& utf16) { - std::wstring_convert,wchar_t> convert; + std::wstring_convert, wchar_t> convert; return convert.to_bytes(utf16); } @@ -54,16 +53,17 @@ #endif void gettimeofday(struct timeval* tv, struct timezone* /* tz */) { - using namespace std::chrono; - - microseconds usNow( - duration_cast(system_clock::now().time_since_epoch())); + std::chrono::microseconds usNow( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch())); - seconds secNow(duration_cast(usNow)); + std::chrono::seconds secNow( + std::chrono::duration_cast(usNow)); tv->tv_sec = static_cast(secNow.count()); - tv->tv_usec = static_cast(usNow.count() - - duration_cast(secNow).count()); + tv->tv_usec = static_cast( + usNow.count() - + std::chrono::duration_cast(secNow).count()); } Mutex::~Mutex() {} @@ -86,20 +86,28 @@ } bool CondVar::TimedWait(uint64_t abs_time_us) { - - using namespace std::chrono; - // MSVC++ library 
implements wait_until in terms of wait_for so // we need to convert absolute wait into relative wait. - microseconds usAbsTime(abs_time_us); + std::chrono::microseconds usAbsTime(abs_time_us); - microseconds usNow( - duration_cast(system_clock::now().time_since_epoch())); - microseconds relTimeUs = - (usAbsTime > usNow) ? (usAbsTime - usNow) : microseconds::zero(); + std::chrono::microseconds usNow( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch())); + std::chrono::microseconds relTimeUs = (usAbsTime > usNow) + ? (usAbsTime - usNow) + : std::chrono::microseconds::zero(); // Caller must ensure that mutex is held prior to calling this method std::unique_lock lk(mu_->getLock(), std::adopt_lock); + + // Work around https://github.com/microsoft/STL/issues/369 +#if defined(_MSC_VER) && \ + (!defined(_MSVC_STL_UPDATE) || _MSVC_STL_UPDATE < 202008L) + if (relTimeUs == std::chrono::microseconds::zero()) { + lk.unlock(); + lk.lock(); + } +#endif #ifndef NDEBUG mu_->locked_ = false; #endif @@ -130,13 +138,12 @@ // Private structure, exposed only by pointer struct DIR { - HANDLE handle_; - bool firstread_; + HANDLE handle_; + bool firstread_; RX_WIN32_FIND_DATA data_; dirent entry_; - DIR() : handle_(INVALID_HANDLE_VALUE), - firstread_(true) {} + DIR() : handle_(INVALID_HANDLE_VALUE), firstread_(true) {} DIR(const DIR&) = delete; DIR& operator=(const DIR&) = delete; @@ -159,20 +166,19 @@ std::unique_ptr dir(new DIR); - dir->handle_ = RX_FindFirstFileEx(RX_FN(pattern).c_str(), - FindExInfoBasic, // Do not want alternative name - &dir->data_, - FindExSearchNameMatch, - NULL, // lpSearchFilter - 0); + dir->handle_ = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + FindExInfoBasic, // Do not want alternative name + &dir->data_, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); if (dir->handle_ == INVALID_HANDLE_VALUE) { return nullptr; } RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName)); - strcpy_s(dir->entry_.d_name, 
sizeof(dir->entry_.d_name), - FN_TO_RX(x).c_str()); + strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), FN_TO_RX(x).c_str()); return dir.release(); } @@ -195,7 +201,7 @@ } RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName)); - strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), + strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), FN_TO_RX(x).c_str()); return &dirp->entry_; @@ -215,7 +221,6 @@ } int Truncate(std::string path, int64_t len) { - if (len < 0) { errno = EINVAL; return -1; @@ -223,10 +228,10 @@ HANDLE hFile = RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, // Security attrs - OPEN_EXISTING, // Truncate existing file only - FILE_ATTRIBUTE_NORMAL, NULL); + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, // Security attrs + OPEN_EXISTING, // Truncate existing file only + FILE_ATTRIBUTE_NORMAL, NULL); if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); @@ -265,5 +270,34 @@ // Assume 4KB page size const size_t kPageSize = 4U * 1024U; +void SetCpuPriority(ThreadId id, CpuPriority priority) { + // Not supported + (void)id; + (void)priority; +} + +int64_t GetProcessID() { return GetCurrentProcessId(); } + +bool GenerateRfcUuid(std::string* output) { + UUID uuid; + UuidCreateSequential(&uuid); + + RPC_CSTR rpc_str; + auto status = UuidToStringA(&uuid, &rpc_str); + if (status != RPC_S_OK) { + return false; + } + + // rpc_str is nul-terminated + *output = reinterpret_cast(rpc_str); + + status = RpcStringFreeA(&rpc_str); + assert(status == RPC_S_OK); + + return true; +} + } // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/port_win.h 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/port_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -18,12 +18,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include @@ -45,7 +47,7 @@ #undef DeleteFile #ifndef _SSIZE_T_DEFINED -typedef SSIZE_T ssize_t; +using ssize_t = SSIZE_T; #endif // size_t printf formatting named in the manner of C99 standard formatting @@ -146,6 +148,16 @@ mutex_.unlock(); } + bool TryLock() { + bool ret = mutex_.try_lock(); +#ifndef NDEBUG + if (ret) { + locked_ = true; + } +#endif + return ret; + } + // this will assert if the mutex is not locked // it does NOT verify that mutex is held by a calling thread void AssertHeld() { @@ -217,9 +229,14 @@ Mutex* mu_; }; + +#ifdef _POSIX_THREADS +using Thread = std::thread; +#else // Wrapper around the platform efficient // or otherwise preferrable implementation using Thread = WindowsThread; +#endif // OnceInit type helps emulate // Posix semantics with initialization @@ -276,7 +293,7 @@ #endif static inline void AsmVolatilePause() { -#if defined(_M_IX86) || defined(_M_X64) +#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) YieldProcessor(); #endif // it would be nice to get "wfe" on ARM here @@ -285,7 +302,7 @@ extern int PhysicalCoreID(); // For Thread Local Storage abstraction -typedef DWORD pthread_key_t; +using pthread_key_t = DWORD; inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) { // Not used @@ -336,6 +353,16 @@ std::string utf16_to_utf8(const std::wstring& utf16); std::wstring utf8_to_utf16(const std::string& utf8); +using ThreadId = int; + +extern void SetCpuPriority(ThreadId id, CpuPriority priority); + +int64_t GetProcessID(); + +// Uses platform APIs to generate a 36-character RFC-4122 UUID. Returns +// true on success or false on failure. 
+bool GenerateRfcUuid(std::string* output); + } // namespace port @@ -344,6 +371,7 @@ #define RX_FILESTRING std::wstring #define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a) #define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a) +#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str()) #define RX_FNLEN(a) ::wcslen(a) #define RX_DeleteFile DeleteFileW @@ -361,12 +389,14 @@ #define RX_PathIsRelative PathIsRelativeW #define RX_GetCurrentDirectory GetCurrentDirectoryW #define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExW +#define RX_PathIsDirectory PathIsDirectoryW #else #define RX_FILESTRING std::string #define RX_FN(a) a #define FN_TO_RX(a) a +#define RX_FNCMP(a, b) strcmp(a, b) #define RX_FNLEN(a) strlen(a) #define RX_DeleteFile DeleteFileA @@ -376,7 +406,7 @@ #define RX_FindFirstFileEx FindFirstFileExA #define RX_CreateDirectory CreateDirectoryA #define RX_FindNextFile FindNextFileA -#define RX_WIN32_FIND_DATA WIN32_FIND_DATA +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA #define RX_CreateDirectory CreateDirectoryA #define RX_RemoveDirectory RemoveDirectoryA #define RX_GetFileAttributesEx GetFileAttributesExA @@ -385,6 +415,7 @@ #define RX_PathIsRelative PathIsRelativeA #define RX_GetCurrentDirectory GetCurrentDirectoryA #define RX_GetDiskFreeSpaceEx GetDiskFreeSpaceExA +#define RX_PathIsDirectory PathIsDirectoryA #endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_jemalloc.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #ifndef ROCKSDB_JEMALLOC # error This file can only be part of jemalloc aware build #endif @@ -73,3 +75,5 @@ je_free(p); } } + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,32 +10,36 @@ // Logger implementation that can be shared by all environments // where enough posix functionality is available. +#if defined(OS_WIN) + #include "port/win/win_logger.h" -#include "port/win/io_win.h" -#include +#include #include #include -#include -#include -#include "rocksdb/env.h" +#include +#include #include "monitoring/iostats_context_imp.h" #include "port/sys_time.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { namespace port { -WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, +WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level) : Logger(log_level), file_(file), gettid_(gettid), log_size_(0), last_flush_micros_(0), - env_(env), + clock_(clock), flush_pending_(false) { assert(file_ != NULL); assert(file_ != INVALID_HANDLE_VALUE); @@ -47,13 +51,11 @@ BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL); if (ret == FALSE) { std::string errSz = GetWindowsErrSz(GetLastError()); - fprintf(stderr, errSz.c_str()); + fprintf(stderr, "%s", errSz.c_str()); } } -WinLogger::~WinLogger() { - CloseInternal(); -} +WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); } Status WinLogger::CloseImpl() { return CloseInternal(); @@ -65,15 +67,13 @@ BOOL ret = FlushFileBuffers(file_); if (ret == 0) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("Failed to 
flush LOG on Close() ", - lastError); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); } ret = CloseHandle(file_); // On error the return value is zero if (ret == 0 && s.ok()) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", - lastError); + s = IOErrorFromWindowsError("Failed to flush LOG on Close() ", lastError); } file_ = INVALID_HANDLE_VALUE; closed_ = true; @@ -90,7 +90,7 @@ // for perf reasons. } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void WinLogger::Logv(const char* format, va_list ap) { @@ -163,7 +163,7 @@ &bytesWritten, NULL); if (ret == FALSE) { std::string errSz = GetWindowsErrSz(GetLastError()); - fprintf(stderr, errSz.c_str()); + fprintf(stderr, "%s", errSz.c_str()); } flush_pending_ = true; @@ -190,3 +190,5 @@ } } // namespace ROCKSDB_NAMESPACE + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_logger.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_logger.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,22 +12,21 @@ #pragma once +#include +#include + #include +#include #include "rocksdb/env.h" -#include -#include - namespace ROCKSDB_NAMESPACE { - -class Env; +class SystemClock; namespace port { - class WinLogger : public ROCKSDB_NAMESPACE::Logger { public: - WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, + WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL); virtual ~WinLogger(); @@ -54,7 +53,7 @@ uint64_t (*gettid_)(); // Return the thread id for the current thread std::atomic_size_t log_size_; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; + SystemClock* clock_; bool flush_pending_; Status CloseInternal(); diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#if defined(OS_WIN) +// Most Mingw builds support std::thread only when using posix threads. +// In that case, some of these functions will be unavailable. +// Note that we're using either WindowsThread or std::thread, depending on +// which one is available. +#ifndef _POSIX_THREADS + #include "port/win/win_thread.h" #include @@ -177,3 +184,6 @@ } } // namespace port } // namespace ROCKSDB_NAMESPACE + +#endif // !_POSIX_THREADS +#endif // OS_WIN diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/win_thread.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/win_thread.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,8 @@ #pragma once +#ifndef _POSIX_THREADS + #include #include #include @@ -23,11 +25,10 @@ // -- is that it dynamically allocates its internals that are automatically // freed when the thread terminates and not on the destruction of the // object. This makes it difficult to control the source of memory -// allocation +// allocation // - This implements Pimpl so we can easily replace the guts of the // object in our private version if necessary. 
class WindowsThread { - struct Data; std::shared_ptr data_; @@ -35,15 +36,14 @@ void Init(std::function&&); -public: - - typedef void* native_handle_type; + public: + using native_handle_type = void*; // Construct with no thread WindowsThread(); // Template constructor - // + // // This templated constructor accomplishes several things // // - Allows the class as whole to be not a template @@ -66,17 +66,12 @@ // dependent type that both checks the signature conformance to ensure // that all of the necessary arguments are provided and allows pimpl // implementation. - template::type, - WindowsThread>::value>::type> - explicit WindowsThread(Fn&& fx, Args&&... ax) : - WindowsThread() { - + template ::type, WindowsThread>::value>::type> + explicit WindowsThread(Fn&& fx, Args&&... ax) : WindowsThread() { // Use binder to create a single callable entity - auto binder = std::bind(std::forward(fx), - std::forward(ax)...); + auto binder = std::bind(std::forward(fx), std::forward(ax)...); // Use std::function to take advantage of the type erasure // so we can still hide implementation within pimpl // This also makes sure that the binder signature is compliant @@ -85,7 +80,6 @@ Init(std::move(target)); } - ~WindowsThread(); WindowsThread(const WindowsThread&) = delete; @@ -120,3 +114,4 @@ } } // namespace std +#endif // !_POSIX_THREADS diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.cc mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#if defined(OS_WIN) + #include "port/win/xpress_win.h" #include @@ -127,10 +129,9 @@ } char* Decompress(const char* input_data, size_t input_length, - int* decompress_size) { - + size_t* uncompressed_size) { assert(input_data != nullptr); - assert(decompress_size != nullptr); + assert(uncompressed_size != nullptr); if (input_length == 0) { return nullptr; @@ -183,14 +184,6 @@ assert(decompressedBufferSize > 0); - // On Windows we are limited to a 32-bit int for the - // output data size argument - // so we hopefully never get here - if (decompressedBufferSize > std::numeric_limits::max()) { - assert(false); - return nullptr; - } - // The callers are deallocating using delete[] // thus we must allocate with new[] std::unique_ptr outputBuffer(new char[decompressedBufferSize]); @@ -214,7 +207,7 @@ return nullptr; } - *decompress_size = static_cast(decompressedDataSize); + *uncompressed_size = decompressedDataSize; // Return the raw buffer to the caller supporting the tradition return outputBuffer.release(); @@ -224,3 +217,5 @@ } // namespace ROCKSDB_NAMESPACE #endif + +#endif diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.h mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/port/win/xpress_win.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/port/win/xpress_win.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,8 +20,7 @@ bool Compress(const char* input, size_t length, std::string* output); char* Decompress(const char* input_data, size_t input_length, - int* decompress_size); - + size_t* uncompressed_size); } } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/src.mk mariadb-10.11.13/storage/rocksdb/rocksdb/src.mk --- mariadb-10.11.11/storage/rocksdb/rocksdb/src.mk 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/src.mk 2025-05-19 16:14:27.000000000 +0000 @@ -1,22 +1,39 @@ # These are the 
sources from which librocksdb.a is built: LIB_SOURCES = \ + cache/cache.cc \ + cache/cache_entry_roles.cc \ + cache/cache_key.cc \ + cache/cache_reservation_manager.cc \ cache/clock_cache.cc \ cache/lru_cache.cc \ cache/sharded_cache.cc \ db/arena_wrapped_db_iter.cc \ + db/blob/blob_fetcher.cc \ + db/blob/blob_file_addition.cc \ + db/blob/blob_file_builder.cc \ + db/blob/blob_file_cache.cc \ + db/blob/blob_file_garbage.cc \ + db/blob/blob_file_meta.cc \ + db/blob/blob_file_reader.cc \ + db/blob/blob_garbage_meter.cc \ + db/blob/blob_log_format.cc \ + db/blob/blob_log_sequential_reader.cc \ + db/blob/blob_log_writer.cc \ + db/blob/prefetch_buffer_collection.cc \ db/builder.cc \ db/c.cc \ db/column_family.cc \ - db/compacted_db_impl.cc \ - db/compaction/compaction.cc \ + db/compaction/compaction.cc \ db/compaction/compaction_iterator.cc \ db/compaction/compaction_job.cc \ db/compaction/compaction_picker.cc \ db/compaction/compaction_picker_fifo.cc \ db/compaction/compaction_picker_level.cc \ - db/compaction/compaction_picker_universal.cc \ + db/compaction/compaction_picker_universal.cc \ + db/compaction/sst_partitioner.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ + db/db_impl/compacted_db_impl.cc \ db/db_impl/db_impl.cc \ db/db_impl/db_impl_compaction_flush.cc \ db/db_impl/db_impl_debug.cc \ @@ -29,7 +46,7 @@ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ - db/error_handler.cc \ + db/error_handler.cc \ db/event_helpers.cc \ db/experimental.cc \ db/external_sst_file_ingestion_job.cc \ @@ -47,6 +64,8 @@ db/memtable_list.cc \ db/merge_helper.cc \ db/merge_operator.cc \ + db/output_validator.cc \ + db/periodic_work_scheduler.cc \ db/range_del_aggregator.cc \ db/range_tombstone_fragmenter.cc \ db/repair.cc \ @@ -57,25 +76,32 @@ db/trim_history_scheduler.cc \ db/version_builder.cc \ db/version_edit.cc \ + db/version_edit_handler.cc \ db/version_set.cc \ + db/wal_edit.cc \ db/wal_manager.cc \ db/write_batch.cc \ db/write_batch_base.cc \ db/write_controller.cc \ 
db/write_thread.cc \ + env/composite_env.cc \ env/env.cc \ env/env_chroot.cc \ env/env_encryption.cc \ env/env_hdfs.cc \ env/env_posix.cc \ env/file_system.cc \ - env/fs_posix.cc \ + env/fs_posix.cc \ + env/fs_remap.cc \ + env/file_system_tracer.cc \ env/io_posix.cc \ env/mock_env.cc \ + env/unique_id_gen.cc \ file/delete_scheduler.cc \ file/file_prefetch_buffer.cc \ file/file_util.cc \ file/filename.cc \ + file/line_file_reader.cc \ file/random_access_file_reader.cc \ file/read_write_util.cc \ file/readahead_raf.cc \ @@ -88,6 +114,8 @@ memory/arena.cc \ memory/concurrent_arena.cc \ memory/jemalloc_nodump_allocator.cc \ + memory/memkind_kmem_allocator.cc \ + memory/memory_allocator.cc \ memtable/alloc_tracker.cc \ memtable/hash_linklist_rep.cc \ memtable/hash_skiplist_rep.cc \ @@ -109,20 +137,30 @@ monitoring/thread_status_util.cc \ monitoring/thread_status_util_debug.cc \ options/cf_options.cc \ + options/configurable.cc \ + options/customizable.cc \ options/db_options.cc \ options/options.cc \ options/options_helper.cc \ options/options_parser.cc \ - options/options_sanity_check.cc \ port/port_posix.cc \ + port/win/env_default.cc \ + port/win/env_win.cc \ + port/win/io_win.cc \ + port/win/port_win.cc \ + port/win/win_logger.cc \ + port/win/win_thread.cc \ port/stack_trace.cc \ table/adaptive/adaptive_table_factory.cc \ + table/block_based/binary_search_index_reader.cc \ table/block_based/block.cc \ table/block_based/block_based_filter_block.cc \ table/block_based/block_based_table_builder.cc \ table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_iterator.cc \ table/block_based/block_based_table_reader.cc \ table/block_based/block_builder.cc \ + table/block_based/block_prefetcher.cc \ table/block_based/block_prefix_index.cc \ table/block_based/data_block_hash_index.cc \ table/block_based/data_block_footer.cc \ @@ -130,11 +168,16 @@ table/block_based/filter_policy.cc \ table/block_based/flush_block_policy.cc \ 
table/block_based/full_filter_block.cc \ + table/block_based/hash_index_reader.cc \ table/block_based/index_builder.cc \ + table/block_based/index_reader_common.cc \ table/block_based/parsed_full_filter_block.cc \ table/block_based/partitioned_filter_block.cc \ + table/block_based/partitioned_index_iterator.cc \ + table/block_based/partitioned_index_reader.cc \ + table/block_based/reader_common.cc \ table/block_based/uncompression_dict_reader.cc \ - table/block_fetcher.cc \ + table/block_fetcher.cc \ table/cuckoo/cuckoo_table_builder.cc \ table/cuckoo/cuckoo_table_factory.cc \ table/cuckoo/cuckoo_table_reader.cc \ @@ -150,16 +193,23 @@ table/plain/plain_table_index.cc \ table/plain/plain_table_key_coding.cc \ table/plain/plain_table_reader.cc \ + table/sst_file_dumper.cc \ table/sst_file_reader.cc \ table/sst_file_writer.cc \ + table/table_factory.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ + table/unique_id.cc \ test_util/sync_point.cc \ test_util/sync_point_impl.cc \ test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ + trace_replay/trace_record_handler.cc \ + trace_replay/trace_record_result.cc \ + trace_replay/trace_record.cc \ trace_replay/trace_replay.cc \ trace_replay/block_cache_tracer.cc \ + trace_replay/io_tracer.cc \ util/build_version.cc \ util/coding.cc \ util/compaction_job_stats_impl.cc \ @@ -167,13 +217,16 @@ util/compression_context_cache.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ + util/crc32c_arm64.cc \ util/dynamic_bloom.cc \ util/hash.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ + util/ribbon_config.cc \ + util/regex.cc \ util/slice.cc \ - util/file_checksum_helper.cc \ + util/file_checksum_helper.cc \ util/status.cc \ util/string_util.cc \ util/thread_local.cc \ @@ -185,23 +238,27 @@ utilities/blob_db/blob_db_impl.cc \ utilities/blob_db/blob_db_impl_filesnapshot.cc \ utilities/blob_db/blob_file.cc \ - utilities/blob_db/blob_log_format.cc \ - 
utilities/blob_db/blob_log_reader.cc \ - utilities/blob_db/blob_log_writer.cc \ + utilities/cache_dump_load.cc \ + utilities/cache_dump_load_impl.cc \ utilities/cassandra/cassandra_compaction_filter.cc \ utilities/cassandra/format.cc \ utilities/cassandra/merge_operator.cc \ utilities/checkpoint/checkpoint_impl.cc \ + utilities/compaction_filters.cc \ utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \ utilities/convenience/info_log_finder.cc \ utilities/debug.cc \ utilities/env_mirror.cc \ utilities/env_timed.cc \ + utilities/fault_injection_env.cc \ + utilities/fault_injection_fs.cc \ + utilities/fault_injection_secondary_cache.cc \ utilities/leveldb_options/leveldb_options.cc \ utilities/memory/memory_util.cc \ + utilities/merge_operators.cc \ utilities/merge_operators/max.cc \ utilities/merge_operators/put.cc \ - utilities/merge_operators/sortlist.cc \ + utilities/merge_operators/sortlist.cc \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ @@ -218,6 +275,10 @@ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ + utilities/trace/replayer_impl.cc \ + utilities/transactions/lock/lock_manager.cc \ + utilities/transactions/lock/point/point_lock_tracker.cc \ + utilities/transactions/lock/point/point_lock_manager.cc \ utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/pessimistic_transaction.cc \ @@ -225,21 +286,16 @@ utilities/transactions/snapshot_checker.cc \ utilities/transactions/transaction_base.cc \ utilities/transactions/transaction_db_mutex_impl.cc \ - utilities/transactions/transaction_lock_mgr.cc \ utilities/transactions/transaction_util.cc \ utilities/transactions/write_prepared_txn.cc \ utilities/transactions/write_prepared_txn_db.cc \ 
utilities/transactions/write_unprepared_txn.cc \ utilities/transactions/write_unprepared_txn_db.cc \ utilities/ttl/db_ttl_impl.cc \ + utilities/wal_filter.cc \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ -ifeq ($(ARMCRC_SOURCE),1) -LIB_SOURCES +=\ - util/crc32c_arm64.cc -endif - ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) LIB_SOURCES_ASM =\ util/crc32c_ppc_asm.S @@ -250,7 +306,24 @@ LIB_SOURCES_C = endif +RANGE_TREE_SOURCES =\ + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc \ + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \ + utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \ + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc + TOOL_LIB_SOURCES = \ + tools/io_tracer_parser_tool.cc \ tools/ldb_cmd.cc \ tools/ldb_tool.cc \ tools/sst_dump_tool.cc \ @@ -262,24 +335,32 @@ MOCK_LIB_SOURCES = \ table/mock_table.cc \ - test_util/fault_injection_test_env.cc BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ + tools/simulated_hybrid_file_system.cc \ + +CACHE_BENCH_LIB_SOURCES = \ + cache/cache_bench_tool.cc \ STRESS_LIB_SOURCES = \ 
db_stress_tool/batched_ops_stress.cc \ db_stress_tool/cf_consistency_stress.cc \ db_stress_tool/db_stress_common.cc \ db_stress_tool/db_stress_driver.cc \ - db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_gflags.cc \ + db_stress_tool/db_stress_listener.cc \ db_stress_tool/db_stress_shared_state.cc \ + db_stress_tool/db_stress_stat.cc \ + db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_tool.cc \ + db_stress_tool/expected_state.cc \ db_stress_tool/no_batched_ops_stress.cc \ + db_stress_tool/multi_ops_txns_stress.cc \ TEST_LIB_SOURCES = \ db/db_test_util.cc \ + test_util/mock_time_env.cc \ test_util/testharness.cc \ test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ @@ -291,21 +372,59 @@ third-party/folly/folly/synchronization/ParkingLot.cpp \ third-party/folly/folly/synchronization/WaitOptions.cpp \ -MAIN_SOURCES = \ +TOOLS_MAIN_SOURCES = \ + db_stress_tool/db_stress.cc \ + tools/blob_dump.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ + tools/db_repl_stress.cc \ + tools/db_sanity_test.cc \ + tools/ldb.cc \ + tools/io_tracer_parser.cc \ + tools/sst_dump.cc \ + tools/write_stress.cc \ + tools/dump/rocksdb_dump.cc \ + tools/dump/rocksdb_undump.cc \ + tools/trace_analyzer.cc \ + tools/io_tracer_parser_tool.cc \ + +BENCH_MAIN_SOURCES = \ cache/cache_bench.cc \ + db/range_del_aggregator_bench.cc \ + memtable/memtablerep_bench.cc \ + table/table_reader_bench.cc \ + tools/db_bench.cc \ + util/filter_bench.cc \ + utilities/persistent_cache/persistent_cache_bench.cc \ + #util/log_write_bench.cc \ + +TEST_MAIN_SOURCES = \ cache/cache_test.cc \ - db_stress_tool/db_stress.cc \ + cache/cache_reservation_manager_test.cc \ + cache/lru_cache_test.cc \ + db/blob/blob_counting_iterator_test.cc \ + db/blob/blob_file_addition_test.cc \ + db/blob/blob_file_builder_test.cc \ + db/blob/blob_file_cache_test.cc \ + db/blob/blob_file_garbage_test.cc \ + db/blob/blob_file_reader_test.cc \ + db/blob/blob_garbage_meter_test.cc \ 
+ db/blob/db_blob_basic_test.cc \ + db/blob/db_blob_compaction_test.cc \ + db/blob/db_blob_corruption_test.cc \ + db/blob/db_blob_index_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ + db/compaction/clipping_iterator_test.cc \ db/compaction/compaction_iterator_test.cc \ db/compaction/compaction_job_test.cc \ db/compaction/compaction_job_stats_test.cc \ db/compaction/compaction_picker_test.cc \ + db/compaction/compaction_service_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ db/db_basic_test.cc \ - db/db_blob_index_test.cc \ + db/db_with_timestamp_basic_test.cc \ db/db_block_cache_test.cc \ db/db_bloom_filter_test.cc \ db/db_compaction_filter_test.cc \ @@ -313,62 +432,58 @@ db/db_dynamic_level_test.cc \ db/db_encryption_test.cc \ db/db_flush_test.cc \ + db/import_column_family_test.cc \ db/db_inplace_update_test.cc \ db/db_io_failure_test.cc \ db/db_iter_test.cc \ db/db_iter_stress_test.cc \ db/db_iterator_test.cc \ + db/db_kv_checksum_test.cc \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ - db/db_merge_operand_test.cc \ + db/db_merge_operand_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_impl/db_secondary_test.cc \ + db/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ db/db_tailing_iter_test.cc \ db/db_test.cc \ db/db_test2.cc \ + db/db_logical_block_size_cache_test.cc \ db/db_universal_compaction_test.cc \ db/db_wal_test.cc \ + db/db_with_timestamp_compaction_test.cc \ + db/db_write_buffer_manager_test.cc \ db/db_write_test.cc \ db/dbformat_test.cc \ db/deletefile_test.cc \ - db/env_timed_test.cc \ - db/error_handler_test.cc \ + db/error_handler_fs_test.cc \ db/external_sst_file_basic_test.cc \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ db/file_indexer_test.cc \ - db/file_reader_writer_test.cc \ db/filename_test.cc \ db/flush_job_test.cc \ - 
db/hash_table_test.cc \ - db/hash_test.cc \ - db/heap_test.cc \ db/listener_test.cc \ db/log_test.cc \ - db/lru_cache_test.cc \ db/manual_compaction_test.cc \ db/memtable_list_test.cc \ db/merge_helper_test.cc \ db/merge_test.cc \ - db/obsolete_files_test.cc \ - db/options_settable_test.cc \ + db/obsolete_files_test.cc \ db/options_file_test.cc \ db/perf_context_test.cc \ - db/persistent_cache_test.cc \ + db/periodic_work_scheduler_test.cc \ db/plain_table_db_test.cc \ db/prefix_test.cc \ db/repair_test.cc \ db/range_del_aggregator_test.cc \ - db/range_del_aggregator_bench.cc \ db/range_tombstone_fragmenter_test.cc \ db/table_properties_collector_test.cc \ - db/util_merge_operators_test.cc \ db/version_builder_test.cc \ db/version_edit_test.cc \ db/version_set_test.cc \ @@ -378,21 +493,29 @@ db/write_controller_test.cc \ env/env_basic_test.cc \ env/env_test.cc \ + env/io_posix_test.cc \ env/mock_env_test.cc \ + file/delete_scheduler_test.cc \ + file/prefetch_test.cc \ + file/random_access_file_reader_test.cc \ logging/auto_roll_logger_test.cc \ logging/env_logger_test.cc \ logging/event_logger_test.cc \ memory/arena_test.cc \ + memory/memory_allocator_test.cc \ memtable/inlineskiplist_test.cc \ - memtable/memtablerep_bench.cc \ memtable/skiplist_test.cc \ memtable/write_buffer_manager_test.cc \ monitoring/histogram_test.cc \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ monitoring/stats_history_test.cc \ + options/configurable_test.cc \ + options/customizable_test.cc \ + options/options_settable_test.cc \ options/options_test.cc \ table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_based_table_reader_test.cc \ table/block_based/block_test.cc \ table/block_based/data_block_hash_index_test.cc \ table/block_based/full_filter_block_test.cc \ @@ -402,19 +525,17 @@ table/cuckoo/cuckoo_table_reader_test.cc \ table/merger_test.cc \ table/sst_file_reader_test.cc \ - table/table_reader_bench.cc \ table/table_test.cc \ - 
third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + table/block_fetcher_test.cc \ + test_util/testutil_test.cc \ tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc \ - tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ - tools/db_bench.cc \ - tools/db_bench_tool_test.cc \ - tools/db_sanity_test.cc \ + tools/io_tracer_parser_test.cc \ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ - tools/trace_analyzer_test.cc \ + tools/trace_analyzer_test.cc \ trace_replay/block_cache_tracer_test.cc \ + trace_replay/io_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ @@ -422,15 +543,20 @@ util/defer_test.cc \ util/dynamic_bloom_test.cc \ util/filelock_test.cc \ - util/log_write_bench.cc \ - util/rate_limiter_test.cc \ + util/file_reader_writer_test.cc \ + util/hash_test.cc \ + util/heap_test.cc \ util/random_test.cc \ + util/rate_limiter_test.cc \ util/repeatable_thread_test.cc \ + util/ribbon_test.cc \ util/slice_test.cc \ util/slice_transform_test.cc \ util/timer_queue_test.cc \ + util/timer_test.cc \ util/thread_list_test.cc \ util/thread_local_test.cc \ + util/work_queue_test.cc \ utilities/backupable/backupable_db_test.cc \ utilities/blob_db/blob_db_test.cc \ utilities/cassandra/cassandra_format_test.cc \ @@ -438,26 +564,39 @@ utilities/cassandra/cassandra_row_merge_test.cc \ utilities/cassandra/cassandra_serialize_test.cc \ utilities/checkpoint/checkpoint_test.cc \ + utilities/env_timed_test.cc \ utilities/memory/memory_test.cc \ utilities/merge_operators/string_append/stringappend_test.cc \ utilities/object_registry_test.cc \ utilities/option_change_migration/option_change_migration_test.cc \ utilities/options/options_util_test.cc \ + utilities/persistent_cache/hash_table_test.cc \ + utilities/persistent_cache/persistent_cache_test.cc \ utilities/simulator_cache/cache_simulator_test.cc \ utilities/simulator_cache/sim_cache_test.cc \ 
utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ + utilities/transactions/lock/range/range_locking_test.cc \ utilities/transactions/transaction_test.cc \ + utilities/transactions/lock/point/point_lock_manager_test.cc \ utilities/transactions/write_prepared_transaction_test.cc \ utilities/transactions/write_unprepared_transaction_test.cc \ utilities/ttl/ttl_test.cc \ + utilities/util_merge_operators_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \ +TEST_MAIN_SOURCES_C = \ + db/c_test.c \ + +MICROBENCH_SOURCES = \ + microbench/ribbon_bench.cc \ + JNI_NATIVE_SOURCES = \ java/rocksjni/backupenginejni.cc \ java/rocksjni/backupablejni.cc \ java/rocksjni/checkpoint.cc \ java/rocksjni/clock_cache.cc \ + java/rocksjni/cache.cc \ java/rocksjni/columnfamilyhandle.cc \ java/rocksjni/compact_range_options.cc \ java/rocksjni/compaction_filter.cc \ @@ -471,8 +610,12 @@ java/rocksjni/comparator.cc \ java/rocksjni/comparatorjnicallback.cc \ java/rocksjni/compression_options.cc \ + java/rocksjni/concurrent_task_limiter.cc \ + java/rocksjni/config_options.cc \ java/rocksjni/env.cc \ java/rocksjni/env_options.cc \ + java/rocksjni/event_listener.cc \ + java/rocksjni/event_listener_jnicallback.cc \ java/rocksjni/ingest_external_file_options.cc \ java/rocksjni/filter.cc \ java/rocksjni/iterator.cc \ @@ -502,6 +645,7 @@ java/rocksjni/sst_file_writerjni.cc \ java/rocksjni/sst_file_readerjni.cc \ java/rocksjni/sst_file_reader_iterator.cc \ + java/rocksjni/sst_partitioner.cc \ java/rocksjni/statistics.cc \ java/rocksjni/statisticsjni.cc \ java/rocksjni/table.cc \ @@ -518,6 +662,7 @@ java/rocksjni/transaction_notifier.cc \ java/rocksjni/transaction_notifier_jnicallback.cc \ java/rocksjni/ttl.cc \ + java/rocksjni/testable_event_listener.cc \ java/rocksjni/wal_filter.cc \ java/rocksjni/wal_filter_jnicallback.cc \ java/rocksjni/write_batch.cc \ diff -Nru 
mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -42,12 +42,13 @@ extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, - bool /*prefetch_index_and_filter_in_cache*/) const { + bool prefetch_index_and_filter_in_cache) const { Footer footer; - auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */, + IOOptions opts; + auto s = ReadFooterFromFile(opts, file.get(), nullptr /* prefetch_buffer */, file_size, &footer); if (!s.ok()) { return s; @@ -57,9 +58,10 @@ return plain_table_factory_->NewTableReader( table_reader_options, std::move(file), file_size, table); } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || - footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { + footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( - table_reader_options, std::move(file), file_size, table); + ro, table_reader_options, std::move(file), file_size, table, + prefetch_index_and_filter_in_cache); } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { return cuckoo_table_factory_->NewTableReader( table_reader_options, std::move(file), file_size, table); @@ -69,13 +71,12 @@ } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* 
file) const { - return table_factory_to_write_->NewTableBuilder(table_builder_options, - column_family_id, file); + return table_factory_to_write_->NewTableBuilder(table_builder_options, file); } -std::string AdaptiveTableFactory::GetPrintableTableOptions() const { +std::string AdaptiveTableFactory::GetPrintableOptions() const { std::string ret; ret.reserve(20000); const int kBufferSize = 200; @@ -85,13 +86,13 @@ snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n", (table_factory_to_write_->Name() ? table_factory_to_write_->Name() : ""), - table_factory_to_write_->GetPrintableTableOptions().c_str()); + table_factory_to_write_->GetPrintableOptions().c_str()); ret.append(buffer); } if (plain_table_factory_) { snprintf(buffer, kBufferSize, " %s options:\n%s\n", plain_table_factory_->Name() ? plain_table_factory_->Name() : "", - plain_table_factory_->GetPrintableTableOptions().c_str()); + plain_table_factory_->GetPrintableOptions().c_str()); ret.append(buffer); } if (block_based_table_factory_) { @@ -99,13 +100,13 @@ buffer, kBufferSize, " %s options:\n%s\n", (block_based_table_factory_->Name() ? block_based_table_factory_->Name() : ""), - block_based_table_factory_->GetPrintableTableOptions().c_str()); + block_based_table_factory_->GetPrintableOptions().c_str()); ret.append(buffer); } if (cuckoo_table_factory_) { snprintf(buffer, kBufferSize, " %s options:\n%s\n", cuckoo_table_factory_->Name() ? 
cuckoo_table_factory_->Name() : "", - cuckoo_table_factory_->GetPrintableTableOptions().c_str()); + cuckoo_table_factory_->GetPrintableOptions().c_str()); ret.append(buffer); } return ret; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -33,24 +33,18 @@ const char* Name() const override { return "AdaptiveTableFactory"; } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; - // Sanitizes the specified DB Options. 
- Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } - - std::string GetPrintableTableOptions() const override; + std::string GetPrintableOptions() const override; private: std::shared_ptr table_factory_to_write_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,73 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/binary_search_index_reader.h" + +namespace ROCKSDB_NAMESPACE { +Status BinarySearchIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase* BinarySearchIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/binary_search_index_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,6 @@ #include #include -#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" @@ -127,22 +126,48 @@ } }; -void DataBlockIter::Next() { - assert(Valid()); - ParseNextDataKey(); +struct DecodeEntryV4 { + inline const char* operator()(const char* p, const char* limit, + uint32_t* shared, uint32_t* non_shared, + uint32_t* value_length) { + assert(value_length); + + *value_length = 0; + return DecodeKeyV4()(p, limit, shared, non_shared); + } +}; +void DataBlockIter::NextImpl() { + bool is_shared = false; + ParseNextDataKey(&is_shared); } -void DataBlockIter::NextOrReport() { - assert(Valid()); - ParseNextDataKey(); +void 
MetaBlockIter::NextImpl() { + bool is_shared = false; + ParseNextKey(&is_shared); } -void IndexBlockIter::Next() { +void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } + +void IndexBlockIter::PrevImpl() { assert(Valid()); - ParseNextIndexKey(); + // Scan backwards to a restart point before current_ + const uint32_t original = current_; + while (GetRestartPoint(restart_index_) >= original) { + if (restart_index_ == 0) { + // No more entries + current_ = restarts_; + restart_index_ = num_restarts_; + return; + } + restart_index_--; + } + SeekToRestartPoint(restart_index_); + // Loop until end of current entry hits the start of original entry + while (ParseNextIndexKey() && NextEntryOffset() < original) { + } } -void IndexBlockIter::Prev() { +void MetaBlockIter::PrevImpl() { assert(Valid()); // Scan backwards to a restart point before current_ const uint32_t original = current_; @@ -156,13 +181,15 @@ restart_index_--; } SeekToRestartPoint(restart_index_); + bool is_shared = false; // Loop until end of current entry hits the start of original entry - while (ParseNextIndexKey() && NextEntryOffset() < original) { + while (ParseNextKey(&is_shared) && + NextEntryOffset() < original) { } } -// Similar to IndexBlockIter::Prev but also caches the prev entries -void DataBlockIter::Prev() { +// Similar to IndexBlockIter::PrevImpl but also caches the prev entries +void DataBlockIter::PrevImpl() { assert(Valid()); assert(prev_entries_idx_ == -1 || @@ -176,19 +203,25 @@ prev_entries_[prev_entries_idx_]; const char* key_ptr = nullptr; + bool raw_key_cached; if (current_prev_entry.key_ptr != nullptr) { // The key is not delta encoded and stored in the data block key_ptr = current_prev_entry.key_ptr; - key_pinned_ = true; + raw_key_cached = false; } else { // The key is delta encoded and stored in prev_entries_keys_buff_ key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; - key_pinned_ = false; + raw_key_cached = true; } const Slice current_key(key_ptr, 
current_prev_entry.key_size); current_ = current_prev_entry.offset; - key_.SetKey(current_key, false /* copy */); + // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience, + // not necessity. It is convenient since this class treats keys as pinned + // when `raw_key_` points to an outside buffer. So we cannot allow + // `raw_key_` point into Prev cache as it is a transient outside buffer + // (i.e., keys in it are not actually pinned). + raw_key_.SetKey(current_key, raw_key_cached /* copy */); value_ = current_prev_entry.value; return; @@ -214,12 +247,13 @@ SeekToRestartPoint(restart_index_); do { - if (!ParseNextDataKey()) { + bool is_shared = false; + if (!ParseNextDataKey(&is_shared)) { break; } - Slice current_key = key(); + Slice current_key = raw_key_.GetKey(); - if (key_.IsKeyPinned()) { + if (raw_key_.IsKeyPinned()) { // The key is not delta encoded prev_entries_.emplace_back(current_, current_key.data(), 0, current_key.size(), value()); @@ -236,24 +270,36 @@ prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; } -void DataBlockIter::Seek(const Slice& target) { +void DataBlockIter::SeekImpl(const Slice& target) { Slice seek_key = target; PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } - SeekToRestartPoint(index); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} - // Linear search (within restart block) for first key >= target - while (ParseNextDataKey() && Compare(key_, seek_key) < 0) { +void MetaBlockIter::SeekImpl(const Slice& target) { + Slice seek_key = target; + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); 
+ + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); } // Optimized Seek for point lookup for an internal key `target` @@ -273,8 +319,8 @@ // // If the return value is TRUE, iter location has two possibilies: // 1) If iter is valid, it is set to a location as if set by BinarySeek. In -// this case, it points to the first key_ with a larger user_key or a -// matching user_key with a seqno no greater than the seeking seqno. +// this case, it points to the first key with a larger user_key or a matching +// user_key with a seqno no greater than the seeking seqno. // 2) If the iter is invalid, it means that either all the user_key is less // than the seek_user_key, or the block ends with a matching user_key but // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno @@ -287,21 +333,21 @@ if (entry == kCollision) { // HashSeek not effective, falling back - Seek(target); + SeekImpl(target); return true; } if (entry == kNoEntry) { // Even if we cannot find the user_key in this block, the result may - // exist in the next block. Consider this exmpale: + // exist in the next block. Consider this example: // // Block N: [aab@100, ... , app@120] - // bounary key: axy@50 (we make minimal assumption about a boundary key) + // boundary key: axy@50 (we make minimal assumption about a boundary key) // Block N+1: [axy@10, ... ] // // If seek_key = axy@60, the search will starts from Block N. // Even if the user_key is not found in the hash map, the caller still - // have to conntinue searching the next block. + // have to continue searching the next block. // // In this case, we pretend the key is the the last restart interval. 
// The while-loop below will search the last restart interval for the @@ -315,22 +361,21 @@ // check if the key is in the restart_interval assert(restart_index < num_restarts_); SeekToRestartPoint(restart_index); + current_ = GetRestartPoint(restart_index); - const char* limit = nullptr; - if (restart_index_ + 1 < num_restarts_) { - limit = data_ + GetRestartPoint(restart_index_ + 1); - } else { - limit = data_ + restarts_; + uint32_t limit = restarts_; + if (restart_index + 1 < num_restarts_) { + limit = GetRestartPoint(restart_index + 1); } - - while (true) { + while (current_ < limit) { + bool shared; // Here we only linear seek the target key inside the restart interval. // If a key does not exist inside a restart interval, we avoid - // further searching the block content accross restart interval boundary. + // further searching the block content across restart interval boundary. // - // TODO(fwu): check the left and write boundary of the restart interval + // TODO(fwu): check the left and right boundary of the restart interval // to avoid linear seek a target key that is out of range. - if (!ParseNextDataKey(limit) || Compare(key_, target) >= 0) { + if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) { // we stop at the first potential matching user key. break; } @@ -341,7 +386,7 @@ // 1) there is only one user_key match in the block (otherwise collsion). // the matching user_key resides in the last restart interval, and it // is the last key of the restart interval and of the block as well. - // ParseNextDataKey() skiped it as its [ type | seqno ] is smaller. + // ParseNextKey() skiped it as its [ type | seqno ] is smaller. // // 2) The seek_key is not found in the HashIndex Lookup(), i.e. 
kNoEntry, // AND all existing user_keys in the restart interval are smaller than @@ -355,18 +400,18 @@ return true; } - if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) { + if (ucmp().Compare(raw_key_.GetUserKey(), target_user_key) != 0) { // the key is not in this block and cannot be at the next block either. return false; } // Here we are conservative and only support a limited set of cases - ValueType value_type = ExtractValueType(key_.GetKey()); + ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); if (value_type != ValueType::kTypeValue && value_type != ValueType::kTypeDeletion && value_type != ValueType::kTypeSingleDeletion && value_type != ValueType::kTypeBlobIndex) { - Seek(target); + SeekImpl(target); return true; } @@ -374,18 +419,19 @@ return true; } -void IndexBlockIter::Seek(const Slice& target) { +void IndexBlockIter::SeekImpl(const Slice& target) { TEST_SYNC_POINT("IndexBlockIter::Seek:0"); - Slice seek_key = target; - if (!key_includes_seq_) { - seek_key = ExtractUserKey(target); - } PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } + Slice seek_key = target; + if (raw_key_.IsUserKey()) { + seek_key = ExtractUserKey(target); + } status_ = Status::OK(); uint32_t index = 0; + bool skip_linear_scan = false; bool ok = false; if (prefix_index_) { bool prefix_may_exist = true; @@ -397,68 +443,88 @@ current_ = restarts_; status_ = Status::NotFound(); } + // restart interval must be one when hash search is enabled so the binary + // search simply lands at the right place. 
+ skip_linear_scan = true; } else if (value_delta_encoded_) { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + ok = BinarySeek(seek_key, &index, &skip_linear_scan); } else { - ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + ok = BinarySeek(seek_key, &index, &skip_linear_scan); } if (!ok) { return; } - SeekToRestartPoint(index); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); +} - // Linear search (within restart block) for first key >= target - while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) { +void DataBlockIter::SeekForPrevImpl(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + Slice seek_key = target; + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); + + if (!ok) { + return; + } + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); + + if (!Valid()) { + SeekToLastImpl(); + } else { + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); + } } } -void DataBlockIter::SeekForPrev(const Slice& target) { +void MetaBlockIter::SeekForPrevImpl(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); Slice seek_key = target; if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; - bool ok = BinarySeek(seek_key, 0, num_restarts_ - 1, &index, - comparator_); + bool skip_linear_scan = false; + bool ok = BinarySeek(seek_key, &index, &skip_linear_scan); if (!ok) { return; } - SeekToRestartPoint(index); + FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); - // Linear search (within restart block) for first key >= seek_key - while (ParseNextDataKey() && Compare(key_, seek_key) < 0) { - } if (!Valid()) { - SeekToLast(); + SeekToLastImpl(); } else { - while (Valid() && Compare(key_, seek_key) > 0) { - Prev(); + while (Valid() && CompareCurrentKey(seek_key) > 0) { + PrevImpl(); } } } -void DataBlockIter::SeekToFirst() { +void 
DataBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); - ParseNextDataKey(); + bool is_shared = false; + ParseNextDataKey(&is_shared); } -void DataBlockIter::SeekToFirstOrReport() { +void MetaBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); - ParseNextDataKey(); + bool is_shared = false; + ParseNextKey(&is_shared); } -void IndexBlockIter::SeekToFirst() { +void IndexBlockIter::SeekToFirstImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -467,17 +533,30 @@ ParseNextIndexKey(); } -void DataBlockIter::SeekToLast() { +void DataBlockIter::SeekToLastImpl() { + if (data_ == nullptr) { // Not init yet + return; + } + SeekToRestartPoint(num_restarts_ - 1); + bool is_shared = false; + while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) { + // Keep skipping + } +} + +void MetaBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(num_restarts_ - 1); - while (ParseNextDataKey() && NextEntryOffset() < restarts_) { + bool is_shared = false; + while (ParseNextKey(&is_shared) && + NextEntryOffset() < restarts_) { // Keep skipping } } -void IndexBlockIter::SeekToLast() { +void IndexBlockIter::SeekToLastImpl() { if (data_ == nullptr) { // Not init yet return; } @@ -493,17 +572,16 @@ current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); - key_.Clear(); + raw_key_.Clear(); value_.clear(); } +template template -bool DataBlockIter::ParseNextDataKey(const char* limit) { +bool BlockIter::ParseNextKey(bool* is_shared) { current_ = NextEntryOffset(); const char* p = data_ + current_; - if (!limit) { - limit = data_ + restarts_; // Restarts come right after data - } + const char* limit = data_ + restarts_; // Restarts come right after data if (p >= limit) { // No more entries to return. Mark as invalid. 
@@ -511,50 +589,23 @@ restart_index_ = num_restarts_; return false; } - // Decode next entry uint32_t shared, non_shared, value_length; p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length); - if (p == nullptr || key_.Size() < shared) { + if (p == nullptr || raw_key_.Size() < shared) { CorruptionError(); return false; } else { if (shared == 0) { - // If this key dont share any bytes with prev key then we dont need - // to decode it and can use it's address in the block directly. - key_.SetKey(Slice(p, non_shared), false /* copy */); - key_pinned_ = true; + *is_shared = false; + // If this key doesn't share any bytes with prev key then we don't need + // to decode it and can use its address in the block directly. + raw_key_.SetKey(Slice(p, non_shared), false /* copy */); } else { // This key share `shared` bytes with prev key, we need to decode it - key_.TrimAppend(shared, p, non_shared); - key_pinned_ = false; - } - - if (global_seqno_ != kDisableGlobalSequenceNumber) { - // If we are reading a file with a global sequence number we should - // expect that all encoded sequence numbers are zeros and any value - // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion. - assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); - - ValueType value_type = ExtractValueType(key_.GetKey()); - assert(value_type == ValueType::kTypeValue || - value_type == ValueType::kTypeMerge || - value_type == ValueType::kTypeDeletion || - value_type == ValueType::kTypeRangeDeletion); - - if (key_pinned_) { - // TODO(tec): Investigate updating the seqno in the loaded block - // directly instead of doing a copy and update. - - // We cannot use the key address in the block directly because - // we have a global_seqno_ that will overwrite the encoded one. 
- key_.OwnKey(); - key_pinned_ = false; - } - - key_.UpdateInternalKey(global_seqno_, value_type); + *is_shared = true; + raw_key_.TrimAppend(shared, p, non_shared); } - value_ = Slice(p + non_shared, value_length); if (shared == 0) { while (restart_index_ + 1 < num_restarts_ && @@ -568,52 +619,42 @@ } } -bool IndexBlockIter::ParseNextIndexKey() { - current_ = NextEntryOffset(); - const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data - if (p >= limit) { - // No more entries to return. Mark as invalid. - current_ = restarts_; - restart_index_ = num_restarts_; - return false; - } - - // Decode next entry - uint32_t shared, non_shared, value_length; - if (value_delta_encoded_) { - p = DecodeKeyV4()(p, limit, &shared, &non_shared); - value_length = 0; +bool DataBlockIter::ParseNextDataKey(bool* is_shared) { + if (ParseNextKey(is_shared)) { +#ifndef NDEBUG + if (global_seqno_ != kDisableGlobalSequenceNumber) { + // If we are reading a file with a global sequence number we should + // expect that all encoded sequence numbers are zeros and any value + // type is kTypeValue, kTypeMerge, kTypeDeletion, + // kTypeDeletionWithTimestamp, or kTypeRangeDeletion. 
+ uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey()); + SequenceNumber seqno; + ValueType value_type; + UnPackSequenceAndType(packed, &seqno, &value_type); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeDeletionWithTimestamp || + value_type == ValueType::kTypeRangeDeletion); + assert(seqno == 0); + } +#endif // NDEBUG + return true; } else { - p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length); - } - if (p == nullptr || key_.Size() < shared) { - CorruptionError(); return false; } - if (shared == 0) { - // If this key dont share any bytes with prev key then we dont need - // to decode it and can use it's address in the block directly. - key_.SetKey(Slice(p, non_shared), false /* copy */); - key_pinned_ = true; - } else { - // This key share `shared` bytes with prev key, we need to decode it - key_.TrimAppend(shared, p, non_shared); - key_pinned_ = false; - } - value_ = Slice(p + non_shared, value_length); - if (shared == 0) { - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; - } - } - // else we are in the middle of a restart interval and the restart_index_ - // thus has not changed - if (value_delta_encoded_ || global_seqno_state_ != nullptr) { - DecodeCurrentValue(shared); +} + +bool IndexBlockIter::ParseNextIndexKey() { + bool is_shared = false; + bool ok = (value_delta_encoded_) ? ParseNextKey(&is_shared) + : ParseNextKey(&is_shared); + if (ok) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + DecodeCurrentValue(is_shared); + } } - return true; + return ok; } // The format: @@ -623,16 +664,16 @@ // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // where, k is key, v is value, and its encoding is in parenthesis. 
// The format of each key is (shared_size, non_shared_size, shared, non_shared) -// The format of each value, i.e., block hanlde, is (offset, size) whenever the -// shared_size is 0, which included the first entry in each restart point. +// The format of each value, i.e., block handle, is (offset, size) whenever the +// is_shared is false, which included the first entry in each restart point. // Otherwise the format is delta-size = block handle size - size of last block // handle. -void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { +void IndexBlockIter::DecodeCurrentValue(bool is_shared) { Slice v(value_.data(), data_ + restarts_ - value_.data()); // Delta encoding is used if `shared` != 0. Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( &v, have_first_key_, - (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr); assert(decode_s.ok()); value_ = Slice(value_.data(), v.data() - value_.data()); @@ -657,20 +698,78 @@ } } -// Binary search in restart array to find the first restart point that -// is either the last restart point with a key less than target, -// which means the key of next restart point is larger than target, or -// the first restart point with a key = target +template +void BlockIter::FindKeyAfterBinarySeek(const Slice& target, + uint32_t index, + bool skip_linear_scan) { + // SeekToRestartPoint() only does the lookup in the restart block. We need + // to follow it up with NextImpl() to position the iterator at the restart + // key. + SeekToRestartPoint(index); + NextImpl(); + + if (!skip_linear_scan) { + // Linear search (within restart block) for first key >= target + uint32_t max_offset; + if (index + 1 < num_restarts_) { + // We are in a non-last restart interval. 
Since `BinarySeek()` guarantees + // the next restart key is strictly greater than `target`, we can + // terminate upon reaching it without any additional key comparison. + max_offset = GetRestartPoint(index + 1); + } else { + // We are in the last restart interval. The while-loop will terminate by + // `Valid()` returning false upon advancing past the block's last key. + max_offset = port::kMaxUint32; + } + while (true) { + NextImpl(); + if (!Valid()) { + break; + } + if (current_ == max_offset) { + assert(CompareCurrentKey(target) > 0); + break; + } else if (CompareCurrentKey(target) >= 0) { + break; + } + } + } +} + +// Binary searches in restart array to find the starting restart point for the +// linear scan, and stores it in `*index`. Assumes restart array does not +// contain duplicate keys. It is guaranteed that the restart key at `*index + 1` +// is strictly greater than `target` or does not exist (this can be used to +// elide a comparison when linear scan reaches all the way to the next restart +// key). Furthermore, `*skip_linear_scan` is set to indicate whether the +// `*index`th restart key is the final result so that key does not need to be +// compared again later. template template -bool BlockIter::BinarySeek(const Slice& target, uint32_t left, - uint32_t right, uint32_t* index, - const Comparator* comp) { - assert(left <= right); +bool BlockIter::BinarySeek(const Slice& target, uint32_t* index, + bool* skip_linear_scan) { + if (restarts_ == 0) { + // SST files dedicated to range tombstones are written with index blocks + // that have no keys while also having `num_restarts_ == 1`. This would + // cause a problem for `BinarySeek()` as it'd try to access the first key + // which does not exist. We identify such blocks by the offset at which + // their restarts are stored, and return false to prevent any attempted + // key accesses. 
+ return false; + } - while (left < right) { - uint32_t mid = (left + right + 1) / 2; - uint32_t region_offset = GetRestartPoint(mid); + *skip_linear_scan = false; + // Loop invariants: + // - Restart key at index `left` is less than or equal to the target key. The + // sentinel index `-1` is considered to have a key that is less than all + // keys. + // - Any restart keys after index `right` are strictly greater than the target + // key. + int64_t left = -1, right = num_restarts_ - 1; + while (left != right) { + // The `mid` is computed by rounding up so it lands in (`left`, `right`]. + int64_t mid = left + (right - left + 1) / 2; + uint32_t region_offset = GetRestartPoint(static_cast(mid)); uint32_t shared, non_shared; const char* key_ptr = DecodeKeyFunc()( data_ + region_offset, data_ + restarts_, &shared, &non_shared); @@ -679,7 +778,8 @@ return false; } Slice mid_key(key_ptr, non_shared); - int cmp = comp->Compare(mid_key, target); + raw_key_.SetKey(mid_key, false /* copy */); + int cmp = CompareCurrentKey(target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all // blocks before "mid" are uninteresting. @@ -689,11 +789,19 @@ // after "mid" are uninteresting. right = mid - 1; } else { + *skip_linear_scan = true; left = right = mid; } } - *index = left; + if (left == -1) { + // All keys in the block were strictly greater than `target`. So the very + // first key in the block is the final seek result. 
+ *skip_linear_scan = true; + *index = 0; + } else { + *index = static_cast(left); + } return true; } @@ -713,7 +821,8 @@ return 1; // Return target is smaller } Slice block_key(key_ptr, non_shared); - return Compare(block_key, target); + raw_key_.SetKey(block_key, false /* copy */); + return CompareCurrentKey(target); } // Binary search in block_ids to find the first block @@ -807,7 +916,7 @@ assert(prefix_index_); *prefix_may_exist = true; Slice seek_key = target; - if (!key_includes_seq_) { + if (raw_key_.IsUserKey()) { seek_key = ExtractUserKey(target); } uint32_t* block_ids = nullptr; @@ -865,14 +974,13 @@ // TEST_SYNC_POINT("Block::~Block"); } -Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, - size_t read_amp_bytes_per_bit, Statistics* statistics) +Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics) : contents_(std::move(contents)), data_(contents_.data.data()), size_(contents_.data.size()), restart_offset_(0), - num_restarts_(0), - global_seqno_(_global_seqno) { + num_restarts_(0) { TEST_SYNC_POINT("Block::Block:0"); if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker @@ -923,8 +1031,23 @@ } } -DataBlockIter* Block::NewDataIterator(const Comparator* cmp, - const Comparator* ucmp, +MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { + MetaBlockIter* iter = new MetaBlockIter(); + if (size_ < 2 * sizeof(uint32_t)) { + iter->Invalidate(Status::Corruption("bad block contents")); + return iter; + } else if (num_restarts_ == 0) { + // Empty block. 
+ iter->Invalidate(Status::OK()); + } else { + iter->Initialize(data_, restart_offset_, num_restarts_, + block_contents_pinned); + } + return iter; +} + +DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, DataBlockIter* iter, Statistics* stats, bool block_contents_pinned) { DataBlockIter* ret_iter; @@ -943,7 +1066,7 @@ return ret_iter; } else { ret_iter->Initialize( - cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_, + raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, read_amp_bitmap_.get(), block_contents_pinned, data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); if (read_amp_bitmap_) { @@ -958,10 +1081,10 @@ } IndexBlockIter* Block::NewIndexIterator( - const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, - Statistics* /*stats*/, bool total_order_seek, bool have_first_key, - bool key_includes_seq, bool value_is_full, bool block_contents_pinned, - BlockPrefixIndex* prefix_index) { + const Comparator* raw_ucmp, SequenceNumber global_seqno, + IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, + bool have_first_key, bool key_includes_seq, bool value_is_full, + bool block_contents_pinned, BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -979,8 +1102,8 @@ } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index; - ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - global_seqno_, prefix_index_ptr, have_first_key, + ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_, + global_seqno, prefix_index_ptr, have_first_key, key_includes_seq, value_is_full, block_contents_pinned); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,7 +13,6 @@ #include #include -#include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "port/malloc.h" #include "rocksdb/iterator.h" @@ -35,6 +34,7 @@ class BlockIter; class DataBlockIter; class IndexBlockIter; +class MetaBlockIter; class BlockPrefixIndex; // BlockReadAmpBitmap is a bitmap that map the ROCKSDB_NAMESPACE::Block data @@ -151,8 +151,7 @@ class Block { public: // Initialize the block with the specified contents. - explicit Block(BlockContents&& contents, SequenceNumber _global_seqno, - size_t read_amp_bytes_per_bit = 0, + explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0, Statistics* statistics = nullptr); // No copying allowed Block(const Block&) = delete; @@ -169,8 +168,8 @@ BlockBasedTableOptions::DataBlockIndexType IndexType() const; - // If comparator is InternalKeyComparator, user_comparator is its user - // comparator; they are equal otherwise. + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. 
// // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* @@ -188,12 +187,30 @@ // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. - DataBlockIter* NewDataIterator(const Comparator* comparator, - const Comparator* user_comparator, + DataBlockIter* NewDataIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, DataBlockIter* iter = nullptr, Statistics* stats = nullptr, bool block_contents_pinned = false); + // Returns an MetaBlockIter for iterating over blocks containing metadata + // (like Properties blocks). Unlike data blocks, the keys for these blocks + // do not contain sequence numbers, do not use a user-define comparator, and + // do not track read amplification/statistics. Additionally, MetaBlocks will + // not assert if the block is formatted improperly. + // + // If `block_contents_pinned` is true, the caller will guarantee that when + // the cleanup functions are transferred from the iterator to other + // classes, e.g. PinnableSlice, the pointer to the bytes will still be + // valid. Either the iterator holds cache handle or ownership of some resource + // and release them in a release function, or caller is sure that the data + // will not go away (for example, it's from mmapped file which will not be + // closed). + MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false); + + // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key + // comparator. + // // key_includes_seq, default true, means that the keys are in internal key // format. // value_is_full, default true, means that no delta encoding is @@ -206,8 +223,8 @@ // first_internal_key. It affects data serialization format, so the same value // have_first_key must be used when writing and reading index. // It is determined by IndexType property of the table. 
- IndexBlockIter* NewIndexIterator(const Comparator* comparator, - const Comparator* user_comparator, + IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp, + SequenceNumber global_seqno, IndexBlockIter* iter, Statistics* stats, bool total_order_seek, bool have_first_key, bool key_includes_seq, bool value_is_full, @@ -217,8 +234,6 @@ // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; - SequenceNumber global_seqno() const { return global_seqno_; } - private: BlockContents contents_; const char* data_; // contents_.data.data() @@ -226,23 +241,38 @@ uint32_t restart_offset_; // Offset in data_ of restart array uint32_t num_restarts_; std::unique_ptr read_amp_bitmap_; - // All keys in the block will have seqno = global_seqno_, regardless of - // the encoded value (kDisableGlobalSequenceNumber means disabled) - const SequenceNumber global_seqno_; - DataBlockHashIndex data_block_hash_index_; }; +// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The +// format of this data buffer is an uncompressed, sorted sequence of key-value +// pairs (see `Block` API for more details). +// +// Notably, the keys may either be in internal key format or user key format. +// Subclasses are responsible for configuring the key format. +// +// `BlockIter` intends to provide final overrides for all of +// `InternalIteratorBase` functions that can move the iterator. It does +// this to guarantee `UpdateKey()` is called exactly once after each key +// movement potentially visible to users. In this step, the key is prepared +// (e.g., serialized if global seqno is in effect) so it can be returned +// immediately when the user asks for it via calling `key() const`. +// +// For its subclasses, it provides protected variants of the above-mentioned +// final-overridden methods. They are named with the "Impl" suffix, e.g., +// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. 
These +// "Impl" functions are responsible for positioning `raw_key_` but not +// invoking `UpdateKey()`. template class BlockIter : public InternalIteratorBase { public: - void InitializeBase(const Comparator* comparator, const char* data, + void InitializeBase(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, bool block_contents_pinned) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid - comparator_ = comparator; + raw_ucmp_ = raw_ucmp; data_ = data; restarts_ = restarts; num_restarts_ = num_restarts; @@ -255,10 +285,9 @@ // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do // nothing. Calls cleanup functions. - void InvalidateBase(Status s) { + virtual void Invalidate(const Status& s) { // Assert that the BlockIter is never deleted while Pinning is Enabled. - assert(!pinned_iters_mgr_ || - (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); data_ = nullptr; current_ = restarts_; @@ -269,10 +298,47 @@ } bool Valid() const override { return current_ < restarts_; } + + virtual void SeekToFirst() override final { + SeekToFirstImpl(); + UpdateKey(); + } + + virtual void SeekToLast() override final { + SeekToLastImpl(); + UpdateKey(); + } + + virtual void Seek(const Slice& target) override final { + SeekImpl(target); + UpdateKey(); + } + + virtual void SeekForPrev(const Slice& target) override final { + SeekForPrevImpl(target); + UpdateKey(); + } + + virtual void Next() override final { + NextImpl(); + UpdateKey(); + } + + virtual bool NextAndGetResult(IterateResult* result) override final { + // This does not need to call `UpdateKey()` as the parent class only has + // access to the `UpdateKey()`-invoking functions. 
+ return InternalIteratorBase::NextAndGetResult(result); + } + + virtual void Prev() override final { + PrevImpl(); + UpdateKey(); + } + Status status() const override { return status_; } Slice key() const override { assert(Valid()); - return key_.GetKey(); + return key_; } #ifndef NDEBUG @@ -280,6 +346,7 @@ // Assert that the BlockIter is never deleted while Pinning is Enabled. assert(!pinned_iters_mgr_ || (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + status_.PermitUncheckedError(); } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; @@ -304,9 +371,6 @@ Cache::Handle* cache_handle() { return cache_handle_; } protected: - // Note: The type could be changed to InternalKeyComparator but we see a weird - // performance drop by that. - const Comparator* comparator_; const char* data_; // underlying block contents uint32_t num_restarts_; // Number of uint32_t entries in restart array @@ -315,9 +379,14 @@ uint32_t restarts_; // Offset of restart array (list of fixed32) // current_ is offset in data_ of current entry. >= restarts_ if !Valid uint32_t current_; - IterKey key_; + // Raw key from block. + IterKey raw_key_; + // Buffer for key data when global seqno assignment is enabled. + IterKey key_buf_; Slice value_; Status status_; + // Key to be exposed to users. 
+ Slice key_; bool key_pinned_; // Whether the block data is guaranteed to outlive this iterator, and // as long as the cleanup functions are transferred to another class, @@ -325,7 +394,62 @@ bool block_contents_pinned_; SequenceNumber global_seqno_; + virtual void SeekToFirstImpl() = 0; + virtual void SeekToLastImpl() = 0; + virtual void SeekImpl(const Slice& target) = 0; + virtual void SeekForPrevImpl(const Slice& target) = 0; + virtual void NextImpl() = 0; + + virtual void PrevImpl() = 0; + + template + inline bool ParseNextKey(bool* is_shared); + + InternalKeyComparator icmp() { + return InternalKeyComparator(raw_ucmp_, false /* named */); + } + + UserComparatorWrapper ucmp() { return UserComparatorWrapper(raw_ucmp_); } + + // Must be called every time a key is found that needs to be returned to user, + // and may be called when no key is found (as a no-op). Updates `key_`, + // `key_buf_`, and `key_pinned_` with info about the found key. + void UpdateKey() { + key_buf_.Clear(); + if (!Valid()) { + return; + } + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + key_ = raw_key_.GetUserKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + key_ = raw_key_.GetInternalKey(); + key_pinned_ = raw_key_.IsKeyPinned(); + } else { + key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_, + ExtractValueType(raw_key_.GetInternalKey())); + key_ = key_buf_.GetInternalKey(); + key_pinned_ = false; + } + } + + // Returns the result of `Comparator::Compare()`, where the appropriate + // comparator is used for the block contents, the LHS argument is the current + // key with global seqno applied, and the RHS argument is `other`. 
+ int CompareCurrentKey(const Slice& other) { + if (raw_key_.IsUserKey()) { + assert(global_seqno_ == kDisableGlobalSequenceNumber); + return ucmp().Compare(raw_key_.GetUserKey(), other); + } else if (global_seqno_ == kDisableGlobalSequenceNumber) { + return icmp().Compare(raw_key_.GetInternalKey(), other); + } + return icmp().Compare(raw_key_.GetInternalKey(), global_seqno_, other, + kDisableGlobalSequenceNumber); + } + private: + const Comparator* raw_ucmp_; // Store the cache handle, if the block is cached. We need this since the // only other place the handle is stored is as an argument to the Cleanable // function callback, which is hard to retrieve. When multiple value @@ -346,7 +470,7 @@ } void SeekToRestartPoint(uint32_t index) { - key_.Clear(); + raw_key_.Clear(); restart_index_ = index; // current_ will be fixed by ParseNextKey(); @@ -357,36 +481,36 @@ void CorruptionError(); + protected: template - inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index, const Comparator* comp); + inline bool BinarySeek(const Slice& target, uint32_t* index, + bool* is_index_key_result); + + void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, + bool is_index_key_result); }; class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} - DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, - const char* data, uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, + DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts, + uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, DataBlockHashIndex* data_block_hash_index) : DataBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - global_seqno, read_amp_bitmap, block_contents_pinned, - data_block_hash_index); + Initialize(raw_ucmp, data, 
restarts, num_restarts, global_seqno, + read_amp_bitmap, block_contents_pinned, data_block_hash_index); } - void Initialize(const Comparator* comparator, - const Comparator* user_comparator, const char* data, + void Initialize(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, DataBlockHashIndex* data_block_hash_index) { - InitializeBase(comparator, data, restarts, num_restarts, global_seqno, + InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, block_contents_pinned); - user_comparator_ = user_comparator; - key_.SetIsUserKey(false); + raw_key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; data_block_hash_index_ = data_block_hash_index; @@ -403,45 +527,35 @@ return value_; } - void Seek(const Slice& target) override; - inline bool SeekForGet(const Slice& target) { if (!data_block_hash_index_) { - Seek(target); + SeekImpl(target); + UpdateKey(); return true; } - - return SeekForGetImpl(target); + bool res = SeekForGetImpl(target); + UpdateKey(); + return res; } - void SeekForPrev(const Slice& target) override; - - void Prev() override; - - void Next() final override; - - // Try to advance to the next entry in the block. If there is data corruption - // or error, report it to the caller instead of aborting the process. May - // incur higher CPU overhead because we need to perform check on every entry. - void NextOrReport(); - - void SeekToFirst() override; - - // Try to seek to the first entry in the block. If there is data corruption - // or error, report it to caller instead of aborting the process. May incur - // higher CPU overhead because we need to perform check on every entry. 
- void SeekToFirstOrReport(); - - void SeekToLast() override; - - void Invalidate(Status s) { - InvalidateBase(s); + void Invalidate(const Status& s) override { + BlockIter::Invalidate(s); // Clear prev entries cache. prev_entries_keys_buff_.clear(); prev_entries_.clear(); prev_entries_idx_ = -1; } + protected: + friend Block; + inline bool ParseNextDataKey(bool* is_shared); + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; + private: // read-amp bitmap BlockReadAmpBitmap* read_amp_bitmap_; @@ -472,41 +586,57 @@ int32_t prev_entries_idx_ = -1; DataBlockHashIndex* data_block_hash_index_; - const Comparator* user_comparator_; - template - inline bool ParseNextDataKey(const char* limit = nullptr); + bool SeekForGetImpl(const Slice& target); +}; - inline int Compare(const IterKey& ikey, const Slice& b) const { - return comparator_->Compare(ikey.GetInternalKey(), b); +// Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and +// are used to store Properties associated with table. +// Meta blocks always store user keys (no sequence number) and always +// use the BytewiseComparator. Additionally, MetaBlock accesses are +// not recorded in the Statistics or for Read-Amplification. +class MetaBlockIter final : public BlockIter { + public: + MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } + void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, + bool block_contents_pinned) { + // Initializes the iterator with a BytewiseComparator and + // the raw key being a user key. 
+ InitializeBase(BytewiseComparator(), data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned); + raw_key_.SetIsUserKey(true); } - bool SeekForGetImpl(const Slice& target); + Slice value() const override { + assert(Valid()); + return value_; + } + + protected: + void SeekToFirstImpl() override; + void SeekToLastImpl() override; + void SeekImpl(const Slice& target) override; + void SeekForPrevImpl(const Slice& target) override; + void NextImpl() override; + void PrevImpl() override; }; class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} - Slice key() const override { - assert(Valid()); - return key_.GetKey(); - } // key_includes_seq, default true, means that the keys are in internal key // format. // value_is_full, default true, means that no delta encoding is // applied to values. - void Initialize(const Comparator* comparator, - const Comparator* user_comparator, const char* data, + void Initialize(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, bool have_first_key, bool key_includes_seq, bool value_is_full, bool block_contents_pinned) { - InitializeBase(key_includes_seq ? 
comparator : user_comparator, data, - restarts, num_restarts, kDisableGlobalSequenceNumber, - block_contents_pinned); - key_includes_seq_ = key_includes_seq; - key_.SetIsUserKey(!key_includes_seq_); + InitializeBase(raw_ucmp, data, restarts, num_restarts, + kDisableGlobalSequenceNumber, block_contents_pinned); + raw_key_.SetIsUserKey(!key_includes_seq); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; have_first_key_ = have_first_key; @@ -518,10 +648,8 @@ } Slice user_key() const override { - if (key_includes_seq_) { - return ExtractUserKey(key()); - } - return key(); + assert(Valid()); + return raw_key_.GetUserKey(); } IndexValue value() const override { @@ -538,6 +666,11 @@ } } + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + + protected: // IndexBlockIter follows a different contract for prefix iterator // from data iterators. // If prefix of the seek key `target` exists in the file, it must @@ -545,36 +678,28 @@ // If the prefix of `target` doesn't exist in the file, it can either // return the result of total order seek, or set both of Valid() = false // and status() = NotFound(). - void Seek(const Slice& target) override; + void SeekImpl(const Slice& target) override; - void SeekForPrev(const Slice&) override { + void SeekForPrevImpl(const Slice&) override { assert(false); current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::InvalidArgument( "RocksDB internal error: should never call SeekForPrev() on index " "blocks"); - key_.Clear(); + raw_key_.Clear(); value_.clear(); } - void Prev() override; - - void Next() override; + void PrevImpl() override; - void SeekToFirst() override; + void NextImpl() override; - void SeekToLast() override; + void SeekToFirstImpl() override; - void Invalidate(Status s) { InvalidateBase(s); } - - bool IsValuePinned() const override { - return global_seqno_state_ != nullptr ? 
false : BlockIter::IsValuePinned(); - } + void SeekToLastImpl() override; private: - // Key is in InternalKey format - bool key_includes_seq_; bool value_delta_encoded_; bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; @@ -613,19 +738,11 @@ bool* prefix_may_exist); inline int CompareBlockKey(uint32_t block_index, const Slice& target); - inline int Compare(const Slice& a, const Slice& b) const { - return comparator_->Compare(a, b); - } - - inline int Compare(const IterKey& ikey, const Slice& b) const { - return comparator_->Compare(ikey.GetKey(), b); - } - inline bool ParseNextIndexKey(); // When value_delta_encoded_ is enabled it decodes the value which is assumed // to be BlockHandle and put it to decoded_value_ - inline void DecodeCurrentValue(uint32_t shared); + inline void DecodeCurrentValue(bool is_shared); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -68,7 +68,7 @@ whole_key_filtering_(table_opt.whole_key_filtering), prev_prefix_start_(0), prev_prefix_size_(0), - num_added_(0) { + total_added_in_built_(0) { assert(policy_); } @@ -80,19 +80,22 @@ } } -void BlockBasedFilterBlockBuilder::Add(const Slice& key) { - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { - AddPrefix(key); +size_t BlockBasedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + start_.size(); +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key_without_ts) { + if (prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { + AddPrefix(key_without_ts); } if 
(whole_key_filtering_) { - AddKey(key); + AddKey(key_without_ts); } } // Add key to filter if needed inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { - num_added_++; start_.push_back(entries_.size()); entries_.append(key.data(), key.size()); } @@ -114,10 +117,12 @@ } } -Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, - Status* status) { - // In this impl we ignore BlockHandle +Slice BlockBasedFilterBlockBuilder::Finish( + const BlockHandle& /*tmp*/, Status* status, + std::unique_ptr* /* filter_data */) { + // In this impl we ignore BlockHandle and filter_data *status = Status::OK(); + if (!start_.empty()) { GenerateFilter(); } @@ -140,6 +145,7 @@ filter_offsets_.push_back(static_cast(result_.size())); return; } + total_added_in_built_ += num_entries; // Make list of keys from flattened key structure start_.push_back(entries_.size()); // Simplify length computation @@ -171,19 +177,20 @@ } std::unique_ptr BlockBasedFilterBlockReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context) { + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { assert(table); assert(table->get_rep()); assert(!pin || prefetch); CachableEntry filter_block; if (prefetch || !use_cache) { - const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), - use_cache, nullptr /* get_context */, - lookup_context, &filter_block); + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::unique_ptr(); } @@ -251,6 +258,7 @@ const Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -309,6 
+317,7 @@ GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, nullptr /* lookup_context */, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::string("Unable to retrieve filter block"); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -44,9 +44,14 @@ virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { + return start_.empty() && filter_offsets_.empty(); + } + virtual size_t EstimateEntriesAdded() override; + virtual Slice Finish( + const BlockHandle& tmp, Status* status, + std::unique_ptr* filter_data = nullptr) override; using FilterBlockBuilder::Finish; private: @@ -70,7 +75,7 @@ std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; - size_t num_added_; // Number of keys added + uint64_t total_added_in_built_; // Total keys added to filters built so far }; // A FilterBlockReader is used to parse filter from SST table. 
@@ -85,9 +90,9 @@ void operator=(const BlockBasedFilterBlockReader&) = delete; static std::unique_ptr Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context); + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return true; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_filter_block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -76,17 +76,26 @@ TEST_F(FilterBlockTest, SingleChunk) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.StartBlock(100); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); + builder.Add("bar"); builder.Add("bar"); builder.Add("box"); builder.StartBlock(200); builder.Add("box"); builder.StartBlock(300); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice(builder.Finish()); + // XXX: "bar" should only count once but is counted twice. This actually + // indicates a serious space usage bug in old block-based filter. Good + // that it is deprecated. + // "box" counts twice, because it's in distinct blocks. 
+ ASSERT_EQ(6, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -11,53 +11,61 @@ #include #include + +#include #include #include #include +#include #include #include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "cache/cache_reservation_manager.h" #include "db/dbformat.h" #include "index_builder.h" - +#include "logging/logging.h" +#include "memory/memory_allocator.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" - +#include "rocksdb/types.h" #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" +#include "table/block_based/block_like_traits.h" #include "table/block_based/filter_block.h" #include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/partitioned_filter_block.h" #include "table/format.h" +#include "table/meta_blocks.h" #include "table/table_builder.h" - -#include "memory/memory_allocator.h" #include "util/coding.h" #include "util/compression.h" 
-#include "util/crc32c.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/xxhash.h" +#include "util/work_queue.h" namespace ROCKSDB_NAMESPACE { extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTableOptions::IndexType IndexType; // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { +constexpr size_t kBlockTrailerSize = BlockBasedTable::kBlockTrailerSize; + // Create a filter block builder based on its type. FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, @@ -65,7 +73,7 @@ const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { const BlockBasedTableOptions& table_opt = context.table_options; - if (table_opt.filter_policy == nullptr) return nullptr; + assert(table_opt.filter_policy); // precondition FilterBitsBuilder* filter_bits_builder = BloomFilterPolicy::GetBuilderFromContext(context); @@ -76,8 +84,9 @@ if (table_opt.partition_filters) { assert(p_index_builder != nullptr); // Since after partition cut request from filter builder it takes time - // until index builder actully cuts the partition, we take the lower bound - // as partition size. + // until index builder actully cuts the partition, until the end of a + // data block potentially with many keys, we take the lower bound as + // partition size. assert(table_opt.block_size_deviation <= 100); auto partition_size = static_cast(((table_opt.metadata_block_size * @@ -102,48 +111,6 @@ return compressed_size < raw_size - (raw_size / 8u); } -bool CompressBlockInternal(const Slice& raw, - const CompressionInfo& compression_info, - uint32_t format_version, - std::string* compressed_output) { - // Will return compressed block contents if (1) the compression method is - // supported in this platform and (2) the compression rate is "good enough". 
- switch (compression_info.type()) { - case kSnappyCompression: - return Snappy_Compress(compression_info, raw.data(), raw.size(), - compressed_output); - case kZlibCompression: - return Zlib_Compress( - compression_info, - GetCompressFormatForVersion(kZlibCompression, format_version), - raw.data(), raw.size(), compressed_output); - case kBZip2Compression: - return BZip2_Compress( - compression_info, - GetCompressFormatForVersion(kBZip2Compression, format_version), - raw.data(), raw.size(), compressed_output); - case kLZ4Compression: - return LZ4_Compress( - compression_info, - GetCompressFormatForVersion(kLZ4Compression, format_version), - raw.data(), raw.size(), compressed_output); - case kLZ4HCCompression: - return LZ4HC_Compress( - compression_info, - GetCompressFormatForVersion(kLZ4HCCompression, format_version), - raw.data(), raw.size(), compressed_output); - case kXpressCompression: - return XPRESS_Compress(raw.data(), raw.size(), compressed_output); - case kZSTD: - case kZSTDNotFinalCompression: - return ZSTD_Compress(compression_info, raw.data(), raw.size(), - compressed_output); - default: - // Do not recognize this compression type - return false; - } -} - } // namespace // format_version is the block format as defined in include/rocksdb/table.h @@ -152,11 +119,9 @@ bool do_sample, std::string* compressed_output, std::string* sampled_output_fast, std::string* sampled_output_slow) { - *type = info.type(); - - if (info.type() == kNoCompression && !info.SampleForCompression()) { - return raw; - } + assert(type); + assert(compressed_output); + assert(compressed_output->empty()); // If requested, we sample one in every N block with a // fast and slow compression algorithm and report the stats. @@ -164,10 +129,10 @@ // enabling compression and they also get a hint about which // compression algorithm wil be beneficial. 
if (do_sample && info.SampleForCompression() && - Random::GetTLSInstance()->OneIn((int)info.SampleForCompression()) && - sampled_output_fast && sampled_output_slow) { + Random::GetTLSInstance()->OneIn( + static_cast(info.SampleForCompression()))) { // Sampling with a fast compression algorithm - if (LZ4_Supported() || Snappy_Supported()) { + if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) { CompressionType c = LZ4_Supported() ? kLZ4Compression : kSnappyCompression; CompressionContext context(c); @@ -176,33 +141,46 @@ CompressionDict::GetEmptyDict(), c, info.SampleForCompression()); - CompressBlockInternal(raw, info_tmp, format_version, sampled_output_fast); + CompressData(raw, info_tmp, GetCompressFormatForVersion(format_version), + sampled_output_fast); } // Sampling with a slow but high-compression algorithm - if (ZSTD_Supported() || Zlib_Supported()) { + if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) { CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression; CompressionContext context(c); CompressionOptions options; CompressionInfo info_tmp(options, context, CompressionDict::GetEmptyDict(), c, info.SampleForCompression()); - CompressBlockInternal(raw, info_tmp, format_version, sampled_output_slow); + + CompressData(raw, info_tmp, GetCompressFormatForVersion(format_version), + sampled_output_slow); } } - // Actually compress the data - if (*type != kNoCompression) { - if (CompressBlockInternal(raw, info, format_version, compressed_output) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } + if (info.type() == kNoCompression) { + *type = kNoCompression; + return raw; + } + + // Actually compress the data; if the compression method is not supported, + // or the compression fails etc., just fall back to uncompressed + if (!CompressData(raw, info, GetCompressFormatForVersion(format_version), + compressed_output)) { + *type = kNoCompression; + return raw; + } + + // Check 
the compression ratio; if it's not good enough, just fall back to + // uncompressed + if (!GoodCompressionRatio(compressed_output->size(), raw.size())) { + *type = kNoCompression; + return raw; } - // Compression method is not supported, or not good - // compression ratio, so just fall back to uncompressed form. - *type = kNoCompression; - return raw; + *type = info.type(); + return *compressed_output; } // kBlockBasedTableMagicNumber was picked by running @@ -240,9 +218,9 @@ return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. return; @@ -276,22 +254,18 @@ }; struct BlockBasedTableBuilder::Rep { - const ImmutableCFOptions ioptions; + const ImmutableOptions ioptions; const MutableCFOptions moptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFileWriter* file; - uint64_t offset = 0; - Status status; + std::atomic offset; size_t alignment; BlockBuilder data_block; - // Buffers uncompressed data blocks and keys to replay later. Needed when + // Buffers uncompressed data blocks to replay later. Needed when // compression dictionary is enabled so we can finalize the dictionary before // compressing any data blocks. - // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data - // blocks as it's redundant, but it's easier to implement for now. 
- std::vector>> - data_block_and_keys_buffers; + std::vector data_block_buffers; BlockBuilder range_del_block; InternalKeySliceTransform internal_prefix_transform; @@ -299,12 +273,18 @@ PartitionedIndexBuilder* p_index_builder_ = nullptr; std::string last_key; + const Slice* first_key_in_next_block = nullptr; CompressionType compression_type; uint64_t sample_for_compression; + std::atomic compressible_input_data_bytes; + std::atomic uncompressible_input_data_bytes; + std::atomic sampled_input_data_bytes; + std::atomic sampled_output_slow_data_bytes; + std::atomic sampled_output_fast_data_bytes; CompressionOptions compression_opts; std::unique_ptr compression_dict; - CompressionContext compression_ctx; - std::unique_ptr verify_ctx; + std::vector> compression_ctxs; + std::vector> verify_ctxs; std::unique_ptr verify_dict; size_t data_begin_offset = 0; @@ -335,77 +315,149 @@ kClosed, }; State state; - + // `kBuffered` state is allowed only as long as the buffering of uncompressed + // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`. 
+ uint64_t buffer_limit; + std::unique_ptr + compression_dict_buffer_cache_res_mgr; const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; - char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; - size_t compressed_cache_key_prefix_size; + OffsetableCacheKey base_cache_key; + const TableFileCreationReason reason; BlockHandle pending_handle; // Handle to add to index block std::string compressed_output; std::unique_ptr flush_block_policy; - int level_at_creation; - uint32_t column_family_id; - const std::string& column_family_name; - uint64_t creation_time = 0; - uint64_t oldest_key_time = 0; - const uint64_t target_file_size; - uint64_t file_creation_time = 0; std::vector> table_properties_collectors; - Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, - const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& icomparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t _column_family_id, WritableFileWriter* f, - const CompressionType _compression_type, - const uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, const bool skip_filters, - const int _level_at_creation, const std::string& _column_family_name, - const uint64_t _creation_time, const uint64_t _oldest_key_time, - const uint64_t _target_file_size, const uint64_t _file_creation_time) - : ioptions(_ioptions), - moptions(_moptions), + std::unique_ptr pc_rep; + + uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } + void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } + + bool IsParallelCompressionEnabled() const { + return compression_opts.parallel_threads > 1; + } + + Status GetStatus() { + // We need to make modifications of status visible when status_ok is set + // to false, and this is ensured by status_mutex, so no special memory + // order for status_ok is required. 
+ if (status_ok.load(std::memory_order_relaxed)) { + return Status::OK(); + } else { + return CopyStatus(); + } + } + + Status CopyStatus() { + std::lock_guard lock(status_mutex); + return status; + } + + IOStatus GetIOStatus() { + // We need to make modifications of io_status visible when status_ok is set + // to false, and this is ensured by io_status_mutex, so no special memory + // order for io_status_ok is required. + if (io_status_ok.load(std::memory_order_relaxed)) { + return IOStatus::OK(); + } else { + return CopyIOStatus(); + } + } + + IOStatus CopyIOStatus() { + std::lock_guard lock(io_status_mutex); + return io_status; + } + + // Never erase an existing status that is not OK. + void SetStatus(Status s) { + if (!s.ok() && status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard lock(status_mutex); + status = s; + status_ok.store(false, std::memory_order_relaxed); + } + } + + // Never erase an existing I/O status that is not OK. + void SetIOStatus(IOStatus ios) { + if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) { + // Locking is an overkill for non compression_opts.parallel_threads + // case but since it's unlikely that s is not OK, we take this cost + // to be simplicity. + std::lock_guard lock(io_status_mutex); + io_status = ios; + io_status_ok.store(false, std::memory_order_relaxed); + } + } + + Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, + WritableFileWriter* f) + : ioptions(tbo.ioptions), + moptions(tbo.moptions), table_options(table_opt), - internal_comparator(icomparator), + internal_comparator(tbo.internal_comparator), file(f), + offset(0), alignment(table_options.block_align - ? std::min(table_options.block_size, kDefaultPageSize) + ? 
std::min(static_cast(table_options.block_size), + kDefaultPageSize) : 0), data_block(table_options.block_restart_interval, table_options.use_delta_encoding, false /* use_value_delta_encoding */, - icomparator.user_comparator() + tbo.internal_comparator.user_comparator() ->CanKeysWithDifferentByteContentsBeEqual() ? BlockBasedTableOptions::kDataBlockBinarySearch : table_options.data_block_index_type, table_options.data_block_hash_table_util_ratio), range_del_block(1 /* block_restart_interval */), - internal_prefix_transform(_moptions.prefix_extractor.get()), - compression_type(_compression_type), - sample_for_compression(_sample_for_compression), - compression_opts(_compression_opts), + internal_prefix_transform(tbo.moptions.prefix_extractor.get()), + compression_type(tbo.compression_type), + sample_for_compression(tbo.moptions.sample_for_compression), + compressible_input_data_bytes(0), + uncompressible_input_data_bytes(0), + sampled_input_data_bytes(0), + sampled_output_slow_data_bytes(0), + sampled_output_fast_data_bytes(0), + compression_opts(tbo.compression_opts), compression_dict(), - compression_ctx(_compression_type), + compression_ctxs(tbo.compression_opts.parallel_threads), + verify_ctxs(tbo.compression_opts.parallel_threads), verify_dict(), - state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered - : State::kUnbuffered), + state((tbo.compression_opts.max_dict_bytes > 0) ? 
State::kBuffered + : State::kUnbuffered), use_delta_encoding_for_index_values(table_opt.format_version >= 4 && !table_opt.block_align), - compressed_cache_key_prefix_size(0), + reason(tbo.reason), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), - level_at_creation(_level_at_creation), - column_family_id(_column_family_id), - column_family_name(_column_family_name), - creation_time(_creation_time), - oldest_key_time(_oldest_key_time), - target_file_size(_target_file_size), - file_creation_time(_file_creation_time) { + status_ok(true), + io_status_ok(true) { + if (tbo.target_file_size == 0) { + buffer_limit = compression_opts.max_dict_buffer_bytes; + } else if (compression_opts.max_dict_buffer_bytes == 0) { + buffer_limit = tbo.target_file_size; + } else { + buffer_limit = std::min(tbo.target_file_size, + compression_opts.max_dict_buffer_bytes); + } + if (table_options.no_block_cache || table_options.block_cache == nullptr) { + compression_dict_buffer_cache_res_mgr.reset(nullptr); + } else { + compression_dict_buffer_cache_res_mgr.reset( + new CacheReservationManager(table_options.block_cache)); + } + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + compression_ctxs[i].reset(new CompressionContext(compression_type)); + } if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( @@ -418,57 +470,407 @@ &this->internal_prefix_transform, use_delta_encoding_for_index_values, table_options)); } - if (skip_filters) { - filter_builder = nullptr; + if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { + // Apply optimize_filters_for_hits setting here when applicable by + // skipping filter generation + filter_builder.reset(); + } else if (tbo.skip_filters) { + // For SstFileWriter skip_filters + filter_builder.reset(); + } else if (!table_options.filter_policy) { + // Null filter_policy -> no filter 
+ filter_builder.reset(); } else { - FilterBuildingContext context(table_options); - context.column_family_name = column_family_name; - context.compaction_style = ioptions.compaction_style; - context.level_at_creation = level_at_creation; - context.info_log = ioptions.info_log; + FilterBuildingContext filter_context(table_options); + + filter_context.info_log = ioptions.logger; + filter_context.column_family_name = tbo.column_family_name; + filter_context.reason = reason; + + // Only populate other fields if known to be in LSM rather than + // generating external SST file + if (reason != TableFileCreationReason::kMisc) { + filter_context.compaction_style = ioptions.compaction_style; + filter_context.num_levels = ioptions.num_levels; + filter_context.level_at_creation = tbo.level_at_creation; + filter_context.is_bottommost = tbo.is_bottommost; + assert(filter_context.level_at_creation < filter_context.num_levels); + } + filter_builder.reset(CreateFilterBlockBuilder( - ioptions, moptions, context, use_delta_encoding_for_index_values, - p_index_builder_)); + ioptions, moptions, filter_context, + use_delta_encoding_for_index_values, p_index_builder_)); } - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + assert(tbo.int_tbl_prop_collector_factories); + for (auto& factory : *tbo.int_tbl_prop_collector_factories) { + assert(factory); + table_properties_collectors.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + factory->CreateIntTblPropCollector(tbo.column_family_id, + tbo.level_at_creation)); } table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( table_options.index_type, table_options.whole_key_filtering, - _moptions.prefix_extractor != nullptr)); + moptions.prefix_extractor != nullptr)); + const Comparator* ucmp = tbo.internal_comparator.user_comparator(); + assert(ucmp); + if (ucmp->timestamp_size() > 0) { + table_properties_collectors.emplace_back( + new 
TimestampTablePropertiesCollector(ucmp)); + } if (table_options.verify_compression) { - verify_ctx.reset(new UncompressionContext(UncompressionContext::NoCache(), - compression_type)); + for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { + verify_ctxs[i].reset(new UncompressionContext(compression_type)); + } + } + + // These are only needed for populating table properties + props.column_family_id = tbo.column_family_id; + props.column_family_name = tbo.column_family_name; + props.creation_time = tbo.creation_time; + props.oldest_key_time = tbo.oldest_key_time; + props.file_creation_time = tbo.file_creation_time; + props.orig_file_number = tbo.cur_file_num; + props.db_id = tbo.db_id; + props.db_session_id = tbo.db_session_id; + props.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); } } Rep(const Rep&) = delete; Rep& operator=(const Rep&) = delete; - ~Rep() {} + private: + // Synchronize status & io_status accesses across threads from main thread, + // compression thread and write thread in parallel compression. 
+ std::mutex status_mutex; + std::atomic status_ok; + Status status; + std::mutex io_status_mutex; + std::atomic io_status_ok; + IOStatus io_status; +}; + +struct BlockBasedTableBuilder::ParallelCompressionRep { + // Keys is a wrapper of vector of strings avoiding + // releasing string memories during vector clear() + // in order to save memory allocation overhead + class Keys { + public: + Keys() : keys_(kKeysInitSize), size_(0) {} + void PushBack(const Slice& key) { + if (size_ == keys_.size()) { + keys_.emplace_back(key.data(), key.size()); + } else { + keys_[size_].assign(key.data(), key.size()); + } + size_++; + } + void SwapAssign(std::vector& keys) { + size_ = keys.size(); + std::swap(keys_, keys); + } + void Clear() { size_ = 0; } + size_t Size() { return size_; } + std::string& Back() { return keys_[size_ - 1]; } + std::string& operator[](size_t idx) { + assert(idx < size_); + return keys_[idx]; + } + + private: + const size_t kKeysInitSize = 32; + std::vector keys_; + size_t size_; + }; + std::unique_ptr curr_block_keys; + + class BlockRepSlot; + + // BlockRep instances are fetched from and recycled to + // block_rep_pool during parallel compression. + struct BlockRep { + Slice contents; + Slice compressed_contents; + std::unique_ptr data; + std::unique_ptr compressed_data; + CompressionType compression_type; + std::unique_ptr first_key_in_next_block; + std::unique_ptr keys; + std::unique_ptr slot; + Status status; + }; + // Use a vector of BlockRep as a buffer for a determined number + // of BlockRep structures. All data referenced by pointers in + // BlockRep will be freed when this vector is destructed. + using BlockRepBuffer = std::vector; + BlockRepBuffer block_rep_buf; + // Use a thread-safe queue for concurrent access from block + // building thread and writer thread. + using BlockRepPool = WorkQueue; + BlockRepPool block_rep_pool; + + // Use BlockRepSlot to keep block order in write thread. 
+ // slot_ will pass references to BlockRep + class BlockRepSlot { + public: + BlockRepSlot() : slot_(1) {} + template + void Fill(T&& rep) { + slot_.push(std::forward(rep)); + }; + void Take(BlockRep*& rep) { slot_.pop(rep); } + + private: + // slot_ will pass references to BlockRep in block_rep_buf, + // and those references are always valid before the destruction of + // block_rep_buf. + WorkQueue slot_; + }; + + // Compression queue will pass references to BlockRep in block_rep_buf, + // and those references are always valid before the destruction of + // block_rep_buf. + using CompressQueue = WorkQueue; + CompressQueue compress_queue; + std::vector compress_thread_pool; + + // Write queue will pass references to BlockRep::slot in block_rep_buf, + // and those references are always valid before the corresponding + // BlockRep::slot is destructed, which is before the destruction of + // block_rep_buf. + using WriteQueue = WorkQueue; + WriteQueue write_queue; + std::unique_ptr write_thread; + + // Estimate output file size when parallel compression is enabled. This is + // necessary because compression & flush are no longer synchronized, + // and BlockBasedTableBuilder::FileSize() is no longer accurate. + // memory_order_relaxed suffices because accurate statistics is not required. 
+ class FileSizeEstimator { + public: + explicit FileSizeEstimator() + : raw_bytes_compressed(0), + raw_bytes_curr_block(0), + raw_bytes_curr_block_set(false), + raw_bytes_inflight(0), + blocks_inflight(0), + curr_compression_ratio(0), + estimated_file_size(0) {} + + // Estimate file size when a block is about to be emitted to + // compression thread + void EmitBlock(uint64_t raw_block_size, uint64_t curr_file_size) { + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_add(raw_block_size, + std::memory_order_relaxed) + + raw_block_size; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + } + + // Estimate file size when a block is already reaped from + // compression thread + void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) { + assert(raw_bytes_curr_block_set); + + uint64_t new_raw_bytes_compressed = + raw_bytes_compressed + raw_bytes_curr_block; + assert(new_raw_bytes_compressed > 0); + + curr_compression_ratio.store( + (curr_compression_ratio.load(std::memory_order_relaxed) * + raw_bytes_compressed + + compressed_block_size) / + static_cast(new_raw_bytes_compressed), + std::memory_order_relaxed); + raw_bytes_compressed = new_raw_bytes_compressed; + + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_sub(raw_bytes_curr_block, + std::memory_order_relaxed) - + raw_bytes_curr_block; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + + raw_bytes_curr_block_set = 
false; + } + + void SetEstimatedFileSize(uint64_t size) { + estimated_file_size.store(size, std::memory_order_relaxed); + } + + uint64_t GetEstimatedFileSize() { + return estimated_file_size.load(std::memory_order_relaxed); + } + + void SetCurrBlockRawSize(uint64_t size) { + raw_bytes_curr_block = size; + raw_bytes_curr_block_set = true; + } + + private: + // Raw bytes compressed so far. + uint64_t raw_bytes_compressed; + // Size of current block being appended. + uint64_t raw_bytes_curr_block; + // Whether raw_bytes_curr_block has been set for next + // ReapBlock call. + bool raw_bytes_curr_block_set; + // Raw bytes under compression and not appended yet. + std::atomic raw_bytes_inflight; + // Number of blocks under compression and not appended yet. + std::atomic blocks_inflight; + // Current compression ratio, maintained by BGWorkWriteRawBlock. + std::atomic curr_compression_ratio; + // Estimated SST file size. + std::atomic estimated_file_size; + }; + FileSizeEstimator file_size_estimator; + + // Facilities used for waiting first block completion. Need to Wait for + // the completion of first block compression and flush to get a non-zero + // compression ratio. 
+ std::atomic first_block_processed; + std::condition_variable first_block_cond; + std::mutex first_block_mutex; + + explicit ParallelCompressionRep(uint32_t parallel_threads) + : curr_block_keys(new Keys()), + block_rep_buf(parallel_threads), + block_rep_pool(parallel_threads), + compress_queue(parallel_threads), + write_queue(parallel_threads), + first_block_processed(false) { + for (uint32_t i = 0; i < parallel_threads; i++) { + block_rep_buf[i].contents = Slice(); + block_rep_buf[i].compressed_contents = Slice(); + block_rep_buf[i].data.reset(new std::string()); + block_rep_buf[i].compressed_data.reset(new std::string()); + block_rep_buf[i].compression_type = CompressionType(); + block_rep_buf[i].first_key_in_next_block.reset(new std::string()); + block_rep_buf[i].keys.reset(new Keys()); + block_rep_buf[i].slot.reset(new BlockRepSlot()); + block_rep_buf[i].status = Status::OK(); + block_rep_pool.push(&block_rep_buf[i]); + } + } + + ~ParallelCompressionRep() { block_rep_pool.finish(); } + + // Make a block prepared to be emitted to compression thread + // Used in non-buffered mode + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + BlockBuilder* data_block) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + data_block->SwapAndReset(*(block_rep->data)); + block_rep->contents = *(block_rep->data); + std::swap(block_rep->keys, curr_block_keys); + curr_block_keys->Clear(); + return block_rep; + } + + // Used in EnterUnbuffered + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + std::string* data_block, + std::vector* keys) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + std::swap(*(block_rep->data), *data_block); + block_rep->contents = *(block_rep->data); + block_rep->keys->SwapAssign(*keys); + return block_rep; + } 
+ + // Emit a block to compression thread + void EmitBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + assert(block_rep->status.ok()); + if (!write_queue.push(block_rep->slot.get())) { + return; + } + if (!compress_queue.push(block_rep)) { + return; + } + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::unique_lock lock(first_block_mutex); + first_block_cond.wait(lock, [this] { + return first_block_processed.load(std::memory_order_relaxed); + }); + } + } + + // Reap a block from compression thread + void ReapBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + block_rep->compressed_data->clear(); + block_rep_pool.push(block_rep); + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::lock_guard lock(first_block_mutex); + first_block_processed.store(true, std::memory_order_relaxed); + first_block_cond.notify_one(); + } + } + + private: + BlockRep* PrepareBlockInternal(CompressionType compression_type, + const Slice* first_key_in_next_block) { + BlockRep* block_rep = nullptr; + block_rep_pool.pop(block_rep); + assert(block_rep != nullptr); + + assert(block_rep->data); + + block_rep->compression_type = compression_type; + + if (first_key_in_next_block == nullptr) { + block_rep->first_key_in_next_block.reset(nullptr); + } else { + block_rep->first_key_in_next_block->assign( + first_key_in_next_block->data(), first_key_in_next_block->size()); + } + + return block_rep; + } }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int 
level_at_creation, - const uint64_t creation_time, const uint64_t oldest_key_time, - const uint64_t target_file_size, const uint64_t file_creation_time) { + const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo, + WritableFileWriter* file) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { ROCKS_LOG_WARN( - ioptions.info_log, + tbo.ioptions.logger, "Silently converting format_version to 1 because checksum is " "non-default"); // silently convert format_version to 1 to keep consistent with current @@ -476,21 +878,25 @@ sanitized_table_options.format_version = 1; } - rep_ = new Rep(ioptions, moptions, sanitized_table_options, - internal_comparator, int_tbl_prop_collector_factories, - column_family_id, file, compression_type, - sample_for_compression, compression_opts, skip_filters, - level_at_creation, column_family_name, creation_time, - oldest_key_time, target_file_size, file_creation_time); + rep_ = new Rep(sanitized_table_options, tbo, file); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); } - if (table_options.block_cache_compressed.get() != nullptr) { - BlockBasedTable::GenerateCachePrefix( - table_options.block_cache_compressed.get(), file->writable_file(), - &rep_->compressed_cache_key_prefix[0], - &rep_->compressed_cache_key_prefix_size); + + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey", + const_cast(&rep_->props)); + + // Extremely large files use atypical cache key encoding, and we don't + // know ahead of time how big the file will be. But assuming it's less + // than 4TB, we will correctly predict the cache keys. 
+ BlockBasedTable::SetupBaseCacheKey( + &rep_->props, tbo.db_session_id, tbo.cur_file_num, + BlockBasedTable::kMaxFileSizeStandardEncoding, &rep_->base_cache_key); + + if (rep_->IsParallelCompressionEnabled()) { + StartParallelCompression(); } } @@ -510,16 +916,33 @@ if (r->props.num_entries > r->props.num_range_deletions) { assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); } -#endif // NDEBUG +#endif // !NDEBUG auto should_flush = r->flush_block_policy->Update(key, value); if (should_flush) { assert(!r->data_block.empty()); + r->first_key_in_next_block = &key; Flush(); + if (r->state == Rep::State::kBuffered) { + bool exceeds_buffer_limit = + (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit); + bool exceeds_global_block_cache_limit = false; + + // Increase cache reservation for the last buffered data block + // only if the block is not going to be unbuffered immediately + // and there exists a cache reservation manager + if (!exceeds_buffer_limit && + r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = + r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>( + r->data_begin_offset); + exceeds_global_block_cache_limit = s.IsIncomplete(); + } - if (r->state == Rep::State::kBuffered && - r->data_begin_offset > r->target_file_size) { - EnterUnbuffered(); + if (exceeds_buffer_limit || exceeds_global_block_cache_limit) { + EnterUnbuffered(); + } } // Add item to index block. @@ -531,38 +954,50 @@ // entries in the first block and < all entries in subsequent // blocks. 
if (ok() && r->state == Rep::State::kUnbuffered) { - r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle); + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->Clear(); + } else { + r->index_builder->AddIndexEntry(&r->last_key, &key, + r->pending_handle); + } } } // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. - if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { - size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + if (r->state == Rep::State::kUnbuffered) { + if (r->IsParallelCompressionEnabled()) { + r->pc_rep->curr_block_keys->PushBack(key); + } else { + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + } } + r->data_block.AddWithLastKey(key, value, r->last_key); r->last_key.assign(key.data(), key.size()); - r->data_block.Add(key, value); if (r->state == Rep::State::kBuffered) { - // Buffer keys to be replayed during `Finish()` once compression - // dictionary has been finalized. - if (r->data_block_and_keys_buffers.empty() || should_flush) { - r->data_block_and_keys_buffers.emplace_back(); - } - r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + // Buffered keys will be replayed from data_block_buffers during + // `Finish()` once compression dictionary has been finalized. 
} else { - r->index_builder->OnKeyAdded(key); + if (!r->IsParallelCompressionEnabled()) { + r->index_builder->OnKeyAdded(key); + } } - NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else if (value_type == kTypeRangeDeletion) { r->range_del_block.Add(key, value); - NotifyCollectTableCollectorsOnAdd(key, value, r->offset, + // TODO offset passed in is not accurate for parallel compression case + NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else { assert(false); } @@ -585,44 +1020,108 @@ assert(rep_->state != Rep::State::kClosed); if (!ok()) return; if (r->data_block.empty()) return; - WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); + if (r->IsParallelCompressionEnabled() && + r->state == Rep::State::kUnbuffered) { + r->data_block.Finish(); + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, r->first_key_in_next_block, &(r->data_block)); + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData); + } } void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle, - bool is_data_block) { - WriteBlock(block->Finish(), handle, is_data_block); - block->Reset(); + BlockType block_type) { + block->Finish(); + std::string raw_block_contents; + raw_block_contents.reserve(rep_->table_options.block_size); + block->SwapAndReset(raw_block_contents); + if (rep_->state == Rep::State::kBuffered) { + assert(block_type == BlockType::kData); + 
rep_->data_block_buffers.emplace_back(std::move(raw_block_contents)); + rep_->data_begin_offset += rep_->data_block_buffers.back().size(); + return; + } + WriteBlock(raw_block_contents, handle, block_type); } void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, BlockHandle* handle, - bool is_data_block) { + BlockType block_type) { + Rep* r = rep_; + assert(r->state == Rep::State::kUnbuffered); + Slice block_contents; + CompressionType type; + Status compress_status; + bool is_data_block = block_type == BlockType::kData; + CompressAndVerifyBlock(raw_block_contents, is_data_block, + *(r->compression_ctxs[0]), r->verify_ctxs[0].get(), + &(r->compressed_output), &(block_contents), &type, + &compress_status); + r->SetStatus(compress_status); + if (!ok()) { + return; + } + + WriteRawBlock(block_contents, type, handle, block_type, &raw_block_contents); + r->compressed_output.clear(); + if (is_data_block) { + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->get_offset()); + } + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + } +} + +void BlockBasedTableBuilder::BGWorkCompression( + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx) { + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (rep_->pc_rep->compress_queue.pop(block_rep)) { + assert(block_rep != nullptr); + CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/ + compression_ctx, verify_ctx, + block_rep->compressed_data.get(), + &block_rep->compressed_contents, + &(block_rep->compression_type), &block_rep->status); + block_rep->slot->Fill(block_rep); + } +} + +void BlockBasedTableBuilder::CompressAndVerifyBlock( + const Slice& raw_block_contents, bool is_data_block, + const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, + std::string* compressed_output, Slice* block_contents, + CompressionType* type, Status* out_status) { // File format contains a sequence of blocks where 
each block has: // block_data: uint8[n] // type: uint8 // crc: uint32 - assert(ok()); Rep* r = rep_; + bool is_status_ok = ok(); + if (!r->IsParallelCompressionEnabled()) { + assert(is_status_ok); + } - auto type = r->compression_type; + *type = r->compression_type; uint64_t sample_for_compression = r->sample_for_compression; - Slice block_contents; bool abort_compression = false; StopWatchNano timer( - r->ioptions.env, - ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); - - if (r->state == Rep::State::kBuffered) { - assert(is_data_block); - assert(!r->data_block_and_keys_buffers.empty()); - r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); - r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); - return; - } + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); - if (raw_block_contents.size() < kCompressionSizeLimit) { + if (is_status_ok && raw_block_contents.size() < kCompressionSizeLimit) { + if (is_data_block) { + r->compressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } const CompressionDict* compression_dict; if (!is_data_block || r->compression_dict == nullptr) { compression_dict = &CompressionDict::GetEmptyDict(); @@ -630,17 +1129,27 @@ compression_dict = r->compression_dict.get(); } assert(compression_dict != nullptr); - CompressionInfo compression_info(r->compression_opts, r->compression_ctx, - *compression_dict, type, + CompressionInfo compression_info(r->compression_opts, compression_ctx, + *compression_dict, *type, sample_for_compression); std::string sampled_output_fast; std::string sampled_output_slow; - block_contents = CompressBlock( - raw_block_contents, compression_info, &type, + *block_contents = CompressBlock( + raw_block_contents, compression_info, type, r->table_options.format_version, is_data_block /* do_sample */, - &r->compressed_output, &sampled_output_fast, &sampled_output_slow); + 
compressed_output, &sampled_output_fast, &sampled_output_slow); + if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { + // Currently compression sampling is only enabled for data block. + assert(is_data_block); + r->sampled_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(), + std::memory_order_relaxed); + r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(), + std::memory_order_relaxed); + } // notify collectors on block add NotifyCollectTableCollectorsOnBlockAdd( r->table_properties_collectors, raw_block_contents.size(), @@ -649,7 +1158,7 @@ // Some of the compression algorithms are known to be unreliable. If // the verify_compression flag is set then try to de-compress the // compressed data and compare to the input. - if (type != kNoCompression && r->table_options.verify_compression) { + if (*type != kNoCompression && r->table_options.verify_compression) { // Retrieve the uncompressed contents into a new buffer const UncompressionDict* verify_dict; if (!is_data_block || r->verify_dict == nullptr) { @@ -659,10 +1168,10 @@ } assert(verify_dict != nullptr); BlockContents contents; - UncompressionInfo uncompression_info(*r->verify_ctx, *verify_dict, + UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, r->compression_type); Status stat = UncompressBlockContentsForCompressionType( - uncompression_info, block_contents.data(), block_contents.size(), + uncompression_info, block_contents->data(), block_contents->size(), &contents, r->table_options.format_version, r->ioptions); if (stat.ok()) { @@ -670,140 +1179,250 @@ if (!compressed_ok) { // The result of the compression was invalid. abort. 
abort_compression = true; - ROCKS_LOG_ERROR(r->ioptions.info_log, + ROCKS_LOG_ERROR(r->ioptions.logger, "Decompressed block did not match raw block"); - r->status = + *out_status = Status::Corruption("Decompressed block did not match raw block"); } } else { // Decompression reported an error. abort. - r->status = Status::Corruption("Could not decompress"); + *out_status = Status::Corruption(std::string("Could not decompress: ") + + stat.getState()); abort_compression = true; } } } else { // Block is too big to be compressed. + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } abort_compression = true; } + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, + std::memory_order_relaxed); + } // Abort compression if the block is too big, or did not pass // verification. if (abort_compression) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); - type = kNoCompression; - block_contents = raw_block_contents; - } else if (type != kNoCompression) { - if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { - RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); + *type = kNoCompression; + *block_contents = raw_block_contents; + } else if (*type != kNoCompression) { + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED, raw_block_contents.size()); - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); - } else if (type != r->compression_type) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); - } - - WriteRawBlock(block_contents, type, handle, is_data_block); - r->compressed_output.clear(); - if 
(is_data_block) { - if (r->filter_builder != nullptr) { - r->filter_builder->StartBlock(r->offset); - } - r->props.data_size = r->offset; - ++r->props.num_data_blocks; + RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED); + } else if (*type != r->compression_type) { + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); } } void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle, - bool is_data_block) { + BlockType block_type, + const Slice* raw_block_contents, + bool is_top_level_filter_block) { Rep* r = rep_; - StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); - handle->set_offset(r->offset); + bool is_data_block = block_type == BlockType::kData; + Status s = Status::OK(); + IOStatus io_s = IOStatus::OK(); + StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); + handle->set_offset(r->get_offset()); handle->set_size(block_contents.size()); - assert(r->status.ok()); - r->status = r->file->Append(block_contents); - if (r->status.ok()) { - char trailer[kBlockTrailerSize]; + assert(status().ok()); + assert(io_status().ok()); + io_s = r->file->Append(block_contents); + if (io_s.ok()) { + std::array trailer; trailer[0] = type; - char* trailer_without_type = trailer + 1; - switch (r->table_options.checksum) { - case kNoChecksum: - EncodeFixed32(trailer_without_type, 0); - break; - case kCRC32c: { - auto crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, crc32c::Mask(crc)); - break; - } - case kxxHash: { - XXH32_state_t* const state = XXH32_createState(); - XXH32_reset(state, 0); - XXH32_update(state, block_contents.data(), - static_cast(block_contents.size())); - XXH32_update(state, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, XXH32_digest(state)); - XXH32_freeState(state); - break; - } - case 
kxxHash64: { - XXH64_state_t* const state = XXH64_createState(); - XXH64_reset(state, 0); - XXH64_update(state, block_contents.data(), - static_cast(block_contents.size())); - XXH64_update(state, trailer, 1); // Extend to cover block type - EncodeFixed32( - trailer_without_type, - static_cast(XXH64_digest(state) & // lower 32 bits - uint64_t{0xffffffff})); - XXH64_freeState(state); - break; - } - } + uint32_t checksum = ComputeBuiltinChecksumWithLastByte( + r->table_options.checksum, block_contents.data(), block_contents.size(), + /*last_byte*/ type); + EncodeFixed32(trailer.data() + 1, checksum); - assert(r->status.ok()); + assert(io_s.ok()); TEST_SYNC_POINT_CALLBACK( "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", - static_cast(trailer)); - r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); - if (r->status.ok()) { - r->status = InsertBlockInCache(block_contents, type, handle); + trailer.data()); + io_s = r->file->Append(Slice(trailer.data(), trailer.size())); + if (io_s.ok()) { + assert(s.ok()); + bool warm_cache; + switch (r->table_options.prepopulate_block_cache) { + case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly: + warm_cache = (r->reason == TableFileCreationReason::kFlush); + break; + case BlockBasedTableOptions::PrepopulateBlockCache::kDisable: + warm_cache = false; + break; + default: + // missing case + assert(false); + warm_cache = false; + } + if (warm_cache) { + if (type == kNoCompression) { + s = InsertBlockInCacheHelper(block_contents, handle, block_type, + is_top_level_filter_block); + } else if (raw_block_contents != nullptr) { + s = InsertBlockInCacheHelper(*raw_block_contents, handle, block_type, + is_top_level_filter_block); + } + if (!s.ok()) { + r->SetStatus(s); + } + } + // TODO:: Should InsertBlockInCompressedCache take into account error from + // InsertBlockInCache or ignore and overwrite it. 
+ s = InsertBlockInCompressedCache(block_contents, type, handle); + if (!s.ok()) { + r->SetStatus(s); + } + } else { + r->SetIOStatus(io_s); } - if (r->status.ok()) { - r->offset += block_contents.size() + kBlockTrailerSize; + if (s.ok() && io_s.ok()) { + r->set_offset(r->get_offset() + block_contents.size() + + kBlockTrailerSize); if (r->table_options.block_align && is_data_block) { size_t pad_bytes = (r->alignment - ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) & (r->alignment - 1); - r->status = r->file->Pad(pad_bytes); - if (r->status.ok()) { - r->offset += pad_bytes; + io_s = r->file->Pad(pad_bytes); + if (io_s.ok()) { + r->set_offset(r->get_offset() + pad_bytes); + } else { + r->SetIOStatus(io_s); } } + if (r->IsParallelCompressionEnabled()) { + if (is_data_block) { + r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(), + r->get_offset()); + } else { + r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset()); + } + } + } + } else { + r->SetIOStatus(io_s); + } + if (!io_s.ok() && s.ok()) { + r->SetStatus(io_s); + } +} + +void BlockBasedTableBuilder::BGWorkWriteRawBlock() { + Rep* r = rep_; + ParallelCompressionRep::BlockRepSlot* slot = nullptr; + ParallelCompressionRep::BlockRep* block_rep = nullptr; + while (r->pc_rep->write_queue.pop(slot)) { + assert(slot != nullptr); + slot->Take(block_rep); + assert(block_rep != nullptr); + if (!block_rep->status.ok()) { + r->SetStatus(block_rep->status); + // Reap block so that blocked Flush() can finish + // if there is one, and Flush() will notice !ok() next time. 
+ block_rep->status = Status::OK(); + r->pc_rep->ReapBlock(block_rep); + continue; + } + + for (size_t i = 0; i < block_rep->keys->Size(); i++) { + auto& key = (*block_rep->keys)[i]; + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + + r->pc_rep->file_size_estimator.SetCurrBlockRawSize(block_rep->data->size()); + WriteRawBlock(block_rep->compressed_contents, block_rep->compression_type, + &r->pending_handle, BlockType::kData, &block_rep->contents); + if (!ok()) { + break; + } + + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->get_offset()); + } + r->props.data_size = r->get_offset(); + ++r->props.num_data_blocks; + + if (block_rep->first_key_in_next_block == nullptr) { + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), nullptr, + r->pending_handle); + } else { + Slice first_key_in_next_block = + Slice(*block_rep->first_key_in_next_block); + r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), + &first_key_in_next_block, + r->pending_handle); } + + r->pc_rep->ReapBlock(block_rep); + } +} + +void BlockBasedTableBuilder::StartParallelCompression() { + rep_->pc_rep.reset( + new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); + rep_->pc_rep->compress_thread_pool.reserve( + rep_->compression_opts.parallel_threads); + for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { + rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { + BGWorkCompression(*(rep_->compression_ctxs[i]), + rep_->verify_ctxs[i].get()); + }); + } + rep_->pc_rep->write_thread.reset( + new port::Thread([this] { BGWorkWriteRawBlock(); })); +} + +void BlockBasedTableBuilder::StopParallelCompression() { + rep_->pc_rep->compress_queue.finish(); + for (auto& thread : rep_->pc_rep->compress_thread_pool) { + thread.join(); } + 
rep_->pc_rep->write_queue.finish(); + rep_->pc_rep->write_thread->join(); } -Status BlockBasedTableBuilder::status() const { return rep_->status; } +Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); } -static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { - BlockContents* bc = reinterpret_cast(value); - delete bc; +IOStatus BlockBasedTableBuilder::io_status() const { + return rep_->GetIOStatus(); } +namespace { +// Delete the entry resided in the cache. +template +void DeleteEntryCached(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} +} // namespace + // // Make a copy of the block contents and insert into compressed block cache // -Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, - const CompressionType type, - const BlockHandle* handle) { +Status BlockBasedTableBuilder::InsertBlockInCompressedCache( + const Slice& block_contents, const CompressionType type, + const BlockHandle* handle) { Rep* r = rep_; Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); - + Status s; if (type != kNoCompression && block_cache_compressed != nullptr) { size_t size = block_contents.size(); @@ -818,39 +1437,133 @@ block_contents_to_cache->is_raw_block = true; #endif // NDEBUG - // make cache key by appending the file offset to the cache prefix id - char* end = EncodeVarint64( - r->compressed_cache_key_prefix + r->compressed_cache_key_prefix_size, - handle->offset()); - Slice key(r->compressed_cache_key_prefix, - static_cast(end - r->compressed_cache_key_prefix)); - - // Insert into compressed block cache. 
- block_cache_compressed->Insert( - key, block_contents_to_cache, - block_contents_to_cache->ApproximateMemoryUsage(), - &DeleteCachedBlockContents); + CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); + s = block_cache_compressed->Insert( + key.AsSlice(), block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteEntryCached); + if (s.ok()) { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } // Invalidate OS cache. - r->file->InvalidateCache(static_cast(r->offset), size); + r->file->InvalidateCache(static_cast(r->get_offset()), size) + .PermitUncheckedError(); + } + return s; +} + +Status BlockBasedTableBuilder::InsertBlockInCacheHelper( + const Slice& block_contents, const BlockHandle* handle, + BlockType block_type, bool is_top_level_filter_block) { + Status s; + if (block_type == BlockType::kData || block_type == BlockType::kIndex) { + s = InsertBlockInCache(block_contents, handle, block_type); + } else if (block_type == BlockType::kFilter) { + if (rep_->filter_builder->IsBlockBased()) { + // for block-based filter which is deprecated. + s = InsertBlockInCache(block_contents, handle, block_type); + } else if (is_top_level_filter_block) { + // for top level filter block in partitioned filter. + s = InsertBlockInCache(block_contents, handle, block_type); + } else { + // for second level partitioned filters and full filters. 
+ s = InsertBlockInCache(block_contents, handle, + block_type); + } + } else if (block_type == BlockType::kCompressionDictionary) { + s = InsertBlockInCache(block_contents, handle, + block_type); + } + return s; +} + +template +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const BlockHandle* handle, + BlockType block_type) { + // Uncompressed regular block cache + Cache* block_cache = rep_->table_options.block_cache.get(); + Status s; + if (block_cache != nullptr) { + size_t size = block_contents.size(); + auto buf = AllocateBlock(size, block_cache->memory_allocator()); + memcpy(buf.get(), block_contents.data(), size); + BlockContents results(std::move(buf), size); + + CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); + + const size_t read_amp_bytes_per_bit = + rep_->table_options.read_amp_bytes_per_bit; + + // TODO akanksha:: Dedup below code by calling + // BlockBasedTable::PutDataBlockToCache. + std::unique_ptr block_holder( + BlocklikeTraits::Create( + std::move(results), read_amp_bytes_per_bit, + rep_->ioptions.statistics.get(), + false /*rep_->blocks_definitely_zstd_compressed*/, + rep_->table_options.filter_policy.get())); + + assert(block_holder->own_bytes()); + size_t charge = block_holder->ApproximateMemoryUsage(); + s = block_cache->Insert( + key.AsSlice(), block_holder.get(), + BlocklikeTraits::GetCacheItemHelper(block_type), charge, + nullptr, Cache::Priority::LOW); + + if (s.ok()) { + // Release ownership of block_holder. 
+ block_holder.release(); + BlockBasedTable::UpdateCacheInsertionMetrics( + block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(), + rep_->ioptions.stats); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES); + } } - return Status::OK(); + return s; } void BlockBasedTableBuilder::WriteFilterBlock( MetaIndexBuilder* meta_index_builder) { BlockHandle filter_block_handle; - bool empty_filter_block = (rep_->filter_builder == nullptr || - rep_->filter_builder->NumAdded() == 0); + bool empty_filter_block = + (rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty()); if (ok() && !empty_filter_block) { + rep_->props.num_filter_entries += + rep_->filter_builder->EstimateEntriesAdded(); Status s = Status::Incomplete(); while (ok() && s.IsIncomplete()) { + // filter_data is used to store the transferred filter data payload from + // FilterBlockBuilder and deallocate the payload by going out of scope. + // Otherwise, the payload will unnecessarily remain until + // BlockBasedTableBuilder is deallocated. + // + // See FilterBlockBuilder::Finish() for more on the difference in + // transferred filter data payload among different FilterBlockBuilder + // subtypes. + std::unique_ptr filter_data; Slice filter_content = - rep_->filter_builder->Finish(filter_block_handle, &s); + rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data); assert(s.ok() || s.IsIncomplete()); rep_->props.filter_size += filter_content.size(); - WriteRawBlock(filter_content, kNoCompression, &filter_block_handle); + + // TODO: Refactor code so that BlockType can determine both the C++ type + // of a block cache entry (TBlocklike) and the CacheEntryRole while + // inserting blocks in cache. 
+ bool top_level_filter_block = false; + if (s.ok() && rep_->table_options.partition_filters && + !rep_->filter_builder->IsBlockBased()) { + top_level_filter_block = true; + } + WriteRawBlock(filter_content, kNoCompression, &filter_block_handle, + BlockType::kFilter, nullptr /*raw_contents*/, + top_level_filter_block); } + rep_->filter_builder->ResetFilterBitsBuilder(); } if (ok() && !empty_filter_block) { // Add mapping from ".Name" to location @@ -878,12 +1591,12 @@ // HashIndexBuilder which is not multi-partition. assert(index_blocks.meta_blocks.empty()); } else if (ok() && !index_builder_status.ok()) { - rep_->status = index_builder_status; + rep_->SetStatus(index_builder_status); } if (ok()) { for (const auto& item : index_blocks.meta_blocks) { BlockHandle block_handle; - WriteBlock(item.second, &block_handle, false /* is_data_block */); + WriteBlock(item.second, &block_handle, BlockType::kIndex); if (!ok()) { break; } @@ -892,27 +1605,39 @@ } if (ok()) { if (rep_->table_options.enable_index_compression) { - WriteBlock(index_blocks.index_block_contents, index_block_handle, false); + WriteBlock(index_blocks.index_block_contents, index_block_handle, + BlockType::kIndex); } else { WriteRawBlock(index_blocks.index_block_contents, kNoCompression, - index_block_handle); + index_block_handle, BlockType::kIndex); } } // If there are more index partitions, finish them and write them out - Status s = index_builder_status; - while (ok() && s.IsIncomplete()) { - s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); - if (!s.ok() && !s.IsIncomplete()) { - rep_->status = s; - return; - } - if (rep_->table_options.enable_index_compression) { - WriteBlock(index_blocks.index_block_contents, index_block_handle, false); - } else { - WriteRawBlock(index_blocks.index_block_contents, kNoCompression, - index_block_handle); + if (index_builder_status.IsIncomplete()) { + bool index_building_finished = false; + while (ok() && !index_building_finished) { + Status s = + 
rep_->index_builder->Finish(&index_blocks, *index_block_handle); + if (s.ok()) { + index_building_finished = true; + } else if (s.IsIncomplete()) { + // More partitioned index after this one + assert(!index_building_finished); + } else { + // Error + rep_->SetStatus(s); + return; + } + + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, + BlockType::kIndex); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle, BlockType::kIndex); + } + // The last index_block_handle will be for the partition index block } - // The last index_block_handle will be for the partition index block } } @@ -921,8 +1646,6 @@ BlockHandle properties_block_handle; if (ok()) { PropertyBlockBuilder property_block_builder; - rep_->props.column_family_id = rep_->column_family_id; - rep_->props.column_family_name = rep_->column_family_name; rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr ? rep_->table_options.filter_policy->Name() @@ -942,9 +1665,8 @@ CompressionOptionsToString(rep_->compression_opts); rep_->props.prefix_extractor_name = rep_->moptions.prefix_extractor != nullptr - ? rep_->moptions.prefix_extractor->Name() + ? 
rep_->moptions.prefix_extractor->AsString() : "nullptr"; - std::string property_collectors_names = "["; for (size_t i = 0; i < rep_->ioptions.table_properties_collector_factories.size(); ++i) { @@ -967,20 +1689,41 @@ !rep_->index_builder->seperator_is_key_plus_seq(); rep_->props.index_value_is_delta_encoded = rep_->use_delta_encoding_for_index_values; - rep_->props.creation_time = rep_->creation_time; - rep_->props.oldest_key_time = rep_->oldest_key_time; - rep_->props.file_creation_time = rep_->file_creation_time; + if (rep_->sampled_input_data_bytes > 0) { + rep_->props.slow_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_slow_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + rep_->props.fast_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_fast_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + } else if (rep_->sample_for_compression > 0) { + // We tried to sample but none were found. Assume worst-case (compression + // ratio 1.0) so data is complete and aggregatable. 
+ rep_->props.slow_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + rep_->props.fast_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + } // Add basic properties property_block_builder.AddTableProperty(rep_->props); // Add use collected properties NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, - rep_->ioptions.info_log, + rep_->ioptions.logger, &property_block_builder); - WriteRawBlock(property_block_builder.Finish(), kNoCompression, - &properties_block_handle); + Slice block_data = property_block_builder.Finish(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", &block_data); + WriteRawBlock(block_data, kNoCompression, &properties_block_handle, + BlockType::kProperties); } if (ok()) { #ifndef NDEBUG @@ -995,7 +1738,12 @@ &props_block_size); } #endif // !NDEBUG - meta_index_builder->Add(kPropertiesBlock, properties_block_handle); + + const std::string* properties_block_meta = &kPropertiesBlockName; + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:Meta", + &properties_block_meta); + meta_index_builder->Add(*properties_block_meta, properties_block_handle); } } @@ -1006,7 +1754,8 @@ BlockHandle compression_dict_block_handle; if (ok()) { WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression, - &compression_dict_block_handle); + &compression_dict_block_handle, + BlockType::kCompressionDictionary); #ifndef NDEBUG Slice compression_dict = rep_->compression_dict->GetRawDict(); TEST_SYNC_POINT_CALLBACK( @@ -1015,7 +1764,7 @@ #endif // NDEBUG } if (ok()) { - meta_index_builder->Add(kCompressionDictBlock, + meta_index_builder->Add(kCompressionDictBlockName, compression_dict_block_handle); } } @@ -1026,37 +1775,29 @@ if (ok() && !rep_->range_del_block.empty()) { BlockHandle range_del_block_handle; WriteRawBlock(rep_->range_del_block.Finish(), 
kNoCompression, - &range_del_block_handle); - meta_index_builder->Add(kRangeDelBlock, range_del_block_handle); + &range_del_block_handle, BlockType::kRangeDeletion); + meta_index_builder->Add(kRangeDelBlockName, range_del_block_handle); } } void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, BlockHandle& index_block_handle) { Rep* r = rep_; - // No need to write out new footer if we're using default checksum. - // We're writing legacy magic number because we want old versions of RocksDB - // be able to read files generated with new release (just in case if - // somebody wants to roll back after an upgrade) - // TODO(icanadi) at some point in the future, when we're absolutely sure - // nobody will roll back to RocksDB 2.x versions, retire the legacy magic - // number and always write new table files with new magic number - bool legacy = (r->table_options.format_version == 0); // this is guaranteed by BlockBasedTableBuilder's constructor assert(r->table_options.checksum == kCRC32c || r->table_options.format_version != 0); - Footer footer( - legacy ? 
kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber, - r->table_options.format_version); - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(index_block_handle); - footer.set_checksum(r->table_options.checksum); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - assert(r->status.ok()); - r->status = r->file->Append(footer_encoding); - if (r->status.ok()) { - r->offset += footer_encoding.size(); + assert(ok()); + + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version, + r->get_offset(), r->table_options.checksum, + metaindex_block_handle, index_block_handle); + IOStatus ios = r->file->Append(footer.GetSlice()); + if (ios.ok()) { + r->set_offset(r->get_offset() + footer.GetSlice().size()); + } else { + r->SetIOStatus(ios); + r->SetStatus(ios); } } @@ -1067,20 +1808,45 @@ const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0 ? r->compression_opts.zstd_max_train_bytes : r->compression_opts.max_dict_bytes; - Random64 generator{r->creation_time}; + const size_t kNumBlocksBuffered = r->data_block_buffers.size(); + if (kNumBlocksBuffered == 0) { + // The below code is neither safe nor necessary for handling zero data + // blocks. + return; + } + + // Abstract algebra teaches us that a finite cyclic group (such as the + // additive group of integers modulo N) can be generated by a number that is + // coprime with N. Since N is variable (number of buffered data blocks), we + // must then pick a prime number in order to guarantee coprimeness with any N. + // + // One downside of this approach is the spread will be poor when + // `kPrimeGeneratorRemainder` is close to zero or close to + // `kNumBlocksBuffered`. + // + // Picked a random number between one and one trillion and then chose the + // next prime number greater than or equal to it. 
+ const uint64_t kPrimeGenerator = 545055921143ull; + // Can avoid repeated division by just adding the remainder repeatedly. + const size_t kPrimeGeneratorRemainder = static_cast( + kPrimeGenerator % static_cast(kNumBlocksBuffered)); + const size_t kInitSampleIdx = kNumBlocksBuffered / 2; + std::string compression_dict_samples; std::vector compression_dict_sample_lens; - if (!r->data_block_and_keys_buffers.empty()) { - while (compression_dict_samples.size() < kSampleBytes) { - size_t rand_idx = - static_cast( - generator.Uniform(r->data_block_and_keys_buffers.size())); - size_t copy_len = - std::min(kSampleBytes - compression_dict_samples.size(), - r->data_block_and_keys_buffers[rand_idx].first.size()); - compression_dict_samples.append( - r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); - compression_dict_sample_lens.emplace_back(copy_len); + size_t buffer_idx = kInitSampleIdx; + for (size_t i = 0; + i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes; + ++i) { + size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), + r->data_block_buffers[buffer_idx].size()); + compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0, + copy_len); + compression_dict_sample_lens.emplace_back(copy_len); + + buffer_idx += kPrimeGeneratorRemainder; + if (buffer_idx >= kNumBlocksBuffered) { + buffer_idx -= kNumBlocksBuffered; } } @@ -1100,45 +1866,114 @@ dict, r->compression_type == kZSTD || r->compression_type == kZSTDNotFinalCompression)); - for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { - const auto& data_block = r->data_block_and_keys_buffers[i].first; - auto& keys = r->data_block_and_keys_buffers[i].second; + auto get_iterator_for_block = [&r](size_t i) { + auto& data_block = r->data_block_buffers[i]; assert(!data_block.empty()); - assert(!keys.empty()); - for (const auto& key : keys) { - if (r->filter_builder != nullptr) { - size_t ts_sz = - 
r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + Block reader{BlockContents{data_block}}; + DataBlockIter* iter = reader.NewDataIterator( + r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber); + + iter->SeekToFirst(); + assert(iter->Valid()); + return std::unique_ptr(iter); + }; + + std::unique_ptr iter = nullptr, next_block_iter = nullptr; + + for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) { + if (iter == nullptr) { + iter = get_iterator_for_block(i); + assert(iter != nullptr); + }; + + if (i + 1 < r->data_block_buffers.size()) { + next_block_iter = get_iterator_for_block(i + 1); + } + + auto& data_block = r->data_block_buffers[i]; + + if (r->IsParallelCompressionEnabled()) { + Slice first_key_in_next_block; + const Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + if (i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + first_key_in_next_block = next_block_iter->key(); + } else { + first_key_in_next_block_ptr = r->first_key_in_next_block; + } + + std::vector keys; + for (; iter->Valid(); iter->Next()) { + keys.emplace_back(iter->key().ToString()); + } + + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, first_key_in_next_block_ptr, &data_block, &keys); + + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); + } else { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + if (r->filter_builder != nullptr) { + size_t ts_sz = + r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + } + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData); + if (ok() && i + 1 < r->data_block_buffers.size()) { + 
assert(next_block_iter != nullptr); + Slice first_key_in_next_block = next_block_iter->key(); + + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + + iter->SeekToLast(); + std::string last_key = iter->key().ToString(); + r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr, + r->pending_handle); } - r->index_builder->OnKeyAdded(key); - } - WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); - if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { - Slice first_key_in_next_block = - r->data_block_and_keys_buffers[i + 1].second.front(); - Slice* first_key_in_next_block_ptr = &first_key_in_next_block; - r->index_builder->AddIndexEntry(&keys.back(), first_key_in_next_block_ptr, - r->pending_handle); } + std::swap(iter, next_block_iter); + } + r->data_block_buffers.clear(); + r->data_begin_offset = 0; + // Release all reserved cache for data block buffers + if (r->compression_dict_buffer_cache_res_mgr != nullptr) { + Status s = r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation< + CacheEntryRole::kCompressionDictionaryBuildingBuffer>( + r->data_begin_offset); + s.PermitUncheckedError(); } - r->data_block_and_keys_buffers.clear(); } Status BlockBasedTableBuilder::Finish() { Rep* r = rep_; assert(r->state != Rep::State::kClosed); bool empty_data_block = r->data_block.empty(); + r->first_key_in_next_block = nullptr; Flush(); if (r->state == Rep::State::kBuffered) { EnterUnbuffered(); } - // To make sure properties block is able to keep the accurate size of index - // block, we will finish writing all index entries first. 
- if (ok() && !empty_data_block) { - r->index_builder->AddIndexEntry( - &r->last_key, nullptr /* no next data block */, r->pending_handle); + if (r->IsParallelCompressionEnabled()) { + StopParallelCompression(); +#ifndef NDEBUG + for (const auto& br : r->pc_rep->block_rep_buf) { + assert(br.status.ok()); + } +#endif // !NDEBUG + } else { + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } } // Write meta blocks, metaindex block and footer in the following order. @@ -1159,29 +1994,48 @@ if (ok()) { // flush the meta index block WriteRawBlock(meta_index_builder.Finish(), kNoCompression, - &metaindex_block_handle); + &metaindex_block_handle, BlockType::kMetaIndex); } if (ok()) { WriteFooter(metaindex_block_handle, index_block_handle); } - if (r->file != nullptr) { - file_checksum_ = r->file->GetFileChecksum(); - } r->state = Rep::State::kClosed; - return r->status; + r->SetStatus(r->CopyIOStatus()); + Status ret_status = r->CopyStatus(); + assert(!ret_status.ok() || io_status().ok()); + return ret_status; } void BlockBasedTableBuilder::Abandon() { assert(rep_->state != Rep::State::kClosed); + if (rep_->IsParallelCompressionEnabled()) { + StopParallelCompression(); + } rep_->state = Rep::State::kClosed; + rep_->CopyStatus().PermitUncheckedError(); + rep_->CopyIOStatus().PermitUncheckedError(); } uint64_t BlockBasedTableBuilder::NumEntries() const { return rep_->props.num_entries; } +bool BlockBasedTableBuilder::IsEmpty() const { + return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0; +} + uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } +uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { + if (rep_->IsParallelCompressionEnabled()) { + // Use compression ratio so far and inflight raw 
bytes to estimate + // final SST size. + return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize(); + } else { + return FileSize(); + } +} + bool BlockBasedTableBuilder::NeedCompact() const { for (const auto& collector : rep_->table_properties_collectors) { if (collector->NeedCompact()) { @@ -1197,16 +2051,24 @@ for (const auto& prop : collector->GetReadableProperties()) { ret.readable_properties.insert(prop); } - collector->Finish(&ret.user_collected_properties); + collector->Finish(&ret.user_collected_properties).PermitUncheckedError(); } return ret; } +std::string BlockBasedTableBuilder::GetFileChecksum() const { + if (rep_->file != nullptr) { + return rep_->file->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const { if (rep_->file != nullptr) { return rep_->file->GetFileChecksumFuncName(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,8 @@ #pragma once #include + +#include #include #include #include @@ -19,6 +21,7 @@ #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/status.h" +#include "rocksdb/table.h" #include "table/meta_blocks.h" #include "table/table_builder.h" #include "util/compression.h" @@ -38,20 +41,9 @@ // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). 
- BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int level_at_creation, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, - const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0); + BlockBasedTableBuilder(const BlockBasedTableOptions& table_options, + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file); // No copying allowed BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; @@ -68,6 +60,9 @@ // Return non-ok iff some error has been detected. Status status() const override; + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override; + // Finish building the table. Stops using the file passed to the // constructor after this function returns. // REQUIRES: Finish(), Abandon() have not been called @@ -83,17 +78,24 @@ // Number of calls to Add() so far. uint64_t NumEntries() const override; + bool IsEmpty() const override; + // Size of the file generated so far. If invoked after a successful // Finish() call, returns the size of the final generated file. uint64_t FileSize() const override; + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. 
+ uint64_t EstimatedFileSize() const override; + bool NeedCompact() const override; // Get table properties TableProperties GetTableProperties() const override; // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } + std::string GetFileChecksum() const override; // Get file checksum function name const char* GetFileChecksumFuncName() const override; @@ -106,19 +108,34 @@ // REQUIRES: `rep_->state == kBuffered` void EnterUnbuffered(); - // Call block's Finish() method - // and then write the compressed block contents to file. - void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); + // Call block's Finish() method and then + // - in buffered mode, buffer the uncompressed block contents. + // - in unbuffered mode, write the compressed block contents to file. + void WriteBlock(BlockBuilder* block, BlockHandle* handle, + BlockType blocktype); // Compress and write block content to the file. void WriteBlock(const Slice& block_contents, BlockHandle* handle, - bool is_data_block); + BlockType block_type); // Directly write data to the file. 
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, - bool is_data_block = false); + BlockType block_type, const Slice* raw_data = nullptr, + bool is_top_level_filter_block = false); + + void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); + + template Status InsertBlockInCache(const Slice& block_contents, - const CompressionType type, - const BlockHandle* handle); + const BlockHandle* handle, BlockType block_type); + + Status InsertBlockInCacheHelper(const Slice& block_contents, + const BlockHandle* handle, + BlockType block_type, + bool is_top_level_filter_block); + + Status InsertBlockInCompressedCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, @@ -134,6 +151,8 @@ class BlockBasedTablePropertiesCollector; Rep* rep_; + struct ParallelCompressionRep; + // Advanced operation: flush any buffered key/value pairs to file. // Can be used to ensure that two adjacent entries never live in // the same data block. Most clients should not need to use this method. @@ -144,8 +163,31 @@ // uncompressed size is bigger than kCompressionSizeLimit, don't compress it const uint64_t kCompressionSizeLimit = std::numeric_limits::max(); - // Store file checksum. If checksum is disabled, its value is "0". - std::string file_checksum_ = kUnknownFileChecksum; + // Get blocks from mem-table walking thread, compress them and + // pass them to the write thread. 
Used in parallel compression mode only + void BGWorkCompression(const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx); + + // Given raw block content, try to compress it and return result and + // compression type + void CompressAndVerifyBlock(const Slice& raw_block_contents, + bool is_data_block, + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx, + std::string* compressed_output, + Slice* result_block_contents, + CompressionType* result_compression_type, + Status* out_status); + + // Get compressed blocks from BGWorkCompression and write them into SST + void BGWorkWriteRawBlock(); + + // Initialize parallel compression context and + // start BGWorkCompression and BGWorkWriteRawBlock threads + void StartParallelCompression(); + + // Stop BGWorkCompression and BGWorkWriteRawBlock threads + void StopParallelCompression(); }; Slice CompressBlock(const Slice& raw, const CompressionInfo& info, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,19 +7,26 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/block_based_table_factory.h" + #include -#include +#include #include #include +#include "cache/cache_entry_roles.h" +#include "logging/logging.h" #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/block_based_table_builder.h" -#include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "util/mutexlock.h" @@ -157,11 +164,294 @@ return std::min(kMaxPrefetchSize, max_qualified_size); } +#ifndef ROCKSDB_LITE + +const std::string kOptNameMetadataCacheOpts = "metadata_cache_options"; + +static std::unordered_map + pinning_tier_type_string_map = { + {"kFallback", PinningTier::kFallback}, + {"kNone", PinningTier::kNone}, + {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar}, + {"kAll", PinningTier::kAll}}; + +static std::unordered_map + block_base_table_index_type_string_map = { + {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, + {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, + {"kTwoLevelIndexSearch", + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; + +static std::unordered_map + block_base_table_data_block_index_type_string_map = { + {"kDataBlockBinarySearch", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, + {"kDataBlockBinaryAndHash", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; + +static std::unordered_map + block_base_table_index_shortening_mode_string_map = { + {"kNoShortening", + BlockBasedTableOptions::IndexShorteningMode::kNoShortening}, + {"kShortenSeparators", + 
BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators}, + {"kShortenSeparatorsAndSuccessor", + BlockBasedTableOptions::IndexShorteningMode:: + kShortenSeparatorsAndSuccessor}}; + +static std::unordered_map + metadata_cache_options_type_info = { + {"top_level_index_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, top_level_index_pinning), + &pinning_tier_type_string_map)}, + {"partition_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, partition_pinning), + &pinning_tier_type_string_map)}, + {"unpartitioned_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, unpartitioned_pinning), + &pinning_tier_type_string_map)}}; + +static std::unordered_map + block_base_table_prepopulate_block_cache_string_map = { + {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable}, + {"kFlushOnly", + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}}; + +#endif // ROCKSDB_LITE + +static std::unordered_map + block_based_table_type_info = { +#ifndef ROCKSDB_LITE + /* currently not supported + std::shared_ptr block_cache = nullptr; + std::shared_ptr block_cache_compressed = nullptr; + */ + {"flush_block_policy_factory", + OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct BlockBasedTableOptions, + flush_block_policy_factory), + OptionVerificationType::kByName, OptionTypeFlags::kCompareNever)}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, 
OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_type", OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, index_type), + &block_base_table_index_type_string_map)}, + {"hash_index_allow_collision", + {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"data_block_index_type", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, data_block_index_type), + &block_base_table_data_block_index_type_string_map)}, + {"index_shortening", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, index_shortening), + &block_base_table_index_shortening_mode_string_map)}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_per_partition", + {0, 
OptionType::kUInt64T, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"optimize_filters_for_memory", + {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"filter_policy", + {offsetof(struct BlockBasedTableOptions, filter_policy), + OptionType::kUnknown, OptionVerificationType::kByNameAllowFromNull, + OptionTypeFlags::kNone, + // Parses the Filter policy + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* policy = + static_cast*>(addr); + return FilterPolicy::CreateFromString(opts, value, policy); + }, + // Converts the FilterPolicy to its string representation + [](const ConfigOptions&, const std::string&, const void* addr, + std::string* value) { + const auto* policy = + static_cast*>(addr); + if (policy->get()) { + *value = (*policy)->Name(); + } else { + *value = kNullptrString; + } + return Status::OK(); + }, + // Compares two FilterPolicy objects for equality + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + const auto* policy1 = + static_cast*>(addr1) + ->get(); + const auto* policy2 = + static_cast*>(addr2)->get(); + if (policy1 == policy2) { + return true; + } else if (policy1 != nullptr && policy2 != nullptr) { + return (strcmp(policy1->Name(), policy2->Name()) == 0); + } else { + return false; + } + }}}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + 
{"reserve_table_builder_memory", + {offsetof(struct BlockBasedTableOptions, reserve_table_builder_memory), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& /*opts*/, const std::string& /*name*/, + const std::string& value, void* addr) { + // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13 + // and 6.14. The bug will write out 8 bytes to OPTIONS file from the + // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit + // which is actually a uint32. Consequently, the value of + // read_amp_bytes_per_bit written in the OPTIONS file is wrong. + // From 6.15, RocksDB will try to parse the read_amp_bytes_per_bit + // from OPTIONS file as a uint32. To be able to load OPTIONS file + // generated by affected releases before the fix, we need to + // manually parse read_amp_bytes_per_bit with this special hack. 
+ uint64_t read_amp_bytes_per_bit = ParseUint64(value); + *(static_cast(addr)) = + static_cast(read_amp_bytes_per_bit); + return Status::OK(); + }}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {kOptNameMetadataCacheOpts, + OptionTypeInfo::Struct( + kOptNameMetadataCacheOpts, &metadata_cache_options_type_info, + offsetof(struct BlockBasedTableOptions, metadata_cache_options), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"block_cache", + {offsetof(struct BlockBasedTableOptions, block_cache), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input vsalue as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"block_cache_compressed", + {offsetof(struct BlockBasedTableOptions, block_cache_compressed), + OptionType::kUnknown, OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input vsalue as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, + {"max_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + 
OptionTypeFlags::kMutable}}, + {"prepopulate_block_cache", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), + &block_base_table_prepopulate_block_cache_string_map, + OptionTypeFlags::kMutable)}, + +#endif // ROCKSDB_LITE +}; + // TODO(myabandeh): We should return an error instead of silently changing the // options BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { + InitializeOptions(); + RegisterOptions(&table_options_, &block_based_table_type_info); +} + +void BlockBasedTableFactory::InitializeOptions() { if (table_options_.flush_block_policy_factory == nullptr) { table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); @@ -199,42 +489,148 @@ } } +Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) { + InitializeOptions(); + return TableFactory::PrepareOptions(opts); +} + +namespace { +// Different cache kinds use the same keys for physically different values, so +// they must not share an underlying key space with each other. +Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { + int cache_count = (bbto.block_cache != nullptr) + + (bbto.block_cache_compressed != nullptr) + + (bbto.persistent_cache != nullptr); + if (cache_count <= 1) { + // Nothing to share / overlap + return Status::OK(); + } + + // Simple pointer equality + if (bbto.block_cache == bbto.block_cache_compressed) { + return Status::InvalidArgument( + "block_cache same as block_cache_compressed not currently supported, " + "and would be bad for performance anyway"); + } + + // More complex test of shared key space, in case the instances are wrappers + // for some shared underlying cache. 
+ std::string sentinel_key(size_t{1}, '\0'); + static char kRegularBlockCacheMarker = 'b'; + static char kCompressedBlockCacheMarker = 'c'; + static char kPersistentCacheMarker = 'p'; + if (bbto.block_cache) { + bbto.block_cache + ->Insert(Slice(sentinel_key), &kRegularBlockCacheMarker, 1, + GetNoopDeleterForRole()) + .PermitUncheckedError(); + } + if (bbto.block_cache_compressed) { + bbto.block_cache_compressed + ->Insert(Slice(sentinel_key), &kCompressedBlockCacheMarker, 1, + GetNoopDeleterForRole()) + .PermitUncheckedError(); + } + if (bbto.persistent_cache) { + // Note: persistent cache copies the data, not keeping the pointer + bbto.persistent_cache + ->Insert(Slice(sentinel_key), &kPersistentCacheMarker, 1) + .PermitUncheckedError(); + } + // If we get something different from what we inserted, that indicates + // dangerously overlapping key spaces. + if (bbto.block_cache) { + auto handle = bbto.block_cache->Lookup(Slice(sentinel_key)); + if (handle) { + auto v = static_cast(bbto.block_cache->Value(handle)); + char c = *v; + bbto.block_cache->Release(handle); + if (v == &kCompressedBlockCacheMarker) { + return Status::InvalidArgument( + "block_cache and block_cache_compressed share the same key space, " + "which is not supported"); + } else if (c == kPersistentCacheMarker) { + return Status::InvalidArgument( + "block_cache and persistent_cache share the same key space, " + "which is not supported"); + } else if (v != &kRegularBlockCacheMarker) { + return Status::Corruption("Unexpected mutation to block_cache"); + } + } + } + if (bbto.block_cache_compressed) { + auto handle = bbto.block_cache_compressed->Lookup(Slice(sentinel_key)); + if (handle) { + auto v = static_cast(bbto.block_cache_compressed->Value(handle)); + char c = *v; + bbto.block_cache_compressed->Release(handle); + if (v == &kRegularBlockCacheMarker) { + return Status::InvalidArgument( + "block_cache_compressed and block_cache share the same key space, " + "which is not supported"); + } else if 
(c == kPersistentCacheMarker) { + return Status::InvalidArgument( + "block_cache_compressed and persistent_cache share the same key " + "space, " + "which is not supported"); + } else if (v != &kCompressedBlockCacheMarker) { + return Status::Corruption( + "Unexpected mutation to block_cache_compressed"); + } + } + } + if (bbto.persistent_cache) { + std::unique_ptr data; + size_t size = 0; + bbto.persistent_cache->Lookup(Slice(sentinel_key), &data, &size) + .PermitUncheckedError(); + if (data && size > 0) { + if (data[0] == kRegularBlockCacheMarker) { + return Status::InvalidArgument( + "persistent_cache and block_cache share the same key space, " + "which is not supported"); + } else if (data[0] == kCompressedBlockCacheMarker) { + return Status::InvalidArgument( + "persistent_cache and block_cache_compressed share the same key " + "space, " + "which is not supported"); + } else if (data[0] != kPersistentCacheMarker) { + return Status::Corruption("Unexpected mutation to persistent_cache"); + } + } + } + return Status::OK(); +} + +} // namespace + Status BlockBasedTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache) const { return BlockBasedTable::Open( - table_reader_options.ioptions, table_reader_options.env_options, + ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, - table_reader_options.largest_seqno, &tail_prefetch_stats_, - table_reader_options.block_cache_tracer); + table_reader_options.largest_seqno, + table_reader_options.force_direct_prefetch, 
&tail_prefetch_stats_, + table_reader_options.block_cache_tracer, + table_reader_options.max_file_size_for_l0_meta_pin, + table_reader_options.cur_db_session_id, + table_reader_options.cur_file_num); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - auto table_builder = new BlockBasedTableBuilder( - table_builder_options.ioptions, table_builder_options.moptions, - table_options_, table_builder_options.internal_comparator, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_builder_options.compression_type, - table_builder_options.sample_for_compression, - table_builder_options.compression_opts, - table_builder_options.skip_filters, - table_builder_options.column_family_name, table_builder_options.level, - table_builder_options.creation_time, - table_builder_options.oldest_key_time, - table_builder_options.target_file_size, - table_builder_options.file_creation_time); - - return table_builder; + return new BlockBasedTableBuilder(table_options_, table_builder_options, + file); } -Status BlockBasedTableFactory::SanitizeOptions( +Status BlockBasedTableFactory::ValidateOptions( const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { @@ -254,7 +650,7 @@ "Enable pin_l0_filter_and_index_blocks_in_cache, " ", but block cache is disabled"); } - if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { + if (!IsSupportedFormatVersion(table_options_.format_version)) { return Status::InvalidArgument( "Unsupported BlockBasedTable format_version. 
Please check " "include/rocksdb/table.h for more info"); @@ -286,10 +682,24 @@ "max_successive_merges larger than 0 is currently inconsistent with " "unordered_write"); } - return Status::OK(); + { + Status s = CheckCacheOptionCompatibility(table_options_); + if (!s.ok()) { + return s; + } + } + std::string garbage; + if (!SerializeEnum(checksum_type_string_map, + table_options_.checksum, &garbage)) { + return Status::InvalidArgument( + "Unrecognized ChecksumType for checksum: " + + ROCKSDB_NAMESPACE::ToString( + static_cast(table_options_.checksum))); + } + return TableFactory::ValidateOptions(db_opts, cf_opts); } -std::string BlockBasedTableFactory::GetPrintableTableOptions() const { +std::string BlockBasedTableFactory::GetPrintableOptions() const { std::string ret; ret.reserve(20000); const int kBufferSize = 200; @@ -368,7 +778,7 @@ ret.append(buffer); ret.append(table_options_.persistent_cache->GetPrintableOptions()); } - snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n", + snprintf(buffer, kBufferSize, " block_size: %" PRIu64 "\n", table_options_.block_size); ret.append(buffer); snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", @@ -412,149 +822,112 @@ snprintf(buffer, kBufferSize, " block_align: %d\n", table_options_.block_align); ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.max_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n", + static_cast(table_options_.prepopulate_block_cache)); + ret.append(buffer); return ret; } -#ifndef ROCKSDB_LITE -namespace { -bool SerializeSingleBlockBasedTableOption( - std::string* opt_string, const BlockBasedTableOptions& bbt_options, - const std::string& name, const std::string& delimiter) { - auto iter = block_based_table_type_info.find(name); - if (iter == block_based_table_type_info.end()) { - return false; - } - auto& opt_info = iter->second; - const char* 
opt_address = - reinterpret_cast(&bbt_options) + opt_info.offset; - std::string value; - bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); - if (result) { - *opt_string = name + "=" + value + delimiter; - } - return result; -} -} // namespace - -Status BlockBasedTableFactory::GetOptionString( - std::string* opt_string, const std::string& delimiter) const { - assert(opt_string); - opt_string->clear(); - for (auto iter = block_based_table_type_info.begin(); - iter != block_based_table_type_info.end(); ++iter) { - if (iter->second.verification == OptionVerificationType::kDeprecated) { - // If the option is no longer used in rocksdb and marked as deprecated, - // we skip it in the serialization. - continue; - } - std::string single_output; - bool result = SerializeSingleBlockBasedTableOption( - &single_output, table_options_, iter->first, delimiter); - assert(result); - if (result) { - opt_string->append(single_output); +const void* BlockBasedTableFactory::GetOptionsPtr( + const std::string& name) const { + if (name == kBlockCacheOpts()) { + if (table_options_.no_block_cache) { + return nullptr; + } else { + return table_options_.block_cache.get(); } + } else { + return TableFactory::GetOptionsPtr(name); } - return Status::OK(); -} -#else -Status BlockBasedTableFactory::GetOptionString( - std::string* /*opt_string*/, const std::string& /*delimiter*/) const { - return Status::OK(); -} -#endif // !ROCKSDB_LITE - -const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { - return table_options_; } #ifndef ROCKSDB_LITE -namespace { -std::string ParseBlockBasedTableOption(const std::string& name, - const std::string& org_value, - BlockBasedTableOptions* new_options, - bool input_strings_escaped = false, - bool ignore_unknown_options = false) { - const std::string& value = - input_strings_escaped ? 
UnescapeOptionString(org_value) : org_value; - if (!input_strings_escaped) { - // if the input string is not escaped, it means this function is - // invoked from SetOptions, which takes the old format. - if (name == "block_cache" || name == "block_cache_compressed") { - // cache options can be specified in the following format - // "block_cache={capacity=1M;num_shard_bits=4; - // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" - // To support backward compatibility, the following format - // is also supported. - // "block_cache=1M" - std::shared_ptr cache; - // block_cache is specified in format block_cache=. - if (value.find('=') == std::string::npos) { - cache = NewLRUCache(ParseSizeT(value)); - } else { - LRUCacheOptions cache_opts; - if (!ParseOptionHelper(reinterpret_cast(&cache_opts), - OptionType::kLRUCacheOptions, value)) { - return "Invalid cache options"; - } - cache = NewLRUCache(cache_opts); - } - - if (name == "block_cache") { - new_options->block_cache = cache; - } else { - new_options->block_cache_compressed = cache; - } - return ""; - } else if (name == "filter_policy") { - // Expect the following format - // bloomfilter:int:bool - const std::string kName = "bloomfilter:"; - if (value.compare(0, kName.size(), kName) != 0) { - return "Invalid filter policy name"; - } - size_t pos = value.find(':', kName.size()); - if (pos == std::string::npos) { - return "Invalid filter policy config, missing bits_per_key"; - } - double bits_per_key = - ParseDouble(trim(value.substr(kName.size(), pos - kName.size()))); - bool use_block_based_builder = - ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); - new_options->filter_policy.reset( - NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); - return ""; +// Take a default BlockBasedTableOptions "table_options" in addition to a +// map "opts_map" of option name to option value to construct the new +// BlockBasedTableOptions "new_table_options". 
+// +// Below are the instructions of how to config some non-primitive-typed +// options in BlockBasedTableOptions: +// +// * filter_policy: +// We currently only support the following FilterPolicy in the convenience +// functions: +// - BloomFilter: use "bloomfilter:[bits_per_key]:[use_block_based_builder]" +// to specify BloomFilter. The above string is equivalent to calling +// NewBloomFilterPolicy(bits_per_key, use_block_based_builder). +// [Example]: +// - Pass {"filter_policy", "bloomfilter:4:true"} in +// GetBlockBasedTableOptionsFromMap to use a BloomFilter with 4-bits +// per key and use_block_based_builder enabled. +// +// * block_cache / block_cache_compressed: +// We currently only support LRU cache in the GetOptions API. The LRU +// cache can be set by directly specifying its size. +// [Example]: +// - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is +// equivalent to setting block_cache using NewLRUCache(1024 * 1024). +// +// @param table_options the default options of the output "new_table_options". +// @param opts_map an option name to value map for specifying how +// "new_table_options" should be set. +// @param new_table_options the resulting options based on "table_options" +// with the change specified in "opts_map". +// @param input_strings_escaped when set to true, each escaped characters +// prefixed by '\' in the values of the opts_map will be further converted +// back to the raw string before assigning to the associated options. +// @param ignore_unknown_options when set to true, unknown options are ignored +// instead of resulting in an unknown-option error. +// @return Status::OK() on success. Otherwise, a non-ok status indicating +// error will be returned, and "new_table_options" will be set to +// "table_options". 
+Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, + const std::string& opt_value, + void* opt_ptr) { + Status status = TableFactory::ParseOption(config_options, opt_info, opt_name, + opt_value, opt_ptr); + if (config_options.input_strings_escaped && !status.ok()) { // Got an error + // !input_strings_escaped indicates the old API, where everything is + // parsable. + if (opt_info.IsByName()) { + status = Status::OK(); } } - const auto iter = block_based_table_type_info.find(name); - if (iter == block_based_table_type_info.end()) { - if (ignore_unknown_options) { - return ""; - } else { - return "Unrecognized option"; - } - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return "Invalid value"; - } - return ""; + return status; } -} // namespace Status GetBlockBasedTableOptionsFromString( const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + return GetBlockBasedTableOptionsFromString(config_options, table_options, + opts_str, new_table_options); +} +Status GetBlockBasedTableOptionsFromString( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { std::unordered_map opts_map; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { return s; } - - return GetBlockBasedTableOptionsFromMap(table_options, opts_map, - new_table_options); + s = GetBlockBasedTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors 
(NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; + } else { + return Status::InvalidArgument(s.getState()); + } } Status GetBlockBasedTableOptionsFromMap( @@ -562,69 +935,29 @@ const std::unordered_map& opts_map, BlockBasedTableOptions* new_table_options, bool input_strings_escaped, bool ignore_unknown_options) { - assert(new_table_options); - *new_table_options = table_options; - for (const auto& o : opts_map) { - auto error_message = ParseBlockBasedTableOption( - o.first, o.second, new_table_options, input_strings_escaped, - ignore_unknown_options); - if (error_message != "") { - const auto iter = block_based_table_type_info.find(o.first); - if (iter == block_based_table_type_info.end() || - !input_strings_escaped || // !input_strings_escaped indicates - // the old API, where everything is - // parsable. - (iter->second.verification != OptionVerificationType::kByName && - iter->second.verification != - OptionVerificationType::kByNameAllowNull && - iter->second.verification != - OptionVerificationType::kByNameAllowFromNull && - iter->second.verification != OptionVerificationType::kDeprecated)) { - // Restore "new_options" to the default "base_options". 
- *new_table_options = table_options; - return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", - o.first + " " + error_message); - } - } - } - return Status::OK(); -} + ConfigOptions config_options; + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.invoke_prepare_options = false; -Status VerifyBlockBasedTableFactory( - const BlockBasedTableFactory* base_tf, - const BlockBasedTableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level) { - if ((base_tf != nullptr) != (file_tf != nullptr) && - sanity_check_level > kSanityLevelNone) { - return Status::Corruption( - "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); - } - if (base_tf == nullptr) { - return Status::OK(); - } - assert(file_tf != nullptr); - - const auto& base_opt = base_tf->table_options(); - const auto& file_opt = file_tf->table_options(); + return GetBlockBasedTableOptionsFromMap(config_options, table_options, + opts_map, new_table_options); +} - for (auto& pair : block_based_table_type_info) { - if (pair.second.verification == OptionVerificationType::kDeprecated) { - // We skip checking deprecated variables as they might - // contain random values since they might not be initialized - continue; - } - if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { - if (!AreEqualOptions(reinterpret_cast(&base_opt), - reinterpret_cast(&file_opt), - pair.second, pair.first, nullptr)) { - return Status::Corruption( - "[RocksDBOptionsParser]: " - "failed the verification on BlockBasedTableOptions::", - pair.first); - } - } +Status GetBlockBasedTableOptionsFromMap( + const ConfigOptions& config_options, + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options) { + assert(new_table_options); + BlockBasedTableFactory bbtf(table_options); + Status s = bbtf.ConfigureFromMap(config_options, opts_map); + if 
(s.ok()) { + *new_table_options = *(bbtf.GetOptions()); + } else { + *new_table_options = table_options; } - return Status::OK(); + return s; } #endif // !ROCKSDB_LITE @@ -633,7 +966,6 @@ return new BlockBasedTableFactory(_table_options); } -const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; const std::string BlockBasedTablePropertyNames::kIndexType = "rocksdb.block.based.table.index.type"; const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -13,17 +13,19 @@ #include #include -#include "db/dbformat.h" -#include "options/options_helper.h" -#include "options/options_parser.h" +#include "port/port.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { - +struct ColumnFamilyOptions; +struct ConfigOptions; +struct DBOptions; struct EnvOptions; class BlockBasedTableBuilder; +class RandomAccessFileReader; +class WritableFileWriter; // A class used to track actual bytes written from the tail in the recent SST // file opens, and provide a suggestion for following open. 
@@ -48,34 +50,42 @@ ~BlockBasedTableFactory() {} - const char* Name() const override { return kName.c_str(); } + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kBlockBasedTableName(); } + + const char* Name() const override { return kBlockBasedTableName(); } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; - // Sanitizes the specified DB Options. - Status SanitizeOptions(const DBOptions& db_opts, + // Valdates the specified DB Options. + Status ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const override; + Status PrepareOptions(const ConfigOptions& opts) override; - std::string GetPrintableTableOptions() const override; - - Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const override; - - const BlockBasedTableOptions& table_options() const; - - void* GetOptions() override { return &table_options_; } + std::string GetPrintableOptions() const override; bool IsDeleteRangeSupported() const override { return true; } - static const std::string kName; + TailPrefetchStats* tail_prefetch_stats() { return &tail_prefetch_stats_; } + + protected: + const void* GetOptionsPtr(const std::string& name) const override; +#ifndef ROCKSDB_LITE + Status ParseOption(const ConfigOptions& config_options, + const OptionTypeInfo& opt_info, + const std::string& opt_name, const std::string& opt_value, + void* opt_ptr) override; +#endif + void InitializeOptions(); private: BlockBasedTableOptions table_options_; @@ 
-86,110 +96,4 @@ extern const std::string kHashIndexPrefixesMetadataBlock; extern const std::string kPropTrue; extern const std::string kPropFalse; - -#ifndef ROCKSDB_LITE -extern Status VerifyBlockBasedTableFactory( - const BlockBasedTableFactory* base_tf, - const BlockBasedTableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level); - -static std::unordered_map - block_based_table_type_info = { - /* currently not supported - std::shared_ptr block_cache = nullptr; - std::shared_ptr block_cache_compressed = nullptr; - */ - {"flush_block_policy_factory", - {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), - OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName, - false, 0}}, - {"cache_index_and_filter_blocks", - {offsetof(struct BlockBasedTableOptions, - cache_index_and_filter_blocks), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"cache_index_and_filter_blocks_with_high_priority", - {offsetof(struct BlockBasedTableOptions, - cache_index_and_filter_blocks_with_high_priority), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"pin_l0_filter_and_index_blocks_in_cache", - {offsetof(struct BlockBasedTableOptions, - pin_l0_filter_and_index_blocks_in_cache), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"index_type", - {offsetof(struct BlockBasedTableOptions, index_type), - OptionType::kBlockBasedTableIndexType, - OptionVerificationType::kNormal, false, 0}}, - {"hash_index_allow_collision", - {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"data_block_index_type", - {offsetof(struct BlockBasedTableOptions, data_block_index_type), - OptionType::kBlockBasedTableDataBlockIndexType, - OptionVerificationType::kNormal, false, 0}}, - {"index_shortening", - {offsetof(struct BlockBasedTableOptions, index_shortening), - OptionType::kBlockBasedTableIndexShorteningMode, - 
OptionVerificationType::kNormal, false, 0}}, - {"data_block_hash_table_util_ratio", - {offsetof(struct BlockBasedTableOptions, - data_block_hash_table_util_ratio), - OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, - {"checksum", - {offsetof(struct BlockBasedTableOptions, checksum), - OptionType::kChecksumType, OptionVerificationType::kNormal, false, - 0}}, - {"no_block_cache", - {offsetof(struct BlockBasedTableOptions, no_block_cache), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"block_size", - {offsetof(struct BlockBasedTableOptions, block_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"block_size_deviation", - {offsetof(struct BlockBasedTableOptions, block_size_deviation), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"block_restart_interval", - {offsetof(struct BlockBasedTableOptions, block_restart_interval), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"index_block_restart_interval", - {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"index_per_partition", - {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, false, - 0}}, - {"metadata_block_size", - {offsetof(struct BlockBasedTableOptions, metadata_block_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"partition_filters", - {offsetof(struct BlockBasedTableOptions, partition_filters), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"filter_policy", - {offsetof(struct BlockBasedTableOptions, filter_policy), - OptionType::kFilterPolicy, OptionVerificationType::kByName, false, - 0}}, - {"whole_key_filtering", - {offsetof(struct BlockBasedTableOptions, whole_key_filtering), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_table_builder_flush", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 
- 0}}, - {"format_version", - {offsetof(struct BlockBasedTableOptions, format_version), - OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, - {"verify_compression", - {offsetof(struct BlockBasedTableOptions, verify_compression), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"read_amp_bytes_per_bit", - {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"enable_index_compression", - {offsetof(struct BlockBasedTableOptions, enable_index_compression), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"block_align", - {offsetof(struct BlockBasedTableOptions, block_align), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"pin_top_level_index_and_filter", - {offsetof(struct BlockBasedTableOptions, - pin_top_level_index_and_filter), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,382 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/block_based_table_iterator.h" + +namespace ROCKSDB_NAMESPACE { +void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target); } + +void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr); } + +void BlockBasedTableIterator::SeekImpl(const Slice* target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { + ResetDataIter(); + return; + } + + bool need_seek_index = true; + if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + // Reseek. + prev_block_offset_ = index_iter_->value().handle.offset(); + + if (target) { + // We can avoid an index seek if: + // 1. The new seek key is larger than the current key + // 2. The new seek key is within the upper bound of the block + // Since we don't necessarily know the internal key for either + // the current key or the upper bound, we check user keys and + // exclude the equality case. Considering internal keys can + // improve for the boundary cases, but it would complicate the + // code. + if (user_comparator_.Compare(ExtractUserKey(*target), + block_iter_.user_key()) > 0 && + user_comparator_.Compare(ExtractUserKey(*target), + index_iter_->user_key()) < 0) { + need_seek_index = false; + } + } + } + + if (need_seek_index) { + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + IndexValue v = index_iter_->value(); + const bool same_block = block_iter_points_to_real_block_ && + v.handle.offset() == prev_block_offset_; + + if (!v.first_internal_key.empty() && !same_block && + (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && + allow_unprepared_value_) { + // Index contains the first key of the block, and it's >= target. + // We can defer reading the block. + is_at_first_key_from_index_ = true; + // ResetDataIter() will invalidate block_iter_. 
Thus, there is no need to + // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound + // as that will be done later when the data block is actually read. + ResetDataIter(); + } else { + // Need to use the data block. + if (!same_block) { + InitDataBlock(); + } else { + // When the user does a reseek, the iterate_upper_bound might have + // changed. CheckDataBlockWithinUpperBound() needs to be called + // explicitly if the reseek ends up in the same data block. + // If the reseek ends up in a different block, InitDataBlock() will do + // the iterator upper bound check. + CheckDataBlockWithinUpperBound(); + } + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + } + + CheckOutOfBound(); + + if (target) { + assert(!Valid() || icomp_.Compare(*target, key()) <= 0); + } +} + +void BlockBasedTableIterator::SeekForPrev(const Slice& target) { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + // For now totally disable prefix seek in auto prefix mode because we don't + // have logic + if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely to contain the position for `target`, the + // same as Seek(), rather than than before. + // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is difference is when they seek to a position + // in the boundary. For example, if they SeekForPrev(5), we should go to the + // first block, rather than the second. However, we don't have the information + // to distinguish the two unless we read the second block. 
In this case, we'll + // end up with reading two blocks. + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + auto seek_status = index_iter_->status(); + // Check for IO error + if (!seek_status.IsNotFound() && !seek_status.ok()) { + ResetDataIter(); + return; + } + + // With prefix index, Seek() returns NotFound if the prefix doesn't exist + if (seek_status.IsNotFound()) { + // Any key less than the target is fine for prefix seek + ResetDataIter(); + return; + } else { + index_iter_->SeekToLast(); + } + // Check for IO error + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = !is_at_first_key_from_index_; + } + return is_valid; +} + +void BlockBasedTableIterator::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + 
+ FindKeyBackward(); +} + +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + bool is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + block_prefetcher_.PrefetchIfNeeded(rep, data_block_handle, + read_options_.readahead_size, + is_for_compaction); + Status s; + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, BlockType::kData, + /*get_context=*/nullptr, &lookup_context_, s, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction); + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + } +} + +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.status().ok()) { + return false; + } + + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be 
inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +void BlockBasedTableIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && + block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock; + assert(!next_block_is_out_of_bound || + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, + index_iter_->user_key(), /*b_has_ts=*/true) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + if (!v.first_internal_key.empty() && allow_unprepared_value_) { + // Index contains the first key of the block. Defer reading the block. 
+ is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void BlockBasedTableIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have check lower bound here too, but we opt not to do it for + // code simplicity. +} + +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock && + Valid()) { + is_out_of_bound_ = + user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(), + /*b_has_ts=*/true) <= 0; + } +} + +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp( + *read_options_.iterate_upper_bound, + /*a_has_ts=*/false, index_iter_->user_key(), + /*b_has_ts=*/true) > 0) + ? BlockUpperBound::kUpperBoundBeyondCurBlock + : BlockUpperBound::kUpperBoundInCurBlock; + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,273 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterates over the contents of BlockBasedTable. +class BlockBasedTableIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + // @param read_options Must outlive this iterator. + public: + BlockBasedTableIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr>&& index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, TableReaderCaller caller, + size_t compaction_readahead_size = 0, bool allow_unprepared_value = false) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor), + lookup_context_(caller), + block_prefetcher_(compaction_readahead_size), + allow_unprepared_value_(allow_unprepared_value), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check) {} + + ~BlockBasedTableIterator() {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + 
bool NextAndGetResult(IterateResult* result) override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); + } + Slice key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } + } + Slice user_key() const override { + assert(Valid()); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } + } + bool PrepareValue() override { + assert(Valid()); + + if (!is_at_first_key_from_index_) { + return true; + } + + return const_cast(this) + ->MaterializeCurrentBlock(); + } + Slice value() const override { + // PrepareValue() must have been called. + assert(!is_at_first_key_from_index_); + assert(Valid()); + + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + + inline IterBoundCheck UpperBoundCheckResult() override { + if (is_out_of_bound_) { + return IterBoundCheck::kOutOfBound; + } else if (block_upper_bound_check_ == + BlockUpperBound::kUpperBoundBeyondCurBlock) { + assert(!is_out_of_bound_); + return IterBoundCheck::kInbound; + } else { + return IterBoundCheck::kUnknown; + } + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. 
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); + } + bool IsValuePinned() const override { + assert(!is_at_first_key_from_index_); + assert(Valid()); + + // BlockIter::IsValuePinned() is always true. No need to check + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_; + } + + void ResetDataIter() { + if (block_iter_points_to_real_block_) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { + block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + } + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + block_upper_bound_check_ = BlockUpperBound::kUnknown; + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. + prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->GetReadaheadState(readahead_file_info); + } + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->data_block_readahead_info)); + if (index_iter_) { + index_iter_->SetReadaheadState(readahead_file_info); + } + } + } + + std::unique_ptr> index_iter_; + + private: + enum class IterDirection { + kForward, + kBackward, + }; + // This enum indicates whether the upper bound falls into current block + // or beyond. 
+ // +-------------+ + // | cur block | <-- (1) + // +-------------+ + // <-- (2) + // --- --- + // <-- (3) + // +-------------+ + // | next block | <-- (4) + // ...... + // + // When the block is smaller than , kUpperBoundInCurBlock + // is the value to use. The examples are (1) or (2) in the graph. It means + // all keys in the next block or beyond will be out of bound. Keys within + // the current block may or may not be out of bound. + // When the block is larger or equal to , + // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4) + // in the graph. It means that all keys in the current block is within the + // upper bound and keys in the next block may or may not be within the uppder + // bound. + // If the boundary key hasn't been checked against the upper bound, + // kUnknown can be used. + enum class BlockUpperBound { + kUpperBoundInCurBlock, + kUpperBoundBeyondCurBlock, + kUnknown, + }; + + const BlockBasedTable* table_; + const ReadOptions& read_options_; + const InternalKeyComparator& icomp_; + UserComparatorWrapper user_comparator_; + PinnedIteratorsManager* pinned_iters_mgr_; + DataBlockIter block_iter_; + const SliceTransform* prefix_extractor_; + uint64_t prev_block_offset_ = std::numeric_limits::max(); + BlockCacheLookupContext lookup_context_; + + BlockPrefetcher block_prefetcher_; + + const bool allow_unprepared_value_; + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). + bool is_out_of_bound_ = false; + // How current data block's boundary key with the next block is compared with + // iterate upper bound. + BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to PrepareValue() will trigger loading the block. 
+ bool is_at_first_key_from_index_ = false; + bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitDataBlock(); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. + void CheckDataBlockWithinUpperBound(); + + bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { + if (need_upper_bound_check_ && direction == IterDirection::kBackward) { + // Upper bound check isn't sufficient for backward direction to + // guarantee the same result as total order, so disable prefix + // check. + return true; + } + if (check_filter_ && + !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, + need_upper_bound_check_, &lookup_context_)) { + // TODO remember the iterator is invalidated because of prefix + // match. This can avoid the upper level file iterator to falsely + // believe the position is the end of the SST file and move to + // the first key of the next file. + ResetDataIter(); + return false; + } + return true; + } +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include "table/block_based/block_based_table_reader.h" + #include #include #include @@ -14,30 +15,45 @@ #include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_key.h" +#include "cache/sharded_cache.h" +#include "db/compaction/compaction_picker.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" - #include "file/file_prefetch_buffer.h" +#include "file/file_util.h" #include "file/random_access_file_reader.h" - +#include "logging/logging.h" +#include "monitoring/perf_context_imp.h" +#include "port/lang.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/filter_policy.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/snapshot.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" - +#include "rocksdb/trace_record.h" +#include "table/block_based/binary_search_index_reader.h" #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_like_traits.h" #include "table/block_based/block_prefix_index.h" +#include "table/block_based/block_type.h" #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" +#include "table/block_based/hash_index_reader.h" #include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/partitioned_index_reader.h" #include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" @@ -45,17 +61,14 @@ #include "table/meta_blocks.h" #include "table/multiget_context.h" #include "table/persistent_cache_helper.h" +#include "table/persistent_cache_options.h" #include 
"table/sst_file_writer_collectors.h" #include "table/two_level_iterator.h" - -#include "monitoring/perf_context_imp.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/util.h" -#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -63,89 +76,10 @@ extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTable::IndexReader IndexReader; - -// Found that 256 KB readahead size provides the best performance, based on -// experiments, for auto readahead. Experiment data is in PR #3282. -const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; - BlockBasedTable::~BlockBasedTable() { delete rep_; } -std::atomic BlockBasedTable::next_cache_key_id_(0); - -template -class BlocklikeTraits; - -template <> -class BlocklikeTraits { - public: - static BlockContents* Create(BlockContents&& contents, - SequenceNumber /* global_seqno */, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new BlockContents(std::move(contents)); - } - - static uint32_t GetNumRestarts(const BlockContents& /* contents */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static ParsedFullFilterBlock* Create(BlockContents&& contents, - SequenceNumber /* global_seqno */, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* filter_policy) { - return new ParsedFullFilterBlock(filter_policy, std::move(contents)); - } - - static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit, Statistics* statistics, - bool /* using_zstd */, - const 
FilterPolicy* /* filter_policy */) { - return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, - statistics); - } - - static uint32_t GetNumRestarts(const Block& block) { - return block.NumRestarts(); - } -}; - -template <> -class BlocklikeTraits { - public: - static UncompressionDict* Create(BlockContents&& contents, - SequenceNumber /* global_seqno */, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool using_zstd, - const FilterPolicy* /* filter_policy */) { - return new UncompressionDict(contents.data, std::move(contents.allocation), - using_zstd); - } - - static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { - return 0; - } -}; - namespace { // Read the block identified by "handle" from "file". // The only relevant option is options.verify_checksums for now. @@ -157,12 +91,12 @@ Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, const ImmutableCFOptions& ioptions, + std::unique_ptr* result, const ImmutableOptions& ioptions, bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, - const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) { + const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit, + MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd, + const FilterPolicy* filter_policy) { assert(result); BlockContents contents; @@ -173,41 +107,13 @@ Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { result->reset(BlocklikeTraits::Create( - std::move(contents), global_seqno, read_amp_bytes_per_bit, - ioptions.statistics, using_zstd, filter_policy)); + std::move(contents), read_amp_bytes_per_bit, 
ioptions.stats, using_zstd, + filter_policy)); } return s; } -inline MemoryAllocator* GetMemoryAllocator( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache.get() - ? table_options.block_cache->memory_allocator() - : nullptr; -} - -inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache_compressed.get() - ? table_options.block_cache_compressed->memory_allocator() - : nullptr; -} - -// Delete the entry resided in the cache. -template -void DeleteCachedEntry(const Slice& /*key*/, void* value) { - auto entry = reinterpret_cast(value); - delete entry; -} - -// Release the cached entry and decrement its ref count. -void ForceReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle, true /* force_erase */); -} - // Release the cached entry and decrement its ref count. // Do not force erase void ReleaseCachedEntry(void* arg, void* h) { @@ -219,8 +125,9 @@ // For hash based index, return true if prefix_extractor and // prefix_extractor_block mismatch, false otherwise. This flag will be used // as total_order_seek via NewIndexIterator -bool PrefixExtractorChanged(const TableProperties* table_properties, - const SliceTransform* prefix_extractor) { +inline bool PrefixExtractorChangedHelper( + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. 
// Turn off hash index in prefix_extractor is not set; if prefix_extractor // is set but prefix_extractor_block is not set, also disable hash index @@ -230,8 +137,7 @@ } // prefix_extractor and prefix_extractor_block are both non-empty - if (table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) != 0) { + if (table_properties->prefix_extractor_name != prefix_extractor->AsString()) { return true; } else { return false; @@ -244,553 +150,12 @@ memcpy(heap_buf.get(), buf.data(), buf.size()); return heap_buf; } - } // namespace -// Encapsulates common functionality for the various index reader -// implementations. Provides access to the index block regardless of whether -// it is owned by the reader or stored in the cache, or whether it is pinned -// in the cache or not. -class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { - public: - IndexReaderCommon(const BlockBasedTable* t, - CachableEntry&& index_block) - : table_(t), index_block_(std::move(index_block)) { - assert(table_ != nullptr); - } - - protected: - static Status ReadIndexBlock(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, - const ReadOptions& read_options, bool use_cache, - GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block); - - const BlockBasedTable* table() const { return table_; } - - const InternalKeyComparator* internal_comparator() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - - return &table_->get_rep()->internal_comparator; - } - - bool index_has_first_key() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - return table_->get_rep()->index_has_first_key; - } - - bool index_key_includes_seq() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - return table_->get_rep()->index_key_includes_seq; - } - - bool index_value_is_full() const { - assert(table_ != nullptr); - assert(table_->get_rep() != 
nullptr); - return table_->get_rep()->index_value_is_full; - } - - bool cache_index_blocks() const { - assert(table_ != nullptr); - assert(table_->get_rep() != nullptr); - return table_->get_rep()->table_options.cache_index_and_filter_blocks; - } - - Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) const; - - size_t ApproximateIndexBlockMemoryUsage() const { - assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); - return index_block_.GetOwnValue() - ? index_block_.GetValue()->ApproximateMemoryUsage() - : 0; - } - - private: - const BlockBasedTable* table_; - CachableEntry index_block_; -}; - -Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - const ReadOptions& read_options, bool use_cache, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) { - PERF_TIMER_GUARD(read_index_block_nanos); - - assert(table != nullptr); - assert(index_block != nullptr); - assert(index_block->IsEmpty()); - - const Rep* const rep = table->get_rep(); - assert(rep != nullptr); - - const Status s = table->RetrieveBlock( - prefetch_buffer, read_options, rep->footer.index_handle(), - UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, - get_context, lookup_context, /* for_compaction */ false, use_cache); - - return s; -} - -Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( - bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) const { - assert(index_block != nullptr); - - if (!index_block_.IsEmpty()) { - index_block->SetUnownedValue(index_block_.GetValue()); - return Status::OK(); - } - - ReadOptions read_options; - if (no_io) { - read_options.read_tier = kBlockCacheTier; - } - - return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, - cache_index_blocks(), 
get_context, lookup_context, - index_block); -} - -// Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { - public: - // Read the partition index from the file and create an instance for - // `PartitionIndexReader`. - // On success, index_reader will be populated; otherwise it will remain - // unmodified. - static Status Create(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, - std::unique_ptr* index_reader) { - assert(table != nullptr); - assert(table->get_rep()); - assert(!pin || prefetch); - assert(index_reader != nullptr); - - CachableEntry index_block; - if (prefetch || !use_cache) { - const Status s = - ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, - /*get_context=*/nullptr, lookup_context, &index_block); - if (!s.ok()) { - return s; - } - - if (use_cache && !pin) { - index_block.Reset(); - } - } - - index_reader->reset( - new PartitionIndexReader(table, std::move(index_block))); - - return Status::OK(); - } - - // return a two-level iterator: first level is on the partition index - InternalIteratorBase* NewIterator( - const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context, - BlockCacheLookupContext* lookup_context) override { - const bool no_io = (read_options.read_tier == kBlockCacheTier); - CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); - if (!s.ok()) { - if (iter != nullptr) { - iter->Invalidate(s); - return iter; - } - - return NewErrorInternalIterator(s); - } - - InternalIteratorBase* it = nullptr; - - Statistics* kNullStats = nullptr; - // Filters are already checked before seeking the index - if (!partition_map_.empty()) { - // We don't return pinned data from index blocks, so no need - // to set 
`block_contents_pinned`. - it = NewTwoLevelIterator( - new BlockBasedTable::PartitionedIndexIteratorState(table(), - &partition_map_), - index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_has_first_key(), - index_key_includes_seq(), index_value_is_full())); - } else { - ReadOptions ro; - ro.fill_cache = read_options.fill_cache; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - it = new BlockBasedTableIterator( - table(), ro, *internal_comparator(), - index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), - nullptr, kNullStats, true, index_has_first_key(), - index_key_includes_seq(), index_value_is_full()), - false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, - lookup_context ? lookup_context->caller - : TableReaderCaller::kUncategorized); - } - - assert(it != nullptr); - index_block.TransferTo(it); - - return it; - - // TODO(myabandeh): Update TwoLevelIterator to be able to make use of - // on-stack BlockIter while the state is on heap. Currentlly it assumes - // the first level iter is always on heap and will attempt to delete it - // in its destructor. 
- } - - void CacheDependencies(bool pin) override { - // Before read partitions, prefetch them to avoid lots of IOs - BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; - const BlockBasedTable::Rep* rep = table()->rep_; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - - CachableEntry index_block; - Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, - &lookup_context, &index_block); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error retrieving top-level index block while trying to " - "cache index partitions: %s", - s.ToString().c_str()); - return; - } - - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), &biter, - kNullStats, true, index_has_first_key(), index_key_includes_seq(), - index_value_is_full()); - // Index partitions are assumed to be consecuitive. Prefetch them all. - // Read the first block offset - biter.SeekToFirst(); - if (!biter.Valid()) { - // Empty index. - return; - } - handle = biter.value().handle; - uint64_t prefetch_off = handle.offset(); - - // Read the last block's offset - biter.SeekToLast(); - if (!biter.Valid()) { - // Empty index. 
- return; - } - handle = biter.value().handle; - uint64_t last_off = handle.offset() + block_size(handle); - uint64_t prefetch_len = last_off - prefetch_off; - std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer); - s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, - static_cast(prefetch_len)); - - // After prefetch, read the partitions one by one - biter.SeekToFirst(); - auto ro = ReadOptions(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value().handle; - CachableEntry block; - // TODO: Support counter batch update for partitioned index and - // filter blocks - s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, - /*contents=*/nullptr); - - assert(s.ok() || block.GetValue() == nullptr); - if (s.ok() && block.GetValue() != nullptr) { - if (block.IsCached()) { - if (pin) { - partition_map_[handle.offset()] = std::move(block); - } - } - } - } - } - - size_t ApproximateMemoryUsage() const override { - size_t usage = ApproximateIndexBlockMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size(const_cast(this)); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - // TODO(myabandeh): more accurate estimate of partition_map_ mem usage - return usage; - } - - private: - PartitionIndexReader(const BlockBasedTable* t, - CachableEntry&& index_block) - : IndexReaderCommon(t, std::move(index_block)) {} - - std::unordered_map> partition_map_; -}; - -// Index that allows binary search lookup for the first key of each block. -// This class can be viewed as a thin wrapper for `Block` class which already -// supports binary search. -class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { - public: - // Read index from the file and create an intance for - // `BinarySearchIndexReader`. 
- // On success, index_reader will be populated; otherwise it will remain - // unmodified. - static Status Create(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, bool use_cache, - bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, - std::unique_ptr* index_reader) { - assert(table != nullptr); - assert(table->get_rep()); - assert(!pin || prefetch); - assert(index_reader != nullptr); - - CachableEntry index_block; - if (prefetch || !use_cache) { - const Status s = - ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, - /*get_context=*/nullptr, lookup_context, &index_block); - if (!s.ok()) { - return s; - } - - if (use_cache && !pin) { - index_block.Reset(); - } - } - - index_reader->reset( - new BinarySearchIndexReader(table, std::move(index_block))); - - return Status::OK(); - } - - InternalIteratorBase* NewIterator( - const ReadOptions& read_options, bool /* disable_prefix_seek */, - IndexBlockIter* iter, GetContext* get_context, - BlockCacheLookupContext* lookup_context) override { - const bool no_io = (read_options.read_tier == kBlockCacheTier); - CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); - if (!s.ok()) { - if (iter != nullptr) { - iter->Invalidate(s); - return iter; - } - - return NewErrorInternalIterator(s); - } - - Statistics* kNullStats = nullptr; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. 
- auto it = index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, true, index_has_first_key(), index_key_includes_seq(), - index_value_is_full()); - - assert(it != nullptr); - index_block.TransferTo(it); - - return it; - } - - size_t ApproximateMemoryUsage() const override { - size_t usage = ApproximateIndexBlockMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size(const_cast(this)); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - BinarySearchIndexReader(const BlockBasedTable* t, - CachableEntry&& index_block) - : IndexReaderCommon(t, std::move(index_block)) {} -}; - -// Index that leverages an internal hash table to quicken the lookup for a given -// key. -class HashIndexReader : public BlockBasedTable::IndexReaderCommon { - public: - static Status Create(const BlockBasedTable* table, - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_index_iter, bool use_cache, - bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, - std::unique_ptr* index_reader) { - assert(table != nullptr); - assert(index_reader != nullptr); - assert(!pin || prefetch); - - const BlockBasedTable::Rep* rep = table->get_rep(); - assert(rep != nullptr); - - CachableEntry index_block; - if (prefetch || !use_cache) { - const Status s = - ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, - /*get_context=*/nullptr, lookup_context, &index_block); - if (!s.ok()) { - return s; - } - - if (use_cache && !pin) { - index_block.Reset(); - } - } - - // Note, failure to create prefix hash index does not need to be a - // hard error. We can still fall back to the original binary search index. - // So, Create will succeed regardless, from this point on. 
- - index_reader->reset(new HashIndexReader(table, std::move(index_block))); - - // Get prefixes block - BlockHandle prefixes_handle; - Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, - &prefixes_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - // Get index metadata block - BlockHandle prefixes_meta_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, - &prefixes_meta_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - RandomAccessFileReader* const file = rep->file.get(); - const Footer& footer = rep->footer; - const ImmutableCFOptions& ioptions = rep->ioptions; - const PersistentCacheOptions& cache_options = rep->persistent_cache_options; - MemoryAllocator* const memory_allocator = - GetMemoryAllocator(rep->table_options); - - // Read contents for the blocks - BlockContents prefixes_contents; - BlockFetcher prefixes_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - s = prefixes_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - return s; - } - BlockContents prefixes_meta_contents; - BlockFetcher prefixes_meta_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, - &prefixes_meta_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, BlockType::kHashIndexMetadata, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - s = prefixes_meta_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - BlockPrefixIndex* prefix_index = nullptr; - assert(rep->internal_prefix_transform.get() != nullptr); - s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), - prefixes_contents.data, - prefixes_meta_contents.data, &prefix_index); - // TODO: log 
error - if (s.ok()) { - HashIndexReader* const hash_index_reader = - static_cast(index_reader->get()); - hash_index_reader->prefix_index_.reset(prefix_index); - } - - return Status::OK(); - } - - InternalIteratorBase* NewIterator( - const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* iter, GetContext* get_context, - BlockCacheLookupContext* lookup_context) override { - const bool no_io = (read_options.read_tier == kBlockCacheTier); - CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); - if (!s.ok()) { - if (iter != nullptr) { - iter->Invalidate(s); - return iter; - } - - return NewErrorInternalIterator(s); - } - - Statistics* kNullStats = nullptr; - const bool total_order_seek = - read_options.total_order_seek || disable_prefix_seek; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - auto it = index_block.GetValue()->NewIndexIterator( - internal_comparator(), internal_comparator()->user_comparator(), iter, - kNullStats, total_order_seek, index_has_first_key(), - index_key_includes_seq(), index_value_is_full(), - false /* block_contents_pinned */, prefix_index_.get()); - - assert(it != nullptr); - index_block.TransferTo(it); - - return it; - } - - size_t ApproximateMemoryUsage() const override { - size_t usage = ApproximateIndexBlockMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size(const_cast(this)); -#else - if (prefix_index_) { - usage += prefix_index_->ApproximateMemoryUsage(); - } - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - HashIndexReader(const BlockBasedTable* t, CachableEntry&& index_block) - : IndexReaderCommon(t, std::move(index_block)) {} - - std::unique_ptr prefix_index_; -}; - void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const { - Statistics* const statistics = 
rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; PERF_COUNTER_ADD(block_cache_hit_count, 1); PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, @@ -848,7 +213,7 @@ void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const { - Statistics* const statistics = rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; // TODO: introduce aggregate (not per-level) block cache miss count PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, @@ -898,17 +263,21 @@ } } -void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, - size_t usage) const { - Statistics* const statistics = rep_->ioptions.statistics; - +void BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType block_type, GetContext* get_context, size_t usage, bool redundant, + Statistics* const statistics) { // TODO: introduce perf counters for block cache insertions if (get_context) { ++get_context->get_context_stats_.num_cache_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_add_redundant; + } get_context->get_context_stats_.num_cache_bytes_write += usage; } else { RecordTick(statistics, BLOCK_CACHE_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); } @@ -916,9 +285,15 @@ case BlockType::kFilter: if (get_context) { ++get_context->get_context_stats_.num_cache_filter_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_filter_add_redundant; + } get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); } break; @@ -926,10 +301,17 @@ case BlockType::kCompressionDictionary: if (get_context) { 
++get_context->get_context_stats_.num_cache_compression_dict_add; + if (redundant) { + ++get_context->get_context_stats_ + .num_cache_compression_dict_add_redundant; + } get_context->get_context_stats_ .num_cache_compression_dict_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); } @@ -938,9 +320,15 @@ case BlockType::kIndex: if (get_context) { ++get_context->get_context_stats_.num_cache_index_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_index_add_redundant; + } get_context->get_context_stats_.num_cache_index_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); } break; @@ -950,9 +338,15 @@ // for range tombstones if (get_context) { ++get_context->get_context_stats_.num_cache_data_add; + if (redundant) { + ++get_context->get_context_stats_.num_cache_data_add_redundant; + } get_context->get_context_stats_.num_cache_data_bytes_insert += usage; } else { RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + if (redundant) { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD_REDUNDANT); + } RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); } break; @@ -960,9 +354,17 @@ } Cache::Handle* BlockBasedTable::GetEntryFromCache( - Cache* block_cache, const Slice& key, BlockType block_type, - GetContext* get_context) const { - auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + const CacheTier& cache_tier, Cache* block_cache, const Slice& key, + BlockType block_type, const bool wait, GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, Cache::Priority priority) const { + Cache::Handle* cache_handle = 
nullptr; + if (cache_tier == CacheTier::kNonVolatileBlockTier) { + cache_handle = block_cache->Lookup(key, cache_helper, create_cb, priority, + wait, rep_->ioptions.statistics.get()); + } else { + cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics.get()); + } if (cache_handle != nullptr) { UpdateCacheHitMetrics(block_type, get_context, @@ -974,51 +376,21 @@ return cache_handle; } -// Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { - assert(kMaxCacheKeyPrefixSize >= 10); - rep->cache_key_prefix_size = 0; - rep->compressed_cache_key_prefix_size = 0; - if (rep->table_options.block_cache != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), - &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); - } - if (rep->table_options.persistent_cache != nullptr) { - GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), - &rep->persistent_cache_key_prefix[0], - &rep->persistent_cache_key_prefix_size); - } - if (rep->table_options.block_cache_compressed != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), - rep->file->file(), &rep->compressed_cache_key_prefix[0], - &rep->compressed_cache_key_prefix_size); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. - if (cc != nullptr && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. 
- if (cc != nullptr && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); +template +Status BlockBasedTable::InsertEntryToCache( + const CacheTier& cache_tier, Cache* block_cache, const Slice& key, + const Cache::CacheItemHelper* cache_helper, + std::unique_ptr& block_holder, size_t charge, + Cache::Handle** cache_handle, Cache::Priority priority) const { + Status s = Status::OK(); + if (cache_tier == CacheTier::kNonVolatileBlockTier) { + s = block_cache->Insert(key, block_holder.get(), cache_helper, charge, + cache_handle, priority); + } else { + s = block_cache->Insert(key, block_holder.get(), charge, + cache_helper->del_cb, cache_handle, priority); } + return s; } namespace { @@ -1115,46 +487,110 @@ } } // namespace -Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, - size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = - EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); - return Slice(cache_key, static_cast(end - cache_key)); +void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, + const std::string& cur_db_session_id, + uint64_t cur_file_number, + uint64_t file_size, + OffsetableCacheKey* out_base_cache_key, + bool* out_is_stable) { + // Use a stable cache key if sufficient data is in table properties + std::string db_session_id; + uint64_t file_num; + std::string db_id; + if (properties && !properties->db_session_id.empty() && + properties->orig_file_number > 0) { + // (Newer SST file case) + // We must have both properties to get a stable unique id because + // CreateColumnFamilyWithImport or IngestExternalFiles can change the + // file numbers on a file. 
+ db_session_id = properties->db_session_id; + file_num = properties->orig_file_number; + // Less critical, populated in earlier release than above + db_id = properties->db_id; + if (out_is_stable) { + *out_is_stable = true; + } + } else { + // (Old SST file case) + // We use (unique) cache keys based on current identifiers. These are at + // least stable across table file close and re-open, but not across + // different DBs nor DB close and re-open. + db_session_id = cur_db_session_id; + file_num = cur_file_number; + // Plumbing through the DB ID to here would be annoying, and of limited + // value because of the case of VersionSet::Recover opening some table + // files and later setting the DB ID. So we just rely on uniqueness + // level provided by session ID. + db_id = "unknown"; + if (out_is_stable) { + *out_is_stable = false; + } + } + + // Too many tests to update to get these working + // assert(file_num > 0); + // assert(!db_session_id.empty()); + // assert(!db_id.empty()); + + // Minimum block size is 5 bytes; therefore we can trim off two lower bits + // from offets. See GetCacheKey. + *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num, + /*max_offset*/ file_size >> 2); +} + +CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key, + const BlockHandle& handle) { + // Minimum block size is 5 bytes; therefore we can trim off two lower bits + // from offet. 
+ return base_cache_key.WithOffset(handle.offset() >> 2); } Status BlockBasedTable::Open( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, + const ReadOptions& read_options, const ImmutableOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, + const std::shared_ptr& prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, const int level, const bool immortal_table, - const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, - BlockCacheTracer* const block_cache_tracer) { + const SequenceNumber largest_seqno, const bool force_direct_prefetch, + TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer, + size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id, + uint64_t cur_file_num) { table_reader->reset(); Status s; Footer footer; std::unique_ptr prefetch_buffer; + // Only retain read_options.deadline and read_options.io_timeout. + // In future, we may retain more + // options. Specifically, w ignore verify_checksums and default to + // checksum verification anyway when creating the index and filter + // readers. 
+ ReadOptions ro; + ro.deadline = read_options.deadline; + ro.io_timeout = read_options.io_timeout; + // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; const bool preload_all = !table_options.cache_index_and_filter_blocks; if (!ioptions.allow_mmap_reads) { - s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, - preload_all, &prefetch_buffer); + s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, + tail_prefetch_stats, prefetch_all, preload_all, + &prefetch_buffer); + // Return error in prefetch path to users. + if (!s.ok()) { + return s; + } } else { // Should not prefetch for mmap mode. prefetch_buffer.reset(new FilePrefetchBuffer( - nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); + 0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */, + true /* track_min_offset */)); } // Read in the following order: @@ -1165,12 +601,16 @@ // 5. [meta block: compression dictionary] // 6. [meta block: index] // 7. [meta block: filter] - s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, - kBlockBasedTableMagicNumber); + IOOptions opts; + s = file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size, + &footer, kBlockBasedTableMagicNumber); + } if (!s.ok()) { return s; } - if (!BlockBasedTableSupportedVersion(footer.version())) { + if (!IsSupportedFormatVersion(footer.format_version())) { return Status::Corruption( "Unknown Footer version. Maybe this file was created with newer " "version of RocksDB?"); @@ -1182,8 +622,8 @@ // access a dangling pointer. 
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, - internal_comparator, skip_filters, level, - immortal_table); + internal_comparator, skip_filters, + file_size, level, immortal_table); rep->file = std::move(file); rep->footer = footer; rep->hash_index_allow_collision = table_options.hash_index_allow_collision; @@ -1191,18 +631,13 @@ // handle prefix correctly. if (prefix_extractor != nullptr) { rep->internal_prefix_transform.reset( - new InternalKeySliceTransform(prefix_extractor)); + new InternalKeySliceTransform(prefix_extractor.get())); } - SetupCacheKeyPrefix(rep); - std::unique_ptr new_table( - new BlockBasedTable(rep, block_cache_tracer)); - // page cache options - rep->persistent_cache_options = - PersistentCacheOptions(rep->table_options.persistent_cache, - std::string(rep->persistent_cache_key_prefix, - rep->persistent_cache_key_prefix_size), - rep->ioptions.statistics); + // For fully portable/stable cache keys, we need to read the properties + // block before setting up cache keys. TODO: consider setting up a bootstrap + // cache key for PersistentCache to use for metaindex and properties blocks. + rep->persistent_cache_options = PersistentCacheOptions(); // Meta-blocks are not dictionary compressed. Explicitly set the dictionary // handle to null, otherwise it may be seen as uninitialized during the below @@ -1210,9 +645,11 @@ rep->compression_dict_handle = BlockHandle::NullBlockHandle(); // Read metaindex + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex, + s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex, &metaindex_iter); if (!s.ok()) { return s; @@ -1220,19 +657,54 @@ // Populates table_properties and some fields that depend on it, // such as index_type. 
- s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), + s = new_table->ReadPropertiesBlock(ro, prefetch_buffer.get(), metaindex_iter.get(), largest_seqno); if (!s.ok()) { return s; } - s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(), - internal_comparator, &lookup_context); + if (!PrefixExtractorChangedHelper(rep->table_properties.get(), + prefix_extractor.get())) { + // Establish fast path for unchanged prefix_extractor + rep->table_prefix_extractor = prefix_extractor; + } else { + // Current prefix_extractor doesn't match table +#ifndef ROCKSDB_LITE + if (rep->table_properties) { + //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions + // will need to use it + ConfigOptions config_options; + Status st = SliceTransform::CreateFromString( + config_options, rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + if (!st.ok()) { + //**TODO: Should this be error be returned or swallowed? + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Failed to create prefix extractor[%s]: %s", + rep->table_properties->prefix_extractor_name.c_str(), + st.ToString().c_str()); + } + } +#endif // ROCKSDB_LITE + } + + // With properties loaded, we can set up portable/stable cache keys + SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id, + cur_file_num, file_size, &rep->base_cache_key); + + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + rep->base_cache_key, rep->ioptions.stats); + + s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(), + metaindex_iter.get(), internal_comparator, + &lookup_context); if (!s.ok()) { return s; } s = new_table->PrefetchIndexAndFilterBlocks( - prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), - prefetch_all, table_options, level, &lookup_context); + ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), + prefetch_all, table_options, level, file_size, + max_file_size_for_l0_meta_pin, 
&lookup_context); if (s.ok()) { // Update tail prefetch stats @@ -1250,9 +722,9 @@ } Status BlockBasedTable::PrefetchTail( - RandomAccessFileReader* file, uint64_t file_size, - TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, - const bool preload_all, + const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, + bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, + const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer) { size_t tail_prefetch_size = 0; if (tail_prefetch_stats != nullptr) { @@ -1280,121 +752,58 @@ } TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", &tail_prefetch_size); - Status s; - // TODO should not have this special logic in the future. - if (!file->use_direct_io()) { - prefetch_buffer->reset(new FilePrefetchBuffer( - nullptr, 0, 0, false /* enable */, true /* track_min_offset */)); - s = file->Prefetch(prefetch_off, prefetch_len); - } else { - prefetch_buffer->reset(new FilePrefetchBuffer( - nullptr, 0, 0, true /* enable */, true /* track_min_offset */)); - s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); - } - return s; -} -Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, - uint32_t expected) { - Status s; - uint32_t actual = 0; - switch (type) { - case kNoChecksum: - break; - case kCRC32c: - expected = crc32c::Unmask(expected); - actual = crc32c::Value(buf, len); - break; - case kxxHash: - actual = XXH32(buf, static_cast(len), 0); - break; - case kxxHash64: - actual = static_cast(XXH64(buf, static_cast(len), 0) & - uint64_t{0xffffffff}); - break; - default: - s = Status::Corruption("unknown checksum type"); - } - if (s.ok() && actual != expected) { - s = Status::Corruption("properties block checksum mismatched"); + // Try file system prefetch + if (!file->use_direct_io() && !force_direct_prefetch) { + if (!file->Prefetch(prefetch_off, prefetch_len).IsNotSupported()) { + prefetch_buffer->reset(new 
FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, + false /* enable */, true /* track_min_offset */)); + return Status::OK(); + } } - return s; -} -Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( - FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties) { - assert(table_properties != nullptr); - // If this is an external SST file ingested with write_global_seqno set to - // true, then we expect the checksum mismatch because checksum was written - // by SstFileWriter, but its global seqno in the properties block may have - // been changed during ingestion. In this case, we read the properties - // block, copy it to a memory buffer, change the global seqno to its - // original value, i.e. 0, and verify the checksum again. - BlockHandle props_block_handle; - CacheAllocationPtr tmp_buf; - Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, - rep_->footer, rep_->ioptions, table_properties, - false /* verify_checksum */, &props_block_handle, - &tmp_buf, false /* compression_type_missing */, - nullptr /* memory_allocator */); - if (s.ok() && tmp_buf) { - const auto seqno_pos_iter = - (*table_properties) - ->properties_offsets.find( - ExternalSstFilePropertyNames::kGlobalSeqno); - size_t block_size = static_cast(props_block_handle.size()); - if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { - uint64_t global_seqno_offset = seqno_pos_iter->second; - EncodeFixed64( - tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); - } - uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); - s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(), - tmp_buf.get(), block_size + 1, value); + // Use `FilePrefetchBuffer` + prefetch_buffer->reset( + new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */, + true /* enable */, true /* track_min_offset */)); + IOOptions opts; + Status s = file->PrepareIOOptions(ro, 
opts); + if (s.ok()) { + s = (*prefetch_buffer)->Prefetch(opts, file, prefetch_off, prefetch_len); } return s; } Status BlockBasedTable::ReadPropertiesBlock( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const SequenceNumber largest_seqno) { - bool found_properties_block = true; + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, const SequenceNumber largest_seqno) { Status s; - s = SeekToPropertiesBlock(meta_iter, &found_properties_block); + BlockHandle handle; + s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle); if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Error when seeking to properties block from file: %s", s.ToString().c_str()); - } else if (found_properties_block) { + } else if (!handle.IsNull()) { s = meta_iter->status(); - TableProperties* table_properties = nullptr; + std::unique_ptr table_properties; if (s.ok()) { - s = ReadProperties( - meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, - rep_->ioptions, &table_properties, true /* verify_checksum */, - nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, - false /* compression_type_missing */, nullptr /* memory_allocator */); - } - - if (s.IsCorruption()) { - s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), - &table_properties); - } - std::unique_ptr props_guard; - if (table_properties != nullptr) { - props_guard.reset(table_properties); + s = ReadTablePropertiesHelper( + ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, nullptr /* memory_allocator */); } + IGNORE_STATUS_IF_ERROR(s); if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Encountered error while reading data from properties " "block %s", s.ToString().c_str()); } else { assert(table_properties != nullptr); - rep_->table_properties.reset(props_guard.release()); 
+ rep_->table_properties = std::move(table_properties); rep_->blocks_maybe_compressed = rep_->table_properties->compression_name != CompressionTypeToString(kNoCompression); @@ -1405,26 +814,19 @@ CompressionTypeToString(kZSTDNotFinalCompression)); } } else { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Cannot find Properties block from file."); } -#ifndef ROCKSDB_LITE - if (rep_->table_properties) { - ParseSliceTransform(rep_->table_properties->prefix_extractor_name, - &(rep_->table_prefix_extractor)); - } -#endif // ROCKSDB_LITE // Read the table properties, if provided. if (rep_->table_properties) { rep_->whole_key_filtering &= IsFeatureSupported(*(rep_->table_properties), BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep_->ioptions.info_log); - rep_->prefix_filtering &= - IsFeatureSupported(*(rep_->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, - rep_->ioptions.info_log); + rep_->ioptions.logger); + rep_->prefix_filtering &= IsFeatureSupported( + *(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger); rep_->index_key_includes_seq = rep_->table_properties->index_key_is_user_key == 0; @@ -1447,27 +849,26 @@ s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, &(rep_->global_seqno)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); } } return s; } Status BlockBasedTable::ReadRangeDelBlock( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const ReadOptions& read_options, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context) { Status s; - bool found_range_del_block; BlockHandle range_del_handle; - s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); + s = 
FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Error when seeking to range delete tombstones block from file: %s", s.ToString().c_str()); - } else if (found_range_del_block && !range_del_handle.IsNull()) { - ReadOptions read_options; + } else if (!range_del_handle.IsNull()) { std::unique_ptr iter(NewDataBlockIterator( read_options, range_del_handle, /*input_iter=*/nullptr, BlockType::kRangeDeletion, @@ -1476,9 +877,10 @@ s = iter->status(); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Encountered error while reading data from range del block %s", s.ToString().c_str()); + IGNORE_STATUS_IF_ERROR(s); } else { rep_->fragmented_range_dels = std::make_shared(std::move(iter), @@ -1489,9 +891,10 @@ } Status BlockBasedTable::PrefetchIndexAndFilterBlocks( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, bool prefetch_all, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all, const BlockBasedTableOptions& table_options, const int level, + size_t file_size, size_t max_file_size_for_l0_meta_pin, BlockCacheLookupContext* lookup_context) { Status s; @@ -1523,11 +926,13 @@ } } } + // Partition filters cannot be enabled without partition indexes + assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter || + rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // Find compression dictionary handle - bool found_compression_dict = false; - s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep_->compression_dict_handle); + s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName, + &rep_->compression_dict_handle); if (!s.ok()) { return s; } @@ -1536,22 +941,58 @@ const bool use_cache = table_options.cache_index_and_filter_blocks; - // pin both index and 
filters, down to all partitions - const bool pin_all = - rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + const bool maybe_flushed = + level == 0 && file_size <= max_file_size_for_l0_meta_pin; + std::function is_pinned = + [maybe_flushed, &is_pinned](PinningTier pinning_tier, + PinningTier fallback_pinning_tier) { + // Fallback to fallback would lead to infinite recursion. Disallow it. + assert(fallback_pinning_tier != PinningTier::kFallback); + + switch (pinning_tier) { + case PinningTier::kFallback: + return is_pinned(fallback_pinning_tier, + PinningTier::kNone /* fallback_pinning_tier */); + case PinningTier::kNone: + return false; + case PinningTier::kFlushedAndSimilar: + return maybe_flushed; + case PinningTier::kAll: + return true; + }; + + // In GCC, this is needed to suppress `control reaches end of non-void + // function [-Werror=return-type]`. + assert(false); + return false; + }; + const bool pin_top_level_index = is_pinned( + table_options.metadata_cache_options.top_level_index_pinning, + table_options.pin_top_level_index_and_filter ? PinningTier::kAll + : PinningTier::kNone); + const bool pin_partition = + is_pinned(table_options.metadata_cache_options.partition_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? PinningTier::kFlushedAndSimilar + : PinningTier::kNone); + const bool pin_unpartitioned = + is_pinned(table_options.metadata_cache_options.unpartitioned_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? 
PinningTier::kFlushedAndSimilar + : PinningTier::kNone); - // prefetch the first level of index - const bool prefetch_index = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // pin the first level of index const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of index + // WART: this might be redundant (unnecessary cache hit) if !pin_index, + // depending on prepopulate_block_cache option + const bool prefetch_index = prefetch_all || pin_index; std::unique_ptr index_reader; - s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache, + s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache, prefetch_index, pin_index, lookup_context, &index_reader); if (!s.ok()) { @@ -1563,41 +1004,45 @@ // The partitions of partitioned index are always stored in cache. 
They // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks - if (prefetch_all) { - rep_->index_reader->CacheDependencies(pin_all); + if (prefetch_all || pin_partition) { + s = rep_->index_reader->CacheDependencies(ro, pin_partition); + } + if (!s.ok()) { + return s; } - // prefetch the first level of filter - const bool prefetch_filter = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); - // Partition fitlers cannot be enabled without partition indexes - assert(!prefetch_filter || prefetch_index); // pin the first level of filter const bool pin_filter = - pin_all || (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); + rep_->filter_type == Rep::FilterType::kPartitionedFilter + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of filter + // WART: this might be redundant (unnecessary cache hit) if !pin_filter, + // depending on prepopulate_block_cache option + const bool prefetch_filter = prefetch_all || pin_filter; if (rep_->filter_policy) { auto filter = new_table->CreateFilterBlockReader( - prefetch_buffer, use_cache, prefetch_filter, pin_filter, + ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter, lookup_context); + if (filter) { // Refer to the comment above about paritioned indexes always being cached - if (prefetch_all) { - filter->CacheDependencies(pin_all); + if (prefetch_all || pin_partition) { + s = filter->CacheDependencies(ro, pin_partition); + if (!s.ok()) { + return s; + } } - rep_->filter = std::move(filter); } } if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr uncompression_dict_reader; - s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache, - prefetch_all, pin_all, lookup_context, - &uncompression_dict_reader); + s = UncompressionDictReader::Create( + this, ro, prefetch_buffer, 
use_cache, prefetch_all || pin_unpartitioned, + pin_unpartitioned, lookup_context, &uncompression_dict_reader); if (!s.ok()) { return s; } @@ -1650,23 +1095,23 @@ // metaindex // block and its iterator. Status BlockBasedTable::ReadMetaIndexBlock( - FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block, std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. std::unique_ptr metaindex; Status s = ReadBlockFromFile( - rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->file.get(), prefetch_buffer, rep_->footer, ro, rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, - GetMemoryAllocator(rep_->table_options), false /* for_compaction */, - rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */); + 0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options), + false /* for_compaction */, rep_->blocks_definitely_zstd_compressed, + nullptr /* filter_policy */); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Encountered error while reading data from properties" " block %s", s.ToString().c_str()); @@ -1675,33 +1120,48 @@ *metaindex_block = std::move(metaindex); // meta block uses bytewise comparator. 
- iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(), - BytewiseComparator())); + iter->reset(metaindex_block->get()->NewMetaIterator()); return Status::OK(); } template Status BlockBasedTable::GetDataBlockFromCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, + const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const { + const bool wait, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0; assert(block); assert(block->IsEmpty()); + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW; Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; + Statistics* statistics = rep_->ioptions.statistics.get(); + bool using_zstd = rep_->blocks_definitely_zstd_compressed; + const FilterPolicy* filter_policy = rep_->filter_policy; + Cache::CreateCallback create_cb = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); // Lookup uncompressed cache first if (block_cache != nullptr) { - auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, - block_type, get_context); + assert(!cache_key.empty()); + Cache::Handle* cache_handle = nullptr; + cache_handle = GetEntryFromCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + block_type, wait, get_context, + BlocklikeTraits::GetCacheItemHelper(block_type), create_cb, + priority); if (cache_handle != nullptr) { block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), @@ -1717,11 +1177,20 @@ return s; } - assert(!compressed_block_cache_key.empty()); - block_cache_compressed_handle = - block_cache_compressed->Lookup(compressed_block_cache_key); - - Statistics* statistics = rep_->ioptions.statistics; + assert(!cache_key.empty()); + BlockContents contents; + if (rep_->ioptions.lowest_used_cache_tier == + CacheTier::kNonVolatileBlockTier) { + Cache::CreateCallback create_cb_special = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); + block_cache_compressed_handle = block_cache_compressed->Lookup( + cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + create_cb_special, priority, true); + } else { + block_cache_compressed_handle = + block_cache_compressed->Lookup(cache_key, statistics); + } // if we found in the compressed cache, then uncompress and insert into // uncompressed cache @@ -1734,11 +1203,10 @@ RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); compressed_block = reinterpret_cast( 
block_cache_compressed->Value(block_cache_compressed_handle)); - CompressionType compression_type = compressed_block->get_compression_type(); + CompressionType compression_type = GetBlockCompressionType(*compressed_block); assert(compression_type != kNoCompression); // Retrieve the uncompressed contents into a new buffer - BlockContents contents; UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); s = UncompressBlockContents( @@ -1746,12 +1214,12 @@ &contents, rep_->table_options.format_version, rep_->ioptions, GetMemoryAllocator(rep_->table_options)); - // Insert uncompressed block into block cache + // Insert uncompressed block into block cache, the priority is based on the + // data block type. if (s.ok()) { std::unique_ptr block_holder( BlocklikeTraits::Create( - std::move(contents), rep_->get_global_seqno(block_type), - read_amp_bytes_per_bit, statistics, + std::move(contents), read_amp_bytes_per_bit, statistics, rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get())); // uncompressed block @@ -1759,14 +1227,17 @@ read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle); + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + block_holder, charge, &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - UpdateCacheInsertionMetrics(block_type, get_context, charge); + UpdateCacheInsertionMetrics(block_type, get_context, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1782,14 +1253,13 @@ template Status BlockBasedTable::PutDataBlockToCache( - const Slice& 
block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, + const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, CachableEntry* cached_block, BlockContents* raw_block_contents, CompressionType raw_block_comp_type, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + const UncompressionDict& uncompression_dict, MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const { - const ImmutableCFOptions& ioptions = rep_->ioptions; + const ImmutableOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; const size_t read_amp_bytes_per_bit = block_type == BlockType::kData @@ -1806,7 +1276,7 @@ assert(cached_block->IsEmpty()); Status s; - Statistics* statistics = ioptions.statistics; + Statistics* statistics = ioptions.stats; std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { @@ -1823,13 +1293,13 @@ } block_holder.reset(BlocklikeTraits::Create( - std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + std::move(uncompressed_block_contents), read_amp_bytes_per_bit, statistics, rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get())); } else { block_holder.reset(BlocklikeTraits::Create( - std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, - statistics, rep_->blocks_definitely_zstd_compressed, + std::move(*raw_block_contents), read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get())); } @@ -1838,24 +1308,28 @@ if (block_cache_compressed != nullptr && raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && raw_block_contents->own_bytes()) { -#ifndef NDEBUG assert(raw_block_contents->is_raw_block); -#endif // NDEBUG + assert(!cache_key.empty()); // We cannot directly put raw_block_contents because this could point to // an object in the stack. 
- BlockContents* block_cont_for_comp_cache = - new BlockContents(std::move(*raw_block_contents)); - s = block_cache_compressed->Insert( - compressed_block_cache_key, block_cont_for_comp_cache, - block_cont_for_comp_cache->ApproximateMemoryUsage(), - &DeleteCachedEntry); + std::unique_ptr block_cont_for_comp_cache( + new BlockContents(std::move(*raw_block_contents))); + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache_compressed, + cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), nullptr, + Cache::Priority::LOW); + + BlockContents* block_cont_raw_ptr = block_cont_for_comp_cache.release(); if (s.ok()) { // Avoid the following code to delete this cached block. RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); } else { RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); - delete block_cont_for_comp_cache; + delete block_cont_raw_ptr; } } @@ -1863,15 +1337,17 @@ if (block_cache != nullptr && block_holder->own_bytes()) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle, - priority); + s = InsertEntryToCache( + rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + block_holder, charge, &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); cached_block->SetCachedValue(block_holder.release(), block_cache, cache_handle); - UpdateCacheInsertionMetrics(block_type, get_context, charge); + UpdateCacheInsertionMetrics(block_type, get_context, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1883,8 +1359,8 @@ } std::unique_ptr BlockBasedTable::CreateFilterBlockReader( - FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, - bool pin, 
BlockCacheLookupContext* lookup_context) { + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, BlockCacheLookupContext* lookup_context) { auto& rep = rep_; auto filter_type = rep->filter_type; if (filter_type == Rep::FilterType::kNoFilter) { @@ -1896,14 +1372,14 @@ switch (filter_type) { case Rep::FilterType::kPartitionedFilter: return PartitionedFilterBlockReader::Create( - this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); case Rep::FilterType::kBlockFilter: return BlockBasedFilterBlockReader::Create( - this, prefetch_buffer, use_cache, prefetch, pin, lookup_context); + this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); case Rep::FilterType::kFullFilter: - return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache, + return FullFilterBlockReader::Create(this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context); default: @@ -1930,195 +1406,25 @@ lookup_context); } -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -// If input_iter is null, new a iterator -// If input_iter is not null, update this iter and return it -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, - BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context, Status s, - FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { - PERF_TIMER_GUARD(new_table_block_iter_nanos); - - TBlockIter* iter = input_iter != nullptr ? 
input_iter : new TBlockIter; - if (!s.ok()) { - iter->Invalidate(s); - return iter; - } - - CachableEntry uncompression_dict; - if (rep_->uncompression_dict_reader) { - const bool no_io = (ro.read_tier == kBlockCacheTier); - s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - prefetch_buffer, no_io, get_context, lookup_context, - &uncompression_dict); - if (!s.ok()) { - iter->Invalidate(s); - return iter; - } - } - - const UncompressionDict& dict = uncompression_dict.GetValue() - ? *uncompression_dict.GetValue() - : UncompressionDict::GetEmptyDict(); - - CachableEntry block; - s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, - get_context, lookup_context, for_compaction, - /* use_cache */ true); - - if (!s.ok()) { - assert(block.IsEmpty()); - iter->Invalidate(s); - return iter; - } - - assert(block.GetValue() != nullptr); - - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. - const bool block_contents_pinned = - block.IsCached() || - (!block.GetValue()->own_bytes() && rep_->immortal_table); - iter = InitBlockIterator(rep_, block.GetValue(), iter, - block_contents_pinned); - - if (!block.IsCached()) { - if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache* const block_cache = rep_->table_options.block_cache.get(); - Cache::Handle* cache_handle = nullptr; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep_->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep_->cache_key_prefix_size != 0); - assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - const Slice unique_key(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); - - if (s.ok()) { - assert(cache_handle != nullptr); - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } else { - iter->SetCacheHandle(block.GetCacheHandle()); - } - - block.TransferTo(iter); - - return iter; -} - template <> DataBlockIter* BlockBasedTable::InitBlockIterator( - const Rep* rep, Block* block, DataBlockIter* input_iter, - bool block_contents_pinned) { - return block->NewDataIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - input_iter, rep->ioptions.statistics, block_contents_pinned); + const Rep* rep, Block* block, BlockType block_type, + DataBlockIter* input_iter, bool block_contents_pinned) { + return block->NewDataIterator(rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, + rep->ioptions.stats, block_contents_pinned); } template <> IndexBlockIter* BlockBasedTable::InitBlockIterator( - const Rep* rep, Block* block, IndexBlockIter* input_iter, - bool block_contents_pinned) { + const Rep* rep, Block* block, 
BlockType block_type, + IndexBlockIter* input_iter, bool block_contents_pinned) { return block->NewIndexIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - input_iter, rep->ioptions.statistics, /* total_order_seek */ true, - rep->index_has_first_key, rep->index_key_includes_seq, - rep->index_value_is_full, block_contents_pinned); -} - -// Convert an uncompressed data block (i.e CachableEntry) -// into an iterator over the contents of the corresponding block. -// If input_iter is null, new a iterator -// If input_iter is not null, update this iter and return it -template -TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, - CachableEntry& block, - TBlockIter* input_iter, - Status s) const { - PERF_TIMER_GUARD(new_table_block_iter_nanos); - - TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; - if (!s.ok()) { - iter->Invalidate(s); - return iter; - } - - assert(block.GetValue() != nullptr); - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. 
- const bool block_contents_pinned = - block.IsCached() || - (!block.GetValue()->own_bytes() && rep_->immortal_table); - iter = InitBlockIterator(rep_, block.GetValue(), iter, - block_contents_pinned); - - if (!block.IsCached()) { - if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache* const block_cache = rep_->table_options.block_cache.get(); - Cache::Handle* cache_handle = nullptr; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep_->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep_->cache_key_prefix_size != 0); - assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - const Slice unique_key(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); - if (s.ok()) { - assert(cache_handle != nullptr); - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } else { - iter->SetCacheHandle(block.GetCacheHandle()); - } - - block.TransferTo(iter); - return iter; + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, + /* total_order_seek */ true, rep->index_has_first_key, + 
rep->index_key_includes_seq, rep->index_value_is_full, + block_contents_pinned); } // If contents is nullptr, this function looks up the block caches for the @@ -2130,54 +1436,54 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const bool wait, const bool for_compaction, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); - // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = - rep_->immortal_table ? nullptr - : rep_->table_options.block_cache_compressed.get(); + rep_->table_options.block_cache_compressed.get(); // First, try to get the block from the cache // // If either block cache is enabled, we'll try to read from it. 
Status s; - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice key /* key to the block cache */; - Slice ckey /* key to the compressed block cache */; + CacheKey key_data; + Slice key; bool is_cache_hit = false; if (block_cache != nullptr || block_cache_compressed != nullptr) { // create key for block cache - if (block_cache != nullptr) { - key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - handle, cache_key); - } - - if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(rep_->compressed_cache_key_prefix, - rep_->compressed_cache_key_prefix_size, handle, - compressed_cache_key); - } + key_data = GetCacheKey(rep_->base_cache_key, handle); + key = key_data.AsSlice(); if (!contents) { - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - ro, block_entry, uncompression_dict, block_type, - get_context); - if (block_entry->GetValue()) { + s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro, + block_entry, uncompression_dict, block_type, + wait, get_context); + // Value could still be null at this point, so check the cache handle + // and update the read pattern for prefetching + if (block_entry->GetValue() || block_entry->GetCacheHandle()) { // TODO(haoyu): Differentiate cache hit on uncompressed block cache and // compressed block cache. is_cache_hit = true; + if (prefetch_buffer) { + // Update the block details so that PrefetchBuffer can use the read + // pattern to determine if reads are sequential or not for + // prefetching. It should also take in account blocks read from cache. + prefetch_buffer->UpdateReadPattern(handle.offset(), + BlockSizeWithTrailer(handle), + ro.adaptive_readahead); + } } } // Can't find the block from the cache. If I/O is allowed, read from the // file. 
- if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep_->ioptions.statistics; + if (block_entry->GetValue() == nullptr && + block_entry->GetCacheHandle() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.stats; const bool maybe_compressed = block_type != BlockType::kFilter && block_type != BlockType::kCompressionDictionary && @@ -2186,7 +1492,9 @@ CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { - StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS + : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, statistics, histogram); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &raw_block_contents, rep_->ioptions, do_uncompress, @@ -2197,17 +1505,31 @@ s = block_fetcher.ReadBlockContents(); raw_block_comp_type = block_fetcher.get_compression_type(); contents = &raw_block_contents; + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++get_context->get_context_stats_.num_index_read; + break; + case BlockType::kFilter: + ++get_context->get_context_stats_.num_filter_read; + break; + case BlockType::kData: + ++get_context->get_context_stats_.num_data_read; + break; + default: + break; + } + } } else { - raw_block_comp_type = contents->get_compression_type(); + raw_block_comp_type = GetBlockCompressionType(*contents); } if (s.ok()) { - SequenceNumber seq_no = rep_->get_global_seqno(block_type); // If filling cache is allowed and a cache is configured, try to put the // block to the cache. 
s = PutDataBlockToCache( - key, ckey, block_cache, block_cache_compressed, block_entry, - contents, raw_block_comp_type, uncompression_dict, seq_no, + key, block_cache, block_cache_compressed, block_entry, contents, + raw_block_comp_type, uncompression_dict, GetMemoryAllocator(rep_->table_options), block_type, get_context); } } @@ -2261,7 +1583,7 @@ // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", trace_block_type, /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -2269,9 +1591,11 @@ no_insert, lookup_context->get_id, lookup_context->get_from_user_specified_snapshot, /*referenced_key=*/""); - block_cache_tracer_->WriteBlockAccess(access_record, key, - rep_->cf_name_for_tracing(), - lookup_context->referenced_key); + // TODO: Should handle this error? + block_cache_tracer_ + ->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), + lookup_context->referenced_key) + .PermitUncheckedError(); } } @@ -2304,12 +1628,11 @@ char* scratch, const UncompressionDict& uncompression_dict) const { RandomAccessFileReader* file = rep_->file.get(); const Footer& footer = rep_->footer; - const ImmutableCFOptions& ioptions = rep_->ioptions; - SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + const ImmutableOptions& ioptions = rep_->ioptions; size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); - if (file->use_direct_io() || ioptions.allow_mmap_reads) { + if (ioptions.allow_mmap_reads) { size_t idx_in_batch = 0; for (auto mget_iter = batch->begin(); mget_iter != batch->end(); ++mget_iter, ++idx_in_batch) { @@ -2324,11 +1647,16 @@ RetrieveBlock(nullptr, options, handle, uncompression_dict, &(*results)[idx_in_batch], BlockType::kData, 
mget_iter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); } return; } + // In direct IO mode, blocks share the direct io buffer. + // Otherwise, blocks share the scratch buffer. + const bool use_shared_buffer = file->use_direct_io() || scratch != nullptr; + autovector read_reqs; size_t buf_offset = 0; size_t idx_in_batch = 0; @@ -2349,9 +1677,13 @@ // If current block is adjacent to the previous one, at the same time, // compression is enabled and there is no compressed cache, we combine // the two block read as one. - if (scratch != nullptr && prev_end == handle.offset()) { + // We don't combine block reads here in direct IO mode, because when doing + // direct IO read, the block requests will be realigned and merged when + // necessary. + if (use_shared_buffer && !file->use_direct_io() && + prev_end == handle.offset()) { req_offset_for_block.emplace_back(prev_len); - prev_len += block_size(handle); + prev_len += BlockSizeWithTrailer(handle); } else { // No compression or current block and previous one is not adjacent: // Step 1, create a new request for previous blocks @@ -2359,38 +1691,58 @@ FSReadRequest req; req.offset = prev_offset; req.len = prev_len; - if (scratch == nullptr) { - req.scratch = new char[req.len]; - } else { + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { req.scratch = scratch + buf_offset; buf_offset += req.len; + } else { + req.scratch = new char[req.len]; } - req.status = IOStatus::OK(); read_reqs.emplace_back(req); } // Step 2, remeber the previous block info prev_offset = handle.offset(); - prev_len = block_size(handle); + prev_len = BlockSizeWithTrailer(handle); req_offset_for_block.emplace_back(0); } req_idx_for_block.emplace_back(read_reqs.size()); + + PERF_COUNTER_ADD(block_read_count, 1); + PERF_COUNTER_ADD(block_read_byte, BlockSizeWithTrailer(handle)); } // 
Handle the last block and process the pending last request if (prev_len != 0) { FSReadRequest req; req.offset = prev_offset; req.len = prev_len; - if (scratch == nullptr) { - req.scratch = new char[req.len]; - } else { + if (file->use_direct_io()) { + req.scratch = nullptr; + } else if (use_shared_buffer) { req.scratch = scratch + buf_offset; + } else { + req.scratch = new char[req.len]; } - req.status = IOStatus::OK(); read_reqs.emplace_back(req); } - file->MultiRead(&read_reqs[0], read_reqs.size()); + AlignedBuf direct_io_buf; + { + IOOptions opts; + IOStatus s = file->PrepareIOOptions(options, opts); + if (s.ok()) { + s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), + &direct_io_buf); + } + if (!s.ok()) { + // Discard all the results in this batch if there is any time out + // or overall MultiRead error + for (FSReadRequest& req : read_reqs) { + req.status = s; + } + } + } idx_in_batch = 0; size_t valid_batch_idx = 0; @@ -2408,10 +1760,14 @@ size_t& req_idx = req_idx_for_block[valid_batch_idx]; size_t& req_offset = req_offset_for_block[valid_batch_idx]; valid_batch_idx++; + if (mget_iter->get_context) { + ++(mget_iter->get_context->get_context_stats_.num_data_read); + } FSReadRequest& req = read_reqs[req_idx]; Status s = req.status; if (s.ok()) { - if (req.result.size() != req.len) { + if ((req.result.size() != req.len) || + (req_offset + BlockSizeWithTrailer(handle) > req.result.size())) { s = Status::Corruption( "truncated block read from " + rep_->file->file_name() + " offset " + ToString(handle.offset()) + ", expected " + @@ -2420,60 +1776,63 @@ } BlockContents raw_block_contents; - size_t cur_read_end = req_offset + block_size(handle); - if (cur_read_end > req.result.size()) { - s = Status::Corruption( - "truncated block read from " + rep_->file->file_name() + " offset " + - ToString(handle.offset()) + ", expected " + ToString(req.len) + - " bytes, got " + ToString(req.result.size())); - } - - bool blocks_share_read_buffer = (req.result.size() != 
block_size(handle)); if (s.ok()) { - if (scratch == nullptr && !blocks_share_read_buffer) { + if (!use_shared_buffer) { // We allocated a buffer for this block. Give ownership of it to // BlockContents so it can free the memory assert(req.result.data() == req.scratch); - std::unique_ptr raw_block(req.scratch + req_offset); + assert(req.result.size() == BlockSizeWithTrailer(handle)); + assert(req_offset == 0); + std::unique_ptr raw_block(req.scratch); raw_block_contents = BlockContents(std::move(raw_block), handle.size()); } else { - // We used the scratch buffer which are shared by the blocks. + // We used the scratch buffer or direct io buffer + // which are shared by the blocks. // raw_block_contents does not have the ownership. raw_block_contents = - BlockContents(Slice(req.scratch + req_offset, handle.size())); + BlockContents(Slice(req.result.data() + req_offset, handle.size())); } - #ifndef NDEBUG raw_block_contents.is_raw_block = true; #endif + if (options.verify_checksums) { PERF_TIMER_GUARD(block_checksum_time); const char* data = req.result.data(); - uint32_t expected = - DecodeFixed32(data + req_offset + handle.size() + 1); - // Since the scratch might be shared. the offset of the data block in + // Since the scratch might be shared, the offset of the data block in // the buffer might not be 0. req.result.data() only point to the // begin address of each read request, we need to add the offset // in each read request. Checksum is stored in the block trailer, - // which is handle.size() + 1. - s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(), - req.result.data() + req_offset, - handle.size() + 1, expected); + // beyond the payload size. + s = VerifyBlockChecksum(footer.checksum_type(), data + req_offset, + handle.size(), rep_->file->file_name(), + handle.offset()); TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); } + } else if (!use_shared_buffer) { + // Free the allocated scratch buffer. 
+ delete[] req.scratch; } if (s.ok()) { - // It handles a rare case: compression is set and these is no compressed - // cache (enable combined read). In this case, the scratch != nullptr. - // At the same time, some blocks are actually not compressed, - // since its compression space saving is smaller than the threshold. In - // this case, if the block shares the scratch memory, we need to copy it - // to the heap such that it can be added to the regular block cache. + // When the blocks share the same underlying buffer (scratch or direct io + // buffer), we may need to manually copy the block into heap if the raw + // block has to be inserted into a cache. That falls into th following + // cases - + // 1. Raw block is not compressed, it needs to be inserted into the + // uncompressed block cache if there is one + // 2. If the raw block is compressed, it needs to be inserted into the + // compressed block cache if there is one + // + // In all other cases, the raw block is either uncompressed into a heap + // buffer or there is no cache at all. CompressionType compression_type = - raw_block_contents.get_compression_type(); - if (scratch != nullptr && compression_type == kNoCompression) { - Slice raw = Slice(req.scratch + req_offset, block_size(handle)); + GetBlockCompressionType(raw_block_contents); + if (use_shared_buffer && (compression_type == kNoCompression || + (compression_type != kNoCompression && + rep_->table_options.block_cache_compressed))) { + Slice raw = + Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle)); raw_block_contents = BlockContents( CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), handle.size()); @@ -2492,40 +1851,43 @@ // necessary. 
Since we're passing the raw block contents, it will // avoid looking up the block cache s = MaybeReadBlockAndLoadToCache( - nullptr, options, handle, uncompression_dict, block_entry, - BlockType::kData, mget_iter->get_context, - &lookup_data_block_context, &raw_block_contents); + nullptr, options, handle, uncompression_dict, /*wait=*/true, + /*for_compaction=*/false, block_entry, BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + &raw_block_contents); // block_entry value could be null if no block cache is present, i.e // BlockBasedTableOptions::no_block_cache is true and no compressed // block cache is configured. In that case, fall // through and set up the block explicitly if (block_entry->GetValue() != nullptr) { + s.PermitUncheckedError(); continue; } } CompressionType compression_type = - raw_block_contents.get_compression_type(); + GetBlockCompressionType(raw_block_contents); BlockContents contents; if (compression_type != kNoCompression) { UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressBlockContents(info, req.result.data() + req_offset, - handle.size(), &contents, footer.version(), - rep_->ioptions, memory_allocator); + s = UncompressBlockContents( + info, req.result.data() + req_offset, handle.size(), &contents, + footer.format_version(), rep_->ioptions, memory_allocator); } else { - // There are two cases here: 1) caller uses the scratch buffer; 2) we - // use the requst buffer. If scratch buffer is used, we ensure that + // There are two cases here: + // 1) caller uses the shared buffer (scratch or direct io buffer); + // 2) we use the requst buffer. + // If scratch buffer or direct io buffer is used, we ensure that // all raw blocks are copyed to the heap as single blocks. If scratch // buffer is not used, we also have no combined read, so the raw // block can be used directly. 
contents = std::move(raw_block_contents); } if (s.ok()) { - (*results)[idx_in_batch].SetOwnedValue( - new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, - ioptions.statistics)); + (*results)[idx_in_batch].SetOwnedValue(new Block( + std::move(contents), read_amp_bytes_per_bit, ioptions.stats)); } } (*statuses)[idx_in_batch] = s; @@ -2538,22 +1900,23 @@ const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const { + bool for_compaction, bool use_cache, bool wait_for_cache) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; if (use_cache) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, - uncompression_dict, block_entry, - block_type, get_context, lookup_context, - /*contents=*/nullptr); + s = MaybeReadBlockAndLoadToCache( + prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache, + for_compaction, block_entry, block_type, get_context, lookup_context, + /*contents=*/nullptr); if (!s.ok()) { return s; } - if (block_entry->GetValue() != nullptr) { + if (block_entry->GetValue() != nullptr || + block_entry->GetCacheHandle() != nullptr) { assert(s.ok()); return s; } @@ -2574,19 +1937,35 @@ std::unique_ptr block; { - StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, - READ_BLOCK_GET_MICROS); + Histograms histogram = + for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, rep_->ioptions, do_uncompress, maybe_compressed, block_type, uncompression_dict, rep_->persistent_cache_options, - rep_->get_global_seqno(block_type), block_type == BlockType::kData ? 
rep_->table_options.read_amp_bytes_per_bit : 0, GetMemoryAllocator(rep_->table_options), for_compaction, rep_->blocks_definitely_zstd_compressed, rep_->table_options.filter_policy.get()); + + if (get_context) { + switch (block_type) { + case BlockType::kIndex: + ++(get_context->get_context_stats_.num_index_read); + break; + case BlockType::kFilter: + ++(get_context->get_context_stats_.num_filter_read); + break; + case BlockType::kData: + ++(get_context->get_context_stats_.num_data_read); + break; + default: + break; + } + } } if (!s.ok()) { @@ -2606,28 +1985,28 @@ const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool 
for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, @@ -2639,22 +2018,23 @@ const BlockHandle& handle) { // Return a block iterator on the index partition auto block = block_map_->find(handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (block != block_map_->end()) { - const Rep* rep = table_->get_rep(); - assert(rep); - - Statistics* kNullStats = nullptr; - // We don't return pinned data from index blocks, so no need - // to set `block_contents_pinned`. - return block->second.GetValue()->NewIndexIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, rep->index_has_first_key, - rep->index_key_includes_seq, rep->index_value_is_full); + // block_map_ must be exhaustive + if (block == block_map_->end()) { + assert(false); + // Signal problem to caller + return nullptr; } - // Create an empty iterator - return new IndexBlockIter(); + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.GetValue()->NewIndexIterator( + rep->internal_comparator.user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + rep->index_has_first_key, rep->index_key_includes_seq, + rep->index_value_is_full); } // This will be broken if the user specifies an unusual implementation @@ -2666,7 +2046,9 @@ // 2) Compare(prefix(key), key) <= 0. // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 // -// Otherwise, this method guarantees no I/O will be incurred. 
+// If read_options.read_tier == kBlockCacheTier, this method will do no I/O and +// will return true if the filter block is not in memory and not found in block +// cache. // // REQUIRES: this method shouldn't be called while the DB lock is held. bool BlockBasedTable::PrefixMayMatch( @@ -2688,30 +2070,34 @@ } else { prefix_extractor = rep_->table_prefix_extractor.get(); } - auto user_key = ExtractUserKey(internal_key); - if (!prefix_extractor->InDomain(user_key)) { + auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + auto user_key_without_ts = + ExtractUserKeyAndStripTimestamp(internal_key, ts_sz); + if (!prefix_extractor->InDomain(user_key_without_ts)) { return true; } bool may_match = true; - Status s; // First, try check with full filter FilterBlockReader* const filter = rep_->filter.get(); bool filter_checked = true; if (filter != nullptr) { + const bool no_io = read_options.read_tier == kBlockCacheTier; + if (!filter->IsBlockBased()) { const Slice* const const_ikey_ptr = &internal_key; may_match = filter->RangeMayExist( - read_options.iterate_upper_bound, user_key, prefix_extractor, - rep_->internal_comparator.user_comparator(), const_ikey_ptr, - &filter_checked, need_upper_bound_check, lookup_context); + read_options.iterate_upper_bound, user_key_without_ts, + prefix_extractor, rep_->internal_comparator.user_comparator(), + const_ikey_ptr, &filter_checked, need_upper_bound_check, no_io, + lookup_context); } else { // if prefix_extractor changed for block based filter, skip filter if (need_upper_bound_check) { return true; } - auto prefix = prefix_extractor->Transform(user_key); + auto prefix = prefix_extractor->Transform(user_key_without_ts); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -2758,14 +2144,14 @@ // is the only on could potentially contain the prefix. 
BlockHandle handle = iiter->value().handle; may_match = filter->PrefixMayMatch( - prefix, prefix_extractor, handle.offset(), /*no_io=*/false, + prefix, prefix_extractor, handle.offset(), no_io, /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context); } } } if (filter_checked) { - Statistics* statistics = rep_->ioptions.statistics; + Statistics* statistics = rep_->ioptions.stats; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); @@ -2775,465 +2161,45 @@ return may_match; } -template -void BlockBasedTableIterator::Seek(const Slice& target) { - SeekImpl(&target); -} - -template -void BlockBasedTableIterator::SeekToFirst() { - SeekImpl(nullptr); -} - -template -void BlockBasedTableIterator::SeekImpl( - const Slice* target) { - is_out_of_bound_ = false; - is_at_first_key_from_index_ = false; - if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { - ResetDataIter(); - return; - } - - bool need_seek_index = true; - if (block_iter_points_to_real_block_ && block_iter_.Valid()) { - // Reseek. - prev_block_offset_ = index_iter_->value().handle.offset(); - - if (target) { - // We can avoid an index seek if: - // 1. The new seek key is larger than the current key - // 2. The new seek key is within the upper bound of the block - // Since we don't necessarily know the internal key for either - // the current key or the upper bound, we check user keys and - // exclude the equality case. Considering internal keys can - // improve for the boundary cases, but it would complicate the - // code. 
- if (user_comparator_.Compare(ExtractUserKey(*target), - block_iter_.user_key()) > 0 && - user_comparator_.Compare(ExtractUserKey(*target), - index_iter_->user_key()) < 0) { - need_seek_index = false; - } - } - } - - if (need_seek_index) { - if (target) { - index_iter_->Seek(*target); - } else { - index_iter_->SeekToFirst(); - } - - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - } - - IndexValue v = index_iter_->value(); - const bool same_block = block_iter_points_to_real_block_ && - v.handle.offset() == prev_block_offset_; - - // TODO(kolmike): Remove the != kBlockCacheTier condition. - if (!v.first_internal_key.empty() && !same_block && - (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) && - read_options_.read_tier != kBlockCacheTier) { - // Index contains the first key of the block, and it's >= target. - // We can defer reading the block. - is_at_first_key_from_index_ = true; - // ResetDataIter() will invalidate block_iter_. Thus, there is no need to - // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound - // as that will be done later when the data block is actually read. - ResetDataIter(); - } else { - // Need to use the data block. - if (!same_block) { - InitDataBlock(); - } else { - // When the user does a reseek, the iterate_upper_bound might have - // changed. CheckDataBlockWithinUpperBound() needs to be called - // explicitly if the reseek ends up in the same data block. - // If the reseek ends up in a different block, InitDataBlock() will do - // the iterator upper bound check. - CheckDataBlockWithinUpperBound(); - } - - if (target) { - block_iter_.Seek(*target); - } else { - block_iter_.SeekToFirst(); - } - FindKeyForward(); - } - - CheckOutOfBound(); - - if (target) { - assert(!Valid() || ((block_type_ == BlockType::kIndex && - !table_->get_rep()->index_key_includes_seq) - ? 
(user_comparator_.Compare(ExtractUserKey(*target), - key()) <= 0) - : (icomp_.Compare(*target, key()) <= 0))); - } -} - -template -void BlockBasedTableIterator::SeekForPrev( - const Slice& target) { - is_out_of_bound_ = false; - is_at_first_key_from_index_ = false; - // For now totally disable prefix seek in auto prefix mode because we don't - // have logic - if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { - ResetDataIter(); - return; - } - - SavePrevIndexValue(); - - // Call Seek() rather than SeekForPrev() in the index block, because the - // target data block will likely to contain the position for `target`, the - // same as Seek(), rather than than before. - // For example, if we have three data blocks, each containing two keys: - // [2, 4] [6, 8] [10, 12] - // (the keys in the index block would be [4, 8, 12]) - // and the user calls SeekForPrev(7), we need to go to the second block, - // just like if they call Seek(7). - // The only case where the block is difference is when they seek to a position - // in the boundary. For example, if they SeekForPrev(5), we should go to the - // first block, rather than the second. However, we don't have the information - // to distinguish the two unless we read the second block. In this case, we'll - // end up with reading two blocks. 
- index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - auto seek_status = index_iter_->status(); - // Check for IO error - if (!seek_status.IsNotFound() && !seek_status.ok()) { - ResetDataIter(); - return; - } - - // With prefix index, Seek() returns NotFound if the prefix doesn't exist - if (seek_status.IsNotFound()) { - // Any key less than the target is fine for prefix seek - ResetDataIter(); - return; - } else { - index_iter_->SeekToLast(); - } - // Check for IO error - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - } - - InitDataBlock(); - - block_iter_.SeekForPrev(target); - - FindKeyBackward(); - CheckDataBlockWithinUpperBound(); - assert(!block_iter_.Valid() || - icomp_.Compare(target, block_iter_.key()) >= 0); -} - -template -void BlockBasedTableIterator::SeekToLast() { - is_out_of_bound_ = false; - is_at_first_key_from_index_ = false; - SavePrevIndexValue(); - index_iter_->SeekToLast(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToLast(); - FindKeyBackward(); - CheckDataBlockWithinUpperBound(); -} - -template -void BlockBasedTableIterator::Next() { - if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { - return; - } - assert(block_iter_points_to_real_block_); - block_iter_.Next(); - FindKeyForward(); - CheckOutOfBound(); -} - -template -bool BlockBasedTableIterator::NextAndGetResult( - IterateResult* result) { - Next(); - bool is_valid = Valid(); - if (is_valid) { - result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); - } - return is_valid; -} - -template -void BlockBasedTableIterator::Prev() { - if (is_at_first_key_from_index_) { - is_at_first_key_from_index_ = false; - - index_iter_->Prev(); - if (!index_iter_->Valid()) { - return; - } - - InitDataBlock(); - block_iter_.SeekToLast(); - } else { - assert(block_iter_points_to_real_block_); - block_iter_.Prev(); - } - - FindKeyBackward(); -} - -template -void 
BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value().handle; - if (!block_iter_points_to_real_block_ || - data_block_handle.offset() != prev_block_offset_ || - // if previous attempt of reading the block missed cache, try again - block_iter_.status().IsIncomplete()) { - if (block_iter_points_to_real_block_) { - ResetDataIter(); - } - auto* rep = table_->get_rep(); - - // Prefetch additional data for range scans (iterators). Enabled only for - // user reads. - // Implicit auto readahead: - // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. - // Explicit user requested readahead: - // Enabled from the very first IO when ReadOptions.readahead_size is set. - if (lookup_context_.caller != TableReaderCaller::kCompaction) { - if (read_options_.readahead_size == 0) { - // Implicit auto readahead - num_file_reads_++; - if (num_file_reads_ > - BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { - if (!rep->file->use_direct_io() && - (data_block_handle.offset() + - static_cast(block_size(data_block_handle)) > - readahead_limit_)) { - // Buffered I/O - // Discarding the return status of Prefetch calls intentionally, as - // we can fallback to reading from disk if Prefetch fails. - rep->file->Prefetch(data_block_handle.offset(), readahead_size_); - readahead_limit_ = static_cast(data_block_handle.offset() + - readahead_size_); - // Keep exponentially increasing readahead size until - // kMaxAutoReadaheadSize. - readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize, - readahead_size_ * 2); - } else if (rep->file->use_direct_io() && !prefetch_buffer_) { - // Direct I/O - // Let FilePrefetchBuffer take care of the readahead. 
- rep->CreateFilePrefetchBuffer( - BlockBasedTable::kInitAutoReadaheadSize, - BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); - } - } - } else if (!prefetch_buffer_) { - // Explicit user requested readahead - // The actual condition is: - // if (read_options_.readahead_size != 0 && !prefetch_buffer_) - rep->CreateFilePrefetchBuffer(read_options_.readahead_size, - read_options_.readahead_size, - &prefetch_buffer_); - } - } else if (!prefetch_buffer_) { - rep->CreateFilePrefetchBuffer(compaction_readahead_size_, - compaction_readahead_size_, - &prefetch_buffer_); - } - - Status s; - table_->NewDataBlockIterator( - read_options_, data_block_handle, &block_iter_, block_type_, - /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), - /*for_compaction=*/lookup_context_.caller == - TableReaderCaller::kCompaction); - block_iter_points_to_real_block_ = true; - CheckDataBlockWithinUpperBound(); - } -} - -template -bool BlockBasedTableIterator::MaterializeCurrentBlock() { - assert(is_at_first_key_from_index_); - assert(!block_iter_points_to_real_block_); - assert(index_iter_->Valid()); - - is_at_first_key_from_index_ = false; - InitDataBlock(); - assert(block_iter_points_to_real_block_); - block_iter_.SeekToFirst(); - - if (!block_iter_.Valid() || - icomp_.Compare(block_iter_.key(), - index_iter_->value().first_internal_key) != 0) { - // Uh oh. - block_iter_.Invalidate(Status::Corruption( - "first key in index doesn't match first key in block")); +bool BlockBasedTable::PrefixExtractorChanged( + const SliceTransform* prefix_extractor) const { + if (prefix_extractor == nullptr) { + return true; + } else if (prefix_extractor == rep_->table_prefix_extractor.get()) { return false; - } - - return true; -} - -template -void BlockBasedTableIterator::FindKeyForward() { - // This method's code is kept short to make it likely to be inlined. 
- - assert(!is_out_of_bound_); - assert(block_iter_points_to_real_block_); - - if (!block_iter_.Valid()) { - // This is the only call site of FindBlockForward(), but it's extracted into - // a separate method to keep FindKeyForward() short and likely to be - // inlined. When transitioning to a different block, we call - // FindBlockForward(), which is much longer and is probably not inlined. - FindBlockForward(); } else { - // This is the fast path that avoids a function call. - } -} - -template -void BlockBasedTableIterator::FindBlockForward() { - // TODO the while loop inherits from two-level-iterator. We don't know - // whether a block can be empty so it can be replaced by an "if". - do { - if (!block_iter_.status().ok()) { - return; - } - // Whether next data block is out of upper bound, if there is one. - const bool next_block_is_out_of_bound = - read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && !data_block_within_upper_bound_; - assert(!next_block_is_out_of_bound || - user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); - ResetDataIter(); - index_iter_->Next(); - if (next_block_is_out_of_bound) { - // The next block is out of bound. No need to read it. - TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); - // We need to make sure this is not the last data block before setting - // is_out_of_bound_, since the index key for the last data block can be - // larger than smallest key of the next file on the same level. - if (index_iter_->Valid()) { - is_out_of_bound_ = true; - } - return; - } - - if (!index_iter_->Valid()) { - return; - } - - IndexValue v = index_iter_->value(); - - // TODO(kolmike): Remove the != kBlockCacheTier condition. - if (!v.first_internal_key.empty() && - read_options_.read_tier != kBlockCacheTier) { - // Index contains the first key of the block. Defer reading the block. 
- is_at_first_key_from_index_ = true; - return; - } - - InitDataBlock(); - block_iter_.SeekToFirst(); - } while (!block_iter_.Valid()); -} - -template -void BlockBasedTableIterator::FindKeyBackward() { - while (!block_iter_.Valid()) { - if (!block_iter_.status().ok()) { - return; - } - - ResetDataIter(); - index_iter_->Prev(); - - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToLast(); - } else { - return; - } - } - - // We could have check lower bound here too, but we opt not to do it for - // code simplicity. -} - -template -void BlockBasedTableIterator::CheckOutOfBound() { - if (read_options_.iterate_upper_bound != nullptr && Valid()) { - is_out_of_bound_ = user_comparator_.Compare( - *read_options_.iterate_upper_bound, user_key()) <= 0; - } -} - -template -void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - data_block_within_upper_bound_ = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) > 0); + return PrefixExtractorChangedHelper(rep_->table_properties.get(), + prefix_extractor); } } InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size) { + size_t compaction_readahead_size, bool allow_unprepared_value) { BlockCacheLookupContext lookup_context{caller}; bool need_upper_bound_check = - read_options.auto_prefix_mode || - PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor); + std::unique_ptr> index_iter(NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context)); if (arena == nullptr) { - return new 
BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator( - read_options, - need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch, - /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + return new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, caller, - compaction_readahead_size); + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); } else { - auto* mem = - arena->AllocateAligned(sizeof(BlockBasedTableIterator)); - return new (mem) BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator( - read_options, - need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch, - /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + return new (mem) BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, std::move(index_iter), !skip_filters && !read_options.total_order_seek && prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, BlockType::kData, caller, - compaction_readahead_size); + need_upper_bound_check, prefix_extractor, caller, + compaction_readahead_size, allow_unprepared_value); } } @@ -3261,25 +2227,23 @@ Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; + size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); if (rep_->whole_key_filtering) { - size_t ts_sz = - rep_->internal_comparator.user_comparator()->timestamp_size(); - Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); 
may_match = filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, no_io, const_ikey_ptr, get_context, lookup_context); - } else if (!read_options.total_order_seek && prefix_extractor && - rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0 && - prefix_extractor->InDomain(user_key) && - !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, no_io, - const_ikey_ptr, get_context, - lookup_context)) { + } else if (!read_options.total_order_seek && + !PrefixExtractorChanged(prefix_extractor) && + prefix_extractor->InDomain(user_key_without_ts) && + !filter->PrefixMayMatch( + prefix_extractor->Transform(user_key_without_ts), + prefix_extractor, kNotValid, no_io, const_ikey_ptr, + get_context, lookup_context)) { may_match = false; } if (may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); } return may_match; @@ -3293,14 +2257,34 @@ if (filter == nullptr || filter->IsBlockBased()) { return; } + uint64_t before_keys = range->KeysLeft(); + assert(before_keys > 0); // Caller should ensure if (rep_->whole_key_filtering) { filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, lookup_context); - } else if (!read_options.total_order_seek && prefix_extractor && - rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0) { + uint64_t after_keys = range->KeysLeft(); + if (after_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys, + rep_->level); + } + uint64_t filtered_keys = before_keys - after_keys; + if (filtered_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys, + rep_->level); + } + } else if 
(!read_options.total_order_seek && + !PrefixExtractorChanged(prefix_extractor)) { filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, lookup_context); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys); + uint64_t after_keys = range->KeysLeft(); + uint64_t filtered_keys = before_keys - after_keys; + if (filtered_keys) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL, + filtered_keys); + } } } @@ -3328,11 +2312,13 @@ lookup_context.get_from_user_specified_snapshot = read_options.snapshot != nullptr; } + TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch"); const bool may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, get_context, &lookup_context); + TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch"); if (!may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); } else { IndexBlockIter iiter_on_stack; @@ -3340,8 +2326,7 @@ // BlockPrefixIndex. Only do this check when index_type is kHashSearch. bool need_upper_bound_check = false; if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); + need_upper_bound_check = PrefixExtractorChanged(prefix_extractor); } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, @@ -3353,7 +2338,7 @@ size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); - bool matched = false; // if such user key mathced a key in SST + bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { IndexValue v = iiter->value(); @@ -3369,15 +2354,16 @@ // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. 
- RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. break; @@ -3400,6 +2386,7 @@ // Update Saver.state to Found because we are only looking for // whether we can guarantee the key is not there when "no_io" is set get_context->MarkKeyMayExist(); + s = biter.status(); break; } if (!biter.status().ok()) { @@ -3420,8 +2407,10 @@ // Call the *saver function on each entry/block until it returns false for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter.key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (!get_context->SaveValue( @@ -3448,7 +2437,7 @@ referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -3460,9 +2449,12 @@ /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); - block_cache_tracer_->WriteBlockAccess( - access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), referenced_key); + // TODO: Should handle status here? 
+ block_cache_tracer_ + ->WriteBlockAccess(access_record, + lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key) + .PermitUncheckedError(); } if (done) { @@ -3471,7 +2463,7 @@ } } if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } @@ -3488,6 +2480,12 @@ const MultiGetRange* mget_range, const SliceTransform* prefix_extractor, bool skip_filters) { + if (mget_range->empty()) { + // Caller should ensure non-empty (performance bug) + assert(false); + return; // Nothing to do + } + FilterBlockReader* const filter = !skip_filters ? rep_->filter.get() : nullptr; MultiGetRange sst_file_range(*mget_range, mget_range->begin(), @@ -3497,7 +2495,7 @@ // If full filter not useful, Then go into each block const bool no_io = read_options.read_tier == kBlockCacheTier; uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; - if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + if (sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } BlockCacheLookupContext lookup_context{ @@ -3506,14 +2504,13 @@ FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, prefix_extractor, &lookup_context); - if (skip_filters || !sst_file_range.empty()) { + if (!sst_file_range.empty()) { IndexBlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
bool need_upper_bound_check = false; if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); + need_upper_bound_check = PrefixExtractorChanged(prefix_extractor); } auto iiter = NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, @@ -3532,21 +2529,13 @@ { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); + std::vector cache_handles; + bool wait_for_cache_results = false; CachableEntry uncompression_dict; Status uncompression_dict_status; - if (rep_->uncompression_dict_reader) { - uncompression_dict_status = - rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, no_io, - sst_file_range.begin()->get_context, &lookup_context, - &uncompression_dict); - } - - const UncompressionDict& dict = uncompression_dict.GetValue() - ? *uncompression_dict.GetValue() - : UncompressionDict::GetEmptyDict(); - + uncompression_dict_status.PermitUncheckedError(); + bool uncompression_dict_inited = false; size_t total_len = 0; ReadOptions ro = read_options; ro.read_tier = kBlockCacheTier; @@ -3563,17 +2552,30 @@ if (!iiter->Valid() || (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0)) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { // The requested key falls between highest key in previous block and // lowest key in current block. 
- *(miter->s) = iiter->status(); + if (!iiter->status().IsNotFound()) { + *(miter->s) = iiter->status(); + } data_block_range.SkipKey(miter); sst_file_range.SkipKey(miter); continue; } + if (!uncompression_dict_inited && rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + uncompression_dict_inited = true; + } + if (!uncompression_dict_status.ok()) { + assert(!uncompression_dict_status.IsNotFound()); *(miter->s) = uncompression_dict_status; data_block_range.SkipKey(miter); sst_file_range.SkipKey(miter); @@ -3595,25 +2597,75 @@ BlockHandle handle = v.handle; BlockCacheLookupContext lookup_data_block_context( TableReaderCaller::kUserMultiGet); + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); Status s = RetrieveBlock( nullptr, ro, handle, dict, &(results.back()), BlockType::kData, miter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ false); if (s.IsIncomplete()) { s = Status::OK(); } if (s.ok() && !results.back().IsEmpty()) { - // Found it in the cache. Add NULL handle to indicate there is - // nothing to read from disk - block_handles.emplace_back(BlockHandle::NullBlockHandle()); + // Since we have a valid handle, check the value. If its nullptr, + // it means the cache is waiting for the final result and we're + // supposed to call WaitAll() to wait for the result. + if (results.back().GetValue() != nullptr) { + // Found it in the cache. Add NULL handle to indicate there is + // nothing to read from disk. 
+ if (results.back().GetCacheHandle()) { + results.back().UpdateCachedValue(); + } + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + // We have to wait for the cache lookup to finish in the + // background, and then we may have to read the block from disk + // anyway + assert(results.back().GetCacheHandle()); + wait_for_cache_results = true; + block_handles.emplace_back(handle); + cache_handles.emplace_back(results.back().GetCacheHandle()); + } } else { block_handles.emplace_back(handle); - total_len += block_size(handle); + total_len += BlockSizeWithTrailer(handle); + } + } + + if (wait_for_cache_results) { + Cache* block_cache = rep_->table_options.block_cache.get(); + block_cache->WaitAll(cache_handles); + for (size_t i = 0; i < block_handles.size(); ++i) { + // If this block was a success or failure or not needed because + // the corresponding key is in the same block as a prior key, skip + if (block_handles[i] == BlockHandle::NullBlockHandle() || + results[i].IsEmpty()) { + continue; + } + results[i].UpdateCachedValue(); + void* val = results[i].GetValue(); + if (!val) { + // The async cache lookup failed - could be due to an error + // or a false positive. We need to read the data block from + // the SST file + results[i].Reset(); + total_len += BlockSizeWithTrailer(block_handles[i]); + } else { + block_handles[i] = BlockHandle::NullBlockHandle(); + } } } if (total_len) { char* scratch = nullptr; + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + assert(uncompression_dict_inited || !rep_->uncompression_dict_reader); + assert(uncompression_dict_status.ok()); + // If using direct IO, then scratch is not used, so keep it nullptr. // If the blocks need to be uncompressed and we don't need the // compressed blocks, then we can use a contiguous block of // memory to read in all the blocks as it will be temporary @@ -3623,7 +2675,8 @@ // 2. 
If blocks are uncompressed, alloc heap bufs // 3. If blocks are compressed and no compressed block cache, use // stack buf - if (rep_->table_options.block_cache_compressed == nullptr && + if (!rep_->file->use_direct_io() && + rep_->table_options.block_cache_compressed == nullptr && rep_->blocks_maybe_compressed) { if (total_len <= kMultiGetReadStackBufSize) { scratch = stack_buf; @@ -3634,6 +2687,10 @@ } RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles, &statuses, &results, scratch, dict); + if (sst_file_range.begin()->get_context) { + ++(sst_file_range.begin() + ->get_context->get_context_stats_.num_sst_read); + } } } @@ -3665,6 +2722,10 @@ read_options, results[idx_in_batch], &first_biter, statuses[idx_in_batch]); reusing_block = false; + } else { + // If handler is null and result is empty, then the status is never + // set, which should be the initial value: ok(). + assert(statuses[idx_in_batch].ok()); } biter = &first_biter; idx_in_batch++; @@ -3672,8 +2733,9 @@ IndexValue v = iiter->value(); if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. 
break; @@ -3715,8 +2777,10 @@ ParsedInternalKey parsed_key; Cleanable dummy; Cleanable* value_pinner = nullptr; - if (!ParseInternalKey(biter->key(), &parsed_key)) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter->key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (biter->IsValuePinned()) { if (reusing_block) { @@ -3753,7 +2817,7 @@ referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -3765,9 +2829,12 @@ /*referenced_key=*/"", referenced_data_size, lookup_data_block_context.num_keys_in_block, does_referenced_key_exist); - block_cache_tracer_->WriteBlockAccess( - access_record, lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), referenced_key); + // TODO: Should handle status here? + block_cache_tracer_ + ->WriteBlockAccess(access_record, + lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key) + .PermitUncheckedError(); } s = biter->status(); if (done) { @@ -3782,15 +2849,21 @@ } while (iiter->Valid()); if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } - if (s.ok()) { + if (s.ok() && !iiter->status().IsNotFound()) { s = iiter->status(); } *(miter->s) = s; } +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + // Not sure why we need to do it. Should investigate more. 
+ for (auto& st : statuses) { + st.PermitUncheckedError(); + } +#endif // ROCKSDB_ASSERT_STATUS_CHECKED } } @@ -3860,7 +2933,8 @@ // Check Meta blocks std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex, + ReadOptions ro; + s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex, &metaindex_iter); if (s.ok()) { s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); @@ -3896,11 +2970,11 @@ // increasing of the buffer size. size_t readahead_size = (read_options.readahead_size != 0) ? read_options.readahead_size - : kMaxAutoReadaheadSize; + : rep_->table_options.max_auto_readahead_size; // FilePrefetchBuffer doesn't work in mmap mode and readahead is not // needed there. FilePrefetchBuffer prefetch_buffer( - rep_->file.get(), readahead_size /* readadhead_size */, + readahead_size /* readahead_size */, readahead_size /* max_readahead_size */, !rep_->ioptions.allow_mmap_reads /* enable */); @@ -3921,6 +2995,12 @@ break; } } + if (s.ok()) { + // In the case of two level indexes, we would have exited the above loop + // by checking index_iter->Valid(), but Valid() might have returned false + // due to an IO error. 
So check the index_iter status + s = index_iter->status(); + } return s; } @@ -3932,15 +3012,15 @@ return BlockType::kFilter; } - if (meta_block_name == kPropertiesBlock) { + if (meta_block_name == kPropertiesBlockName) { return BlockType::kProperties; } - if (meta_block_name == kCompressionDictBlock) { + if (meta_block_name == kCompressionDictBlockName) { return BlockType::kCompressionDictionary; } - if (meta_block_name == kRangeDelBlock) { + if (meta_block_name == kRangeDelBlockName) { return BlockType::kRangeDeletion; } @@ -3969,19 +3049,22 @@ s = handle.DecodeFrom(&input); BlockContents contents; const Slice meta_block_name = index_iter->key(); - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, - GetBlockTypeForMetaBlockByName(meta_block_name), - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (s.IsCorruption() && meta_block_name == kPropertiesBlock) { - TableProperties* table_properties; - s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, - index_iter->value(), - &table_properties); - delete table_properties; + if (meta_block_name == kPropertiesBlockName) { + // Unfortunate special handling for properties block checksum w/ + // global seqno + std::unique_ptr table_properties; + s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(), + nullptr /* prefetch_buffer */, rep_->footer, + rep_->ioptions, &table_properties, + nullptr /* memory_allocator */); + } else { + s = BlockFetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) + .ReadBlockContents(); } if (!s.ok()) { break; @@ 
-3998,12 +3081,9 @@ return false; } - char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); + CacheKey key = GetCacheKey(rep_->base_cache_key, handle); - Cache::Handle* const cache_handle = cache->Lookup(cache_key); + Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice()); if (cache_handle == nullptr) { return false; } @@ -4031,9 +3111,9 @@ // 4. internal_comparator // 5. index_type Status BlockBasedTable::CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, - bool pin, BlockCacheLookupContext* lookup_context, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, std::unique_ptr* index_reader) { // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. 
@@ -4043,47 +3123,34 @@ switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { - return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context, index_reader); } case BlockBasedTableOptions::kBinarySearch: FALLTHROUGH_INTENDED; case BlockBasedTableOptions::kBinarySearchWithFirstKey: { - return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, lookup_context, - index_reader); + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr metaindex_guard; std::unique_ptr metaindex_iter_guard; - auto meta_index_iter = preloaded_meta_index_iter; bool should_fallback = false; if (rep_->internal_prefix_transform.get() == nullptr) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "No prefix extractor passed in. Fall back to binary" " search index."); should_fallback = true; - } else if (meta_index_iter == nullptr) { - auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard, - &metaindex_iter_guard); - if (!s.ok()) { - // we simply fall back to binary search in case there is any - // problem with prefix hash index loading. - ROCKS_LOG_WARN(rep_->ioptions.info_log, - "Unable to read the metaindex block." 
- " Fall back to binary search index."); - should_fallback = true; - } - meta_index_iter = metaindex_iter_guard.get(); } if (should_fallback) { - return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, - prefetch, pin, lookup_context, - index_reader); + return BinarySearchIndexReader::Create(this, ro, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); } else { - return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter, use_cache, prefetch, pin, lookup_context, index_reader); } @@ -4096,30 +3163,38 @@ } } -uint64_t BlockBasedTable::ApproximateOffsetOf( - const InternalIteratorBase& index_iter) const { - uint64_t result = 0; +uint64_t BlockBasedTable::ApproximateDataOffsetOf( + const InternalIteratorBase& index_iter, + uint64_t data_size) const { + assert(index_iter.status().ok()); if (index_iter.Valid()) { BlockHandle handle = index_iter.value().handle; - result = handle.offset(); + return handle.offset(); } else { - // The iterator is past the last key in the file. If table_properties is not - // available, approximate the offset by returning the offset of the - // metaindex block (which is right near the end of the file). - if (rep_->table_properties) { - result = rep_->table_properties->data_size; - } - // table_properties is not present in the table. - if (result == 0) { - result = rep_->footer.metaindex_handle().offset(); - } + // The iterator is past the last key in the file. 
+ return data_size; } +} - return result; +uint64_t BlockBasedTable::GetApproximateDataSize() { + // Should be in table properties unless super old version + if (rep_->table_properties) { + return rep_->table_properties->data_size; + } + // Fall back to rough estimate from footer + return rep_->footer.metaindex_handle().offset(); } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) { + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Let's just split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } + BlockCacheLookupContext context(caller); IndexBlockIter iiter_on_stack; ReadOptions ro; @@ -4134,13 +3209,37 @@ } index_iter->Seek(key); - return ApproximateOffsetOf(*index_iter); + uint64_t offset; + if (index_iter->status().ok()) { + offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Split in half to avoid skewing one way or another, + // since we don't know whether we're operating on lower bound or + // upper bound. + return rep_->file_size / 2; + } + + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. + double size_ratio = + static_cast(offset) / static_cast(data_size); + return static_cast(size_ratio * + static_cast(rep_->file_size)); } uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, TableReaderCaller caller) { assert(rep_->internal_comparator.Compare(start, end) <= 0); + uint64_t data_size = GetApproximateDataSize(); + if (UNLIKELY(data_size == 0)) { + // Hmm. Assume whole file is involved, since we have lower and upper + // bound. This likely skews the estimate if we consider that this function + // is typically called with `[start, end]` fully contained in the file's + // key-range. 
+ return rep_->file_size; + } + BlockCacheLookupContext context(caller); IndexBlockIter iiter_on_stack; ReadOptions ro; @@ -4155,17 +3254,38 @@ } index_iter->Seek(start); - uint64_t start_offset = ApproximateOffsetOf(*index_iter); + uint64_t start_offset; + if (index_iter->status().ok()) { + start_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved from the start. This likely skews the estimate + // but is consistent with the above error handling. + start_offset = 0; + } + index_iter->Seek(end); - uint64_t end_offset = ApproximateOffsetOf(*index_iter); + uint64_t end_offset; + if (index_iter->status().ok()) { + end_offset = ApproximateDataOffsetOf(*index_iter, data_size); + } else { + // Assume file is involved until the end. This likely skews the estimate + // but is consistent with the above error handling. + end_offset = data_size; + } assert(end_offset >= start_offset); - return end_offset - start_offset; + // Pro-rate file metadata (incl filters) size-proportionally across data + // blocks. 
+ double size_ratio = static_cast(end_offset - start_offset) / + static_cast(data_size); + return static_cast(size_ratio * + static_cast(rep_->file_size)); } bool BlockBasedTable::TEST_FilterBlockInCache() const { assert(rep_ != nullptr); - return TEST_BlockInCache(rep_->filter_handle); + return rep_->filter_type != Rep::FilterType::kNoFilter && + TEST_BlockInCache(rep_->filter_handle); } bool BlockBasedTable::TEST_IndexBlockInCache() const { @@ -4230,21 +3350,20 @@ } Status BlockBasedTable::DumpTable(WritableFile* out_file) { + WritableFileStringStreamAdapter out_file_wrapper(out_file); + std::ostream out_stream(&out_file_wrapper); // Output Footer - out_file->Append( - "Footer Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->footer.ToString().c_str()); - out_file->Append("\n"); + out_stream << "Footer Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->footer.ToString() << "\n"; // Output MetaIndex - out_file->Append( - "Metaindex Details:\n" - "--------------------------------------\n"); + out_stream << "Metaindex Details:\n" + "--------------------------------------\n"; std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex, + ReadOptions ro; + Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); if (s.ok()) { for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); @@ -4253,27 +3372,22 @@ if (!s.ok()) { return s; } - if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) { - out_file->Append(" Properties block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (metaindex_iter->key() == - ROCKSDB_NAMESPACE::kCompressionDictBlock) { - out_file->Append(" Compression dictionary block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); + 
if (metaindex_iter->key() == kPropertiesBlockName) { + out_stream << " Properties block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == kCompressionDictBlockName) { + out_stream << " Compression dictionary block handle: " + << metaindex_iter->value().ToString(true) << "\n"; } else if (strstr(metaindex_iter->key().ToString().c_str(), "filter.rocksdb.") != nullptr) { - out_file->Append(" Filter block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) { - out_file->Append(" Range deletion block handle: "); - out_file->Append(metaindex_iter->value().ToString(true).c_str()); - out_file->Append("\n"); + out_stream << " Filter block handle: " + << metaindex_iter->value().ToString(true) << "\n"; + } else if (metaindex_iter->key() == kRangeDelBlockName) { + out_stream << " Range deletion block handle: " + << metaindex_iter->value().ToString(true) << "\n"; } } - out_file->Append("\n"); + out_stream << "\n"; } else { return s; } @@ -4283,25 +3397,19 @@ table_properties = rep_->table_properties.get(); if (table_properties != nullptr) { - out_file->Append( - "Table Properties:\n" - "--------------------------------------\n" - " "); - out_file->Append(table_properties->ToString("\n ", ": ").c_str()); - out_file->Append("\n"); + out_stream << "Table Properties:\n" + "--------------------------------------\n"; + out_stream << " " << table_properties->ToString("\n ", ": ") << "\n"; } if (rep_->filter) { - out_file->Append( - "Filter Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->filter->ToString().c_str()); - out_file->Append("\n"); + out_stream << "Filter Details:\n" + "--------------------------------------\n"; + out_stream << " " << rep_->filter->ToString() << "\n"; } // Output Index block - s = DumpIndexBlock(out_file); + s = DumpIndexBlock(out_stream); if 
(!s.ok()) { return s; } @@ -4320,15 +3428,10 @@ assert(uncompression_dict.GetValue()); const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); - out_file->Append( - "Compression Dictionary:\n" - "--------------------------------------\n"); - out_file->Append(" size (bytes): "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size())); - out_file->Append("\n\n"); - out_file->Append(" HEX "); - out_file->Append(raw_dict.ToString(true).c_str()); - out_file->Append("\n\n"); + out_stream << "Compression Dictionary:\n" + "--------------------------------------\n"; + out_stream << " size (bytes): " << raw_dict.size() << "\n\n"; + out_stream << " HEX " << raw_dict.ToString(true) << "\n\n"; } // Output range deletions block @@ -4336,39 +3439,44 @@ if (range_del_iter != nullptr) { range_del_iter->SeekToFirst(); if (range_del_iter->Valid()) { - out_file->Append( - "Range deletions:\n" - "--------------------------------------\n" - " "); + out_stream << "Range deletions:\n" + "--------------------------------------\n"; for (; range_del_iter->Valid(); range_del_iter->Next()) { - DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), + out_stream); } - out_file->Append("\n"); + out_stream << "\n"; } delete range_del_iter; } // Output Data blocks - s = DumpDataBlocks(out_file); + s = DumpDataBlocks(out_stream); - return s; + if (!s.ok()) { + return s; + } + + if (!out_stream.good()) { + return Status::IOError("Failed to write to output file"); + } + return Status::OK(); } -Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { - out_file->Append( - "Index Details:\n" - "--------------------------------------\n"); +Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { + out_stream << "Index Details:\n" + "--------------------------------------\n"; std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, 
/*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); + out_stream << "Can not read Index Block \n\n"; return s; } - out_file->Append(" Block key hex dump: Data block handle\n"); - out_file->Append(" Block key ascii\n\n"); + out_stream << " Block key hex dump: Data block handle\n"; + out_stream << " Block key ascii\n\n"; for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); blockhandles_iter->Next()) { s = blockhandles_iter->status(); @@ -4385,13 +3493,10 @@ user_key = ikey.user_key(); } - out_file->Append(" HEX "); - out_file->Append(user_key.ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(blockhandles_iter->value() - .ToString(true, rep_->index_has_first_key) - .c_str()); - out_file->Append("\n"); + out_stream << " HEX " << user_key.ToString(true) << ": " + << blockhandles_iter->value().ToString(true, + rep_->index_has_first_key) + << "\n"; std::string str_key = user_key.ToString(); std::string res_key(""); @@ -4400,22 +3505,21 @@ res_key.append(&str_key[i], 1); res_key.append(1, cspace); } - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append("\n ------\n"); + out_stream << " ASCII " << res_key << "\n"; + out_stream << " ------\n"; } - out_file->Append("\n"); + out_stream << "\n"; return Status::OK(); } -Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { +Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); + out_stream << "Can not read Index Block \n\n"; return s; } @@ -4437,12 +3541,9 @@ datablock_size_max = std::max(datablock_size_max, datablock_size); 
datablock_size_sum += datablock_size; - out_file->Append("Data Block # "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id)); - out_file->Append(" @ "); - out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); - out_file->Append("\n"); - out_file->Append("--------------------------------------\n"); + out_stream << "Data Block # " << block_id << " @ " + << blockhandles_iter->value().handle.ToString(true) << "\n"; + out_stream << "--------------------------------------\n"; std::unique_ptr datablock_iter; datablock_iter.reset(NewDataBlockIterator( @@ -4453,7 +3554,7 @@ s = datablock_iter->status(); if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n\n"); + out_stream << "Error reading the block - Skipped \n\n"; continue; } @@ -4461,44 +3562,37 @@ datablock_iter->Next()) { s = datablock_iter->status(); if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n"); + out_stream << "Error reading the block - Skipped \n"; break; } - DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream); } - out_file->Append("\n"); + out_stream << "\n"; } uint64_t num_datablocks = block_id - 1; if (num_datablocks) { double datablock_size_avg = static_cast(datablock_size_sum) / num_datablocks; - out_file->Append("Data Block Summary:\n"); - out_file->Append("--------------------------------------"); - out_file->Append("\n # data blocks: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks)); - out_file->Append("\n min data block size: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min)); - out_file->Append("\n max data block size: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max)); - out_file->Append("\n avg data block size: "); - out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg)); - out_file->Append("\n"); + out_stream << "Data Block Summary:\n"; + out_stream << 
"--------------------------------------\n"; + out_stream << " # data blocks: " << num_datablocks << "\n"; + out_stream << " min data block size: " << datablock_size_min << "\n"; + out_stream << " max data block size: " << datablock_size_max << "\n"; + out_stream << " avg data block size: " << ToString(datablock_size_avg) + << "\n"; } return Status::OK(); } void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, - WritableFile* out_file) { + std::ostream& out_stream) { InternalKey ikey; ikey.DecodeFrom(key); - out_file->Append(" HEX "); - out_file->Append(ikey.user_key().ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(value.ToString(true).c_str()); - out_file->Append("\n"); + out_stream << " HEX " << ikey.user_key().ToString(true) << ": " + << value.ToString(true) << "\n"; std::string str_key = ikey.user_key().ToString(); std::string str_value = value.ToString(); @@ -4521,11 +3615,8 @@ res_value.append(1, cspace); } - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append(": "); - out_file->Append(res_value.c_str()); - out_file->Append("\n ------\n"); + out_stream << " ASCII " << res_key << ": " << res_value << "\n"; + out_stream << " ------\n"; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,22 +9,13 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include +#include "cache/cache_key.h" #include "db/range_tombstone_fragmenter.h" #include "file/filename.h" -#include "file/random_access_file_reader.h" -#include "options/cf_options.h" -#include 
"rocksdb/options.h" -#include "rocksdb/persistent_cache.h" -#include "rocksdb/statistics.h" -#include "rocksdb/status.h" -#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table_properties.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_type.h" @@ -32,15 +23,11 @@ #include "table/block_based/filter_block.h" #include "table/block_based/uncompression_dict_reader.h" #include "table/format.h" -#include "table/get_context.h" -#include "table/multiget_context.h" -#include "table/persistent_cache_helper.h" +#include "table/persistent_cache_options.h" #include "table/table_properties_internal.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" #include "trace_replay/block_cache_tracer.h" -#include "util/coding.h" -#include "util/user_comparator_wrapper.h" namespace ROCKSDB_NAMESPACE { @@ -60,7 +47,7 @@ struct ReadOptions; class GetContext; -typedef std::vector> KVPairBlock; +using KVPairBlock = std::vector>; // Reader class for BlockBasedTable format. // For the format of BlockBasedTable refer to @@ -78,17 +65,14 @@ static const std::string kFilterBlockPrefix; static const std::string kFullFilterBlockPrefix; static const std::string kPartitionedFilterBlockPrefix; - // The longest prefix of the cache key used to identify blocks. - // For Posix files the unique ID is three varints. - static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1; // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; - // Found that 256 KB readahead size provides the best performance, based on - // experiments, for auto readahead. Experiment data is in PR #3282. 
- static const size_t kMaxAutoReadaheadSize; static const int kMinNumFileReadsToStartAutoReadahead = 2; + // 1-byte compression type + 32-bit checksum + static constexpr size_t kBlockTrailerSize = 5; + // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow // retrieving data from the table. @@ -105,20 +89,24 @@ // @param skip_filters Disables loading/accessing the filter block. Overrides // prefetch_index_and_filter_in_cache, so filter will be skipped if both // are set. - static Status Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_key_comparator, - std::unique_ptr&& file, - uint64_t file_size, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor = nullptr, - bool prefetch_index_and_filter_in_cache = true, - bool skip_filters = false, int level = -1, - const bool immortal_table = false, - const SequenceNumber largest_seqno = 0, - TailPrefetchStats* tail_prefetch_stats = nullptr, - BlockCacheTracer* const block_cache_tracer = nullptr); + // @param force_direct_prefetch if true, always prefetching to RocksDB + // buffer, rather than calling RandomAccessFile::Prefetch(). 
+ static Status Open( + const ReadOptions& ro, const ImmutableOptions& ioptions, + const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_key_comparator, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + const std::shared_ptr& prefix_extractor = nullptr, + bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false, + int level = -1, const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + bool force_direct_prefetch = false, + TailPrefetchStats* tail_prefetch_stats = nullptr, + BlockCacheTracer* const block_cache_tracer = nullptr, + size_t max_file_size_for_l0_meta_pin = 0, + const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -129,6 +117,7 @@ // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). + // @param read_options Must outlive the returned iterator. // @param skip_filters Disables loading/accessing the filter block // compaction_readahead_size: its value will only be used if caller = // kCompaction. @@ -136,7 +125,8 @@ const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; @@ -220,14 +210,52 @@ virtual size_t ApproximateMemoryUsage() const = 0; // Cache the dependencies of the index reader (e.g. the partitions // of a partitioned index). 
- virtual void CacheDependencies(bool /* pin */) {} + virtual Status CacheDependencies(const ReadOptions& /*ro*/, + bool /* pin */) { + return Status::OK(); + } }; class IndexReaderCommon; - static Slice GetCacheKey(const char* cache_key_prefix, - size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key); + // Maximum SST file size that uses standard CacheKey encoding scheme. + // See GetCacheKey to explain << 2. + 3 is permitted because it is trimmed + // off by >> 2 in GetCacheKey. + static constexpr uint64_t kMaxFileSizeStandardEncoding = + (OffsetableCacheKey::kMaxOffsetStandardEncoding << 2) + 3; + + static void SetupBaseCacheKey(const TableProperties* properties, + const std::string& cur_db_session_id, + uint64_t cur_file_number, uint64_t file_size, + OffsetableCacheKey* out_base_cache_key, + bool* out_is_stable = nullptr); + + static CacheKey GetCacheKey(const OffsetableCacheKey& base_cache_key, + const BlockHandle& handle); + + static void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage, + bool redundant, + Statistics* const statistics); + + // Get the size to read from storage for a BlockHandle. size_t because we + // are about to load into memory. + static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) { + return static_cast(handle.size() + kBlockTrailerSize); + } + + // It's the caller's responsibility to make sure that this is + // for raw block contents, which contains the compression + // byte in the end. + static inline CompressionType GetBlockCompressionType(const char* block_data, + size_t block_size) { + return static_cast(block_data[block_size]); + } + static inline CompressionType GetBlockCompressionType( + const BlockContents& contents) { + assert(contents.is_raw_block); + return GetBlockCompressionType(contents.data.data(), contents.data.size()); + } // Retrieve all key value pairs from data blocks in the table. // The key retrieved are internal keys. 
@@ -271,22 +299,34 @@ private: friend class MockedBlockBasedTable; - static std::atomic next_cache_key_id_; + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; BlockCacheTracer* const block_cache_tracer_; void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const; void UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const; - void UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, size_t usage) const; - Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - BlockType block_type, - GetContext* get_context) const; + + Cache::Handle* GetEntryFromCache(const CacheTier& cache_tier, + Cache* block_cache, const Slice& key, + BlockType block_type, const bool wait, + GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority) const; + + template + Status InsertEntryToCache(const CacheTier& cache_tier, Cache* block_cache, + const Slice& key, + const Cache::CacheItemHelper* cache_helper, + std::unique_ptr& block_holder, + size_t charge, Cache::Handle** cache_handle, + Cache::Priority priority) const; // Either Block::NewDataIterator() or Block::NewIndexIterator(). 
template static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + BlockType block_type, TBlockIter* input_iter, bool block_contents_pinned); @@ -303,6 +343,7 @@ Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, + const bool wait, const bool for_compaction, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, BlockContents* contents) const; @@ -317,7 +358,8 @@ CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, + bool wait_for_cache) const; void RetrieveMultipleBlocks( const ReadOptions& options, const MultiGetRange* batch, @@ -352,12 +394,13 @@ // @param uncompression_dict Data for presetting the compression library's // dictionary. template - Status GetDataBlockFromCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const; + Status GetDataBlockFromCache(const Slice& cache_key, Cache* block_cache, + Cache* block_cache_compressed, + const ReadOptions& read_options, + CachableEntry* block, + const UncompressionDict& uncompression_dict, + BlockType block_type, const bool wait, + GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -370,14 +413,15 @@ // @param uncompression_dict Data for presetting the compression library's // dictionary. 
template - Status PutDataBlockToCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - CachableEntry* cached_block, - BlockContents* raw_block_contents, CompressionType raw_block_comp_type, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - MemoryAllocator* memory_allocator, BlockType block_type, - GetContext* get_context) const; + Status PutDataBlockToCache(const Slice& cache_key, Cache* block_cache, + Cache* block_cache_compressed, + CachableEntry* cached_block, + BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, + MemoryAllocator* memory_allocator, + BlockType block_type, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -389,7 +433,8 @@ // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter // helps avoid re-reading meta index block if caller already created one. - Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, + Status CreateIndexReader(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, bool pin, BlockCacheLookupContext* lookup_context, @@ -408,28 +453,31 @@ const SliceTransform* prefix_extractor, BlockCacheLookupContext* lookup_context) const; + // If force_direct_prefetch is true, always prefetching to RocksDB + // buffer, rather than calling RandomAccessFile::Prefetch(). 
static Status PrefetchTail( - RandomAccessFileReader* file, uint64_t file_size, - TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, - const bool preload_all, + const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, + bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, + const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer); - Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer, + Status ReadMetaIndexBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block, std::unique_ptr* iter); - Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer, - const Slice& handle_value, - TableProperties** table_properties); - Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer, + Status ReadPropertiesBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const SequenceNumber largest_seqno); - Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, + Status ReadRangeDelBlock(const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context); Status PrefetchIndexAndFilterBlocks( - FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, bool prefetch_all, - const BlockBasedTableOptions& table_options, const int level, + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, + bool prefetch_all, const BlockBasedTableOptions& table_options, + const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin, BlockCacheLookupContext* lookup_context); static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); @@ -440,26 +488,27 @@ // Create the filter from the filter block. 
std::unique_ptr CreateFilterBlockReader( - FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, - bool pin, BlockCacheLookupContext* lookup_context); + const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); - static void SetupCacheKeyPrefix(Rep* rep); + // Size of all data blocks, maybe approximate + uint64_t GetApproximateDataSize(); - // Generate a cache key prefix from the file - static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file, - char* buffer, size_t* size); - static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer, - size_t* size); - - // Given an iterator return its offset in file. - uint64_t ApproximateOffsetOf( - const InternalIteratorBase& index_iter) const; + // Given an iterator return its offset in data block section of file. + uint64_t ApproximateDataOffsetOf( + const InternalIteratorBase& index_iter, + uint64_t data_size) const; // Helper functions for DumpTable() - Status DumpIndexBlock(WritableFile* out_file); - Status DumpDataBlocks(WritableFile* out_file); + Status DumpIndexBlock(std::ostream& out_stream); + Status DumpDataBlocks(std::ostream& out_stream); void DumpKeyValue(const Slice& key, const Slice& value, - WritableFile* out_file); + std::ostream& out_stream); + + // Returns true if prefix_extractor is compatible with that used in building + // the table file. + bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const; // A cumulative data block file read in MultiGet lower than this size will // use a stack buffer @@ -470,7 +519,7 @@ friend class DBBasicTest_MultiGetIOBufferOverrun_Test; }; -// Maitaning state of a two-level iteration on a partitioned index structure. +// Maintaining state of a two-level iteration on a partitioned index structure. 
class BlockBasedTable::PartitionedIndexIteratorState : public TwoLevelIteratorState { public: @@ -489,10 +538,10 @@ // Stores all the properties associated with a BlockBasedTable. // These are immutable. struct BlockBasedTable::Rep { - Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, + Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, const InternalKeyComparator& _internal_comparator, bool skip_filters, - int _level, const bool _immortal_table) + uint64_t _file_size, int _level, const bool _immortal_table) : ioptions(_ioptions), env_options(_env_options), table_options(_table_opt), @@ -504,22 +553,18 @@ whole_key_filtering(_table_opt.whole_key_filtering), prefix_filtering(true), global_seqno(kDisableGlobalSequenceNumber), + file_size(_file_size), level(_level), immortal_table(_immortal_table) {} - - const ImmutableCFOptions& ioptions; + ~Rep() { status.PermitUncheckedError(); } + const ImmutableOptions& ioptions; const EnvOptions& env_options; const BlockBasedTableOptions table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; Status status; std::unique_ptr file; - char cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t cache_key_prefix_size = 0; - char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t persistent_cache_key_prefix_size = 0; - char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t compressed_cache_key_prefix_size = 0; + OffsetableCacheKey base_cache_key; PersistentCacheOptions persistent_cache_options; // Footer contains the fixed table information @@ -561,6 +606,9 @@ // and every key have it's own seqno. SequenceNumber global_seqno; + // Size of the table file on disk + uint64_t file_size; + // the level when the table is opened, could potentially change when trivial // move is involved int level; @@ -606,219 +654,68 @@ uint64_t sst_number_for_tracing() const { return file ? 
TableFileNameToNumber(file->file_name()) : UINT64_MAX; } - void CreateFilePrefetchBuffer( - size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb) const { - fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size, - max_readahead_size, - !ioptions.allow_mmap_reads /* enable */)); - } -}; - -// Iterates over the contents of BlockBasedTable. -template -class BlockBasedTableIterator : public InternalIteratorBase { - // compaction_readahead_size: its value will only be used if for_compaction = - // true - public: - BlockBasedTableIterator(const BlockBasedTable* table, - const ReadOptions& read_options, - const InternalKeyComparator& icomp, - InternalIteratorBase* index_iter, - bool check_filter, bool need_upper_bound_check, - const SliceTransform* prefix_extractor, - BlockType block_type, TableReaderCaller caller, - size_t compaction_readahead_size = 0) - : table_(table), - read_options_(read_options), - icomp_(icomp), - user_comparator_(icomp.user_comparator()), - index_iter_(index_iter), - pinned_iters_mgr_(nullptr), - block_iter_points_to_real_block_(false), - check_filter_(check_filter), - need_upper_bound_check_(need_upper_bound_check), - prefix_extractor_(prefix_extractor), - block_type_(block_type), - lookup_context_(caller), - compaction_readahead_size_(compaction_readahead_size) {} - - ~BlockBasedTableIterator() { delete index_iter_; } - - void Seek(const Slice& target) override; - void SeekForPrev(const Slice& target) override; - void SeekToFirst() override; - void SeekToLast() override; - void Next() final override; - bool NextAndGetResult(IterateResult* result) override; - void Prev() override; - bool Valid() const override { - return !is_out_of_bound_ && - (is_at_first_key_from_index_ || - (block_iter_points_to_real_block_ && block_iter_.Valid())); - } - Slice key() const override { - assert(Valid()); - if (is_at_first_key_from_index_) { - return index_iter_->value().first_internal_key; - } else { - return block_iter_.key(); - } - } 
- Slice user_key() const override { - assert(Valid()); - if (is_at_first_key_from_index_) { - return ExtractUserKey(index_iter_->value().first_internal_key); - } else { - return block_iter_.user_key(); - } + void CreateFilePrefetchBuffer(size_t readahead_size, + size_t max_readahead_size, + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { + fpb->reset(new FilePrefetchBuffer(readahead_size, max_readahead_size, + !ioptions.allow_mmap_reads /* enable */, + false /* track_min_offset */, + implicit_auto_readahead)); } - TValue value() const override { - assert(Valid()); - // Load current block if not loaded. - if (is_at_first_key_from_index_ && - !const_cast(this) - ->MaterializeCurrentBlock()) { - // Oops, index is not consistent with block contents, but we have - // no good way to report error at this point. Let's return empty value. - return TValue(); - } - - return block_iter_.value(); - } - Status status() const override { - // Prefix index set status to NotFound when the prefix does not exist - if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { - return index_iter_->status(); - } else if (block_iter_points_to_real_block_) { - return block_iter_.status(); - } else { - return Status::OK(); + void CreateFilePrefetchBufferIfNotExists( + size_t readahead_size, size_t max_readahead_size, + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { + if (!(*fpb)) { + CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, + implicit_auto_readahead); } } +}; - // Whether iterator invalidated for being out of bound. 
- bool IsOutOfBound() override { return is_out_of_bound_; } - - inline bool MayBeOutOfUpperBound() override { - assert(Valid()); - return !data_block_within_upper_bound_; - } - - void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { - pinned_iters_mgr_ = pinned_iters_mgr; - } - bool IsKeyPinned() const override { - // Our key comes either from block_iter_'s current key - // or index_iter_'s current *value*. - return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || - (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); - } - bool IsValuePinned() const override { - // Load current block if not loaded. - if (is_at_first_key_from_index_) { - const_cast(this)->MaterializeCurrentBlock(); - } - // BlockIter::IsValuePinned() is always true. No need to check - return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_; - } +// This is an adapter class for `WritableFile` to be used for `std::ostream`. +// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream` +// constructor for storing streaming data. +// Note: +// * This adapter doesn't provide any buffering, each write is forwarded to +// `WritableFile->Append()` directly. +// * For a failed write, the user needs to check the status by `ostream.good()` +class WritableFileStringStreamAdapter : public std::stringbuf { + public: + explicit WritableFileStringStreamAdapter(WritableFile* writable_file) + : file_(writable_file) {} - void ResetDataIter() { - if (block_iter_points_to_real_block_) { - if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { - block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + // Override overflow() to handle `sputc()`. There are cases that will not go + // through `xsputn()` e.g. 
`std::endl` or an unsigned long long is written by + // `os.put()` directly and will call `sputc()` By internal implementation: + // int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) { // put a character + // return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) : + // overflow(_Traits::to_int_type(_Ch)); + // } + // As we explicitly disabled buffering (_Pnavail() is always 0), every write, + // not captured by xsputn(), becomes an overflow here. + int overflow(int ch = EOF) override { + if (ch != EOF) { + Status s = file_->Append(Slice((char*)&ch, 1)); + if (s.ok()) { + return ch; } - block_iter_.Invalidate(Status::OK()); - block_iter_points_to_real_block_ = false; } + return EOF; } - void SavePrevIndexValue() { - if (block_iter_points_to_real_block_) { - // Reseek. If they end up with the same data block, we shouldn't re-fetch - // the same data block. - prev_block_offset_ = index_iter_->value().handle.offset(); + std::streamsize xsputn(char const* p, std::streamsize n) override { + Status s = file_->Append(Slice(p, n)); + if (!s.ok()) { + return 0; } + return n; } private: - enum class IterDirection { - kForward, - kBackward, - }; - - const BlockBasedTable* table_; - const ReadOptions read_options_; - const InternalKeyComparator& icomp_; - UserComparatorWrapper user_comparator_; - InternalIteratorBase* index_iter_; - PinnedIteratorsManager* pinned_iters_mgr_; - TBlockIter block_iter_; - - // True if block_iter_ is initialized and points to the same block - // as index iterator. - bool block_iter_points_to_real_block_; - // See InternalIteratorBase::IsOutOfBound(). - bool is_out_of_bound_ = false; - // Whether current data block being fully within iterate upper bound. - bool data_block_within_upper_bound_ = false; - // True if we're standing at the first key of a block, and we haven't loaded - // that block yet. A call to value() will trigger loading the block. 
- bool is_at_first_key_from_index_ = false; - bool check_filter_; - // TODO(Zhongyi): pick a better name - bool need_upper_bound_check_; - const SliceTransform* prefix_extractor_; - BlockType block_type_; - uint64_t prev_block_offset_ = std::numeric_limits::max(); - BlockCacheLookupContext lookup_context_; - // Readahead size used in compaction, its value is used only if - // lookup_context_.caller = kCompaction. - size_t compaction_readahead_size_; - - size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; - size_t readahead_limit_ = 0; - int64_t num_file_reads_ = 0; - std::unique_ptr prefetch_buffer_; - - // If `target` is null, seek to first. - void SeekImpl(const Slice* target); - - void InitDataBlock(); - bool MaterializeCurrentBlock(); - void FindKeyForward(); - void FindBlockForward(); - void FindKeyBackward(); - void CheckOutOfBound(); - - // Check if data block is fully within iterate_upper_bound. - // - // Note MyRocks may update iterate bounds between seek. To workaround it, - // we need to check and update data_block_within_upper_bound_ accordingly. - void CheckDataBlockWithinUpperBound(); - - bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { - if (need_upper_bound_check_ && direction == IterDirection::kBackward) { - // Upper bound check isn't sufficnet for backward direction to - // guarantee the same result as total order, so disable prefix - // check. - return true; - } - if (check_filter_ && - !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, - need_upper_bound_check_, &lookup_context_)) { - // TODO remember the iterator is invalidated because of prefix - // match. This can avoid the upper level file iterator to falsely - // believe the position is the end of the SST file and move to - // the first key of the next file. 
- ResetDataIter(); - return false; - } - return true; - } + WritableFile* file_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,163 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/reader_common.h" + +// The file contains some member functions of BlockBasedTable that +// cannot be implemented in block_based_table_reader.cc because +// it's called by other files (e.g. block_based_iterator.h) and +// are templates. + +namespace ROCKSDB_NAMESPACE { +// Convert an index iterator value (i.e., an encoded BlockHandle) +// into an iterator over the contents of the corresponding block. 
+// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template +TBlockIter* BlockBasedTable::NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + CachableEntry block; + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, + get_context, lookup_context, for_compaction, + /* use_cache */ true, /* wait_for_cache */ true); + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), block_type, iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + Cache* const block_cache = rep_->table_options.block_cache.get(); + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); + s = block_cache->Insert(key.AsSlice(), nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + + return iter; +} + +// Convert an uncompressed data block (i.e CachableEntry) +// into an iterator over the contents of the corresponding block. +// If input_iter is null, new a iterator +// If input_iter is not null, update this iter and return it +template +TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, + Status s) const { + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. 
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), BlockType::kData, + iter, block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache) { + Cache* const block_cache = rep_->table_options.block_cache.get(); + if (block_cache) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle = nullptr; + CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); + s = block_cache->Insert(key.AsSlice(), nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,357 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "table/block_based/block_based_table_reader.h" + +#include "db/table_properties_collector.h" +#include "file/file_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/partitioned_index_iterator.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class BlockBasedTableReaderTest + : public testing::Test, + public testing::WithParamInterface> { + protected: + CompressionType compression_type_; + bool use_direct_reads_; + + void SetUp() override { + BlockBasedTableOptions::IndexType index_type; + bool no_block_cache; + std::tie(compression_type_, use_direct_reads_, index_type, no_block_cache) = + GetParam(); + + SetupSyncPointsToMockDirectIO(); + test_dir_ = test::PerThreadDBPath("block_based_table_reader_test"); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + + BlockBasedTableOptions opts; + opts.index_type = index_type; + opts.no_block_cache = no_block_cache; + table_factory_.reset( + static_cast(NewBlockBasedTableFactory(opts))); + } + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + // Creates a table with the specificied key value pairs (kv). + void CreateTable(const std::string& table_name, + const CompressionType& compression_type, + const std::map& kv) { + std::unique_ptr writer; + NewFileWriter(table_name, &writer); + + // Create table builder. 
+ Options options; + ImmutableOptions ioptions(options); + InternalKeyComparator comparator(options.comparator); + ColumnFamilyOptions cf_options; + MutableCFOptions moptions(cf_options); + IntTblPropCollectorFactories factories; + std::unique_ptr table_builder(table_factory_->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, comparator, &factories, + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); + + // Build table. + for (auto it = kv.begin(); it != kv.end(); it++) { + std::string k = ToInternalKey(it->first); + std::string v = it->second; + table_builder->Add(k, v); + } + ASSERT_OK(table_builder->Finish()); + } + + void NewBlockBasedTableReader(const FileOptions& foptions, + const ImmutableOptions& ioptions, + const InternalKeyComparator& comparator, + const std::string& table_name, + std::unique_ptr* table) { + std::unique_ptr file; + NewFileReader(table_name, foptions, &file); + + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); + + std::unique_ptr table_reader; + ReadOptions ro; + const auto* table_options = + table_factory_->GetOptions(); + ASSERT_NE(table_options, nullptr); + ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, + comparator, std::move(file), file_size, + &table_reader)); + + table->reset(reinterpret_cast(table_reader.release())); + } + + std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; } + + const std::shared_ptr& fs() const { return fs_; } + + private: + std::string test_dir_; + Env* env_; + std::shared_ptr fs_; + std::unique_ptr table_factory_; + + void WriteToFile(const std::string& content, const std::string& filename) { + std::unique_ptr f; + ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void NewFileWriter(const 
std::string& filename, + std::unique_ptr* writer) { + std::string path = Path(filename); + EnvOptions env_options; + FileOptions foptions; + std::unique_ptr file; + ASSERT_OK(fs_->NewWritableFile(path, foptions, &file, nullptr)); + writer->reset(new WritableFileWriter(std::move(file), path, env_options)); + } + + void NewFileReader(const std::string& filename, const FileOptions& opt, + std::unique_ptr* reader) { + std::string path = Path(filename); + std::unique_ptr f; + ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); + } + + std::string ToInternalKey(const std::string& key) { + InternalKey internal_key(key, 0, ValueType::kTypeValue); + return internal_key.Encode().ToString(); + } +}; + +// Tests MultiGet in both direct IO and non-direct IO mode. +// The keys should be in cache after MultiGet. +TEST_P(BlockBasedTableReaderTest, MultiGet) { + // Prepare key-value pairs to occupy multiple blocks. + // Each value is 256B, every 16 pairs constitute 1 block. + // Adjacent blocks contain values with different compression complexity: + // human readable strings are easier to compress than random strings. + std::map kv; + { + Random rnd(101); + uint32_t key = 0; + for (int block = 0; block < 100; block++) { + for (int i = 0; i < 16; i++) { + char k[9] = {0}; + // Internal key is constructed directly from this key, + // and internal key size is required to be >= 8 bytes, + // so use %08u as the format string. + sprintf(k, "%08u", key); + std::string v; + if (block % 2) { + v = rnd.HumanReadableString(256); + } else { + v = rnd.RandomString(256); + } + kv[std::string(k)] = v; + key++; + } + } + } + + // Prepare keys, values, and statuses for MultiGet. 
+ autovector keys; + autovector values; + autovector statuses; + { + const int step = + static_cast(kv.size()) / MultiGetContext::MAX_BATCH_SIZE; + auto it = kv.begin(); + for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE; i++) { + keys.emplace_back(it->first); + values.emplace_back(); + statuses.emplace_back(); + std::advance(it, step); + } + } + + std::string table_name = + "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); + CreateTable(table_name, compression_type_, kv); + + std::unique_ptr table; + Options options; + ImmutableOptions ioptions(options); + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + + // Ensure that keys are not in cache before MultiGet. + for (auto& key : keys) { + ASSERT_FALSE(table->TEST_KeyInCache(ReadOptions(), key)); + } + + // Prepare MultiGetContext. + autovector get_context; + autovector key_context; + autovector sorted_keys; + for (size_t i = 0; i < keys.size(); ++i) { + get_context.emplace_back( + BytewiseComparator(), nullptr, nullptr, nullptr, GetContext::kNotFound, + keys[i], &values[i], nullptr, nullptr, nullptr, true /* do_merge */, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); + key_context.emplace_back(nullptr, keys[i], &values[i], nullptr, + &statuses.back()); + key_context.back().get_context = &get_context.back(); + } + for (auto& key_ctx : key_context) { + sorted_keys.emplace_back(&key_ctx); + } + MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions()); + + // Execute MultiGet. 
+ MultiGetContext::Range range = ctx.GetMultiGetRange(); + PerfContext* perf_ctx = get_perf_context(); + perf_ctx->Reset(); + table->MultiGet(ReadOptions(), &range, nullptr); + + ASSERT_GE(perf_ctx->block_read_count - perf_ctx->index_block_read_count - + perf_ctx->filter_block_read_count - + perf_ctx->compression_dict_block_read_count, + 1); + ASSERT_GE(perf_ctx->block_read_byte, 1); + + for (const Status& status : statuses) { + ASSERT_OK(status); + } + // Check that keys are in cache after MultiGet. + for (size_t i = 0; i < keys.size(); i++) { + ASSERT_TRUE(table->TEST_KeyInCache(ReadOptions(), keys[i])); + ASSERT_EQ(values[i].ToString(), kv[keys[i].ToString()]); + } +} + +class BlockBasedTableReaderTestVerifyChecksum + : public BlockBasedTableReaderTest { + public: + BlockBasedTableReaderTestVerifyChecksum() : BlockBasedTableReaderTest() {} +}; + +TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { + // Prepare key-value pairs to occupy multiple blocks. + // Each value is 256B, every 16 pairs constitute 1 block. + // Adjacent blocks contain values with different compression complexity: + // human readable strings are easier to compress than random strings. + Random rnd(101); + std::map kv; + { + uint32_t key = 0; + for (int block = 0; block < 800; block++) { + for (int i = 0; i < 16; i++) { + char k[9] = {0}; + // Internal key is constructed directly from this key, + // and internal key size is required to be >= 8 bytes, + // so use %08u as the format string. 
+ sprintf(k, "%08u", key); + std::string v = rnd.RandomString(256); + kv[std::string(k)] = v; + key++; + } + } + } + + std::string table_name = + "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); + CreateTable(table_name, compression_type_, kv); + + std::unique_ptr table; + Options options; + ImmutableOptions ioptions(options); + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + + // Use the top level iterator to find the offset/size of the first + // 2nd level index block and corrupt the block + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum}; + InternalIteratorBase* iiter = table->NewIndexIterator( + ReadOptions(), /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + ASSERT_OK(iiter->status()); + iiter->SeekToFirst(); + BlockHandle handle = static_cast(iiter) + ->index_iter_->value() + .handle; + table.reset(); + + // Corrupt the block pointed to by handle + ASSERT_OK(test::CorruptFile(options.env, Path(table_name), + static_cast(handle.offset()), 128)); + + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + Status s = table->VerifyChecksum(ReadOptions(), + TableReaderCaller::kUserVerifyChecksum); + ASSERT_EQ(s.code(), Status::kCorruption); +} + +// Param 1: compression type +// Param 2: whether to use direct reads +// Param 3: Block Based Table Index type +// Param 4: BBTO no_block_cache option +#ifdef ROCKSDB_LITE +// Skip direct I/O tests in lite mode since direct I/O is unsupported. 
+INSTANTIATE_TEST_CASE_P( + MultiGet, BlockBasedTableReaderTest, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(false), + ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), + ::testing::Values(false))); +#else // ROCKSDB_LITE +INSTANTIATE_TEST_CASE_P( + MultiGet, BlockBasedTableReaderTest, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), + ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), + ::testing::Values(false))); +#endif // ROCKSDB_LITE +INSTANTIATE_TEST_CASE_P( + VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum, + ::testing::Combine( + ::testing::ValuesIn(GetSupportedCompressions()), + ::testing::Values(false), + ::testing::Values( + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch), + ::testing::Values(true))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -50,7 +50,7 @@ : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), use_value_delta_encoding_(use_value_delta_encoding), - restarts_(), + restarts_(1, 0), // First restart point is at offset 0 counter_(0), finished_(false) { switch (index_type) { @@ -64,14 +64,13 @@ assert(0); } assert(block_restart_interval_ >= 1); - restarts_.push_back(0); // First restart point is at offset 0 estimate_ = sizeof(uint32_t) + sizeof(uint32_t); } void BlockBuilder::Reset() { buffer_.clear(); - 
restarts_.clear(); - restarts_.push_back(0); // First restart point is at offset 0 + restarts_.resize(1); // First restart point is at offset 0 + assert(restarts_[0] == 0); estimate_ = sizeof(uint32_t) + sizeof(uint32_t); counter_ = 0; finished_ = false; @@ -79,6 +78,14 @@ if (data_block_hash_index_builder_.Valid()) { data_block_hash_index_builder_.Reset(); } +#ifndef NDEBUG + add_with_last_key_called_ = false; +#endif +} + +void BlockBuilder::SwapAndReset(std::string& buffer) { + std::swap(buffer_, buffer); + Reset(); } size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, @@ -134,33 +141,62 @@ void BlockBuilder::Add(const Slice& key, const Slice& value, const Slice* const delta_value) { + // Ensure no unsafe mixing of Add and AddWithLastKey + assert(!add_with_last_key_called_); + + AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size()); + if (use_delta_encoding_) { + // Update state + // We used to just copy the changed data, but it appears to be + // faster to just copy the whole thing. + last_key_.assign(key.data(), key.size()); + } +} + +void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, + const Slice& last_key_param, + const Slice* const delta_value) { + // Ensure no unsafe mixing of Add and AddWithLastKey + assert(last_key_.empty()); +#ifndef NDEBUG + add_with_last_key_called_ = false; +#endif + + // Here we make sure to use an empty `last_key` on first call after creation + // or Reset. This is more convenient for the caller and we can be more + // clever inside BlockBuilder. On this hot code path, we want to avoid + // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a + // fast min operation instead, with an assertion to be sure our logic is + // sound. 
+ size_t buffer_size = buffer_.size(); + size_t last_key_size = last_key_param.size(); + assert(buffer_size == 0 || buffer_size >= last_key_size); + + Slice last_key(last_key_param.data(), std::min(buffer_size, last_key_size)); + + AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size); +} + +inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, + const Slice& value, + const Slice& last_key, + const Slice* const delta_value, + size_t buffer_size) { assert(!finished_); assert(counter_ <= block_restart_interval_); assert(!use_value_delta_encoding_ || delta_value); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression - restarts_.push_back(static_cast(buffer_.size())); + restarts_.push_back(static_cast(buffer_size)); estimate_ += sizeof(uint32_t); counter_ = 0; - - if (use_delta_encoding_) { - // Update state - last_key_.assign(key.data(), key.size()); - } } else if (use_delta_encoding_) { - Slice last_key_piece(last_key_); // See how much sharing to do with previous string - shared = key.difference_offset(last_key_piece); - - // Update state - // We used to just copy the changed data here, but it appears to be - // faster to just copy the whole thing. 
- last_key_.assign(key.data(), key.size()); + shared = key.difference_offset(last_key); } const size_t non_shared = key.size() - shared; - const size_t curr_size = buffer_.size(); if (use_value_delta_encoding_) { // Add "" to buffer_ @@ -190,7 +226,7 @@ } counter_++; - estimate_ += buffer_.size() - curr_size; + estimate_ += buffer_.size() - buffer_size; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -32,11 +32,29 @@ // Reset the contents as if the BlockBuilder was just constructed. void Reset(); + // Swap the contents in BlockBuilder with buffer, then reset the BlockBuilder. + void SwapAndReset(std::string& buffer); + // REQUIRES: Finish() has not been called since the last call to Reset(). // REQUIRES: key is larger than any previously added key + // DO NOT mix with AddWithLastKey() between Resets. For efficiency, use + // AddWithLastKey() in contexts where previous added key is already known + // and delta encoding might be used. void Add(const Slice& key, const Slice& value, const Slice* const delta_value = nullptr); + // A faster version of Add() if the previous key is already known for all + // Add()s. + // REQUIRES: Finish() has not been called since the last call to Reset(). + // REQUIRES: key is larger than any previously added key + // REQUIRES: if AddWithLastKey has been called since last Reset(), last_key + // is the key from most recent AddWithLastKey. (For convenience, last_key + // is ignored on first call after creation or Reset().) + // DO NOT mix with Add() between Resets. 
+ void AddWithLastKey(const Slice& key, const Slice& value, + const Slice& last_key, + const Slice* const delta_value = nullptr); + // Finish building the block and return a slice that refers to the // block contents. The returned slice will remain valid for the // lifetime of this builder or until Reset() is called. @@ -57,6 +75,11 @@ bool empty() const { return buffer_.empty(); } private: + inline void AddWithLastKeyImpl(const Slice& key, const Slice& value, + const Slice& last_key, + const Slice* const delta_value, + size_t buffer_size); + const int block_restart_interval_; // TODO(myabandeh): put it into a separate IndexBlockBuilder const bool use_delta_encoding_; @@ -70,6 +93,9 @@ bool finished_; // Has Finish() been called? std::string last_key_; DataBlockHashIndexBuilder data_block_hash_index_builder_; +#ifndef NDEBUG + bool add_with_last_key_called_ = false; +#endif }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_like_traits.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,225 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "cache/cache_entry_roles.h" +#include "port/lang.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +template +class BlocklikeTraits; + +template +Cache::CacheItemHelper* GetCacheItemHelperForRole(); + +template +Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit, + Statistics* statistics, bool using_zstd, + const FilterPolicy* filter_policy) { + return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy]( + void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + assert(buf != nullptr); + std::unique_ptr buf_data(new char[size]()); + memcpy(buf_data.get(), buf, size); + BlockContents bc = BlockContents(std::move(buf_data), size); + TBlocklike* ucd_ptr = BlocklikeTraits::Create( + std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd, + filter_policy); + *out_obj = reinterpret_cast(ucd_ptr); + *charge = size; + return Status::OK(); + }; +} + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + BlockContents* ptr = static_cast(obj); + return ptr->data.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + BlockContents* ptr = static_cast(from_obj); + const char* buf = ptr->data.data(); + assert(length == ptr->data.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* 
GetCacheItemHelper(BlockType block_type) { + if (block_type == BlockType::kFilter) { + return GetCacheItemHelperForRole< + BlockContents, CacheEntryRole::kDeprecatedFilterBlock>(); + } else { + // E.g. compressed cache + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(obj); + return ptr->GetBlockContentsData().size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(from_obj); + const char* buf = ptr->GetBlockContentsData().data(); + assert(length == ptr->GetBlockContentsData().size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kFilter); + return GetCacheItemHelperForRole(); + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics, bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + Block* ptr = static_cast(obj); + return ptr->size(); + } + + static Status SaveToCallback(void* from_obj, size_t 
from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + Block* ptr = static_cast(from_obj); + const char* buf = ptr->data(); + assert(length == ptr->size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + switch (block_type) { + case BlockType::kData: + return GetCacheItemHelperForRole(); + case BlockType::kIndex: + return GetCacheItemHelperForRole(); + case BlockType::kFilter: + return GetCacheItemHelperForRole(); + default: + // Not a recognized combination + assert(false); + FALLTHROUGH_INTENDED; + case BlockType::kRangeDeletion: + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static UncompressionDict* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + UncompressionDict* ptr = static_cast(obj); + return ptr->slice_.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + UncompressionDict* ptr = static_cast(from_obj); + const char* buf = ptr->slice_.data(); + assert(length == ptr->slice_.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kCompressionDictionary); + return GetCacheItemHelperForRole(); + } +}; + +// Get an CacheItemHelper pointer for value type T and role R. 
+template +Cache::CacheItemHelper* GetCacheItemHelperForRole() { + static Cache::CacheItemHelper cache_helper( + BlocklikeTraits::SizeCallback, BlocklikeTraits::SaveToCallback, + GetCacheEntryDeleterForRole()); + return &cache_helper; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_prefetcher.h" + +#include "table/block_based/block_based_table_reader.h" + +namespace ROCKSDB_NAMESPACE { +void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep, + const BlockHandle& handle, + size_t readahead_size, + bool is_for_compaction) { + if (is_for_compaction) { + rep->CreateFilePrefetchBufferIfNotExists(compaction_readahead_size_, + compaction_readahead_size_, + &prefetch_buffer_, false); + return; + } + + // Explicit user requested readahead. + if (readahead_size > 0) { + rep->CreateFilePrefetchBufferIfNotExists(readahead_size, readahead_size, + &prefetch_buffer_, false); + return; + } + + // Implicit readahead. + + // If max_auto_readahead_size is set to be 0 by user, no data will be + // prefetched. 
+ size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size; + if (max_auto_readahead_size == 0) { + return; + } + + size_t len = BlockBasedTable::BlockSizeWithTrailer(handle); + size_t offset = handle.offset(); + + // If FS supports prefetching (readahead_limit_ will be non zero in that case) + // and current block exists in prefetch buffer then return. + if (offset + len <= readahead_limit_) { + UpdateReadPattern(offset, len); + return; + } + + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, len); + ResetValues(); + return; + } + UpdateReadPattern(offset, len); + + // Implicit auto readahead, which will be enabled if the number of reads + // reached `kMinNumFileReadsToStartAutoReadahead` (default: 2) and scans are + // sequential. + num_file_reads_++; + if (num_file_reads_ <= + BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { + return; + } + + if (initial_auto_readahead_size_ > max_auto_readahead_size) { + initial_auto_readahead_size_ = max_auto_readahead_size; + } + + if (rep->file->use_direct_io()) { + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size_, + max_auto_readahead_size, + &prefetch_buffer_, true); + return; + } + + if (readahead_size_ > max_auto_readahead_size) { + readahead_size_ = max_auto_readahead_size; + } + + // If prefetch is not supported, fall back to use internal prefetch buffer. + // Discarding other return status of Prefetch calls intentionally, as + // we can fallback to reading from disk if Prefetch fails. + Status s = rep->file->Prefetch( + handle.offset(), + BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_); + if (s.IsNotSupported()) { + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size_, + max_auto_readahead_size, + &prefetch_buffer_, true); + return; + } + + readahead_limit_ = offset + len + readahead_size_; + // Keep exponentially increasing readahead size until + // max_auto_readahead_size. 
+ readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_prefetcher.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +namespace ROCKSDB_NAMESPACE { +class BlockPrefetcher { + public: + explicit BlockPrefetcher(size_t compaction_readahead_size) + : compaction_readahead_size_(compaction_readahead_size) {} + void PrefetchIfNeeded(const BlockBasedTable::Rep* rep, + const BlockHandle& handle, size_t readahead_size, + bool is_for_compaction); + FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); } + + void UpdateReadPattern(const uint64_t& offset, const size_t& len) { + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const uint64_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + void ResetValues() { + num_file_reads_ = 1; + // Since initial_auto_readahead_size_ can be different from + // kInitAutoReadaheadSize in case of adaptive_readahead, so fallback the + // readahead_size_ to kInitAutoReadaheadSize in case of reset. 
+ initial_auto_readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + readahead_size_ = initial_auto_readahead_size_; + readahead_limit_ = 0; + return; + } + + void SetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) { + num_file_reads_ = readahead_info->num_file_reads; + initial_auto_readahead_size_ = readahead_info->readahead_size; + TEST_SYNC_POINT_CALLBACK("BlockPrefetcher::SetReadaheadState", + &initial_auto_readahead_size_); + } + + private: + // Readahead size used in compaction, its value is used only if + // lookup_context_.caller = kCompaction. + size_t compaction_readahead_size_; + + // readahead_size_ is used if underlying FS supports prefetching. + size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + size_t readahead_limit_ = 0; + // initial_auto_readahead_size_ is used if RocksDB uses internal prefetch + // buffer. + uint64_t initial_auto_readahead_size_ = + BlockBasedTable::kInitAutoReadaheadSize; + int64_t num_file_reads_ = 0; + uint64_t prev_offset_ = 0; + size_t prev_len_ = 0; + std::unique_ptr prefetch_buffer_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,7 +4,10 @@ // (found in the LICENSE.Apache file in the root directory). 
// +#include "table/block_based/block.h" + #include + #include #include #include @@ -20,7 +23,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" #include "table/format.h" #include "test_util/testharness.h" @@ -29,20 +32,16 @@ namespace ROCKSDB_NAMESPACE { -static std::string RandomString(Random *rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} -std::string GenerateKey(int primary_key, int secondary_key, int padding_size, - Random *rnd) { +std::string GenerateInternalKey(int primary_key, int secondary_key, + int padding_size, Random *rnd) { char buf[50]; char *p = &buf[0]; snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); std::string k(p); if (padding_size) { - k += RandomString(rnd, padding_size); + k += rnd->RandomString(padding_size); } + AppendInternalKeyFooter(&k, 0 /* seqno */, kTypeValue); return k; } @@ -61,10 +60,11 @@ for (int i = from; i < from + len; i += step) { // generating keys that shares the prefix for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + // `DataBlockIter` assumes it reads only internal keys. 
+ keys->emplace_back(GenerateInternalKey(i, j, padding_size, &rnd)); // 100 bytes values - values->emplace_back(RandomString(&rnd, 100)); + values->emplace_back(rnd.RandomString(100)); } } } @@ -93,12 +93,12 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); // read contents of block sequentially int count = 0; InternalIterator *iter = - reader.NewDataIterator(options.comparator, options.comparator); + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -111,7 +111,8 @@ delete iter; // read block contents randomly - iter = reader.NewDataIterator(options.comparator, options.comparator); + iter = + reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -151,14 +152,14 @@ const size_t prefix_size = 6; // create block reader BlockContents contents_ref(contents.data); - Block reader1(std::move(contents), kDisableGlobalSequenceNumber); - Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber); + Block reader1(std::move(contents)); + Block reader2(std::move(contents_ref)); std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); - std::unique_ptr regular_iter( - reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator())); + std::unique_ptr regular_iter(reader2.NewDataIterator( + BytewiseComparator(), kDisableGlobalSequenceNumber)); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -175,7 +176,8 @@ // simply be set as invalid; whereas the binary search based iterator will // return the one that is closest. 
for (int i = 1; i < max_key - 1; i += 2) { - auto key = GenerateKey(i, 0, 0, nullptr); + // `DataBlockIter` assumes its APIs receive only internal keys. + auto key = GenerateInternalKey(i, 0, 0, nullptr); regular_iter->Seek(key); ASSERT_TRUE(regular_iter->Valid()); } @@ -375,13 +377,12 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber, - kBytesPerBit, stats.get()); + Block reader(std::move(contents), kBytesPerBit, stats.get()); // read contents of block sequentially size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -408,12 +409,11 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber, - kBytesPerBit, stats.get()); + Block reader(std::move(contents), kBytesPerBit, stats.get()); size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -443,12 +443,11 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber, - kBytesPerBit, stats.get()); + Block reader(std::move(contents), kBytesPerBit, stats.get()); size_t read_bytes = 0; DataBlockIter *iter = reader.NewDataIterator( - options.comparator, options.comparator, nullptr, stats.get()); + options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -524,7 +523,7 
@@ separators->emplace_back(*it++); uint64_t size = rnd.Uniform(1024 * 16); BlockHandle handle(offset, size); - offset += size + kBlockTrailerSize; + offset += size + BlockBasedTable::kBlockTrailerSize; block_handles->emplace_back(handle); } } @@ -563,7 +562,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); const bool kTotalOrderSeek = true; const bool kIncludesSeq = true; @@ -572,7 +571,7 @@ Statistics *kNullStats = nullptr; // read contents of block sequentially InternalIteratorBase *iter = reader.NewIndexIterator( - options.comparator, options.comparator, kNullIter, kNullStats, + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); iter->SeekToFirst(); for (int index = 0; index < num_records; ++index) { @@ -592,9 +591,9 @@ delete iter; // read block contents randomly - iter = reader.NewIndexIterator(options.comparator, options.comparator, - kNullIter, kNullStats, kTotalOrderSeek, - includeFirstKey(), kIncludesSeq, kValueIsFull); + iter = reader.NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); for (int i = 0; i < num_records * 2; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_type.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_type.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/block_type.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/block_type.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ #include +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // Represents the types of blocks used in the block based 
table format. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/cachable_entry.h 2025-05-19 16:14:27.000000000 +0000 @@ -162,7 +162,6 @@ } void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { - assert(value != nullptr); assert(cache != nullptr); assert(cache_handle != nullptr); @@ -179,6 +178,22 @@ assert(!own_value_); } + void UpdateCachedValue() { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + + value_ = static_cast(cache_->Value(cache_handle_)); + } + + bool IsReady() { + if (!own_value_) { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + return cache_->IsReady(cache_handle_); + } + return true; + } + private: void ReleaseResource() { if (LIKELY(cache_handle_ != nullptr)) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/data_block_hash_index.h" + #include #include #include @@ -12,11 +14,11 @@ #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#include "table/block_based/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -35,12 +37,6 @@ return entry == restart_point; } -// Random KV generator similer to block_test -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} std::string GenerateKey(int primary_key, int secondary_key, int padding_size, Random* rnd) { char buf[50]; @@ -48,7 +44,7 @@ snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); std::string k(p); if (padding_size) { - k += RandomString(rnd, padding_size); + k += rnd->RandomString(padding_size); } return k; @@ -71,7 +67,7 @@ keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); // 100 bytes values - values->emplace_back(RandomString(&rnd, 100)); + values->emplace_back(rnd.RandomString(100)); } } } @@ -284,7 +280,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); ASSERT_EQ(reader.IndexType(), BlockBasedTableOptions::kDataBlockBinaryAndHash); @@ -306,7 +302,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); ASSERT_EQ(reader.IndexType(), BlockBasedTableOptions::kDataBlockBinarySearch); @@ -337,7 +333,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); 
ASSERT_EQ(reader.IndexType(), BlockBasedTableOptions::kDataBlockBinaryAndHash); @@ -361,7 +357,7 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); // the index type have fallen back to binary when build finish. ASSERT_EQ(reader.IndexType(), @@ -388,10 +384,11 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); bool may_exist; // search in block for the key just inserted { @@ -469,12 +466,13 @@ // create block reader BlockContents contents; contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); + Block reader(std::move(contents)); const InternalKeyComparator icmp(BytewiseComparator()); // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "1" /* existing key marker */); @@ -511,7 +509,8 @@ // C true false for (int i = 0; i < num_records; i++) { - auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(icmp.user_comparator(), + kDisableGlobalSequenceNumber); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "0" /* non-existing key marker */); @@ -540,26 +539,27 @@ int level_ = -1; std::vector keys; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions 
ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); EnvOptions soptions; soptions.use_mmap_reads = ioptions.allow_mmap_reads; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr f(sink); file_writer.reset( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + new WritableFileWriter(std::move(f), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - CompressionOptions(), false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions( + ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, options.compression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, level_), file_writer.get())); builder->Add(ik1.Encode().ToString(), v1); @@ -570,23 +570,20 @@ file_writer->Flush(); EXPECT_TRUE(s.ok()) << s.ToString(); - EXPECT_EQ( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - builder->FileSize()); + EXPECT_EQ(sink->contents().size(), builder->FileSize()); // Open the table - file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents(), - 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + test::StringSource* source = new test::StringSource( + sink->contents(), 0 /*uniq_id*/, ioptions.allow_mmap_reads); + std::unique_ptr file(source); + file_reader.reset(new RandomAccessFileReader(std::move(file), "test")); const bool 
kSkipFilters = true; const bool kImmortal = true; - ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + ASSERT_OK(ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, internal_comparator, !kSkipFilters, !kImmortal, level_), - std::move(file_reader), - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - &table_reader); + std::move(file_reader), sink->contents().size(), &table_reader)); // Search using Get() ReadOptions ro; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -20,10 +20,11 @@ #include #include + #include #include #include -#include "db/dbformat.h" + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" @@ -60,8 +61,11 @@ virtual bool IsBlockBased() = 0; // If is blockbased filter virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter - virtual void Add(const Slice& key) = 0; // Add a key to current filter - virtual size_t NumAdded() const = 0; // Number of keys added + virtual void Add( + const Slice& key_without_ts) = 0; // Add a key to current filter + virtual bool IsEmpty() const = 0; // Empty == none added + // For reporting stats on how many entries the builder considered unique + virtual size_t EstimateEntriesAdded() = 0; Slice Finish() { // Generate Filter const BlockHandle empty_handle; Status dont_care_status; @@ -69,7 +73,19 @@ assert(dont_care_status.ok()); return ret; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0; + // If filter_data is not nullptr, Finish() may transfer ownership of 
+ // underlying filter data to the caller, so that it can be freed as soon as + // possible. BlockBasedFilterBlock will ignore this parameter. + // + virtual Slice Finish( + const BlockHandle& tmp /* only used in PartitionedFilterBlock as + last_partition_block_handle */ + , + Status* status, std::unique_ptr* filter_data = nullptr) = 0; + + // It is for releasing the memory usage and cache reservation of filter bits + // builder in FullFilter and PartitionedFilter + virtual void ResetFilterBitsBuilder() {} }; // A FilterBlockReader is used to parse filter from SST table. @@ -108,11 +124,11 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, - get_context, lookup_context)) { + if (!KeyMayMatch(ukey_without_ts, prefix_extractor, block_offset, no_io, + &ikey, get_context, lookup_context)) { range->SkipKey(iter); } } @@ -133,13 +149,13 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (prefix_extractor->InDomain(ukey) && - !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey, get_context, - lookup_context)) { + if (prefix_extractor->InDomain(ukey_without_ts) && + !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), + prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -153,21 +169,24 @@ return error_msg; } - virtual void CacheDependencies(bool /*pin*/) 
{} + virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + return Status::OK(); + } virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, - const Slice& user_key, + const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, + bool no_io, BlockCacheLookupContext* lookup_context) { if (need_upper_bound_check) { return true; } *filter_checked = true; - Slice prefix = prefix_extractor->Transform(user_key); - return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + Slice prefix = prefix_extractor->Transform(user_key_without_ts); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io, const_ikey_ptr, /* get_context */ nullptr, lookup_context); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc 2025-05-19 16:14:27.000000000 +0000 @@ -30,7 +30,8 @@ table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); return s; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy.cc 2025-05-19 16:14:27.000000000 +0000 @@ 
-7,30 +7,239 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "rocksdb/filter_policy.h" + #include #include +#include +#include -#include "rocksdb/filter_policy.h" - +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "logging/logging.h" #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" -#include "table/block_based/full_filter_block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" #include "third-party/folly/folly/ConstexprMath.h" #include "util/bloom_impl.h" #include "util/coding.h" #include "util/hash.h" +#include "util/ribbon_config.h" +#include "util/ribbon_impl.h" namespace ROCKSDB_NAMESPACE { namespace { +// Metadata trailer size for built-in filters. (This is separate from +// block-based table block trailer.) +// +// Originally this was 1 byte for num_probes and 4 bytes for number of +// cache lines in the Bloom filter, but now the first trailer byte is +// usually an implementation marker and remaining 4 bytes have various +// meanings. +static constexpr uint32_t kMetadataLen = 5; + +Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { + // Missing metadata, treated as zero entries + return Slice(nullptr, 0); +} + +// Base class for filter builders using the XXH3 preview hash, +// also known as Hash64 or GetSliceHash64. 
+class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit XXPH3FilterBitsBuilder( + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr) + : aggregate_rounding_balance_(aggregate_rounding_balance), + cache_res_mgr_(cache_res_mgr) {} + + ~XXPH3FilterBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. + if (hash_entries_.empty() || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + if (cache_res_mgr_ && + // Traditional rounding to whole bucket size + ((hash_entries_.size() % kUint64tHashEntryCacheResBucketSize) == + kUint64tHashEntryCacheResBucketSize / 2)) { + hash_entry_cache_res_bucket_handles_.emplace_back(nullptr); + Status s = + cache_res_mgr_ + ->MakeCacheReservation( + kUint64tHashEntryCacheResBucketSize * sizeof(hash), + &hash_entry_cache_res_bucket_handles_.back()); + s.PermitUncheckedError(); + } + } + } + + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); + } + + protected: + static constexpr uint32_t kMetadataLen = 5; + + // Number of hash entries to accumulate before charging their memory usage to + // the cache when cache reservation is available + static const std::size_t kUint64tHashEntryCacheResBucketSize = + CacheReservationManager::GetDummyEntrySize() / sizeof(uint64_t); + + // For delegating between XXPH3FilterBitsBuilders + void SwapEntriesWith(XXPH3FilterBitsBuilder* other) { + std::swap(hash_entries_, other->hash_entries_); + if (cache_res_mgr_) { + std::swap(hash_entry_cache_res_bucket_handles_, + other->hash_entry_cache_res_bucket_handles_); + } + } + + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + + // To choose size using malloc_usable_size, we have to 
actually allocate. + size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr* buf) { + // Return value set to a default; overwritten in some cases + size_t rv = target_len_with_metadata; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + if (aggregate_rounding_balance_ != nullptr) { + // Do optimize_filters_for_memory, using malloc_usable_size. + // Approach: try to keep FP rate balance better than or on + // target (negative aggregate_rounding_balance_). We can then select a + // lower bound filter size (within reasonable limits) that gets us as + // close to on target as possible. We request allocation for that filter + // size and use malloc_usable_size to "round up" to the actual + // allocation size. + + // Although it can be considered bad practice to use malloc_usable_size + // to access an object beyond its original size, this approach should be + // quite general: working for all allocators that properly support + // malloc_usable_size. + + // Race condition on balance is OK because it can only cause temporary + // skew in rounding up vs. rounding down, as long as updates are atomic + // and relative. + int64_t balance = aggregate_rounding_balance_->load(); + + double target_fp_rate = + EstimatedFpRate(num_entries, target_len_with_metadata); + double rv_fp_rate = target_fp_rate; + + if (balance < 0) { + // See formula for BloomFilterPolicy::aggregate_rounding_balance_ + double for_balance_fp_rate = + -balance / double{0x100000000} + target_fp_rate; + + // To simplify, we just try a few modified smaller sizes. This also + // caps how much we vary filter size vs. target, to avoid outlier + // behavior from excessive variance. 
+ size_t target_len = target_len_with_metadata - kMetadataLen; + assert(target_len < target_len_with_metadata); // check underflow + for (uint64_t maybe_len_rough : + {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16, + uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) { + size_t maybe_len_with_metadata = + RoundDownUsableSpace(maybe_len_rough + kMetadataLen); + double maybe_fp_rate = + EstimatedFpRate(num_entries, maybe_len_with_metadata); + if (maybe_fp_rate <= for_balance_fp_rate) { + rv = maybe_len_with_metadata; + rv_fp_rate = maybe_fp_rate; + break; + } + } + } + + // Filter blocks are loaded into block cache with their block trailer. + // We need to make sure that's accounted for in choosing a + // fragmentation-friendly size. + const size_t kExtraPadding = BlockBasedTable::kBlockTrailerSize; + size_t requested = rv + kExtraPadding; + + // Allocate and get usable size + buf->reset(new char[requested]); + size_t usable = malloc_usable_size(buf->get()); + + if (usable - usable / 4 > requested) { + // Ratio greater than 4/3 is too much for utilizing, if it's + // not a buggy or mislinked malloc_usable_size implementation. + // Non-linearity of FP rates with bits/key means rapidly + // diminishing returns in overall accuracy for additional + // storage on disk. + // Nothing to do, except assert that the result is accurate about + // the usable size. (Assignment never used.) 
+ assert(((*buf)[usable - 1] = 'x')); + } else if (usable > requested) { + rv = RoundDownUsableSpace(usable - kExtraPadding); + assert(rv <= usable - kExtraPadding); + rv_fp_rate = EstimatedFpRate(num_entries, rv); + } else { + // Too small means bad malloc_usable_size + assert(usable == requested); + } + memset(buf->get(), 0, rv); + + // Update balance + int64_t diff = static_cast((rv_fp_rate - target_fp_rate) * + double{0x100000000}); + *aggregate_rounding_balance_ += diff; + } else { + buf->reset(new char[rv]()); + } +#else + (void)num_entries; + buf->reset(new char[rv]()); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return rv; + } + + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque hash_entries_; + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. + std::atomic* aggregate_rounding_balance_; + + // For reserving memory used in (new) Bloom and Ribbon Filter construction + std::shared_ptr cache_res_mgr_; + + // For managing cache reservation for buckets of hash entry in (new) Bloom and + // Ribbon Filter construction + std::deque>> + hash_entry_cache_res_bucket_handles_; + + // For managing cache reservation for final filter in (new) Bloom and Ribbon + // Filter construction + std::deque>> + final_filter_cache_res_handles_; +}; + +// #################### FastLocalBloom implementation ################## // +// ############## also known as format_version=5 Bloom filter ########## // + // See description in FastLocalBloomImpl -class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { +class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder { public: - explicit FastLocalBloomBitsBuilder(const int millibits_per_key) - : millibits_per_key_(millibits_per_key), - num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit 
FastLocalBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr), + millibits_per_key_(millibits_per_key) { assert(millibits_per_key >= 1000); } @@ -40,70 +249,126 @@ ~FastLocalBloomBitsBuilder() override {} - virtual void AddKey(const Slice& key) override { - uint64_t hash = GetSliceHash64(key); - if (hash_entries_.empty() || hash != hash_entries_.back()) { - hash_entries_.push_back(hash); + virtual Slice Finish(std::unique_ptr* buf) override { + size_t num_entries = hash_entries_.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr mutable_buf; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache reservation for mutable_buf + if (cache_res_mgr_) { + std::unique_ptr< + CacheReservationHandle> + final_filter_cache_res_handle; + Status s = + cache_res_mgr_ + ->MakeCacheReservation( + len_with_metadata * sizeof(char), + &final_filter_cache_res_handle); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + s.PermitUncheckedError(); } - } - virtual Slice Finish(std::unique_ptr* buf) override { - uint32_t len_with_metadata = - CalculateSpace(static_cast(hash_entries_.size())); - char* data = new char[len_with_metadata]; - memset(data, 0, len_with_metadata); + assert(mutable_buf); + assert(len_with_metadata >= kMetadataLen); + + // Max size supported by implementation + assert(len_with_metadata <= 0xffffffffU); - assert(data); - assert(len_with_metadata >= 5); + // Compute num_probes after any rounding / adjustments + int num_probes = GetNumProbes(num_entries, len_with_metadata); - uint32_t len = len_with_metadata - 5; + uint32_t len = static_cast(len_with_metadata - kMetadataLen); if (len > 0) { - AddAllEntries(data, len); + AddAllEntries(mutable_buf.get(), len, num_probes); } + assert(hash_entries_.empty()); + // 
Release cache for hash entries + hash_entry_cache_res_bucket_handles_.clear(); + // See BloomFilterPolicy::GetBloomBitsReader re: metadata // -1 = Marker for newer Bloom implementations - data[len] = static_cast(-1); + mutable_buf[len] = static_cast(-1); // 0 = Marker for this sub-implementation - data[len + 1] = static_cast(0); + mutable_buf[len + 1] = static_cast(0); // num_probes (and 0 in upper bits for 64-byte block size) - data[len + 2] = static_cast(num_probes_); + mutable_buf[len + 2] = static_cast(num_probes); // rest of metadata stays zero - const char* const_data = data; - buf->reset(const_data); - assert(hash_entries_.empty()); + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + return rv; + } + + size_t ApproximateNumEntries(size_t bytes) override { + size_t bytes_no_meta = + bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0; + return static_cast(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + size_t CalculateSpace(size_t num_entries) override { + // If not for cache line blocks in the filter, what would the target + // length in bytes be? + size_t raw_target_len = static_cast( + (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000); + + if (raw_target_len >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + raw_target_len = size_t{0xffffffc0}; + } - return Slice(data, len_with_metadata); + // Round up to nearest multiple of 64 (block size). This adjustment is + // used for target FP rate only so that we don't receive complaints about + // lower FP rate vs. historic Bloom filter behavior. + return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen; } - int CalculateNumEntry(const uint32_t bytes) override { - uint32_t bytes_no_meta = bytes >= 5u ? 
bytes - 5u : 0; - return static_cast(uint64_t{8000} * bytes_no_meta / - millibits_per_key_); + double EstimatedFpRate(size_t keys, size_t len_with_metadata) override { + int num_probes = GetNumProbes(keys, len_with_metadata); + return FastLocalBloomImpl::EstimatedFpRate( + keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64); } - uint32_t CalculateSpace(const int num_entry) override { - uint32_t num_cache_lines = 0; - if (millibits_per_key_ > 0 && num_entry > 0) { - num_cache_lines = static_cast( - (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + if (rv >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + rv = size_t{0xffffffc0}; } - return num_cache_lines * 64 + /*metadata*/ 5; - } - double EstimatedFpRate(size_t keys, size_t bytes) override { - return FastLocalBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, - num_probes_, /*hash bits*/ 64); + // round down to multiple of 64 (block size) + rv &= ~size_t{63}; + + return rv + kMetadataLen; } private: - void AddAllEntries(char* data, uint32_t len) { + // Compute num_probes after any rounding / adjustments + int GetNumProbes(size_t keys, size_t len_with_metadata) { + uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000; + int actual_millibits_per_key = + static_cast(millibits / std::max(keys, size_t{1})); + // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to + // minimize unit test churn. Remove this some time. 
+ if (!aggregate_rounding_balance_) { + actual_millibits_per_key = millibits_per_key_; + } + // END XXX/TODO + return FastLocalBloomImpl::ChooseNumProbes(actual_millibits_per_key); + } + + void AddAllEntries(char* data, uint32_t len, int num_probes) { // Simple version without prefetching: // // for (auto h : hash_entries_) { // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, - // num_probes_, data); + // num_probes, data); // } const size_t num_entries = hash_entries_.size(); @@ -129,7 +394,7 @@ uint32_t& hash_ref = hashes[i & kBufferMask]; uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; // Process (add) - FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_, + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes, data + byte_offset_ref); // And buffer uint64_t h = hash_entries_.front(); @@ -141,16 +406,13 @@ // Finish processing for (i = 0; i <= kBufferMask && i < num_entries; ++i) { - FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_, + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes, data + byte_offsets[i]); } } + // Target allocation per added key, in thousandths of a bit. int millibits_per_key_; - int num_probes_; - // A deque avoids unnecessary copying of already-saved values - // and has near-minimal peak memory use. - std::deque hash_entries_; }; // See description in FastLocalBloomImpl @@ -195,6 +457,405 @@ const uint32_t len_bytes_; }; +// ##################### Ribbon filter implementation ################### // + +// Implements concept RehasherTypesAndSettings in ribbon_impl.h +struct Standard128RibbonRehasherTypesAndSettings { + // These are schema-critical. Any change almost certainly changes + // underlying data. 
+ static constexpr bool kIsFilter = true; + static constexpr bool kHomogeneous = false; + static constexpr bool kFirstCoeffAlwaysOne = true; + static constexpr bool kUseSmash = false; + using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; + using Hash = uint64_t; + using Seed = uint32_t; + // Changing these doesn't necessarily change underlying data, + // but might affect supported scalability of those dimensions. + using Index = uint32_t; + using ResultRow = uint32_t; + // Save a conditional in Ribbon queries + static constexpr bool kAllowZeroStarts = false; +}; + +using Standard128RibbonTypesAndSettings = + ribbon::StandardRehasherAdapter; + +class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder { + public: + explicit Standard128RibbonBitsBuilder( + double desired_one_in_fp_rate, int bloom_millibits_per_key, + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, Logger* info_log) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr), + desired_one_in_fp_rate_(desired_one_in_fp_rate), + info_log_(info_log), + bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance, + cache_res_mgr) { + assert(desired_one_in_fp_rate >= 1.0); + } + + // No Copy allowed + Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete; + void operator=(const Standard128RibbonBitsBuilder&) = delete; + + ~Standard128RibbonBitsBuilder() override {} + + virtual Slice Finish(std::unique_ptr* buf) override { + if (hash_entries_.size() > kMaxRibbonEntries) { + ROCKS_LOG_WARN(info_log_, "Too many keys for Ribbon filter: %llu", + static_cast(hash_entries_.size())); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + if (hash_entries_.size() == 0) { + // Save a conditional in Ribbon queries by using alternate reader + // for zero entries added. 
+ return FinishAlwaysFalse(buf); + } + uint32_t num_entries = static_cast(hash_entries_.size()); + uint32_t num_slots; + size_t len_with_metadata; + + CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots); + + // Bloom fall-back indicator + if (num_slots == 0) { + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Lower32of64(hash_entries_.front()); + } + + BandingType banding; + std::size_t bytes_banding = ribbon::StandardBanding< + Standard128RibbonTypesAndSettings>::EstimateMemoryUsage(num_slots); + Status status_banding_cache_res = Status::OK(); + + // Cache reservation for banding + std::unique_ptr> + banding_res_handle; + if (cache_res_mgr_) { + status_banding_cache_res = + cache_res_mgr_ + ->MakeCacheReservation( + bytes_banding, &banding_res_handle); + } + + if (status_banding_cache_res.IsIncomplete()) { + ROCKS_LOG_WARN(info_log_, + "Cache reservation for Ribbon filter banding failed due " + "to cache full"); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + // Release cache for banding since the banding won't be allocated + banding_res_handle.reset(); + return bloom_fallback_.Finish(buf); + } + + bool success = banding.ResetAndFindSeedToSolve( + num_slots, hash_entries_.begin(), hash_entries_.end(), + /*starting seed*/ entropy & 255, /*seed mask*/ 255); + if (!success) { + ROCKS_LOG_WARN(info_log_, + "Too many re-seeds (256) for Ribbon filter, %llu / %llu", + static_cast(hash_entries_.size()), + static_cast(num_slots)); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + hash_entries_.clear(); + // Release cache for hash entries + hash_entry_cache_res_bucket_handles_.clear(); + + uint32_t seed = banding.GetOrdinalSeed(); + assert(seed < 256); + + std::unique_ptr mutable_buf; + len_with_metadata = + 
AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + // Cache reservation for mutable_buf + if (cache_res_mgr_) { + std::unique_ptr< + CacheReservationHandle> + final_filter_cache_res_handle; + Status s = + cache_res_mgr_ + ->MakeCacheReservation( + len_with_metadata * sizeof(char), + &final_filter_cache_res_handle); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + s.PermitUncheckedError(); + } + + SolnType soln(mutable_buf.get(), len_with_metadata); + soln.BackSubstFrom(banding); + uint32_t num_blocks = soln.GetNumBlocks(); + // This should be guaranteed: + // num_entries < 2^30 + // => (overhead_factor < 2.0) + // num_entries * overhead_factor == num_slots < 2^31 + // => (num_blocks = num_slots / 128) + // num_blocks < 2^24 + assert(num_blocks < 0x1000000U); + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -2 = Marker for Standard128 Ribbon + mutable_buf[len_with_metadata - 5] = static_cast(-2); + // Hash seed + mutable_buf[len_with_metadata - 4] = static_cast(seed); + // Number of blocks, in 24 bits + // (Along with bytes, we can derive other settings) + mutable_buf[len_with_metadata - 3] = static_cast(num_blocks & 255); + mutable_buf[len_with_metadata - 2] = + static_cast((num_blocks >> 8) & 255); + mutable_buf[len_with_metadata - 1] = + static_cast((num_blocks >> 16) & 255); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + return rv; + } + + // Setting num_slots to 0 means "fall back on Bloom filter." + // And note this implementation does not support num_entries or num_slots + // beyond uint32_t; see kMaxRibbonEntries. 
+ void CalculateSpaceAndSlots(size_t num_entries, + size_t* target_len_with_metadata, + uint32_t* num_slots) { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries); + return; + } + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Upper32of64(hash_entries_.front()); + } + + *num_slots = NumEntriesToNumSlots(static_cast(num_entries)); + *target_len_with_metadata = + SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_, + /*rounding*/ entropy) + + kMetadataLen; + + // Consider possible Bloom fallback for small filters + if (*num_slots < 1024) { + size_t bloom = bloom_fallback_.CalculateSpace(num_entries); + if (bloom < *target_len_with_metadata) { + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom; + return; + } + } + } + + size_t CalculateSpace(size_t num_entries) override { + if (num_entries == 0) { + // See FinishAlwaysFalse + return 0; + } + size_t target_len_with_metadata; + uint32_t num_slots; + CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots); + (void)num_slots; + return target_len_with_metadata; + } + + // This is a somewhat ugly but reasonably fast and reasonably accurate + // reversal of CalculateSpace. + size_t ApproximateNumEntries(size_t bytes) override { + size_t len_no_metadata = + RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) - + kMetadataLen; + + if (!(desired_one_in_fp_rate_ > 1.0)) { + // Effectively asking for 100% FP rate, or NaN etc. 
+ // Note that NaN is neither < 1.0 nor > 1.0 + return kMaxRibbonEntries; + } + + // Find a slight under-estimate for actual average bits per slot + double min_real_bits_per_slot; + if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits::max()) { + // Max of 32 solution columns (result bits) + min_real_bits_per_slot = 32.0; + } else { + // Account for mix of b and b+1 solution columns being slightly + // suboptimal vs. ideal log2(1/fp_rate) bits. + uint32_t rounded = static_cast(desired_one_in_fp_rate_); + int upper_bits_per_key = 1 + FloorLog2(rounded); + double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key); + double portion_lower = + (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) / + fp_rate_for_upper; + min_real_bits_per_slot = upper_bits_per_key - portion_lower; + assert(min_real_bits_per_slot > 0.0); + assert(min_real_bits_per_slot <= 32.0); + } + + // An overestimate, but this should only be O(1) slots away from truth. + double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot; + + // Let's not bother accounting for overflow to Bloom filter + // (Includes NaN case) + if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) { + return kMaxRibbonEntries; + } + + // Set up for short iteration + uint32_t slots = static_cast(max_slots); + slots = SolnType::RoundUpNumSlots(slots); + + // Assert that we have a valid upper bound on slots + assert(SolnType::GetBytesForOneInFpRate( + SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_, + /*rounding*/ 0) > len_no_metadata); + + // Iterate up to a few times to rather precisely account for small effects + for (int i = 0; slots > 0; ++i) { + size_t reqd_bytes = + SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_, + /*rounding*/ 0); + if (reqd_bytes <= len_no_metadata) { + break; // done + } + if (i >= 2) { + // should have been enough iterations + assert(false); + break; + } + slots = SolnType::RoundDownNumSlots(slots - 1); + } + + uint32_t num_entries = 
ConfigHelper::GetNumToAdd(slots); + + // Consider possible Bloom fallback for small filters + if (slots < 1024) { + size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes); + if (bloom > num_entries) { + return bloom; + } else { + return num_entries; + } + } else { + return std::min(num_entries, kMaxRibbonEntries); + } + } + + double EstimatedFpRate(size_t num_entries, + size_t len_with_metadata) override { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata); + } + uint32_t num_slots = + NumEntriesToNumSlots(static_cast(num_entries)); + SolnType fake_soln(nullptr, len_with_metadata); + fake_soln.ConfigureForNumSlots(num_slots); + return fake_soln.ExpectedFpRate(); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + // round down to multiple of 16 (segment size) + rv &= ~size_t{15}; + + return rv + kMetadataLen; + } + + private: + using TS = Standard128RibbonTypesAndSettings; + using SolnType = ribbon::SerializableInterleavedSolution; + using BandingType = ribbon::StandardBanding; + using ConfigHelper = ribbon::BandingConfigHelper1TS; + + static uint32_t NumEntriesToNumSlots(uint32_t num_entries) { + uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries); + return SolnType::RoundUpNumSlots(num_slots1); + } + + // Approximate num_entries to ensure number of bytes fits in 32 bits, which + // is not an inherent limitation but does ensure somewhat graceful Bloom + // fallback for crazy high number of entries, since the Bloom implementation + // does not support number of bytes bigger than fits in 32 bits. This is + // within an order of magnitude of implementation limit on num_slots + // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits + // (for filter metadata). 
+ static constexpr uint32_t kMaxRibbonEntries = 950000000; // ~ 1 billion + + // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + + // For warnings, or can be nullptr + Logger* info_log_; + + // For falling back on Bloom filter in some exceptional cases and + // very small filter cases + FastLocalBloomBitsBuilder bloom_fallback_; +}; + +// for the linker, at least with DEBUG_LEVEL=2 +constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries; + +class Standard128RibbonBitsReader : public FilterBitsReader { + public: + Standard128RibbonBitsReader(const char* data, size_t len_bytes, + uint32_t num_blocks, uint32_t seed) + : soln_(const_cast(data), len_bytes) { + soln_.ConfigureForNumBlocks(num_blocks); + hasher_.SetOrdinalSeed(seed); + } + + // No Copy allowed + Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete; + void operator=(const Standard128RibbonBitsReader&) = delete; + + ~Standard128RibbonBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + return soln_.FilterQuery(h, hasher_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + struct SavedData { + uint64_t seeded_hash; + uint32_t segment_num; + uint32_t num_columns; + uint32_t start_bits; + }; + std::array saved; + for (int i = 0; i < num_keys; ++i) { + ribbon::InterleavedPrepareQuery( + GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash, + &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = ribbon::InterleavedFilterQuery( + saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns, + saved[i].start_bits, hasher_, soln_); + } + } + + private: + using TS = Standard128RibbonTypesAndSettings; + ribbon::SerializableInterleavedSolution soln_; + ribbon::StandardHasher hasher_; +}; + +// ##################### Legacy Bloom implementation 
################### // + using LegacyBloomImpl = LegacyLocalityBloomImpl; class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { @@ -209,21 +870,25 @@ void AddKey(const Slice& key) override; - Slice Finish(std::unique_ptr* buf) override; + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); + } - int CalculateNumEntry(const uint32_t bytes) override; + Slice Finish(std::unique_ptr* buf) override; - uint32_t CalculateSpace(const int num_entry) override { + size_t CalculateSpace(size_t num_entries) override { uint32_t dont_care1; uint32_t dont_care2; - return CalculateSpace(num_entry, &dont_care1, &dont_care2); + return CalculateSpace(num_entries, &dont_care1, &dont_care2); } double EstimatedFpRate(size_t keys, size_t bytes) override { - return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen, num_probes_); } + size_t ApproximateNumEntries(size_t bytes) override; + private: int bits_per_key_; int num_probes_; @@ -234,11 +899,11 @@ uint32_t GetTotalBitsForLocality(uint32_t total_bits); // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, + char* ReserveSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Implementation-specific variant of public CalculateSpace - uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Assuming single threaded access to this function. 
@@ -306,7 +971,29 @@ buf->reset(const_data); hash_entries_.clear(); - return Slice(data, total_bits / 8 + 5); + return Slice(data, total_bits / 8 + kMetadataLen); +} + +size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + + uint64_t total_bits_tmp = bytes * 8; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000}); + + uint32_t high = static_cast(total_bits_tmp) / + static_cast(bits_per_key_) + + 1; + uint32_t low = 1; + uint32_t n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + return n; } uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { @@ -321,14 +1008,18 @@ return num_lines * (CACHE_LINE_SIZE * 8); } -uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, +uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines) { assert(bits_per_key_); - if (num_entry != 0) { - uint32_t total_bits_tmp = static_cast(num_entry * bits_per_key_); + if (num_entries != 0) { + size_t total_bits_tmp = num_entries * bits_per_key_; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000}); - *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *total_bits = + GetTotalBitsForLocality(static_cast(total_bits_tmp)); *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); assert(*total_bits > 0 && *total_bits % 8 == 0); } else { @@ -339,34 +1030,19 @@ // Reserve space for Filter uint32_t sz = *total_bits / 8; - sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + sz += kMetadataLen; // 4 bytes for num_lines, 1 byte for num_probes return sz; } -char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, +char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries, uint32_t* 
total_bits, uint32_t* num_lines) { - uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines); char* data = new char[sz]; memset(data, 0, sz); return data; } -int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { - assert(bits_per_key_); - assert(bytes > 0); - int high = static_cast(bytes * 8 / bits_per_key_ + 1); - int low = 1; - int n = high; - for (; n >= low; n--) { - if (CalculateSpace(n) <= bytes) { - break; - } - } - assert(n < high); // High should be an overestimation - return n; -} - inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits) { @@ -449,15 +1125,17 @@ kLegacyBloom, kDeprecatedBlock, kFastLocalBloom, + kStandard128Ribbon, }; const std::vector BloomFilterPolicy::kAllUserModes = { kDeprecatedBlock, - kAuto, + kAutoBloom, + kStandard128Ribbon, }; BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) - : mode_(mode), warned_(false) { + : mode_(mode), warned_(false), aggregate_rounding_balance_(0) { // Sanitize bits_per_key if (bits_per_key < 1.0) { bits_per_key = 1.0; @@ -470,6 +1148,15 @@ // point are interpreted accurately. millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); + // For now configure Ribbon filter to match Bloom FP rate and save + // memory. (Ribbon bits per key will be ~30% less than Bloom bits per key + // for same FP rate.) + desired_one_in_fp_rate_ = + 1.0 / BloomMath::CacheLocalFpRate( + bits_per_key, + FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_), + /*cache_line_bits*/ 512); + // For better or worse, this is a rounding up of a nudged rounding up, // e.g. 7.4999999999999 will round up to 8, but that provides more // predictability against small arithmetic errors in floating point. 
@@ -478,7 +1165,7 @@ BloomFilterPolicy::~BloomFilterPolicy() {} -const char* BloomFilterPolicy::Name() const { +const char* BuiltinFilterPolicy::Name() const { return "rocksdb.BuiltinBloomFilter"; } @@ -511,8 +1198,8 @@ } } -bool BloomFilterPolicy::KeyMayMatch(const Slice& key, - const Slice& bloom_filter) const { +bool BuiltinFilterPolicy::KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const { const size_t len = bloom_filter.size(); if (len < 2 || len > 0xffffffffU) { return false; @@ -534,7 +1221,7 @@ array); } -FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { +FilterBitsBuilder* BuiltinFilterPolicy::GetFilterBitsBuilder() const { // This code path should no longer be used, for the built-in // BloomFilterPolicy. Internal to RocksDB and outside // BloomFilterPolicy, only get a FilterBitsBuilder with @@ -549,11 +1236,20 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( const FilterBuildingContext& context) const { Mode cur = mode_; + bool offm = context.table_options.optimize_filters_for_memory; + bool reserve_filter_construction_mem = + (context.table_options.reserve_table_builder_memory && + context.table_options.block_cache); + std::shared_ptr cache_res_mgr; + if (reserve_filter_construction_mem) { + cache_res_mgr = std::make_shared( + context.table_options.block_cache); + } // Unusual code construction so that we can have just // one exhaustive switch without (risky) recursion for (int i = 0; i < 2; ++i) { switch (cur) { - case kAuto: + case kAutoBloom: if (context.table_options.format_version < 5) { cur = kLegacyBloom; } else { @@ -561,9 +1257,18 @@ } break; case kDeprecatedBlock: + if (context.info_log && !warned_.load(std::memory_order_relaxed)) { + warned_ = true; + ROCKS_LOG_WARN(context.info_log, + "Using deprecated block-based Bloom filter is " + "inefficient (%d bits per key).", + whole_bits_per_key_); + } return nullptr; case kFastLocalBloom: - return new FastLocalBloomBitsBuilder(millibits_per_key_); + 
return new FastLocalBloomBitsBuilder( + millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr, + cache_res_mgr); case kLegacyBloom: if (whole_bits_per_key_ >= 14 && context.info_log && !warned_.load(std::memory_order_relaxed)) { @@ -585,6 +1290,11 @@ } return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); + case kStandard128Ribbon: + return new Standard128RibbonBitsBuilder( + desired_one_in_fp_rate_, millibits_per_key_, + offm ? &aggregate_rounding_balance_ : nullptr, cache_res_mgr, + context.info_log); } } assert(false); @@ -602,10 +1312,10 @@ // Read metadata to determine what kind of FilterBitsReader is needed // and return a new one. -FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( +FilterBitsReader* BuiltinFilterPolicy::GetFilterBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - if (len_with_meta <= 5) { + if (len_with_meta <= kMetadataLen) { // filter is empty or broken. Treat like zero keys added. 
return new AlwaysFalseFilter(); } @@ -623,7 +1333,7 @@ // len_with_meta +-----------------------------------+ int8_t raw_num_probes = - static_cast(contents.data()[len_with_meta - 5]); + static_cast(contents.data()[len_with_meta - kMetadataLen]); // NB: *num_probes > 30 and < 128 probably have not been used, because of // BloomFilterPolicy::initialize, unless directly calling // LegacyBloomBitsBuilder as an API, but we are leaving those cases in @@ -632,13 +1342,20 @@ if (raw_num_probes < 1) { // Note: < 0 (or unsigned > 127) indicate special new implementations // (or reserved for future use) - if (raw_num_probes == -1) { - // Marker for newer Bloom implementations - return GetBloomBitsReader(contents); + switch (raw_num_probes) { + case 0: + // Treat as zero probes (always FP) + return new AlwaysTrueFilter(); + case -1: + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + case -2: + // Marker for Ribbon implementations + return GetRibbonBitsReader(contents); + default: + // Reserved (treat as zero probes, always FP, for now) + return new AlwaysTrueFilter(); } - // otherwise - // Treat as zero probes (always FP) for now. 
- return new AlwaysTrueFilter(); } // else attempt decode for LegacyBloomBitsReader @@ -646,7 +1363,7 @@ assert(num_probes >= 1); assert(num_probes <= 127); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); @@ -676,11 +1393,34 @@ log2_cache_line_size); } +FilterBitsReader* BuiltinFilterPolicy::GetRibbonBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + uint32_t seed = static_cast(contents.data()[len + 1]); + uint32_t num_blocks = static_cast(contents.data()[len + 2]); + num_blocks |= static_cast(contents.data()[len + 3]) << 8; + num_blocks |= static_cast(contents.data()[len + 4]) << 16; + if (num_blocks < 2) { + // Not supported + // num_blocks == 1 is not used because num_starts == 1 is problematic + // for the hashing scheme. num_blocks == 0 is unused because there's + // already a concise encoding of an "always false" filter. 
+ // Return something safe: + return new AlwaysTrueFilter(); + } + return new Standard128RibbonBitsReader(contents.data(), len, num_blocks, + seed); +} + // For newer Bloom filter implementations -FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( +FilterBitsReader* BuiltinFilterPolicy::GetBloomBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); // precondition @@ -742,7 +1482,7 @@ if (use_block_based_builder) { m = BloomFilterPolicy::kDeprecatedBlock; } else { - m = BloomFilterPolicy::kAuto; + m = BloomFilterPolicy::kAutoBloom; } assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), BloomFilterPolicy::kAllUserModes.end(), @@ -750,10 +1490,125 @@ return new BloomFilterPolicy(bits_per_key, m); } +// Chooses between two filter policies based on LSM level, but +// only for Level and Universal compaction styles. Flush is treated +// as level -1. Policy b is considered fallback / primary policy. 
+LevelThresholdFilterPolicy::LevelThresholdFilterPolicy( + std::unique_ptr&& a, + std::unique_ptr&& b, int starting_level_for_b) + : policy_a_(std::move(a)), + policy_b_(std::move(b)), + starting_level_for_b_(starting_level_for_b) { + // Don't use this wrapper class if you were going to set to -1 + assert(starting_level_for_b_ >= 0); +} + +// Deprecated block-based filter only +void LevelThresholdFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + policy_b_->CreateFilter(keys, n, dst); +} + +FilterBitsBuilder* LevelThresholdFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + switch (context.compaction_style) { + case kCompactionStyleLevel: + case kCompactionStyleUniversal: { + int levelish; + if (context.reason == TableFileCreationReason::kFlush) { + // Treat flush as level -1 + assert(context.level_at_creation == 0); + levelish = -1; + } else if (context.level_at_creation == -1) { + // Unknown level + // Policy b considered fallback / primary + return policy_b_->GetBuilderWithContext(context); + } else { + levelish = context.level_at_creation; + } + if (levelish >= starting_level_for_b_) { + return policy_b_->GetBuilderWithContext(context); + } else { + return policy_a_->GetBuilderWithContext(context); + } + } + case kCompactionStyleFIFO: + case kCompactionStyleNone: + break; + } + // Policy b considered fallback / primary + return policy_b_->GetBuilderWithContext(context); +} + +const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level) { + std::unique_ptr ribbon_only{new BloomFilterPolicy( + bloom_equivalent_bits_per_key, BloomFilterPolicy::kStandard128Ribbon)}; + if (bloom_before_level > -1) { + // Could also use Bloom policy + std::unique_ptr bloom_only{new BloomFilterPolicy( + bloom_equivalent_bits_per_key, BloomFilterPolicy::kFastLocalBloom)}; + return new LevelThresholdFilterPolicy( + std::move(bloom_only), std::move(ribbon_only), 
bloom_before_level); + } else { + return ribbon_only.release(); + } +} + FilterBuildingContext::FilterBuildingContext( const BlockBasedTableOptions& _table_options) : table_options(_table_options) {} FilterPolicy::~FilterPolicy() { } +Status FilterPolicy::CreateFromString( + const ConfigOptions& /*options*/, const std::string& value, + std::shared_ptr* policy) { + const std::string kBloomName = "bloomfilter:"; + const std::string kExpRibbonName = "experimental_ribbon:"; + const std::string kRibbonName = "ribbonfilter:"; + if (value == kNullptrString || value == "rocksdb.BuiltinBloomFilter") { + policy->reset(); +#ifndef ROCKSDB_LITE + } else if (value.compare(0, kBloomName.size(), kBloomName) == 0) { + size_t pos = value.find(':', kBloomName.size()); + if (pos == std::string::npos) { + return Status::InvalidArgument( + "Invalid filter policy config, missing bits_per_key"); + } else { + double bits_per_key = ParseDouble( + trim(value.substr(kBloomName.size(), pos - kBloomName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); + policy->reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + } + } else if (value.compare(0, kExpRibbonName.size(), kExpRibbonName) == 0) { + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kExpRibbonName.size()))); + policy->reset( + NewExperimentalRibbonFilterPolicy(bloom_equivalent_bits_per_key)); + } else if (value.compare(0, kRibbonName.size(), kRibbonName) == 0) { + size_t pos = value.find(':', kRibbonName.size()); + int bloom_before_level; + if (pos == std::string::npos) { + pos = value.size(); + bloom_before_level = 0; + } else { + bloom_before_level = ParseInt(trim(value.substr(pos + 1))); + } + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kRibbonName.size(), pos))); + policy->reset(NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, + bloom_before_level)); + } else { + return 
Status::NotFound("Invalid filter policy name ", value); +#else + } else { + return Status::NotSupported("Cannot load filter policy in LITE mode ", + value); +#endif // ROCKSDB_LITE + } + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/filter_policy_internal.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,20 +25,52 @@ class BuiltinFilterBitsBuilder : public FilterBitsBuilder { public: // Calculate number of bytes needed for a new filter, including - // metadata. Passing the result to CalculateNumEntry should - // return >= the num_entry passed in. - virtual uint32_t CalculateSpace(const int num_entry) = 0; + // metadata. Passing the result to ApproximateNumEntries should + // (ideally, usually) return >= the num_entry passed in. + // When optimize_filters_for_memory is enabled, this function + // is not authoritative but represents a target size that should + // be close to the average size. + virtual size_t CalculateSpace(size_t num_entries) = 0; // Returns an estimate of the FP rate of the returned filter if - // `keys` keys are added and the filter returned by Finish is `bytes` - // bytes. - virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0; + // `num_entries` keys are added and the filter returned by Finish + // is `bytes` bytes. + virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; -// RocksDB built-in filter policy for Bloom or Bloom-like filters. +// Abstract base class for RocksDB built-in filter policies. // This class is considered internal API and subject to change. -// See NewBloomFilterPolicy. 
-class BloomFilterPolicy : public FilterPolicy { +class BuiltinFilterPolicy : public FilterPolicy { + public: + // Shared name because any built-in policy can read filters from + // any other + const char* Name() const override; + + // Deprecated block-based filter only + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; + + // Old API + FilterBitsBuilder* GetFilterBitsBuilder() const override; + + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; + + private: + // For Bloom filter implementation(s) (except deprecated block-based filter) + FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; + + // For Ribbon filter implementation(s) + FilterBitsReader* GetRibbonBitsReader(const Slice& contents) const; +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters including +// Ribbon filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy and NewRibbonFilterPolicy. +class BloomFilterPolicy : public BuiltinFilterPolicy { public: // An internal marker for operating modes of BloomFilterPolicy, in terms // of selecting an implementation. This makes it easier for tests to track @@ -64,10 +96,12 @@ // FastLocalBloomImpl. // NOTE: TESTING ONLY as this mode does not check format_version kFastLocalBloom = 2, - // Automatically choose from the above (except kDeprecatedBlock) based on + // A Bloom alternative saving about 30% space for ~3-4x construction + // CPU time. See ribbon_alg.h and ribbon_impl.h. 
+ kStandard128Ribbon = 3, + // Automatically choose between kLegacyBloom and kFastLocalBloom based on // context at build time, including compatibility with format_version. - // NOTE: This is currently the only recommended mode that is user exposed. - kAuto = 100, + kAutoBloom = 100, }; // All the different underlying implementations that a BloomFilterPolicy // might use, as a mode that says "always use this implementation." @@ -83,16 +117,9 @@ ~BloomFilterPolicy() override; - const char* Name() const override; - // Deprecated block-based filter only void CreateFilter(const Slice* keys, int n, std::string* dst) const override; - // Deprecated block-based filter only - bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; - - FilterBitsBuilder* GetFilterBitsBuilder() const override; - // To use this function, call GetBuilderFromContext(). // // Neither the context nor any objects therein should be saved beyond @@ -105,18 +132,16 @@ // (An internal convenience function to save boilerplate.) static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); - // Read metadata to determine what kind of FilterBitsReader is needed - // and return a new one. This must successfully process any filter data - // generated by a built-in FilterBitsBuilder, regardless of the impl - // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. - FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; - // Essentially for testing only: configured millibits/key int GetMillibitsPerKey() const { return millibits_per_key_; } // Essentially for testing only: legacy whole bits/key int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + // Testing only + Mode GetMode() const { return mode_; } private: + // Bits per key settings are for configuring Bloom filters. + // Newer filters support fractional bits per key. 
For predictable behavior // of 0.001-precision values across floating point implementations, we // round to thousandths of a bit (on average) per key. @@ -127,6 +152,10 @@ // behavior with format_version < 5 just in case.) int whole_bits_per_key_; + // For configuring Ribbon filter: a desired value for 1/fp_rate. For + // example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + // Selected mode (a specific implementation or way of selecting an // implementation) for building new SST filters. Mode mode_; @@ -135,8 +164,42 @@ // only report once per BloomFilterPolicy instance, to keep the noise down.) mutable std::atomic warned_; - // For newer Bloom filter implementation(s) - FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; + // State for implementing optimize_filters_for_memory. Essentially, this + // tracks a surplus or deficit in total FP rate of filters generated by + // builders under this policy vs. what would have been generated without + // optimize_filters_for_memory. + // + // To avoid floating point weirdness, the actual value is + // Sum over all generated filters f: + // (predicted_fp_rate(f) - predicted_fp_rate(f|o_f_f_m=false)) * 2^32 + mutable std::atomic aggregate_rounding_balance_; +}; + +// Chooses between two filter policies based on LSM level, but +// only for Level and Universal compaction styles. Flush is treated +// as level -1. Policy b is considered fallback / primary policy. 
+class LevelThresholdFilterPolicy : public BuiltinFilterPolicy { + public: + LevelThresholdFilterPolicy(std::unique_ptr&& a, + std::unique_ptr&& b, + int starting_level_for_b); + + // Deprecated block-based filter only + void CreateFilter(const Slice* keys, int n, std::string* dst) const override; + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override; + + inline int TEST_GetStartingLevelForB() const { return starting_level_for_b_; } + + inline const FilterPolicy* TEST_GetPolicyA() const { return policy_a_.get(); } + + inline const FilterPolicy* TEST_GetPolicyB() const { return policy_b_.get(); } + + private: + const std::unique_ptr policy_a_; + const std::unique_ptr policy_b_; + int starting_level_for_b_; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,12 +4,18 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "rocksdb/flush_block_policy.h" + +#include +#include + #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/utilities/customizable_util.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy.h" #include "table/format.h" -#include namespace ROCKSDB_NAMESPACE { @@ -57,7 +63,7 @@ data_block_builder_.EstimateSizeAfterKV(key, value); if (align_) { - estimated_size_after += kBlockTrailerSize; + estimated_size_after += BlockBasedTable::kBlockTrailerSize; return estimated_size_after > block_size_; } @@ -85,4 +91,58 @@ return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); } +#ifndef ROCKSDB_LITE +static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + FlushBlockBySizePolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FlushBlockBySizePolicyFactory()); + return guard->get(); + }); + library.AddFactory( + FlushBlockEveryKeyPolicyFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FlushBlockEveryKeyPolicyFactory()); + return guard->get(); + }); + return 2; +} +#endif // ROCKSDB_LITE + +static bool LoadFlushPolicyFactory( + const std::string& id, std::shared_ptr* result) { + if (id.empty()) { + result->reset(new FlushBlockBySizePolicyFactory()); +#ifdef ROCKSDB_LITE + } else if (id == FlushBlockBySizePolicyFactory::kClassName()) { + result->reset(new FlushBlockBySizePolicyFactory()); + } else if (id == FlushBlockEveryKeyPolicyFactory::kClassName()) { + result->reset(new FlushBlockEveryKeyPolicyFactory()); +#endif // ROCKSDB_LITE + } else { + return false; + } + return true; +} + +FlushBlockBySizePolicyFactory::FlushBlockBySizePolicyFactory() + : FlushBlockPolicyFactory() {} + +Status 
FlushBlockPolicyFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* factory) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterFlushBlockPolicyFactories(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + return LoadSharedObject( + config_options, value, LoadFlushPolicyFactory, factory); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/flush_block_policy.h 2025-05-19 16:14:27.000000000 +0000 @@ -27,9 +27,8 @@ public: explicit FlushBlockEveryKeyPolicyFactory() {} - const char* Name() const override { - return "FlushBlockEveryKeyPolicyFactory"; - } + static const char* kClassName() { return "FlushBlockEveryKeyPolicyFactory"; } + const char* Name() const override { return kClassName(); } FlushBlockPolicy* NewFlushBlockPolicy( const BlockBasedTableOptions& /*table_options*/, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -22,42 +22,63 @@ whole_key_filtering_(whole_key_filtering), last_whole_key_recorded_(false), last_prefix_recorded_(false), - num_added_(0) { + last_key_in_domain_(false), + any_added_(false) { assert(filter_bits_builder != nullptr); filter_bits_builder_.reset(filter_bits_builder); } -void FullFilterBlockBuilder::Add(const Slice& key) { - 
const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); +size_t FullFilterBlockBuilder::EstimateEntriesAdded() { + return filter_bits_builder_->EstimateEntriesAdded(); +} + +void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { + const bool add_prefix = + prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts); + + if (!last_prefix_recorded_ && last_key_in_domain_) { + // We can reach here when a new filter partition starts in partitioned + // filter. The last prefix in the previous partition should be added if + // necessary regardless of key_without_ts, to support prefix SeekForPrev. + AddKey(last_prefix_str_); + last_prefix_recorded_ = true; + } + if (whole_key_filtering_) { if (!add_prefix) { - AddKey(key); + AddKey(key_without_ts); } else { // if both whole_key and prefix are added to bloom then we will have whole - // key and prefix addition being interleaved and thus cannot rely on the - // bits builder to properly detect the duplicates by comparing with the - // last item. + // key_without_ts and prefix addition being interleaved and thus cannot + // rely on the bits builder to properly detect the duplicates by comparing + // with the last item. 
Slice last_whole_key = Slice(last_whole_key_str_); - if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { - AddKey(key); + if (!last_whole_key_recorded_ || + last_whole_key.compare(key_without_ts) != 0) { + AddKey(key_without_ts); last_whole_key_recorded_ = true; - last_whole_key_str_.assign(key.data(), key.size()); + last_whole_key_str_.assign(key_without_ts.data(), + key_without_ts.size()); } } } if (add_prefix) { - AddPrefix(key); + last_key_in_domain_ = true; + AddPrefix(key_without_ts); + } else { + last_key_in_domain_ = false; } } // Add key to filter if needed inline void FullFilterBlockBuilder::AddKey(const Slice& key) { filter_bits_builder_->AddKey(key); - num_added_++; + any_added_ = true; } // Add prefix to filter if needed void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + assert(prefix_extractor_ && prefix_extractor_->InDomain(key)); Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole @@ -80,14 +101,17 @@ last_prefix_recorded_ = false; } -Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, - Status* status) { +Slice FullFilterBlockBuilder::Finish( + const BlockHandle& /*tmp*/, Status* status, + std::unique_ptr* filter_data) { Reset(); // In this impl we ignore BlockHandle *status = Status::OK(); - if (num_added_ != 0) { - num_added_ = 0; - return filter_bits_builder_->Finish(&filter_data_); + if (any_added_) { + any_added_ = false; + Slice filter_content = + filter_bits_builder_->Finish(filter_data ? 
filter_data : &filter_data_); + return filter_content; } return Slice(); } @@ -119,19 +143,20 @@ } std::unique_ptr FullFilterBlockReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context) { + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { assert(table); assert(table->get_rep()); assert(!pin || prefetch); CachableEntry filter_block; if (prefetch || !use_cache) { - const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), - use_cache, nullptr /* get_context */, - lookup_context, &filter_block); + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::unique_ptr(); } @@ -164,6 +189,7 @@ const Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -189,7 +215,6 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG - (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); @@ -206,7 +231,6 @@ uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG - (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); @@ -221,6 +245,7 @@ const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, lookup_context, &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return; } @@ -244,9 +269,9 @@ MultiGetRange filter_range(*range, range->begin(), range->end()); for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { if (!prefix_extractor) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor->InDomain(iter->ukey)) { - 
prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &iter->ukey_without_ts; + } else if (prefix_extractor->InDomain(iter->ukey_without_ts)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey_without_ts)); keys[num_keys++] = &prefixes.back(); } else { filter_range.SkipKey(iter); @@ -282,22 +307,23 @@ } bool FullFilterBlockReader::RangeMayExist( - const Slice* iterate_upper_bound, const Slice& user_key, + const Slice* iterate_upper_bound, const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) { - if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + bool need_upper_bound_check, bool no_io, + BlockCacheLookupContext* lookup_context) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) { *filter_checked = false; return true; } - Slice prefix = prefix_extractor->Transform(user_key); + Slice prefix = prefix_extractor->Transform(user_key_without_ts); if (need_upper_bound_check && !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { *filter_checked = false; return true; } else { *filter_checked = true; - return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io, const_ikey_ptr, /* get_context */ nullptr, lookup_context); } @@ -316,7 +342,8 @@ } Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix - if (!comparator->Equal(prefix, upper_bound_xform)) { + if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform, + false) != 0) { // second check if user_key's prefix is the immediate predecessor of // upper_bound and have the same length. 
If so, we know for sure all // keys in the range [user_key, upper_bound) share the same prefix. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,11 +7,12 @@ #include #include + #include #include #include -#include "db/dbformat.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" @@ -50,17 +51,25 @@ virtual bool IsBlockBased() override { return false; } virtual void StartBlock(uint64_t /*block_offset*/) override {} - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { return !any_added_; } + virtual size_t EstimateEntriesAdded() override; + virtual Slice Finish( + const BlockHandle& tmp, Status* status, + std::unique_ptr* filter_data = nullptr) override; using FilterBlockBuilder::Finish; + virtual void ResetFilterBitsBuilder() override { + filter_bits_builder_.reset(); + } + protected: virtual void AddKey(const Slice& key); std::unique_ptr filter_bits_builder_; virtual void Reset(); void AddPrefix(const Slice& key); const SliceTransform* prefix_extractor() { return prefix_extractor_; } + const std::string& last_prefix_str() const { return last_prefix_str_; } private: // important: all of these might point to invalid addresses @@ -72,10 +81,13 @@ std::string last_whole_key_str_; bool last_prefix_recorded_; std::string last_prefix_str_; - - uint32_t num_added_; + // Whether 
prefix_extractor_->InDomain(last_whole_key_) is true. + // Used in partitioned filters so that the last prefix from the previous + // filter partition will be added to the current partition if + // last_key_in_domain_ is true, regardless of the current key. + bool last_key_in_domain_; + bool any_added_; std::unique_ptr filter_data_; - }; // A FilterBlockReader is used to parse filter from SST table. @@ -87,9 +99,9 @@ CachableEntry&& filter_block); static std::unique_ptr Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context); + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } @@ -119,7 +131,7 @@ const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check, + bool need_upper_bound_check, bool no_io, BlockCacheLookupContext* lookup_context) override; private: diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,13 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/full_filter_block.h" + #include -#include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/status.h" #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/mock_block_based_table.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -224,8 +227,8 @@ return rv; } - int CalculateNumEntry(const uint32_t bytes) override { - return b_->CalculateNumEntry(bytes); + size_t ApproximateNumEntries(size_t bytes) override { + return b_->ApproximateNumEntries(bytes); } size_t CountUnique() { return uniq_.size(); } @@ -239,11 +242,9 @@ const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); ASSERT_EQ(0, bits_builder->CountUnique()); // adds key and empty prefix; both abstractions count them builder.Add("key1"); - ASSERT_EQ(2, builder.NumAdded()); ASSERT_EQ(2, bits_builder->CountUnique()); // Add different key (unique) and also empty prefix (not unique). 
// From here in this test, it's immaterial whether the block builder @@ -262,7 +263,6 @@ const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); builder.Add(""); // test with empty key too builder.Add("prefix1key1"); builder.Add("prefix1key1"); @@ -275,14 +275,19 @@ TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); builder.Add("bar"); builder.Add("box"); builder.Add("box"); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice = builder.Finish(); + // "box" only counts once + ASSERT_EQ(4, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new ParsedFullFilterBlock(table_options_.filter_policy.get(), diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,147 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/hash_index_reader.h" + +#include "table/block_fetcher.h" +#include "table/meta_blocks.h" + +namespace ROCKSDB_NAMESPACE { +Status HashIndexReader::Create(const BlockBasedTable* table, + const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. 
+ + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = + FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + assert(rep->internal_prefix_transform.get() != nullptr); + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log 
error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); +} + +InternalIteratorBase* HashIndexReader::NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const BlockBasedTable::Rep* rep = table()->get_rep(); + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + const bool total_order_seek = + read_options.total_order_seek || disable_prefix_seek; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, + total_order_seek, index_has_first_key(), index_key_includes_seq(), + index_value_is_full(), false /* block_contents_pinned */, + prefix_index_.get()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/hash_index_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const BlockBasedTable* t, CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unique_ptr prefix_index_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.cc 2025-01-30 
11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -20,9 +20,8 @@ #include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -// Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace ROCKSDB_NAMESPACE { -// using namespace rocksdb; + // Create a index builder based on its type. IndexBuilder* IndexBuilder::CreateIndexBuilder( BlockBasedTableOptions::IndexType index_type, @@ -37,7 +36,8 @@ comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, table_opt.index_shortening, /* include_first_key */ false); - } break; + break; + } case BlockBasedTableOptions::kHashSearch: { // Currently kHashSearch is incompatible with index_block_restart_interval // > 1 @@ -46,20 +46,24 @@ comparator, int_key_slice_transform, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, table_opt.index_shortening); - } break; + break; + } case BlockBasedTableOptions::kTwoLevelIndexSearch: { result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); - } break; + break; + } case BlockBasedTableOptions::kBinarySearchWithFirstKey: { result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, table_opt.index_shortening, /* include_first_key */ true); - } break; + break; + } default: { assert(!"Do not recognize the index type "); - } break; + break; + } } return result; } @@ -104,6 +108,15 @@ comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, table_opt_.index_shortening, /* include_first_key */ false); + + // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if + // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by + // default on Creation) so that flush policy 
can point to + // sub_index_builder_->index_block_builder_ + if (seperator_is_key_plus_seq_) { + sub_index_builder_->seperator_is_key_plus_seq_ = true; + } + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset @@ -129,9 +142,15 @@ } sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); - if (sub_index_builder_->seperator_is_key_plus_seq_) { - // then we need to apply it to all sub-index builders + if (!seperator_is_key_plus_seq_ && + sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders and reset + // flush_policy to point to Block Builder of sub_index_builder_ that store + // internal keys. seperator_is_key_plus_seq_ = true; + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + sub_index_builder_->index_block_builder_)); } sub_index_last_key_ = std::string(*last_key_in_current_block); entries_.push_back( @@ -161,9 +180,15 @@ sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); sub_index_last_key_ = std::string(*last_key_in_current_block); - if (sub_index_builder_->seperator_is_key_plus_seq_) { - // then we need to apply it to all sub-index builders + if (!seperator_is_key_plus_seq_ && + sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders and reset + // flush_policy to point to Block Builder of sub_index_builder_ that store + // internal keys. 
seperator_is_key_plus_seq_ = true; + flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( + table_opt_.metadata_block_size, table_opt_.block_size_deviation, + sub_index_builder_->index_block_builder_)); } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -307,12 +307,13 @@ if (pending_block_num_ != 0) { FlushPendingPrefix(); } - primary_index_builder_.Finish(index_blocks, last_partition_block_handle); + Status s = primary_index_builder_.Finish(index_blocks, + last_partition_block_handle); index_blocks->meta_blocks.insert( {kHashIndexPrefixesBlock.c_str(), prefix_block_}); index_blocks->meta_blocks.insert( {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); - return Status::OK(); + return s; } virtual size_t IndexSize() const override { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context, lookup_context, /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/index_reader_common.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,85 @@ +// Copyright (c) 2011-present, 
Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Encapsulates common functionality for the various index reader +// implementations. Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + 
assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ? index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry index_block_; +}; + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/mock_block_based_table.h 2025-05-19 16:14:27.000000000 +0000 @@ -23,7 +23,7 @@ public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; @@ -39,7 +39,7 @@ constexpr bool immortal_table = false; table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( ioptions_, env_options_, table_options_, icomp_, skip_filters, - kMockLevel, immortal_table))); + 12345 /*file_size*/, kMockLevel, immortal_table))); } FilterBitsBuilder* GetBuilder() const { @@ -47,7 +47,7 @@ context.column_family_name = "mock_cf"; context.compaction_style = ioptions_.compaction_style; context.level_at_creation = kMockLevel; - context.info_log = ioptions_.info_log; + context.info_log = ioptions_.logger; return 
BloomFilterPolicy::GetBuilderFromContext(context); } }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -32,6 +32,8 @@ bool own_bytes() const { return block_contents_.own_bytes(); } + const Slice GetBlockContentsData() const { return block_contents_.data; } + private: BlockContents block_contents_; std::unique_ptr filter_bits_reader_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,6 +7,8 @@ #include +#include "file/random_access_file_reader.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/malloc.h" #include "port/port.h" @@ -32,9 +34,30 @@ true /*use_delta_encoding*/, use_value_delta_encoding), p_index_builder_(p_index_builder), - keys_added_to_partition_(0) { - keys_per_partition_ = - filter_bits_builder_->CalculateNumEntry(partition_size); + keys_added_to_partition_(0), + total_added_in_built_(0) { + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(partition_size)); + if (keys_per_partition_ < 1) { + // partition_size (minus buffer, ~10%) might be smaller than minimum + // filter size, sometimes based on cache line size. Try to find that + // minimum size without CalculateSpace (not necessarily available). 
+ uint32_t larger = std::max(partition_size + 4, uint32_t{16}); + for (;;) { + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(larger)); + if (keys_per_partition_ >= 1) { + break; + } + larger += larger / 4; + if (larger > 100000) { + // might be a broken implementation. substitute something reasonable: + // 1 key / byte. + keys_per_partition_ = partition_size; + break; + } + } + } } PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} @@ -50,20 +73,24 @@ if (!p_index_builder_->ShouldCutFilterBlock()) { return; } - filter_gc.push_back(std::unique_ptr(nullptr)); - // Add the prefix of the next key before finishing the partition. This hack, - // fixes a bug with format_verison=3 where seeking for the prefix would lead - // us to the previous partition. - const bool add_prefix = + // Add the prefix of the next key before finishing the partition without + // updating last_prefix_str_. This hack, fixes a bug with format_verison=3 + // where seeking for the prefix would lead us to the previous partition. 
+ const bool maybe_add_prefix = next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); - if (add_prefix) { - FullFilterBlockBuilder::AddPrefix(*next_key); + if (maybe_add_prefix) { + const Slice next_key_prefix = prefix_extractor()->Transform(*next_key); + if (next_key_prefix.compare(last_prefix_str()) != 0) { + AddKey(next_key_prefix); + } } - Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); + total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded(); + std::unique_ptr filter_data; + Slice filter = filter_bits_builder_->Finish(&filter_data); std::string& index_key = p_index_builder_->GetPartitionKey(); - filters.push_back({index_key, filter}); + filters.push_back({index_key, filter, std::move(filter_data)}); keys_added_to_partition_ = 0; Reset(); } @@ -78,11 +105,15 @@ keys_added_to_partition_++; } +size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded(); +} + Slice PartitionedFilterBlockBuilder::Finish( - const BlockHandle& last_partition_block_handle, Status* status) { + const BlockHandle& last_partition_block_handle, Status* status, + std::unique_ptr* filter_data) { if (finishing_filters == true) { // Record the handle of the last written filter block in the index - FilterEntry& last_entry = filters.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); std::string handle_delta_encoding; @@ -91,14 +122,13 @@ last_partition_block_handle.size() - last_encoded_handle_.size()); last_encoded_handle_ = last_partition_block_handle; const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + index_on_filter_block_builder_.Add(last_filter_entry_key, handle_encoding, &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq()) { index_on_filter_block_builder_without_seq_.Add( - 
ExtractUserKey(last_entry.key), handle_encoding, + ExtractUserKey(last_filter_entry_key), handle_encoding, &handle_delta_encoding_slice); } - filters.pop_front(); } else { MaybeCutAFilterBlock(nullptr); } @@ -106,7 +136,10 @@ // partitions if (UNLIKELY(filters.empty())) { *status = Status::OK(); + last_filter_data.reset(); if (finishing_filters) { + // Simplest to just add them all at the end + total_added_in_built_ = 0; if (p_index_builder_->seperator_is_key_plus_seq()) { return index_on_filter_block_builder_.Finish(); } else { @@ -121,7 +154,15 @@ // indicate we expect more calls to Finish *status = Status::Incomplete(); finishing_filters = true; - return filters.front().filter; + + last_filter_entry_key = filters.front().key; + Slice filter = filters.front().filter; + last_filter_data = std::move(filters.front().filter_data); + if (filter_data != nullptr) { + *filter_data = std::move(last_filter_data); + } + filters.pop_front(); + return filter; } } @@ -130,19 +171,20 @@ : FilterBlockReaderCommon(t, std::move(filter_block)) {} std::unique_ptr PartitionedFilterBlockReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context) { + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context) { assert(table); assert(table->get_rep()); assert(!pin || prefetch); CachableEntry filter_block; if (prefetch || !use_cache) { - const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), - use_cache, nullptr /* get_context */, - lookup_context, &filter_block); + const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache, + nullptr /* get_context */, lookup_context, + &filter_block); if (!s.ok()) { + IGNORE_STATUS_IF_ERROR(s); return std::unique_ptr(); } @@ -170,13 +212,23 @@ &FullFilterBlockReader::KeyMayMatch); } +void 
PartitionedFilterBlockReader::KeysMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return; // Any/all may match + } + + MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context, + &FullFilterBlockReader::KeysMayMatch); +} + bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) { -#ifdef NDEBUG - (void)block_offset; -#endif assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); if (!table_prefix_extractor() && !prefix_extractor) { @@ -188,14 +240,28 @@ &FullFilterBlockReader::PrefixMayMatch); } +void PartitionedFilterBlockReader::PrefixesMayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { + assert(block_offset == kNotValid); + if (!table_prefix_extractor() && !prefix_extractor) { + return; // Any/all may match + } + + MayMatch(range, prefix_extractor, block_offset, no_io, lookup_context, + &FullFilterBlockReader::PrefixesMayMatch); +} + BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const CachableEntry& filter_block, const Slice& entry) const { IndexBlockIter iter; const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( - comparator, comparator->user_comparator(), &iter, kNullStats, - true /* total_order_seek */, false /* have_first_key */, + comparator->user_comparator(), + table()->get_rep()->get_global_seqno(BlockType::kFilter), &iter, + kNullStats, true /* total_order_seek */, false /* have_first_key */, index_key_includes_seq(), 
index_value_is_full()); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { @@ -239,7 +305,8 @@ table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); return s; } @@ -253,6 +320,7 @@ Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -270,6 +338,7 @@ no_io, get_context, lookup_context, &filter_partition_block); if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); return true; } @@ -280,6 +349,79 @@ lookup_context); } +void PartitionedFilterBlockReader::MayMatch( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const { + CachableEntry filter_block; + Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return; // Any/all may match + } + + auto start_iter_same_handle = range->begin(); + BlockHandle prev_filter_handle = BlockHandle::NullBlockHandle(); + + // For all keys mapping to same partition (must be adjacent in sorted order) + // share block cache lookup and use full filter multiget on the partition + // filter. 
+ for (auto iter = start_iter_same_handle; iter != range->end(); ++iter) { + // TODO: re-use one top-level index iterator + BlockHandle this_filter_handle = + GetFilterPartitionHandle(filter_block, iter->ikey); + if (!prev_filter_handle.IsNull() && + this_filter_handle != prev_filter_handle) { + MultiGetRange subrange(*range, start_iter_same_handle, iter); + MayMatchPartition(&subrange, prefix_extractor, block_offset, + prev_filter_handle, no_io, lookup_context, + filter_function); + range->AddSkipsFrom(subrange); + start_iter_same_handle = iter; + } + if (UNLIKELY(this_filter_handle.size() == 0)) { // key is out of range + // Not reachable with current behavior of GetFilterPartitionHandle + assert(false); + range->SkipKey(iter); + prev_filter_handle = BlockHandle::NullBlockHandle(); + } else { + prev_filter_handle = this_filter_handle; + } + } + if (!prev_filter_handle.IsNull()) { + MultiGetRange subrange(*range, start_iter_same_handle, range->end()); + MayMatchPartition(&subrange, prefix_extractor, block_offset, + prev_filter_handle, no_io, lookup_context, + filter_function); + range->AddSkipsFrom(subrange); + } +} + +void PartitionedFilterBlockReader::MayMatchPartition( + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, BlockHandle filter_handle, bool no_io, + BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const { + CachableEntry filter_partition_block; + Status s = GetFilterPartitionBlock( + nullptr /* prefetch_buffer */, filter_handle, no_io, + range->begin()->get_context, lookup_context, &filter_partition_block); + if (UNLIKELY(!s.ok())) { + IGNORE_STATUS_IF_ERROR(s); + return; // Any/all may match + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + (filter_partition.*filter_function)(range, prefix_extractor, block_offset, + no_io, lookup_context); +} + size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { size_t usage = 
ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE @@ -292,7 +434,8 @@ } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies(bool pin) { +Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, + bool pin) { assert(table()); const BlockBasedTable::Rep* const rep = table()->get_rep(); @@ -305,11 +448,11 @@ Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, &lookup_context, &filter_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error retrieving top-level filter block while trying to " - "cache filter partitions: %s", - s.ToString().c_str()); - return; + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return s; } // Before read partitions, prefetch them to avoid lots of IOs @@ -319,9 +462,10 @@ const InternalKeyComparator* const comparator = internal_comparator(); Statistics* kNullStats = nullptr; filter_block.GetValue()->NewIndexIterator( - comparator, comparator->user_comparator(), &biter, kNullStats, - true /* total_order_seek */, false /* have_first_key */, - index_key_includes_seq(), index_value_is_full()); + comparator->user_comparator(), rep->get_global_seqno(BlockType::kFilter), + &biter, kNullStats, true /* total_order_seek */, + false /* have_first_key */, index_key_includes_seq(), + index_value_is_full()); // Index partitions are assumed to be consecuitive. Prefetch them all. 
// Read the first block offset biter.SeekToFirst(); @@ -331,16 +475,24 @@ // Read the last block's offset biter.SeekToLast(); handle = biter.value().handle; - uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t last_off = + handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /* Implicit autoreadahead */); - prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, - static_cast(prefetch_len)); + IOOptions opts; + s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + } + if (!s.ok()) { + return s; + } // After prefetch, read the partitions one by one - ReadOptions read_options; for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; @@ -348,12 +500,15 @@ // TODO: Support counter batch update for partitioned index and // filter blocks s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), read_options, handle, - UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + /* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter, nullptr /* get_context */, &lookup_context, nullptr /* contents */); - + if (!s.ok()) { + return s; + } assert(s.ok() || block.GetValue() == nullptr); - if (s.ok() && block.GetValue() != nullptr) { + + if (block.GetValue() != nullptr) { if (block.IsCached()) { if (pin) { filter_map_[handle.offset()] = std::move(block); @@ -361,6 +516,7 @@ } } } + return biter.status(); } const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,20 +5,22 @@ #pragma once +#include #include #include #include -#include "db/dbformat.h" -#include "index_builder.h" + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/block_based/block.h" #include "table/block_based/filter_block_reader_common.h" #include "table/block_based/full_filter_block.h" +#include "table/block_based/index_builder.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +class InternalKeyComparator; class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { public: @@ -33,9 +35,11 @@ void AddKey(const Slice& key) override; void Add(const Slice& key) override; + size_t EstimateEntriesAdded() override; - virtual Slice Finish(const BlockHandle& last_partition_block_handle, - Status* status) override; + virtual Slice Finish( + const BlockHandle& last_partition_block_handle, Status* status, + std::unique_ptr* filter_data = nullptr) override; private: // Filter data @@ -45,10 +49,13 @@ struct FilterEntry { std::string key; Slice filter; + std::unique_ptr filter_data; }; - std::list filters; // list of partitioned indexes and their keys + std::deque filters; // list of partitioned filters and keys used + // in building the index + std::string last_filter_entry_key; + std::unique_ptr last_filter_data; std::unique_ptr value; - std::vector> filter_gc; bool finishing_filters = false; // true if Finish is called once but not complete yet. 
// The policy of when cut a filter block and Finish it @@ -62,6 +69,9 @@ uint32_t keys_per_partition_; // The number of keys added to the last partition so far uint32_t keys_added_to_partition_; + // According to the bits builders, how many keys/prefixes added + // in all the filters we have fully built + uint64_t total_added_in_built_; BlockHandle last_encoded_handle_; }; @@ -71,21 +81,30 @@ CachableEntry&& filter_block); static std::unique_ptr Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context); + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); bool IsBlockBased() override { return false; } bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; size_t ApproximateMemoryUsage() const override; @@ -108,13 +127,28 @@ GetContext* get_context, BlockCacheLookupContext* lookup_context, FilterFunction filter_function) const; - void CacheDependencies(bool pin) override; + using FilterManyFunction = void (FullFilterBlockReader::*)( + MultiGetRange* range, const SliceTransform* 
prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context); + void MayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, + BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const; + void MayMatchPartition(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, BlockHandle filter_handle, + bool no_io, BlockCacheLookupContext* lookup_context, + FilterManyFunction filter_function) const; + Status CacheDependencies(const ReadOptions& ro, bool pin) override; const InternalKeyComparator* internal_comparator() const; bool index_key_includes_seq() const; bool index_value_is_full() const; protected: + // For partition blocks pinned in cache. Can be a subset of blocks + // in case some fail insertion on attempt to pin. std::unordered_map> filter_map_; }; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,16 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "table/block_based/partitioned_filter_block.h" + #include +#include "index_builder.h" #include "rocksdb/filter_policy.h" - #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/partitioned_filter_block.h" #include "table/block_based/filter_policy_internal.h" - -#include "index_builder.h" -#include "logging/logging.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -59,7 +58,7 @@ virtual public ::testing::WithParamInterface { public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; @@ -137,22 +136,24 @@ BlockHandle bh; Status status; Slice slice; + std::unique_ptr filter_data; do { - slice = builder->Finish(bh, &status); + slice = builder->Finish(bh, &status, &filter_data); bh = Write(slice); } while (status.IsIncomplete()); constexpr bool skip_filters = false; + constexpr uint64_t file_size = 12345; constexpr int level = 0; constexpr bool immortal_table = false; table_.reset(new MockedBlockBasedTable( new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, - icomp_, skip_filters, level, immortal_table), + icomp_, skip_filters, file_size, level, + immortal_table), pib)); BlockContents contents(slice); CachableEntry block( - new Block(std::move(contents), kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, nullptr), + new Block(std::move(contents), 0 /* read_amp_bytes_per_bit */, nullptr), nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); auto reader = new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); @@ -291,10 +292,11 @@ } }; -INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, - testing::Values(test::kDefaultFormatVersion)); -INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, - testing::Values(test::kLatestFormatVersion)); +// Format versions 
potentially intersting to partitioning +INSTANTIATE_TEST_CASE_P(FormatVersions, PartitionedFilterBlockTest, + testing::ValuesIn(std::set{ + 2, 3, 4, test::kDefaultFormatVersion, + kLatestFormatVersion})); TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { std::unique_ptr pib(NewIndexBuilder()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +void PartitionedIndexIterator::Seek(const Slice& target) { SeekImpl(&target); } + +void PartitionedIndexIterator::SeekToFirst() { SeekImpl(nullptr); } + +void PartitionedIndexIterator::SeekImpl(const Slice* target) { + SavePrevIndexValue(); + + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + if (!index_iter_->Valid()) { + ResetPartitionedIndexIter(); + return; + } + + InitPartitionedIndexBlock(); + + if (target) { + block_iter_.Seek(*target); + } else { + block_iter_.SeekToFirst(); + } + FindKeyForward(); + + // We could check upper bound here, but that would be too complicated + // and checking index upper bound is less useful than for data blocks. + + if (target) { + assert(!Valid() || (table_->get_rep()->index_key_includes_seq + ? (icomp_.Compare(*target, key()) <= 0) + : (user_comparator_.Compare(ExtractUserKey(*target), + key()) <= 0))); + } +} + +void PartitionedIndexIterator::SeekToLast() { + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetPartitionedIndexIter(); + return; + } + InitPartitionedIndexBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); +} + +void PartitionedIndexIterator::Next() { + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); +} + +void PartitionedIndexIterator::Prev() { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + + FindKeyBackward(); +} + +void PartitionedIndexIterator::InitPartitionedIndexBlock() { + BlockHandle partitioned_index_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + partitioned_index_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetPartitionedIndexIter(); + } + auto* rep = table_->get_rep(); + bool 
is_for_compaction = + lookup_context_.caller == TableReaderCaller::kCompaction; + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + block_prefetcher_.PrefetchIfNeeded(rep, partitioned_index_handle, + read_options_.readahead_size, + is_for_compaction); + Status s; + table_->NewDataBlockIterator( + read_options_, partitioned_index_handle, &block_iter_, + BlockType::kIndex, + /*get_context=*/nullptr, &lookup_context_, s, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction); + block_iter_points_to_real_block_ = true; + // We could check upper bound here but it is complicated to reason about + // upper bound in index iterator. On the other than, in large scans, index + // iterators are moved much less frequently compared to data blocks. So + // the upper bound check is skipped for simplicity. + } +} + +void PartitionedIndexIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +void PartitionedIndexIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". 
+ do { + if (!block_iter_.status().ok()) { + return; + } + ResetPartitionedIndexIter(); + index_iter_->Next(); + + if (!index_iter_->Valid()) { + return; + } + + InitPartitionedIndexBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +void PartitionedIndexIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetPartitionedIndexIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitPartitionedIndexBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,159 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/block_based_table_reader.h" + +#include "table/block_based/block_based_table_reader_impl.h" +#include "table/block_based/block_prefetcher.h" +#include "table/block_based/reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Iterator that iterates over partitioned index. 
+// Some upper and lower bound tricks played in block based table iterators +// could be played here, but it's too complicated to reason about index +// keys with upper or lower bound, so we skip it for simplicity. +class PartitionedIndexIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true + public: + PartitionedIndexIterator( + const BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator& icomp, + std::unique_ptr>&& index_iter, + TableReaderCaller caller, size_t compaction_readahead_size = 0) + : index_iter_(std::move(index_iter)), + table_(table), + read_options_(read_options), +#ifndef NDEBUG + icomp_(icomp), +#endif + user_comparator_(icomp.user_comparator()), + block_iter_points_to_real_block_(false), + lookup_context_(caller), + block_prefetcher_(compaction_readahead_size) { + } + + ~PartitionedIndexIterator() override {} + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice&) override { + // Shouldn't be called. 
+ assert(false); + } + void SeekToFirst() override; + void SeekToLast() override; + void Next() final override; + bool NextAndGetResult(IterateResult*) override { + assert(false); + return false; + } + void Prev() override; + bool Valid() const override { + return block_iter_points_to_real_block_ && block_iter_.Valid(); + } + Slice key() const override { + assert(Valid()); + return block_iter_.key(); + } + Slice user_key() const override { + assert(Valid()); + return block_iter_.user_key(); + } + IndexValue value() const override { + assert(Valid()); + return block_iter_.value(); + } + Status status() const override { + // Prefix index set status to NotFound when the prefix does not exist + if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + inline IterBoundCheck UpperBoundCheckResult() override { + // Shouldn't be called. + assert(false); + return IterBoundCheck::kUnknown; + } + void SetPinnedItersMgr(PinnedIteratorsManager*) override { + // Shouldn't be called. + assert(false); + } + bool IsKeyPinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + bool IsValuePinned() const override { + // Shouldn't be called. + assert(false); + return false; + } + + void ResetPartitionedIndexIter() { + if (block_iter_points_to_real_block_) { + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. 
+ prev_block_offset_ = index_iter_->value().handle.offset(); + } + } + + void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (block_prefetcher_.prefetch_buffer() != nullptr && + read_options_.adaptive_readahead) { + block_prefetcher_.prefetch_buffer()->GetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override { + if (read_options_.adaptive_readahead) { + block_prefetcher_.SetReadaheadState( + &(readahead_file_info->index_block_readahead_info)); + } + } + + std::unique_ptr> index_iter_; + + private: + friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test; + const BlockBasedTable* table_; + const ReadOptions read_options_; +#ifndef NDEBUG + const InternalKeyComparator& icomp_; +#endif + UserComparatorWrapper user_comparator_; + IndexBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. + bool block_iter_points_to_real_block_; + uint64_t prev_block_offset_ = std::numeric_limits::max(); + BlockCacheLookupContext lookup_context_; + BlockPrefetcher block_prefetcher_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitPartitionedIndexBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,207 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/partitioned_index_reader.h" + +#include "file/random_access_file_reader.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_index_iterator.h" + +namespace ROCKSDB_NAMESPACE { +Status PartitionIndexReader::Create( + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ro, use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset(new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); +} + +InternalIteratorBase* PartitionIndexReader::NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + const 
BlockBasedTable::Rep* rep = table()->rep_; + InternalIteratorBase* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), + index_value_is_full())); + } else { + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + ro.deadline = read_options.deadline; + ro.io_timeout = read_options.io_timeout; + ro.adaptive_readahead = read_options.adaptive_readahead; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + std::unique_ptr> index_iter( + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), + index_value_is_full())); + + it = new PartitionedIndexIterator( + table(), ro, *internal_comparator(), std::move(index_iter), + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); + } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currentlly it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. 
+} +Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, + bool pin) { + // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + const BlockBasedTable::Rep* rep = table()->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + + CachableEntry index_block; + { + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &index_block); + if (!s.ok()) { + return s; + } + } + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + index_block.GetValue()->NewIndexIterator( + internal_comparator()->user_comparator(), + rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true, + index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return biter.status(); + } + handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index. 
+ return biter.status(); + } + handle = biter.value().handle; + uint64_t last_off = + handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /*Implicit auto readahead*/); + IOOptions opts; + { + Status s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + } + if (!s.ok()) { + return s; + } + } + + // For saving "all or nothing" to partition_map_ + std::unordered_map> map_in_progress; + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + size_t partition_count = 0; + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry block; + ++partition_count; + // TODO: Support counter batch update for partitioned index and + // filter blocks + Status s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + /*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex, + /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr); + + if (!s.ok()) { + return s; + } + if (block.GetValue() != nullptr) { + // Might need to "pin" some mmap-read blocks (GetOwnValue) if some + // partitions are successfully compressed (cached) and some are not + // compressed (mmap eligible) + if (block.IsCached() || block.GetOwnValue()) { + if (pin) { + map_in_progress[handle.offset()] = std::move(block); + } + } + } + } + Status s = biter.status(); + // Save (pin) them only if everything checks out + if (map_in_progress.size() == partition_count && s.ok()) { + std::swap(partition_map_, map_in_progress); + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/partitioned_index_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "table/block_based/index_reader_common.h" + +namespace ROCKSDB_NAMESPACE { +// Index that allows binary search lookup in a two-level index structure. +class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + Status CacheDependencies(const ReadOptions& ro, bool pin) override; + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + // For partition blocks pinned in cache. This is expected to be "all or + // none" so that !partition_map_.empty() can use an iterator expecting + // all partitions to be saved here. + std::unordered_map> partition_map_; +}; +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/reader_common.h" + +#include "monitoring/perf_context_imp.h" +#include "rocksdb/table.h" +#include "table/format.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle, true /* force_erase */); +} + +// WART: this is specific to block-based table +Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, const std::string& file_name, + uint64_t offset) { + PERF_TIMER_GUARD(block_checksum_time); + // After block_size bytes is compression type (1 byte), which is part of + // the checksummed section. + size_t len = block_size + 1; + // And then the stored checksum value (4 bytes). 
+ uint32_t stored = DecodeFixed32(data + len); + + uint32_t computed = ComputeBuiltinChecksum(type, data, len); + if (stored == computed) { + return Status::OK(); + } else { + // Unmask for people who might look for reference crc value + if (type == kCRC32c) { + stored = crc32c::Unmask(stored); + computed = crc32c::Unmask(computed); + } + return Status::Corruption( + "block checksum mismatch: stored = " + ToString(stored) + + ", computed = " + ToString(computed) + ", type = " + ToString(type) + + " in " + file_name + " offset " + ToString(offset) + " size " + + ToString(block_size)); + } +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/reader_common.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/reader_common.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "rocksdb/cache.h" +#include "rocksdb/table.h" + +namespace ROCKSDB_NAMESPACE { +// Release the cached entry and decrement its ref count. +extern void ForceReleaseCachedEntry(void* arg, void* h); + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? 
table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Assumes block has a trailer as in format.h. file_name and offset provided +// for generating a diagnostic message in returned status. +extern Status VerifyBlockChecksum(ChecksumType type, const char* data, + size_t block_size, + const std::string& file_name, + uint64_t offset); +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ // #include "table/block_based/uncompression_dict_reader.h" + +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "table/block_based/block_based_table_reader.h" #include "util/compression.h" @@ -12,9 +14,9 @@ namespace ROCKSDB_NAMESPACE { Status UncompressionDictReader::Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, std::unique_ptr* uncompression_dict_reader) { assert(table); assert(table->get_rep()); @@ -24,8 +26,8 @@ CachableEntry uncompression_dict; if (prefetch || !use_cache) { const Status s = ReadUncompressionDictionary( - table, prefetch_buffer, ReadOptions(), use_cache, - nullptr 
/* get_context */, lookup_context, &uncompression_dict); + table, prefetch_buffer, ro, use_cache, nullptr /* get_context */, + lookup_context, &uncompression_dict); if (!s.ok()) { return s; } @@ -60,11 +62,11 @@ prefetch_buffer, read_options, rep->compression_dict_handle, UncompressionDict::GetEmptyDict(), uncompression_dict, BlockType::kCompressionDictionary, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, /* wait_for_cache */ true); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep->ioptions.logger, "Encountered error while reading data from compression dictionary " "block %s", s.ToString().c_str()); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -25,9 +25,9 @@ class UncompressionDictReader { public: static Status Create( - const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, - bool use_cache, bool prefetch, bool pin, - BlockCacheLookupContext* lookup_context, + const BlockBasedTable* table, const ReadOptions& ro, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, std::unique_ptr* uncompression_dict_reader); Status GetOrReadUncompressionDictionary( diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,54 +15,32 @@ #include 
"logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/compression_type.h" #include "rocksdb/env.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_type.h" +#include "table/block_based/reader_common.h" #include "table/format.h" #include "table/persistent_cache_helper.h" -#include "util/coding.h" #include "util/compression.h" -#include "util/crc32c.h" #include "util/stop_watch.h" -#include "util/string_util.h" -#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { -inline void BlockFetcher::CheckBlockChecksum() { - // Check the crc of the type and the block contents - if (read_options_.verify_checksums) { - const char* data = slice_.data(); // Pointer to where Read put the data - PERF_TIMER_GUARD(block_checksum_time); - uint32_t value = DecodeFixed32(data + block_size_ + 1); - uint32_t actual = 0; - switch (footer_.checksum()) { - case kNoChecksum: - break; - case kCRC32c: - value = crc32c::Unmask(value); - actual = crc32c::Value(data, block_size_ + 1); - break; - case kxxHash: - actual = XXH32(data, static_cast(block_size_) + 1, 0); - break; - case kxxHash64: - actual = static_cast( - XXH64(data, static_cast(block_size_) + 1, 0) & - uint64_t{0xffffffff}); - break; - default: - status_ = Status::Corruption( - "unknown checksum type " + ToString(footer_.checksum()) + " in " + - file_->file_name() + " offset " + ToString(handle_.offset()) + - " size " + ToString(block_size_)); - } - if (status_.ok() && actual != value) { - status_ = Status::Corruption( - "block checksum mismatch: expected " + ToString(actual) + ", got " + - ToString(value) + " in " + file_->file_name() + " offset " + - ToString(handle_.offset()) + " size " + ToString(block_size_)); +inline void BlockFetcher::ProcessTrailerIfPresent() { + if (footer_.GetBlockTrailerSize() > 0) { + assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize); + if 
(read_options_.verify_checksums) { + io_status_ = status_to_io_status(VerifyBlockChecksum( + footer_.checksum_type(), slice_.data(), block_size_, + file_->file_name(), handle_.offset())); } + compression_type_ = + BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_); + } else { + // E.g. plain table or cuckoo table + compression_type_ = kNoCompression; } } @@ -76,9 +54,9 @@ return true; } else { // uncompressed page is not found - if (ioptions_.info_log && !status.IsNotFound()) { + if (ioptions_.logger && !status.IsNotFound()) { assert(!status.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. %s", status.ToString().c_str()); } @@ -88,18 +66,23 @@ } inline bool BlockFetcher::TryGetFromPrefetchBuffer() { - if (prefetch_buffer_ != nullptr && - prefetch_buffer_->TryReadFromCache( - handle_.offset(), - static_cast(handle_.size()) + kBlockTrailerSize, &slice_, - for_compaction_)) { - block_size_ = static_cast(handle_.size()); - CheckBlockChecksum(); - if (!status_.ok()) { + if (prefetch_buffer_ != nullptr) { + IOOptions opts; + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + if (io_s.ok() && + prefetch_buffer_->TryReadFromCache(opts, file_, handle_.offset(), + block_size_with_trailer_, &slice_, + &io_s, for_compaction_)) { + ProcessTrailerIfPresent(); + if (!io_status_.ok()) { + return true; + } + got_from_prefetch_buffer_ = true; + used_buf_ = const_cast(slice_.data()); + } else if (!io_s.ok()) { + io_status_ = io_s; return true; } - got_from_prefetch_buffer_ = true; - used_buf_ = const_cast(slice_.data()); } return got_from_prefetch_buffer_; } @@ -109,18 +92,19 @@ cache_options_.persistent_cache->IsCompressed()) { // lookup uncompressed cache mode p-cache std::unique_ptr raw_data; - status_ = PersistentCacheHelper::LookupRawPage( - cache_options_, handle_, &raw_data, block_size_ + kBlockTrailerSize); - if (status_.ok()) { + io_status_ = 
status_to_io_status(PersistentCacheHelper::LookupRawPage( + cache_options_, handle_, &raw_data, block_size_with_trailer_)); + if (io_status_.ok()) { heap_buf_ = CacheAllocationPtr(raw_data.release()); used_buf_ = heap_buf_.get(); slice_ = Slice(heap_buf_.get(), block_size_); + ProcessTrailerIfPresent(); return true; - } else if (!status_.IsNotFound() && ioptions_.info_log) { - assert(!status_.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + } else if (!io_status_.IsNotFound() && ioptions_.logger) { + assert(!io_status_.ok()); + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. %s", - status_.ToString().c_str()); + io_status_.ToString().c_str()); } } return false; @@ -128,35 +112,53 @@ inline void BlockFetcher::PrepareBufferForBlockFromFile() { // cache miss read from device - if (do_uncompress_ && - block_size_ + kBlockTrailerSize < kDefaultStackBufferSize) { + if ((do_uncompress_ || ioptions_.allow_mmap_reads) && + block_size_with_trailer_ < kDefaultStackBufferSize) { // If we've got a small enough hunk of data, read it in to the // trivially allocated stack buffer instead of needing a full malloc() + // + // `GetBlockContents()` cannot return this data as its lifetime is tied to + // this `BlockFetcher`'s lifetime. That is fine because this is only used + // in cases where we do not expect the `GetBlockContents()` result to be the + // same buffer we are assigning here. If we guess incorrectly, there will be + // a heap allocation and memcpy in `GetBlockContents()` to obtain the final + // result. Considering we are eliding a heap allocation here by using the + // stack buffer, the cost of guessing incorrectly here is one extra memcpy. + // + // When `do_uncompress_` is true, we expect the uncompression step will + // allocate heap memory for the final result. However this expectation will + // be wrong if the block turns out to already be uncompressed, which we + // won't know for sure until after reading it. 
+ // + // When `ioptions_.allow_mmap_reads` is true, we do not expect the file + // reader to use the scratch buffer at all, but instead return a pointer + // into the mapped memory. This expectation will be wrong when using a + // file reader that does not implement mmap reads properly. used_buf_ = &stack_buf_[0]; } else if (maybe_compressed_ && !do_uncompress_) { - compressed_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, + compressed_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_); used_buf_ = compressed_buf_.get(); } else { heap_buf_ = - AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); + AllocateBlock(block_size_with_trailer_, memory_allocator_); used_buf_ = heap_buf_.get(); } } inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && read_options_.fill_cache && + if (io_status_.ok() && read_options_.fill_cache && cache_options_.persistent_cache && cache_options_.persistent_cache->IsCompressed()) { // insert to raw cache PersistentCacheHelper::InsertRawPage(cache_options_, handle_, used_buf_, - block_size_ + kBlockTrailerSize); + block_size_with_trailer_); } } inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache && - cache_options_.persistent_cache && + if (io_status_.ok() && !got_from_prefetch_buffer_ && + read_options_.fill_cache && cache_options_.persistent_cache && !cache_options_.persistent_cache->IsCompressed()) { // insert to uncompressed cache PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_, @@ -164,12 +166,35 @@ } } -inline void BlockFetcher::CopyBufferToHeap() { +inline void BlockFetcher::CopyBufferToHeapBuf() { assert(used_buf_ != heap_buf_.get()); - heap_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); - memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); + heap_buf_ = 
AllocateBlock(block_size_with_trailer_, memory_allocator_); + memcpy(heap_buf_.get(), used_buf_, block_size_with_trailer_); +#ifndef NDEBUG + num_heap_buf_memcpy_++; +#endif +} + +inline void BlockFetcher::CopyBufferToCompressedBuf() { + assert(used_buf_ != compressed_buf_.get()); + compressed_buf_ = AllocateBlock(block_size_with_trailer_, + memory_allocator_compressed_); + memcpy(compressed_buf_.get(), used_buf_, block_size_with_trailer_); +#ifndef NDEBUG + num_compressed_buf_memcpy_++; +#endif } +// Entering this method means the block is not compressed or do not need to be +// uncompressed. The block can be in one of the following buffers: +// 1. prefetch buffer if prefetch is enabled and the block is prefetched before +// 2. stack_buf_ if block size is smaller than the stack_buf_ size and block +// is not compressed +// 3. heap_buf_ if the block is not compressed +// 4. compressed_buf_ if the block is compressed +// 5. direct_io_buf_ if direct IO is enabled +// After this method, if the block is compressed, it should be in +// compressed_buf_, otherwise should be in heap_buf_. inline void BlockFetcher::GetBlockContents() { if (slice_.data() != used_buf_) { // the slice content is not the buffer provided @@ -178,12 +203,19 @@ // page can be either uncompressed or compressed, the buffer either stack // or heap provided. 
Refer to https://github.com/facebook/rocksdb/pull/4096 if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { - CopyBufferToHeap(); + CopyBufferToHeapBuf(); } else if (used_buf_ == compressed_buf_.get()) { if (compression_type_ == kNoCompression && memory_allocator_ != memory_allocator_compressed_) { - CopyBufferToHeap(); + CopyBufferToHeapBuf(); + } else { + heap_buf_ = std::move(compressed_buf_); + } + } else if (direct_io_buf_.get() != nullptr) { + if (compression_type_ == kNoCompression) { + CopyBufferToHeapBuf(); } else { + CopyBufferToCompressedBuf(); heap_buf_ = std::move(compressed_buf_); } } @@ -194,31 +226,48 @@ #endif } -Status BlockFetcher::ReadBlockContents() { - block_size_ = static_cast(handle_.size()); - +IOStatus BlockFetcher::ReadBlockContents() { if (TryGetUncompressBlockFromPersistentCache()) { compression_type_ = kNoCompression; #ifndef NDEBUG contents_->is_raw_block = true; #endif // NDEBUG - return Status::OK(); + return IOStatus::OK(); } if (TryGetFromPrefetchBuffer()) { - if (!status_.ok()) { - return status_; + if (!io_status_.ok()) { + return io_status_; } } else if (!TryGetCompressedBlockFromPersistentCache()) { - PrepareBufferForBlockFromFile(); - Status s; - - { - PERF_TIMER_GUARD(block_read_time); - // Actual file read - status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, - &slice_, used_buf_, for_compaction_); + IOOptions opts; + io_status_ = file_->PrepareIOOptions(read_options_, opts); + // Actual file read + if (io_status_.ok()) { + if (file_->use_direct_io()) { + PERF_TIMER_GUARD(block_read_time); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, nullptr, &direct_io_buf_, for_compaction_); + PERF_COUNTER_ADD(block_read_count, 1); + used_buf_ = const_cast(slice_.data()); + } else { + PrepareBufferForBlockFromFile(); + PERF_TIMER_GUARD(block_read_time); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, used_buf_, nullptr, 
for_compaction_); + PERF_COUNTER_ADD(block_read_count, 1); +#ifndef NDEBUG + if (slice_.data() == &stack_buf_[0]) { + num_stack_buf_memcpy_++; + } else if (slice_.data() == heap_buf_.get()) { + num_heap_buf_memcpy_++; + } else if (slice_.data() == compressed_buf_.get()) { + num_compressed_buf_memcpy_++; + } +#endif + } } - PERF_COUNTER_ADD(block_read_count, 1); // TODO: introduce dedicated perf counter for range tombstones switch (block_type_) { @@ -239,38 +288,38 @@ break; } - PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); - if (!status_.ok()) { - return status_; + PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); + if (!io_status_.ok()) { + return io_status_; } - if (slice_.size() != block_size_ + kBlockTrailerSize) { - return Status::Corruption("truncated block read from " + - file_->file_name() + " offset " + - ToString(handle_.offset()) + ", expected " + - ToString(block_size_ + kBlockTrailerSize) + - " bytes, got " + ToString(slice_.size())); + if (slice_.size() != block_size_with_trailer_) { + return IOStatus::Corruption("truncated block read from " + + file_->file_name() + " offset " + + ToString(handle_.offset()) + ", expected " + + ToString(block_size_with_trailer_) + + " bytes, got " + ToString(slice_.size())); } - CheckBlockChecksum(); - if (status_.ok()) { + ProcessTrailerIfPresent(); + if (io_status_.ok()) { InsertCompressedBlockToPersistentCacheIfNeeded(); } else { - return status_; + return io_status_; } } - PERF_TIMER_GUARD(block_decompress_time); - - compression_type_ = get_block_compression_type(slice_.data(), block_size_); - if (do_uncompress_ && compression_type_ != kNoCompression) { + PERF_TIMER_GUARD(block_decompress_time); // compressed page, uncompress, update cache UncompressionContext context(compression_type_); UncompressionInfo info(context, uncompression_dict_, compression_type_); - status_ = UncompressBlockContents(info, slice_.data(), block_size_, - contents_, footer_.version(), ioptions_, - 
memory_allocator_); + io_status_ = status_to_io_status(UncompressBlockContents( + info, slice_.data(), block_size_, contents_, footer_.format_version(), + ioptions_, memory_allocator_)); +#ifndef NDEBUG + num_heap_buf_memcpy_++; +#endif compression_type_ = kNoCompression; } else { GetBlockContents(); @@ -278,7 +327,7 @@ InsertUncompressedBlockToPersistentCacheIfNeeded(); - return status_; + return io_status_; } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,6 +12,7 @@ #include "table/block_based/block.h" #include "table/block_based/block_type.h" #include "table/format.h" +#include "table/persistent_cache_options.h" namespace ROCKSDB_NAMESPACE { @@ -37,12 +38,15 @@ class BlockFetcher { public: BlockFetcher(RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ReadOptions& read_options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, + FilePrefetchBuffer* prefetch_buffer, + const Footer& footer /* ref retained */, + const ReadOptions& read_options, + const BlockHandle& handle /* ref retained */, + BlockContents* contents, + const ImmutableOptions& ioptions /* ref retained */, bool do_uncompress, bool maybe_compressed, BlockType block_type, - const UncompressionDict& uncompression_dict, - const PersistentCacheOptions& cache_options, + const UncompressionDict& uncompression_dict /* ref retained */, + const PersistentCacheOptions& cache_options /* ref retained */, MemoryAllocator* memory_allocator = nullptr, MemoryAllocator* memory_allocator_compressed = nullptr, bool for_compaction = false) @@ -56,16 +60,39 @@ do_uncompress_(do_uncompress), 
maybe_compressed_(maybe_compressed), block_type_(block_type), + block_size_(static_cast(handle_.size())), + block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()), uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), memory_allocator_compressed_(memory_allocator_compressed), - for_compaction_(for_compaction) {} - - Status ReadBlockContents(); - CompressionType get_compression_type() const { return compression_type_; } + for_compaction_(for_compaction) { + io_status_.PermitUncheckedError(); // TODO(AR) can we improve on this? + } + + IOStatus ReadBlockContents(); + inline CompressionType get_compression_type() const { + return compression_type_; + } + inline size_t GetBlockSizeWithTrailer() const { + return block_size_with_trailer_; + } + +#ifndef NDEBUG + int TEST_GetNumStackBufMemcpy() const { return num_stack_buf_memcpy_; } + int TEST_GetNumHeapBufMemcpy() const { return num_heap_buf_memcpy_; } + int TEST_GetNumCompressedBufMemcpy() const { + return num_compressed_buf_memcpy_; + } +#endif private: +#ifndef NDEBUG + int num_stack_buf_memcpy_ = 0; + int num_heap_buf_memcpy_ = 0; + int num_compressed_buf_memcpy_ = 0; + +#endif static const uint32_t kDefaultStackBufferSize = 5000; RandomAccessFileReader* file_; @@ -74,23 +101,25 @@ const ReadOptions read_options_; const BlockHandle& handle_; BlockContents* contents_; - const ImmutableCFOptions& ioptions_; - bool do_uncompress_; - bool maybe_compressed_; - BlockType block_type_; + const ImmutableOptions& ioptions_; + const bool do_uncompress_; + const bool maybe_compressed_; + const BlockType block_type_; + const size_t block_size_; + const size_t block_size_with_trailer_; const UncompressionDict& uncompression_dict_; const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; MemoryAllocator* memory_allocator_compressed_; - Status status_; + IOStatus io_status_; Slice slice_; char* used_buf_ = nullptr; - size_t block_size_; + 
AlignedBuf direct_io_buf_; CacheAllocationPtr heap_buf_; CacheAllocationPtr compressed_buf_; char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; - ROCKSDB_NAMESPACE::CompressionType compression_type_; + CompressionType compression_type_; bool for_compaction_ = false; // return true if found @@ -99,11 +128,13 @@ bool TryGetFromPrefetchBuffer(); bool TryGetCompressedBlockFromPersistentCache(); void PrepareBufferForBlockFromFile(); - // Copy content from used_buf_ to new heap buffer. - void CopyBufferToHeap(); + // Copy content from used_buf_ to new heap_buf_. + void CopyBufferToHeapBuf(); + // Copy content from used_buf_ to new compressed_buf_. + void CopyBufferToCompressedBuf(); void GetBlockContents(); void InsertCompressedBlockToPersistentCacheIfNeeded(); void InsertUncompressedBlockToPersistentCacheIfNeeded(); - void CheckBlockChecksum(); + void ProcessTrailerIfPresent(); }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/block_fetcher_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/block_fetcher_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,521 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "table/block_fetcher.h" + +#include "db/table_properties_collector.h" +#include "file/file_util.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "table/block_based/binary_search_index_reader.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/format.h" +#include "test_util/testharness.h" +#include "utilities/memory_allocators.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +struct MemcpyStats { + int num_stack_buf_memcpy; + int num_heap_buf_memcpy; + int num_compressed_buf_memcpy; +}; + +struct BufAllocationStats { + int num_heap_buf_allocations; + int num_compressed_buf_allocations; +}; + +struct TestStats { + MemcpyStats memcpy_stats; + BufAllocationStats buf_allocation_stats; +}; + +class BlockFetcherTest : public testing::Test { + public: + enum class Mode { + kBufferedRead = 0, + kBufferedMmap, + kDirectRead, + kNumModes, + }; + // use NumModes as array size to avoid "size of array '...' has non-integral + // type" errors. + const static int NumModes = static_cast(Mode::kNumModes); + + protected: + void SetUp() override { + SetupSyncPointsToMockDirectIO(); + test_dir_ = test::PerThreadDBPath("block_fetcher_test"); + env_ = Env::Default(); + fs_ = FileSystem::Default(); + ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr)); + } + + void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); } + + void AssertSameBlock(const std::string& block1, const std::string& block2) { + ASSERT_EQ(block1, block2); + } + + // Creates a table with kv pairs (i, i) where i ranges from 0 to 9, inclusive. + void CreateTable(const std::string& table_name, + const CompressionType& compression_type) { + std::unique_ptr writer; + NewFileWriter(table_name, &writer); + + // Create table builder. 
+ ImmutableOptions ioptions(options_); + InternalKeyComparator comparator(options_.comparator); + ColumnFamilyOptions cf_options(options_); + MutableCFOptions moptions(cf_options); + IntTblPropCollectorFactories factories; + std::unique_ptr table_builder(table_factory_.NewTableBuilder( + TableBuilderOptions(ioptions, moptions, comparator, &factories, + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); + + // Build table. + for (int i = 0; i < 9; i++) { + std::string key = ToInternalKey(std::to_string(i)); + // Append "00000000" to string value to enhance compression ratio + std::string value = "00000000" + std::to_string(i); + table_builder->Add(key, value); + } + ASSERT_OK(table_builder->Finish()); + } + + void FetchIndexBlock(const std::string& table_name, + CountedMemoryAllocator* heap_buf_allocator, + CountedMemoryAllocator* compressed_buf_allocator, + MemcpyStats* memcpy_stats, BlockContents* index_block, + std::string* result) { + FileOptions fopt(options_); + std::unique_ptr file; + NewFileReader(table_name, fopt, &file); + + // Get handle of the index block. + Footer footer; + ReadFooter(file.get(), &footer); + const BlockHandle& index_handle = footer.index_handle(); + + CompressionType compression_type; + FetchBlock(file.get(), index_handle, BlockType::kIndex, + false /* compressed */, false /* do_uncompress */, + heap_buf_allocator, compressed_buf_allocator, index_block, + memcpy_stats, &compression_type); + ASSERT_EQ(compression_type, CompressionType::kNoCompression); + result->assign(index_block->data.ToString()); + } + + // Fetches the first data block in both direct IO and non-direct IO mode. + // + // compressed: whether the data blocks are compressed; + // do_uncompress: whether the data blocks should be uncompressed on fetching. + // compression_type: the expected compression type. + // + // Expects: + // Block contents are the same. 
+ // Bufferr allocation and memory copy statistics are expected. + void TestFetchDataBlock( + const std::string& table_name_prefix, bool compressed, bool do_uncompress, + std::array expected_stats_by_mode) { + for (CompressionType compression_type : GetSupportedCompressions()) { + bool do_compress = compression_type != kNoCompression; + if (compressed != do_compress) continue; + std::string compression_type_str = + CompressionTypeToString(compression_type); + + std::string table_name = table_name_prefix + compression_type_str; + CreateTable(table_name, compression_type); + + CompressionType expected_compression_type_after_fetch = + (compressed && !do_uncompress) ? compression_type : kNoCompression; + + BlockContents blocks[NumModes]; + std::string block_datas[NumModes]; + MemcpyStats memcpy_stats[NumModes]; + CountedMemoryAllocator heap_buf_allocators[NumModes]; + CountedMemoryAllocator compressed_buf_allocators[NumModes]; + for (int i = 0; i < NumModes; ++i) { + SetMode(static_cast(i)); + FetchFirstDataBlock(table_name, compressed, do_uncompress, + expected_compression_type_after_fetch, + &heap_buf_allocators[i], + &compressed_buf_allocators[i], &blocks[i], + &block_datas[i], &memcpy_stats[i]); + } + + for (int i = 0; i < NumModes - 1; ++i) { + AssertSameBlock(block_datas[i], block_datas[i + 1]); + } + + // Check memcpy and buffer allocation statistics. 
+ for (int i = 0; i < NumModes; ++i) { + const TestStats& expected_stats = expected_stats_by_mode[i]; + + ASSERT_EQ(memcpy_stats[i].num_stack_buf_memcpy, + expected_stats.memcpy_stats.num_stack_buf_memcpy); + ASSERT_EQ(memcpy_stats[i].num_heap_buf_memcpy, + expected_stats.memcpy_stats.num_heap_buf_memcpy); + ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy, + expected_stats.memcpy_stats.num_compressed_buf_memcpy); + + if (kXpressCompression == compression_type) { + // XPRESS allocates memory internally, thus does not support for + // custom allocator verification + continue; + } else { + ASSERT_EQ( + heap_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + + // The allocated buffers are not deallocated until + // the block content is deleted. + ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0); + blocks[i].allocation.reset(); + ASSERT_EQ( + heap_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + } + } + } + } + + void SetMode(Mode mode) { + switch (mode) { + case Mode::kBufferedRead: + options_.use_direct_reads = false; + options_.allow_mmap_reads = false; + break; + case Mode::kBufferedMmap: + options_.use_direct_reads = false; + options_.allow_mmap_reads = true; + break; + case Mode::kDirectRead: + options_.use_direct_reads = true; + options_.allow_mmap_reads = false; + break; + case Mode::kNumModes: + assert(false); + } + } + + private: + std::string test_dir_; + Env* env_; + std::shared_ptr fs_; + BlockBasedTableFactory table_factory_; + Options options_; + + std::string Path(const std::string& fname) { 
return test_dir_ + "/" + fname; } + + void WriteToFile(const std::string& content, const std::string& filename) { + std::unique_ptr f; + ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append(content, IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + void NewFileWriter(const std::string& filename, + std::unique_ptr* writer) { + std::string path = Path(filename); + FileOptions file_options; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path, + file_options, writer, nullptr)); + } + + void NewFileReader(const std::string& filename, const FileOptions& opt, + std::unique_ptr* reader) { + std::string path = Path(filename); + std::unique_ptr f; + ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); + } + + void NewTableReader(const ImmutableOptions& ioptions, + const FileOptions& foptions, + const InternalKeyComparator& comparator, + const std::string& table_name, + std::unique_ptr* table) { + std::unique_ptr file; + NewFileReader(table_name, foptions, &file); + + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); + + std::unique_ptr table_reader; + ReadOptions ro; + const auto* table_options = + table_factory_.GetOptions(); + ASSERT_NE(table_options, nullptr); + ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, + comparator, std::move(file), file_size, + &table_reader)); + + table->reset(reinterpret_cast(table_reader.release())); + } + + std::string ToInternalKey(const std::string& key) { + InternalKey internal_key(key, 0, ValueType::kTypeValue); + return internal_key.Encode().ToString(); + } + + void ReadFooter(RandomAccessFileReader* file, Footer* footer) { + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size)); + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* 
prefetch_buffer */, + file_size, footer, + kBlockBasedTableMagicNumber)); + } + + // NOTE: compression_type returns the compression type of the fetched block + // contents, so if the block is fetched and uncompressed, then it's + // kNoCompression. + void FetchBlock(RandomAccessFileReader* file, const BlockHandle& block, + BlockType block_type, bool compressed, bool do_uncompress, + MemoryAllocator* heap_buf_allocator, + MemoryAllocator* compressed_buf_allocator, + BlockContents* contents, MemcpyStats* stats, + CompressionType* compresstion_type) { + ImmutableOptions ioptions(options_); + ReadOptions roptions; + PersistentCacheOptions persistent_cache_options; + Footer footer; + ReadFooter(file, &footer); + std::unique_ptr fetcher(new BlockFetcher( + file, nullptr /* prefetch_buffer */, footer, roptions, block, contents, + ioptions, do_uncompress, compressed, block_type, + UncompressionDict::GetEmptyDict(), persistent_cache_options, + heap_buf_allocator, compressed_buf_allocator)); + + ASSERT_OK(fetcher->ReadBlockContents()); + + stats->num_stack_buf_memcpy = fetcher->TEST_GetNumStackBufMemcpy(); + stats->num_heap_buf_memcpy = fetcher->TEST_GetNumHeapBufMemcpy(); + stats->num_compressed_buf_memcpy = + fetcher->TEST_GetNumCompressedBufMemcpy(); + + *compresstion_type = fetcher->get_compression_type(); + } + + // NOTE: expected_compression_type is the expected compression + // type of the fetched block content, if the block is uncompressed, + // then the expected compression type is kNoCompression. 
+ void FetchFirstDataBlock(const std::string& table_name, bool compressed, + bool do_uncompress, + CompressionType expected_compression_type, + MemoryAllocator* heap_buf_allocator, + MemoryAllocator* compressed_buf_allocator, + BlockContents* block, std::string* result, + MemcpyStats* memcpy_stats) { + ImmutableOptions ioptions(options_); + InternalKeyComparator comparator(options_.comparator); + FileOptions foptions(options_); + + // Get block handle for the first data block. + std::unique_ptr table; + NewTableReader(ioptions, foptions, comparator, table_name, &table); + + std::unique_ptr index_reader; + ReadOptions ro; + ASSERT_OK(BinarySearchIndexReader::Create( + table.get(), ro, nullptr /* prefetch_buffer */, false /* use_cache */, + false /* prefetch */, false /* pin */, nullptr /* lookup_context */, + &index_reader)); + + std::unique_ptr> iter( + index_reader->NewIterator( + ReadOptions(), false /* disable_prefix_seek */, nullptr /* iter */, + nullptr /* get_context */, nullptr /* lookup_context */)); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + BlockHandle first_block_handle = iter->value().handle; + + // Fetch first data block. + std::unique_ptr file; + NewFileReader(table_name, foptions, &file); + CompressionType compression_type; + FetchBlock(file.get(), first_block_handle, BlockType::kData, compressed, + do_uncompress, heap_buf_allocator, compressed_buf_allocator, + block, memcpy_stats, &compression_type); + ASSERT_EQ(compression_type, expected_compression_type); + result->assign(block->data.ToString()); + } +}; + +// Skip the following tests in lite mode since direct I/O is unsupported. +#ifndef ROCKSDB_LITE + +// Fetch index block under both direct IO and non-direct IO. +// Expects: +// the index block contents are the same for both read modes. 
+TEST_F(BlockFetcherTest, FetchIndexBlock) { + for (CompressionType compression : GetSupportedCompressions()) { + std::string table_name = + "FetchIndexBlock" + CompressionTypeToString(compression); + CreateTable(table_name, compression); + + CountedMemoryAllocator allocator; + MemcpyStats memcpy_stats; + BlockContents indexes[NumModes]; + std::string index_datas[NumModes]; + for (int i = 0; i < NumModes; ++i) { + SetMode(static_cast(i)); + FetchIndexBlock(table_name, &allocator, &allocator, &memcpy_stats, + &indexes[i], &index_datas[i]); + } + for (int i = 0; i < NumModes - 1; ++i) { + AssertSameBlock(index_datas[i], index_datas[i + 1]); + } + } +} + +// Data blocks are not compressed, +// fetch data block under direct IO, mmap IO,and non-direct IO. +// Expects: +// 1. in non-direct IO mode, allocate a heap buffer and memcpy the block +// into the buffer; +// 2. in direct IO mode, allocate a heap buffer and memcpy from the +// direct IO buffer to the heap buffer. +TEST_F(BlockFetcherTest, FetchUncompressedDataBlock) { + TestStats expected_non_mmap_stats = { + { + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array expected_stats_by_mode{{ + expected_non_mmap_stats /* kBufferedRead */, + expected_mmap_stats /* kBufferedMmap */, + expected_non_mmap_stats /* kDirectRead */, + }}; + TestFetchDataBlock("FetchUncompressedDataBlock", false, false, + expected_stats_by_mode); +} + +// Data blocks are compressed, +// fetch data block under both direct IO and non-direct IO, +// but do not uncompress. +// Expects: +// 1. 
in non-direct IO mode, allocate a compressed buffer and memcpy the block +// into the buffer; +// 2. in direct IO mode, allocate a compressed buffer and memcpy from the +// direct IO buffer to the compressed buffer. +TEST_F(BlockFetcherTest, FetchCompressedDataBlock) { + TestStats expected_non_mmap_stats = { + { + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 1 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 1 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 0 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 0 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array expected_stats_by_mode{{ + expected_non_mmap_stats /* kBufferedRead */, + expected_mmap_stats /* kBufferedMmap */, + expected_non_mmap_stats /* kDirectRead */, + }}; + TestFetchDataBlock("FetchCompressedDataBlock", true, false, + expected_stats_by_mode); +} + +// Data blocks are compressed, +// fetch and uncompress data block under both direct IO and non-direct IO. +// Expects: +// 1. in non-direct IO mode, since the block is small, so it's first memcpyed +// to the stack buffer, then a heap buffer is allocated and the block is +// uncompressed into the heap. +// 2. in direct IO mode mode, allocate a heap buffer, then directly uncompress +// and memcpy from the direct IO buffer to the heap buffer. 
+TEST_F(BlockFetcherTest, FetchAndUncompressCompressedDataBlock) { + TestStats expected_buffered_read_stats = { + { + 1 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_mmap_stats = {{ + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + TestStats expected_direct_read_stats = { + { + 0 /* num_stack_buf_memcpy */, + 1 /* num_heap_buf_memcpy */, + 0 /* num_compressed_buf_memcpy */, + }, + { + 1 /* num_heap_buf_allocations */, + 0 /* num_compressed_buf_allocations */, + }}; + std::array expected_stats_by_mode{{ + expected_buffered_read_stats, + expected_mmap_stats, + expected_direct_read_stats, + }}; + TestFetchDataBlock("FetchAndUncompressCompressedDataBlock", true, true, + expected_stats_by_mode); +} + +#endif // ROCKSDB_LITE + +} // namespace +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -53,7 +53,9 @@ const Comparator* user_comparator, uint32_t cuckoo_block_size, bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), - uint32_t column_family_id, const std::string& column_family_name) + uint32_t column_family_id, const std::string& column_family_name, + const 
std::string& db_id, const std::string& db_session_id, + uint64_t file_number) : num_hash_func_(2), file_(file), max_hash_table_ratio_(max_hash_table_ratio), @@ -79,6 +81,11 @@ properties_.filter_size = 0; properties_.column_family_id = column_family_id; properties_.column_family_name = column_family_name; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.orig_file_number = file_number; + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { @@ -87,8 +94,11 @@ return; } ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - status_ = Status::Corruption("Unable to parse key into inernal key."); + Status pik_status = + ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + status_ = Status::Corruption("Unable to parse key into internal key. ", + pik_status.getState()); return; } if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { @@ -244,7 +254,6 @@ assert(!closed_); closed_ = true; std::vector buckets; - Status s; std::string unused_bucket; if (num_entries_ > 0) { // Calculate the real hash size if module hash is enabled. @@ -252,9 +261,9 @@ hash_table_size_ = static_cast(num_entries_ / max_hash_table_ratio_); } - s = MakeHashTable(&buckets); - if (!s.ok()) { - return s; + status_ = MakeHashTable(&buckets); + if (!status_.ok()) { + return status_; } // Determine unused_user_key to fill empty buckets. 
std::string unused_user_key = smallest_user_key_; @@ -301,18 +310,19 @@ uint32_t num_added = 0; for (auto& bucket : buckets) { if (bucket.vector_idx == kMaxVectorIdx) { - s = file_->Append(Slice(unused_bucket)); + io_status_ = file_->Append(Slice(unused_bucket)); } else { ++num_added; - s = file_->Append(GetKey(bucket.vector_idx)); - if (s.ok()) { + io_status_ = file_->Append(GetKey(bucket.vector_idx)); + if (io_status_.ok()) { if (value_size_ > 0) { - s = file_->Append(GetValue(bucket.vector_idx)); + io_status_ = file_->Append(GetValue(bucket.vector_idx)); } } } - if (!s.ok()) { - return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } } assert(num_added == NumEntries()); @@ -364,34 +374,31 @@ BlockHandle property_block_handle; property_block_handle.set_offset(offset); property_block_handle.set_size(property_block.size()); - s = file_->Append(property_block); + io_status_ = file_->Append(property_block); offset += property_block.size(); - if (!s.ok()) { - return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } - meta_index_builder.Add(kPropertiesBlock, property_block_handle); + meta_index_builder.Add(kPropertiesBlockName, property_block_handle); Slice meta_index_block = meta_index_builder.Finish(); BlockHandle meta_index_block_handle; meta_index_block_handle.set_offset(offset); meta_index_block_handle.set_size(meta_index_block.size()); - s = file_->Append(meta_index_block); - if (!s.ok()) { - return s; + io_status_ = file_->Append(meta_index_block); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } - Footer footer(kCuckooTableMagicNumber, 1); - footer.set_metaindex_handle(meta_index_block_handle); - footer.set_index_handle(BlockHandle::NullBlockHandle()); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - s = file_->Append(footer_encoding); - - if (file_ != nullptr) { - file_checksum_ = file_->GetFileChecksum(); - } - return s; + FooterBuilder footer; + 
footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset, + kNoChecksum, meta_index_block_handle); + io_status_ = file_->Append(footer.GetSlice()); + status_ = io_status_; + return status_; } void CuckooTableBuilder::Abandon() { @@ -516,11 +523,19 @@ return null_found; } +std::string CuckooTableBuilder::GetFileChecksum() const { + if (file_ != nullptr) { + return file_->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + const char* CuckooTableBuilder::GetFileChecksumFuncName() const { if (file_ != nullptr) { return file_->GetFileChecksumFuncName(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,15 +22,15 @@ class CuckooTableBuilder: public TableBuilder { public: - CuckooTableBuilder(WritableFileWriter* file, double max_hash_table_ratio, - uint32_t max_num_hash_func, uint32_t max_search_depth, - const Comparator* user_comparator, - uint32_t cuckoo_block_size, bool use_module_hash, - bool identity_as_first_hash, - uint64_t (*get_slice_hash)(const Slice&, uint32_t, - uint64_t), - uint32_t column_family_id, - const std::string& column_family_name); + CuckooTableBuilder( + WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_func, uint32_t max_search_depth, + const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool use_module_hash, bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t), + uint32_t column_family_id, const std::string& column_family_name, + const std::string& db_id = "", const std::string& 
db_session_id = "", + uint64_t file_number = 0); // No copying allowed CuckooTableBuilder(const CuckooTableBuilder&) = delete; void operator=(const CuckooTableBuilder&) = delete; @@ -46,6 +46,9 @@ // Return non-ok iff some error has been detected. Status status() const override { return status_; } + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return io_status_; } + // Finish building the table. Stops using the file passed to the // constructor after this function returns. // REQUIRES: Finish(), Abandon() have not been called @@ -68,7 +71,7 @@ TableProperties GetTableProperties() const override { return properties_; } // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } + std::string GetFileChecksum() const override; // Get file checksum function name const char* GetFileChecksumFuncName() const override; @@ -116,6 +119,7 @@ // Number of keys that contain value (non-deletion op) uint64_t num_values_; Status status_; + IOStatus io_status_; TableProperties properties_; const Comparator* ucomp_; bool use_module_hash_; @@ -126,9 +130,6 @@ std::string smallest_user_key_ = ""; bool closed_; // Either Finish() or Abandon() has been called. - - // Store file checksum. 
If checksum is disabled, its value is "0" - std::string file_checksum_ = kUnknownFileChecksum; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_builder_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,14 +5,17 @@ #ifndef ROCKSDB_LITE -#include -#include +#include "table/cuckoo/cuckoo_table_builder.h" + #include +#include #include +#include #include "file/random_access_file_reader.h" #include "file/writable_file_writer.h" -#include "table/cuckoo/cuckoo_table_builder.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -35,7 +38,7 @@ env_ = Env::Default(); Options options; options.allow_mmap_reads = true; - env_options_ = EnvOptions(options); + file_options_ = FileOptions(options); } void CheckFileContents(const std::vector& keys, @@ -47,29 +50,27 @@ uint64_t num_deletions = 0; for (const auto& key : keys) { ParsedInternalKey parsed; - if (ParseInternalKey(key, &parsed) && parsed.type == kTypeDeletion) { + Status pik_status = + ParseInternalKey(key, &parsed, true /* log_err_key */); + if (pik_status.ok() && parsed.type == kTypeDeletion) { num_deletions++; } } // Read file - std::unique_ptr read_file; - ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); uint64_t read_file_size; ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env_->GetFileSystem(), fname, file_options_, &file_reader, nullptr)); - // @lint-ignore TXT2 T25377293 Grandfathered in - Options options; - options.allow_mmap_reads = true; - 
ImmutableCFOptions ioptions(options); + Options options; + options.allow_mmap_reads = true; + ImmutableOptions ioptions(options); // Assert Table Properties. - TableProperties* props = nullptr; - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); + std::unique_ptr props; ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, - kCuckooTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + kCuckooTableMagicNumber, ioptions, &props)); // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -106,15 +107,14 @@ ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); ASSERT_EQ(props->column_family_id, 0); ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName); - delete props; // Check contents of the bucket. std::vector keys_found(keys.size(), false); size_t bucket_size = expected_unused_bucket.size(); - for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { + for (uint32_t i = 0; i + 1 < table_size + cuckoo_block_size; ++i) { Slice read_slice; - ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice, - nullptr)); + ASSERT_OK(file_reader->Read(IOOptions(), i * bucket_size, bucket_size, + &read_slice, nullptr, nullptr)); size_t key_idx = std::find(expected_locations.begin(), expected_locations.end(), i) - expected_locations.begin(); @@ -157,7 +157,7 @@ Env* env_; - EnvOptions env_options_; + FileOptions file_options_; std::string fname; const double kHashTableRatio = 0.9; }; @@ -165,10 +165,9 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { std::unique_ptr writable_file; fname = test::PerThreadDBPath("EmptyFile"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + 
ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -206,12 +205,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("NoCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -256,12 +253,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -305,13 +300,11 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; uint32_t cuckoo_block_size = 2; fname = test::PerThreadDBPath("WithCollisionFullKey2"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new 
WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, @@ -360,12 +353,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -411,12 +402,10 @@ } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash, 0 /* column_family_id */, @@ -455,12 +444,11 @@ std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = 
test::PerThreadDBPath("NoCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -500,12 +488,11 @@ std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -547,12 +534,11 @@ std::vector expected_locations = {0, 1, 3, 4, 2}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* 
column_family_id */, @@ -593,12 +579,10 @@ }; hash_map = std::move(hm); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -622,12 +606,10 @@ uint32_t num_hash_fun = 4; std::string user_key = "repeatedkey"; - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,13 +7,15 @@ #include "table/cuckoo/cuckoo_table_factory.h" #include "db/dbformat.h" +#include "options/configurable_helper.h" +#include "rocksdb/utilities/options_type.h" #include "table/cuckoo/cuckoo_table_builder.h" 
#include "table/cuckoo/cuckoo_table_reader.h" namespace ROCKSDB_NAMESPACE { Status CuckooTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const { @@ -28,11 +30,8 @@ } TableBuilder* CuckooTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - // Ignore the skipFIlters flag. Does not apply to this file format - // - // TODO: change builder to take the option struct return new CuckooTableBuilder( file, table_options_.hash_table_ratio, 64, @@ -40,10 +39,12 @@ table_builder_options.internal_comparator.user_comparator(), table_options_.cuckoo_block_size, table_options_.use_module_hash, table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, - column_family_id, table_builder_options.column_family_name); + table_builder_options.column_family_id, + table_builder_options.column_family_name, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); } -std::string CuckooTableFactory::GetPrintableTableOptions() const { +std::string CuckooTableFactory::GetPrintableOptions() const { std::string ret; ret.reserve(2000); const int kBufferSize = 200; @@ -64,6 +65,37 @@ return ret; } +static std::unordered_map cuckoo_table_type_info = + { +#ifndef ROCKSDB_LITE + {"hash_table_ratio", + {offsetof(struct CuckooTableOptions, hash_table_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"max_search_depth", + {offsetof(struct CuckooTableOptions, max_search_depth), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cuckoo_block_size", + {offsetof(struct CuckooTableOptions, cuckoo_block_size), + 
OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"identity_as_first_hash", + {offsetof(struct CuckooTableOptions, identity_as_first_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"use_module_hash", + {offsetof(struct CuckooTableOptions, use_module_hash), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) { + RegisterOptions(&table_options_, &cuckoo_table_type_info); +} + TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { return new CuckooTableFactory(table_options); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -52,37 +52,26 @@ // - Does not support prefix bloom filters. 
class CuckooTableFactory : public TableFactory { public: - explicit CuckooTableFactory(const CuckooTableOptions& table_options) - : table_options_(table_options) {} + explicit CuckooTableFactory( + const CuckooTableOptions& table_option = CuckooTableOptions()); ~CuckooTableFactory() {} - const char* Name() const override { return "CuckooTable"; } + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kCuckooTableName(); } + const char* Name() const override { return kCuckooTableName(); } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; - // Sanitizes the specified DB Options. 
- Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } - - std::string GetPrintableTableOptions() const override; - - void* GetOptions() override { return &table_options_; } - - Status GetOptionString(std::string* /*opt_string*/, - const std::string& /*delimiter*/) const override { - return Status::OK(); - } + std::string GetPrintableOptions() const override; private: CuckooTableOptions table_options_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -15,7 +15,9 @@ #include #include #include + #include "memory/arena.h" +#include "options/cf_options.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" #include "table/cuckoo/cuckoo_table_factory.h" @@ -33,7 +35,7 @@ extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) @@ -54,15 +56,18 @@ get_slice_hash_(get_slice_hash) { if (!ioptions.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); - } - TableProperties* props = nullptr; - status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - ioptions, &props, true /* compression_type_missing */); - if (!status_.ok()) { return; } - table_props_.reset(props); - auto& user_props = props->user_collected_properties; + { + std::unique_ptr props; + status_ = ReadTableProperties(file_.get(), file_size, + kCuckooTableMagicNumber, ioptions, &props); + if 
(!status_.ok()) { + return; + } + table_props_ = std::move(props); + } + auto& user_props = table_props_->user_collected_properties; auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); if (hash_funs == user_props.end()) { status_ = Status::Corruption("Number of hash functions not found"); @@ -76,7 +81,7 @@ } unused_key_ = unused_key->second; - key_length_ = static_cast(props->fixed_key_len); + key_length_ = static_cast(table_props_->fixed_key_len); auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); if (user_key_len == user_props.end()) { status_ = Status::Corruption("User key length not found"); @@ -136,7 +141,8 @@ cuckoo_block_size_ = *reinterpret_cast( cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; - status_ = file_->Read(0, static_cast(file_size), &file_data_, nullptr); + status_ = file_->Read(IOOptions(), 0, static_cast(file_size), + &file_data_, nullptr, nullptr); } Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, @@ -170,7 +176,9 @@ } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; - ParseInternalKey(full_key, &found_ikey); + Status s = ParseInternalKey(full_key, &found_ikey, + false /* log_err_key */); // TODO + if (!s.ok()) return s; bool dont_care __attribute__((__unused__)); get_context->SaveValue(found_ikey, value, &dont_care); } @@ -378,7 +386,8 @@ const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, - size_t /*compaction_readahead_size*/) { + size_t /*compaction_readahead_size*/, + bool /* allow_unprepared_value */) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,9 +14,7 @@ #include #include -#include "db/dbformat.h" #include "file/random_access_file_reader.h" -#include "options/cf_options.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" @@ -25,10 +23,11 @@ class Arena; class TableReader; +struct ImmutableOptions; class CuckooTableReader: public TableReader { public: - CuckooTableReader(const ImmutableCFOptions& ioptions, + CuckooTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, @@ -52,7 +51,8 @@ const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -19,6 +19,7 @@ #include #include "memory/arena.h" +#include "rocksdb/db.h" #include "table/cuckoo/cuckoo_table_builder.h" #include "table/cuckoo/cuckoo_table_factory.h" #include "table/cuckoo/cuckoo_table_reader.h" @@ -31,7 +32,6 @@ #include "util/string_util.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; -using GFLAGS_NAMESPACE::SetUsageMessage; DEFINE_string(file_dir, "", "Directory where the files will be created" " for benchmark. Added for using tmpfs."); @@ -69,7 +69,7 @@ CuckooReaderTest() { options.allow_mmap_reads = true; env = options.env; - env_options = EnvOptions(options); + file_options = FileOptions(options); } void SetUp(int num) { @@ -89,12 +89,9 @@ void CreateCuckooFileAndCheckReader( const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); - + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), fname, + file_options, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName); @@ -110,12 +107,10 @@ ASSERT_OK(file_writer->Close()); // Check reader now. 
- std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -140,12 +135,10 @@ } void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -212,9 +205,17 @@ uint64_t file_size; Options options; Env* env; - EnvOptions env_options; + FileOptions file_options; }; +TEST_F(CuckooReaderTest, FileNotMmaped) { + options.allow_mmap_reads = false; + ImmutableOptions ioptions(options); + CuckooTableReader reader(ioptions, nullptr, 0, nullptr, nullptr); + ASSERT_TRUE(reader.status().IsInvalidArgument()); + ASSERT_STREQ("File is not mmaped", reader.status().getState()); +} + TEST_F(CuckooReaderTest, WhenKeyExists) { SetUp(kNumHashFunc); fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists"); @@ -323,12 +324,12 @@ } auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - 
std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, GetSliceHash); ASSERT_OK(reader.status()); @@ -408,15 +409,13 @@ const uint64_t num, double hash_ratio) { Options options; options.allow_mmap_reads = true; - Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, file_options, &file_writer, + nullptr)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, @@ -433,14 +432,13 @@ ASSERT_OK(file_writer->Close()); uint64_t file_size; - env->GetFileSize(fname, &file_size); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, 
std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); @@ -462,18 +460,18 @@ Options options; options.allow_mmap_reads = true; Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); uint64_t file_size; - env->GetFileSize(fname, &file_size); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); @@ -492,7 +490,7 @@ for (uint64_t i = 0; i < num; ++i) { keys.push_back(2 * i); } - std::random_shuffle(keys.begin(), keys.end()); + RandomShuffle(keys.begin(), keys.end()); PinnableSlice value; // Assume only the fast path is triggered diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.cc 2025-05-19 16:14:27.000000000 +0000 @@ -14,19 +14,24 @@ #include "block_fetcher.h" #include "file/random_access_file_reader.h" -#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" +#include "options/options_helper.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" #include 
"table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/hash.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -41,6 +46,7 @@ const uint64_t kLegacyPlainTableMagicNumber = 0; const uint64_t kPlainTableMagicNumber = 0; #endif +const char* kHostnameForDbHostId = "__hostname__"; bool ShouldReportDetailedTime(Env* env, Statistics* stats) { return env != nullptr && stats != nullptr && @@ -49,11 +55,20 @@ void BlockHandle::EncodeTo(std::string* dst) const { // Sanity check that all fields have been set - assert(offset_ != ~static_cast(0)); - assert(size_ != ~static_cast(0)); + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); PutVarint64Varint64(dst, offset_, size_); } +char* BlockHandle::EncodeTo(char* dst) const { + // Sanity check that all fields have been set + assert(offset_ != ~uint64_t{0}); + assert(size_ != ~uint64_t{0}); + char* cur = EncodeVarint64(dst, offset_); + cur = EncodeVarint64(cur, size_); + return cur; +} + Status BlockHandle::DecodeFrom(Slice* input) { if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { return Status::OK(); @@ -93,8 +108,10 @@ void IndexValue::EncodeTo(std::string* dst, bool have_first_key, const BlockHandle* previous_handle) const { if (previous_handle) { + // WART: this is specific to Block-based table assert(handle.offset() == previous_handle->offset() + - previous_handle->size() + kBlockTrailerSize); + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize); PutVarsignedint64(dst, handle.size() - previous_handle->size()); } else { handle.EncodeTo(dst); @@ -113,9 +130,10 @@ if (!GetVarsignedint64(input, &delta)) { return Status::Corruption("bad delta-encoded index value"); } - handle = BlockHandle( - previous_handle->offset() + 
previous_handle->size() + kBlockTrailerSize, - previous_handle->size() + delta); + // WART: this is specific to Block-based table + handle = BlockHandle(previous_handle->offset() + previous_handle->size() + + BlockBasedTable::kBlockTrailerSize, + previous_handle->size() + delta); } else { Status s = handle.DecodeFrom(input); if (!s.ok()) { @@ -155,107 +173,156 @@ return kPlainTableMagicNumber; } assert(false); - return 0; + return magic_number; +} +inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber) { + return kLegacyBlockBasedTableMagicNumber; + } + if (magic_number == kPlainTableMagicNumber) { + return kLegacyPlainTableMagicNumber; + } + assert(false); + return magic_number; } +inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) { + if (magic_number == kBlockBasedTableMagicNumber || + magic_number == kLegacyBlockBasedTableMagicNumber) { + return static_cast(BlockBasedTable::kBlockTrailerSize); + } else { + return 0; + } +} + +// Footer format, in three parts: +// * Part1 +// -> format_version == 0 (inferred from legacy magic number) +// (0 bytes) +// -> format_version >= 1 +// checksum type (char, 1 byte) +// * Part2 +// metaindex handle (varint64 offset, varint64 size) +// index handle (varint64 offset, varint64 size) +// for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40 +// * Part3 +// -> format_version == 0 (inferred from legacy magic number) +// legacy magic number (8 bytes) +// -> format_version >= 1 (inferred from NOT legacy magic number) +// format_version (uint32LE, 4 bytes), also called "footer version" +// newer magic number (8 bytes) + +constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength; } // namespace -// legacy footer format: -// metaindex handle (varint64 offset, varint64 size) -// index handle (varint64 offset, varint64 size) -// to make the total size 2 * BlockHandle::kMaxEncodedLength -// table_magic_number (8 bytes) -// new footer 
format: -// checksum type (char, 1 byte) -// metaindex handle (varint64 offset, varint64 size) -// index handle (varint64 offset, varint64 size) -// to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 -// footer version (4 bytes) -// table_magic_number (8 bytes) -void Footer::EncodeTo(std::string* dst) const { - assert(HasInitializedTableMagicNumber()); - if (IsLegacyFooterFormat(table_magic_number())) { - // has to be default checksum with legacy footer - assert(checksum_ == kCRC32c); - const size_t original_size = dst->size(); - metaindex_handle_.EncodeTo(dst); - index_handle_.EncodeTo(dst); - dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding - PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); - PutFixed32(dst, static_cast(table_magic_number() >> 32)); - assert(dst->size() == original_size + kVersion0EncodedLength); +void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle) { + (void)footer_offset; // Future use + + assert(magic_number != Footer::kNullTableMagicNumber); + assert(IsSupportedFormatVersion(format_version)); + + char* part2; + char* part3; + if (format_version > 0) { + slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength); + // Generate parts 1 and 3 + char* cur = data_.data(); + // Part 1 + *(cur++) = checksum_type; + // Part 2 + part2 = cur; + // Skip over part 2 for now + cur += kFooterPart2Size; + // Part 3 + part3 = cur; + EncodeFixed32(cur, format_version); + cur += 4; + EncodeFixed64(cur, magic_number); + assert(cur + 8 == slice_.data() + slice_.size()); } else { - const size_t original_size = dst->size(); - dst->push_back(static_cast(checksum_)); - metaindex_handle_.EncodeTo(dst); - index_handle_.EncodeTo(dst); - dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding - PutFixed32(dst, version()); - PutFixed32(dst, 
static_cast(table_magic_number() & 0xffffffffu)); - PutFixed32(dst, static_cast(table_magic_number() >> 32)); - assert(dst->size() == original_size + kNewVersionsEncodedLength); + slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength); + // Legacy SST files use kCRC32c checksum but it's not stored in footer. + assert(checksum_type == kNoChecksum || checksum_type == kCRC32c); + // Generate part 3 (part 1 empty, skip part 2 for now) + part2 = data_.data(); + part3 = part2 + kFooterPart2Size; + char* cur = part3; + // Use legacy magic numbers to indicate format_version=0, for + // compatibility. No other cases should use format_version=0. + EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number)); + assert(cur + 8 == slice_.data() + slice_.size()); + } + + { + char* cur = part2; + cur = metaindex_handle.EncodeTo(cur); + cur = index_handle.EncodeTo(cur); + // Zero pad remainder + std::fill(cur, part3, char{0}); } } -Footer::Footer(uint64_t _table_magic_number, uint32_t _version) - : version_(_version), - checksum_(kCRC32c), - table_magic_number_(_table_magic_number) { - // This should be guaranteed by constructor callers - assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); -} +Status Footer::DecodeFrom(Slice input, uint64_t input_offset) { + (void)input_offset; // Future use -Status Footer::DecodeFrom(Slice* input) { - assert(!HasInitializedTableMagicNumber()); + // Only decode to unused Footer + assert(table_magic_number_ == kNullTableMagicNumber); assert(input != nullptr); - assert(input->size() >= kMinEncodedLength); + assert(input.size() >= kMinEncodedLength); - const char* magic_ptr = - input->data() + input->size() - kMagicNumberLengthByte; - const uint32_t magic_lo = DecodeFixed32(magic_ptr); - const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); - uint64_t magic = ((static_cast(magic_hi) << 32) | - (static_cast(magic_lo))); + const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte; + uint64_t magic = 
DecodeFixed64(magic_ptr); // We check for legacy formats here and silently upconvert them bool legacy = IsLegacyFooterFormat(magic); if (legacy) { magic = UpconvertLegacyFooterFormat(magic); } - set_table_magic_number(magic); + table_magic_number_ = magic; + block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic); + // Parse Part3 if (legacy) { // The size is already asserted to be at least kMinEncodedLength // at the beginning of the function - input->remove_prefix(input->size() - kVersion0EncodedLength); - version_ = 0 /* legacy */; - checksum_ = kCRC32c; + input.remove_prefix(input.size() - kVersion0EncodedLength); + format_version_ = 0 /* legacy */; + checksum_type_ = kCRC32c; } else { - version_ = DecodeFixed32(magic_ptr - 4); - // Footer version 1 and higher will always occupy exactly this many bytes. - // It consists of the checksum type, two block handles, padding, - // a version number, and a magic number - if (input->size() < kNewVersionsEncodedLength) { - return Status::Corruption("input is too short to be an sstable"); - } else { - input->remove_prefix(input->size() - kNewVersionsEncodedLength); + const char* part3_ptr = magic_ptr - 4; + format_version_ = DecodeFixed32(part3_ptr); + if (!IsSupportedFormatVersion(format_version_)) { + return Status::Corruption("Corrupt or unsupported format_version: " + + ROCKSDB_NAMESPACE::ToString(format_version_)); } - uint32_t chksum; - if (!GetVarint32(input, &chksum)) { - return Status::Corruption("bad checksum type"); + // All known format versions >= 1 occupy exactly this many bytes. 
+ if (input.size() < kNewVersionsEncodedLength) { + return Status::Corruption("Input is too short to be an SST file"); } - checksum_ = static_cast(chksum); - } + uint64_t adjustment = input.size() - kNewVersionsEncodedLength; + input.remove_prefix(adjustment); - Status result = metaindex_handle_.DecodeFrom(input); - if (result.ok()) { - result = index_handle_.DecodeFrom(input); + // Parse Part1 + char chksum = input.data()[0]; + checksum_type_ = lossless_cast(chksum); + if (!IsSupportedChecksumType(checksum_type())) { + return Status::Corruption( + "Corrupt or unsupported checksum type: " + + ROCKSDB_NAMESPACE::ToString(lossless_cast(chksum))); + } + // Consume checksum type field + input.remove_prefix(1); } + + // Parse Part2 + Status result = metaindex_handle_.DecodeFrom(&input); if (result.ok()) { - // We skip over any leftover data (just padding for now) in "input" - const char* end = magic_ptr + kMagicNumberLengthByte; - *input = Slice(end, input->data() + input->size() - end); + result = index_handle_.DecodeFrom(&input); } return result; + // Padding in part2 is ignored } std::string Footer::ToString() const { @@ -269,19 +336,17 @@ result.append("table_magic_number: " + ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); } else { - result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) + - "\n "); result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); result.append("index handle: " + index_handle_.ToString() + "\n "); - result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) + - "\n "); result.append("table_magic_number: " + ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n "); + result.append("format version: " + + ROCKSDB_NAMESPACE::ToString(format_version_) + "\n "); } return result; } -Status ReadFooterFromFile(RandomAccessFileReader* file, +Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, Footer* footer, 
uint64_t enforce_table_magic_number) { @@ -292,18 +357,30 @@ file->file_name()); } - char footer_space[Footer::kMaxEncodedLength]; + std::string footer_buf; + AlignedBuf internal_buf; Slice footer_input; - size_t read_offset = - (file_size > Footer::kMaxEncodedLength) - ? static_cast(file_size - Footer::kMaxEncodedLength) - : 0; + uint64_t read_offset = (file_size > Footer::kMaxEncodedLength) + ? file_size - Footer::kMaxEncodedLength + : 0; Status s; + // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now, + // there is no readahead for point lookups, so TryReadFromCache will fail if + // the required data is not in the prefetch buffer. Once deadline is enabled + // for iterator, TryReadFromCache might do a readahead. Revisit to see if we + // need to pass a timeout at that point if (prefetch_buffer == nullptr || - !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, - &footer_input)) { - s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, - footer_space); + !prefetch_buffer->TryReadFromCache(IOOptions(), file, read_offset, + Footer::kMaxEncodedLength, + &footer_input, nullptr)) { + if (file->use_direct_io()) { + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, nullptr, &internal_buf); + } else { + footer_buf.reserve(Footer::kMaxEncodedLength); + s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, + &footer_input, &footer_buf[0], nullptr); + } if (!s.ok()) return s; } @@ -316,7 +393,7 @@ file->file_name()); } - s = footer->DecodeFrom(&footer_input); + s = footer->DecodeFrom(footer_input, read_offset); if (!s.ok()) { return s; } @@ -330,117 +407,134 @@ return Status::OK(); } +namespace { +// Custom handling for the last byte of a block, to avoid invoking streaming +// API to get an effective block checksum. This function is its own inverse +// because it uses xor. 
+inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) { + // This strategy bears some resemblance to extending a CRC checksum by one + // more byte, except we don't need to re-mix the input checksum as long as + // we do this step only once (per checksum). + const uint32_t kRandomPrime = 0x6b9083d9; + return checksum ^ lossless_cast(last_byte) * kRandomPrime; +} +} // namespace + +uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, + size_t data_size) { + switch (type) { + case kCRC32c: + return crc32c::Mask(crc32c::Value(data, data_size)); + case kxxHash: + return XXH32(data, data_size, /*seed*/ 0); + case kxxHash64: + return Lower32of64(XXH64(data, data_size, /*seed*/ 0)); + case kXXH3: { + if (data_size == 0) { + // Special case because of special handling for last byte, not + // present in this case. Can be any value different from other + // small input size checksums. + return 0; + } else { + // See corresponding code in ComputeBuiltinChecksumWithLastByte + uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1)); + return ModifyChecksumForLastByte(v, data[data_size - 1]); + } + } + default: // including kNoChecksum + return 0; + } +} + +uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, + size_t data_size, char last_byte) { + switch (type) { + case kCRC32c: { + uint32_t crc = crc32c::Value(data, data_size); + // Extend to cover last byte (compression type) + crc = crc32c::Extend(crc, &last_byte, 1); + return crc32c::Mask(crc); + } + case kxxHash: { + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, data, data_size); + // Extend to cover last byte (compression type) + XXH32_update(state, &last_byte, 1); + uint32_t v = XXH32_digest(state); + XXH32_freeState(state); + return v; + } + case kxxHash64: { + XXH64_state_t* const state = XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, data, data_size); + // Extend to cover last 
byte (compression type) + XXH64_update(state, &last_byte, 1); + uint32_t v = Lower32of64(XXH64_digest(state)); + XXH64_freeState(state); + return v; + } + case kXXH3: { + // XXH3 is a complicated hash function that is extremely fast on + // contiguous input, but that makes its streaming support rather + // complex. It is worth custom handling of the last byte (`type`) + // in order to avoid allocating a large state object and bringing + // that code complexity into CPU working set. + uint32_t v = Lower32of64(XXH3_64bits(data, data_size)); + return ModifyChecksumForLastByte(v, last_byte); + } + default: // including kNoChecksum + return 0; + } +} + Status UncompressBlockContentsForCompressionType( const UncompressionInfo& uncompression_info, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { - CacheAllocationPtr ubuf; + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { + Status ret = Status::OK(); assert(uncompression_info.type() != kNoCompression && "Invalid compression type"); - StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( - ioptions.env, ioptions.statistics)); - int decompress_size = 0; - switch (uncompression_info.type()) { - case kSnappyCompression: { - size_t ulength = 0; - static char snappy_corrupt_msg[] = - "Snappy not supported or corrupted Snappy compressed block contents"; - if (!Snappy_GetUncompressedLength(data, n, &ulength)) { - return Status::Corruption(snappy_corrupt_msg); - } - ubuf = AllocateBlock(ulength, allocator); - if (!Snappy_Uncompress(data, n, ubuf.get())) { - return Status::Corruption(snappy_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), ulength); - break; + StopWatchNano timer(ioptions.clock, + ShouldReportDetailedTime(ioptions.env, ioptions.stats)); + size_t uncompressed_size = 0; + CacheAllocationPtr ubuf = + UncompressData(uncompression_info, data, n, &uncompressed_size, + 
GetCompressFormatForVersion(format_version), allocator); + if (!ubuf) { + if (!CompressionTypeSupported(uncompression_info.type())) { + return Status::NotSupported( + "Unsupported compression method for this build", + CompressionTypeToString(uncompression_info.type())); + } else { + return Status::Corruption( + "Corrupted compressed block contents", + CompressionTypeToString(uncompression_info.type())); } - case kZlibCompression: - ubuf = Zlib_Uncompress( - uncompression_info, data, n, &decompress_size, - GetCompressFormatForVersion(kZlibCompression, format_version), - allocator); - if (!ubuf) { - static char zlib_corrupt_msg[] = - "Zlib not supported or corrupted Zlib compressed block contents"; - return Status::Corruption(zlib_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kBZip2Compression: - ubuf = BZip2_Uncompress( - data, n, &decompress_size, - GetCompressFormatForVersion(kBZip2Compression, format_version), - allocator); - if (!ubuf) { - static char bzip2_corrupt_msg[] = - "Bzip2 not supported or corrupted Bzip2 compressed block contents"; - return Status::Corruption(bzip2_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kLZ4Compression: - ubuf = LZ4_Uncompress( - uncompression_info, data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4Compression, format_version), - allocator); - if (!ubuf) { - static char lz4_corrupt_msg[] = - "LZ4 not supported or corrupted LZ4 compressed block contents"; - return Status::Corruption(lz4_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kLZ4HCCompression: - ubuf = LZ4_Uncompress( - uncompression_info, data, n, &decompress_size, - GetCompressFormatForVersion(kLZ4HCCompression, format_version), - allocator); - if (!ubuf) { - static char lz4hc_corrupt_msg[] = - "LZ4HC not supported or corrupted LZ4HC compressed block contents"; - return Status::Corruption(lz4hc_corrupt_msg); - 
} - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kXpressCompression: - // XPRESS allocates memory internally, thus no support for custom - // allocator. - ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); - if (!ubuf) { - static char xpress_corrupt_msg[] = - "XPRESS not supported or corrupted XPRESS compressed block " - "contents"; - return Status::Corruption(xpress_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - case kZSTD: - case kZSTDNotFinalCompression: - ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size, - allocator); - if (!ubuf) { - static char zstd_corrupt_msg[] = - "ZSTD not supported or corrupted ZSTD compressed block contents"; - return Status::Corruption(zstd_corrupt_msg); - } - *contents = BlockContents(std::move(ubuf), decompress_size); - break; - default: - return Status::Corruption("bad block type"); } - if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { - RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, + *contents = BlockContents(std::move(ubuf), uncompressed_size); + + if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { + RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, + RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED, contents->data.size()); - RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); + RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); - return Status::OK(); + TEST_SYNC_POINT_CALLBACK( + "UncompressBlockContentsForCompressionType:TamperWithReturnValue", + static_cast(&ret)); + TEST_SYNC_POINT_CALLBACK( + "UncompressBlockContentsForCompressionType:" + "TamperWithDecompressionOutput", + static_cast(contents)); + + return ret; } // @@ -453,13 +547,27 @@ Status UncompressBlockContents(const UncompressionInfo& uncompression_info, const char* data, size_t n, 
BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { assert(data[n] != kNoCompression); - assert(data[n] == uncompression_info.type()); + assert(data[n] == static_cast(uncompression_info.type())); return UncompressBlockContentsForCompressionType(uncompression_info, data, n, contents, format_version, ioptions, allocator); } +// Replace the contents of db_host_id with the actual hostname, if db_host_id +// matches the keyword kHostnameForDbHostId +Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { + assert(db_host_id); + if (*db_host_id == kHostnameForDbHostId) { + Status s = env->GetHostNameString(db_host_id); + if (!s.ok()) { + db_host_id->clear(); + } + return s; + } + + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/format.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/format.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,21 +8,21 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include + +#include +#include #include + #include "file/file_prefetch_buffer.h" #include "file/random_access_file_reader.h" - -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/status.h" -#include "rocksdb/table.h" - #include "memory/memory_allocator.h" #include "options/cf_options.h" #include "port/malloc.h" #include "port/port.h" // noexcept -#include "table/persistent_cache_options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "util/hash.h" namespace ROCKSDB_NAMESPACE { @@ -32,12 +32,14 @@ extern bool ShouldReportDetailedTime(Env* env, Statistics* stats); // the length of the magic number in bytes. 
-const int kMagicNumberLengthByte = 8; +constexpr uint32_t kMagicNumberLengthByte = 8; // BlockHandle is a pointer to the extent of a file that stores a data // block or a meta block. class BlockHandle { public: + // Creates a block handle with special values indicating "uninitialized," + // distinct from the "null" block handle. BlockHandle(); BlockHandle(uint64_t offset, uint64_t size); @@ -50,6 +52,7 @@ void set_size(uint64_t _size) { size_ = _size; } void EncodeTo(std::string* dst) const; + char* EncodeTo(char* dst) const; Status DecodeFrom(Slice* input); Status DecodeSizeFrom(uint64_t offset, Slice* input); @@ -63,7 +66,14 @@ static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; } // Maximum encoding length of a BlockHandle - enum { kMaxEncodedLength = 10 + 10 }; + static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length; + + inline bool operator==(const BlockHandle& rhs) const { + return offset_ == rhs.offset_ && size_ == rhs.size_; + } + inline bool operator!=(const BlockHandle& rhs) const { + return !(*this == rhs); + } private: uint64_t offset_; @@ -101,140 +111,160 @@ std::string ToString(bool hex, bool have_first_key) const; }; -inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, - uint32_t version) { -#ifdef NDEBUG - (void)compression_type; -#endif - // snappy is not versioned - assert(compression_type != kSnappyCompression && - compression_type != kXpressCompression && - compression_type != kNoCompression); - // As of version 2, we encode compressed block with +inline uint32_t GetCompressFormatForVersion(uint32_t format_version) { + // As of format_version 2, we encode compressed block with // compress_format_version == 2. Before that, the version is 1. // DO NOT CHANGE THIS FUNCTION, it affects disk format - return version >= 2 ? 2 : 1; + return format_version >= 2 ? 
2 : 1; } -inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 5; +constexpr uint32_t kLatestFormatVersion = 5; + +inline bool IsSupportedFormatVersion(uint32_t version) { + return version <= kLatestFormatVersion; } -// Footer encapsulates the fixed information stored at the tail -// end of every table file. +// Footer encapsulates the fixed information stored at the tail end of every +// SST file. In general, it should only include things that cannot go +// elsewhere under the metaindex block. For example, checksum_type is +// required for verifying metaindex block checksum (when applicable), but +// index block handle can easily go in metaindex block (possible future). +// See also FooterBuilder below. class Footer { public: - // Constructs a footer without specifying its table magic number. - // In such case, the table magic number of such footer should be - // initialized via @ReadFooterFromFile(). - // Use this when you plan to load Footer with DecodeFrom(). Never use this - // when you plan to EncodeTo. - Footer() : Footer(kInvalidTableMagicNumber, 0) {} - - // Use this constructor when you plan to write out the footer using - // EncodeTo(). Never use this constructor with DecodeFrom(). - Footer(uint64_t table_magic_number, uint32_t version); - - // The version of the footer in this file - uint32_t version() const { return version_; } - - // The checksum type used in this file - ChecksumType checksum() const { return checksum_; } - void set_checksum(const ChecksumType c) { checksum_ = c; } + // Create empty. Populate using DecodeFrom. + Footer() {} + + // Deserialize a footer (populate fields) from `input` and check for various + // corruptions. `input_offset` is the offset within the target file of + // `input` buffer (future use). + Status DecodeFrom(Slice input, uint64_t input_offset); + + // Table magic number identifies file as RocksDB SST file and which kind of + // SST format is use. 
+ uint64_t table_magic_number() const { return table_magic_number_; } + + // A version (footer and more) within a kind of SST. (It would add more + // unnecessary complexity to separate footer versions and + // BBTO::format_version.) + uint32_t format_version() const { return format_version_; } - // The block handle for the metaindex block of the table + // Block handle for metaindex block. const BlockHandle& metaindex_handle() const { return metaindex_handle_; } - void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; } - // The block handle for the index block of the table + // Block handle for (top-level) index block. const BlockHandle& index_handle() const { return index_handle_; } - void set_index_handle(const BlockHandle& h) { index_handle_ = h; } + // Checksum type used in the file. + ChecksumType checksum_type() const { + return static_cast(checksum_type_); + } - uint64_t table_magic_number() const { return table_magic_number_; } + // Block trailer size used by file with this footer (e.g. 5 for block-based + // table and 0 for plain table). This is inferred from magic number so + // not in the serialized form. + inline size_t GetBlockTrailerSize() const { return block_trailer_size_; } - void EncodeTo(std::string* dst) const; + // Convert this object to a human readable form + std::string ToString() const; - // Set the current footer based on the input slice. + // Encoded lengths of Footers. Bytes for serialized Footer will always be + // >= kMinEncodedLength and <= kMaxEncodedLength. // - // REQUIRES: table_magic_number_ is not set (i.e., - // HasInitializedTableMagicNumber() is true). The function will initialize the - // magic number - Status DecodeFrom(Slice* input); - - // Encoded length of a Footer. Note that the serialization of a Footer will - // always occupy at least kMinEncodedLength bytes. If fields are changed - // the version number should be incremented and kMaxEncodedLength should be - // increased accordingly. 
- enum { - // Footer version 0 (legacy) will always occupy exactly this many bytes. - // It consists of two block handles, padding, and a magic number. - kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8, - // Footer of versions 1 and higher will always occupy exactly this many - // bytes. It consists of the checksum type, two block handles, padding, - // a version number (bigger than 1), and a magic number - kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, - kMinEncodedLength = kVersion0EncodedLength, - kMaxEncodedLength = kNewVersionsEncodedLength, - }; + // Footer version 0 (legacy) will always occupy exactly this many bytes. + // It consists of two block handles, padding, and a magic number. + static constexpr uint32_t kVersion0EncodedLength = + 2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte; + static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength; + + // Footer of versions 1 and higher will always occupy exactly this many + // bytes. It originally consisted of the checksum type, two block handles, + // padding (to maximum handle encoding size), a format version number, and a + // magic number. + static constexpr uint32_t kNewVersionsEncodedLength = + 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte; + static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength; - static const uint64_t kInvalidTableMagicNumber = 0; + static constexpr uint64_t kNullTableMagicNumber = 0; - // convert this object to a human readable form - std::string ToString() const; + static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU; private: - // REQUIRES: magic number wasn't initialized. 
- void set_table_magic_number(uint64_t magic_number) { - assert(!HasInitializedTableMagicNumber()); - table_magic_number_ = magic_number; - } + static constexpr int kInvalidChecksumType = + (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum; - // return true if @table_magic_number_ is set to a value different - // from @kInvalidTableMagicNumber. - bool HasInitializedTableMagicNumber() const { - return (table_magic_number_ != kInvalidTableMagicNumber); - } - - uint32_t version_; - ChecksumType checksum_; + uint64_t table_magic_number_ = kNullTableMagicNumber; + uint32_t format_version_ = kInvalidFormatVersion; BlockHandle metaindex_handle_; BlockHandle index_handle_; - uint64_t table_magic_number_ = 0; + int checksum_type_ = kInvalidChecksumType; + uint8_t block_trailer_size_ = 0; +}; + +// Builder for Footer +class FooterBuilder { + public: + // Run builder in inputs. This is a single step with lots of parameters for + // efficiency (based on perf testing). + // * table_magic_number identifies file as RocksDB SST file and which kind of + // SST format is use. + // * format_version is a version for the footer and can also apply to other + // aspects of the SST file (see BlockBasedTableOptions::format_version). + // NOTE: To save complexity in the caller, when format_version == 0 and + // there is a corresponding legacy magic number to the one specified, the + // legacy magic number will be written for forward compatibility. + // * footer_offset is the file offset where the footer will be written + // (for future use). + // * checksum_type is for formats using block checksums. + // * index_handle is optional for some kinds of SST files. + void Build(uint64_t table_magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle = BlockHandle::NullBlockHandle()); + + // After Builder, get a Slice for the serialized Footer, backed by this + // FooterBuilder. 
+ const Slice& GetSlice() const { + assert(slice_.size()); + return slice_; + } + + private: + Slice slice_; + std::array data_; }; // Read the footer from file // If enforce_table_magic_number != 0, ReadFooterFromFile() will return // corruption if table_magic number is not equal to enforce_table_magic_number -Status ReadFooterFromFile(RandomAccessFileReader* file, +Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number = 0); -// 1-byte type + 32-bit crc -static const size_t kBlockTrailerSize = 5; - -// Make block size calculation for IO less error prone -inline uint64_t block_size(const BlockHandle& handle) { - return handle.size() + kBlockTrailerSize; -} - -inline CompressionType get_block_compression_type(const char* block_data, - size_t block_size) { - return static_cast(block_data[block_size]); -} +// Computes a checksum using the given ChecksumType. Sometimes we need to +// include one more input byte logically at the end but not part of the main +// data buffer. If data_size >= 1, then +// ComputeBuiltinChecksum(type, data, size) +// == +// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1]) +uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data, + size_t size); +uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, + size_t size, char last_byte); // Represents the contents of a block read from an SST file. Depending on how // it's created, it may or may not own the actual block bytes. As an example, // BlockContents objects representing data read from mmapped files only point // into the mmapped region. struct BlockContents { - Slice data; // Actual contents of data + // Points to block payload (without trailer) + Slice data; CacheAllocationPtr allocation; #ifndef NDEBUG - // Whether the block is a raw block, which contains compression type - // byte. 
It is only used for assertion. + // Whether there is a known trailer after what is pointed to by `data`. + // See BlockBasedTable::GetCompressionType. bool is_raw_block = false; #endif // NDEBUG @@ -256,14 +286,6 @@ // Returns whether the object has ownership of the underlying data bytes. bool own_bytes() const { return allocation.get() != nullptr; } - // It's the caller's responsibility to make sure that this is - // for raw block contents, which contains the compression - // byte in the end. - CompressionType get_compression_type() const { - assert(is_raw_block); - return get_block_compression_type(data.data(), data.size()); - } - // The additional memory space taken by the block data. size_t usable_size() const { if (allocation.get() != nullptr) { @@ -299,15 +321,6 @@ } }; -// Read the block identified by "handle" from "file". On failure -// return non-OK. On success fill *result and return OK. -extern Status ReadBlockContents( - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, - bool do_uncompress = true, const Slice& compression_dict = Slice(), - const PersistentCacheOptions& cache_options = PersistentCacheOptions()); - // The 'data' points to the raw block contents read in from file. // This method allocates a new heap buffer and the raw block // contents are uncompresed into this buffer. 
This buffer is @@ -319,7 +332,7 @@ const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); // This is an extension to UncompressBlockContents that accepts @@ -328,15 +341,17 @@ extern Status UncompressBlockContentsForCompressionType( const UncompressionInfo& info, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); + +// Replace db_host_id contents with the real hostname if necessary +extern Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id); // Implementation details follow. Clients should ignore, // TODO(andrewkr): we should prefer one way of representing a null/uninitialized // BlockHandle. Currently we use zeros for null and use negation-of-zeros for // uninitialized. -inline BlockHandle::BlockHandle() - : BlockHandle(~static_cast(0), ~static_cast(0)) {} +inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {} inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size) : offset_(_offset), size_(_size) {} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,15 +4,17 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "table/get_context.h" + +#include "db/blob//blob_fetcher.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/read_callback.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" -#include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -38,13 +40,17 @@ } // namespace -GetContext::GetContext( - const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, const Slice& user_key, - PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + std::string* timestamp, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, + uint64_t tracing_get_id, BlobFetcher* blob_fetcher) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -52,23 +58,38 @@ state_(init_state), user_key_(user_key), pinnable_val_(pinnable_val), + timestamp_(timestamp), value_found_(value_found), merge_context_(merge_context), max_covering_tombstone_seq_(_max_covering_tombstone_seq), - env_(env), + clock_(clock), seq_(seq), replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), do_merge_(do_merge), is_blob_index_(is_blob_index), - tracing_get_id_(tracing_get_id) { + 
tracing_get_id_(tracing_get_id), + blob_fetcher_(blob_fetcher) { if (seq_) { *seq_ = kMaxSequenceNumber; } sample_ = should_sample_file_read(); } +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher) + : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key, + pinnable_val, nullptr, value_found, merge_context, do_merge, + _max_covering_tombstone_seq, clock, seq, _pinned_iters_mgr, + callback, is_blob_index, tracing_get_id, blob_fetcher) {} + // Called from TableCache::Get and Table::Get when file/block in which // key may exist are not there in TableCache/BlockCache respectively. 
In this // case we can't guarantee that key does not exist and are not permitted to do @@ -138,6 +159,10 @@ if (get_context_stats_.num_cache_add > 0) { RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add); } + if (get_context_stats_.num_cache_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD_REDUNDANT, + get_context_stats_.num_cache_add_redundant); + } if (get_context_stats_.num_cache_bytes_write > 0) { RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE, get_context_stats_.num_cache_bytes_write); @@ -146,6 +171,10 @@ RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD, get_context_stats_.num_cache_index_add); } + if (get_context_stats_.num_cache_index_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD_REDUNDANT, + get_context_stats_.num_cache_index_add_redundant); + } if (get_context_stats_.num_cache_index_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT, get_context_stats_.num_cache_index_bytes_insert); @@ -154,6 +183,10 @@ RecordTick(statistics_, BLOCK_CACHE_DATA_ADD, get_context_stats_.num_cache_data_add); } + if (get_context_stats_.num_cache_data_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD_REDUNDANT, + get_context_stats_.num_cache_data_add_redundant); + } if (get_context_stats_.num_cache_data_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, get_context_stats_.num_cache_data_bytes_insert); @@ -162,6 +195,10 @@ RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, get_context_stats_.num_cache_filter_add); } + if (get_context_stats_.num_cache_filter_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD_REDUNDANT, + get_context_stats_.num_cache_filter_add_redundant); + } if (get_context_stats_.num_cache_filter_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, get_context_stats_.num_cache_filter_bytes_insert); @@ -170,6 +207,10 @@ RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD, 
get_context_stats_.num_cache_compression_dict_add); } + if (get_context_stats_.num_cache_compression_dict_add_redundant > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, + get_context_stats_.num_cache_compression_dict_add_redundant); + } if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) { RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, get_context_stats_.num_cache_compression_dict_bytes_insert); @@ -182,7 +223,7 @@ assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); - if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) { + if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) { *matched = true; // If the value is not in the snapshot, skip it if (!CheckCallback(parsed_key.sequence)) { @@ -211,9 +252,12 @@ assert(state_ == kNotFound || state_ == kMerge); if (type == kTypeBlobIndex && is_blob_index_ == nullptr) { // Blob value not supported. Stop. 
- state_ = kBlobIndex; + state_ = kUnexpectedBlobIndex; return false; } + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } if (kNotFound == state_) { state_ = kFound; if (do_merge_) { @@ -224,7 +268,6 @@ } else { TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); - // Otherwise copy the value pinnable_val_->PinSelf(value); } @@ -233,35 +276,57 @@ // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of // merge_context_->operand_list - push_operand(value, value_pinner); + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + push_operand(blob_value, nullptr); + } else { + push_operand(value, value_pinner); + } } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); - state_ = kFound; - if (do_merge_) { - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + state_ = kFound; + if (do_merge_) { + Merge(&blob_value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(blob_value, nullptr); } } else { - // It means this function is called as part of DB GetMergeOperands - // API and the current value should be part of - // merge_context_->operand_list - push_operand(value, value_pinner); + state_ = kFound; + if (do_merge_) { + Merge(&value); + } else { + // It means this function is called as 
part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); + } } } - if (is_blob_index_ != nullptr) { - *is_blob_index_ = (type == kTypeBlobIndex); + if (state_ == kFound) { + size_t ts_sz = ucmp_->timestamp_size(); + if (ts_sz > 0 && timestamp_ != nullptr) { + Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz); + timestamp_->assign(ts.data(), ts.size()); + } } return false; case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: // TODO(noetzli): Verify correctness once merge of single-deletes @@ -271,20 +336,9 @@ state_ = kDeleted; } else if (kMerge == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - // If do_merge_ = false then the current value shouldn't be part of - // merge_context_->operand_list - } + Merge(nullptr); + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } return false; @@ -297,20 +351,7 @@ merge_operator_->ShouldMerge( merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - // do_merge_ = true this is the case where this function is called - // as part of DB Get API hence merge operators should be merged. 
- if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - } + Merge(nullptr); return false; } return true; @@ -325,6 +366,39 @@ return false; } +void GetContext::Merge(const Slice* value) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, value, merge_context_->GetOperands(), + pinnable_val_->GetSelf(), logger_, statistics_, clock_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + } +} + +bool GetContext::GetBlobValue(const Slice& blob_index, + PinnableSlice* blob_value) { + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + constexpr uint64_t* bytes_read = nullptr; + + Status status = blob_fetcher_->FetchBlob( + user_key_, blob_index, prefetch_buffer, blob_value, bytes_read); + if (!status.ok()) { + if (status.IsIncomplete()) { + MarkKeyMayExist(); + return false; + } + state_ = kCorrupt; + return false; + } + *is_blob_index_ = false; + return true; +} + void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && value_pinner != nullptr) { diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/get_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/get_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,17 +5,20 @@ #pragma once #include -#include "db/dbformat.h" -#include "db/merge_context.h" + #include "db/read_callback.h" -#include "rocksdb/env.h" -#include "rocksdb/statistics.h" #include "rocksdb/types.h" -#include "table/block_based/block.h" 
namespace ROCKSDB_NAMESPACE { +class BlobFetcher; +class Comparator; +class Logger; class MergeContext; +class MergeOperator; class PinnedIteratorsManager; +class Statistics; +class SystemClock; +struct ParsedInternalKey; // Data structure for accumulating statistics during a point lookup. At the // end of the point lookup, the corresponding ticker stats are updated. This @@ -33,15 +36,25 @@ uint64_t num_cache_bytes_read = 0; uint64_t num_cache_miss = 0; uint64_t num_cache_add = 0; + uint64_t num_cache_add_redundant = 0; uint64_t num_cache_bytes_write = 0; uint64_t num_cache_index_add = 0; + uint64_t num_cache_index_add_redundant = 0; uint64_t num_cache_index_bytes_insert = 0; uint64_t num_cache_data_add = 0; + uint64_t num_cache_data_add_redundant = 0; uint64_t num_cache_data_bytes_insert = 0; uint64_t num_cache_filter_add = 0; + uint64_t num_cache_filter_add_redundant = 0; uint64_t num_cache_filter_bytes_insert = 0; uint64_t num_cache_compression_dict_add = 0; + uint64_t num_cache_compression_dict_add_redundant = 0; uint64_t num_cache_compression_dict_bytes_insert = 0; + // MultiGet stats. 
+ uint64_t num_filter_read = 0; + uint64_t num_index_read = 0; + uint64_t num_data_read = 0; + uint64_t num_sst_read = 0; }; // A class to hold context about a point lookup, such as pointer to value @@ -61,7 +74,7 @@ kDeleted, kCorrupt, kMerge, // saver contains the current merge result (the operands) - kBlobIndex, + kUnexpectedBlobIndex, }; GetContextStats get_context_stats_; @@ -89,11 +102,21 @@ Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, MergeContext* merge_context, bool do_merge, - SequenceNumber* max_covering_tombstone_seq, Env* env, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, + SequenceNumber* seq = nullptr, + PinnedIteratorsManager* _pinned_iters_mgr = nullptr, + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); + GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* value, + std::string* timestamp, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, - uint64_t tracing_get_id = 0); + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); GetContext() = delete; @@ -150,6 +173,9 @@ void push_operand(const Slice& value, Cleanable* value_pinner); private: + void Merge(const Slice* value); + bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value); + const Comparator* ucmp_; const MergeOperator* merge_operator_; // the merge operations encountered; @@ -159,10 +185,11 @@ GetState state_; Slice user_key_; PinnableSlice* pinnable_val_; + std::string* timestamp_; bool* value_found_; // Is value set correctly? 
Used by KeyMayExist MergeContext* merge_context_; SequenceNumber* max_covering_tombstone_seq_; - Env* env_; + SystemClock* clock_; // If a key is found, seq_ will be set to the SequenceNumber of most recent // write to the key or kMaxSequenceNumber if unknown SequenceNumber* seq_; @@ -179,6 +206,7 @@ // Used for block cache tracing only. A tracing get id uniquely identifies a // Get or a MultiGet. const uint64_t tracing_get_id_; + BlobFetcher* blob_fetcher_; }; // Call this to replay a log and bring the get_context up to date. The replay diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/internal_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/internal_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/internal_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/internal_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,7 +7,9 @@ #pragma once #include + #include "db/dbformat.h" +#include "file/readahead_file_info.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" @@ -17,9 +19,17 @@ class PinnedIteratorsManager; +enum class IterBoundCheck : char { + kUnknown = 0, + kOutOfBound, + kInbound, +}; + struct IterateResult { Slice key; - bool may_be_out_of_upper_bound; + IterBoundCheck bound_check_result = IterBoundCheck::kUnknown; + // If false, PrepareValue() needs to be called before value(). + bool value_prepared = true; }; template @@ -52,6 +62,7 @@ // All Seek*() methods clear any error status() that the iterator had prior to // the call; after the seek, status() indicates only the error (if any) that // happened during the seek, not any past errors. + // 'target' contains user timestamp if timestamp is enabled. virtual void Seek(const Slice& target) = 0; // Position at the first key in the source that at or before target @@ -66,7 +77,7 @@ // Moves to the next entry in the source, and return result. 
Iterator // implementation should override this method to help methods inline better, - // or when MayBeOutOfUpperBound() is non-trivial. + // or when UpperBoundCheckResult() is non-trivial. // REQUIRES: Valid() virtual bool NextAndGetResult(IterateResult* result) { Next(); @@ -74,10 +85,11 @@ if (is_valid) { result->key = key(); // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual - // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // call. If an implementation has non-trivial UpperBoundCheckResult(), // it should also override NextAndGetResult(). - result->may_be_out_of_upper_bound = true; - assert(MayBeOutOfUpperBound()); + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = false; + assert(UpperBoundCheckResult() != IterBoundCheck::kOutOfBound); } return is_valid; } @@ -101,6 +113,7 @@ // the returned slice is valid only until the next modification of // the iterator. // REQUIRES: Valid() + // REQUIRES: PrepareValue() has been called if needed (see PrepareValue()). virtual TValue value() const = 0; // If an error has occurred, return it. Else return an ok status. @@ -108,21 +121,32 @@ // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // True if the iterator is invalidated because it reached a key that is above - // the iterator upper bound. Used by LevelIterator to decide whether it should - // stop or move on to the next file. - // Important: if iterator reached the end of the file without encountering any - // keys above the upper bound, IsOutOfBound() must return false. - virtual bool IsOutOfBound() { return false; } + // For some types of iterators, sometimes Seek()/Next()/SeekForPrev()/etc may + // load key but not value (to avoid the IO cost of reading the value from disk + // if it won't be not needed). This method loads the value in such situation. 
+ // + // Needs to be called before value() at least once after each iterator + // movement (except if IterateResult::value_prepared = true), for iterators + // created with allow_unprepared_value = true. + // + // Returns false if an error occurred; in this case Valid() is also changed + // to false, and status() is changed to non-ok. + // REQUIRES: Valid() + virtual bool PrepareValue() { return true; } // Keys return from this iterator can be smaller than iterate_lower_bound. virtual bool MayBeOutOfLowerBound() { return true; } - // Keys return from this iterator can be larger or equal to - // iterate_upper_bound. - virtual bool MayBeOutOfUpperBound() { return true; } + // If the iterator has checked the key against iterate_upper_bound, returns + // the result here. The function can be used by user of the iterator to skip + // their own checks. If Valid() = true, IterBoundCheck::kUnknown is always + // a valid value. If Valid() = false, IterBoundCheck::kOutOfBound indicates + // that the iterator is filtered out by upper bound checks. + virtual IterBoundCheck UpperBoundCheckResult() { + return IterBoundCheck::kUnknown; + } - // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont + // Pass the PinnedIteratorsManager to the Iterator, most Iterators don't // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager // they will implement this function and use the passed pointer to communicate @@ -143,12 +167,25 @@ // If true, this means that the Slice returned by value() is valid as long as // PinnedIteratorsManager::ReleasePinnedData is not called and the // Iterator is not deleted. + // REQUIRES: Same as for value(). 
virtual bool IsValuePinned() const { return false; } virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { return Status::NotSupported(""); } + // When iterator moves from one file to another file at same level, new file's + // readahead state (details of last block read) is updated with previous + // file's readahead state. This way internal readahead_size of Prefetch Buffer + // doesn't start from scratch and can fall back to 8KB with no prefetch if + // reads are not sequential. + // + // Default implementation is no-op and its implemented by iterators. + virtual void GetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {} + + // Default implementation is no-op and its implemented by iterators. + virtual void SetReadaheadState(ReadaheadFileInfo* /*readahead_file_info*/) {} + protected: void SeekForPrevImpl(const Slice& target, const Comparator* cmp) { Seek(target); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/iterator_wrapper.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/iterator_wrapper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/iterator_wrapper.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/iterator_wrapper.h 2025-05-19 16:14:27.000000000 +0000 @@ -70,11 +70,32 @@ assert(iter_); return iter_->status(); } + bool PrepareValue() { + assert(Valid()); + if (result_.value_prepared) { + return true; + } + if (iter_->PrepareValue()) { + result_.value_prepared = true; + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } void Next() { assert(iter_); valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } + bool NextAndGetResult(IterateResult* result) { + assert(iter_); + valid_ = iter_->NextAndGetResult(&result_); + *result = result_; + assert(!valid_ || iter_->status().ok()); + return valid_; + } void Prev() { assert(iter_); iter_->Prev(); @@ -106,9 +127,9 @@ return iter_->MayBeOutOfLowerBound(); } - bool 
MayBeOutOfUpperBound() { + IterBoundCheck UpperBoundCheckResult() { assert(Valid()); - return result_.may_be_out_of_upper_bound; + return result_.bound_check_result; } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { @@ -124,13 +145,31 @@ return iter_->IsValuePinned(); } + bool IsValuePrepared() const { + return result_.value_prepared; + } + + Slice user_key() const { + assert(Valid()); + return iter_->user_key(); + } + + void UpdateReadaheadState(InternalIteratorBase* old_iter) { + if (old_iter && iter_) { + ReadaheadFileInfo readahead_file_info; + old_iter->GetReadaheadState(&readahead_file_info); + iter_->SetReadaheadState(&readahead_file_info); + } + } + private: void Update() { valid_ = iter_->Valid(); if (valid_) { assert(iter_->status().ok()); result_.key = iter_->key(); - result_.may_be_out_of_upper_bound = true; + result_.bound_check_result = IterBoundCheck::kUnknown; + result_.value_prepared = false; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/merger_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/merger_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/merger_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/merger_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,12 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include #include +#include #include "table/merging_iterator.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/random.h" +#include "util/vector_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -24,7 +26,7 @@ std::vector ret; for (size_t i = 0; i < len; ++i) { - InternalKey ik(test::RandomHumanReadableString(&rnd_, string_len), 0, + InternalKey ik(rnd_.HumanReadableString(string_len), 0, ValueType::kTypeValue); ret.push_back(ik.Encode().ToString(false)); } @@ -44,8 +46,7 @@ } void SeekToRandom() { - InternalKey ik(test::RandomHumanReadableString(&rnd_, 5), 0, - ValueType::kTypeValue); + InternalKey ik(rnd_.HumanReadableString(5), 0, ValueType::kTypeValue); Seek(ik.Encode().ToString(false)); } @@ -101,14 +102,14 @@ std::vector small_iterators; for (size_t i = 0; i < num_iterators; ++i) { auto strings = GenerateStrings(strings_per_iterator, letters_per_string); - small_iterators.push_back(new test::VectorIterator(strings)); + small_iterators.push_back(new VectorIterator(strings, strings, &icomp_)); all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); } merging_iterator_.reset( NewMergingIterator(&icomp_, &small_iterators[0], static_cast(small_iterators.size()))); - single_iterator_.reset(new test::VectorIterator(all_keys_)); + single_iterator_.reset(new VectorIterator(all_keys_, all_keys_, &icomp_)); } InternalKeyComparator icomp_; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -28,8 +28,8 @@ namespace ROCKSDB_NAMESPACE { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { -typedef BinaryHeap MergerMaxIterHeap; -typedef BinaryHeap MergerMinIterHeap; +using MergerMaxIterHeap = 
BinaryHeap; +using MergerMinIterHeap = BinaryHeap; } // namespace const size_t kNumIterReserve = 4; @@ -40,20 +40,16 @@ InternalIterator** children, int n, bool is_arena_mode, bool prefix_seek_mode) : is_arena_mode_(is_arena_mode), + prefix_seek_mode_(prefix_seek_mode), + direction_(kForward), comparator_(comparator), current_(nullptr), - direction_(kForward), minHeap_(comparator_), - prefix_seek_mode_(prefix_seek_mode), pinned_iters_mgr_(nullptr) { children_.resize(n); for (int i = 0; i < n; i++) { children_[i].Set(children[i]); } - for (auto& child : children_) { - AddToMinHeapOrCheckStatus(&child); - } - current_ = CurrentForward(); } void considerStatus(Status s) { @@ -63,22 +59,20 @@ } virtual void AddIterator(InternalIterator* iter) { - assert(direction_ == kForward); children_.emplace_back(iter); if (pinned_iters_mgr_) { iter->SetPinnedItersMgr(pinned_iters_mgr_); } - auto new_wrapper = children_.back(); - AddToMinHeapOrCheckStatus(&new_wrapper); - if (new_wrapper.Valid()) { - current_ = CurrentForward(); - } + // Invalidate to ensure `Seek*()` is called to construct the heaps before + // use. + current_ = nullptr; } ~MergingIterator() override { for (auto& child : children_) { child.DeleteIter(is_arena_mode_); } + status_.PermitUncheckedError(); } bool Valid() const override { return current_ != nullptr && status_.ok(); } @@ -194,7 +188,8 @@ bool is_valid = Valid(); if (is_valid) { result->key = key(); - result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = current_->IsValuePrepared(); } return is_valid; } @@ -240,6 +235,17 @@ return current_->value(); } + bool PrepareValue() override { + assert(Valid()); + if (current_->PrepareValue()) { + return true; + } + + considerStatus(current_->status()); + assert(!status_.ok()); + return false; + } + // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result // from current child iterator. 
Potentially as long as one of child iterator // report out of bound is not possible, we know current key is within bound. @@ -249,9 +255,9 @@ return current_->MayBeOutOfLowerBound(); } - bool MayBeOutOfUpperBound() override { + IterBoundCheck UpperBoundCheckResult() override { assert(Valid()); - return current_->MayBeOutOfUpperBound(); + return current_->UpperBoundCheckResult(); } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { @@ -281,6 +287,10 @@ void InitMaxHeap(); bool is_arena_mode_; + bool prefix_seek_mode_; + // Which direction is the iterator moving? + enum Direction : uint8_t { kForward, kReverse }; + Direction direction_; const InternalKeyComparator* comparator_; autovector children_; @@ -290,14 +300,7 @@ IteratorWrapper* current_; // If any of the children have non-ok status, this is one of them. Status status_; - // Which direction is the iterator moving? - enum Direction { - kForward, - kReverse - }; - Direction direction_; MergerMinIterHeap minHeap_; - bool prefix_seek_mode_; // Max heap is used for reverse iteration, which is way less common than // forward. Lazily initialize it to save memory. 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/merging_iterator.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/merging_iterator.h 2025-05-19 16:14:27.000000000 +0000 @@ -9,14 +9,14 @@ #pragma once -#include "db/dbformat.h" +#include "rocksdb/slice.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { -class Comparator; -class Env; class Arena; +class InternalKeyComparator; + template class InternalIteratorBase; using InternalIterator = InternalIteratorBase; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.cc 2025-05-19 16:14:27.000000000 +0000 @@ -10,18 +10,28 @@ #include "block_fetcher.h" #include "db/table_properties_collector.h" #include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "rocksdb/options.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block_based/block.h" +#include "table/block_based/reader_common.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" #include "table/table_properties_internal.h" #include "test_util/sync_point.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { +const std::string kPropertiesBlockName = "rocksdb.properties"; +// Old property block name for backward compatibility +const std::string kPropertiesBlockOldName = "rocksdb.stats"; +const std::string kCompressionDictBlockName = "rocksdb.compression_dict"; +const std::string kRangeDelBlockName = "rocksdb.range_del"; + MetaIndexBuilder::MetaIndexBuilder() : 
meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} @@ -71,6 +81,7 @@ TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start", const_cast(&props)); + Add(TablePropertiesNames::kOriginalFileNumber, props.orig_file_number); Add(TablePropertiesNames::kRawKeySize, props.raw_key_size); Add(TablePropertiesNames::kRawValueSize, props.raw_value_size); Add(TablePropertiesNames::kDataSize, props.data_size); @@ -83,6 +94,7 @@ Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries); Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); @@ -96,6 +108,23 @@ if (props.file_creation_time > 0) { Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time); } + if (props.slow_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize, + props.slow_compression_estimated_data_size); + } + if (props.fast_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, + props.fast_compression_estimated_data_size); + } + if (!props.db_id.empty()) { + Add(TablePropertiesNames::kDbId, props.db_id); + } + if (!props.db_session_id.empty()) { + Add(TablePropertiesNames::kDbSessionId, props.db_session_id); + } + if (!props.db_host_id.empty()) { + Add(TablePropertiesNames::kDbHostId, props.db_host_id); + } if (!props.filter_policy_name.empty()) { Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); @@ -135,8 +164,8 @@ return properties_block_->Finish(); } -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name) { +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const 
std::string& name) { assert(method == "Add" || method == "Finish"); std::string msg = @@ -163,11 +192,11 @@ void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - const uint64_t blockRawBytes, const uint64_t blockCompressedBytesFast, - const uint64_t blockCompressedBytesSlow) { + const uint64_t block_raw_bytes, const uint64_t block_compressed_bytes_fast, + const uint64_t block_compressed_bytes_slow) { for (auto& collector : collectors) { - collector->BlockAdd(blockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + collector->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } } @@ -191,50 +220,48 @@ return all_succeeded; } -Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, - TableProperties** table_properties, bool verify_checksum, - BlockHandle* ret_block_handle, - CacheAllocationPtr* verification_buf, - bool /*compression_type_missing*/, - MemoryAllocator* memory_allocator) { +// FIXME: should be a parameter for reading table properties to use persistent +// cache? +Status ReadTablePropertiesHelper( + const ReadOptions& ro, const BlockHandle& handle, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ImmutableOptions& ioptions, + std::unique_ptr* table_properties, + MemoryAllocator* memory_allocator) { assert(table_properties); - Slice v = handle_value; - BlockHandle handle; - if (!handle.DecodeFrom(&v).ok()) { - return Status::InvalidArgument("Failed to decode properties block handle"); - } - + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. 
For this reason, we initially read + // and process without checksum verification, then later try checksum + // verification so that if it fails, we can copy to a temporary buffer with + // global seqno set to its original value, i.e. 0, and attempt checksum + // verification again. + ReadOptions modified_ro = ro; + modified_ro.verify_checksums = false; BlockContents block_contents; - ReadOptions read_options; - read_options.verify_checksums = verify_checksum; - Status s; - PersistentCacheOptions cache_options; - - BlockFetcher block_fetcher( - file, prefetch_buffer, footer, read_options, handle, &block_contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, - BlockType::kProperties, UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); - s = block_fetcher.ReadBlockContents(); - // property block is never compressed. Need to add uncompress logic if we are - // to compress it.. - + BlockFetcher block_fetcher(file, prefetch_buffer, footer, modified_ro, handle, + &block_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kProperties, + UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator); + Status s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } - Block properties_block(std::move(block_contents), - kDisableGlobalSequenceNumber); - DataBlockIter iter; - properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), - &iter); + // Unfortunately, Block::size() might not equal block_contents.data.size(), + // and Block hides block_contents + uint64_t block_size = block_contents.data.size(); + Block properties_block(std::move(block_contents)); + std::unique_ptr iter(properties_block.NewMetaIterator()); - auto new_table_properties = new TableProperties(); + std::unique_ptr new_table_properties{new TableProperties}; // All pre-defined properties of type uint64_t std::unordered_map predefined_uint64_properties = { + 
{TablePropertiesNames::kOriginalFileNumber, + &new_table_properties->orig_file_number}, {TablePropertiesNames::kDataSize, &new_table_properties->data_size}, {TablePropertiesNames::kIndexSize, &new_table_properties->index_size}, {TablePropertiesNames::kIndexPartitions, @@ -252,6 +279,8 @@ {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumFilterEntries, + &new_table_properties->num_filter_entries}, {TablePropertiesNames::kDeletedKeys, &new_table_properties->num_deletions}, {TablePropertiesNames::kMergeOperands, @@ -270,16 +299,20 @@ &new_table_properties->oldest_key_time}, {TablePropertiesNames::kFileCreationTime, &new_table_properties->file_creation_time}, + {TablePropertiesNames::kSlowCompressionEstimatedDataSize, + &new_table_properties->slow_compression_estimated_data_size}, + {TablePropertiesNames::kFastCompressionEstimatedDataSize, + &new_table_properties->fast_compression_estimated_data_size}, }; std::string last_key; - for (iter.SeekToFirstOrReport(); iter.Valid(); iter.NextOrReport()) { - s = iter.status(); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + s = iter->status(); if (!s.ok()) { break; } - auto key = iter.key().ToString(); + auto key = iter->key().ToString(); // properties block should be strictly sorted with no duplicate key. 
if (!last_key.empty() && BytewiseComparator()->Compare(key, last_key) <= 0) { @@ -288,11 +321,13 @@ } last_key = key; - auto raw_val = iter.value(); + auto raw_val = iter->value(); auto pos = predefined_uint64_properties.find(key); - new_table_properties->properties_offsets.insert( - {key, handle.offset() + iter.ValueOffset()}); + if (key == ExternalSstFilePropertyNames::kGlobalSeqno) { + new_table_properties->external_sst_file_global_seqno_offset = + handle.offset() + iter->ValueOffset(); + } if (pos != predefined_uint64_properties.end()) { if (key == TablePropertiesNames::kDeletedKeys || @@ -308,10 +343,16 @@ auto error_msg = "Detect malformed value in properties meta-block:" "\tkey: " + key + "\tval: " + raw_val.ToString(); - ROCKS_LOG_ERROR(ioptions.info_log, "%s", error_msg.c_str()); + ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); continue; } *(pos->second) = val; + } else if (key == TablePropertiesNames::kDbId) { + new_table_properties->db_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbSessionId) { + new_table_properties->db_session_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbHostId) { + new_table_properties->db_host_id = raw_val.ToString(); } else if (key == TablePropertiesNames::kFilterPolicy) { new_table_properties->filter_policy_name = raw_val.ToString(); } else if (key == TablePropertiesNames::kColumnFamilyName) { @@ -334,21 +375,28 @@ {key, raw_val.ToString()}); } } - if (s.ok()) { - *table_properties = new_table_properties; - if (ret_block_handle != nullptr) { - *ret_block_handle = handle; - } - if (verification_buf != nullptr) { - size_t len = static_cast(handle.size() + kBlockTrailerSize); - *verification_buf = - ROCKSDB_NAMESPACE::AllocateBlock(len, memory_allocator); - if (verification_buf->get() != nullptr) { - memcpy(verification_buf->get(), block_contents.data.data(), len); + + // Modified version of BlockFetcher checksum verification + // (See write_global_seqno comment above) + if 
(s.ok() && footer.GetBlockTrailerSize() > 0) { + s = VerifyBlockChecksum(footer.checksum_type(), properties_block.data(), + block_size, file->file_name(), handle.offset()); + if (s.IsCorruption()) { + if (new_table_properties->external_sst_file_global_seqno_offset != 0) { + std::string tmp_buf(properties_block.data(), + block_fetcher.GetBlockSizeWithTrailer()); + uint64_t global_seqno_offset = + new_table_properties->external_sst_file_global_seqno_offset - + handle.offset(); + EncodeFixed64(&tmp_buf[static_cast(global_seqno_offset)], 0); + s = VerifyBlockChecksum(footer.checksum_type(), tmp_buf.data(), + block_size, file->file_name(), handle.offset()); } } - } else { - delete new_table_properties; + } + + if (s.ok()) { + *table_properties = std::move(new_table_properties); } return s; @@ -356,111 +404,101 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - TableProperties** properties, - bool compression_type_missing, - MemoryAllocator* memory_allocator) { - // -- Read metaindex block + const ImmutableOptions& ioptions, + std::unique_ptr* properties, + MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer) { + BlockHandle block_handle; Footer footer; - auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, - &footer, table_magic_number); + Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + kPropertiesBlockName, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!s.ok()) { return s; } - auto metaindex_handle = footer.metaindex_handle(); - BlockContents metaindex_contents; - ReadOptions read_options; - read_options.verify_checksums = false; - PersistentCacheOptions cache_options; - - BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, 
BlockType::kMetaIndex, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - return s; - } - // property blocks are never compressed. Need to add uncompress logic if we - // are to compress it. - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator())); - - // -- Read property block - bool found_properties_block = true; - s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block); - if (!s.ok()) { - return s; - } - - TableProperties table_properties; - if (found_properties_block == true) { - s = ReadProperties( - meta_iter->value(), file, nullptr /* prefetch_buffer */, footer, - ioptions, properties, false /* verify_checksum */, - nullptr /* ret_block_hanel */, nullptr /* ret_block_contents */, - compression_type_missing, memory_allocator); + if (!block_handle.IsNull()) { + s = ReadTablePropertiesHelper(ReadOptions(), block_handle, file, + prefetch_buffer, footer, ioptions, properties, + memory_allocator); } else { s = Status::NotFound(); } - return s; } +Status FindOptionalMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle) { + assert(block_handle != nullptr); + meta_index_iter->Seek(meta_block_name); + if (meta_index_iter->status().ok()) { + if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) { + Slice v = meta_index_iter->value(); + return block_handle->DecodeFrom(&v); + } else if (meta_block_name == kPropertiesBlockName) { + // Have to try old name for compatibility + meta_index_iter->Seek(kPropertiesBlockOldName); + if (meta_index_iter->status().ok() && meta_index_iter->Valid() && + meta_index_iter->key() == kPropertiesBlockOldName) { + Slice v = meta_index_iter->value(); + return block_handle->DecodeFrom(&v); + } + } + } + // else + *block_handle = 
BlockHandle::NullBlockHandle(); + return meta_index_iter->status(); +} + Status FindMetaBlock(InternalIterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle) { - meta_index_iter->Seek(meta_block_name); - if (meta_index_iter->status().ok() && meta_index_iter->Valid() && - meta_index_iter->key() == meta_block_name) { - Slice v = meta_index_iter->value(); - return block_handle->DecodeFrom(&v); - } else { + Status s = + FindOptionalMetaBlock(meta_index_iter, meta_block_name, block_handle); + if (s.ok() && block_handle->IsNull()) { return Status::Corruption("Cannot find the meta block", meta_block_name); + } else { + return s; } } -Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, - BlockHandle* block_handle, - bool /*compression_type_missing*/, - MemoryAllocator* memory_allocator) { +Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer, + Footer* footer_out) { Footer footer; - auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, - &footer, table_magic_number); + IOOptions opts; + auto s = ReadFooterFromFile(opts, file, prefetch_buffer, file_size, &footer, + table_magic_number); if (!s.ok()) { return s; } + if (footer_out) { + *footer_out = footer; + } auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; - ReadOptions read_options; - read_options.verify_checksums = false; - PersistentCacheOptions cache_options; - BlockFetcher block_fetcher( - file, nullptr /* prefetch_buffer */, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, - false /* do decompression */, false /*maybe_compressed*/, - 
BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); - s = block_fetcher.ReadBlockContents(); + s = BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, + false /* do decompression */, false /*maybe_compressed*/, + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator) + .ReadBlockContents(); if (!s.ok()) { return s; } // meta blocks are never compressed. Need to add uncompress logic if we are to // compress it. - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), - BytewiseComparator())); + meta_iter.reset(metaindex_block.NewMetaIterator()); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -468,58 +506,29 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, - BlockContents* contents, bool /*compression_type_missing*/, + BlockContents* contents, MemoryAllocator* memory_allocator) { - Status status; - Footer footer; - status = ReadFooterFromFile(file, prefetch_buffer, file_size, &footer, - table_magic_number); - if (!status.ok()) { - return status; - } - - // Reading metaindex block - auto metaindex_handle = footer.metaindex_handle(); - BlockContents metaindex_contents; - ReadOptions read_options; - read_options.verify_checksums = false; - PersistentCacheOptions cache_options; - - BlockFetcher block_fetcher( - file, prefetch_buffer, footer, read_options, metaindex_handle, - &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, BlockType::kMetaIndex, - 
UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - status = block_fetcher.ReadBlockContents(); - if (!status.ok()) { - return status; - } - // meta block is never compressed. Need to add uncompress logic if we are to - // compress it. - - // Finding metablock - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); - - std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), - BytewiseComparator())); + // TableProperties requires special handling because of checksum issues. + // Call ReadTableProperties instead for that case. + assert(block_type != BlockType::kProperties); BlockHandle block_handle; - status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); - + Footer footer; + Status status = FindMetaBlockInFile( + file, file_size, table_magic_number, ioptions, meta_block_name, + &block_handle, memory_allocator, prefetch_buffer, &footer); if (!status.ok()) { return status; } - // Reading metablock - BlockFetcher block_fetcher2( - file, prefetch_buffer, footer, read_options, block_handle, contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); - return block_fetcher2.ReadBlockContents(); + return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + block_handle, contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), + PersistentCacheOptions::kEmpty, memory_allocator) + .ReadBlockContents(); } } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/meta_blocks.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/meta_blocks.h 2025-05-19 16:14:27.000000000 +0000 @@ -30,6 +30,12 @@ class 
RandomAccessFile; struct TableProperties; +// Meta block names for metaindex +extern const std::string kPropertiesBlockName; +extern const std::string kPropertiesBlockOldName; +extern const std::string kCompressionDictBlockName; +extern const std::string kRangeDelBlockName; + class MetaIndexBuilder { public: MetaIndexBuilder(const MetaIndexBuilder&) = delete; @@ -70,8 +76,8 @@ // Were we encounter any error occurs during user-defined statistics collection, // we'll write the warning message to info log. -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name); +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name); // Utility functions help table builder to trigger batch events for user // defined property collectors. @@ -86,8 +92,8 @@ void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow); // NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors. The collected properties will be added to `builder`. @@ -95,47 +101,49 @@ const std::vector>& collectors, Logger* info_log, PropertyBlockBuilder* builder); -// Read the properties from the table. +// Read table properties from a file using known BlockHandle. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. 
-Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, - TableProperties** table_properties, bool verify_checksum, - BlockHandle* block_handle, - CacheAllocationPtr* verification_buf, - bool compression_type_missing = false, - MemoryAllocator* memory_allocator = nullptr); +Status ReadTablePropertiesHelper( + const ReadOptions& ro, const BlockHandle& handle, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ImmutableOptions& ioptions, + std::unique_ptr* table_properties, + MemoryAllocator* memory_allocator = nullptr); -// Directly read the properties from the properties block of a plain table. +// Read table properties from the properties block of a plain table. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. -// certain tables do not have compression_type byte setup properly for -// uncompressed blocks, caller can request to reset compression type by -// passing compression_type_missing = true, the same applies to -// `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock` Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - TableProperties** properties, - bool compression_type_missing = false, - MemoryAllocator* memory_allocator = nullptr); + const ImmutableOptions& ioptions, + std::unique_ptr* properties, + MemoryAllocator* memory_allocator = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr); + +// Find the meta block from the meta index block. Returns OK and +// block_handle->IsNull() if not found. 
+Status FindOptionalMetaBlock(InternalIterator* meta_index_iter, + const std::string& meta_block_name, + BlockHandle* block_handle); -// Find the meta block from the meta index block. +// Find the meta block from the meta index block. Returns Corruption if not +// found. Status FindMetaBlock(InternalIterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle); // Find the meta block -Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, - BlockHandle* block_handle, - bool compression_type_missing = false, - MemoryAllocator* memory_allocator = nullptr); +Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, + const ImmutableOptions& ioptions, + const std::string& meta_block_name, + BlockHandle* block_handle, + MemoryAllocator* memory_allocator = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr, + Footer* footer_out = nullptr); // Read the specified meta block with name meta_block_name // from `file` and initialize `contents` with contents of this block. 
@@ -143,10 +151,9 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, - bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr); } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.cc 2025-05-19 16:14:27.000000000 +0000 @@ -16,21 +16,187 @@ namespace ROCKSDB_NAMESPACE { namespace mock { -namespace { +KVVector MakeMockFile(std::initializer_list l) { return KVVector(l); } -const InternalKeyComparator icmp_(BytewiseComparator()); +void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) { + InternalKeyComparator icmp(ucmp); + std::sort(kv_vector->begin(), kv_vector->end(), + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); +} -} // namespace +class MockTableReader : public TableReader { + public: + explicit MockTableReader(const KVVector& table) : table_(table) {} + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } -stl_wrappers::KVMap MakeMockFile( - std::initializer_list> l) { - return stl_wrappers::KVMap(l, 
stl_wrappers::LessOfComparator(&icmp_)); -} + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const KVVector& table_; +}; + +class MockTableIterator : public InternalIterator { + public: + explicit MockTableIterator(const KVVector& table) : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const override { return itr_ != table_.end(); } + + void SeekToFirst() override { itr_ = table_.begin(); } + + void SeekToLast() override { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) override { + KVPair target_pair(target.ToString(), ""); + InternalKeyComparator icmp(BytewiseComparator()); + itr_ = std::lower_bound(table_.begin(), table_.end(), target_pair, + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); + } + + void SeekForPrev(const Slice& target) override { + KVPair target_pair(target.ToString(), ""); + InternalKeyComparator icmp(BytewiseComparator()); + itr_ = std::upper_bound(table_.begin(), table_.end(), target_pair, + [icmp](KVPair a, KVPair b) -> bool { + return icmp.Compare(a.first, b.first) < 0; + }); + Prev(); + } + + void Next() override { ++itr_; } + + void Prev() override { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const override { return Slice(itr_->first); } + + Slice value() const override { return Slice(itr_->second); } + + Status status() const override { return Status::OK(); } + + private: + const KVVector& table_; + KVVector::const_iterator itr_; +}; + +class MockTableBuilder : public TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system, + MockTableFactory::MockCorruptionMode corrupt_mode = + 
MockTableFactory::kCorruptNone) + : id_(id), file_system_(file_system), corrupt_mode_(corrupt_mode) { + table_ = MakeMockFile({}); + } + + // REQUIRES: Either Finish() or Abandon() has been called. + ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + if (corrupt_mode_ == MockTableFactory::kCorruptValue) { + // Corrupt the value + table_.push_back({key.ToString(), value.ToString() + " "}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptKey) { + table_.push_back({key.ToString() + " ", value.ToString()}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptReorderKey) { + if (prev_key_.empty()) { + prev_key_ = key.ToString(); + prev_value_ = value.ToString(); + } else { + table_.push_back({key.ToString(), value.ToString()}); + table_.push_back({prev_key_, prev_value_}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } + } else { + table_.push_back({key.ToString(), value.ToString()}); + } + } + + // Return non-ok iff some error has been detected. + Status status() const override { return Status::OK(); } + + // Return non-ok iff some error happens during IO. 
+ IOStatus io_status() const override { return IOStatus::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size(); } + + TableProperties GetTableProperties() const override { + return TableProperties(); + } + + // Get file checksum + std::string GetFileChecksum() const override { return kUnknownFileChecksum; } + // Get file checksum function name + const char* GetFileChecksumFuncName() const override { + return kUnknownFileChecksumFuncName; + } + + private: + uint32_t id_; + std::string prev_key_; + std::string prev_value_; + MockTableFileSystem* file_system_; + int corrupt_mode_; + KVVector table_; +}; InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, - size_t /*compaction_readahead_size*/) { + size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { return new MockTableIterator(table_); } @@ -41,8 +207,10 @@ std::unique_ptr iter(new MockTableIterator(table_)); for (iter->Seek(key); iter->Valid(); iter->Next()) { ParsedInternalKey parsed_key; - if (!ParseInternalKey(iter->key(), &parsed_key)) { - return Status::Corruption(Slice()); + Status pik_status = + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + return pik_status; } bool dont_care __attribute__((__unused__)); @@ -58,14 +226,20 @@ return std::shared_ptr(new TableProperties()); } -MockTableFactory::MockTableFactory() : next_id_(1) {} +MockTableFactory::MockTableFactory() + : next_id_(1), corrupt_mode_(MockTableFactory::kCorruptNone) {} Status MockTableFactory::NewTableReader( + const ReadOptions& /*ro*/, const TableReaderOptions& /*table_reader_options*/, 
std::unique_ptr&& file, uint64_t /*file_size*/, std::unique_ptr* table_reader, bool /*prefetch_index_and_filter_in_cache*/) const { - uint32_t id = GetIDFromFile(file.get()); + uint32_t id; + Status s = GetIDFromFile(file.get(), &id); + if (!s.ok()) { + return s; + } MutexLock lock_guard(&file_system_.mutex); @@ -81,52 +255,54 @@ TableBuilder* MockTableFactory::NewTableBuilder( const TableBuilderOptions& /*table_builder_options*/, - uint32_t /*column_family_id*/, WritableFileWriter* file) const { - uint32_t id = GetAndWriteNextID(file); + WritableFileWriter* file) const { + uint32_t id; + Status s = GetAndWriteNextID(file, &id); + assert(s.ok()); - return new MockTableBuilder(id, &file_system_); + return new MockTableBuilder(id, &file_system_, corrupt_mode_); } Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, - stl_wrappers::KVMap file_contents) { - std::unique_ptr file; - auto s = env->NewWritableFile(fname, &file, EnvOptions()); + KVVector file_contents) { + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create(env->GetFileSystem(), fname, + FileOptions(), &file_writer, nullptr); if (!s.ok()) { return s; } - - WritableFileWriter file_writer(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); - - uint32_t id = GetAndWriteNextID(&file_writer); - file_system_.files.insert({id, std::move(file_contents)}); - return Status::OK(); + uint32_t id; + s = GetAndWriteNextID(file_writer.get(), &id); + if (s.ok()) { + file_system_.files.insert({id, std::move(file_contents)}); + } + return s; } -uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const { - uint32_t next_id = next_id_.fetch_add(1); +Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file, + uint32_t* next_id) const { + *next_id = next_id_.fetch_add(1); char buf[4]; - EncodeFixed32(buf, next_id); - file->Append(Slice(buf, 4)); - return next_id; + EncodeFixed32(buf, *next_id); + return file->Append(Slice(buf, 4)); } 
-uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { +Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, + uint32_t* id) const { char buf[4]; Slice result; - file->Read(0, 4, &result, buf); + Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr); assert(result.size() == 4); - return DecodeFixed32(buf); + *id = DecodeFixed32(buf); + return s; } -void MockTableFactory::AssertSingleFile( - const stl_wrappers::KVMap& file_contents) { +void MockTableFactory::AssertSingleFile(const KVVector& file_contents) { ASSERT_EQ(file_system_.files.size(), 1U); ASSERT_EQ(file_contents, file_system_.files.begin()->second); } -void MockTableFactory::AssertLatestFile( - const stl_wrappers::KVMap& file_contents) { +void MockTableFactory::AssertLatestFile(const KVVector& file_contents) { ASSERT_GE(file_system_.files.size(), 1U); auto latest = file_system_.files.end(); --latest; @@ -137,8 +313,9 @@ ParsedInternalKey ikey; std::string key, value; std::tie(key, value) = kv; - ParseInternalKey(Slice(key), &ikey); - std::cout << ikey.DebugString(false) << " -> " << value << std::endl; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; } FAIL(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/mock_table.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/mock_table.h 2025-05-19 16:14:27.000000000 +0000 @@ -15,6 +15,7 @@ #include "db/version_edit.h" #include "port/port.h" #include "rocksdb/comparator.h" +#include "rocksdb/io_status.h" #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/table_builder.h" @@ -26,188 +27,63 @@ namespace ROCKSDB_NAMESPACE { namespace mock { +using KVPair = std::pair; +using KVVector = std::vector; 
-stl_wrappers::KVMap MakeMockFile( - std::initializer_list> l = {}); +KVVector MakeMockFile(std::initializer_list l = {}); +void SortKVVector(KVVector* kv_vector, + const Comparator* ucmp = BytewiseComparator()); struct MockTableFileSystem { port::Mutex mutex; - std::map files; -}; - -class MockTableReader : public TableReader { - public: - explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} - - InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, - TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; - - Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, const SliceTransform* prefix_extractor, - bool skip_filters = false) override; - - uint64_t ApproximateOffsetOf(const Slice& /*key*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - size_t ApproximateMemoryUsage() const override { return 0; } - - void SetupForCompaction() override {} - - std::shared_ptr GetTableProperties() const override; - - ~MockTableReader() {} - - private: - const stl_wrappers::KVMap& table_; -}; - -class MockTableIterator : public InternalIterator { - public: - explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) { - itr_ = table_.end(); - } - - bool Valid() const override { return itr_ != table_.end(); } - - void SeekToFirst() override { itr_ = table_.begin(); } - - void SeekToLast() override { - itr_ = table_.end(); - --itr_; - } - - void Seek(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.lower_bound(str_target); - } - - void SeekForPrev(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.upper_bound(str_target); - Prev(); - } - - void Next() override { 
++itr_; } - - void Prev() override { - if (itr_ == table_.begin()) { - itr_ = table_.end(); - } else { - --itr_; - } - } - - Slice key() const override { return Slice(itr_->first); } - - Slice value() const override { return Slice(itr_->second); } - - Status status() const override { return Status::OK(); } - - private: - const stl_wrappers::KVMap& table_; - stl_wrappers::KVMap::const_iterator itr_; -}; - -class MockTableBuilder : public TableBuilder { - public: - MockTableBuilder(uint32_t id, MockTableFileSystem* file_system) - : id_(id), file_system_(file_system) { - table_ = MakeMockFile({}); - } - - // REQUIRES: Either Finish() or Abandon() has been called. - ~MockTableBuilder() {} - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override { - table_.insert({key.ToString(), value.ToString()}); - } - - // Return non-ok iff some error has been detected. 
- Status status() const override { return Status::OK(); } - - Status Finish() override { - MutexLock lock_guard(&file_system_->mutex); - file_system_->files.insert({id_, table_}); - return Status::OK(); - } - - void Abandon() override {} - - uint64_t NumEntries() const override { return table_.size(); } - - uint64_t FileSize() const override { return table_.size(); } - - TableProperties GetTableProperties() const override { - return TableProperties(); - } - - // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } - // Get file checksum function name - const char* GetFileChecksumFuncName() const override { - return kUnknownFileChecksumFuncName.c_str(); - } - - private: - uint32_t id_; - MockTableFileSystem* file_system_; - stl_wrappers::KVMap table_; - std::string file_checksum_ = kUnknownFileChecksum; + std::map files; }; class MockTableFactory : public TableFactory { public: + enum MockCorruptionMode { + kCorruptNone, + kCorruptKey, + kCorruptValue, + kCorruptReorderKey, + }; + MockTableFactory(); - const char* Name() const override { return "MockTable"; } + static const char* kClassName() { return "MockTable"; } + const char* Name() const override { return kClassName(); } + using TableFactory::NewTableReader; Status NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& ro, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_familly_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; // This function will directly create mock table instead of going through // MockTableBuilder. file_contents has to have a format of . Those key-value pairs will then be inserted into the mock table. 
Status CreateMockTable(Env* env, const std::string& fname, - stl_wrappers::KVMap file_contents); - - virtual Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } + KVVector file_contents); - virtual std::string GetPrintableTableOptions() const override { + virtual std::string GetPrintableOptions() const override { return std::string(); } + void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; } // This function will assert that only a single file exists and that the // contents are equal to file_contents - void AssertSingleFile(const stl_wrappers::KVMap& file_contents); - void AssertLatestFile(const stl_wrappers::KVMap& file_contents); + void AssertSingleFile(const KVVector& file_contents); + void AssertLatestFile(const KVVector& file_contents); private: - uint32_t GetAndWriteNextID(WritableFileWriter* file) const; - uint32_t GetIDFromFile(RandomAccessFileReader* file) const; + Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const; + Status GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const; mutable MockTableFileSystem file_system_; mutable std::atomic next_id_; + MockCorruptionMode corrupt_mode_; }; } // namespace mock diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/multiget_context.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/multiget_context.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/multiget_context.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/multiget_context.h 2025-05-19 16:14:27.000000000 +0000 @@ -7,12 +7,15 @@ #include #include #include + +#include "db/dbformat.h" #include "db/lookup_key.h" #include "db/merge_context.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" #include "util/autovector.h" +#include "util/math.h" namespace ROCKSDB_NAMESPACE { class GetContext; @@ -20,27 +23,32 @@ struct KeyContext { const Slice* 
key; LookupKey* lkey; - Slice ukey; + Slice ukey_with_ts; + Slice ukey_without_ts; Slice ikey; ColumnFamilyHandle* column_family; Status* s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq; bool key_exists; + bool is_blob_index; void* cb_arg; PinnableSlice* value; + std::string* timestamp; GetContext* get_context; KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key, - PinnableSlice* val, Status* stat) + PinnableSlice* val, std::string* ts, Status* stat) : key(&user_key), lkey(nullptr), column_family(col_family), s(stat), max_covering_tombstone_seq(0), key_exists(false), + is_blob_index(false), cb_arg(nullptr), value(val), + timestamp(ts), get_context(nullptr) {} KeyContext() = default; @@ -84,16 +92,21 @@ class MultiGetContext { public: // Limit the number of keys in a batch to this number. Benchmarks show that - // there is negligible benefit for batches exceeding this. Keeping this < 64 + // there is negligible benefit for batches exceeding this. Keeping this < 32 // simplifies iteration, as well as reduces the amount of stack allocations - // htat need to be performed + // that need to be performed static const int MAX_BATCH_SIZE = 32; + static_assert(MAX_BATCH_SIZE < 64, "MAX_BATCH_SIZE cannot exceed 63"); + MultiGetContext(autovector* sorted_keys, - size_t begin, size_t num_keys, SequenceNumber snapshot) + size_t begin, size_t num_keys, SequenceNumber snapshot, + const ReadOptions& read_opts) : num_keys_(num_keys), value_mask_(0), + value_size_(0), lookup_key_ptr_(reinterpret_cast(lookup_key_stack_buf)) { + assert(num_keys <= MAX_BATCH_SIZE); if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) { lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]); lookup_key_ptr_ = reinterpret_cast( @@ -104,8 +117,11 @@ // autovector may not be contiguous storage, so make a copy sorted_keys_[iter] = (*sorted_keys)[begin + iter]; sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter]) - LookupKey(*sorted_keys_[iter]->key, snapshot); - 
sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key(); + LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp); + sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key(); + sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey( + sorted_keys_[iter]->lkey->user_key(), + read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size()); sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key(); } } @@ -123,6 +139,7 @@ std::array sorted_keys_; size_t num_keys_; uint64_t value_mask_; + uint64_t value_size_; std::unique_ptr lookup_key_heap_buf; LookupKey* lookup_key_ptr_; @@ -144,17 +161,17 @@ class Iterator { public: // -- iterator traits - typedef Iterator self_type; - typedef KeyContext value_type; - typedef KeyContext& reference; - typedef KeyContext* pointer; - typedef int difference_type; - typedef std::forward_iterator_tag iterator_category; + using self_type = Iterator; + using value_type = KeyContext; + using reference = KeyContext&; + using pointer = KeyContext*; + using difference_type = int; + using iterator_category = std::forward_iterator_tag; Iterator(const Range* range, size_t idx) : range_(range), ctx_(range->ctx_), index_(idx) { while (index_ < range_->end_ && - (1ull << index_) & + (uint64_t{1} << index_) & (range_->ctx_->value_mask_ | range_->skip_mask_)) index_++; } @@ -164,7 +181,7 @@ Iterator& operator++() { while (++index_ < range_->end_ && - (1ull << index_) & + (uint64_t{1} << index_) & (range_->ctx_->value_mask_ | range_->skip_mask_)) ; return *this; @@ -206,6 +223,8 @@ start_ = first.index_; end_ = last.index_; skip_mask_ = mget_range.skip_mask_; + assert(start_ < 64); + assert(end_ < 64); } Range() = default; @@ -214,33 +233,37 @@ Iterator end() const { return Iterator(this, end_); } - bool empty() { - return (((1ull << end_) - 1) & ~((1ull << start_) - 1) & - ~(ctx_->value_mask_ | skip_mask_)) == 0; - } + bool empty() const { return RemainingMask() == 0; } + + void SkipIndex(size_t 
index) { skip_mask_ |= uint64_t{1} << index; } - void SkipKey(const Iterator& iter) { skip_mask_ |= 1ull << iter.index_; } + void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); } + + bool IsKeySkipped(const Iterator& iter) const { + return skip_mask_ & (uint64_t{1} << iter.index_); + } // Update the value_mask_ in MultiGetContext so its // immediately reflected in all the Range Iterators void MarkKeyDone(Iterator& iter) { - ctx_->value_mask_ |= (1ull << iter.index_); + ctx_->value_mask_ |= (uint64_t{1} << iter.index_); } - bool CheckKeyDone(Iterator& iter) { - return ctx_->value_mask_ & (1ull << iter.index_); + bool CheckKeyDone(Iterator& iter) const { + return ctx_->value_mask_ & (uint64_t{1} << iter.index_); } - uint64_t KeysLeft() { - uint64_t new_val = skip_mask_ | ctx_->value_mask_; - uint64_t count = 0; - while (new_val) { - new_val = new_val & (new_val - 1); - count++; - } - return end_ - count; + uint64_t KeysLeft() const { return BitsSetToOne(RemainingMask()); } + + void AddSkipsFrom(const Range& other) { + assert(ctx_ == other.ctx_); + skip_mask_ |= other.skip_mask_; } + uint64_t GetValueSize() { return ctx_->value_size_; } + + void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; } + private: friend MultiGetContext; MultiGetContext* ctx_; @@ -249,7 +272,14 @@ uint64_t skip_mask_; Range(MultiGetContext* ctx, size_t num_keys) - : ctx_(ctx), start_(0), end_(num_keys), skip_mask_(0) {} + : ctx_(ctx), start_(0), end_(num_keys), skip_mask_(0) { + assert(num_keys < 64); + } + + uint64_t RemainingMask() const { + return (((uint64_t{1} << end_) - 1) & ~((uint64_t{1} << start_) - 1) & + ~(ctx_->value_mask_ | skip_mask_)); + } }; // Return the initial range that encompasses all the keys in the batch diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc 
2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_helper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,19 +9,19 @@ namespace ROCKSDB_NAMESPACE { +const PersistentCacheOptions PersistentCacheOptions::kEmpty; + void PersistentCacheHelper::InsertRawPage( const PersistentCacheOptions& cache_options, const BlockHandle& handle, const char* data, const size_t size) { assert(cache_options.persistent_cache); assert(cache_options.persistent_cache->IsCompressed()); - // construct the page key - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // insert content to cache - cache_options.persistent_cache->Insert(key, data, size); + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache->Insert(key.AsSlice(), data, size) + .PermitUncheckedError(); } void PersistentCacheHelper::InsertUncompressedPage( @@ -33,14 +33,13 @@ // (1) content is cacheable // (2) content is not compressed - // construct the page key - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // insert block contents to page cache - cache_options.persistent_cache->Insert(key, contents.data.data(), - contents.data.size()); + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + + cache_options.persistent_cache + ->Insert(key.AsSlice(), contents.data.data(), contents.data.size()) + .PermitUncheckedError(); + ; } Status PersistentCacheHelper::LookupRawPage( @@ -52,14 +51,12 @@ assert(cache_options.persistent_cache); assert(cache_options.persistent_cache->IsCompressed()); - // construct the page key - char 
cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // Lookup page + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + size_t size; - Status s = cache_options.persistent_cache->Lookup(key, raw_data, &size); + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), raw_data, &size); if (!s.ok()) { // cache miss RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); @@ -67,7 +64,8 @@ } // cache hit - assert(raw_data_size == handle.size() + kBlockTrailerSize); + // Block-based table is assumed + assert(raw_data_size == handle.size() + BlockBasedTable::kBlockTrailerSize); assert(size == raw_data_size); RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); return Status::OK(); @@ -84,15 +82,13 @@ return Status::NotFound(); } - // construct the page key - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = BlockBasedTable::GetCacheKey(cache_options.key_prefix.c_str(), - cache_options.key_prefix.size(), - handle, cache_key); - // Lookup page + CacheKey key = + BlockBasedTable::GetCacheKey(cache_options.base_cache_key, handle); + std::unique_ptr data; size_t size; - Status s = cache_options.persistent_cache->Lookup(key, &data, &size); + Status s = + cache_options.persistent_cache->Lookup(key.AsSlice(), &data, &size); if (!s.ok()) { // cache miss RecordTick(cache_options.statistics, PERSISTENT_CACHE_MISS); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_options.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_options.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/persistent_cache_options.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/persistent_cache_options.h 2025-05-19 16:14:27.000000000 +0000 @@ -6,6 +6,7 @@ #include +#include 
"cache/cache_key.h" #include "monitoring/statistics.h" #include "rocksdb/persistent_cache.h" @@ -19,16 +20,18 @@ PersistentCacheOptions() {} explicit PersistentCacheOptions( const std::shared_ptr& _persistent_cache, - const std::string _key_prefix, Statistics* const _statistics) + const OffsetableCacheKey& _base_cache_key, Statistics* const _statistics) : persistent_cache(_persistent_cache), - key_prefix(_key_prefix), + base_cache_key(_base_cache_key), statistics(_statistics) {} virtual ~PersistentCacheOptions() {} std::shared_ptr persistent_cache; - std::string key_prefix; + OffsetableCacheKey base_cache_key; Statistics* statistics = nullptr; + + static const PersistentCacheOptions kEmpty; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_bloom.h 2025-05-19 16:14:27.000000000 +0000 @@ -132,4 +132,4 @@ PlainTableBloomV1 bloom_; }; -}; // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,12 +8,13 @@ #include -#include #include #include +#include #include "db/dbformat.h" #include "file/writable_file_writer.h" +#include "logging/logging.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" @@ -36,16 +37,16 @@ // a utility that helps writing block content to the file // @offset will advance if 
@block_contents was successfully written. // @block_handle the block handle this particular block. -Status WriteBlock(const Slice& block_contents, WritableFileWriter* file, - uint64_t* offset, BlockHandle* block_handle) { +IOStatus WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { block_handle->set_offset(*offset); block_handle->set_size(block_contents.size()); - Status s = file->Append(block_contents); + IOStatus io_s = file->Append(block_contents); - if (s.ok()) { + if (io_s.ok()) { *offset += block_contents.size(); } - return s; + return io_s; } } // namespace @@ -57,14 +58,14 @@ extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, - EncodingType encoding_type, size_t index_sparseness, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + uint32_t column_family_id, int level_at_creation, WritableFileWriter* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, const std::string& column_family_name, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, - bool store_index_in_file) + bool store_index_in_file, const std::string& db_id, + const std::string& db_session_id, uint64_t file_number) : ioptions_(ioptions), moptions_(moptions), bloom_block_(num_probes), @@ -97,22 +98,38 @@ properties_.format_version = (encoding_type == kPlain) ? 0 : 1; properties_.column_family_id = column_family_id; properties_.column_family_name = column_family_name; - properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr - ? 
moptions_.prefix_extractor->Name() - : "nullptr"; + properties_.db_id = db_id; + properties_.db_session_id = db_session_id; + properties_.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set"); + } + properties_.orig_file_number = file_number; + properties_.prefix_extractor_name = + moptions_.prefix_extractor != nullptr + ? moptions_.prefix_extractor->AsString() + : "nullptr"; std::string val; PutFixed32(&val, static_cast(encoder_.GetEncodingType())); properties_.user_collected_properties [PlainTablePropertyNames::kEncodingType] = val; - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + assert(int_tbl_prop_collector_factories); + for (auto& factory : *int_tbl_prop_collector_factories) { + assert(factory); + table_properties_collectors_.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + factory->CreateIntTblPropCollector(column_family_id, + level_at_creation)); } } PlainTableBuilder::~PlainTableBuilder() { + // They are supposed to have been passed to users through Finish() + // if the file succeeds. 
+ status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); } void PlainTableBuilder::Add(const Slice& key, const Slice& value) { @@ -121,7 +138,8 @@ size_t meta_bytes_buf_size = 0; ParsedInternalKey internal_key; - if (!ParseInternalKey(key, &internal_key)) { + if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) + .ok()) { // TODO assert(false); return; } @@ -145,41 +163,46 @@ assert(offset_ <= std::numeric_limits::max()); auto prev_offset = static_cast(offset_); // Write out the key - encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, - &meta_bytes_buf_size); + io_status_ = encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, + &meta_bytes_buf_size); if (SaveIndexInFile()) { index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset); } // Write value length uint32_t value_size = static_cast(value.size()); - char* end_ptr = - EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); - assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); - meta_bytes_buf_size = end_ptr - meta_bytes_buf; - file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + if (io_status_.ok()) { + char* end_ptr = + EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); + assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); + meta_bytes_buf_size = end_ptr - meta_bytes_buf; + io_status_ = file_->Append(Slice(meta_bytes_buf, meta_bytes_buf_size)); + } // Write value - file_->Append(value); - offset_ += value_size + meta_bytes_buf_size; + if (io_status_.ok()) { + io_status_ = file_->Append(value); + offset_ += value_size + meta_bytes_buf_size; + } - properties_.num_entries++; - properties_.raw_key_size += key.size(); - properties_.raw_value_size += value.size(); - if (internal_key.type == kTypeDeletion || - internal_key.type == kTypeSingleDeletion) { - properties_.num_deletions++; - } else if (internal_key.type == kTypeMerge) { - properties_.num_merge_operands++; + if (io_status_.ok()) { + properties_.num_entries++; + 
properties_.raw_key_size += key.size(); + properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } } // notify property collectors NotifyCollectTableCollectorsOnAdd( - key, value, offset_, table_properties_collectors_, ioptions_.info_log); + key, value, offset_, table_properties_collectors_, ioptions_.logger); + status_ = io_status_; } -Status PlainTableBuilder::status() const { return status_; } - Status PlainTableBuilder::Finish() { assert(!closed_); closed_ = true; @@ -197,13 +220,12 @@ if (store_index_in_file_ && (properties_.num_entries > 0)) { assert(properties_.num_entries <= std::numeric_limits::max()); - Status s; BlockHandle bloom_block_handle; if (bloom_bits_per_key_ > 0) { bloom_block_.SetTotalBits( &arena_, static_cast(properties_.num_entries) * bloom_bits_per_key_, - ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -214,10 +236,12 @@ Slice bloom_finish_result = bloom_block_.Finish(); properties_.filter_size = bloom_finish_result.size(); - s = WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); + io_status_ = + WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle); - if (!s.ok()) { - return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle); } @@ -225,10 +249,12 @@ Slice index_finish_result = index_builder_->Finish(); properties_.index_size = index_finish_result.size(); - s = WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); + io_status_ = + WriteBlock(index_finish_result, file_, &offset_, &index_block_handle); - if (!s.ok()) { - 
return s; + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock, @@ -243,51 +269,38 @@ property_block_builder.Add(properties_.user_collected_properties); // -- Add user collected properties - NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - ioptions_.info_log, - &property_block_builder); + NotifyCollectTableCollectorsOnFinish( + table_properties_collectors_, ioptions_.logger, &property_block_builder); // -- Write property block BlockHandle property_block_handle; - auto s = WriteBlock( - property_block_builder.Finish(), - file_, - &offset_, - &property_block_handle - ); + IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_, + &property_block_handle); if (!s.ok()) { - return s; + return std::move(s); } - meta_index_builer.Add(kPropertiesBlock, property_block_handle); + meta_index_builer.Add(kPropertiesBlockName, property_block_handle); // -- write metaindex block BlockHandle metaindex_block_handle; - s = WriteBlock( - meta_index_builer.Finish(), - file_, - &offset_, - &metaindex_block_handle - ); - if (!s.ok()) { - return s; + io_status_ = WriteBlock(meta_index_builer.Finish(), file_, &offset_, + &metaindex_block_handle); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } // Write Footer // no need to write out new footer if we're using default checksum - Footer footer(kLegacyPlainTableMagicNumber, 0); - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(BlockHandle::NullBlockHandle()); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - s = file_->Append(footer_encoding); - if (s.ok()) { - offset_ += footer_encoding.size(); + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_, + kNoChecksum, metaindex_block_handle); + io_status_ = file_->Append(footer.GetSlice()); + if (io_status_.ok()) { + offset_ += footer.GetSlice().size(); } - - if 
(file_ != nullptr) { - file_checksum_ = file_->GetFileChecksum(); - } - return s; + status_ = io_status_; + return status_; } void PlainTableBuilder::Abandon() { @@ -302,11 +315,19 @@ return offset_; } +std::string PlainTableBuilder::GetFileChecksum() const { + if (file_ != nullptr) { + return file_->GetFileChecksum(); + } else { + return kUnknownFileChecksum; + } +} + const char* PlainTableBuilder::GetFileChecksumFuncName() const { if (file_ != nullptr) { return file_->GetFileChecksumFuncName(); } else { - return kUnknownFileChecksumFuncName.c_str(); + return kUnknownFileChecksumFuncName; } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -37,15 +37,16 @@ // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. 
PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - uint32_t user_key_size, EncodingType encoding_type, - size_t index_sparseness, uint32_t bloom_bits_per_key, - const std::string& column_family_name, uint32_t num_probes = 6, - size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, - bool store_index_in_file = false); + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + uint32_t column_family_id, int level_at_creation, + WritableFileWriter* file, uint32_t user_key_size, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, const std::string& column_family_name, + uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, + double hash_table_ratio = 0, bool store_index_in_file = false, + const std::string& db_id = "", const std::string& db_session_id = "", + uint64_t file_number = 0); // No copying allowed PlainTableBuilder(const PlainTableBuilder&) = delete; void operator=(const PlainTableBuilder&) = delete; @@ -59,7 +60,10 @@ void Add(const Slice& key, const Slice& value) override; // Return non-ok iff some error has been detected. - Status status() const override; + Status status() const override { return status_; } + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return io_status_; } // Finish building the table. Stops using the file passed to the // constructor after this function returns. 
@@ -85,14 +89,14 @@ bool SaveIndexInFile() const { return store_index_in_file_; } // Get file checksum - const std::string& GetFileChecksum() const override { return file_checksum_; } + std::string GetFileChecksum() const override; // Get file checksum function name const char* GetFileChecksumFuncName() const override; private: Arena arena_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const MutableCFOptions& moptions_; std::vector> table_properties_collectors_; @@ -105,6 +109,7 @@ uint32_t bloom_bits_per_key_; size_t huge_page_tlb_size_; Status status_; + IOStatus io_status_; TableProperties properties_; PlainTableKeyEncoder encoder_; @@ -115,9 +120,6 @@ const SliceTransform* prefix_extractor_; - // Store file checksum. If checksum is disabled, its value is "0". - std::string file_checksum_ = kUnknownFileChecksum; - Slice GetPrefix(const Slice& target) const { assert(target.size() >= 8); // target is internal key return GetPrefixFromUserKey(GetUserKey(target)); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -3,23 +3,61 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE #include "table/plain/plain_table_factory.h" #include + #include + #include "db/dbformat.h" -#include "options/options_helper.h" #include "port/port.h" #include "rocksdb/convenience.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "table/plain/plain_table_builder.h" #include "table/plain/plain_table_reader.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +#ifndef ROCKSDB_LITE +static std::unordered_map plain_table_type_info = { + {"user_key_len", + {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"bloom_bits_per_key", + {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"hash_table_ratio", + {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"index_sparseness", + {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"huge_page_tlb_size", + {offsetof(struct PlainTableOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"encoding_type", + {offsetof(struct PlainTableOptions, encoding_type), + OptionType::kEncodingType, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"full_scan_mode", + {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"store_index_in_file", + {offsetof(struct PlainTableOptions, store_index_in_file), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +PlainTableFactory::PlainTableFactory(const PlainTableOptions& options) + : table_options_(options) { + 
RegisterOptions(&table_options_, &plain_table_type_info); +} Status PlainTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, + const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const { @@ -29,11 +67,11 @@ table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, table_options_.index_sparseness, table_options_.huge_page_tlb_size, table_options_.full_scan_mode, table_reader_options.immortal, - table_reader_options.prefix_extractor); + table_reader_options.prefix_extractor.get()); } TableBuilder* PlainTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { // Ignore the skip_filters flag. PlainTable format is optimized for small // in-memory dbs. The skip_filters optimization is not useful for plain @@ -41,15 +79,18 @@ // return new PlainTableBuilder( table_builder_options.ioptions, table_builder_options.moptions, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_options_.user_key_len, table_options_.encoding_type, + table_builder_options.int_tbl_prop_collector_factories, + table_builder_options.column_family_id, + table_builder_options.level_at_creation, file, + table_options_.user_key_len, table_options_.encoding_type, table_options_.index_sparseness, table_options_.bloom_bits_per_key, table_builder_options.column_family_name, 6, table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, - table_options_.store_index_in_file); + table_options_.store_index_in_file, table_builder_options.db_id, + table_builder_options.db_session_id, table_builder_options.cur_file_num); } -std::string PlainTableFactory::GetPrintableTableOptions() const { +std::string PlainTableFactory::GetPrintableOptions() const { 
std::string ret; ret.reserve(20000); const int kBufferSize = 200; @@ -82,11 +123,19 @@ return ret; } -const PlainTableOptions& PlainTableFactory::table_options() const { - return table_options_; +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + return GetPlainTableOptionsFromString(config_options, table_options, opts_str, + new_table_options); } -Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, +Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, + const PlainTableOptions& table_options, const std::string& opts_str, PlainTableOptions* new_table_options) { std::unordered_map opts_map; @@ -94,128 +143,183 @@ if (!s.ok()) { return s; } - return GetPlainTableOptionsFromMap(table_options, opts_map, - new_table_options); -} -Status GetMemTableRepFactoryFromString( - const std::string& opts_str, - std::unique_ptr* new_mem_factory) { - std::vector opts_list = StringSplit(opts_str, ':'); - size_t len = opts_list.size(); - - if (opts_list.empty() || opts_list.size() > 2) { - return Status::InvalidArgument("Can't parse memtable_factory option ", - opts_str); - } - - MemTableRepFactory* mem_factory = nullptr; - - if (opts_list[0] == "skip_list") { - // Expecting format - // skip_list: - if (2 == len) { - size_t lookahead = ParseSizeT(opts_list[1]); - mem_factory = new SkipListFactory(lookahead); - } else if (1 == len) { - mem_factory = new SkipListFactory(); - } - } else if (opts_list[0] == "prefix_hash") { - // Expecting format - // prfix_hash: - if (2 == len) { - size_t hash_bucket_count = ParseSizeT(opts_list[1]); - mem_factory = NewHashSkipListRepFactory(hash_bucket_count); - } else if (1 == len) { - mem_factory = NewHashSkipListRepFactory(); - 
} - } else if (opts_list[0] == "hash_linkedlist") { - // Expecting format - // hash_linkedlist: - if (2 == len) { - size_t hash_bucket_count = ParseSizeT(opts_list[1]); - mem_factory = NewHashLinkListRepFactory(hash_bucket_count); - } else if (1 == len) { - mem_factory = NewHashLinkListRepFactory(); - } - } else if (opts_list[0] == "vector") { - // Expecting format - // vector: - if (2 == len) { - size_t count = ParseSizeT(opts_list[1]); - mem_factory = new VectorRepFactory(count); - } else if (1 == len) { - mem_factory = new VectorRepFactory(); - } - } else if (opts_list[0] == "cuckoo") { - return Status::NotSupported( - "cuckoo hash memtable is not supported anymore."); + s = GetPlainTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); + // Translate any errors (NotFound, NotSupported, to InvalidArgument + if (s.ok() || s.IsInvalidArgument()) { + return s; } else { - return Status::InvalidArgument("Unrecognized memtable_factory option ", - opts_str); + return Status::InvalidArgument(s.getState()); } +} +#endif // ROCKSDB_LITE - if (mem_factory != nullptr) { - new_mem_factory->reset(mem_factory); - } +#ifndef ROCKSDB_LITE +static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, + const std::string& /*arg*/) { + // The MemTableRepFactory built-in classes will be either a class + // (VectorRepFactory) or a nickname (vector), followed optionally by ":#", + // where # is the "size" of the factory. 
+ auto AsPattern = [](const std::string& name, const std::string& alt) { + auto pattern = ObjectLibrary::PatternEntry(name, true); + pattern.AnotherName(alt); + pattern.AddNumber(":"); + return pattern; + }; + library.AddFactory( + AsPattern(VectorRepFactory::kClassName(), VectorRepFactory::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new VectorRepFactory(count)); + } else { + guard->reset(new VectorRepFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern(SkipListFactory::kClassName(), SkipListFactory::kNickName()), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t lookahead = ParseSizeT(uri.substr(colon + 1)); + guard->reset(new SkipListFactory(lookahead)); + } else { + guard->reset(new SkipListFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern("HashLinkListRepFactory", "hash_linkedlist"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + // Expecting format: hash_linkedlist: + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashLinkListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashLinkListRepFactory()); + } + return guard->get(); + }); + library.AddFactory( + AsPattern("HashSkipListRepFactory", "prefix_hash"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + // Expecting format: prefix_hash: + auto colon = uri.find(":"); + if (colon != std::string::npos) { + size_t hash_bucket_count = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewHashSkipListRepFactory(hash_bucket_count)); + } else { + guard->reset(NewHashSkipListRepFactory()); + } + 
return guard->get(); + }); + library.AddFactory( + "cuckoo", + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, std::string* errmsg) { + *errmsg = "cuckoo hash memtable is not supported anymore."; + return nullptr; + }); + + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE - return Status::OK(); +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, std::unique_ptr* result) { + ConfigOptions config_options; + config_options.ignore_unsupported_options = false; + config_options.ignore_unknown_options = false; + return MemTableRepFactory::CreateFromString(config_options, opts_str, result); } -std::string ParsePlainTableOptions(const std::string& name, - const std::string& org_value, - PlainTableOptions* new_options, - bool input_strings_escaped = false, - bool ignore_unknown_options = false) { - const std::string& value = - input_strings_escaped ? UnescapeOptionString(org_value) : org_value; - const auto iter = plain_table_type_info.find(name); - if (iter == plain_table_type_info.end()) { - if (ignore_unknown_options) { - return ""; - } else { - return "Unrecognized option"; +Status MemTableRepFactory::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (value.empty()) { + // No Id and no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. 
Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = NewUniqueObject(config_options, id, opt_map, + result); +#else + // To make it possible to configure the memtables in LITE mode, the ID + // is of the form :, where name is the name of the class and + // is the length of the object (e.g. skip_list:10). + std::vector opts_list = StringSplit(id, ':'); + if (opts_list.empty() || opts_list.size() > 2 || !opt_map.empty()) { + status = Status::InvalidArgument("Can't parse memtable_factory option ", + value); + } else if (opts_list[0] == "skip_list" || + opts_list[0] == SkipListFactory::kClassName()) { + // Expecting format + // skip_list: + if (opts_list.size() == 2) { + size_t lookahead = ParseSizeT(opts_list[1]); + result->reset(new SkipListFactory(lookahead)); + } else { + result->reset(new SkipListFactory()); + } + } else if (!config_options.ignore_unsupported_options) { + status = Status::NotSupported("Cannot load object in LITE mode ", id); } +#endif // ROCKSDB_LITE } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return "Invalid value"; - } - return ""; + return status; } +#ifndef ROCKSDB_LITE Status GetPlainTableOptionsFromMap( const PlainTableOptions& table_options, const std::unordered_map& opts_map, PlainTableOptions* new_table_options, bool input_strings_escaped, - bool /*ignore_unknown_options*/) { + bool ignore_unknown_options) { + ConfigOptions config_options; + config_options.input_strings_escaped = input_strings_escaped; + config_options.ignore_unknown_options = ignore_unknown_options; + return GetPlainTableOptionsFromMap(config_options, table_options, opts_map, + new_table_options); +} + +Status GetPlainTableOptionsFromMap( + const ConfigOptions& config_options, const PlainTableOptions& table_options, + const std::unordered_map& 
opts_map, + PlainTableOptions* new_table_options) { assert(new_table_options); - *new_table_options = table_options; - for (const auto& o : opts_map) { - auto error_message = ParsePlainTableOptions( - o.first, o.second, new_table_options, input_strings_escaped); - if (error_message != "") { - const auto iter = plain_table_type_info.find(o.first); - if (iter == plain_table_type_info.end() || - !input_strings_escaped || // !input_strings_escaped indicates - // the old API, where everything is - // parsable. - (iter->second.verification != OptionVerificationType::kByName && - iter->second.verification != - OptionVerificationType::kByNameAllowNull && - iter->second.verification != - OptionVerificationType::kByNameAllowFromNull && - iter->second.verification != OptionVerificationType::kDeprecated)) { - // Restore "new_options" to the default "base_options". - *new_table_options = table_options; - return Status::InvalidArgument("Can't parse PlainTableOptions:", - o.first + " " + error_message); - } - } + PlainTableFactory ptf(table_options); + Status s = ptf.ConfigureFromMap(config_options, opts_map); + if (s.ok()) { + *new_table_options = *(ptf.GetOptions()); + } else { + // Restore "new_options" to the default "base_options". 
+ *new_table_options = table_options; } - return Status::OK(); + return s; } extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { @@ -231,5 +335,5 @@ const std::string PlainTablePropertyNames::kNumBloomBlocks = "rocksdb.plain.table.bloom.numblocks"; -} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_factory.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,8 +10,6 @@ #include #include -#include "options/options_helper.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { @@ -35,7 +33,7 @@ // 1. Data compression is not supported. // 2. Data is not checksumed. // it is not recommended to use this format on other type of file systems. -// +// // PlainTable requires fixed length key, configured as a constructor // parameter of the factory class. Output file format: // +-------------+-----------------+ @@ -156,68 +154,29 @@ // page TLB and the page size if allocating from there. See comments of // Arena::AllocateAligned() for details. 
explicit PlainTableFactory( - const PlainTableOptions& _table_options = PlainTableOptions()) - : table_options_(_table_options) {} + const PlainTableOptions& _table_options = PlainTableOptions()); - const char* Name() const override { return "PlainTable"; } - Status NewTableReader(const TableReaderOptions& table_reader_options, + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kPlainTableName(); } + const char* Name() const override { return kPlainTableName(); } + using TableFactory::NewTableReader; + Status NewTableReader(const ReadOptions& ro, + const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; - - std::string GetPrintableTableOptions() const override; - - const PlainTableOptions& table_options() const; + WritableFileWriter* file) const override; + std::string GetPrintableOptions() const override; static const char kValueTypeSeqId0 = char(~0); - // Sanitizes the specified DB Options. 
- Status SanitizeOptions( - const DBOptions& /*db_opts*/, - const ColumnFamilyOptions& /*cf_opts*/) const override { - return Status::OK(); - } - - void* GetOptions() override { return &table_options_; } - - Status GetOptionString(std::string* /*opt_string*/, - const std::string& /*delimiter*/) const override { - return Status::OK(); - } - private: PlainTableOptions table_options_; }; -static std::unordered_map plain_table_type_info = { - {"user_key_len", - {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"bloom_bits_per_key", - {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"hash_table_ratio", - {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, - OptionVerificationType::kNormal, false, 0}}, - {"index_sparseness", - {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"huge_page_tlb_size", - {offsetof(struct PlainTableOptions, huge_page_tlb_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"encoding_type", - {offsetof(struct PlainTableOptions, encoding_type), - OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, - {"full_scan_mode", - {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"store_index_in_file", - {offsetof(struct PlainTableOptions, store_index_in_file), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc 2025-01-30 11:01:26.000000000 +0000 +++ 
mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,10 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE +#include "table/plain/plain_table_index.h" #include -#include "table/plain/plain_table_index.h" +#include "logging/logging.h" #include "util/coding.h" #include "util/hash.h" @@ -98,7 +99,7 @@ BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. @@ -153,12 +154,12 @@ Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "Reserving %" PRIu32 " bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); + total_allocate_size, huge_page_tlb_size_, ioptions_.logger); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -198,7 +199,7 @@ } assert(sub_index_offset == sub_index_size_); - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); @@ -206,6 +207,6 @@ const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = "PlainTableIndexBlock"; -}; // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.h --- 
mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_index.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_index.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,6 @@ #include #include -#include "db/dbformat.h" #include "memory/arena.h" #include "monitoring/histogram.h" #include "options/cf_options.h" @@ -20,7 +19,7 @@ // The file contains two classes PlainTableIndex and PlainTableIndexBuilder // The two classes implement the index format of PlainTable. -// For descripton of PlainTable format, see comments of class +// For description of PlainTable format, see comments of class // PlainTableFactory // // @@ -131,7 +130,7 @@ // The class is used by PlainTableBuilder class. class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, size_t index_sparseness, double hash_table_ratio, size_t huge_page_tlb_size) @@ -222,7 +221,7 @@ const std::vector& entries_per_bucket); Arena* arena_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; @@ -244,6 +243,6 @@ static const size_t kRecordsPerGroup = 256; }; -}; // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.cc 2025-05-19 16:14:27.000000000 +0000 @@ -80,13 +80,15 @@ } } -Status PlainTableKeyEncoder::AppendKey(const Slice& key, - WritableFileWriter* file, - uint64_t* offset, char* 
meta_bytes_buf, - size_t* meta_bytes_buf_size) { +IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, + WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size) { ParsedInternalKey parsed_key; - if (!ParseInternalKey(key, &parsed_key)) { - return Status::Corruption(Slice()); + Status pik_status = + ParseInternalKey(key, &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + return IOStatus::Corruption(pik_status.getState()); } Slice key_to_write = key; // Portion of internal key to write out. @@ -99,9 +101,9 @@ char* ptr = EncodeVarint32(key_size_buf, user_key_size); assert(ptr <= key_size_buf + sizeof(key_size_buf)); auto len = ptr - key_size_buf; - Status s = file->Append(Slice(key_size_buf, len)); - if (!s.ok()) { - return s; + IOStatus io_s = file->Append(Slice(key_size_buf, len)); + if (!io_s.ok()) { + return io_s; } *offset += len; } @@ -117,9 +119,9 @@ key_count_for_prefix_ = 1; pre_prefix_.SetUserKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); - Status s = file->Append(Slice(size_bytes, size_bytes_pos)); - if (!s.ok()) { - return s; + IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!io_s.ok()) { + return io_s; } *offset += size_bytes_pos; } else { @@ -135,9 +137,9 @@ static_cast(pre_prefix_.GetUserKey().size()); size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, size_bytes + size_bytes_pos); - Status s = file->Append(Slice(size_bytes, size_bytes_pos)); - if (!s.ok()) { - return s; + IOStatus io_s = file->Append(Slice(size_bytes, size_bytes_pos)); + if (!io_s.ok()) { + return io_s; } *offset += size_bytes_pos; key_to_write = Slice(key.data() + prefix_len, key.size() - prefix_len); @@ -149,20 +151,23 @@ // If the row is of value type with seqId 0, flush the special flag together // in this buffer to safe one file append call, which takes 1 byte. 
if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) { - Status s = + IOStatus io_s = file->Append(Slice(key_to_write.data(), key_to_write.size() - 8)); - if (!s.ok()) { - return s; + if (!io_s.ok()) { + return io_s; } *offset += key_to_write.size() - 8; meta_bytes_buf[*meta_bytes_buf_size] = PlainTableFactory::kValueTypeSeqId0; *meta_bytes_buf_size += 1; } else { - file->Append(key_to_write); + IOStatus io_s = file->Append(key_to_write); + if (!io_s.ok()) { + return io_s; + } *offset += key_to_write.size(); } - return Status::OK(); + return IOStatus::OK(); } Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset, @@ -207,8 +212,9 @@ new_buffer->buf_len = 0; } Slice read_result; - Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, - new_buffer->buf.get()); + Status s = + file_info_->file->Read(IOOptions(), file_offset, size_to_read, + &read_result, new_buffer->buf.get(), nullptr); if (!s.ok()) { status_ = s; return false; @@ -275,9 +281,12 @@ return file_reader_.status(); } *internal_key_valid = true; - if (!ParseInternalKey(*internal_key, parsed_key)) { + Status pik_status = ParseInternalKey(*internal_key, parsed_key, + false /* log_err_key */); // TODO + if (!pik_status.ok()) { return Status::Corruption( - Slice("Incorrect value type found when reading the next key")); + Slice("Corrupted key found during next key read. 
"), + pik_status.getState()); } *bytes_read += user_key_size + 8; } @@ -483,7 +492,6 @@ if (seekable != nullptr) { *seekable = true; } - Status s; if (encoding_type_ == kPlain) { return NextPlainEncodingKey(start_offset, parsed_key, internal_key, bytes_read, seekable); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_key_coding.h 2025-05-19 16:14:27.000000000 +0000 @@ -8,7 +8,7 @@ #ifndef ROCKSDB_LITE #include -#include "db/dbformat.h" + #include "rocksdb/slice.h" #include "table/plain/plain_table_reader.h" @@ -44,8 +44,9 @@ // meta_bytes_buf: buffer for extra meta bytes // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated // if meta_bytes_buf is updated. - Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, - char* meta_bytes_buf, size_t* meta_bytes_buf_size); + IOStatus AppendKey(const Slice& key, WritableFileWriter* file, + uint64_t* offset, char* meta_bytes_buf, + size_t* meta_bytes_buf_size); // Return actual encoding type to be picked EncodingType GetEncodingType() { return encoding_type_; } @@ -67,6 +68,12 @@ public: explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) : file_info_(_file_info), num_buf_(0) {} + + ~PlainTableFileReader() { + // Should fix. + status_.PermitUncheckedError(); + } + // In mmaped mode, the results point to mmaped area of the file, which // means it is always valid before closing the file. // In non-mmap mode, the results point to an internal buffer. If the caller @@ -145,6 +152,7 @@ fixed_user_key_len_(user_key_len), prefix_extractor_(prefix_extractor), in_prefix_(false) {} + // Find the next key. // start: char array where the key starts. 
// limit: boundary of the char array diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -93,7 +93,7 @@ extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, @@ -113,10 +113,12 @@ table_properties_(nullptr) {} PlainTableReader::~PlainTableReader() { + // Should fix? + status_.PermitUncheckedError(); } Status PlainTableReader::Open( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, const int bloom_bits_per_key, @@ -127,11 +129,9 @@ return Status::NotSupported("File is too large for PlainTableReader!"); } - TableProperties* props_ptr = nullptr; + std::unique_ptr props; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props_ptr, - true /* compression_type_missing */); - std::shared_ptr props(props_ptr); + ioptions, &props); if (!s.ok()) { return s; } @@ -147,8 +147,7 @@ return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); - } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) != - 0) { + } else if (prefix_extractor_in_file != prefix_extractor->AsString()) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " 
"PlainTable"); @@ -185,7 +184,7 @@ new_reader->full_scan_mode_ = true; } // PopulateIndex can add to the props, so don't store them until now - new_reader->table_properties_ = props; + new_reader->table_properties_ = std::move(props); if (immortal_table && new_reader->file_info_.is_mmap_mode) { new_reader->dummy_cleanable_.reset(new Cleanable()); @@ -201,7 +200,8 @@ InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, - size_t /*compaction_readahead_size*/) { + size_t /*compaction_readahead_size*/, + bool /* allow_unprepared_value */) { // Not necessarily used here, but make sure this has been initialized assert(table_properties_); @@ -274,7 +274,7 @@ if (bloom_total_bits > 0) { enable_bloom_ = true; bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); + huge_page_tlb_size, ioptions_.logger); } } @@ -288,7 +288,9 @@ Status PlainTableReader::MmapDataIfNeeded() { if (file_info_.is_mmap_mode) { // Get mmapped memory. 
- return file_info_.file->Read(0, static_cast(file_size_), &file_info_.file_data, nullptr); + return file_info_.file->Read(IOOptions(), 0, + static_cast(file_size_), + &file_info_.file_data, nullptr, nullptr); } return Status::OK(); } @@ -304,8 +306,7 @@ Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, - BlockType::kIndex, &index_block_contents, - true /* compression_type_missing */); + BlockType::kIndex, &index_block_contents); bool index_in_file = s.ok(); @@ -316,8 +317,7 @@ s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, BloomBlockBuilder::kBloomBlock, BlockType::kFilter, - &bloom_block_contents, - true /* compression_type_missing */); + &bloom_block_contents); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -445,23 +445,23 @@ } // point to sub-index, need to do a binary search - uint32_t upper_bound; + uint32_t upper_bound = 0; const char* base_ptr = index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); uint32_t low = 0; uint32_t high = upper_bound; ParsedInternalKey mid_key; ParsedInternalKey parsed_target; - if (!ParseInternalKey(target, &parsed_target)) { - return Status::Corruption(Slice()); - } + Status s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; // The key is between [low, high). Do a binary search between it. 
while (high - low > 1) { uint32_t mid = (high + low) / 2; uint32_t file_offset = GetFixed32Element(base_ptr, mid); uint32_t tmp; - Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); + s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); if (!s.ok()) { return s; } @@ -486,7 +486,7 @@ ParsedInternalKey low_key; uint32_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); - Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); + s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); if (!s.ok()) { return s; } @@ -589,9 +589,10 @@ } ParsedInternalKey found_key; ParsedInternalKey parsed_target; - if (!ParseInternalKey(target, &parsed_target)) { - return Status::Corruption(Slice()); - } + s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; + Slice found_value; while (offset < file_info_.data_end_offset) { s = Next(&decoder, &offset, &found_key, nullptr, &found_value); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/plain/plain_table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -12,7 +12,6 @@ #include #include -#include "db/dbformat.h" #include "file/random_access_file_reader.h" #include "memory/arena.h" #include "rocksdb/env.h" @@ -67,7 +66,7 @@ // whether it points to the data offset of the first key with the key prefix // or the offset of it. If there are too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk. 
- static Status Open(const ImmutableCFOptions& ioptions, + static Status Open(const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, @@ -84,7 +83,8 @@ const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) override; + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; void Prepare(const Slice& target) override; @@ -109,7 +109,7 @@ return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const ImmutableCFOptions& ioptions, + PlainTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, @@ -162,7 +162,7 @@ CacheAllocationPtr index_block_alloc_; CacheAllocationPtr bloom_block_alloc_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; std::unique_ptr dummy_cleanable_; uint64_t file_size_; protected: // for testing diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,502 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +#ifndef ROCKSDB_LITE + +#include "table/sst_file_dumper.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "db/blob/blob_index.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "options/cf_options.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/utilities/ldb_cmd.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain/plain_table_factory.h" +#include "table/table_reader.h" +#include "util/compression.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +SstFileDumper::SstFileDumper(const Options& options, + const std::string& file_path, + size_t readahead_size, bool verify_checksum, + bool output_hex, bool decode_blob_index, + const EnvOptions& soptions, bool silent) + : file_name_(file_path), + read_num_(0), + output_hex_(output_hex), + decode_blob_index_(decode_blob_index), + soptions_(soptions), + silent_(silent), + options_(options), + ioptions_(options_), + moptions_(ColumnFamilyOptions(options_)), + read_options_(verify_checksum, false), + internal_comparator_(BytewiseComparator()) { + read_options_.readahead_size = readahead_size; + if (!silent_) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + } + init_result_ = GetTableReader(file_name_); +} + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; + +const char* testFileName = "test_file_name"; + +Status SstFileDumper::GetTableReader(const std::string& 
file_path) { + // Warning about 'magic_number' being uninitialized shows up only in UBsan + // builds. Though access is guarded by 's.ok()' checks, fix the issue to + // avoid any warnings. + uint64_t magic_number = Footer::kNullTableMagicNumber; + + // read table magic number + Footer footer; + + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file; + uint64_t file_size = 0; + Status s = fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); + if (s.ok()) { + s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); + } + + // check empty file + // if true, skip further processing of this file + if (file_size == 0) { + return Status::Aborted(file_path, "Empty file"); + } + + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); + + FilePrefetchBuffer prefetch_buffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, + false /* track_min_offset */); + if (s.ok()) { + const uint64_t kSstDumpTailPrefetchSize = 512 * 1024; + uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize) + ? 
kSstDumpTailPrefetchSize + : file_size; + uint64_t prefetch_off = file_size - prefetch_size; + IOOptions opts; + s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off, + static_cast(prefetch_size)); + + s = ReadFooterFromFile(opts, file_.get(), &prefetch_buffer, file_size, + &footer); + } + if (s.ok()) { + magic_number = footer.table_magic_number(); + } + + if (s.ok()) { + if (magic_number == kPlainTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + + fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); + } + options_.comparator = &internal_comparator_; + // For old sst format, ReadTableProperties might fail but file can be read + if (ReadTableProperties(magic_number, file_.get(), file_size, + (magic_number == kBlockBasedTableMagicNumber) + ? &prefetch_buffer + : nullptr) + .ok()) { + s = SetTableOptionsByMagicNumber(magic_number); + } else { + s = SetOldTableOptions(); + } + } + + if (s.ok()) { + s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size, + &table_reader_); + } + return s; +} + +Status SstFileDumper::NewTableReader( + const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, + const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, + std::unique_ptr* /*table_reader*/) { + auto t_opt = + TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_, + internal_comparator_, false /* skip_filters */, + false /* imortal */, true /* force_direct_prefetch */); + // Allow open file with global sequence number for backward compatibility. 
+ t_opt.largest_seqno = kMaxSequenceNumber; + + // We need to turn off pre-fetching of index and filter nodes for + // BlockBasedTable + if (options_.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName())) { + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_, + /*enable_prefetch=*/false); + } + + // For all other factory implementation + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_); +} + +Status SstFileDumper::VerifyChecksum() { + // We could pass specific readahead setting into read options if needed. + return table_reader_->VerifyChecksum(read_options_, + TableReaderCaller::kSSTDumpTool); +} + +Status SstFileDumper::DumpTable(const std::string& out_filename) { + std::unique_ptr out_file; + Env* env = options_.env; + Status s = env->NewWritableFile(out_filename, &out_file, soptions_); + if (s.ok()) { + s = table_reader_->DumpTable(out_file.get()); + } + if (!s.ok()) { + // close the file before return error, ignore the close error if there's any + out_file->Close().PermitUncheckedError(); + return s; + } + return out_file->Close(); +} + +Status SstFileDumper::CalculateCompressedTableSize( + const TableBuilderOptions& tb_options, size_t block_size, + uint64_t* num_data_blocks, uint64_t* compressed_table_size) { + std::unique_ptr env(NewMemEnv(options_.env)); + std::unique_ptr dest_writer; + Status s = + WritableFileWriter::Create(env->GetFileSystem(), testFileName, + FileOptions(soptions_), &dest_writer, nullptr); + if (!s.ok()) { + return s; + } + BlockBasedTableOptions table_options; + table_options.block_size = block_size; + BlockBasedTableFactory block_based_tf(table_options); + std::unique_ptr table_builder; + table_builder.reset(block_based_tf.NewTableBuilder( + tb_options, + dest_writer.get())); + std::unique_ptr iter(table_reader_->NewIterator( + read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, 
TableReaderCaller::kSSTDumpTool)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + table_builder->Add(iter->key(), iter->value()); + } + s = iter->status(); + if (!s.ok()) { + return s; + } + s = table_builder->Finish(); + if (!s.ok()) { + return s; + } + *compressed_table_size = table_builder->FileSize(); + assert(num_data_blocks != nullptr); + *num_data_blocks = table_builder->GetTableProperties().num_data_blocks; + return env->DeleteFile(testFileName); +} + +Status SstFileDumper::ShowAllCompressionSizes( + size_t block_size, + const std::vector>& + compression_types, + int32_t compress_level_from, int32_t compress_level_to, + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes) { + fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size); + for (auto& i : compression_types) { + if (CompressionTypeSupported(i.first)) { + fprintf(stdout, "Compression: %-24s\n", i.second); + CompressionOptions compress_opt; + compress_opt.max_dict_bytes = max_dict_bytes; + compress_opt.zstd_max_train_bytes = zstd_max_train_bytes; + compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes; + for (int32_t j = compress_level_from; j <= compress_level_to; j++) { + fprintf(stdout, "Compression level: %d", j); + compress_opt.level = j; + Status s = ShowCompressionSize(block_size, i.first, compress_opt); + if (!s.ok()) { + return s; + } + } + } else { + fprintf(stdout, "Unsupported compression type: %s.\n", i.second); + } + } + return Status::OK(); +} + +Status SstFileDumper::ShowCompressionSize( + size_t block_size, CompressionType compress_type, + const CompressionOptions& compress_opt) { + Options opts; + opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + opts.statistics->set_stats_level(StatsLevel::kAll); + const ImmutableOptions imoptions(opts); + const ColumnFamilyOptions cfo(opts); + const MutableCFOptions moptions(cfo); + ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); + 
IntTblPropCollectorFactories block_based_table_factories; + + std::string column_family_name; + int unknown_level = -1; + TableBuilderOptions tb_opts( + imoptions, moptions, ikc, &block_based_table_factories, compress_type, + compress_opt, + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, unknown_level); + uint64_t num_data_blocks = 0; + std::chrono::steady_clock::time_point start = + std::chrono::steady_clock::now(); + uint64_t file_size; + Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks, + &file_size); + if (!s.ok()) { + return s; + } + + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + fprintf(stdout, " Size: %10" PRIu64, file_size); + fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks); + fprintf(stdout, " Time Taken: %10s microsecs", + std::to_string( + std::chrono::duration_cast(end - start) + .count()) + .c_str()); + const uint64_t compressed_blocks = + opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED); + const uint64_t not_compressed_blocks = + opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED); + // When the option enable_index_compression is true, + // NUMBER_BLOCK_COMPRESSED is incremented for index block(s). + if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) { + num_data_blocks = compressed_blocks + not_compressed_blocks; + } + + const uint64_t ratio_not_compressed_blocks = + (num_data_blocks - compressed_blocks) - not_compressed_blocks; + const double compressed_pcnt = + (0 == num_data_blocks) ? 0.0 + : ((static_cast(compressed_blocks) / + static_cast(num_data_blocks)) * + 100.0); + const double ratio_not_compressed_pcnt = + (0 == num_data_blocks) + ? 0.0 + : ((static_cast(ratio_not_compressed_blocks) / + static_cast(num_data_blocks)) * + 100.0); + const double not_compressed_pcnt = + (0 == num_data_blocks) ? 
0.0 + : ((static_cast(not_compressed_blocks) / + static_cast(num_data_blocks)) * + 100.0); + fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks, + compressed_pcnt); + fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)", + ratio_not_compressed_blocks, ratio_not_compressed_pcnt); + fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n", + not_compressed_blocks, not_compressed_pcnt); + return Status::OK(); +} + +// Reads TableProperties prior to opening table reader in order to set up +// options. +Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, + RandomAccessFileReader* file, + uint64_t file_size, + FilePrefetchBuffer* prefetch_buffer) { + Status s = ROCKSDB_NAMESPACE::ReadTableProperties( + file, file_size, table_magic_number, ioptions_, &table_properties_, + /* memory_allocator= */ nullptr, prefetch_buffer); + if (!s.ok()) { + if (!silent_) { + fprintf(stdout, "Not able to read table properties\n"); + } + } + return s; +} + +Status SstFileDumper::SetTableOptionsByMagicNumber( + uint64_t table_magic_number) { + assert(table_properties_); + if (table_magic_number == kBlockBasedTableMagicNumber || + table_magic_number == kLegacyBlockBasedTableMagicNumber) { + BlockBasedTableFactory* bbtf = new BlockBasedTableFactory(); + // To force tail prefetching, we fake reporting two useful reads of 512KB + // from the tail. + // It needs at least two data points to warm up the stats. 
+ bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); + bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024); + + options_.table_factory.reset(bbtf); + if (!silent_) { + fprintf(stdout, "Sst file format: block-based\n"); + } + + auto& props = table_properties_->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + auto index_type_on_file = static_cast( + DecodeFixed32(pos->second.c_str())); + if (index_type_on_file == + BlockBasedTableOptions::IndexType::kHashSearch) { + options_.prefix_extractor.reset(NewNoopTransform()); + } + } + } else if (table_magic_number == kPlainTableMagicNumber || + table_magic_number == kLegacyPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 1; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + plain_table_options.full_scan_mode = true; + + options_.table_factory.reset(NewPlainTableFactory(plain_table_options)); + if (!silent_) { + fprintf(stdout, "Sst file format: plain table\n"); + } + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %lx", + (long)table_magic_number); + return Status::InvalidArgument(error_msg_buffer); + } + + return Status::OK(); +} + +Status SstFileDumper::SetOldTableOptions() { + assert(table_properties_ == nullptr); + options_.table_factory = std::make_shared(); + if (!silent_) { + fprintf(stdout, "Sst file format: block-based(old version)\n"); + } + + return Status::OK(); +} + +Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, + bool has_from, const std::string& from_key, + bool has_to, const std::string& to_key, + bool 
use_from_as_prefix) { + if (!table_reader_) { + return init_result_; + } + + InternalIterator* iter = table_reader_->NewIterator( + read_options_, moptions_.prefix_extractor.get(), + /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kSSTDumpTool); + uint64_t i = 0; + if (has_from) { + InternalKey ikey; + ikey.SetMinPossibleForUserKey(from_key); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) break; + + ParsedInternalKey ikey; + Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */); + if (!pik_status.ok()) { + std::cerr << pik_status.getState() << "\n"; + continue; + } + + // the key returned is not prefixed with out 'from' key + if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) { + break; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } else { + BlobIndex blob_index; + + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + fprintf(stderr, "%s => error decoding blob index\n", + ikey.DebugString(true, output_hex_).c_str()); + continue; + } + + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), + blob_index.DebugString(output_hex_).c_str()); + } + } + } + + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} + +// Provides TableProperties to API user +Status SstFileDumper::ReadTableProperties( + std::shared_ptr* table_properties) { + if (!table_reader_) { + return init_result_; + } + + *table_properties = table_reader_->GetTableProperties(); + return init_result_; +} +} // namespace ROCKSDB_NAMESPACE + 
+#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_dumper.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_dumper.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,97 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/dbformat.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" + +namespace ROCKSDB_NAMESPACE { + +class SstFileDumper { + public: + explicit SstFileDumper(const Options& options, const std::string& file_name, + size_t readahead_size, bool verify_checksum, + bool output_hex, bool decode_blob_index, + const EnvOptions& soptions = EnvOptions(), + bool silent = false); + + Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from, + const std::string& from_key, bool has_to, + const std::string& to_key, + bool use_from_as_prefix = false); + + Status ReadTableProperties( + std::shared_ptr* table_properties); + uint64_t GetReadNumber() { return read_num_; } + TableProperties* GetInitTableProperties() { return table_properties_.get(); } + + Status VerifyChecksum(); + Status DumpTable(const std::string& out_filename); + Status getStatus() { return init_result_; } + + Status ShowAllCompressionSizes( + size_t block_size, + const std::vector>& + compression_types, + int32_t compress_level_from, int32_t compress_level_to, + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes); + + Status ShowCompressionSize(size_t block_size, CompressionType compress_type, + const CompressionOptions& 
compress_opt); + + private: + // Get the TableReader implementation for the sst file + Status GetTableReader(const std::string& file_path); + Status ReadTableProperties(uint64_t table_magic_number, + RandomAccessFileReader* file, uint64_t file_size, + FilePrefetchBuffer* prefetch_buffer); + + Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options, + size_t block_size, + uint64_t* num_data_blocks, + uint64_t* compressed_table_size); + + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); + Status SetOldTableOptions(); + + // Helper function to call the factory with settings specific to the + // factory implementation + Status NewTableReader(const ImmutableOptions& ioptions, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + uint64_t file_size, + std::unique_ptr* table_reader); + + std::string file_name_; + uint64_t read_num_; + bool output_hex_; + bool decode_blob_index_; + EnvOptions soptions_; + // less verbose in stdout/stderr + bool silent_; + + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + + Status init_result_; + std::unique_ptr table_reader_; + std::unique_ptr file_; + + const ImmutableOptions ioptions_; + const MutableCFOptions moptions_; + ReadOptions read_options_; + InternalKeyComparator internal_comparator_; + std::unique_ptr table_properties_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,11 +7,13 @@ #include "rocksdb/sst_file_reader.h" +#include "db/arena_wrapped_db_iter.h" #include "db/db_iter.h" #include 
"db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "table/get_context.h" #include "table/table_builder.h" #include "table/table_reader.h" @@ -21,7 +23,7 @@ struct SstFileReader::Rep { Options options; EnvOptions soptions; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions moptions; std::unique_ptr table_reader; @@ -41,18 +43,20 @@ auto r = rep_.get(); Status s; uint64_t file_size = 0; - std::unique_ptr file; + std::unique_ptr file; std::unique_ptr file_reader; - s = r->options.env->GetFileSize(file_path, &file_size); + FileOptions fopts(r->soptions); + const auto& fs = r->options.env->GetFileSystem(); + + s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr); if (s.ok()) { - s = r->options.env->NewRandomAccessFile(file_path, &file, r->soptions); + s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); } if (s.ok()) { - file_reader.reset(new RandomAccessFileReader( - NewLegacyRandomAccessFileWrapper(file), file_path)); + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); } if (s.ok()) { - TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor.get(), + TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor, r->soptions, r->ioptions.internal_comparator); // Allow open file with global sequence number for backward compatibility. t_opt.largest_seqno = kMaxSequenceNumber; @@ -62,18 +66,24 @@ return s; } -Iterator* SstFileReader::NewIterator(const ReadOptions& options) { +Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { auto r = rep_.get(); - auto sequence = options.snapshot != nullptr - ? options.snapshot->GetSequenceNumber() + auto sequence = roptions.snapshot != nullptr + ? 
roptions.snapshot->GetSequenceNumber() : kMaxSequenceNumber; + ArenaWrappedDBIter* res = new ArenaWrappedDBIter(); + res->Init(r->options.env, roptions, r->ioptions, r->moptions, + nullptr /* version */, sequence, + r->moptions.max_sequential_skip_in_iterations, + 0 /* version_number */, nullptr /* read_callback */, + nullptr /* db_impl */, nullptr /* cfd */, + true /* expose_blob_index */, false /* allow_refresh */); auto internal_iter = r->table_reader->NewIterator( - options, r->moptions.prefix_extractor.get(), /*arena=*/nullptr, - /*skip_filters=*/false, TableReaderCaller::kSSTFileReader); - return NewDBIterator(r->options.env, options, r->ioptions, r->moptions, - r->ioptions.user_comparator, internal_iter, sequence, - r->moptions.max_sequential_skip_in_iterations, - nullptr /* read_callback */); + res->GetReadOptions(), r->moptions.prefix_extractor.get(), + res->GetArena(), false /* skip_filters */, + TableReaderCaller::kSSTFileReader); + res->SetIterUnderDBIter(internal_iter); + return res; } std::shared_ptr SstFileReader::GetTableProperties() diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_reader_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -5,10 +5,13 @@ #ifndef ROCKSDB_LITE +#include "rocksdb/sst_file_reader.h" + #include +#include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" -#include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" #include "test_util/testharness.h" @@ -34,11 +37,18 @@ SstFileReaderTest() { options_.merge_operator = MergeOperators::CreateUInt64AddOperator(); sst_name_ = test::PerThreadDBPath("sst_file"); + + Env* base_env = Env::Default(); + EXPECT_OK( + 
test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(nullptr, base_env); + env_ = base_env; + options_.env = env_; } ~SstFileReaderTest() { - Status s = Env::Default()->DeleteFile(sst_name_); - assert(s.ok()); + Status s = env_->DeleteFile(sst_name_); + EXPECT_OK(s); } void CreateFile(const std::string& file_name, @@ -76,6 +86,9 @@ if (check_global_seqno) { auto properties = reader.GetTableProperties(); ASSERT_TRUE(properties); + std::string hostname; + ASSERT_OK(env_->GetHostNameString(&hostname)); + ASSERT_EQ(properties->db_host_id, hostname); auto& user_properties = properties->user_collected_properties; ASSERT_TRUE( user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno)); @@ -91,6 +104,8 @@ Options options_; EnvOptions soptions_; std::string sst_name_; + std::shared_ptr env_guard_; + Env* env_; }; const uint64_t kNumKeys = 100; @@ -112,6 +127,31 @@ CreateFileAndCheck(keys); } +TEST_F(SstFileReaderTest, ReadOptionsOutOfScope) { + // Repro a bug where the SstFileReader depended on its configured ReadOptions + // outliving it. + options_.comparator = test::Uint64Comparator(); + std::vector keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFile(sst_name_, keys); + + SstFileReader reader(options_); + ASSERT_OK(reader.Open(sst_name_)); + std::unique_ptr iter; + { + // Make sure ReadOptions go out of scope ASAP so we know the iterator + // operations do not depend on it. 
+ ReadOptions ropts; + iter.reset(reader.NewIterator(ropts)); + } + iter->SeekToFirst(); + while (iter->Valid()) { + iter->Next(); + } +} + TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) { std::vector keys; for (uint64_t i = 0; i < kNumKeys; i++) { @@ -155,10 +195,230 @@ ASSERT_OK(DestroyDB(db_name, options)); } +TEST_F(SstFileReaderTest, TimestampSizeMismatch) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Comparator is not timestamp-aware; calls to APIs taking timestamps should + // fail. + ASSERT_NOK(writer.Put("key", EncodeAsUint64(100), "value")); + ASSERT_NOK(writer.Delete("another_key", EncodeAsUint64(200))); +} + +class SstFileReaderTimestampTest : public testing::Test { + public: + SstFileReaderTimestampTest() { + Env* env = Env::Default(); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard_)); + EXPECT_NE(nullptr, env); + + options_.env = env; + + options_.comparator = test::ComparatorWithU64Ts(); + + sst_name_ = test::PerThreadDBPath("sst_file_ts"); + } + + ~SstFileReaderTimestampTest() { + EXPECT_OK(options_.env->DeleteFile(sst_name_)); + } + + struct KeyValueDesc { + KeyValueDesc(std::string k, std::string ts, std::string v) + : key(std::move(k)), timestamp(std::move(ts)), value(std::move(v)) {} + + std::string key; + std::string timestamp; + std::string value; + }; + + struct InputKeyValueDesc : public KeyValueDesc { + InputKeyValueDesc(std::string k, std::string ts, std::string v, bool is_del, + bool use_contig_buf) + : KeyValueDesc(std::move(k), std::move(ts), std::move(v)), + is_delete(is_del), + use_contiguous_buffer(use_contig_buf) {} + + bool is_delete = false; + bool use_contiguous_buffer = false; + }; + + struct OutputKeyValueDesc : public KeyValueDesc { + OutputKeyValueDesc(std::string k, std::string ts, std::string v) + : KeyValueDesc(std::move(k), std::string(ts), std::string(v)) {} + }; + + void CreateFile(const std::vector& descs) { + SstFileWriter writer(soptions_, 
options_); + + ASSERT_OK(writer.Open(sst_name_)); + + for (const auto& desc : descs) { + if (desc.is_delete) { + if (desc.use_contiguous_buffer) { + std::string key_with_ts(desc.key + desc.timestamp); + ASSERT_OK(writer.Delete(Slice(key_with_ts.data(), desc.key.size()), + Slice(key_with_ts.data() + desc.key.size(), + desc.timestamp.size()))); + } else { + ASSERT_OK(writer.Delete(desc.key, desc.timestamp)); + } + } else { + if (desc.use_contiguous_buffer) { + std::string key_with_ts(desc.key + desc.timestamp); + ASSERT_OK(writer.Put(Slice(key_with_ts.data(), desc.key.size()), + Slice(key_with_ts.data() + desc.key.size(), + desc.timestamp.size()), + desc.value)); + } else { + ASSERT_OK(writer.Put(desc.key, desc.timestamp, desc.value)); + } + } + } + + ASSERT_OK(writer.Finish()); + } + + void CheckFile(const std::string& timestamp, + const std::vector& descs) { + SstFileReader reader(options_); + + ASSERT_OK(reader.Open(sst_name_)); + ASSERT_OK(reader.VerifyChecksum()); + + Slice ts_slice(timestamp); + + ReadOptions read_options; + read_options.timestamp = &ts_slice; + + std::unique_ptr iter(reader.NewIterator(read_options)); + iter->SeekToFirst(); + + for (const auto& desc : descs) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), desc.key); + ASSERT_EQ(iter->timestamp(), desc.timestamp); + ASSERT_EQ(iter->value(), desc.value); + + iter->Next(); + } + + ASSERT_FALSE(iter->Valid()); + } + + protected: + std::shared_ptr env_guard_; + Options options_; + EnvOptions soptions_; + std::string sst_name_; +}; + +TEST_F(SstFileReaderTimestampTest, Basic) { + std::vector input_descs; + + for (uint64_t k = 0; k < kNumKeys; k += 4) { + // A Put with key k, timestamp k that gets overwritten by a subsequent Put + // with timestamp (k + 1). Note that the comparator uses descending order + // for the timestamp part, so we add the later Put first. 
+ input_descs.emplace_back( + /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k + 1), + /* value */ EncodeAsString(k * 2), /* is_delete */ false, + /* use_contiguous_buffer */ false); + input_descs.emplace_back( + /* key */ EncodeAsString(k), /* timestamp */ EncodeAsUint64(k), + /* value */ EncodeAsString(k * 3), /* is_delete */ false, + /* use_contiguous_buffer */ true); + + // A Put with key (k + 2), timestamp (k + 2) that gets cancelled out by a + // Delete with timestamp (k + 3). Note that the comparator uses descending + // order for the timestamp part, so we add the Delete first. + input_descs.emplace_back(/* key */ EncodeAsString(k + 2), + /* timestamp */ EncodeAsUint64(k + 3), + /* value */ std::string(), /* is_delete */ true, + /* use_contiguous_buffer */ (k % 8) == 0); + input_descs.emplace_back( + /* key */ EncodeAsString(k + 2), /* timestamp */ EncodeAsUint64(k + 2), + /* value */ EncodeAsString(k * 5), /* is_delete */ false, + /* use_contiguous_buffer */ (k % 8) != 0); + } + + CreateFile(input_descs); + + // Note: below, we check the results as of each timestamp in the range, + // updating the expected result as needed. 
+ std::vector output_descs; + + for (uint64_t ts = 0; ts < kNumKeys; ++ts) { + const uint64_t k = ts - (ts % 4); + + switch (ts % 4) { + case 0: // Initial Put for key k + output_descs.emplace_back(/* key */ EncodeAsString(k), + /* timestamp */ EncodeAsUint64(ts), + /* value */ EncodeAsString(k * 3)); + break; + + case 1: // Second Put for key k + assert(output_descs.back().key == EncodeAsString(k)); + assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1)); + assert(output_descs.back().value == EncodeAsString(k * 3)); + output_descs.back().timestamp = EncodeAsUint64(ts); + output_descs.back().value = EncodeAsString(k * 2); + break; + + case 2: // Put for key (k + 2) + output_descs.emplace_back(/* key */ EncodeAsString(k + 2), + /* timestamp */ EncodeAsUint64(ts), + /* value */ EncodeAsString(k * 5)); + break; + + case 3: // Delete for key (k + 2) + assert(output_descs.back().key == EncodeAsString(k + 2)); + assert(output_descs.back().timestamp == EncodeAsUint64(ts - 1)); + assert(output_descs.back().value == EncodeAsString(k * 5)); + output_descs.pop_back(); + break; + } + + CheckFile(EncodeAsUint64(ts), output_descs); + } +} + +TEST_F(SstFileReaderTimestampTest, TimestampsOutOfOrder) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Note: KVs that have the same user key disregarding timestamps should be in + // descending order of timestamps. + ASSERT_OK(writer.Put("key", EncodeAsUint64(1), "value1")); + ASSERT_NOK(writer.Put("key", EncodeAsUint64(2), "value2")); +} + +TEST_F(SstFileReaderTimestampTest, TimestampSizeMismatch) { + SstFileWriter writer(soptions_, options_); + + ASSERT_OK(writer.Open(sst_name_)); + + // Comparator expects 64-bit timestamps; timestamps with other sizes as well + // as calls to the timestamp-less APIs should be rejected. 
+ ASSERT_NOK(writer.Put("key", "not_an_actual_64_bit_timestamp", "value")); + ASSERT_NOK(writer.Delete("another_key", "timestamp_of_unexpected_size")); + + ASSERT_NOK(writer.Put("key_without_timestamp", "value")); + ASSERT_NOK(writer.Merge("another_key_missing_a_timestamp", "merge_operand")); + ASSERT_NOK(writer.Delete("yet_another_key_still_no_timestamp")); + ASSERT_NOK(writer.DeleteRange("begin_key_timestamp_absent", + "end_key_with_a_complete_lack_of_timestamps")); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,9 +7,10 @@ #include +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/writable_file_writer.h" +#include "rocksdb/file_system.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" @@ -29,7 +30,8 @@ struct SstFileWriter::Rep { Rep(const EnvOptions& _env_options, const Options& options, Env::IOPriority _io_priority, const Comparator* _user_comparator, - ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters) + ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters, + std::string _db_session_id) : env_options(_env_options), ioptions(options), mutable_cf_options(options), @@ -37,13 +39,13 @@ internal_comparator(_user_comparator), cfh(_cfh), invalidate_page_cache(_invalidate_page_cache), - last_fadvise_size(0), - skip_filters(_skip_filters) {} + 
skip_filters(_skip_filters), + db_session_id(_db_session_id) {} std::unique_ptr file_writer; std::unique_ptr builder; EnvOptions env_options; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions mutable_cf_options; Env::IOPriority io_priority; InternalKeyComparator internal_comparator; @@ -56,10 +58,13 @@ bool invalidate_page_cache; // The size of the file during the last time we called Fadvise to remove // cached pages from page cache. - uint64_t last_fadvise_size; + uint64_t last_fadvise_size = 0; bool skip_filters; - Status Add(const Slice& user_key, const Slice& value, - const ValueType value_type) { + std::string db_session_id; + uint64_t next_file_number = 1; + + Status AddImpl(const Slice& user_key, const Slice& value, + ValueType value_type) { if (!builder) { return Status::InvalidArgument("File is not opened"); } @@ -75,23 +80,14 @@ } } - // TODO(tec) : For external SST files we could omit the seqno and type. - switch (value_type) { - case ValueType::kTypeValue: - ikey.Set(user_key, 0 /* Sequence Number */, - ValueType::kTypeValue /* Put */); - break; - case ValueType::kTypeMerge: - ikey.Set(user_key, 0 /* Sequence Number */, - ValueType::kTypeMerge /* Merge */); - break; - case ValueType::kTypeDeletion: - ikey.Set(user_key, 0 /* Sequence Number */, - ValueType::kTypeDeletion /* Delete */); - break; - default: - return Status::InvalidArgument("Value type is not supported"); - } + assert(value_type == kTypeValue || value_type == kTypeMerge || + value_type == kTypeDeletion || + value_type == kTypeDeletionWithTimestamp); + + constexpr SequenceNumber sequence_number = 0; + + ikey.Set(user_key, sequence_number, value_type); + builder->Add(ikey.Encode(), value); // update file info @@ -99,12 +95,46 @@ file_info.largest_key.assign(user_key.data(), user_key.size()); file_info.file_size = builder->FileSize(); - InvalidatePageCache(false /* closing */); - + InvalidatePageCache(false /* closing */).PermitUncheckedError(); return Status::OK(); 
} + Status Add(const Slice& user_key, const Slice& value, ValueType value_type) { + if (internal_comparator.timestamp_size() != 0) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + return AddImpl(user_key, value, value_type); + } + + Status Add(const Slice& user_key, const Slice& timestamp, const Slice& value, + ValueType value_type) { + const size_t timestamp_size = timestamp.size(); + + if (internal_comparator.timestamp_size() != timestamp_size) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + + const size_t user_key_size = user_key.size(); + + if (user_key.data() + user_key_size == timestamp.data()) { + Slice user_key_with_ts(user_key.data(), user_key_size + timestamp_size); + return AddImpl(user_key_with_ts, value, value_type); + } + + std::string user_key_with_ts; + user_key_with_ts.reserve(user_key_size + timestamp_size); + user_key_with_ts.append(user_key.data(), user_key_size); + user_key_with_ts.append(timestamp.data(), timestamp_size); + + return AddImpl(user_key_with_ts, value, value_type); + } + Status DeleteRange(const Slice& begin_key, const Slice& end_key) { + if (internal_comparator.timestamp_size() != 0) { + return Status::InvalidArgument("Timestamp size mismatch"); + } + if (!builder) { return Status::InvalidArgument("File is not opened"); } @@ -135,27 +165,32 @@ file_info.num_range_del_entries++; file_info.file_size = builder->FileSize(); - InvalidatePageCache(false /* closing */); - + InvalidatePageCache(false /* closing */).PermitUncheckedError(); return Status::OK(); } - void InvalidatePageCache(bool closing) { + Status InvalidatePageCache(bool closing) { + Status s = Status::OK(); if (invalidate_page_cache == false) { // Fadvise disabled - return; + return s; } uint64_t bytes_since_last_fadvise = builder->FileSize() - last_fadvise_size; if (bytes_since_last_fadvise > kFadviseTrigger || closing) { TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache", &(bytes_since_last_fadvise)); - // Tell 
the OS that we dont need this file in page cache - file_writer->InvalidateCache(0, 0); + // Tell the OS that we don't need this file in page cache + s = file_writer->InvalidateCache(0, 0); + if (s.IsNotSupported()) { + // NotSupported is fine as it could be a file type that doesn't use page + // cache. + s = Status::OK(); + } last_fadvise_size = builder->FileSize(); } + return s; } - }; SstFileWriter::SstFileWriter(const EnvOptions& env_options, @@ -165,7 +200,14 @@ bool invalidate_page_cache, Env::IOPriority io_priority, bool skip_filters) : rep_(new Rep(env_options, options, io_priority, user_comparator, - column_family, invalidate_page_cache, skip_filters)) { + column_family, invalidate_page_cache, skip_filters, + DBImpl::GenerateDbSessionId(options.env))) { + // SstFileWriter is used to create sst files that can be added to database + // later. Therefore, no real db_id and db_session_id are associated with it. + // Here we mimic the way db_session_id behaves by getting a db_session_id + // for each SstFileWriter, and (later below) assign unique file numbers + // in the table properties. The db_id is set to be "SST Writer" for clarity. 
+ rep_->file_info.file_size = 0; } @@ -180,8 +222,10 @@ Status SstFileWriter::Open(const std::string& file_path) { Rep* r = rep_.get(); Status s; - std::unique_ptr sst_file; - s = r->ioptions.env->NewWritableFile(file_path, &sst_file, r->env_options); + std::unique_ptr sst_file; + FileOptions cur_file_opts(r->env_options); + s = r->ioptions.env->GetFileSystem()->NewWritableFile( + file_path, cur_file_opts, &sst_file, nullptr); if (!s.ok()) { return s; } @@ -190,26 +234,24 @@ CompressionType compression_type; CompressionOptions compression_opts; - if (r->ioptions.bottommost_compression != kDisableCompressionOption) { - compression_type = r->ioptions.bottommost_compression; - if (r->ioptions.bottommost_compression_opts.enabled) { - compression_opts = r->ioptions.bottommost_compression_opts; + if (r->mutable_cf_options.bottommost_compression != + kDisableCompressionOption) { + compression_type = r->mutable_cf_options.bottommost_compression; + if (r->mutable_cf_options.bottommost_compression_opts.enabled) { + compression_opts = r->mutable_cf_options.bottommost_compression_opts; } else { - compression_opts = r->ioptions.compression_opts; + compression_opts = r->mutable_cf_options.compression_opts; } } else if (!r->ioptions.compression_per_level.empty()) { // Use the compression of the last level if we have per level compression compression_type = *(r->ioptions.compression_per_level.rbegin()); - compression_opts = r->ioptions.compression_opts; + compression_opts = r->mutable_cf_options.compression_opts; } else { compression_type = r->mutable_cf_options.compression; - compression_opts = r->ioptions.compression_opts; + compression_opts = r->mutable_cf_options.compression_opts; } - uint64_t sample_for_compression = - r->mutable_cf_options.sample_for_compression; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; // SstFileWriter properties collector to add SstFileWriter version. 
int_tbl_prop_collector_factories.emplace_back( @@ -236,21 +278,33 @@ r->column_family_name = ""; cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; } - TableBuilderOptions table_builder_options( r->ioptions, r->mutable_cf_options, r->internal_comparator, - &int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, r->skip_filters, - r->column_family_name, unknown_level); - r->file_writer.reset( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(sst_file)), - file_path, r->env_options, r->ioptions.env, - nullptr /* stats */, r->ioptions.listeners)); + &int_tbl_prop_collector_factories, compression_type, compression_opts, + cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, + TableFileCreationReason::kMisc, 0 /* creation_time */, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "SST Writer" /* db_id */, r->db_session_id, 0 /* target_file_size */, + r->next_file_number); + // External SST files used to each get a unique session id. Now for + // slightly better uniqueness probability in constructing cache keys, we + // assign fake file numbers to each file (into table properties) and keep + // the same session id for the life of the SstFileWriter. + r->next_file_number++; + // XXX: when we can remove skip_filters from the SstFileWriter public API + // we can remove it from TableBuilderOptions. + table_builder_options.skip_filters = r->skip_filters; + FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; + r->file_writer.reset(new WritableFileWriter( + std::move(sst_file), file_path, r->env_options, r->ioptions.clock, + nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, + r->ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile), false)); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. 
r->builder.reset(r->ioptions.table_factory->NewTableBuilder( - table_builder_options, cf_id, r->file_writer.get())); + table_builder_options, r->file_writer.get())); r->file_info = ExternalSstFileInfo(); r->file_info.file_path = file_path; @@ -266,6 +320,11 @@ return rep_->Add(user_key, value, ValueType::kTypeValue); } +Status SstFileWriter::Put(const Slice& user_key, const Slice& timestamp, + const Slice& value) { + return rep_->Add(user_key, timestamp, value, ValueType::kTypeValue); +} + Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) { return rep_->Add(user_key, value, ValueType::kTypeMerge); } @@ -274,6 +333,11 @@ return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion); } +Status SstFileWriter::Delete(const Slice& user_key, const Slice& timestamp) { + return rep_->Add(user_key, timestamp, Slice(), + ValueType::kTypeDeletionWithTimestamp); +} + Status SstFileWriter::DeleteRange(const Slice& begin_key, const Slice& end_key) { return rep_->DeleteRange(begin_key, end_key); @@ -294,11 +358,16 @@ if (s.ok()) { s = r->file_writer->Sync(r->ioptions.use_fsync); - r->InvalidatePageCache(true /* closing */); + r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); if (s.ok()) { s = r->file_writer->Close(); } } + if (s.ok()) { + r->file_info.file_checksum = r->file_writer->GetFileChecksum(); + r->file_info.file_checksum_func_name = + r->file_writer->GetFileChecksumFuncName(); + } if (!s.ok()) { r->ioptions.env->DeleteFile(r->file_info.file_path); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/sst_file_writer_collectors.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,9 +5,10 @@ #pragma once #include -#include "db/dbformat.h" + #include 
"db/table_properties_collector.h" #include "rocksdb/types.h" +#include "util/coding.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -35,9 +36,9 @@ return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. return; @@ -78,7 +79,7 @@ : version_(version), global_seqno_(global_seqno) {} virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t /*column_family_id*/) override { + uint32_t /*column_family_id*/, int /* level_at_creation */) override { return new SstFileWriterPropertiesCollector(version_, global_seqno_); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_builder.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_builder.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_builder.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_builder.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,9 +10,11 @@ #pragma once #include + #include #include #include + #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "file/writable_file_writer.h" @@ -28,92 +30,133 @@ struct TableReaderOptions { // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, - const SliceTransform* _prefix_extractor, - const EnvOptions& _env_options, - const InternalKeyComparator& _internal_comparator, - bool _skip_filters = false, bool _immortal = false, - int _level = -1, - BlockCacheTracer* const _block_cache_tracer = nullptr) - : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, - _internal_comparator, _skip_filters, _immortal, - _level, 0 /* _largest_seqno 
*/, - _block_cache_tracer) {} + TableReaderOptions( + const ImmutableOptions& _ioptions, + const std::shared_ptr& _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters = false, bool _immortal = false, + bool _force_direct_prefetch = false, int _level = -1, + BlockCacheTracer* const _block_cache_tracer = nullptr, + size_t _max_file_size_for_l0_meta_pin = 0, + const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0) + : TableReaderOptions( + _ioptions, _prefix_extractor, _env_options, _internal_comparator, + _skip_filters, _immortal, _force_direct_prefetch, _level, + 0 /* _largest_seqno */, _block_cache_tracer, + _max_file_size_for_l0_meta_pin, _cur_db_session_id, _cur_file_num) { + } // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, - const SliceTransform* _prefix_extractor, - const EnvOptions& _env_options, - const InternalKeyComparator& _internal_comparator, - bool _skip_filters, bool _immortal, int _level, - SequenceNumber _largest_seqno, - BlockCacheTracer* const _block_cache_tracer) + TableReaderOptions( + const ImmutableOptions& _ioptions, + const std::shared_ptr& _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, bool _skip_filters, + bool _immortal, bool _force_direct_prefetch, int _level, + SequenceNumber _largest_seqno, + BlockCacheTracer* const _block_cache_tracer, + size_t _max_file_size_for_l0_meta_pin, + const std::string& _cur_db_session_id, uint64_t _cur_file_num) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), internal_comparator(_internal_comparator), skip_filters(_skip_filters), immortal(_immortal), + force_direct_prefetch(_force_direct_prefetch), level(_level), largest_seqno(_largest_seqno), - block_cache_tracer(_block_cache_tracer) {} + block_cache_tracer(_block_cache_tracer), + 
max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin), + cur_db_session_id(_cur_db_session_id), + cur_file_num(_cur_file_num) {} - const ImmutableCFOptions& ioptions; - const SliceTransform* prefix_extractor; + const ImmutableOptions& ioptions; + const std::shared_ptr& prefix_extractor; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; // This is only used for BlockBasedTable (reader) bool skip_filters; // Whether the table will be valid as long as the DB is open bool immortal; - // what level this table/file is on, -1 for "not set, don't know" + // When data prefetching is needed, even if direct I/O is off, read data to + // fetch into RocksDB's buffer, rather than relying + // RandomAccessFile::Prefetch(). + bool force_direct_prefetch; + // What level this table/file is on, -1 for "not set, don't know." Used + // for level-specific statistics. int level; // largest seqno in the table SequenceNumber largest_seqno; BlockCacheTracer* const block_cache_tracer; + // Largest L0 file size whose meta-blocks may be pinned (can be zero when + // unknown). 
+ const size_t max_file_size_for_l0_meta_pin; + + std::string cur_db_session_id; + + uint64_t cur_file_num; }; struct TableBuilderOptions { TableBuilderOptions( - const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, const InternalKeyComparator& _internal_comparator, - const std::vector>* - _int_tbl_prop_collector_factories, - CompressionType _compression_type, uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, bool _skip_filters, + const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, - const uint64_t _target_file_size = 0, - const uint64_t _file_creation_time = 0) + const uint64_t _file_creation_time = 0, const std::string& _db_id = "", + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) : ioptions(_ioptions), moptions(_moptions), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), compression_type(_compression_type), - sample_for_compression(_sample_for_compression), compression_opts(_compression_opts), - skip_filters(_skip_filters), + column_family_id(_column_family_id), column_family_name(_column_family_name), - level(_level), creation_time(_creation_time), oldest_key_time(_oldest_key_time), target_file_size(_target_file_size), - file_creation_time(_file_creation_time) {} - const ImmutableCFOptions& ioptions; + file_creation_time(_file_creation_time), + db_id(_db_id), + db_session_id(_db_session_id), + level_at_creation(_level), + is_bottommost(_is_bottommost), + 
reason(_reason), + cur_file_num(_cur_file_num) {} + + const ImmutableOptions& ioptions; const MutableCFOptions& moptions; const InternalKeyComparator& internal_comparator; - const std::vector>* - int_tbl_prop_collector_factories; - CompressionType compression_type; - uint64_t sample_for_compression; + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories; + const CompressionType compression_type; const CompressionOptions& compression_opts; - bool skip_filters; // only used by BlockBasedTableBuilder + const uint32_t column_family_id; const std::string& column_family_name; - int level; // what level this table/file is on, -1 for "not set, don't know" const uint64_t creation_time; const int64_t oldest_key_time; const uint64_t target_file_size; const uint64_t file_creation_time; + const std::string db_id; + const std::string db_session_id; + // BEGIN for FilterBuildingContext + const int level_at_creation; + const bool is_bottommost; + const TableFileCreationReason reason; + // END for FilterBuildingContext + + // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you + // want to skip filters, that should be (for example) null filter_policy + // in the table options of the ioptions.table_factory + bool skip_filters = false; + const uint64_t cur_file_num; }; // TableBuilder provides the interface used to build a Table @@ -136,6 +179,9 @@ // Return non-ok iff some error has been detected. virtual Status status() const = 0; + // Return non-ok iff some error happens during IO. + virtual IOStatus io_status() const = 0; + // Finish building the table. // REQUIRES: Finish(), Abandon() have not been called virtual Status Finish() = 0; @@ -149,10 +195,21 @@ // Number of calls to Add() so far. virtual uint64_t NumEntries() const = 0; + // Whether the output file is completely empty. It has neither entries + // or tombstones. 
+ virtual bool IsEmpty() const { + return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0; + } + // Size of the file generated so far. If invoked after a successful // Finish() call, returns the size of the final generated file. virtual uint64_t FileSize() const = 0; + // Estimated size of the file generated so far. This is used when + // FileSize() cannot estimate final SST size, e.g. parallel compression + // is enabled. + virtual uint64_t EstimatedFileSize() const { return FileSize(); } + // If the user defined table properties collector suggest the file to // be further compacted. virtual bool NeedCompact() const { return false; } @@ -161,7 +218,7 @@ virtual TableProperties GetTableProperties() const = 0; // Return file checksum - virtual const std::string& GetFileChecksum() const = 0; + virtual std::string GetFileChecksum() const = 0; // Return file checksum function name virtual const char* GetFileChecksumFuncName() const = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_factory.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_factory.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_factory.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_factory.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/plain/plain_table_factory.h" + +namespace ROCKSDB_NAMESPACE { + +static void RegisterTableFactories(const std::string& /*arg*/) { +#ifndef ROCKSDB_LITE + static std::once_flag loaded; + std::call_once(loaded, []() { + auto library = ObjectLibrary::Default(); + library->AddFactory( + TableFactory::kBlockBasedTableName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new BlockBasedTableFactory()); + return guard->get(); + }); + library->AddFactory( + TableFactory::kPlainTableName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new PlainTableFactory()); + return guard->get(); + }); + library->AddFactory( + TableFactory::kCuckooTableName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new CuckooTableFactory()); + return guard->get(); + }); + }); +#endif // ROCKSDB_LITE +} + +static bool LoadFactory(const std::string& name, + std::shared_ptr* factory) { + if (name == TableFactory::kBlockBasedTableName()) { + factory->reset(new BlockBasedTableFactory()); + return true; + } else { + return false; + } +} + +Status TableFactory::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* factory) { + RegisterTableFactories(""); + return LoadSharedObject(config_options, value, LoadFactory, + factory); +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties.cc 2025-01-30 
11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,10 @@ #include "port/port.h" #include "rocksdb/env.h" -#include "rocksdb/iterator.h" -#include "table/block_based/block.h" -#include "table/internal_iterator.h" +#include "rocksdb/unique_id.h" #include "table/table_properties_internal.h" +#include "table/unique_id_impl.h" +#include "util/random.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -42,31 +42,6 @@ props, key, ToString(value), prop_delim, kv_delim ); } - - // Seek to the specified meta block. - // Return true if it successfully seeks to that block. - Status SeekToMetaBlock(InternalIterator* meta_iter, - const std::string& block_name, bool* is_found, - BlockHandle* block_handle = nullptr) { - if (block_handle != nullptr) { - *block_handle = BlockHandle::NullBlockHandle(); - } - *is_found = true; - meta_iter->Seek(block_name); - if (meta_iter->status().ok()) { - if (meta_iter->Valid() && meta_iter->key() == block_name) { - *is_found = true; - if (block_handle) { - Slice v = meta_iter->value(); - return block_handle->DecodeFrom(&v); - } - } else { - *is_found = false; - return Status::OK(); - } - } - return meta_iter->status(); - } } std::string TableProperties::ToString( @@ -111,6 +86,8 @@ } AppendProperty(result, "filter block size", filter_size, prop_delim, kv_delim); + AppendProperty(result, "# entries for filter", num_filter_entries, prop_delim, + kv_delim); AppendProperty(result, "(estimated) table size", data_size + index_size + filter_size, prop_delim, kv_delim); @@ -168,6 +145,26 @@ AppendProperty(result, "file creation time", file_creation_time, prop_delim, kv_delim); + AppendProperty(result, "slow compression estimated data size", + slow_compression_estimated_data_size, prop_delim, kv_delim); + AppendProperty(result, "fast compression estimated data size", + fast_compression_estimated_data_size, prop_delim, kv_delim); + + // DB identity 
and DB session ID + AppendProperty(result, "DB identity", db_id, prop_delim, kv_delim); + AppendProperty(result, "DB session identity", db_session_id, prop_delim, + kv_delim); + AppendProperty(result, "DB host id", db_host_id, prop_delim, kv_delim); + AppendProperty(result, "original file number", orig_file_number, prop_delim, + kv_delim); + + // Unique ID, when available + std::string id; + Status s = GetUniqueIdFromTableProperties(*this, &id); + AppendProperty(result, "unique ID", + s.ok() ? UniqueIdToHumanString(id) : "N/A", prop_delim, + kv_delim); + return result; } @@ -183,11 +180,46 @@ raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_filter_entries += tp.num_filter_entries; num_deletions += tp.num_deletions; num_merge_operands += tp.num_merge_operands; num_range_deletions += tp.num_range_deletions; + slow_compression_estimated_data_size += + tp.slow_compression_estimated_data_size; + fast_compression_estimated_data_size += + tp.fast_compression_estimated_data_size; } +std::map +TableProperties::GetAggregatablePropertiesAsMap() const { + std::map rv; + rv["data_size"] = data_size; + rv["index_size"] = index_size; + rv["index_partitions"] = index_partitions; + rv["top_level_index_size"] = top_level_index_size; + rv["filter_size"] = filter_size; + rv["raw_key_size"] = raw_key_size; + rv["raw_value_size"] = raw_value_size; + rv["num_data_blocks"] = num_data_blocks; + rv["num_entries"] = num_entries; + rv["num_filter_entries"] = num_filter_entries; + rv["num_deletions"] = num_deletions; + rv["num_merge_operands"] = num_merge_operands; + rv["num_range_deletions"] = num_range_deletions; + rv["slow_compression_estimated_data_size"] = + slow_compression_estimated_data_size; + rv["fast_compression_estimated_data_size"] = + fast_compression_estimated_data_size; + return rv; +} + +const std::string TablePropertiesNames::kDbId = "rocksdb.creating.db.identity"; +const std::string 
TablePropertiesNames::kDbSessionId = + "rocksdb.creating.session.identity"; +const std::string TablePropertiesNames::kDbHostId = + "rocksdb.creating.host.identity"; +const std::string TablePropertiesNames::kOriginalFileNumber = + "rocksdb.original.file.number"; const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size"; const std::string TablePropertiesNames::kIndexSize = @@ -210,6 +242,8 @@ "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumFilterEntries = + "rocksdb.num.filter_entries"; const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; const std::string TablePropertiesNames::kMergeOperands = "rocksdb.merge.operands"; @@ -240,33 +274,32 @@ "rocksdb.oldest.key.time"; const std::string TablePropertiesNames::kFileCreationTime = "rocksdb.file.creation.time"; +const std::string TablePropertiesNames::kSlowCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.slow.data.size"; +const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.fast.data.size"; + +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props) { + Random* r = Random::GetTLSInstance(); + // For now, TableProperties is composed of a number of uint64_t followed by + // a number of std::string, followed by some extras starting with + // user_collected_properties. 
+ uint64_t* pu = &props->orig_file_number; + assert(static_cast(pu) == static_cast(props)); + std::string* ps = &props->db_id; + const uint64_t* const pu_end = reinterpret_cast(ps); + const std::string* const ps_end = + reinterpret_cast(&props->user_collected_properties); -extern const std::string kPropertiesBlock = "rocksdb.properties"; -// Old property block name for backward compatibility -extern const std::string kPropertiesBlockOldName = "rocksdb.stats"; -extern const std::string kCompressionDictBlock = "rocksdb.compression_dict"; -extern const std::string kRangeDelBlock = "rocksdb.range_del"; - -// Seek to the properties block. -// Return true if it successfully seeks to the properties block. -Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) { - Status status = SeekToMetaBlock(meta_iter, kPropertiesBlock, is_found); - if (!*is_found && status.ok()) { - status = SeekToMetaBlock(meta_iter, kPropertiesBlockOldName, is_found); + for (; pu < pu_end; ++pu) { + *pu = r->Next64(); + } + assert(static_cast(pu) == static_cast(ps)); + for (; ps < ps_end; ++ps) { + *ps = r->RandomBinaryString(13); } - return status; -} - -// Seek to the compression dictionary block. -// Return true if it successfully seeks to that block. 
-Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle) { - return SeekToMetaBlock(meta_iter, kCompressionDictBlock, is_found, block_handle); -} - -Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle = nullptr) { - return SeekToMetaBlock(meta_iter, kRangeDelBlock, is_found, block_handle); } +#endif } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties_internal.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties_internal.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_properties_internal.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_properties_internal.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,26 +5,10 @@ #pragma once -#include "rocksdb/status.h" -#include "rocksdb/iterator.h" +#include "rocksdb/table_properties.h" namespace ROCKSDB_NAMESPACE { - -class BlockHandle; - -// Seek to the properties block. -// If it successfully seeks to the properties block, "is_found" will be -// set to true. -Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found); - -// Seek to the compression dictionary block. -// If it successfully seeks to the properties block, "is_found" will be -// set to true. 
-Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle); - -// TODO(andrewkr) should not put all meta block in table_properties.h/cc -Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found, - BlockHandle* block_handle); - +#ifndef NDEBUG +void TEST_SetRandomTableProperties(TableProperties* props); +#endif } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader.h 2025-05-19 16:14:27.000000000 +0000 @@ -39,6 +39,8 @@ // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). + // + // read_options: Must outlive the returned iterator. // arena: If not null, the arena needs to be used to allocate the Iterator. // When destroying the iterator, the caller will not call "delete" // but Iterator::~Iterator() directly. The destructor needs to destroy @@ -48,9 +50,10 @@ // compaction_readahead_size: its value will only be used if caller = // kCompaction virtual InternalIterator* NewIterator( - const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena, - bool skip_filters, TableReaderCaller caller, - size_t compaction_readahead_size = 0) = 0; + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) = 0; virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { @@ -63,12 +66,19 @@ // bytes, and so includes effects like compression of the underlying data. 
// E.g., the approximate offset of the last key in the table will // be close to the file length. + // TODO(peterd): Since this function is only used for approximate size + // from beginning of file, reduce code duplication by removing this + // function and letting ApproximateSize take optional start and end, so + // that absolute start and end can be specified and optimized without + // key / index work. virtual uint64_t ApproximateOffsetOf(const Slice& key, TableReaderCaller caller) = 0; // Given start and end keys, return the approximate data size in the file // between the keys. The returned value is in terms of file bytes, and so - // includes effects like compression of the underlying data. + // includes effects like compression of the underlying data and applicable + // portions of metadata including filters and indexes. Nullptr for start or + // end (or both) indicates absolute start or end of the table. virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, TableReaderCaller caller) = 0; diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_bench.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_bench.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_bench.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_bench.cc 2025-05-19 16:14:27.000000000 +0000 @@ -13,11 +13,12 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" @@ -50,8 +51,8 @@ return key.Encode().ToString(); } -uint64_t Now(Env* env, bool measured_by_nanosecond) { - return measured_by_nanosecond ? 
env->NowNanos() : env->NowMicros(); +uint64_t Now(SystemClock* clock, bool measured_by_nanosecond) { + return measured_by_nanosecond ? clock->NowNanos() : clock->NowMicros(); } } // namespace @@ -81,30 +82,28 @@ std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db"); WriteOptions wo; Env* env = Env::Default(); + auto* clock = env->GetSystemClock().get(); TableBuilder* tb = nullptr; DB* db = nullptr; Status s; - const ImmutableCFOptions ioptions(opts); + const ImmutableOptions ioptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); std::unique_ptr file_writer; if (!through_db) { - std::unique_ptr file; - env->NewWritableFile(file_name, &file, env_options); + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), file_name, + FileOptions(env_options), &file_writer, + nullptr)); - std::vector > - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; - file_writer.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), file_name, env_options)); int unknown_level = -1; tb = opts.table_factory->NewTableBuilder( TableBuilderOptions( ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, unknown_level), - 0 /* column_family_id */, file_writer.get()); + CompressionType::kNoCompression, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level), + file_writer.get()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -130,20 +129,22 @@ std::unique_ptr table_reader; if (!through_db) { - std::unique_ptr raf; - s = env->NewRandomAccessFile(file_name, &raf, env_options); + const auto& fs = env->GetFileSystem(); + FileOptions fopts(env_options); + + std::unique_ptr raf; + s = fs->NewRandomAccessFile(file_name, fopts, &raf, nullptr); if (!s.ok()) { fprintf(stderr, 
"Create File Error: %s\n", s.ToString().c_str()); exit(1); } uint64_t file_size; - env->GetFileSize(file_name, &file_size); + fs->GetFileSize(file_name, fopts.io_options, &file_size, nullptr); std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(raf), - file_name)); + new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), - env_options, ikc), + TableReaderOptions(ioptions, moptions.prefix_extractor, env_options, + ikc), std::move(file_reader), file_size, &table_reader); if (!s.ok()) { fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str()); @@ -168,21 +169,21 @@ if (!for_iterator) { // Query one existing key; std::string key = MakeKey(r1, r2, through_db); - uint64_t start_time = Now(env, measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); if (!through_db) { PinnableSlice value; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - GetContext get_context(ioptions.user_comparator, - ioptions.merge_operator, ioptions.info_log, - ioptions.statistics, GetContext::kNotFound, - Slice(key), &value, nullptr, &merge_context, - true, &max_covering_tombstone_seq, env); + GetContext get_context( + ioptions.user_comparator, ioptions.merge_operator.get(), + ioptions.logger, ioptions.stats, GetContext::kNotFound, + Slice(key), &value, nullptr, &merge_context, true, + &max_covering_tombstone_seq, clock); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); } - hist.Add(Now(env, measured_by_nanosecond) - start_time); + hist.Add(Now(clock, measured_by_nanosecond) - start_time); } else { int r2_len; if (if_query_empty_keys) { @@ -196,7 +197,7 @@ std::string start_key = MakeKey(r1, r2, through_db); std::string end_key = MakeKey(r1, r2 + r2_len, through_db); uint64_t total_time = 0; - uint64_t start_time = Now(env, 
measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); Iterator* iter = nullptr; InternalIterator* iiter = nullptr; if (!through_db) { @@ -214,10 +215,10 @@ break; } // verify key; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; assert(Slice(MakeKey(r1, r2 + count, through_db)) == (through_db ? iter->key() : iiter->key())); - start_time = Now(env, measured_by_nanosecond); + start_time = Now(clock, measured_by_nanosecond); if (++count >= r2_len) { break; } @@ -229,7 +230,7 @@ assert(false); } delete iter; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; hist.Add(total_time); } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_caller.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_caller.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_reader_caller.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_reader_caller.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,8 @@ #pragma once +#include "rocksdb/rocksdb_namespace.h" + namespace ROCKSDB_NAMESPACE { // A list of callers for a table reader. It is used to trace the caller that // accesses on a block. This is only used for block cache tracing and analysis. diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/table_test.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/table_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,33 +7,44 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "rocksdb/table.h" + +#include +#include #include + #include #include #include #include #include +#include #include -#include "block_fetcher.h" #include "cache/lru_cache.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" -#include "meta_blocks.h" #include "monitoring/statistics.h" +#include "options/options_helper.h" #include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/cache.h" +#include "rocksdb/compression_type.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/iterator.h" #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" +#include "rocksdb/table_properties.h" +#include "rocksdb/trace_record.h" +#include "rocksdb/unique_id.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_builder.h" @@ -41,19 +52,24 @@ #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" #include "table/block_based/flush_block_policy.h" +#include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" +#include "table/meta_blocks.h" #include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/coding_lean.h" #include "util/compression.h" #include "util/file_checksum_helper.h" #include "util/random.h" #include "util/string_util.h" +#include "utilities/memory_allocators.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -70,7 +86,7 @@ // DummyPropertiesCollector used 
to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { public: - const char* Name() const override { return ""; } + const char* Name() const override { return "DummyPropertiesCollector"; } Status Finish(UserCollectedProperties* /*properties*/) override { return Status::OK(); @@ -92,7 +108,9 @@ TablePropertiesCollectorFactory::Context /*context*/) override { return new DummyPropertiesCollector(); } - const char* Name() const override { return "DummyPropertiesCollector1"; } + const char* Name() const override { + return "DummyPropertiesCollectorFactory1"; + } }; class DummyPropertiesCollectorFactory2 @@ -102,7 +120,9 @@ TablePropertiesCollectorFactory::Context /*context*/) override { return new DummyPropertiesCollector(); } - const char* Name() const override { return "DummyPropertiesCollector2"; } + const char* Name() const override { + return "DummyPropertiesCollectorFactory2"; + } }; // Return reverse of "key". @@ -151,6 +171,9 @@ } } +const auto kUnknownColumnFamily = + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + } // namespace // Helper class for tests to unify the interface between @@ -168,12 +191,12 @@ // Finish constructing the data structure with all the keys that have // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" - void Finish(const Options& options, const ImmutableCFOptions& ioptions, + void Finish(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector* keys, stl_wrappers::KVMap* kvmap) { - last_internal_key_ = &internal_comparator; + last_internal_comparator_ = &internal_comparator; *kvmap = data_; keys->clear(); for (const auto& kv : data_) { @@ -187,7 +210,7 @@ // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, @@ -205,52 +228,12 @@ virtual bool AnywayDeleteIterator() const { return false; } protected: - const InternalKeyComparator* last_internal_key_; + const InternalKeyComparator* last_internal_comparator_; private: stl_wrappers::KVMap data_; }; -class BlockConstructor: public Constructor { - public: - explicit BlockConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp), - block_(nullptr) { } - ~BlockConstructor() override { delete block_; } - Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, - const MutableCFOptions& /*moptions*/, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& /*internal_comparator*/, - const stl_wrappers::KVMap& kv_map) override { - delete block_; - block_ = nullptr; - BlockBuilder builder(table_options.block_restart_interval); - - for (const auto kv : kv_map) { - builder.Add(kv.first, kv.second); - } - // Open the block - data_ = builder.Finish().ToString(); - BlockContents contents; - contents.data = data_; - block_ = new Block(std::move(contents), 
kDisableGlobalSequenceNumber); - return Status::OK(); - } - InternalIterator* NewIterator( - const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewDataIterator(comparator_, comparator_); - } - - private: - const Comparator* comparator_; - std::string data_; - Block* block_; - - BlockConstructor(); -}; - // A helper class that converts internal format keys into user keys class KeyConvertingIterator : public InternalIterator { public: @@ -281,14 +264,18 @@ void SeekToLast() override { iter_->SeekToLast(); } void Next() override { iter_->Next(); } void Prev() override { iter_->Prev(); } - bool IsOutOfBound() override { return iter_->IsOutOfBound(); } + IterBoundCheck UpperBoundCheckResult() override { + return iter_->UpperBoundCheckResult(); + } Slice key() const override { assert(Valid()); ParsedInternalKey parsed_key; - if (!ParseInternalKey(iter_->key(), &parsed_key)) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); + Status pik_status = + ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + status_ = pik_status; + return Slice(status_.getState()); } return parsed_key.user_key; } @@ -308,7 +295,56 @@ void operator=(const KeyConvertingIterator&); }; -class TableConstructor: public Constructor { +// `BlockConstructor` APIs always accept/return user keys. 
+class BlockConstructor : public Constructor { + public: + explicit BlockConstructor(const Comparator* cmp) + : Constructor(cmp), comparator_(cmp), block_(nullptr) {} + ~BlockConstructor() override { delete block_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { + delete block_; + block_ = nullptr; + BlockBuilder builder(table_options.block_restart_interval); + + for (const auto& kv : kv_map) { + // `DataBlockIter` assumes it reads only internal keys. `BlockConstructor` + // clients provide user keys, so we need to convert to internal key format + // before writing the data block. + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + builder.Add(encoded, kv.second); + } + // Open the block + data_ = builder.Finish().ToString(); + BlockContents contents; + contents.data = data_; + block_ = new Block(std::move(contents)); + return Status::OK(); + } + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + // `DataBlockIter` returns the internal keys it reads. + // `KeyConvertingIterator` converts them to user keys before they are + // exposed to the `BlockConstructor` clients. 
+ return new KeyConvertingIterator( + block_->NewDataIterator(comparator_, kDisableGlobalSequenceNumber)); + } + + private: + const Comparator* comparator_; + std::string data_; + Block* block_; + + BlockConstructor(); +}; + +class TableConstructor : public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, @@ -321,18 +357,18 @@ } ~TableConstructor() override { Reset(); } - Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& internal_comparator, const stl_wrappers::KVMap& kv_map) override { Reset(); soptions.use_mmap_reads = ioptions.allow_mmap_reads; - file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), - "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + file_writer_.reset(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (largest_seqno_ != 0) { // Pretend that it's an external file written by SstFileWriter. 
@@ -345,13 +381,11 @@ builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level_), file_writer_.get())); - for (const auto kv : kv_map) { + for (const auto& kv : kv_map) { if (convert_to_internal_key_) { ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); std::string encoded; @@ -360,34 +394,25 @@ } else { builder->Add(kv.first, kv.second); } - EXPECT_TRUE(builder->status().ok()); + EXPECT_OK(builder->status()); } Status s = builder->Finish(); - file_writer_->Flush(); + EXPECT_OK(file_writer_->Flush()); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); - const bool kSkipFilters = true; - const bool kImmortal = true; - return ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, - internal_comparator, !kSkipFilters, !kImmortal, - level_, largest_seqno_, &block_cache_tracer_), - std::move(file_reader_), TEST_GetSink()->contents().size(), - &table_reader_); + + return Reopen(ioptions, moptions); } InternalIterator* NewIterator( const SliceTransform* prefix_extractor) const override { - ReadOptions ro; InternalIterator* iter = table_reader_->NewIterator( - ro, prefix_extractor, /*arena=*/nullptr, /*skip_filters=*/false, - TableReaderCaller::kUncategorized); + read_options_, prefix_extractor, /*arena=*/nullptr, + /*skip_filters=*/false, 
TableReaderCaller::kUncategorized); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -406,13 +431,18 @@ key, TableReaderCaller::kUncategorized); } - virtual Status Reopen(const ImmutableCFOptions& ioptions, + virtual Status Reopen(const ImmutableOptions& ioptions, const MutableCFOptions& moptions) { - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + std::unique_ptr source(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)); + + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); return ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, - *last_internal_key_), + TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, + *last_internal_comparator_, /*skip_filters*/ false, + /*immortal*/ false, false, level_, largest_seqno_, + &block_cache_tracer_, moptions.write_buffer_size, "", + uniq_id_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -428,8 +458,7 @@ bool ConvertToInternalKey() { return convert_to_internal_key_; } test::StringSink* TEST_GetSink() { - return ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); + return static_cast(file_writer_->writable_file()); } BlockCacheTracer block_cache_tracer_; @@ -442,6 +471,7 @@ file_reader_.reset(); } + const ReadOptions read_options_; uint64_t uniq_id_; std::unique_ptr file_writer_; std::unique_ptr file_reader_; @@ -466,27 +496,31 @@ write_buffer_manager_(wb), table_factory_(new SkipListFactory) { options_.memtable_factory = table_factory_; - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); memtable_ = new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_), wb, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); } ~MemTableConstructor() 
override { delete memtable_->Unref(); } - Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options&, const ImmutableOptions& ioptions, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, const stl_wrappers::KVMap& kv_map) override { delete memtable_->Unref(); - ImmutableCFOptions mem_ioptions(ioptions); + ImmutableOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, MutableCFOptions(options_), write_buffer_manager_, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); int seq = 1; - for (const auto kv : kv_map) { - memtable_->Add(seq, kTypeValue, kv.first, kv.second); + for (const auto& kv : kv_map) { + Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second, + nullptr /* kv_prot_info */); + if (!s.ok()) { + return s; + } seq++; } return Status::OK(); @@ -538,7 +572,7 @@ } ~DBConstructor() override { delete db_; } Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, + const ImmutableOptions& /*ioptions*/, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, @@ -546,9 +580,9 @@ delete db_; db_ = nullptr; NewDB(); - for (const auto kv : kv_map) { + for (const auto& kv : kv_map) { WriteBatch batch; - batch.Put(kv.first, kv.second); + EXPECT_OK(batch.Put(kv.first, kv.second)); EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok()); } return Status::OK(); @@ -598,10 +632,22 @@ bool reverse_compare; int restart_interval; CompressionType compression; + uint32_t compression_parallel_threads; uint32_t format_version; bool use_mmap; }; +std::ostream& operator<<(std::ostream& os, const TestArgs& args) { + os << "type: " << args.type << " reverse_compare: " << args.reverse_compare + << " restart_interval: " << args.restart_interval + << " compression: " << 
args.compression + << " compression_parallel_threads: " << args.compression_parallel_threads + << " format_version: " << args.format_version + << " use_mmap: " << args.use_mmap; + + return os; +} + static std::vector GenerateArgList() { std::vector test_args; std::vector test_types = { @@ -615,6 +661,7 @@ MEMTABLE_TEST, DB_TEST}; std::vector reverse_compare_types = {false, true}; std::vector restart_intervals = {16, 1, 1024}; + std::vector compression_parallel_threads = {1, 4}; // Only add compression if it is supported std::vector> compression_types; @@ -657,6 +704,8 @@ one_arg.reverse_compare = reverse_compare; one_arg.restart_interval = restart_intervals[0]; one_arg.compression = compression_types[0].first; + one_arg.compression_parallel_threads = 1; + one_arg.format_version = 0; one_arg.use_mmap = true; test_args.push_back(one_arg); one_arg.use_mmap = false; @@ -667,14 +716,17 @@ for (auto restart_interval : restart_intervals) { for (auto compression_type : compression_types) { - TestArgs one_arg; - one_arg.type = test_type; - one_arg.reverse_compare = reverse_compare; - one_arg.restart_interval = restart_interval; - one_arg.compression = compression_type.first; - one_arg.format_version = compression_type.second ? 2 : 1; - one_arg.use_mmap = false; - test_args.push_back(one_arg); + for (auto num_threads : compression_parallel_threads) { + TestArgs one_arg; + one_arg.type = test_type; + one_arg.reverse_compare = reverse_compare; + one_arg.restart_interval = restart_interval; + one_arg.compression = compression_type.first; + one_arg.compression_parallel_threads = num_threads; + one_arg.format_version = compression_type.second ? 
2 : 1; + one_arg.use_mmap = false; + test_args.push_back(one_arg); + } } } } @@ -715,41 +767,38 @@ class HarnessTest : public testing::Test { public: - HarnessTest() - : ioptions_(options_), + explicit HarnessTest(const TestArgs& args) + : args_(args), + ioptions_(options_), moptions_(options_), - constructor_(nullptr), - write_buffer_(options_.db_write_buffer_size) {} - - void Init(const TestArgs& args) { - delete constructor_; - constructor_ = nullptr; - options_ = Options(); - options_.compression = args.compression; + write_buffer_(options_.db_write_buffer_size), + support_prev_(true), + only_support_prefix_seek_(false) { + options_.compression = args_.compression; + options_.compression_opts.parallel_threads = + args_.compression_parallel_threads; // Use shorter block size for tests to exercise block boundary // conditions more. - if (args.reverse_compare) { + if (args_.reverse_compare) { options_.comparator = &reverse_key_comparator; } internal_comparator_.reset( new test::PlainInternalKeyComparator(options_.comparator)); - support_prev_ = true; - only_support_prefix_seek_ = false; - options_.allow_mmap_reads = args.use_mmap; - switch (args.type) { + options_.allow_mmap_reads = args_.use_mmap; + switch (args_.type) { case BLOCK_BASED_TABLE_TEST: table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); table_options_.block_size = 256; - table_options_.block_restart_interval = args.restart_interval; - table_options_.index_block_restart_interval = args.restart_interval; - table_options_.format_version = args.format_version; + table_options_.block_restart_interval = args_.restart_interval; + table_options_.index_block_restart_interval = args_.restart_interval; + table_options_.format_version = args_.format_version; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + 
options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -760,8 +809,8 @@ only_support_prefix_seek_ = true; options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2)); options_.table_factory.reset(NewPlainTableFactory()); - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -770,8 +819,8 @@ only_support_prefix_seek_ = true; options_.prefix_extractor.reset(NewNoopTransform()); options_.table_factory.reset(NewPlainTableFactory()); - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -789,8 +838,8 @@ options_.table_factory.reset( NewPlainTableFactory(plain_table_options)); } - constructor_ = new TableConstructor( - options_.comparator, true /* convert_to_internal_key_ */); + constructor_.reset(new TableConstructor( + options_.comparator, true /* convert_to_internal_key_ */)); internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; @@ -799,28 +848,26 @@ table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new BlockConstructor(options_.comparator); + constructor_.reset(new BlockConstructor(options_.comparator)); break; case MEMTABLE_TEST: table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new MemTableConstructor(options_.comparator, - &write_buffer_); + constructor_.reset( + new MemTableConstructor(options_.comparator, 
&write_buffer_)); break; case DB_TEST: table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new DBConstructor(options_.comparator); + constructor_.reset(new DBConstructor(options_.comparator)); break; } - ioptions_ = ImmutableCFOptions(options_); + ioptions_ = ImmutableOptions(options_); moptions_ = MutableCFOptions(options_); } - ~HarnessTest() override { delete constructor_; } - void Add(const std::string& key, const std::string& value) { constructor_->Add(key, value); } @@ -843,12 +890,15 @@ InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); iter->SeekToFirst(); + ASSERT_OK(iter->status()); for (stl_wrappers::KVMap::const_iterator model_iter = data.begin(); model_iter != data.end(); ++model_iter) { ASSERT_EQ(ToString(data, model_iter), ToString(iter)); iter->Next(); + ASSERT_OK(iter->status()); } ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { iter->~InternalIterator(); } else { @@ -861,12 +911,15 @@ InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); iter->SeekToLast(); + ASSERT_OK(iter->status()); for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin(); model_iter != data.rend(); ++model_iter) { ASSERT_EQ(ToString(data, model_iter), ToString(iter)); iter->Prev(); + ASSERT_OK(iter->status()); } ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { iter->~InternalIterator(); } else { @@ -888,6 +941,7 @@ if (iter->Valid()) { if (kVerbose) fprintf(stderr, "Next\n"); iter->Next(); + ASSERT_OK(iter->status()); ++model_iter; ASSERT_EQ(ToString(data, model_iter), ToString(iter)); } @@ -897,6 +951,7 @@ case 1: { if (kVerbose) fprintf(stderr, "SeekToFirst\n"); iter->SeekToFirst(); + ASSERT_OK(iter->status()); model_iter = data.begin(); 
ASSERT_EQ(ToString(data, model_iter), ToString(iter)); break; @@ -908,6 +963,7 @@ if (kVerbose) fprintf(stderr, "Seek '%s'\n", EscapeString(key).c_str()); iter->Seek(Slice(key)); + ASSERT_OK(iter->status()); ASSERT_EQ(ToString(data, model_iter), ToString(iter)); break; } @@ -916,6 +972,7 @@ if (iter->Valid()) { if (kVerbose) fprintf(stderr, "Prev\n"); iter->Prev(); + ASSERT_OK(iter->status()); if (model_iter == data.begin()) { model_iter = data.end(); // Wrap around to invalid value } else { @@ -929,6 +986,7 @@ case 4: { if (kVerbose) fprintf(stderr, "SeekToLast\n"); iter->SeekToLast(); + ASSERT_OK(iter->status()); if (keys.empty()) { model_iter = data.end(); } else { @@ -1006,40 +1064,37 @@ // Returns nullptr if not running against a DB DB* db() const { return constructor_->db(); } - void RandomizedHarnessTest(size_t part, size_t total) { - std::vector args = GenerateArgList(); - assert(part); - assert(part <= total); - for (size_t i = 0; i < args.size(); i++) { - if ((i % total) + 1 != part) { - continue; - } - Init(args[i]); - Random rnd(test::RandomSeed() + 5); - for (int num_entries = 0; num_entries < 2000; - num_entries += (num_entries < 50 ? 
1 : 200)) { - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - } - } - } - private: - Options options_ = Options(); - ImmutableCFOptions ioptions_; + TestArgs args_; + Options options_; + ImmutableOptions ioptions_; MutableCFOptions moptions_; - BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); - Constructor* constructor_; + BlockBasedTableOptions table_options_; + std::unique_ptr constructor_; WriteBufferManager write_buffer_; bool support_prev_; bool only_support_prefix_seek_; std::shared_ptr internal_comparator_; }; +class ParameterizedHarnessTest : public HarnessTest, + public testing::WithParamInterface { + public: + ParameterizedHarnessTest() : HarnessTest(GetParam()) {} +}; + +INSTANTIATE_TEST_CASE_P(TableTest, ParameterizedHarnessTest, + ::testing::ValuesIn(GenerateArgList())); + +class DBHarnessTest : public HarnessTest { + public: + DBHarnessTest() + : HarnessTest(TestArgs{DB_TEST, /* reverse_compare */ false, + /* restart_interval */ 16, kNoCompression, + /* compression_parallel_threads */ 1, + /* format_version */ 0, /* use_mmap */ false}) {} +}; + static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { @@ -1091,7 +1146,11 @@ std::unique_ptr trace_writer; EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_, &trace_writer)); - c->block_cache_tracer_.StartTrace(env_, trace_opt, std::move(trace_writer)); + // Always return Status::OK(). 
+ assert(c->block_cache_tracer_ + .StartTrace(env_->GetSystemClock().get(), trace_opt, + std::move(trace_writer)) + .ok()); { std::string user_key = "k01"; InternalKey internal_key(user_key, 0, kTypeValue); @@ -1111,51 +1170,53 @@ const std::vector& expected_records) { c->block_cache_tracer_.EndTrace(); - std::unique_ptr trace_reader; - Status s = - NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); - EXPECT_OK(s); - BlockCacheTraceReader reader(std::move(trace_reader)); - BlockCacheTraceHeader header; - EXPECT_OK(reader.ReadHeader(&header)); - uint32_t index = 0; - while (s.ok()) { - BlockCacheTraceRecord access; - s = reader.ReadAccess(&access); - if (!s.ok()) { - break; - } - ASSERT_LT(index, expected_records.size()); - EXPECT_NE("", access.block_key); - EXPECT_EQ(access.block_type, expected_records[index].block_type); - EXPECT_GT(access.block_size, 0); - EXPECT_EQ(access.caller, expected_records[index].caller); - EXPECT_EQ(access.no_insert, expected_records[index].no_insert); - EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); - // Get - if (access.caller == TableReaderCaller::kUserGet) { - EXPECT_EQ(access.referenced_key, - expected_records[index].referenced_key); - EXPECT_EQ(access.get_id, expected_records[index].get_id); - EXPECT_EQ(access.get_from_user_specified_snapshot, - expected_records[index].get_from_user_specified_snapshot); - if (access.block_type == TraceType::kBlockTraceDataBlock) { - EXPECT_GT(access.referenced_data_size, 0); - EXPECT_GT(access.num_keys_in_block, 0); - EXPECT_EQ(access.referenced_key_exist_in_block, - expected_records[index].referenced_key_exist_in_block); + { + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + EXPECT_OK(s); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + EXPECT_OK(reader.ReadHeader(&header)); + uint32_t index = 0; + while (s.ok()) { + BlockCacheTraceRecord 
access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + break; } - } else { - EXPECT_EQ(access.referenced_key, ""); - EXPECT_EQ(access.get_id, 0); - EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); - EXPECT_EQ(access.referenced_data_size, 0); - EXPECT_EQ(access.num_keys_in_block, 0); - EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + ASSERT_LT(index, expected_records.size()); + EXPECT_NE("", access.block_key); + EXPECT_EQ(access.block_type, expected_records[index].block_type); + EXPECT_GT(access.block_size, 0); + EXPECT_EQ(access.caller, expected_records[index].caller); + EXPECT_EQ(access.no_insert, expected_records[index].no_insert); + EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); + // Get + if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.referenced_key, + expected_records[index].referenced_key); + EXPECT_EQ(access.get_id, expected_records[index].get_id); + EXPECT_EQ(access.get_from_user_specified_snapshot, + expected_records[index].get_from_user_specified_snapshot); + if (access.block_type == TraceType::kBlockTraceDataBlock) { + EXPECT_GT(access.referenced_data_size, 0); + EXPECT_GT(access.num_keys_in_block, 0); + EXPECT_EQ(access.referenced_key_exist_in_block, + expected_records[index].referenced_key_exist_in_block); + } + } else { + EXPECT_EQ(access.referenced_key, ""); + EXPECT_EQ(access.get_id, 0); + EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); + EXPECT_EQ(access.referenced_data_size, 0); + EXPECT_EQ(access.num_keys_in_block, 0); + EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + } + index++; } - index++; + EXPECT_EQ(index, expected_records.size()); } - EXPECT_EQ(index, expected_records.size()); EXPECT_OK(env_->DeleteFile(trace_file_path_)); EXPECT_OK(env_->DeleteDir(test_path_)); } @@ -1178,17 +1239,21 @@ public: FileChecksumTestHelper(bool convert_to_internal_key = false) : 
convert_to_internal_key_(convert_to_internal_key) { - sink_ = new test::StringSink(); } ~FileChecksumTestHelper() {} void CreateWriteableFile() { - file_writer_.reset(test::GetWritableFileWriter(sink_, "" /* don't care */)); + sink_ = new test::StringSink(); + std::unique_ptr holder(sink_); + file_writer_.reset(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); } - void SetFileChecksumFunc(FileChecksumFunc* checksum_func) { + void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) { if (file_writer_ != nullptr) { - file_writer_->TEST_SetFileChecksumFunc(checksum_func); + file_writer_->TEST_SetFileChecksumGenerator(checksum_generator); + } else { + delete checksum_generator; } } @@ -1203,14 +1268,13 @@ void AddKVtoKVMap(int num_entries) { Random rnd(test::RandomSeed()); for (int i = 0; i < num_entries; i++) { - std::string v; - test::RandomString(&rnd, 100, &v); + std::string v = rnd.RandomString(100); kv_map_[test::RandomKey(&rnd, 20)] = v; } } Status WriteKVAndFlushTable() { - for (const auto kv : kv_map_) { + for (const auto& kv : kv_map_) { if (convert_to_internal_key_) { ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); std::string encoded; @@ -1222,54 +1286,53 @@ EXPECT_TRUE(table_builder_->status().ok()); } Status s = table_builder_->Finish(); - file_writer_->Flush(); - EXPECT_TRUE(s.ok()); + EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(s); EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize()); return s; } - std::string GetFileChecksum() { return table_builder_->GetFileChecksum(); } + std::string GetFileChecksum() { + EXPECT_OK(file_writer_->Close()); + return table_builder_->GetFileChecksum(); + } const char* GetFileChecksumFuncName() { return table_builder_->GetFileChecksumFuncName(); } - Status CalculateFileChecksum(FileChecksumFunc* file_checksum_func, + Status CalculateFileChecksum(FileChecksumGenerator* file_checksum_generator, std::string* checksum) { - 
assert(file_checksum_func != nullptr); + assert(file_checksum_generator != nullptr); cur_uniq_id_ = checksum_uniq_id_++; test::StringSink* ss_rw = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); - file_reader_.reset(test::GetRandomAccessFileReader( - new test::StringSource(ss_rw->contents()))); + static_cast(file_writer_->writable_file()); + std::unique_ptr source( + new test::StringSource(ss_rw->contents())); + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); + std::unique_ptr scratch(new char[2048]); Slice result; uint64_t offset = 0; - std::string tmp_checksum; - bool first_read = true; Status s; - s = file_reader_->Read(offset, 2048, &result, scratch.get(), false); + s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), + nullptr, false); if (!s.ok()) { return s; } while (result.size() != 0) { - if (first_read) { - first_read = false; - tmp_checksum = file_checksum_func->Value(scratch.get(), result.size()); - } else { - tmp_checksum = file_checksum_func->Extend(tmp_checksum, scratch.get(), - result.size()); - } + file_checksum_generator->Update(scratch.get(), result.size()); offset += static_cast(result.size()); - s = file_reader_->Read(offset, 2048, &result, scratch.get(), false); + s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), + nullptr, false); if (!s.ok()) { return s; } } EXPECT_EQ(offset, static_cast(table_builder_->FileSize())); - *checksum = tmp_checksum; + file_checksum_generator->Finalize(); + *checksum = file_checksum_generator->GetChecksum(); return Status::OK(); } @@ -1280,17 +1343,15 @@ std::unique_ptr file_reader_; std::unique_ptr table_builder_; stl_wrappers::KVMap kv_map_; - test::StringSink* sink_; + test::StringSink* sink_ = nullptr; static uint64_t checksum_uniq_id_; }; uint64_t FileChecksumTestHelper::checksum_uniq_id_ = 1; -INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest, - testing::Values(test::kDefaultFormatVersion)); 
-INSTANTIATE_TEST_CASE_P(FormatLatest, BlockBasedTableTest, - testing::Values(test::kLatestFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest, + testing::ValuesIn(test::kFooterFormatVersionsToTest)); // This test serves as the living tutorial for the prefix scan of user collected // properties. @@ -1306,7 +1367,7 @@ {"num.555.3", "3"}, }; // prefixes that exist - for (const std::string& prefix : {"num.111", "num.333", "num.555"}) { + for (const std::string prefix : {"num.111", "num.333", "num.555"}) { int num = 0; for (auto pos = props.lower_bound(prefix); pos != props.end() && @@ -1321,7 +1382,7 @@ } // prefixes that don't exist - for (const std::string& prefix : + for (const std::string prefix : {"num.000", "num.222", "num.444", "num.666"}) { auto pos = props.lower_bound(prefix); ASSERT_TRUE(pos == props.end() || @@ -1329,6 +1390,257 @@ } } +namespace { +struct TestIds { + UniqueId64x3 internal_id; + UniqueId64x3 external_id; +}; + +inline bool operator==(const TestIds& lhs, const TestIds& rhs) { + return lhs.internal_id == rhs.internal_id && + lhs.external_id == rhs.external_id; +} + +std::ostream& operator<<(std::ostream& os, const TestIds& ids) { + return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x" + << ids.internal_id[1] << "U, 0x" << ids.internal_id[2] + << "U }}, {{ 0x" << ids.external_id[0] << "U, 0x" + << ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}"; +} + +TestIds GetUniqueId(TableProperties* tp, std::unordered_set* seen, + const std::string& db_id, const std::string& db_session_id, + uint64_t file_number) { + // First test session id logic + if (db_session_id.size() == 20) { + uint64_t upper; + uint64_t lower; + EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower)); + EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id); + } + + // Get external using public API + tp->db_id = db_id; + tp->db_session_id = db_session_id; + tp->orig_file_number = file_number; + TestIds t; + { + std::string 
uid; + EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid)); + EXPECT_EQ(uid.size(), 24U); + t.external_id[0] = DecodeFixed64(&uid[0]); + t.external_id[1] = DecodeFixed64(&uid[8]); + t.external_id[2] = DecodeFixed64(&uid[16]); + } + // All these should be effectively random + EXPECT_TRUE(seen->insert(t.external_id[0]).second); + EXPECT_TRUE(seen->insert(t.external_id[1]).second); + EXPECT_TRUE(seen->insert(t.external_id[2]).second); + + // Get internal with internal API + EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number, + &t.internal_id)); + + // Verify relationship + UniqueId64x3 tmp = t.internal_id; + InternalUniqueIdToExternal(&tmp); + EXPECT_EQ(tmp, t.external_id); + ExternalUniqueIdToInternal(&tmp); + EXPECT_EQ(tmp, t.internal_id); + return t; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TableProperties tp; + TEST_SetRandomTableProperties(&tp); + + // DB id is normally RFC-4122 + const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + // Allow other forms of DB id + const std::string db_id2 = "1728000184588763620"; + const std::string db_id3 = "x"; + + // DB session id is normally 20 chars in base-36, but 13 to 24 chars + // is ok, roughly 64 to 128 bits. + const std::string ses_id1 = "ABCDEFGHIJ0123456789"; + // Same trailing 13 digits + const std::string ses_id2 = "HIJ0123456789"; + const std::string ses_id3 = "0123ABCDEFGHIJ0123456789"; + // Different trailing 12 digits + const std::string ses_id4 = "ABCDEFGH888888888888"; + // And change length + const std::string ses_id5 = "ABCDEFGHIJ012"; + const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD"; + + using T = TestIds; + std::unordered_set seen; + // Establish a stable schema for the unique IDs. These values must not + // change for existing table files. + // (Note: parens needed for macro parsing, extra braces needed for some + // compilers.) 
+ EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}})); + // Only change internal_id[1] with file number + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 2), + T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}}, + {{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789), + T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}}, + {{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}})); + // Change internal_id[1] and internal_id[2] with db_id + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id2, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}}, + {{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id3, ses_id1, 1), + T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 0x8b931b9cdebd9e8U}}, + {{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}})); + // Keeping same last 13 digits of ses_id keeps same internal_id[0] + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id2, 1), + T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}}, + {{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id3, 1), + T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}}, + {{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}})); + // Changing last 12 digits of ses_id only changes internal_id[0] + // (vs. db_id1, ses_id1, 1) + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id4, 1), + T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}}, + {{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}})); + // ses_id can change everything. 
+ EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id5, 1), + T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}}, + {{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}})); + EXPECT_EQ( + GetUniqueId(&tp, &seen, db_id1, ses_id6, 1), + T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}}, + {{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}})); + + // Now verify more thoroughly that any small change in inputs completely + // changes external unique id. + // (Relying on 'seen' checks etc. in GetUniqueId) + std::string db_id = "00000000-0000-0000-0000-000000000000"; + std::string ses_id = "000000000000000000000000"; + uint64_t file_num = 1; + // change db_id + for (size_t i = 0; i < db_id.size(); ++i) { + if (db_id[i] == '-') { + continue; + } + for (char alt : std::string("123456789abcdef")) { + db_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + db_id[i] = '0'; + } + // change ses_id + for (size_t i = 0; i < ses_id.size(); ++i) { + for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) { + ses_id[i] = alt; + GetUniqueId(&tp, &seen, db_id, ses_id, file_num); + } + ses_id[i] = '0'; + } + // change file_num + for (int i = 1; i < 64; ++i) { + GetUniqueId(&tp, &seen, db_id, ses_id, file_num << i); + } + + // Verify that "all zeros" in first 128 bits is equivalent for internal and + // external IDs. This way, as long as we avoid "all zeros" in internal IDs, + // we avoid it in external IDs. 
+ { + UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}}; + UniqueId64x3 id2 = id1; + InternalUniqueIdToExternal(&id1); + EXPECT_EQ(id1, id2); + ExternalUniqueIdToInternal(&id2); + EXPECT_EQ(id1, id2); + } +} + +namespace { +void SetGoodTableProperties(TableProperties* tp) { + // To ensure the computation only depends on the expected entries, we set + // the rest randomly + TEST_SetRandomTableProperties(tp); + tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d"; + tp->db_session_id = "ABCDEFGHIJ0123456789"; + tp->orig_file_number = 1; +} +} // namespace + +TEST_F(TablePropertyTest, UniqueIdHumanStrings) { + TableProperties tp; + SetGoodTableProperties(&tp); + + std::string tmp; + EXPECT_OK(GetUniqueIdFromTableProperties(tp, &tmp)); + EXPECT_EQ(tmp, + (std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23', + '\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3', + '\x03', '\x93', '\x08', '\xca', '\x17', '\x28', + '\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}})); + EXPECT_EQ(UniqueIdToHumanString(tmp), + "6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B"); + + // including zero padding + tmp = std::string(24U, '\0'); + tmp[15] = '\x12'; + tmp[23] = '\xAB'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000000000-0000000000000012-00000000000000AB"); + + // And shortened + tmp = std::string(20U, '\0'); + tmp[5] = '\x12'; + tmp[10] = '\xAB'; + tmp[17] = '\xEF'; + EXPECT_EQ(UniqueIdToHumanString(tmp), + "0000000000120000-0000AB0000000000-00EF0000"); + + tmp.resize(16); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB0000000000"); + + tmp.resize(11); + EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB"); + + tmp.resize(6); + EXPECT_EQ(UniqueIdToHumanString(tmp), "000000000012"); +} + +TEST_F(TablePropertyTest, UniqueIdsFailure) { + TableProperties tp; + std::string tmp; + + // Missing DB id + SetGoodTableProperties(&tp); + tp.db_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing 
session id + SetGoodTableProperties(&tp); + tp.db_session_id = ""; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); + + // Missing file number + SetGoodTableProperties(&tp); + tp.orig_file_number = 0; + EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported()); +} + // This test include all the basic checks except those for index size and block // size, which will be conducted in separated unit tests. TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { @@ -1355,9 +1667,8 @@ table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); @@ -1379,7 +1690,8 @@ block_builder.Add(item.first, item.second); } Slice content = block_builder.Finish(); - ASSERT_EQ(content.size() + kBlockTrailerSize + diff_internal_user_bytes, + ASSERT_EQ(content.size() + BlockBasedTable::kBlockTrailerSize + + diff_internal_user_bytes, props.data_size); c.ResetTableReader(); } @@ -1404,9 +1716,8 @@ table_options.enable_index_compression = compressed; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); c.ResetTableReader(); @@ -1431,7 +1742,7 @@ BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions 
ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1465,7 +1776,7 @@ options.table_properties_collector_factories.emplace_back( new DummyPropertiesCollectorFactory2()); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1475,8 +1786,9 @@ ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name); ASSERT_EQ("UInt64AddOperator", props.merge_operator_name); ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name); - ASSERT_EQ("[DummyPropertiesCollector1,DummyPropertiesCollector2]", - props.property_collectors_names); + ASSERT_EQ( + "[DummyPropertiesCollectorFactory1,DummyPropertiesCollectorFactory2]", + props.property_collectors_names); ASSERT_EQ("", props.filter_policy_name); // no filter policy is used c.ResetTableReader(); } @@ -1508,7 +1820,7 @@ table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); std::unique_ptr internal_cmp( new InternalKeyComparator(options.comparator)); @@ -1529,7 +1841,8 @@ for (size_t i = 0; i < expected_tombstones.size(); i++) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey parsed_key; - ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key)); + ASSERT_OK( + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */)); RangeTombstone t(parsed_key, iter->value()); const auto& expected_t = expected_tombstones[i]; ASSERT_EQ(t.start_key_, expected_t.start_key_); @@ -1551,7 +1864,7 @@ Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions 
ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1595,7 +1908,7 @@ // reset the cache and reopen the table table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); - const ImmutableCFOptions ioptions2(*opt); + const ImmutableOptions ioptions2(*opt); const MutableCFOptions moptions(*opt); ASSERT_OK(c->Reopen(ioptions2, moptions)); @@ -1653,7 +1966,7 @@ c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); c.ResetTableReader(); @@ -1754,7 +2067,7 @@ c.Add("cccc2", std::string('a', 56)); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1813,7 +2126,7 @@ c.Add(key.Encode().ToString(), "b"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -1851,19 +2164,20 @@ c.Add(key.Encode().ToString(), "test"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, 
internal_comparator, &keys, &kvmap); // TODO(Zhongyi): update test to use MutableCFOptions options.prefix_extractor.reset(NewFixedPrefixTransform(9)); - const ImmutableCFOptions new_ioptions(options); + const ImmutableOptions new_ioptions(options); const MutableCFOptions new_moptions(options); - c.Reopen(new_ioptions, new_moptions); + ASSERT_OK(c.Reopen(new_ioptions, new_moptions)); auto reader = c.GetTableReader(); + ReadOptions read_options; std::unique_ptr db_iter(reader->NewIterator( - ReadOptions(), new_moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, new_moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup @@ -1877,16 +2191,156 @@ } } -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; +TEST_P(BlockBasedTableTest, BadChecksumType) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + + Options options; + options.comparator = BytewiseComparator(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(options.comparator); + InternalKey key("abc", 1, kTypeValue); + c.Add(key.Encode().ToString(), "test"); + std::vector keys; + stl_wrappers::KVMap kvmap; + const ImmutableOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + // Corrupt checksum type (123 is invalid) + auto& sink = *c.TEST_GetSink(); + size_t len = sink.contents_.size(); + ASSERT_EQ(sink.contents_[len - Footer::kNewVersionsEncodedLength], kCRC32c); + sink.contents_[len - Footer::kNewVersionsEncodedLength] = char{123}; + + // (Re-)Open table file with bad checksum type + const ImmutableOptions new_ioptions(options); + const MutableCFOptions new_moptions(options); + Status s = 
c.Reopen(new_ioptions, new_moptions); + ASSERT_NOK(s); + ASSERT_EQ(s.ToString(), + "Corruption: Corrupt or unsupported checksum type: 123"); +} + +namespace { +std::string ChecksumAsString(const std::string& data, + ChecksumType checksum_type) { + uint32_t v = ComputeBuiltinChecksum(checksum_type, data.data(), data.size()); + + // Verify consistency with other function + if (data.size() >= 1) { + EXPECT_EQ(v, ComputeBuiltinChecksumWithLastByte( + checksum_type, data.data(), data.size() - 1, data.back())); + } + // Little endian as in file + std::array raw_bytes; + EncodeFixed32(raw_bytes.data(), v); + return Slice(raw_bytes.data(), raw_bytes.size()).ToString(/*hex*/ true); +} + +std::string ChecksumAsString(std::string* data, char new_last_byte, + ChecksumType checksum_type) { + data->back() = new_last_byte; + return ChecksumAsString(*data, checksum_type); +} +} // namespace + +// Make sure that checksum values don't change in later versions, even if +// consistent within current version. 
+TEST_P(BlockBasedTableTest, ChecksumSchemas) { + std::string b0 = "x"; + std::string b1 = "This is a short block!x"; + std::string b2; + for (int i = 0; i < 100; ++i) { + b2.append("This is a long block!"); + } + b2.append("x"); + // Trailing 'x' will be replaced by compression type + + std::string empty; + + char ct1 = kNoCompression; + char ct2 = kSnappyCompression; + char ct3 = kZSTD; + + // Note: first byte of trailer is compression type, last 4 are checksum + + for (ChecksumType t : GetSupportedChecksums()) { + switch (t) { + case kNoChecksum: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000"); + break; + case kCRC32c: + EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63"); + break; + case kxxHash: + EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0"); + 
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338"); + break; + case kxxHash64: + EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1"); + break; + case kXXH3: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE"); + break; + default: + // Force this test to be updated on new ChecksumTypes + assert(false); + break; + } + } } void AddInternalKey(TableConstructor* c, const std::string& prefix, std::string value = "v", int /*suffix_len*/ = 800) { static Random rnd(1023); - InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); + InternalKey k(prefix + rnd.RandomString(800), 0, kTypeValue); c->Add(k.Encode().ToString(), value); } @@ -1920,7 +2374,7 @@ std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions 
ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); @@ -1930,14 +2384,16 @@ ASSERT_EQ(5u, props->num_data_blocks); // TODO(Zhongyi): update test to use MutableCFOptions + ReadOptions read_options; std::unique_ptr index_iter(reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // -- Find keys do not exist, but have common prefix. std::vector prefixes = {"001", "003", "005", "007", "009"}; - std::vector lower_bound = {keys[0], keys[1], keys[2], - keys[7], keys[9], }; + std::vector lower_bound = { + keys[0], keys[1], keys[2], keys[7], keys[9], + }; // find the lower bound of the prefix for (size_t i = 0; i < prefixes.size(); ++i) { @@ -2014,6 +2470,80 @@ ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0); } } + + { + // Test reseek case. It should impact partitioned index more. + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr index_iter2(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Things to cover in partitioned index: + // 1. Both of Seek() and SeekToLast() has optimization to prevent + // rereek leaf index block if it remains to the same one, and + // they reuse the same variable. + // 2. When Next() or Prev() is called, the block moves, so the + // optimization should kick in only with the current one. 
+ index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Prev(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Seek(InternalKey("0075", 0, kTypeValue).Encode()); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4)); + + index_iter2->Next(); + ASSERT_TRUE(index_iter2->Valid()); + index_iter2->Next(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + + 
index_iter2->SeekToLast(); + ASSERT_TRUE(index_iter2->Valid()); + ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4)); + } + c.ResetTableReader(); } @@ -2047,7 +2577,7 @@ BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2092,7 +2622,8 @@ explicit CustomFlushBlockPolicy(std::vector keys_per_block) : keys_per_block_(keys_per_block) {} - const char* Name() const override { return "table_test"; } + const char* Name() const override { return "CustomFlushBlockPolicy"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, const BlockBuilder&) const override { return new CustomFlushBlockPolicy(keys_per_block_); @@ -2133,7 +2664,7 @@ Statistics* stats = options.statistics.get(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2164,9 +2695,11 @@ auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); ASSERT_EQ(4u, props->num_data_blocks); + ReadOptions read_options; std::unique_ptr iter(reader->NewIterator( - ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, - /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized, + /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true)); // Shouldn't have read data blocks before iterator is seeked. EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); @@ -2183,6 +2716,7 @@ EXPECT_EQ(keys[2], iter->key().ToString()); EXPECT_EQ(use_first_key ? 
0 : 1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v2", iter->value().ToString()); EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2193,6 +2727,7 @@ EXPECT_EQ(keys[4], iter->key().ToString()); EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v4", iter->value().ToString()); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2208,6 +2743,7 @@ ASSERT_TRUE(iter->Valid()); EXPECT_EQ(keys[5], iter->key().ToString()); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v5", iter->value().ToString()); EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2225,6 +2761,7 @@ ASSERT_TRUE(iter->Valid()); EXPECT_EQ(keys[7], iter->key().ToString()); EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v7", iter->value().ToString()); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2246,6 +2783,7 @@ EXPECT_EQ(keys[3], iter->key().ToString()); EXPECT_EQ(use_first_key ? 1 : 2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v3", iter->value().ToString()); EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); @@ -2265,6 +2803,7 @@ stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); // All blocks are in cache now, there'll be no more misses ever. EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v1", iter->value().ToString()); // Next into the next block again. @@ -2292,6 +2831,7 @@ EXPECT_EQ(keys[4], iter->key().ToString()); EXPECT_EQ(use_first_key ? 
3 : 6, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v4", iter->value().ToString()); EXPECT_EQ(use_first_key ? 3 : 6, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2301,6 +2841,7 @@ EXPECT_EQ(keys[7], iter->key().ToString()); EXPECT_EQ(use_first_key ? 4 : 7, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("v7", iter->value().ToString()); EXPECT_EQ(use_first_key ? 4 : 7, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2321,7 +2862,7 @@ options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, @@ -2339,9 +2880,11 @@ auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); ASSERT_EQ(1u, props->num_data_blocks); + ReadOptions read_options; std::unique_ptr iter(reader->NewIterator( - ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, - /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized, + /*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true)); iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); ASSERT_TRUE(iter->Valid()); @@ -2351,6 +2894,7 @@ // Key should have been served from index, without reading data blocks. 
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_TRUE(iter->PrepareValue()); EXPECT_EQ("x", iter->value().ToString()); EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); @@ -2373,7 +2917,7 @@ std::vector keys; for (int i = 0; i < 100; ++i) { - keys.push_back(RandomString(&rnd, 10000)); + keys.push_back(rnd.RandomString(10000)); } // Each time we load one more key to the table. the table index block @@ -2393,7 +2937,7 @@ table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2417,12 +2961,12 @@ for (int i = 0; i < 10; ++i) { // the key/val are slightly smaller than block size, so that each block // holds roughly one key/value pair. 
- c.Add(RandomString(&rnd, 900), "val"); + c.Add(rnd.RandomString(900), "val"); } std::vector ks; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2443,7 +2987,7 @@ SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2517,7 +3061,7 @@ SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2561,14 +3105,15 @@ SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); for (uint32_t i = 1; i <= 2; i++) { + ReadOptions read_options; std::unique_ptr iter(c.GetTableReader()->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUserIterator)); iter->SeekToFirst(); while (iter->Valid()) { @@ -2690,7 +3235,7 @@ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, 
table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2712,8 +3257,8 @@ GetContext::kNotFound, Slice(), nullptr, nullptr, nullptr, true, nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. - reader->Get(ReadOptions(), "non-exist-key", &get_context, - moptions.prefix_extractor.get()); + ASSERT_OK(reader->Get(ReadOptions(), "non-exist-key", &get_context, + moptions.prefix_extractor.get())); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertIndexBlockStat(0, 0); props.AssertFilterBlockStat(0, 0); @@ -2742,7 +3287,7 @@ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2787,6 +3332,7 @@ // Only data block will be accessed { iter->SeekToFirst(); + ASSERT_OK(iter->status()); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, 1, 0 + 1, // data block miss 0); @@ -2801,6 +3347,7 @@ { iter.reset(c.NewIterator(moptions.prefix_extractor.get())); iter->SeekToFirst(); + ASSERT_OK(iter->status()); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, 1 + 1, /* index block hit */ 1, 0 + 1 /* data block hit */); @@ -2820,9 +3367,9 @@ table_options.block_cache = NewLRUCache(1, 4); options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(options); + const ImmutableOptions ioptions2(options); const MutableCFOptions moptions2(options); - c.Reopen(ioptions2, moptions2); + ASSERT_OK(c.Reopen(ioptions2, moptions2)); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss @@ -2848,6 +3395,7 @@ // SeekToFirst() 
accesses data block. With similar reason, we expect data // block's cache miss. iter->SeekToFirst(); + ASSERT_OK(iter->status()); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(2, 0, 0 + 1, // data block miss 0); @@ -2866,7 +3414,7 @@ std::string user_key = "k01"; InternalKey internal_key(user_key, 0, kTypeValue); c3.Add(internal_key.Encode().ToString(), "hello"); - ImmutableCFOptions ioptions3(options); + ImmutableOptions ioptions3(options); MutableCFOptions moptions3(options); // Generate table without filter policy c3.Finish(options, ioptions3, moptions3, table_options, @@ -2877,7 +3425,7 @@ table_options.filter_policy.reset(NewBloomFilterPolicy(1)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); - ImmutableCFOptions ioptions4(options); + ImmutableOptions ioptions4(options); MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); @@ -2900,7 +3448,7 @@ BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); const BlockBasedTableOptions* normalized_table_options = - (const BlockBasedTableOptions*)factory->GetOptions(); + factory->GetOptions(); ASSERT_EQ(normalized_table_options->block_size_deviation, expected); delete factory; @@ -2912,7 +3460,7 @@ BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options); const BlockBasedTableOptions* normalized_table_options = - (const BlockBasedTableOptions*)factory->GetOptions(); + factory->GetOptions(); ASSERT_EQ(normalized_table_options->block_restart_interval, expected); delete factory; @@ -2961,7 +3509,7 @@ InternalKey internal_key(user_key, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c.Add(encoded_key, "hello"); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); // Generate table with filter policy c.Finish(options, 
ioptions, moptions, table_options, @@ -3049,7 +3597,7 @@ c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3064,7 +3612,7 @@ ASSERT_OK(iter->status()); iter.reset(); - const ImmutableCFOptions ioptions1(opt); + const ImmutableOptions ioptions1(opt); const MutableCFOptions moptions1(opt); ASSERT_OK(c.Reopen(ioptions1, moptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); @@ -3077,7 +3625,7 @@ // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(opt); + const ImmutableOptions ioptions2(opt); const MutableCFOptions moptions2(opt); ASSERT_OK(c.Reopen(ioptions2, moptions2)); table_reader = dynamic_cast(c.GetTableReader()); @@ -3088,30 +3636,10 @@ c.ResetTableReader(); } -namespace { -class CustomMemoryAllocator : public MemoryAllocator { - public: - const char* Name() const override { return "CustomMemoryAllocator"; } - - void* Allocate(size_t size) override { - ++numAllocations; - auto ptr = new char[size + 16]; - memcpy(ptr, "memory_allocator_", 16); // mangle first 16 bytes - return reinterpret_cast(ptr + 16); - } - void Deallocate(void* p) override { - ++numDeallocations; - char* ptr = reinterpret_cast(p) - 16; - delete[] ptr; - } - - std::atomic numAllocations; - std::atomic numDeallocations; -}; -} // namespace - TEST_P(BlockBasedTableTest, MemoryAllocator) { - auto custom_memory_allocator = std::make_shared(); + auto default_memory_allocator = std::make_shared(); + auto custom_memory_allocator = + std::make_shared(default_memory_allocator); { Options opt; std::unique_ptr ikc; @@ -3137,7 +3665,7 @@ c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap 
kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3154,31 +3682,22 @@ // out of scope, block cache should have been deleted, all allocations // deallocated - EXPECT_EQ(custom_memory_allocator->numAllocations.load(), - custom_memory_allocator->numDeallocations.load()); + EXPECT_EQ(custom_memory_allocator->GetNumAllocations(), + custom_memory_allocator->GetNumDeallocations()); // make sure that allocations actually happened through the cache allocator - EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); + EXPECT_GT(custom_memory_allocator->GetNumAllocations(), 0); } // Test the file checksum of block based table TEST_P(BlockBasedTableTest, NoFileChecksum) { Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - SequenceNumber largest_seqno = 0; int level = 0; - std::vector> - int_tbl_prop_collector_factories; - - if (largest_seqno != 0) { - // Pretend that it's an external file written by SstFileWriter. 
- int_tbl_prop_collector_factories.emplace_back( - new SstFileWriterPropertiesCollectorFactory(2 /* version */, - 0 /* global_seqno*/)); - } + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; FileChecksumTestHelper f(true); @@ -3187,61 +3706,66 @@ builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); - ASSERT_STREQ(f.GetFileChecksumFuncName(), - kUnknownFileChecksumFuncName.c_str()); - ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum.c_str()); + ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); + ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum); } -TEST_P(BlockBasedTableTest, Crc32FileChecksum) { +TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { + FileChecksumGenCrc32cFactory* file_checksum_gen_factory = + new FileChecksumGenCrc32cFactory(); Options options; - options.sst_file_checksum_func = - std::shared_ptr(CreateFileChecksumFuncCrc32c()); - ImmutableCFOptions ioptions(options); + options.file_checksum_gen_factory.reset(file_checksum_gen_factory); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - SequenceNumber largest_seqno = 0; int level = 0; - std::vector> - int_tbl_prop_collector_factories; - - if 
(largest_seqno != 0) { - // Pretend that it's an external file written by SstFileWriter. - int_tbl_prop_collector_factories.emplace_back( - new SstFileWriterPropertiesCollectorFactory(2 /* version */, - 0 /* global_seqno*/)); - } + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; + FileChecksumGenContext gen_context; + gen_context.file_name = "db/tmp"; + std::unique_ptr checksum_crc32c_gen1 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); FileChecksumTestHelper f(true); f.CreateWriteableFile(); - f.SetFileChecksumFunc(options.sst_file_checksum_func.get()); + f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); + + std::unique_ptr checksum_crc32c_gen2 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); std::string checksum; - ASSERT_OK( - f.CalculateFileChecksum(options.sst_file_checksum_func.get(), &checksum)); + ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum)); ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); + + // Unit test the generator itself for schema stability + std::unique_ptr checksum_crc32c_gen3 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + 
gen_context); + const char data[] = "here is some data"; + checksum_crc32c_gen3->Update(data, sizeof(data)); + checksum_crc32c_gen3->Finalize(); + checksum = checksum_crc32c_gen3->GetChecksum(); + ASSERT_STREQ(checksum.c_str(), "\345\245\277\110"); } // Plain table is not supported in ROCKSDB_LITE @@ -3253,23 +3777,21 @@ plain_table_options.hash_table_ratio = 0; PlainTableFactory factory(plain_table_options); - test::StringSink sink; - std::unique_ptr file_writer( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -3279,19 +3801,18 @@ builder->Add(key, value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::StringSink* ss = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter(file_writer.get()); + static_cast(file_writer->writable_file()); + std::unique_ptr source( + new 
test::StringSource(ss->contents(), 72242, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss->contents(), 72242, true))); + new RandomAccessFileReader(std::move(source), "test")); - TableProperties* props = nullptr; + std::unique_ptr props; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), - kPlainTableMagicNumber, ioptions, - &props, true /* compression_type_missing */); - std::unique_ptr props_guard(props); + kPlainTableMagicNumber, ioptions, &props); ASSERT_OK(s); ASSERT_EQ(0ul, props->index_size); @@ -3310,66 +3831,71 @@ PlainTableFactory factory(plain_table_options); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; FileChecksumTestHelper f(true); f.CreateWriteableFile(); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); - ASSERT_STREQ(f.GetFileChecksumFuncName(), - kUnknownFileChecksumFuncName.c_str()); - EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum.c_str()); + ASSERT_OK(f.WriteKVAndFlushTable()); + ASSERT_STREQ(f.GetFileChecksumFuncName(), 
kUnknownFileChecksumFuncName); + EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum); } -TEST_F(PlainTableTest, Crc32FileChecksum) { +TEST_F(PlainTableTest, Crc32cFileChecksum) { PlainTableOptions plain_table_options; plain_table_options.user_key_len = 20; plain_table_options.bloom_bits_per_key = 8; plain_table_options.hash_table_ratio = 0; PlainTableFactory factory(plain_table_options); + FileChecksumGenCrc32cFactory* file_checksum_gen_factory = + new FileChecksumGenCrc32cFactory(); Options options; - options.sst_file_checksum_func = - std::shared_ptr(CreateFileChecksumFuncCrc32c()); - const ImmutableCFOptions ioptions(options); + options.file_checksum_gen_factory.reset(file_checksum_gen_factory); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; + + FileChecksumGenContext gen_context; + gen_context.file_name = "db/tmp"; + std::unique_ptr checksum_crc32c_gen1 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); FileChecksumTestHelper f(true); f.CreateWriteableFile(); - f.SetFileChecksumFunc(options.sst_file_checksum_func.get()); + f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); - f.ResetTableBuilder(std::move(builder)); + 
ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); + + std::unique_ptr checksum_crc32c_gen2 = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); std::string checksum; - ASSERT_OK( - f.CalculateFileChecksum(options.sst_file_checksum_func.get(), &checksum)); + ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum)); EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); } @@ -3387,11 +3913,12 @@ std::vector keys; stl_wrappers::KVMap kvmap; Options options; + options.db_host_id = ""; test::PlainInternalKeyComparator internal_comparator(options.comparator); options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, internal_comparator, &keys, &kvmap); @@ -3427,16 +3954,16 @@ options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + 
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7050)); c.ResetTableReader(); } @@ -3482,75 +4009,26 @@ } } -#ifndef ROCKSDB_VALGRIND_RUN -// RandomizedHarnessTest is very slow for certain combination of arguments -// Split into 8 pieces to reduce the time individual tests take. -TEST_F(HarnessTest, Randomized1) { - // part 1 out of 8 - const size_t part = 1; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized2) { - // part 2 out of 8 - const size_t part = 2; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized3) { - // part 3 out of 8 - const size_t part = 3; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized4) { - // part 4 out of 8 - const size_t part = 4; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized5) { - // part 5 out of 8 - const size_t part = 5; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized6) { - // part 6 out of 8 - const size_t part = 6; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized7) { - // part 7 out of 8 - const size_t part = 7; - const size_t total = 8; - RandomizedHarnessTest(part, total); -} - -TEST_F(HarnessTest, Randomized8) { - // part 8 out of 8 - const size_t part = 8; - const size_t total = 8; - RandomizedHarnessTest(part, total); +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(ParameterizedHarnessTest, RandomizedHarnessTest) { + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ? 
1 : 200)) { + for (int e = 0; e < num_entries; e++) { + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + rnd.RandomString(rnd.Skewed(5))); + } + Test(&rnd); + } } #ifndef ROCKSDB_LITE -TEST_F(HarnessTest, RandomizedLongDB) { +TEST_F(DBHarnessTest, RandomizedLongDB) { Random rnd(test::RandomSeed()); - TestArgs args = {DB_TEST, false, 16, kNoCompression, 0, false}; - Init(args); int num_entries = 100000; for (int e = 0; e < num_entries; e++) { std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + Add(test::RandomKey(&rnd, rnd.Skewed(4)), rnd.RandomString(rnd.Skewed(5))); } Test(&rnd); @@ -3566,30 +4044,44 @@ ASSERT_GT(files, 0); } #endif // ROCKSDB_LITE -#endif // ROCKSDB_VALGRIND_RUN +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) -class MemTableTest : public testing::Test {}; +class MemTableTest : public testing::Test { + public: + MemTableTest() { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared(); + options_.memtable_factory = table_factory; + ImmutableOptions ioptions(options_); + wb_ = new WriteBufferManager(options_.db_write_buffer_size); + memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + } + + ~MemTableTest() { + delete memtable_->Unref(); + delete wb_; + } + + MemTable* GetMemTable() { return memtable_; } + + private: + MemTable* memtable_; + Options options_; + WriteBufferManager* wb_; +}; TEST_F(MemTableTest, Simple) { - InternalKeyComparator cmp(BytewiseComparator()); - auto table_factory = std::make_shared(); - Options options; - options.memtable_factory = table_factory; - ImmutableCFOptions ioptions(options); - WriteBufferManager wb(options.db_write_buffer_size); - MemTable* memtable = - new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, - kMaxSequenceNumber, 0 /* column_family_id */); - memtable->Ref(); WriteBatch 
batch; WriteBatchInternal::SetSequence(&batch, 100); - batch.Put(std::string("k1"), std::string("v1")); - batch.Put(std::string("k2"), std::string("v2")); - batch.Put(std::string("k3"), std::string("v3")); - batch.Put(std::string("largekey"), std::string("vlarge")); - batch.DeleteRange(std::string("chi"), std::string("xigua")); - batch.DeleteRange(std::string("begin"), std::string("end")); - ColumnFamilyMemTablesDefault cf_mems_default(memtable); + ASSERT_OK(batch.Put(std::string("k1"), std::string("v1"))); + ASSERT_OK(batch.Put(std::string("k2"), std::string("v2"))); + ASSERT_OK(batch.Put(std::string("k3"), std::string("v3"))); + ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge"))); + ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua"))); + ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end"))); + ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable()); ASSERT_TRUE( WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr) .ok()); @@ -3600,10 +4092,10 @@ std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = memtable->NewIterator(ReadOptions(), &arena); + iter = GetMemTable()->NewIterator(ReadOptions(), &arena); arena_iter_guard.set(iter); } else { - iter = memtable->NewRangeTombstoneIterator( + iter = GetMemTable()->NewRangeTombstoneIterator( ReadOptions(), kMaxSequenceNumber /* read_seq */); iter_guard.reset(iter); } @@ -3617,174 +4109,123 @@ iter->Next(); } } - - delete memtable->Unref(); } // Test the empty key -TEST_F(HarnessTest, SimpleEmptyKey) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 1); - Add("", "v"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleEmptyKey) { + Random rnd(test::RandomSeed() + 1); + Add("", "v"); + Test(&rnd); } -TEST_F(HarnessTest, SimpleSingle) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 2); - 
Add("abc", "v"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleSingle) { + Random rnd(test::RandomSeed() + 2); + Add("abc", "v"); + Test(&rnd); } -TEST_F(HarnessTest, SimpleMulti) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 3); - Add("abc", "v"); - Add("abcd", "v"); - Add("ac", "v2"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleMulti) { + Random rnd(test::RandomSeed() + 3); + Add("abc", "v"); + Add("abcd", "v"); + Add("ac", "v2"); + Test(&rnd); } -TEST_F(HarnessTest, SimpleSpecialKey) { - auto args = GenerateArgList(); - for (const auto& arg : args) { - Init(arg); - Random rnd(test::RandomSeed() + 4); - Add("\xff\xff", "v3"); - Test(&rnd); - } +TEST_P(ParameterizedHarnessTest, SimpleSpecialKey) { + Random rnd(test::RandomSeed() + 4); + Add("\xff\xff", "v3"); + Test(&rnd); } -TEST_F(HarnessTest, FooterTests) { - { - // upconvert legacy block based - std::string encoded; - Footer footer(kLegacyBlockBasedTableMagicNumber, 0); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.EncodeTo(&encoded); - Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); - ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kCRC32c); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 0U); - } - { - // xxhash block based - std::string encoded; - Footer footer(kBlockBasedTableMagicNumber, 1); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - 
footer.set_checksum(kxxHash); - footer.EncodeTo(&encoded); - Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); - ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kxxHash); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 1U); - } +TEST(TableTest, FooterTests) { + Random* r = Random::GetTLSInstance(); + uint64_t data_size = (uint64_t{1} << r->Uniform(40)) + r->Uniform(100); + uint64_t index_size = r->Uniform(1000000000); + uint64_t metaindex_size = r->Uniform(1000000); + // 5 == block trailer size + BlockHandle index(data_size + 5, index_size); + BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size); + uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5; { - // xxhash64 block based - std::string encoded; - Footer footer(kBlockBasedTableMagicNumber, 1); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.set_checksum(kxxHash64); - footer.EncodeTo(&encoded); + // legacy block based + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0, + footer_offset, kCRC32c, meta_index, index); Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kxxHash64); + ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); 
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 1U); + ASSERT_EQ(decoded_footer.format_version(), 0U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); + // Ensure serialized with legacy magic + ASSERT_EQ( + DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), + kLegacyBlockBasedTableMagicNumber); + } + // block based, various checksums, various versions + for (auto t : GetSupportedChecksums()) { + for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) { + FooterBuilder footer; + footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t, + meta_index, index); + Footer decoded_footer; + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); + ASSERT_EQ(decoded_footer.table_magic_number(), + kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum_type(), t); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), + meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.format_version(), fv); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); + } } // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE { - // upconvert legacy plain table - std::string encoded; - Footer footer(kLegacyPlainTableMagicNumber, 0); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.EncodeTo(&encoded); + // legacy plain table + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 0, footer_offset, + kNoChecksum, meta_index); Footer decoded_footer; - Slice encoded_slice(encoded); - 
decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c); ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 0U); + ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); + ASSERT_EQ(decoded_footer.index_handle().size(), 0U); + ASSERT_EQ(decoded_footer.format_version(), 0U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); + // Ensure serialized with legacy magic + ASSERT_EQ( + DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8), + kLegacyPlainTableMagicNumber); } { - // xxhash block based - std::string encoded; - Footer footer(kPlainTableMagicNumber, 1); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.set_checksum(kxxHash); - footer.EncodeTo(&encoded); + // xxhash plain table (not currently used) + FooterBuilder footer; + footer.Build(kPlainTableMagicNumber, /* format_version */ 1, footer_offset, + kxxHash, meta_index); Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kxxHash); + ASSERT_EQ(decoded_footer.checksum_type(), kxxHash); ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), 
index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 1U); + ASSERT_EQ(decoded_footer.index_handle().offset(), 0U); + ASSERT_EQ(decoded_footer.index_handle().size(), 0U); + ASSERT_EQ(decoded_footer.format_version(), 1U); + ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); } #endif // !ROCKSDB_LITE - { - // version == 2 - std::string encoded; - Footer footer(kBlockBasedTableMagicNumber, 2); - BlockHandle meta_index(10, 5), index(20, 15); - footer.set_metaindex_handle(meta_index); - footer.set_index_handle(index); - footer.EncodeTo(&encoded); - Footer decoded_footer; - Slice encoded_slice(encoded); - decoded_footer.DecodeFrom(&encoded_slice); - ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); - ASSERT_EQ(decoded_footer.checksum(), kCRC32c); - ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); - ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); - ASSERT_EQ(decoded_footer.version(), 2U); - } } class IndexBlockRestartIntervalTest @@ -3816,28 +4257,31 @@ table_options.index_block_restart_interval = index_block_restart_interval; if (value_delta_encoding) { table_options.format_version = 4; + } else { + table_options.format_version = 3; } options.table_factory.reset(new BlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); static Random rnd(301); for (int i = 0; i < kKeysInTable; i++) { - InternalKey k(RandomString(&rnd, kKeySize), 0, kTypeValue); - c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + InternalKey k(rnd.RandomString(kKeySize), 0, kTypeValue); + c.Add(k.Encode().ToString(), rnd.RandomString(kValSize)); } std::vector keys; stl_wrappers::KVMap kvmap; std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const 
ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); auto reader = c.GetTableReader(); + ReadOptions read_options; std::unique_ptr db_iter(reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup @@ -3881,8 +4325,7 @@ } bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override { - assert(IsValid(src)); - return true; + return IsValid(src); } bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override { @@ -3925,7 +4368,7 @@ const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -3934,12 +4377,12 @@ std::string prefix = "[" + std::to_string(i) + "]"; for (int j = 0; j < 10; j++) { std::string key = prefix + std::to_string(j); - db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1"); + ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1")); } } // Trigger compaction. - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); delete db; // In the second round, turn whole_key_filtering off and expect // rocksdb still works. 
@@ -3955,15 +4398,15 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; int_tbl_prop_collector_factories.emplace_back( new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); @@ -3971,9 +4414,8 @@ std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -3984,7 +4426,7 @@ builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::RandomRWStringSink ss_rw(sink); uint32_t version; @@ -3993,24 +4435,22 @@ // Helper function to get version, global_seqno, global_seqno_offset std::function GetVersionAndGlobalSeqno = [&]() { + std::unique_ptr source( + new test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new 
RandomAccessFileReader(std::move(source), "")); - TableProperties* props = nullptr; + std::unique_ptr props; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + &props)); UserCollectedProperties user_props = props->user_collected_properties; version = DecodeFixed32( user_props[ExternalSstFilePropertyNames::kVersion].c_str()); global_seqno = DecodeFixed64( user_props[ExternalSstFilePropertyNames::kGlobalSeqno].c_str()); - global_seqno_offset = - props->properties_offsets[ExternalSstFilePropertyNames::kGlobalSeqno]; - - delete props; + global_seqno_offset = props->external_sst_file_global_seqno_offset; }; // Helper function to update the value of the global seqno in the file @@ -4018,23 +4458,26 @@ std::string new_global_seqno; PutFixed64(&new_global_seqno, val); - ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno)); + ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(), + nullptr)); }; // Helper function to get the contents of the table InternalIterator std::unique_ptr table_reader; + const ReadOptions read_options; std::function GetTableInternalIter = [&]() { + std::unique_ptr source( + new test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "")); options.table_factory->NewTableReader( - TableReaderOptions(ioptions, moptions.prefix_extractor.get(), - EnvOptions(), ikc), + TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), + ikc), std::move(file_reader), ss_rw.contents().size(), &table_reader); return table_reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized); }; @@ -4046,7 
+4489,7 @@ char current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 0); @@ -4067,7 +4510,7 @@ current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4085,7 +4528,7 @@ ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4104,7 +4547,7 @@ current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4123,7 +4566,7 @@ ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_TRUE(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4138,23 +4581,22 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - 
const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4167,24 +4609,22 @@ builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, false)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); - + new RandomAccessFileReader(std::move(source), "test")); // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { - TableProperties* props = nullptr; - ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), + std::unique_ptr props; + ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + &props)); uint64_t data_block_size = props->data_size / props->num_data_blocks; ASSERT_EQ(data_block_size, 4096); ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks); - delete props; }; VerifyBlockAlignment(); @@ -4196,17 +4636,17 @@ bbto.block_align = false; Options options2; 
options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); - ImmutableCFOptions ioptions2(options2); + ImmutableOptions ioptions2(options2); const MutableCFOptions moptions2(options2); ASSERT_OK(ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(), - EnvOptions(), + TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(), GetPlainInternalComparator(options2.comparator)), - std::move(file_reader), ss_rw.contents().size(), &table_reader)); + std::move(file_reader), sink->contents().size(), &table_reader)); + ReadOptions read_options; std::unique_ptr db_iter(table_reader->NewIterator( - ReadOptions(), moptions2.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); int expected_key = 1; @@ -4229,26 +4669,25 @@ BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - 
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4261,20 +4700,22 @@ builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "test")); { RandomAccessFileReader* file = file_reader.get(); - uint64_t file_size = ss_rw.contents().size(); + uint64_t file_size = sink->contents().size(); Footer footer; - ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, - &footer, kBlockBasedTableMagicNumber)); + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */, + file_size, &footer, + kBlockBasedTableMagicNumber)); auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { @@ -4297,25 +4738,20 @@ BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, &metaindex_contents); - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter(metaindex_block.NewDataIterator( - BytewiseComparator(), BytewiseComparator())); - bool found_properties_block = true; - ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); - ASSERT_TRUE(found_properties_block); + BytewiseComparator(), kDisableGlobalSequenceNumber)); // -- Read properties block - Slice v = meta_iter->value(); BlockHandle properties_handle; - ASSERT_OK(properties_handle.DecodeFrom(&v)); + ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlockName, + &properties_handle)); + 
ASSERT_FALSE(properties_handle.IsNull()); BlockContents properties_contents; - BlockFetchHelper(properties_handle, BlockType::kProperties, &properties_contents); - Block properties_block(std::move(properties_contents), - kDisableGlobalSequenceNumber); + Block properties_block(std::move(properties_contents)); ASSERT_EQ(properties_block.NumRestarts(), 1u); } @@ -4344,7 +4780,7 @@ table_options.filter_policy.reset(NewBloomFilterPolicy( 8 /* bits_per_key */, false /* use_block_based_filter */)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); std::vector keys; stl_wrappers::KVMap kvmap; @@ -4353,15 +4789,17 @@ // get file reader test::StringSink* table_sink = c.TEST_GetSink(); - std::unique_ptr table_reader{ - test::GetRandomAccessFileReader( - new test::StringSource(table_sink->contents(), 0 /* unique_id */, - false /* allow_mmap_reads */))}; + std::unique_ptr source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr table_reader( + new RandomAccessFileReader(std::move(source), "test")); size_t table_size = table_sink->contents().size(); // read footer Footer footer; - ASSERT_OK(ReadFooterFromFile(table_reader.get(), + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), nullptr /* prefetch_buffer */, table_size, &footer, kBlockBasedTableMagicNumber)); @@ -4376,12 +4814,11 @@ UncompressionDict::GetEmptyDict(), pcache_opts, nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); - Block metaindex_block(std::move(metaindex_contents), - kDisableGlobalSequenceNumber); + Block metaindex_block(std::move(metaindex_contents)); // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewDataIterator(options.comparator, options.comparator)}; + metaindex_block.NewMetaIterator()}; uint64_t max_offset = 0; 
std::string key_at_max_offset; for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); @@ -4394,13 +4831,97 @@ key_at_max_offset = metaindex_iter->key().ToString(); } } - ASSERT_EQ(kPropertiesBlock, key_at_max_offset); + ASSERT_EQ(kPropertiesBlockName, key_at_max_offset); // index handle is stored in footer rather than metaindex block, so need // separate logic to verify it comes before properties block. ASSERT_GT(max_offset, footer.index_handle().offset()); c.ResetTableReader(); } +TEST_P(BlockBasedTableTest, SeekMetaBlocks) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("foo_a1", "val1"); + c.Add("foo_b2", "val2"); + c.Add("foo_c3", "val3"); + c.Add("foo_d4", "val4"); + c.Add("foo_e5", "val5"); + c.Add("foo_f6", "val6"); + c.Add("foo_g7", "val7"); + c.Add("foo_h8", "val8"); + c.Add("foo_j9", "val9"); + + // write an SST file + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy( + 8 /* bits_per_key */, false /* use_block_based_filter */)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + // get file reader + test::StringSink* table_sink = c.TEST_GetSink(); + std::unique_ptr source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr table_reader( + new RandomAccessFileReader(std::move(source), "test")); + size_t table_size = table_sink->contents().size(); + + // read footer + Footer footer; + IOOptions opts; + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), 
+ nullptr /* prefetch_buffer */, table_size, + &footer, kBlockBasedTableMagicNumber)); + + // read metaindex + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + PersistentCacheOptions pcache_opts; + BlockFetcher block_fetcher( + table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); + ASSERT_OK(block_fetcher.ReadBlockContents()); + Block metaindex_block(std::move(metaindex_contents)); + + // verify properties block comes last + std::unique_ptr metaindex_iter( + metaindex_block.NewMetaIterator()); + bool has_hash_prefixes = false; + bool has_hash_metadata = false; + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + if (metaindex_iter->key().ToString() == kHashIndexPrefixesBlock) { + has_hash_prefixes = true; + } else if (metaindex_iter->key().ToString() == + kHashIndexPrefixesMetadataBlock) { + has_hash_metadata = true; + } + } + if (has_hash_metadata) { + metaindex_iter->Seek(kHashIndexPrefixesMetadataBlock); + ASSERT_TRUE(metaindex_iter->Valid()); + ASSERT_EQ(kHashIndexPrefixesMetadataBlock, + metaindex_iter->key().ToString()); + } + if (has_hash_prefixes) { + metaindex_iter->Seek(kHashIndexPrefixesBlock); + ASSERT_TRUE(metaindex_iter->Valid()); + ASSERT_EQ(kHashIndexPrefixesBlock, metaindex_iter->key().ToString()); + } + c.ResetTableReader(); +} + TEST_P(BlockBasedTableTest, BadOptions) { ROCKSDB_NAMESPACE::Options options; options.compression = kNoCompression; @@ -4411,7 +4932,7 @@ const std::string kDBPath = test::PerThreadDBPath("block_based_table_bad_options_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; 
ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -4457,10 +4978,18 @@ TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { TailPrefetchStats tpstats; - FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); - buffer.TryReadFromCache(500, 10, nullptr); - buffer.TryReadFromCache(480, 10, nullptr); - buffer.TryReadFromCache(490, 10, nullptr); + FilePrefetchBuffer buffer(0 /* readahead_size */, 0 /* max_readahead_size */, + false /* enable */, true /* track_min_offset */); + IOOptions opts; + buffer.TryReadFromCache(opts, nullptr /* reader */, 500 /* offset */, + 10 /* n */, nullptr /* result */, + nullptr /* status */); + buffer.TryReadFromCache(opts, nullptr /* reader */, 480 /* offset */, + 10 /* n */, nullptr /* result */, + nullptr /* status */); + buffer.TryReadFromCache(opts, nullptr /* reader */, 490 /* offset */, + 10 /* n */, nullptr /* result */, + nullptr /* status */); ASSERT_EQ(480, buffer.min_offset_read()); } @@ -4483,14 +5012,14 @@ static Random rnd(1048); for (int i = 0; i < kNumKeys; i++) { // padding one "0" to mark existent keys. 
- std::string random_key(RandomString(&rnd, kKeySize - 1) + "1"); + std::string random_key(rnd.RandomString(kKeySize - 1) + "1"); InternalKey k(random_key, 0, kTypeValue); - c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + c.Add(k.Encode().ToString(), rnd.RandomString(kValSize)); } std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -4499,8 +5028,9 @@ auto reader = c.GetTableReader(); std::unique_ptr seek_iter; + ReadOptions read_options; seek_iter.reset(reader->NewIterator( - ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); for (int i = 0; i < 2; ++i) { ReadOptions ro; @@ -4572,7 +5102,7 @@ Options options; BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); @@ -4587,13 +5117,15 @@ /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); - ASSERT_TRUE(iter->IsOutOfBound()); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); iter.reset(new KeyConvertingIterator(reader->NewIterator( read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_FALSE(iter->Valid()); - ASSERT_TRUE(iter->IsOutOfBound()); + ASSERT_OK(iter->status()); + 
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); } // BlockBasedTableIterator should invalidate itself and return @@ -4610,7 +5142,7 @@ table_opt.flush_block_policy_factory = std::make_shared(); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); @@ -4628,7 +5160,7 @@ ASSERT_EQ("bar", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); - ASSERT_TRUE(iter->IsOutOfBound()); + ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); std::string ub2 = "foo_after"; Slice ub_slice2(ub2); read_opt.iterate_upper_bound = &ub_slice2; @@ -4640,12 +5172,246 @@ ASSERT_EQ("foo", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); - ASSERT_FALSE(iter->IsOutOfBound()); + ASSERT_FALSE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound); +} + +TEST_P( + BlockBasedTableTest, + IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnBuilderFinish) { + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024; + constexpr std::size_t kMaxDictBytes = 1024; + constexpr std::size_t kMaxDictBufferBytes = 1024; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr cache(NewLRUCache(lo)); + table_options.block_cache = cache; + table_options.flush_block_policy_factory = + std::make_shared(); + + Options options; + options.compression = kSnappyCompression; + options.compression_opts.max_dict_bytes = kMaxDictBytes; + options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; + 
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + test::StringSink* sink = new test::StringSink(); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "test_file_name", FileOptions())); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kSnappyCompression, + options.compression_opts, kUnknownColumnFamily, + "test_cf", -1 /* level */), + file_writer.get())); + + std::string key1 = "key1"; + std::string value1 = "val1"; + InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue); + // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy + // therefore won't trigger any data block's buffering + builder->Add(ik1.Encode(), value1); + ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + std::string key2 = "key2"; + std::string value2 = "val2"; + InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue); + // Adding the second key will trigger a flush of the last data block (the one + // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of that data block. 
+ builder->Add(ik2.Encode(), value2); + // Cache reservation will increase for last buffered data block (the one + // containing key1 and value1) since the buffer limit is not exceeded after + // that buffering and the cache will not be full after this reservation + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + ASSERT_OK(builder->Finish()); + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); +} + +TEST_P( + BlockBasedTableTest, + IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnBufferLimitExceed) { + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024; + constexpr std::size_t kMaxDictBytes = 1024; + constexpr std::size_t kMaxDictBufferBytes = 2 * kSizeDummyEntry; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr cache(NewLRUCache(lo)); + table_options.block_cache = cache; + table_options.flush_block_policy_factory = + std::make_shared(); + + Options options; + options.compression = kSnappyCompression; + options.compression_opts.max_dict_bytes = kMaxDictBytes; + options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + test::StringSink* sink = new test::StringSink(); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "test_file_name", FileOptions())); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(options.table_factory->NewTableBuilder( + 
TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kSnappyCompression, + options.compression_opts, kUnknownColumnFamily, + "test_cf", -1 /* level */), + file_writer.get())); + + std::string key1 = "key1"; + std::string value1(kSizeDummyEntry, '0'); + InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue); + // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy + // therefore won't trigger any data block's buffering + builder->Add(ik1.Encode(), value1); + ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + std::string key2 = "key2"; + std::string value2(kSizeDummyEntry, '0'); + InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue); + // Adding the second key will trigger a flush of the last data block (the one + // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. + builder->Add(ik2.Encode(), value2); + // Cache reservation will increase for last buffered data block (the one + // containing key1 and value1) since the buffer limit is not exceeded after + // the buffering and the cache will not be full after this reservation + EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry); + EXPECT_LT(cache->GetPinnedUsage(), + 2 * kSizeDummyEntry + kMetaDataChargeOverhead); + + std::string key3 = "key3"; + std::string value3 = "val3"; + InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue); + // Adding the third key will trigger a flush of the last data block (the one + // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. 
+ builder->Add(ik3.Encode(), value3); + // Cache reservation will decrease since the buffer limit is now exceeded + // after the last buffering and EnterUnbuffered() is triggered + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + ASSERT_OK(builder->Finish()); + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); +} + +TEST_P( + BlockBasedTableTest, + IncreaseCacheReservationForCompressDictBuildingBufferOnBuilderAddAndDecreaseOnCacheFull) { + constexpr std::size_t kSizeDummyEntry = 256 * 1024; + constexpr std::size_t kMetaDataChargeOverhead = 10000; + // A small kCacheCapacity is chosen so that increase cache reservation for + // buffering two data blocks, each containing key1/value1, key2/a big + // value2, will cause cache full + constexpr std::size_t kCacheCapacity = + 1 * kSizeDummyEntry + kSizeDummyEntry / 2; + constexpr std::size_t kMaxDictBytes = 1024; + // A big kMaxDictBufferBytes is chosen so that adding a big key value pair + // (key2, value2) won't exceed the buffer limit + constexpr std::size_t kMaxDictBufferBytes = 1024 * 1024 * 1024; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr cache(NewLRUCache(lo)); + table_options.block_cache = cache; + table_options.flush_block_policy_factory = + std::make_shared(); + + Options options; + options.compression = kSnappyCompression; + options.compression_opts.max_dict_bytes = kMaxDictBytes; + options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + test::StringSink* sink = new test::StringSink(); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "test_file_name", FileOptions())); + + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + InternalKeyComparator 
ikc(options.comparator); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; + + std::unique_ptr builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kSnappyCompression, + options.compression_opts, kUnknownColumnFamily, + "test_cf", -1 /* level */), + file_writer.get())); + + std::string key1 = "key1"; + std::string value1 = "val1"; + InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue); + // Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy + // therefore won't trigger any data block's buffering + builder->Add(ik1.Encode(), value1); + ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + std::string key2 = "key2"; + std::string value2(kSizeDummyEntry, '0'); + InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue); + // Adding the second key will trigger a flush of the last data block (the one + // containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. + builder->Add(ik2.Encode(), value2); + // Cache reservation will increase for the last buffered data block (the one + // containing key1 and value1) since the buffer limit is not exceeded after + // the buffering and the cache will not be full after this reservation + EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry); + EXPECT_LT(cache->GetPinnedUsage(), + 1 * kSizeDummyEntry + kMetaDataChargeOverhead); + + std::string key3 = "key3"; + std::string value3 = "value3"; + InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue); + // Adding the third key will trigger a flush of the last data block (the one + // containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger + // buffering of the last data block. 
+ builder->Add(ik3.Encode(), value3); + // Cache reservation will decrease since the cache is now full after + // increasing reservation for the last buffered block and EnterUnbuffered() is + // triggered + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); + + ASSERT_OK(builder->Finish()); + EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry); } } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/two_level_iterator.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/two_level_iterator.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/two_level_iterator.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/two_level_iterator.cc 2025-05-19 16:14:27.000000000 +0000 @@ -43,6 +43,10 @@ assert(Valid()); return second_level_iter_.key(); } + Slice user_key() const override { + assert(Valid()); + return second_level_iter_.user_key(); + } IndexValue value() const override { assert(Valid()); return second_level_iter_.value(); @@ -197,6 +201,10 @@ state_->NewSecondaryIterator(handle); data_block_handle_ = handle; SetSecondLevelIterator(iter); + if (iter == nullptr) { + status_ = Status::Corruption("Missing block for partition " + + handle.ToString()); + } } } } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id.cc mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,166 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "table/unique_id_impl.h" +#include "util/coding_lean.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string EncodeSessionId(uint64_t upper, uint64_t lower) { + std::string db_session_id(20U, '\0'); + char *buf = &db_session_id[0]; + // Preserving `lower` is slightly tricky. 36^12 is slightly more than + // 62 bits, so we use 12 chars plus the bottom two bits of one more. + // (A tiny fraction of 20 digit strings go unused.) + uint64_t a = (upper << 2) | (lower >> 62); + uint64_t b = lower & (UINT64_MAX >> 2); + PutBaseChars<36>(&buf, 8, a, /*uppercase*/ true); + PutBaseChars<36>(&buf, 12, b, /*uppercase*/ true); + assert(buf == &db_session_id.back() + 1); + return db_session_id; +} + +Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower) { + const size_t len = db_session_id.size(); + if (len == 0) { + return Status::NotSupported("Missing db_session_id"); + } + // Anything from 13 to 24 chars is reasonable. We don't have to limit to + // exactly 20. 
+ if (len < 13) { + return Status::NotSupported("Too short db_session_id"); + } + if (len > 24) { + return Status::NotSupported("Too long db_session_id"); + } + uint64_t a = 0, b = 0; + const char *buf = &db_session_id.front(); + bool success = ParseBaseChars<36>(&buf, len - 12U, &a); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + success = ParseBaseChars<36>(&buf, 12U, &b); + if (!success) { + return Status::NotSupported("Bad digit in db_session_id"); + } + assert(buf == &db_session_id.back() + 1); + *upper = a >> 2; + *lower = (b & (UINT64_MAX >> 2)) | (a << 62); + return Status::OK(); +} + +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x3 *out) { + if (db_id.empty()) { + return Status::NotSupported("Missing db_id"); + } + if (file_number == 0) { + return Status::NotSupported("Missing or bad file number"); + } + if (db_session_id.empty()) { + return Status::NotSupported("Missing db_session_id"); + } + uint64_t session_upper = 0; // Assignment to appease clang-analyze + uint64_t session_lower = 0; // Assignment to appease clang-analyze + { + Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower); + if (!s.ok()) { + return s; + } + } + + // Exactly preserve session lower to ensure that session ids generated + // during the same process lifetime are guaranteed unique. + // DBImpl also guarantees (in recent versions) that this is not zero, + // so that we can guarantee unique ID is never all zeros. (Can't assert + // that here because of testing and old versions.) + // We put this first in anticipation of matching a small-ish set of cache + // key prefixes to cover entries relevant to any DB. + (*out)[0] = session_lower; + + // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy) + // for very high global uniqueness entropy. 
+ // (It is possible that many DBs descended from one common DB id are copied + // around and proliferate, in which case session id is critical, but it is + // more common for different DBs to have different DB ids.) + uint64_t db_a, db_b; + Hash2x64(db_id.data(), db_id.size(), session_upper, &db_a, &db_b); + + // Xor in file number for guaranteed uniqueness by file number for a given + // session and DB id. (Xor slightly better than + here. See + // https://github.com/pdillinger/unique_id ) + (*out)[1] = db_a ^ file_number; + + // Extra (optional) global uniqueness + (*out)[2] = db_b; + + return Status::OK(); +} + +namespace { +// For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all +// zeros in first 128 bits to map to itself, so that excluding zero in +// internal IDs (session_lower != 0 above) does the same for external IDs. +// These values are meaningless except for making that work. +constexpr uint64_t kHiOffsetForZero = 17391078804906429400U; +constexpr uint64_t kLoOffsetForZero = 6417269962128484497U; +} // namespace + +void InternalUniqueIdToExternal(UniqueId64x3 *in_out) { + uint64_t hi, lo; + BijectiveHash2x64((*in_out)[1] + kHiOffsetForZero, + (*in_out)[0] + kLoOffsetForZero, &hi, &lo); + (*in_out)[0] = lo; + (*in_out)[1] = hi; + (*in_out)[2] += lo + hi; +} + +void ExternalUniqueIdToInternal(UniqueId64x3 *in_out) { + uint64_t lo = (*in_out)[0]; + uint64_t hi = (*in_out)[1]; + (*in_out)[2] -= lo + hi; + BijectiveUnhash2x64(hi, lo, &hi, &lo); + (*in_out)[0] = lo - kLoOffsetForZero; + (*in_out)[1] = hi - kHiOffsetForZero; +} + +std::string EncodeUniqueIdBytes(const UniqueId64x3 &in) { + std::string ret(24U, '\0'); + EncodeFixed64(&ret[0], in[0]); + EncodeFixed64(&ret[8], in[1]); + EncodeFixed64(&ret[16], in[2]); + return ret; +} + +Status GetUniqueIdFromTableProperties(const TableProperties &props, + std::string *out_id) { + UniqueId64x3 tmp{}; + Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id, + 
props.orig_file_number, &tmp); + if (s.ok()) { + InternalUniqueIdToExternal(&tmp); + *out_id = EncodeUniqueIdBytes(tmp); + } else { + out_id->clear(); + } + return s; +} + +std::string UniqueIdToHumanString(const std::string &id) { + // Not so efficient, but that's OK + std::string str = Slice(id).ToString(/*hex*/ true); + for (size_t i = 16; i < str.size(); i += 17) { + str.insert(i, "-"); + } + return str; +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/table/unique_id_impl.h 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/table/unique_id_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/unique_id.h" + +namespace ROCKSDB_NAMESPACE { + +using UniqueId64x3 = std::array; + +// Helper for GetUniqueIdFromTableProperties. This function can also be used +// for temporary ids for files without sufficient information in table +// properties. The internal unique id is more structured than the public +// unique id, so can be manipulated in more ways but very carefully. +// These must be long term stable to ensure GetUniqueIdFromTableProperties +// is long term stable. +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x3 *out); + +// Helper for GetUniqueIdFromTableProperties. External unique ids go through +// this extra hashing layer so that prefixes of the unique id have predictable +// "full" entropy. 
This hashing layer is 1-to-1 on the first 128 bits and on +// the full 192 bits. +// This transformation must be long term stable to ensure +// GetUniqueIdFromTableProperties is long term stable. +void InternalUniqueIdToExternal(UniqueId64x3 *in_out); + +// Reverse of InternalUniqueIdToExternal mostly for testing purposes +// (demonstrably 1-to-1 on the first 128 bits and on the full 192 bits). +void ExternalUniqueIdToInternal(UniqueId64x3 *in_out); + +// Convert numerical format to byte format for public API +std::string EncodeUniqueIdBytes(const UniqueId64x3 &in); + +// Reformat a random value down to our "DB session id" format, +// which is intended to be compact and friendly for use in file names. +// `lower` is fully preserved and data is lost from `upper`. +// +// Detail: Encoded into 20 chars in base-36 ([0-9A-Z]), which is ~103 bits of +// entropy, which is enough to expect no collisions across a billion servers +// each opening DBs a million times (~2^50). Benefits vs. RFC-4122 unique id: +// * Save ~ dozen bytes per SST file +// * Shorter shared backup file names (some platforms have low limits) +// * Visually distinct from DB id format (usually RFC-4122) +std::string EncodeSessionId(uint64_t upper, uint64_t lower); + +// Reverse of EncodeSessionId. Returns NotSupported on error rather than +// Corruption because non-standard session IDs should be allowed with degraded +// functionality. 
+Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper, + uint64_t *lower); + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,437 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright 2014 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// This test uses a custom Env to keep track of the state of a filesystem as of -// the last "sync". It then checks for data loss errors by purposely dropping -// file data (or entire files) not protected by a "sync". - -#include "test_util/fault_injection_test_env.h" -#include -#include - -namespace ROCKSDB_NAMESPACE { - -// Assume a filename, and not a directory name like "/foo/bar/" -std::string GetDirName(const std::string filename) { - size_t found = filename.find_last_of("/\\"); - if (found == std::string::npos) { - return ""; - } else { - return filename.substr(0, found); - } -} - -// A basic file truncation function suitable for this test. 
-Status Truncate(Env* env, const std::string& filename, uint64_t length) { - std::unique_ptr orig_file; - const EnvOptions options; - Status s = env->NewSequentialFile(filename, &orig_file, options); - if (!s.ok()) { - fprintf(stderr, "Cannot open file %s for truncation: %s\n", - filename.c_str(), s.ToString().c_str()); - return s; - } - - std::unique_ptr scratch(new char[length]); - ROCKSDB_NAMESPACE::Slice result; - s = orig_file->Read(length, &result, scratch.get()); -#ifdef OS_WIN - orig_file.reset(); -#endif - if (s.ok()) { - std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; - std::unique_ptr tmp_file; - s = env->NewWritableFile(tmp_name, &tmp_file, options); - if (s.ok()) { - s = tmp_file->Append(result); - if (s.ok()) { - s = env->RenameFile(tmp_name, filename); - } else { - fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(), - filename.c_str(), s.ToString().c_str()); - env->DeleteFile(tmp_name); - } - } - } - if (!s.ok()) { - fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), - s.ToString().c_str()); - } - - return s; -} - -// Trim the tailing "/" in the end of `str` -std::string TrimDirname(const std::string& str) { - size_t found = str.find_last_not_of("/"); - if (found == std::string::npos) { - return str; - } - return str.substr(0, found + 1); -} - -// Return pair of a full path. -std::pair GetDirAndName(const std::string& name) { - std::string dirname = GetDirName(name); - std::string fname = name.substr(dirname.size() + 1); - return std::make_pair(dirname, fname); -} - -Status FileState::DropUnsyncedData(Env* env) const { - ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; - return Truncate(env, filename_, sync_pos); -} - -Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { - ssize_t sync_pos = pos_at_last_sync_ == -1 ? 
0 : pos_at_last_sync_; - assert(pos_ >= sync_pos); - int range = static_cast(pos_ - sync_pos); - uint64_t truncated_size = - static_cast(sync_pos) + rand->Uniform(range); - return Truncate(env, filename_, truncated_size); -} - -Status TestDirectory::Fsync() { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - env_->SyncDir(dirname_); - return dir_->Fsync(); -} - -TestWritableFile::TestWritableFile(const std::string& fname, - std::unique_ptr&& f, - FaultInjectionTestEnv* env) - : state_(fname), - target_(std::move(f)), - writable_file_opened_(true), - env_(env) { - assert(target_ != nullptr); - state_.pos_ = 0; -} - -TestWritableFile::~TestWritableFile() { - if (writable_file_opened_) { - Close(); - } -} - -Status TestWritableFile::Append(const Slice& data) { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - Status s = target_->Append(data); - if (s.ok()) { - state_.pos_ += data.size(); - env_->WritableFileAppended(state_); - } - return s; -} - -Status TestWritableFile::Close() { - writable_file_opened_ = false; - Status s = target_->Close(); - if (s.ok()) { - env_->WritableFileClosed(state_); - } - return s; -} - -Status TestWritableFile::Flush() { - Status s = target_->Flush(); - if (s.ok() && env_->IsFilesystemActive()) { - state_.pos_at_last_flush_ = state_.pos_; - } - return s; -} - -Status TestWritableFile::Sync() { - if (!env_->IsFilesystemActive()) { - return Status::IOError("FaultInjectionTestEnv: not active"); - } - // No need to actual sync. 
- state_.pos_at_last_sync_ = state_.pos_; - env_->WritableFileSynced(state_); - return Status::OK(); -} - -TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, - std::unique_ptr&& f, - FaultInjectionTestEnv* env) - : target_(std::move(f)), file_opened_(true), env_(env) { - assert(target_ != nullptr); -} - -TestRandomRWFile::~TestRandomRWFile() { - if (file_opened_) { - Close(); - } -} - -Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Write(offset, data); -} - -Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Read(offset, n, result, scratch); -} - -Status TestRandomRWFile::Close() { - file_opened_ = false; - return target_->Close(); -} - -Status TestRandomRWFile::Flush() { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Flush(); -} - -Status TestRandomRWFile::Sync() { - if (!env_->IsFilesystemActive()) { - return env_->GetError(); - } - return target_->Sync(); -} - -Status FaultInjectionTestEnv::NewDirectory(const std::string& name, - std::unique_ptr* result) { - std::unique_ptr r; - Status s = target()->NewDirectory(name, &r); - assert(s.ok()); - if (!s.ok()) { - return s; - } - result->reset(new TestDirectory(this, TrimDirname(name), r.release())); - return Status::OK(); -} - -Status FaultInjectionTestEnv::NewWritableFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - // Not allow overwriting files - Status s = target()->FileExists(fname); - if (s.ok()) { - return Status::Corruption("File already exists."); - } else if (!s.IsNotFound()) { - assert(s.IsIOError()); - return s; - } - s = target()->NewWritableFile(fname, result, soptions); - if (s.ok()) { - result->reset(new 
TestWritableFile(fname, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. - UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); - } - return s; -} - -Status FaultInjectionTestEnv::ReopenWritableFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status s = target()->ReopenWritableFile(fname, result, soptions); - if (s.ok()) { - result->reset(new TestWritableFile(fname, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. - UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); - } - return s; -} - -Status FaultInjectionTestEnv::NewRandomRWFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status s = target()->NewRandomRWFile(fname, result, soptions); - if (s.ok()) { - result->reset(new TestRandomRWFile(fname, std::move(*result), this)); - // WritableFileWriter* file is opened - // again then it will be truncated - so forget our saved state. 
- UntrackFile(fname); - MutexLock l(&mutex_); - open_files_.insert(fname); - auto dir_and_name = GetDirAndName(fname); - auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; - list.insert(dir_and_name.second); - } - return s; -} - -Status FaultInjectionTestEnv::NewRandomAccessFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& soptions) { - if (!IsFilesystemActive()) { - return GetError(); - } - return target()->NewRandomAccessFile(fname, result, soptions); -} - -Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status s = EnvWrapper::DeleteFile(f); - if (!s.ok()) { - fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), - s.ToString().c_str()); - } - if (s.ok()) { - UntrackFile(f); - } - return s; -} - -Status FaultInjectionTestEnv::RenameFile(const std::string& s, - const std::string& t) { - if (!IsFilesystemActive()) { - return GetError(); - } - Status ret = EnvWrapper::RenameFile(s, t); - - if (ret.ok()) { - MutexLock l(&mutex_); - if (db_file_state_.find(s) != db_file_state_.end()) { - db_file_state_[t] = db_file_state_[s]; - db_file_state_.erase(s); - } - - auto sdn = GetDirAndName(s); - auto tdn = GetDirAndName(t); - if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { - auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; - assert(tlist.find(tdn.second) == tlist.end()); - tlist.insert(tdn.second); - } - } - - return ret; -} - -void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { - MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { - db_file_state_[state.filename_] = state; - open_files_.erase(state.filename_); - } -} - -void FaultInjectionTestEnv::WritableFileSynced(const FileState& state) { - MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { - if (db_file_state_.find(state.filename_) == db_file_state_.end()) { - 
db_file_state_.insert(std::make_pair(state.filename_, state)); - } else { - db_file_state_[state.filename_] = state; - } - } -} - -void FaultInjectionTestEnv::WritableFileAppended(const FileState& state) { - MutexLock l(&mutex_); - if (open_files_.find(state.filename_) != open_files_.end()) { - if (db_file_state_.find(state.filename_) == db_file_state_.end()) { - db_file_state_.insert(std::make_pair(state.filename_, state)); - } else { - db_file_state_[state.filename_] = state; - } - } -} - -// For every file that is not fully synced, make a call to `func` with -// FileState of the file as the parameter. -Status FaultInjectionTestEnv::DropFileData( - std::function func) { - Status s; - MutexLock l(&mutex_); - for (std::map::const_iterator it = - db_file_state_.begin(); - s.ok() && it != db_file_state_.end(); ++it) { - const FileState& state = it->second; - if (!state.IsFullySynced()) { - s = func(target(), state); - } - } - return s; -} - -Status FaultInjectionTestEnv::DropUnsyncedFileData() { - return DropFileData([&](Env* env, const FileState& state) { - return state.DropUnsyncedData(env); - }); -} - -Status FaultInjectionTestEnv::DropRandomUnsyncedFileData(Random* rnd) { - return DropFileData([&](Env* env, const FileState& state) { - return state.DropRandomUnsyncedData(env, rnd); - }); -} - -Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() { - // Because DeleteFile access this container make a copy to avoid deadlock - std::map> map_copy; - { - MutexLock l(&mutex_); - map_copy.insert(dir_to_new_files_since_last_sync_.begin(), - dir_to_new_files_since_last_sync_.end()); - } - - for (auto& pair : map_copy) { - for (std::string name : pair.second) { - Status s = DeleteFile(pair.first + "/" + name); - if (!s.ok()) { - return s; - } - } - } - return Status::OK(); -} -void FaultInjectionTestEnv::ResetState() { - MutexLock l(&mutex_); - db_file_state_.clear(); - dir_to_new_files_since_last_sync_.clear(); - SetFilesystemActiveNoLock(true); -} - -void 
FaultInjectionTestEnv::UntrackFile(const std::string& f) { - MutexLock l(&mutex_); - auto dir_and_name = GetDirAndName(f); - dir_to_new_files_since_last_sync_[dir_and_name.first].erase( - dir_and_name.second); - db_file_state_.erase(f); - open_files_.erase(f); -} -} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/fault_injection_test_env.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,225 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright 2014 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// This test uses a custom Env to keep track of the state of a filesystem as of -// the last "sync". It then checks for data loss errors by purposely dropping -// file data (or entire files) not protected by a "sync". 
- -#pragma once - -#include -#include -#include - -#include "db/version_set.h" -#include "env/mock_env.h" -#include "file/filename.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "util/mutexlock.h" -#include "util/random.h" - -namespace ROCKSDB_NAMESPACE { - -class TestWritableFile; -class FaultInjectionTestEnv; - -struct FileState { - std::string filename_; - ssize_t pos_; - ssize_t pos_at_last_sync_; - ssize_t pos_at_last_flush_; - - explicit FileState(const std::string& filename) - : filename_(filename), - pos_(-1), - pos_at_last_sync_(-1), - pos_at_last_flush_(-1) {} - - FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} - - bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } - - Status DropUnsyncedData(Env* env) const; - - Status DropRandomUnsyncedData(Env* env, Random* rand) const; -}; - -// A wrapper around WritableFileWriter* file -// is written to or sync'ed. -class TestWritableFile : public WritableFile { - public: - explicit TestWritableFile(const std::string& fname, - std::unique_ptr&& f, - FaultInjectionTestEnv* env); - virtual ~TestWritableFile(); - virtual Status Append(const Slice& data) override; - virtual Status Truncate(uint64_t size) override { - return target_->Truncate(size); - } - virtual Status Close() override; - virtual Status Flush() override; - virtual Status Sync() override; - virtual bool IsSyncThreadSafe() const override { return true; } - virtual Status PositionedAppend(const Slice& data, - uint64_t offset) override { - return target_->PositionedAppend(data, offset); - } - virtual bool use_direct_io() const override { - return target_->use_direct_io(); - }; - - private: - FileState state_; - std::unique_ptr target_; - bool writable_file_opened_; - FaultInjectionTestEnv* env_; -}; - -// A wrapper around WritableFileWriter* file -// is written to or sync'ed. 
-class TestRandomRWFile : public RandomRWFile { - public: - explicit TestRandomRWFile(const std::string& fname, - std::unique_ptr&& f, - FaultInjectionTestEnv* env); - virtual ~TestRandomRWFile(); - Status Write(uint64_t offset, const Slice& data) override; - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; - Status Close() override; - Status Flush() override; - Status Sync() override; - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - bool use_direct_io() const override { return target_->use_direct_io(); }; - - private: - std::unique_ptr target_; - bool file_opened_; - FaultInjectionTestEnv* env_; -}; - -class TestDirectory : public Directory { - public: - explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, - Directory* dir) - : env_(env), dirname_(dirname), dir_(dir) {} - ~TestDirectory() {} - - virtual Status Fsync() override; - - private: - FaultInjectionTestEnv* env_; - std::string dirname_; - std::unique_ptr dir_; -}; - -class FaultInjectionTestEnv : public EnvWrapper { - public: - explicit FaultInjectionTestEnv(Env* base) - : EnvWrapper(base), filesystem_active_(true) {} - virtual ~FaultInjectionTestEnv() {} - - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - virtual Status DeleteFile(const std::string& f) override; - - virtual Status RenameFile(const std::string& s, - const std::string& t) override; - -// Undef to eliminate 
clash on Windows -#undef GetFreeSpace - virtual Status GetFreeSpace(const std::string& path, - uint64_t* disk_free) override { - if (!IsFilesystemActive() && error_ == Status::NoSpace()) { - *disk_free = 0; - return Status::OK(); - } else { - return target()->GetFreeSpace(path, disk_free); - } - } - - void WritableFileClosed(const FileState& state); - - void WritableFileSynced(const FileState& state); - - void WritableFileAppended(const FileState& state); - - // For every file that is not fully synced, make a call to `func` with - // FileState of the file as the parameter. - Status DropFileData(std::function func); - - Status DropUnsyncedFileData(); - - Status DropRandomUnsyncedFileData(Random* rnd); - - Status DeleteFilesCreatedAfterLastDirSync(); - - void ResetState(); - - void UntrackFile(const std::string& f); - - void SyncDir(const std::string& dirname) { - MutexLock l(&mutex_); - dir_to_new_files_since_last_sync_.erase(dirname); - } - - // Setting the filesystem to inactive is the test equivalent to simulating a - // system reset. Setting to inactive will freeze our saved filesystem state so - // that it will stop being recorded. It can then be reset back to the state at - // the time of the reset. 
- bool IsFilesystemActive() { - MutexLock l(&mutex_); - return filesystem_active_; - } - void SetFilesystemActiveNoLock(bool active, - Status error = Status::Corruption("Not active")) { - filesystem_active_ = active; - if (!active) { - error_ = error; - } - } - void SetFilesystemActive(bool active, - Status error = Status::Corruption("Not active")) { - MutexLock l(&mutex_); - SetFilesystemActiveNoLock(active, error); - } - void AssertNoOpenFile() { assert(open_files_.empty()); } - Status GetError() { return error_; } - - private: - port::Mutex mutex_; - std::map db_file_state_; - std::set open_files_; - std::unordered_map> - dir_to_new_files_since_last_sync_; - bool filesystem_active_; // Record flushes, syncs, writes - Status error_; -}; - -} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/mock_time_env.h" + +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO: this is a workaround for the different behavior on different platform +// for timedwait timeout. Ideally timedwait API should be moved to env. +// details: PR #7101. +void MockSystemClock::InstallTimedWaitFixCallback() { +#ifndef NDEBUG + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +#ifdef OS_MACOSX + // This is an alternate way (vs. 
SpecialEnv) of dealing with the fact + // that on some platforms, pthread_cond_timedwait does not appear to + // release the lock for other threads to operate if the deadline time + // is already passed. (TimedWait calls are currently a bad abstraction + // because the deadline parameter is usually computed from Env time, + // but is interpreted in real clock time.) + SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < this->RealNowMicros()) { + *reinterpret_cast(arg) = this->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX + SyncPoint::GetInstance()->EnableProcessing(); +#endif // !NDEBUG +} + +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/mock_time_env.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/mock_time_env.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,41 +5,73 @@ #pragma once -#include "rocksdb/env.h" +#include +#include + +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { -class MockTimeEnv : public EnvWrapper { +// NOTE: SpecialEnv offers most of this functionality, along with hooks +// for safe DB behavior under a mock time environment, so should be used +// instead of MockSystemClock for DB tests. 
+class MockSystemClock : public SystemClockWrapper { public: - explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + explicit MockSystemClock(const std::shared_ptr& base) + : SystemClockWrapper(base) {} - virtual Status GetCurrentTime(int64_t* time) override { - assert(time != nullptr); - assert(current_time_ <= - static_cast(std::numeric_limits::max())); - *time = static_cast(current_time_); + static const char* kClassName() { return "MockSystemClock"; } + const char* Name() const override { return kClassName(); } + virtual Status GetCurrentTime(int64_t* time_sec) override { + assert(time_sec != nullptr); + *time_sec = static_cast(current_time_us_ / kMicrosInSecond); return Status::OK(); } - virtual uint64_t NowMicros() override { - assert(current_time_ <= std::numeric_limits::max() / 1000000); - return current_time_ * 1000000; - } + virtual uint64_t NowSeconds() { return current_time_us_ / kMicrosInSecond; } + + virtual uint64_t NowMicros() override { return current_time_us_; } virtual uint64_t NowNanos() override { - assert(current_time_ <= std::numeric_limits::max() / 1000000000); - return current_time_ * 1000000000; + assert(current_time_us_ <= std::numeric_limits::max() / 1000); + return current_time_us_ * 1000; + } + + uint64_t RealNowMicros() { return target_->NowMicros(); } + + void SetCurrentTime(uint64_t time_sec) { + assert(time_sec < std::numeric_limits::max() / kMicrosInSecond); + assert(time_sec * kMicrosInSecond >= current_time_us_); + current_time_us_ = time_sec * kMicrosInSecond; } - uint64_t RealNowMicros() { return target()->NowMicros(); } + // It's a fake sleep that just updates the Env current time, which is similar + // to `NoSleepEnv.SleepForMicroseconds()` and + // `SpecialEnv.MockSleepForMicroseconds()`. + // It's also similar to `set_current_time()`, which takes an absolute time in + // seconds, vs. this one takes the sleep in microseconds. + // Note: Not thread safe. 
+ void SleepForMicroseconds(int micros) override { + assert(micros >= 0); + assert(current_time_us_ + static_cast(micros) >= + current_time_us_); + current_time_us_.fetch_add(micros); + } - void set_current_time(uint64_t time) { - assert(time >= current_time_); - current_time_ = time; + void MockSleepForSeconds(int seconds) { + assert(seconds >= 0); + int micros = seconds * kMicrosInSecond; + SleepForMicroseconds(micros); } + // TODO: this is a workaround for the different behavior on different platform + // for timedwait timeout. Ideally timedwait API should be moved to env. + // details: PR #7101. + void InstallTimedWaitFixCallback(); + private: - std::atomic current_time_{0}; + std::atomic current_time_us_{0}; + static constexpr uint64_t kMicrosInSecond = 1000U * 1000U; }; } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.cc 2025-05-19 16:14:27.000000000 +0000 @@ -4,10 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "test_util/sync_point.h" + +#include + #include "test_util/sync_point_impl.h" -int rocksdb_kill_odds = 0; -std::vector rocksdb_kill_prefix_blacklist; +std::vector rocksdb_kill_exclude_prefixes; #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { @@ -58,9 +60,33 @@ impl_->ClearTrace(); } -void SyncPoint::Process(const std::string& point, void* cb_arg) { +void SyncPoint::Process(const Slice& point, void* cb_arg) { impl_->Process(point, cb_arg); } } // namespace ROCKSDB_NAMESPACE #endif // NDEBUG + +namespace ROCKSDB_NAMESPACE { +void SetupSyncPointsToMockDirectIO() { +#if !defined(NDEBUG) && !defined(OS_MACOSX) && !defined(OS_WIN) && \ + !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewSequentialFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); +#endif +} +} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point.h 2025-05-19 16:14:27.000000000 +0000 @@ -5,6 +5,7 @@ #pragma once #include + #include #include #include @@ -12,35 +13,44 @@ #include #include "rocksdb/rocksdb_namespace.h" - -// This is only set from db_stress.cc and for testing only. 
-// If non-zero, kill at various points in source code with probability 1/this -extern int rocksdb_kill_odds; -// If kill point has a prefix on this list, will skip killing. -extern std::vector rocksdb_kill_prefix_blacklist; +#include "rocksdb/slice.h" #ifdef NDEBUG // empty in release build -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) +#define TEST_KILL_RANDOM(kill_point) #else namespace ROCKSDB_NAMESPACE { -// Kill the process with probability 1/odds for testing. -extern void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline); // To avoid crashing always at some frequently executed codepaths (during // kill random test), use this factor to reduce odds #define REDUCE_ODDS 2 #define REDUCE_ODDS2 4 -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) \ - { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(kill_point, rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ +// A class used to pass when a kill point is reached. +struct KillPoint { + public: + // This is only set from db_stress.cc and for testing only. + // If non-zero, kill at various points in source code with probability 1/this + int rocksdb_kill_odds = 0; + // If kill point has a prefix on this list, will skip killing. + std::vector rocksdb_kill_exclude_prefixes; + // Kill the process with probability 1/odds for testing. 
+ void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline); + + static KillPoint* GetInstance(); +}; + +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) \ + { \ + KillPoint::GetInstance()->TestKillRandom( \ + kill_point, rocksdb_kill_odds_weight, __FILE__, __LINE__); \ } +#define TEST_KILL_RANDOM(kill_point) TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, 1) } // namespace ROCKSDB_NAMESPACE + #endif #ifdef NDEBUG @@ -109,7 +119,16 @@ // triggered by TEST_SYNC_POINT, blocking execution until all predecessors // are executed. // And/or call registered callback function, with argument `cb_arg` - void Process(const std::string& point, void* cb_arg = nullptr); + void Process(const Slice& point, void* cb_arg = nullptr); + + // template gets length of const string at compile time, + // avoiding strlen() at runtime + template + void Process(const char (&point)[kLen], void* cb_arg = nullptr) { + static_assert(kLen > 0, "Must not be empty"); + assert(point[kLen - 1] == '\0'); + Process(Slice(point, kLen - 1), cb_arg); + } // TODO: it might be useful to provide a function that blocks until all // sync points are cleared. @@ -124,10 +143,13 @@ Data* impl_; }; +// Sets up sync points to mock direct IO instead of actually issuing direct IO +// to the file system. +void SetupSyncPointsToMockDirectIO(); } // namespace ROCKSDB_NAMESPACE // Use TEST_SYNC_POINT to specify sync points inside code base. -// Sync points can have happens-after depedency on other sync points, +// Sync points can have happens-after dependency on other sync points, // configured at runtime via SyncPoint::LoadDependency. This could be // utilized to re-produce race conditions between threads. // See TransactionLogIteratorRace in db_test.cc for an example use case. 
@@ -142,3 +164,17 @@ #define INIT_SYNC_POINT_SINGLETONS() \ (void)ROCKSDB_NAMESPACE::SyncPoint::GetInstance(); #endif // NDEBUG + +// Callback sync point for any read IO errors that should be ignored by +// the fault injection framework +// Disable in release mode +#ifdef NDEBUG +#define IGNORE_STATUS_IF_ERROR(_status_) +#else +#define IGNORE_STATUS_IF_ERROR(_status_) \ + { \ + if (!_status_.ok()) { \ + TEST_SYNC_POINT("FaultInjectionIgnoreError"); \ + } \ + } +#endif // NDEBUG diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.cc 2025-05-19 16:14:27.000000000 +0000 @@ -7,10 +7,18 @@ #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +KillPoint* KillPoint::GetInstance() { + static KillPoint kp; + return &kp; +} -void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline) { - for (auto& p : rocksdb_kill_prefix_blacklist) { +void KillPoint::TestKillRandom(std::string kill_point, int odds_weight, + const std::string& srcfile, int srcline) { + if (rocksdb_kill_odds <= 0) { + return; + } + int odds = rocksdb_kill_odds * odds_weight; + for (auto& p : rocksdb_kill_exclude_prefixes) { if (kill_point.substr(0, p.length()) == p) { return; } @@ -29,7 +37,6 @@ } } - void SyncPoint::Data::LoadDependency(const std::vector& dependencies) { std::lock_guard lock(mutex_); successors_.clear(); @@ -38,6 +45,8 @@ for (const auto& dependency : dependencies) { successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } cv_.notify_all(); } @@ -54,11 +63,15 @@ for (const auto& dependency : dependencies) { 
successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } for (const auto& marker : markers) { successors_[marker.predecessor].push_back(marker.successor); predecessors_[marker.successor].push_back(marker.predecessor); markers_[marker.predecessor].push_back(marker.successor); + point_filter_.Add(marker.predecessor); + point_filter_.Add(marker.successor); } cv_.notify_all(); } @@ -88,33 +101,42 @@ callbacks_.clear(); } -void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { +void SyncPoint::Data::Process(const Slice& point, void* cb_arg) { if (!enabled_) { return; } + // Use a filter to prevent mutex lock if possible. + if (!point_filter_.MayContain(point)) { + return; + } + + // Must convert to std::string for remaining work. Take + // heap hit. + std::string point_string(point.ToString()); std::unique_lock lock(mutex_); auto thread_id = std::this_thread::get_id(); - auto marker_iter = markers_.find(point); + auto marker_iter = markers_.find(point_string); if (marker_iter != markers_.end()) { for (auto& marked_point : marker_iter->second) { marked_thread_id_.emplace(marked_point, thread_id); + point_filter_.Add(marked_point); } } - if (DisabledByMarker(point, thread_id)) { + if (DisabledByMarker(point_string, thread_id)) { return; } - while (!PredecessorsAllCleared(point)) { + while (!PredecessorsAllCleared(point_string)) { cv_.wait(lock); - if (DisabledByMarker(point, thread_id)) { + if (DisabledByMarker(point_string, thread_id)) { return; } } - auto callback_pair = callbacks_.find(point); + auto callback_pair = callbacks_.find(point_string); if (callback_pair != callbacks_.end()) { num_callbacks_running_++; mutex_.unlock(); @@ -122,7 +144,7 @@ mutex_.lock(); num_callbacks_running_--; } - cleared_points_.insert(point); + cleared_points_.insert(point_string); cv_.notify_all(); } 
} // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/sync_point_impl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/sync_point_impl.h 2025-05-19 16:14:27.000000000 +0000 @@ -3,9 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "test_util/sync_point.h" - #include + #include #include #include @@ -15,15 +14,39 @@ #include #include +#include "memory/concurrent_arena.h" #include "port/port.h" +#include "test_util/sync_point.h" +#include "util/dynamic_bloom.h" #include "util/random.h" #pragma once #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +// A hacky allocator for single use. +// Arena depends on SyncPoint and create circular dependency. +class SingleAllocator : public Allocator { + public: + char* Allocate(size_t) override { + assert(false); + return nullptr; + } + char* AllocateAligned(size_t bytes, size_t, Logger*) override { + buf_.resize(bytes); + return const_cast(buf_.data()); + } + size_t BlockSize() const override { + assert(false); + return 0; + } + + private: + std::string buf_; +}; + struct SyncPoint::Data { - Data() : enabled_(false) {} + Data() : point_filter_(&alloc_, /*total_bits=*/8192), enabled_(false) {} // Enable proper deletion by subclasses virtual ~Data() {} // successor/predecessor map loaded from LoadDependency @@ -37,6 +60,9 @@ std::condition_variable cv_; // sync points that have been passed through std::unordered_set cleared_points_; + SingleAllocator alloc_; + // A filter before holding mutex to speed up process. 
+ DynamicBloom point_filter_; std::atomic enabled_; int num_callbacks_running_ = 0; @@ -48,6 +74,7 @@ const std::function& callback) { std::lock_guard lock(mutex_); callbacks_[point] = callback; + point_filter_.Add(point); } void ClearCallBack(const std::string& point); @@ -68,7 +95,7 @@ return marked_point_iter != marked_thread_id_.end() && thread_id != marked_point_iter->second; } - void Process(const std::string& point, void* cb_arg); + void Process(const Slice& point, void* cb_arg); }; } // namespace ROCKSDB_NAMESPACE #endif // NDEBUG diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.cc 2025-05-19 16:14:27.000000000 +0000 @@ -8,12 +8,22 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "test_util/testharness.h" + +#include #include #include namespace ROCKSDB_NAMESPACE { namespace test { +#ifdef OS_WIN +#include + +std::string GetPidStr() { return std::to_string(GetCurrentProcessId()); } +#else +std::string GetPidStr() { return std::to_string(getpid()); } +#endif + ::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) { if (s.ok()) { return ::testing::AssertionSuccess(); @@ -26,13 +36,13 @@ std::string TmpDir(Env* env) { std::string dir; Status s = env->GetTestDirectory(&dir); - EXPECT_TRUE(s.ok()) << s.ToString(); + EXPECT_OK(s); return dir; } std::string PerThreadDBPath(std::string dir, std::string name) { size_t tid = std::hash()(std::this_thread::get_id()); - return dir + "/" + name + "_" + std::to_string(tid); + return dir + "/" + name + "_" + GetPidStr() + "_" + std::to_string(tid); } std::string PerThreadDBPath(std::string name) { @@ -52,5 +62,49 @@ return result; } +TestRegex::TestRegex(const std::string& pattern) + : 
impl_(std::make_shared(pattern)), pattern_(pattern) {} +TestRegex::TestRegex(const char* pattern) + : impl_(std::make_shared(pattern)), pattern_(pattern) {} + +const std::string& TestRegex::GetPattern() const { return pattern_; } + +// Sorry about code duplication with regex.cc, but it doesn't support LITE +// due to exception handling +class TestRegex::Impl : public std::regex { + public: + using std::regex::basic_regex; +}; + +bool TestRegex::Matches(const std::string& str) const { + if (impl_) { + return std::regex_match(str, *impl_); + } else { + // Should not call Matches on unset Regex + assert(false); + return false; + } +} + +::testing::AssertionResult AssertMatchesRegex(const char* str_expr, + const char* pattern_expr, + const std::string& str, + const TestRegex& pattern) { + if (pattern.Matches(str)) { + return ::testing::AssertionSuccess(); + } else if (TestRegex("\".*\"").Matches(pattern_expr)) { + // constant regex string + return ::testing::AssertionFailure() + << str << " (" << str_expr << ")" << std::endl + << "does not match regex " << pattern.GetPattern(); + } else { + // runtime regex string + return ::testing::AssertionFailure() + << str << " (" << str_expr << ")" << std::endl + << "does not match regex" << std::endl + << pattern.GetPattern() << " (" << pattern_expr << ")"; + } +} + } // namespace test } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testharness.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testharness.h 2025-05-19 16:14:27.000000000 +0000 @@ -14,6 +14,43 @@ #else #include #endif +#include "rocksdb/utilities/regex.h" + +// A "skipped" test has a specific meaning in Facebook infrastructure: the +// test is in good shape and should be run, but something about the +// compilation or execution environment means 
the test cannot be run. +// Specifically, there is a hole in intended testing if any +// parameterization of a test (e.g. Foo/FooTest.Bar/42) is skipped for all +// tested build configurations/platforms/etc. +// +// If GTEST_SKIP is available, use it. Otherwise, define skip as success. +// +// The GTEST macros do not seem to print the message, even with -verbose, +// so these print to stderr. Note that these do not exit the test themselves; +// calling code should 'return' or similar from the test. +#ifdef GTEST_SKIP_ +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SKIP_(m); \ + } while (false) /* user ; */ +#else +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SUCCESS_("SKIPPED: " m); \ + } while (false) /* user ; */ +#endif + +// We add "bypass" as an alternative to ROCKSDB_GTEST_SKIP that is allowed to +// be a permanent condition, e.g. for intentionally omitting or disabling some +// parameterizations for some tests. (Use _DISABLED at the end of the test +// name to disable an entire test.) 
+#define ROCKSDB_GTEST_BYPASS(m) \ + do { \ + fputs("BYPASSED: " m "\n", stderr); \ + GTEST_SUCCESS_("BYPASSED: " m); \ + } while (false) /* user ; */ #include #include "rocksdb/env.h" @@ -43,5 +80,39 @@ EXPECT_PRED_FORMAT1(ROCKSDB_NAMESPACE::test::AssertStatus, s) #define EXPECT_NOK(s) EXPECT_FALSE((s).ok()) +// Useful for testing +// * No need to deal with Status like in Regex public API +// * No triggering lint reports on use of std::regex in tests +// * Available in LITE (unlike public API) +class TestRegex { + public: + // These throw on bad pattern + /*implicit*/ TestRegex(const std::string& pattern); + /*implicit*/ TestRegex(const char* pattern); + + // Checks that the whole of str is matched by this regex + bool Matches(const std::string& str) const; + + const std::string& GetPattern() const; + + private: + class Impl; + std::shared_ptr impl_; // shared_ptr for simple implementation + std::string pattern_; +}; + +::testing::AssertionResult AssertMatchesRegex(const char* str_expr, + const char* pattern_expr, + const std::string& str, + const TestRegex& pattern); + +#define ASSERT_MATCHES_REGEX(str, pattern) \ + ASSERT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern) +#define EXPECT_MATCHES_REGEX(str, pattern) \ + EXPECT_PRED_FORMAT2(ROCKSDB_NAMESPACE::test::AssertMatchesRegex, str, pattern) + } // namespace test + +using test::TestRegex; + } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.cc 2025-05-19 16:14:27.000000000 +0000 @@ -9,6 +9,9 @@ #include "test_util/testutil.h" +#include +#include + #include #include #include @@ -20,29 +23,27 @@ #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "port/port.h" +#include 
"rocksdb/convenience.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/mock_time_env.h" +#include "test_util/sync_point.h" +#include "util/random.h" + +#ifndef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif namespace ROCKSDB_NAMESPACE { namespace test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; -const uint32_t kLatestFormatVersion = 5u; - -Slice RandomString(Random* rnd, int len, std::string* dst) { - dst->resize(len); - for (int i = 0; i < len; i++) { - (*dst)[i] = static_cast(' ' + rnd->Uniform(95)); // ' ' .. '~' - } - return Slice(*dst); -} - -extern std::string RandomHumanReadableString(Random* rnd, int len) { - std::string ret; - ret.resize(len); - for (int i = 0; i < len; ++i) { - ret[i] = static_cast('a' + rnd->Uniform(26)); - } - return ret; -} +const std::set kFooterFormatVersionsToTest{ + 5U, + // In case any interesting future changes + kDefaultFormatVersion, + kLatestFormatVersion, +}; std::string RandomKey(Random* rnd, int len, RandomKeyType type) { // Make sure to generate a wide variety of characters so we @@ -75,8 +76,7 @@ int len, std::string* dst) { int raw = static_cast(len * compressed_fraction); if (raw < 1) raw = 1; - std::string raw_data; - RandomString(rnd, raw, &raw_data); + std::string raw_data = rnd->RandomString(raw); // Duplicate the random data until we have filled "len" bytes dst->clear(); @@ -118,6 +118,59 @@ void FindShortSuccessor(std::string* /*key*/) const override { return; } }; + +// A test implementation of comparator with 64-bit integer timestamp. 
+class ComparatorWithU64TsImpl : public Comparator { + public: + ComparatorWithU64TsImpl() + : Comparator(/*ts_sz=*/sizeof(uint64_t)), + cmp_without_ts_(BytewiseComparator()) { + assert(cmp_without_ts_); + assert(cmp_without_ts_->timestamp_size() == 0); + } + const char* Name() const override { return "ComparatorWithU64Ts"; } + void FindShortSuccessor(std::string*) const override {} + void FindShortestSeparator(std::string*, const Slice&) const override {} + int Compare(const Slice& a, const Slice& b) const override { + int ret = CompareWithoutTimestamp(a, b); + size_t ts_sz = timestamp_size(); + if (ret != 0) { + return ret; + } + // Compare timestamp. + // For the same user key with different timestamps, larger (newer) timestamp + // comes first. + return -CompareTimestamp(ExtractTimestampFromUserKey(a, ts_sz), + ExtractTimestampFromUserKey(b, ts_sz)); + } + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + const size_t ts_sz = timestamp_size(); + assert(!a_has_ts || a.size() >= ts_sz); + assert(!b_has_ts || b.size() >= ts_sz); + Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, ts_sz) : a; + Slice rhs = b_has_ts ? 
StripTimestampFromUserKey(b, ts_sz) : b; + return cmp_without_ts_->Compare(lhs, rhs); + } + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + assert(ts1.size() == sizeof(uint64_t)); + assert(ts2.size() == sizeof(uint64_t)); + uint64_t lhs = DecodeFixed64(ts1.data()); + uint64_t rhs = DecodeFixed64(ts2.data()); + if (lhs < rhs) { + return -1; + } else if (lhs > rhs) { + return 1; + } else { + return 0; + } + } + + private: + const Comparator* cmp_without_ts_{nullptr}; +}; + } // namespace const Comparator* Uint64Comparator() { @@ -125,23 +178,9 @@ return &uint64comp; } -WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname) { - std::unique_ptr file(wf); - return new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); -} - -RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { - std::unique_ptr file(raf); - return new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - "[test RandomAccessFileReader]"); -} - -SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname) { - std::unique_ptr file(se); - return new SequentialFileReader(NewLegacySequentialFileWrapper(file), fname); +const Comparator* ComparatorWithU64Ts() { + static ComparatorWithU64TsImpl comp_with_u64_ts; + return &comp_with_u64_ts; } void CorruptKeyType(InternalKey* ikey) { @@ -159,6 +198,38 @@ return k.Encode().ToString(); } +std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt) { + std::string user_key_with_ts(user_key); + std::string ts_str; + PutFixed64(&ts_str, ts); + user_key_with_ts.append(ts_str); + return KeyStr(user_key_with_ts, seq, t, corrupt); +} + +bool SleepingBackgroundTask::TimedWaitUntilSleeping(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!sleeping_ || !should_sleep_) { + if 
(bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + +bool SleepingBackgroundTask::TimedWaitUntilDone(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!done_with_sleep_) { + if (bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + std::string RandomName(Random* rnd, const size_t len) { std::stringstream ss; for (size_t i = 0; i < len; ++i) { @@ -263,6 +334,7 @@ db_opt->error_if_exists = rnd->Uniform(2); db_opt->is_fd_close_on_exec = rnd->Uniform(2); db_opt->paranoid_checks = rnd->Uniform(2); + db_opt->track_and_verify_wals_in_manifest = rnd->Uniform(2); db_opt->skip_log_error_on_recovery = rnd->Uniform(2); db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2); @@ -323,12 +395,17 @@ cf_opt->force_consistency_checks = rnd->Uniform(2); cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2); cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); + cf_opt->enable_blob_files = rnd->Uniform(2); + cf_opt->enable_blob_garbage_collection = rnd->Uniform(2); // double options cf_opt->hard_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->soft_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->memtable_prefix_bloom_size_ratio = static_cast(rnd->Uniform(10000)) / 20000.0; + cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0; + cf_opt->blob_garbage_collection_force_threshold = + rnd->Uniform(10000) / 10000.0; // int options cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); @@ -372,6 +449,9 @@ cf_opt->target_file_size_base * rnd->Uniform(100); cf_opt->compaction_options_fifo.max_table_files_size = uint_max + rnd->Uniform(10000); + cf_opt->min_blob_size = uint_max + rnd->Uniform(10000); + cf_opt->blob_file_size = uint_max + rnd->Uniform(10000); + cf_opt->blob_compaction_readahead_size = uint_max + rnd->Uniform(10000); // unsigned 
int options cf_opt->rate_limit_delay_max_milliseconds = rnd->Uniform(10000); @@ -390,31 +470,7 @@ cf_opt->compression = RandomCompressionType(rnd); RandomCompressionTypeVector(cf_opt->num_levels, &cf_opt->compression_per_level, rnd); -} - -Status DestroyDir(Env* env, const std::string& dir) { - Status s; - if (env->FileExists(dir).IsNotFound()) { - return s; - } - std::vector files_in_dir; - s = env->GetChildren(dir, &files_in_dir); - if (s.ok()) { - for (auto& file_in_dir : files_in_dir) { - if (file_in_dir == "." || file_in_dir == "..") { - continue; - } - s = env->DeleteFile(dir + "/" + file_in_dir); - if (!s.ok()) { - break; - } - } - } - - if (s.ok()) { - s = env->DeleteDir(dir); - } - return s; + cf_opt->blob_compression_type = RandomCompressionType(rnd); } bool IsDirectIOSupported(Env* env, const std::string& dir) { @@ -433,6 +489,26 @@ return s.ok(); } +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir) { + bool supported = false; + std::string tmp = TempFileName(dir, 999); + Random rnd(301); + std::string test_string = rnd.RandomString(4096); + Slice data(test_string); + Status s = WriteStringToFile(fs.get(), data, tmp, true); + if (s.ok()) { + std::unique_ptr file; + auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr); + if (io_s.ok()) { + supported = !(file->Prefetch(0, data.size(), IOOptions(), nullptr) + .IsNotSupported()); + } + s = fs->DeleteFile(tmp, IOOptions(), nullptr); + } + return s.ok() && supported; +} + size_t GetLinesCount(const std::string& fname, const std::string& pattern) { std::stringstream ssbuf; std::string line; @@ -450,5 +526,269 @@ return count; } +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum /*=true*/) { + uint64_t size; + Status s = env->GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } else if (offset < 0) { + // Relative to end of file; make it absolute + if (-offset > static_cast(size)) { + offset = 0; + 
} else { + offset = static_cast(size + offset); + } + } + if (offset > static_cast(size)) { + offset = static_cast(size); + } + if (offset + bytes_to_corrupt > static_cast(size)) { + bytes_to_corrupt = static_cast(size - offset); + } + + // Do it + std::string contents; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(env, contents, fname); + } + if (s.ok() && verify_checksum) { +#ifndef ROCKSDB_LITE + Options options; + options.env = env; + EnvOptions env_options; + Status v = VerifySstFileChecksum(options, env_options, fname); + assert(!v.ok()); +#endif + } + return s; +} + +Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) { + uint64_t old_length; + Status s = env->GetFileSize(fname, &old_length); + if (!s.ok() || new_length == old_length) { + return s; + } + // Do it + std::string contents; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + contents.resize(static_cast(new_length), 'b'); + s = WriteStringToFile(env, contents, fname); + } + return s; +} + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname) { + bool is_dir = false; + Status s = env->IsDirectory(dirname, &is_dir); + if (s.ok() && is_dir) { + s = env->DeleteDir(dirname); + } + return s; +} + +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname) { + TryDeleteDir(env, dirname).PermitUncheckedError(); +} + +Status CreateEnvFromSystem(const ConfigOptions& config_options, Env** result, + std::shared_ptr* guard) { + const char* env_uri = getenv("TEST_ENV_URI"); + const char* fs_uri = getenv("TEST_FS_URI"); + if (env_uri || fs_uri) { + return Env::CreateFromUri(config_options, + (env_uri != nullptr) ? env_uri : "", + (fs_uri != nullptr) ? fs_uri : "", result, guard); + } else { + // Neither specified. 
Use the default + *result = config_options.env; + guard->reset(); + return Status::OK(); + } +} +namespace { +// A hacky skip list mem table that triggers flush after number of entries. +class SpecialMemTableRep : public MemTableRep { + public: + explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable, + int num_entries_flush) + : MemTableRep(allocator), + memtable_(memtable), + num_entries_flush_(num_entries_flush), + num_entries_(0) {} + + virtual KeyHandle Allocate(const size_t len, char** buf) override { + return memtable_->Allocate(len, buf); + } + + // Insert key into the list. + // REQUIRES: nothing that compares equal to key is currently in the list. + virtual void Insert(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + + void InsertConcurrently(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + + // Returns true iff an entry that compares equal to key is in the list. + virtual bool Contains(const char* key) const override { + return memtable_->Contains(key); + } + + virtual size_t ApproximateMemoryUsage() override { + // Return a high memory usage when number of entries exceeds the threshold + // to trigger a flush. + return (num_entries_ < num_entries_flush_) ? 
0 : 1024 * 1024 * 1024; + } + + virtual void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, + const char* entry)) override { + memtable_->Get(k, callback_args, callback_func); + } + + uint64_t ApproximateNumEntries(const Slice& start_ikey, + const Slice& end_ikey) override { + return memtable_->ApproximateNumEntries(start_ikey, end_ikey); + } + + virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + return memtable_->GetIterator(arena); + } + + virtual ~SpecialMemTableRep() override {} + + private: + std::unique_ptr memtable_; + int num_entries_flush_; + int num_entries_; +}; +class SpecialSkipListFactory : public MemTableRepFactory { + public: +#ifndef ROCKSDB_LITE + static bool Register(ObjectLibrary& library, const std::string& /*arg*/) { + library.AddFactory( + ObjectLibrary::PatternEntry(SpecialSkipListFactory::kClassName(), true) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + auto colon = uri.find(":"); + if (colon != std::string::npos) { + auto count = ParseInt(uri.substr(colon + 1)); + guard->reset(new SpecialSkipListFactory(count)); + } else { + guard->reset(new SpecialSkipListFactory(2)); + } + return guard->get(); + }); + return true; + } +#endif // ROCKSDB_LITE + // After number of inserts exceeds `num_entries_flush` in a mem table, trigger + // flush. 
+ explicit SpecialSkipListFactory(int num_entries_flush) + : num_entries_flush_(num_entries_flush) {} + + using MemTableRepFactory::CreateMemTableRep; + virtual MemTableRep* CreateMemTableRep( + const MemTableRep::KeyComparator& compare, Allocator* allocator, + const SliceTransform* transform, Logger* /*logger*/) override { + return new SpecialMemTableRep( + allocator, + factory_.CreateMemTableRep(compare, allocator, transform, nullptr), + num_entries_flush_); + } + static const char* kClassName() { return "SpecialSkipListFactory"; } + virtual const char* Name() const override { return kClassName(); } + std::string GetId() const override { + std::string id = Name(); + if (num_entries_flush_ > 0) { + id.append(":").append(ROCKSDB_NAMESPACE::ToString(num_entries_flush_)); + } + return id; + } + + bool IsInsertConcurrentlySupported() const override { + return factory_.IsInsertConcurrentlySupported(); + } + + private: + SkipListFactory factory_; + int num_entries_flush_; +}; +} // namespace + +MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush) { + RegisterTestLibrary(); + return new SpecialSkipListFactory(num_entries_per_flush); +} + +#ifndef ROCKSDB_LITE +// This method loads existing test classes into the ObjectRegistry +int RegisterTestObjects(ObjectLibrary& library, const std::string& arg) { + size_t num_types; + library.AddFactory( + test::SimpleSuffixReverseComparator::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { + static test::SimpleSuffixReverseComparator ssrc; + return &ssrc; + }); + SpecialSkipListFactory::Register(library, arg); + library.AddFactory( + "Changling", + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new test::ChanglingMergeOperator(uri)); + return guard->get(); + }); + library.AddFactory( + "Changling", + [](const std::string& uri, std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { + return new 
test::ChanglingCompactionFilter(uri); + }); + library.AddFactory( + "Changling", [](const std::string& uri, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new test::ChanglingCompactionFilterFactory(uri)); + return guard->get(); + }); + library.AddFactory( + MockSystemClock::kClassName(), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockSystemClock(SystemClock::Default())); + return guard->get(); + }); + return static_cast(library.GetFactoryCount(&num_types)); +} + +#endif // ROCKSDB_LITE + +void RegisterTestLibrary(const std::string& arg) { + static bool registered = false; + if (!registered) { + registered = true; +#ifndef ROCKSDB_LITE + ObjectRegistry::Default()->AddLibrary("test", RegisterTestObjects, arg); +#else + (void)arg; +#endif // ROCKSDB_LITE + } +} } // namespace test } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.h mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil.h 2025-05-19 16:14:27.000000000 +0000 @@ -22,26 +22,29 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" -#include "util/random.h" + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int argc, char** argv); +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS namespace ROCKSDB_NAMESPACE { +class FileSystem; +class MemTableRepFactory; +class ObjectLibrary; +class Random; class SequentialFile; class SequentialFileReader; namespace test { extern const 
uint32_t kDefaultFormatVersion; -extern const uint32_t kLatestFormatVersion; - -// Store in *dst a random string of length "len" and return a Slice that -// references the generated data. -extern Slice RandomString(Random* rnd, int len, std::string* dst); - -extern std::string RandomHumanReadableString(Random* rnd, int len); +extern const std::set kFooterFormatVersionsToTest; // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). @@ -55,28 +58,6 @@ extern Slice CompressibleString(Random* rnd, double compressed_fraction, int len, std::string* dst); -// A wrapper that allows injection of errors. -class ErrorEnv : public EnvWrapper { - public: - bool writable_file_error_; - int num_writable_file_errors_; - - ErrorEnv() : EnvWrapper(Env::Default()), - writable_file_error_(false), - num_writable_file_errors_(0) { } - - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override { - result->reset(); - if (writable_file_error_) { - ++num_writable_file_errors_; - return Status::IOError(fname, "fake error"); - } - return target()->NewWritableFile(fname, result, soptions); - } -}; - #ifndef NDEBUG // An internal comparator that just forward comparing results from the // user comparator in it. Can be used to test entities that have no dependency @@ -104,10 +85,8 @@ class SimpleSuffixReverseComparator : public Comparator { public: SimpleSuffixReverseComparator() {} - - virtual const char* Name() const override { - return "SimpleSuffixReverseComparator"; - } + static const char* kClassName() { return "SimpleSuffixReverseComparator"; } + virtual const char* Name() const override { return kClassName(); } virtual int Compare(const Slice& a, const Slice& b) const override { Slice prefix_a = Slice(a.data(), 8); @@ -134,74 +113,15 @@ // endian machines. 
extern const Comparator* Uint64Comparator(); -// Iterator over a vector of keys/values -class VectorIterator : public InternalIterator { - public: - explicit VectorIterator(const std::vector& keys) - : keys_(keys), current_(keys.size()) { - std::sort(keys_.begin(), keys_.end()); - values_.resize(keys.size()); - } - - VectorIterator(const std::vector& keys, - const std::vector& values) - : keys_(keys), values_(values), current_(keys.size()) { - assert(keys_.size() == values_.size()); - } - - virtual bool Valid() const override { return current_ < keys_.size(); } - - virtual void SeekToFirst() override { current_ = 0; } - virtual void SeekToLast() override { current_ = keys_.size() - 1; } - - virtual void Seek(const Slice& target) override { - current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - - keys_.begin(); - } - - virtual void SeekForPrev(const Slice& target) override { - current_ = std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) - - keys_.begin(); - if (!Valid()) { - SeekToLast(); - } else { - Prev(); - } - } - - virtual void Next() override { current_++; } - virtual void Prev() override { current_--; } - - virtual Slice key() const override { return Slice(keys_[current_]); } - virtual Slice value() const override { return Slice(values_[current_]); } - - virtual Status status() const override { return Status::OK(); } - - virtual bool IsKeyPinned() const override { return true; } - virtual bool IsValuePinned() const override { return true; } - - private: - std::vector keys_; - std::vector values_; - size_t current_; -}; -extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname); - -extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); - -extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname); - -class StringSink: public WritableFile { +class StringSink : public FSWritableFile { public: std::string contents_; - 
explicit StringSink(Slice* reader_contents = nullptr) : - WritableFile(), - contents_(""), - reader_contents_(reader_contents), - last_flush_(0) { + explicit StringSink(Slice* reader_contents = nullptr) + : FSWritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { if (reader_contents_ != nullptr) { *reader_contents_ = Slice(contents_.data(), 0); } @@ -209,12 +129,15 @@ const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (reader_contents_ != nullptr) { assert(reader_contents_->size() <= last_flush_); size_t offset = last_flush_ - reader_contents_->size(); @@ -224,12 +147,17 @@ last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { if (reader_contents_ != nullptr) { @@ -246,36 +174,44 @@ }; // A wrapper around a StringSink to give it a RandomRWFile interface -class RandomRWStringSink : public RandomRWFile { +class RandomRWStringSink : public FSRandomRWFile { public: explicit RandomRWStringSink(StringSink* ss) : 
ss_(ss) {} - Status Write(uint64_t offset, const Slice& data) override { + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { if (offset + data.size() > ss_->contents_.size()) { ss_->contents_.resize(static_cast(offset) + data.size(), '\0'); } char* pos = const_cast(ss_->contents_.data() + offset); memcpy(pos, data.data(), data.size()); - return Status::OK(); + return IOStatus::OK(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* /*scratch*/) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* /*scratch*/, + IODebugContext* /*dbg*/) const override { *result = Slice(nullptr, 0); if (offset < ss_->contents_.size()) { size_t str_res_sz = std::min(static_cast(ss_->contents_.size() - offset), n); *result = Slice(ss_->contents_.data() + offset, str_res_sz); } - return Status::OK(); + return IOStatus::OK(); } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return Status::OK(); } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Close() override { return Status::OK(); } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } const std::string& contents() const { return ss_->contents(); } @@ -286,34 +222,42 @@ // Like StringSink, this writes into a string. Unlink StringSink, it // has some initial content and overwrites it, just like a recycled // log file. 
-class OverwritingStringSink : public WritableFile { +class OverwritingStringSink : public FSWritableFile { public: explicit OverwritingStringSink(Slice* reader_contents) - : WritableFile(), + : FSWritableFile(), contents_(""), reader_contents_(reader_contents), last_flush_(0) {} const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (last_flush_ < contents_.size()) { assert(reader_contents_->size() >= contents_.size()); memcpy((char*)reader_contents_->data() + last_flush_, contents_.data() + last_flush_, contents_.size() - last_flush_); last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { contents_.resize(contents_.size() - bytes); @@ -326,7 +270,7 @@ size_t last_flush_; }; -class StringSource: public RandomAccessFile { +class StringSource : public FSRandomAccessFile { public: explicit StringSource(const Slice& contents, uint64_t uniq_id = 0, bool mmap = false) @@ -339,11 +283,23 @@ uint64_t Size() const { return contents_.size(); } - virtual 
Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + // If we are using mmap_, it is equivalent to performing a prefetch + if (mmap_) { + return IOStatus::OK(); + } else { + return IOStatus::NotSupported("Prefetch not supported"); + } + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { total_reads_++; if (offset > contents_.size()) { - return Status::InvalidArgument("invalid Read offset"); + return IOStatus::InvalidArgument("invalid Read offset"); } if (offset + n > contents_.size()) { n = contents_.size() - static_cast(offset); @@ -354,10 +310,10 @@ } else { *result = Slice(&contents_[static_cast(offset)], n); } - return Status::OK(); + return IOStatus::OK(); } - virtual size_t GetUniqueId(char* id, size_t max_size) const override { + size_t GetUniqueId(char* id, size_t max_size) const override { if (max_size < 20) { return 0; } @@ -379,13 +335,6 @@ mutable int total_reads_; }; -inline StringSink* GetStringSinkFromLegacyWriter( - const WritableFileWriter* writer) { - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - return static_cast(file->target()); -} - class NullLogger : public Logger { public: using Logger::Logv; @@ -400,6 +349,10 @@ const SequenceNumber& seq, const ValueType& t, bool corrupt = false); +extern std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + class SleepingBackgroundTask { public: SleepingBackgroundTask() @@ -433,16 +386,8 @@ // otherwise times out. // wait_time is in microseconds. // Returns true when times out, false otherwise. 
- bool TimedWaitUntilSleeping(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!sleeping_ || !should_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilSleeping(uint64_t wait_time); + void WakeUp() { MutexLock l(&mutex_); should_sleep_ = false; @@ -456,16 +401,8 @@ } // Similar to TimedWaitUntilSleeping. // Waits until the task is done. - bool TimedWaitUntilDone(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!done_with_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilDone(uint64_t wait_time); + bool WokenUp() { MutexLock l(&mutex_); return should_sleep_ == false; @@ -528,173 +465,223 @@ return result; } - class SeqStringSource : public SequentialFile { +class SeqStringSource : public FSSequentialFile { + public: + SeqStringSource(const std::string& data, std::atomic* read_count) + : data_(data), offset_(0), read_count_(read_count) {} + ~SeqStringSource() override {} + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + std::string output; + if (offset_ < data_.size()) { + n = std::min(data_.size() - offset_, n); + memcpy(scratch, data_.data() + offset_, n); + offset_ += n; + *result = Slice(scratch, n); + } else { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + (*read_count_)++; + return IOStatus::OK(); + } + + IOStatus Skip(uint64_t n) override { + if (offset_ >= data_.size()) { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + // TODO(yhchiang): Currently doesn't handle the overflow case. 
+ offset_ += static_cast(n); + return IOStatus::OK(); + } + + private: + std::string data_; + size_t offset_; + std::atomic* read_count_; +}; + +class StringFS : public FileSystemWrapper { + public: + class StringSink : public FSWritableFile { public: - SeqStringSource(const std::string& data, std::atomic* read_count) - : data_(data), offset_(0), read_count_(read_count) {} - ~SeqStringSource() override {} - Status Read(size_t n, Slice* result, char* scratch) override { - std::string output; - if (offset_ < data_.size()) { - n = std::min(data_.size() - offset_, n); - memcpy(scratch, data_.data() + offset_, n); - offset_ += n; - *result = Slice(scratch, n); - } else { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - (*read_count_)++; - return Status::OK(); - } - Status Skip(uint64_t n) override { - if (offset_ >= data_.size()) { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - // TODO(yhchiang): Currently doesn't handle the overflow case. 
- offset_ += static_cast(n); - return Status::OK(); + explicit StringSink(std::string* contents) + : FSWritableFile(), contents_(contents) {} + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->resize(static_cast(size)); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->append(slice.data(), slice.size()); + return IOStatus::OK(); } private: - std::string data_; - size_t offset_; - std::atomic* read_count_; + std::string* contents_; }; - class StringEnv : public EnvWrapper { - public: - class StringSink : public WritableFile { - public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } - - private: - std::string* contents_; - }; + explicit StringFS(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + ~StringFS() override {} - explicit StringEnv(Env* t) : EnvWrapper(t) {} - ~StringEnv() override {} + static const char* kClassName() { return "StringFS"; } + const char* Name() const override { return kClassName(); } - const std::string& GetContent(const std::string& f) { return files_[f]; } + const 
std::string& GetContent(const std::string& f) { return files_[f]; } - const Status WriteToNewFile(const std::string& file_name, + const IOStatus WriteToNewFile(const std::string& file_name, const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (!s.ok()) { - return s; - } - r->Append(content); - r->Flush(); - r->Close(); - assert(files_[file_name] == content); - return Status::OK(); - } - - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); - } - r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); - } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory( - const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); - } - return Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) 
override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); - } - *s = iter->second.size(); - return Status::OK(); - } - - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } - - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LinkFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); + std::unique_ptr r; + FileOptions file_opts; + IOOptions io_opts; + + auto s = NewWritableFile(file_name, file_opts, &r, nullptr); + if (s.ok()) { + s = r->Append(content, io_opts, nullptr); + } + if (s.ok()) { + s = r->Flush(io_opts, nullptr); + } + if (s.ok()) { + s = r->Close(io_opts, nullptr); + } + assert(!s.ok() || files_[file_name] == content); + return s; + } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, + const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist", f); } + r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); + return IOStatus::OK(); + } - Status UnlockFile(FileLock* /*l*/) override { - return Status::NotSupported(); + IOStatus NewRandomAccessFile(const std::string& /*f*/, + const FileOptions& /*options*/, + std::unique_ptr* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + 
IOStatus NewWritableFile(const std::string& f, const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter != files_.end()) { + return IOStatus::IOError("The specified file already exists", f); } + r->reset(new StringSink(&files_[f])); + return IOStatus::OK(); + } + IOStatus NewDirectory(const std::string& /*name*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } - std::atomic num_seq_file_read_; + IOStatus FileExists(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (files_.find(f) == files_.end()) { + return IOStatus::NotFound(); + } + return IOStatus::OK(); + } - protected: - std::unordered_map files_; - }; + IOStatus GetChildren(const std::string& /*dir*/, const IOOptions& /*options*/, + std::vector* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + files_.erase(f); + return IOStatus::OK(); + } + + IOStatus CreateDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus CreateDirIfMissing(const std::string& /*d*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return IOStatus::OK(); + } + + IOStatus 
GetFileModificationTime(const std::string& /*fname*/, + const IOOptions& /*options*/, + uint64_t* /*file_mtime*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus RenameFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LinkFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LockFile(const std::string& /*f*/, const IOOptions& /*options*/, + FileLock** /*l*/, IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus UnlockFile(FileLock* /*l*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + std::atomic num_seq_file_read_; + + protected: + std::unordered_map files_; +}; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); @@ -723,6 +710,15 @@ Logger* /*logger*/) const override { return false; } + static const char* kClassName() { return "ChanglingMergeOperator"; } + virtual bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return MergeOperator::IsInstanceOf(id); + } + } + virtual const char* Name() const override { return name_.c_str(); } protected: @@ -747,6 +743,15 @@ return false; } + static const char* kClassName() { return "ChanglingCompactionFilter"; } + virtual bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return CompactionFilter::IsInstanceOf(id); + } + } + const char* Name() const override { return name_.c_str(); } private: @@ -772,11 +777,25 @@ // Returns a name that identifies this compaction filter factory. 
const char* Name() const override { return name_.c_str(); } + static const char* kClassName() { return "ChanglingCompactionFilterFactory"; } + virtual bool IsInstanceOf(const std::string& id) const override { + if (id == kClassName()) { + return true; + } else { + return CompactionFilterFactory::IsInstanceOf(id); + } + } protected: std::string name_; }; +// The factory for the hacky skip list mem table that triggers flush after +// number of entries exceeds a threshold. +extern MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush); + +extern const Comparator* ComparatorWithU64Ts(); + CompressionType RandomCompressionType(Random* rnd); void RandomCompressionTypeVector(const size_t count, @@ -791,12 +810,40 @@ std::string RandomName(Random* rnd, const size_t len); -Status DestroyDir(Env* env, const std::string& dir); - bool IsDirectIOSupported(Env* env, const std::string& dir); +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir); + // Return the number of lines where a given pattern was found in a file. size_t GetLinesCount(const std::string& fname, const std::string& pattern); +// TEST_TMPDIR may be set to /dev/shm in Makefile, +// but /dev/shm does not support direct IO. +// Tries to set TEST_TMPDIR to a directory supporting direct IO. +void ResetTmpDirForDirectIO(); + +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum = true); +Status TruncateFile(Env* env, const std::string& fname, uint64_t length); + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname); + +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname); + +// Creates an Env from the system environment by looking at the system +// environment variables. 
+Status CreateEnvFromSystem(const ConfigOptions& options, Env** result, + std::shared_ptr* guard); + +#ifndef ROCKSDB_LITE +// Registers the testutil classes with the ObjectLibrary +int RegisterTestObjects(ObjectLibrary& library, const std::string& /*arg*/); +#endif // ROCKSDB_LITE + +// Register the testutil classes with the default ObjectRegistry/Library +void RegisterTestLibrary(const std::string& arg = ""); } // namespace test } // namespace ROCKSDB_NAMESPACE diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil_test.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil_test.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/testutil_test.cc 1970-01-01 00:00:00.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/testutil_test.cc 2025-05-19 16:14:27.000000000 +0000 @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "test_util/testutil.h" + +#include "file/file_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +void CreateFile(Env* env, const std::string& path) { + std::unique_ptr f; + ASSERT_OK(env->NewWritableFile(path, &f, EnvOptions())); + f->Close(); +} + +TEST(TestUtil, DestroyDirRecursively) { + auto env = Env::Default(); + // test_util/file + // /dir + // /dir/file + std::string test_dir = test::PerThreadDBPath("test_util"); + ASSERT_OK(env->CreateDir(test_dir)); + CreateFile(env, test_dir + "/file"); + ASSERT_OK(env->CreateDir(test_dir + "/dir")); + CreateFile(env, test_dir + "/dir/file"); + + ASSERT_OK(DestroyDir(env, test_dir)); + auto s = env->FileExists(test_dir); + ASSERT_TRUE(s.IsNotFound()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc --- mariadb-10.11.11/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/test_util/transaction_test_util.cc 2025-05-19 16:14:27.000000000 +0000 @@ -139,7 +139,7 @@ std::vector set_vec(num_sets_); std::iota(set_vec.begin(), set_vec.end(), static_cast(0)); - std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); + RandomShuffle(set_vec.begin(), set_vec.end()); // For each set, pick a key at random and increment it for (uint16_t set_i : set_vec) { @@ -165,12 +165,19 @@ // Increment key std::string sum = ToString(int_value + incr); if (txn != nullptr) { - s = txn->Put(key, sum); + s = txn->SingleDelete(key); if (!get_for_update && (s.IsBusy() || s.IsTimedOut())) { // If the initial get was not for update, then the key is not 
locked // before put and put could fail due to concurrent writes. break; } else if (!s.ok()) { + // Since we did a GetForUpdate, SingleDelete should not fail. + fprintf(stderr, "SingleDelete returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + s = txn->Put(key, sum); + if (!s.ok()) { // Since we did a GetForUpdate, Put should not fail. fprintf(stderr, "Put returned an unexpected error: %s\n", s.ToString().c_str()); @@ -197,6 +204,10 @@ if (with_prepare) { // Also try commit without prepare s = txn->Prepare(); + if (!s.ok()) { + fprintf(stderr, "Prepare returned an unexpected error: %s\n", + s.ToString().c_str()); + } assert(s.ok()); ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), @@ -296,7 +307,7 @@ std::vector set_vec(num_sets); std::iota(set_vec.begin(), set_vec.end(), static_cast(0)); - std::shuffle(set_vec.begin(), set_vec.end(), std::random_device{}); + RandomShuffle(set_vec.begin(), set_vec.end()); // For each set of keys with the same prefix, sum all the values for (uint16_t set_i : set_vec) { @@ -349,6 +360,7 @@ static_cast(key.size()), key.data(), int_value); total += int_value; } + iter->status().PermitUncheckedError(); delete iter; } diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/Portability.h 2025-05-19 16:14:27.000000000 +0000 @@ -31,6 +31,12 @@ #define FOLLY_PPC64 0 #endif +#if defined(__s390x__) +#define FOLLY_S390X 1 +#else +#define FOLLY_S390X 0 +#endif + #if defined(__has_builtin) #define FOLLY_HAS_BUILTIN(...) 
__has_builtin(__VA_ARGS__) #else @@ -57,6 +63,7 @@ constexpr bool kIsArchAmd64 = FOLLY_X64 == 1; constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1; constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1; +constexpr bool kIsArchS390X = FOLLY_S390X == 1; } // namespace folly namespace folly { @@ -82,3 +89,11 @@ constexpr bool kIsSanitizeThread = false; #endif } // namespace folly + +namespace folly { +#if defined(__linux__) && !FOLLY_MOBILE +constexpr auto kIsLinux = true; +#else +constexpr auto kIsLinux = false; +#endif +} // namespace folly diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/chrono/Hardware.h 2025-05-19 16:14:27.000000000 +0000 @@ -10,7 +10,7 @@ #include #include -#if _MSC_VER +#if _MSC_VER && (defined(_M_IX86) || defined(_M_X64)) extern "C" std::uint64_t __rdtsc(); #pragma intrinsic(__rdtsc) #endif @@ -18,7 +18,7 @@ namespace folly { inline std::uint64_t hardware_timestamp() { -#if _MSC_VER +#if _MSC_VER && (defined(_M_IX86) || defined(_M_X64)) return __rdtsc(); #elif __GNUC__ && (__i386__ || FOLLY_X64) return __builtin_ia32_rdtsc(); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/detail/Futex.cpp 2025-05-19 16:14:27.000000000 +0000 @@ -20,8 +20,6 @@ #include #endif -using namespace std::chrono; - namespace folly { namespace detail { @@ -69,7 +67,7 @@ } template -struct timespec timeSpecFromTimePoint(time_point absTime) { +struct 
timespec timeSpecFromTimePoint(std::chrono::time_point absTime) { auto epoch = absTime.time_since_epoch(); if (epoch.count() < 0) { // kernel timespec_valid requires non-negative seconds and nanos in [0,1G) @@ -79,20 +77,21 @@ // timespec-safe seconds and nanoseconds; // chrono::{nano,}seconds are `long long int` // whereas timespec uses smaller types - using time_t_seconds = duration; - using long_nanos = duration; + using time_t_seconds = + std::chrono::duration; + using long_nanos = + std::chrono::duration; - auto secs = duration_cast(epoch); - auto nanos = duration_cast(epoch - secs); + auto secs = std::chrono::duration_cast(epoch); + auto nanos = std::chrono::duration_cast(epoch - secs); struct timespec result = {secs.count(), nanos.count()}; return result; } FutexResult nativeFutexWaitImpl( - const void* addr, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + const void* addr, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { assert(absSystemTime == nullptr || absSteadyTime == nullptr); @@ -171,10 +170,9 @@ template FutexResult emulatedFutexWaitImpl( - F* futex, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + F* futex, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { static_assert( std::is_same>::value || @@ -235,10 +233,9 @@ } FutexResult futexWaitImpl( - const Futex* futex, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + const Futex* futex, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { #ifdef __linux__ 
return nativeFutexWaitImpl( @@ -250,10 +247,9 @@ } FutexResult futexWaitImpl( - const Futex* futex, - uint32_t expected, - system_clock::time_point const* absSystemTime, - steady_clock::time_point const* absSteadyTime, + const Futex* futex, uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, uint32_t waitMask) { return emulatedFutexWaitImpl( futex, expected, absSystemTime, absSteadyTime, waitMask); diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/lang/Align.h 2025-05-19 16:14:27.000000000 +0000 @@ -1,14 +1,110 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #pragma once +#include #include +#include +#include + +// Work around bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56019 +#ifdef __GNUC__ +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 9) +namespace std { +using ::max_align_t; +} +#endif +#endif + namespace folly { +// has_extended_alignment +// +// True if it may be presumed that the platform has static extended alignment; +// false if it may not be so presumed, even when the platform might actually +// have it. Static extended alignment refers to extended alignment of objects +// with automatic, static, or thread storage. Whether the there is support for +// dynamic extended alignment is a property of the allocator which is used for +// each given dynamic allocation. +// +// Currently, very heuristical - only non-mobile 64-bit linux gets the extended +// alignment treatment. Theoretically, this could be tuned better. +constexpr bool has_extended_alignment = + kIsLinux && sizeof(void*) >= sizeof(std::uint64_t); + +namespace detail { + +// Implemented this way because of a bug in Clang for ARMv7, which gives the +// wrong result for `alignof` a `union` with a field of each scalar type. +// Modified for RocksDB to use C++11 only +constexpr std::size_t max_align_v = constexpr_max( + alignof(long double), + alignof(double), + alignof(float), + alignof(long long int), + alignof(long int), + alignof(int), + alignof(short int), + alignof(bool), + alignof(char), + alignof(char16_t), + alignof(char32_t), + alignof(wchar_t), + alignof(void*), + alignof(std::max_align_t)); + +} // namespace detail + +// max_align_v is the alignment of max_align_t. +// +// max_align_t is a type which is aligned at least as strictly as the +// most-aligned basic type (see the specification of std::max_align_t). This +// implementation exists because 32-bit iOS platforms have a broken +// std::max_align_t (see below). 
+// +// You should refer to this as `::folly::max_align_t` in portable code, even if +// you have `using namespace folly;` because C11 defines a global namespace +// `max_align_t` type. +// +// To be certain, we consider every non-void fundamental type specified by the +// standard. On most platforms `long double` would be enough, but iOS 32-bit +// has an 8-byte aligned `double` and `long long int` and a 4-byte aligned +// `long double`. +// +// So far we've covered locals and other non-allocated storage, but we also need +// confidence that allocated storage from `malloc`, `new`, etc will also be +// suitable for objects with this alignment requirement. +// +// Apple document that their implementation of malloc will issue 16-byte +// granularity chunks for small allocations (large allocations are page-size +// granularity and page-aligned). We think that allocated storage will be +// suitable for these objects based on the following assumptions: +// +// 1. 16-byte granularity also means 16-byte aligned. +// 2. `new` and other allocators follow the `malloc` rules. +// +// We also have some anecdotal evidence: we don't see lots of misaligned-storage +// crashes on 32-bit iOS apps that use `double`. +// +// Apple's allocation reference: http://bit.ly/malloc-small +constexpr std::size_t max_align_v = detail::max_align_v; +struct alignas(max_align_v) max_align_t {}; + // Memory locations within the same cache line are subject to destructive // interference, also known as false sharing, which is when concurrent // accesses to these different memory locations from different cores, where at @@ -23,7 +119,9 @@ // to avoid destructive interference. // // mimic: std::hardware_destructive_interference_size, C++17 -constexpr std::size_t hardware_destructive_interference_size = 128; +constexpr std::size_t hardware_destructive_interference_size = + (kIsArchArm || kIsArchS390X) ? 
64 : 128; +static_assert(hardware_destructive_interference_size >= max_align_v, "math?"); // Memory locations within the same cache line are subject to constructive // interference, also known as true sharing, which is when accesses to some @@ -33,6 +131,14 @@ // // mimic: std::hardware_constructive_interference_size, C++17 constexpr std::size_t hardware_constructive_interference_size = 64; +static_assert(hardware_constructive_interference_size >= max_align_v, "math?"); -} // namespace folly +// A value corresponding to hardware_constructive_interference_size but which +// may be used with alignas, since hardware_constructive_interference_size may +// be too large on some platforms to be used with alignas. +constexpr std::size_t cacheline_align_v = has_extended_alignment + ? hardware_constructive_interference_size + : max_align_v; +struct alignas(cacheline_align_v) cacheline_align_t {}; +} // namespace folly diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/Baton.h 2025-05-19 16:14:27.000000000 +0000 @@ -249,7 +249,8 @@ bool tryWaitSlow( const std::chrono::time_point& deadline, const WaitOptions& opt) noexcept { - switch (detail::spin_pause_until(deadline, opt, [=] { return ready(); })) { + switch ( + detail::spin_pause_until(deadline, opt, [this] { return ready(); })) { case detail::spin_result::success: return true; case detail::spin_result::timeout: @@ -259,7 +260,7 @@ } if (!MayBlock) { - switch (detail::spin_yield_until(deadline, [=] { return ready(); })) { + switch (detail::spin_yield_until(deadline, [this] { return ready(); })) { case detail::spin_result::success: return true; case detail::spin_result::timeout: 
diff -Nru mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h --- mariadb-10.11.11/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h 2025-01-30 11:01:26.000000000 +0000 +++ mariadb-10.11.13/storage/rocksdb/rocksdb/third-party/folly/folly/synchronization/DistributedMutex-inl.h 2025-05-19 16:14:27.000000000 +0000 @@ -1374,7 +1374,8 @@ // we need release here because of the write to waker_ and also because we // are unlocking the mutex, the thread we do the handoff to here should // see the modified data - new (&waiter->metadata_) Metadata(waker, bit_cast(sleepers)); + new (&waiter->metadata_) + Metadata(waker, folly::bit_cast(sleepers)); waiter->futex_.store(kWake, std::memory_order_release); return 0; } @@ -1527,7 +1528,7 @@ template